导读:好久没写博客了,前段时间一直在找工作,没有做什么实质性的工作。最近工作也定下来了:百度流量质量控制部的反作弊算法团队。它不算是百度的什么土豪团队,但是 99% 以上的流量收入都是要从这个团队过一遍的,团队资历实力可见一斑。好了,不吹b了。



# coding:utf-8

import io
import json
import os
import sys
from collections import Counter

import nltk

#  进度条
def View_Bar(flag, sum):
    """Write an in-place percentage progress indicator to stdout.

    Args:
        flag: Number of items processed so far.
        sum: Total number of items. The name shadows the builtin but is kept
            so that existing keyword callers continue to work.

    The indicator is refreshed only when ``flag`` is a multiple of 15, which
    keeps the terminal from being flooded. Nothing is written when ``sum`` is
    zero, because no percentage can be computed.
    """
    if not sum:
        return
    if flag % 15 == 0:
        # The leading carriage return moves the cursor back to the start of
        # the line so each update overwrites the previous one; '%%' emits a
        # literal percent sign.
        sys.stdout.write('\r%.2f%%: ' % (float(flag) / sum * 100))
        sys.stdout.flush()

#  筛选 指定类型的 店家
def Filter_Business(path, type):
    """Collect the ids of the businesses that list a given category.

    Args:
        path: Path to the business file, one JSON object per line. Each
            object is expected to carry ``business_id`` and ``categories``
            (a list of category names).
        type: Category name to keep, e.g. ``"Restaurants"``. The name shadows
            the builtin but is kept for backward compatibility.

    Returns:
        A dict mapping every matching business id to 0. Only the keys are
        meaningful; the dict is used as a set.

    A business matches when ``type`` is exactly one of its category names.
    Each line is decoded as JSON rather than split on literal key text, so
    text in other fields (a business name, for instance) cannot cause a
    match and the result does not depend on key order or spacing.
    """
    with open(path, 'r') as business_file:
        lines = business_file.readlines()

    filter_bussiness_id = {}
    # Progress is reported against the real line count instead of a constant
    # that was only correct for one particular dataset snapshot.
    total = float(len(lines))
    for flag, line in enumerate(lines, 1):
        if line.strip():
            record = json.loads(line)
            # 'categories' may be missing or null; treat both as "none".
            if type in (record.get('categories') or ()):
                filter_bussiness_id[record['business_id']] = 0
        print('筛选数据进度: ' + str(flag / total))

    print('字典: ' + str(filter_bussiness_id))
    print('字典长度: ' + str(len(filter_bussiness_id)))

    return filter_bussiness_id

#  提取指定类型店家的 review
def Filter_Business_Review(path, filter_business_id, output_path=None):
    """Write the review text of the selected businesses, one review per line.

    Args:
        path: Path to the review file, one JSON object per line. Each object
            is expected to carry ``business_id`` and ``text``.
        filter_business_id: Container of business ids to keep (the dict
            returned by ``Filter_Business`` works; only membership is used).
        output_path: Destination text file. Defaults to the original
            hard-coded ``review_text.txt`` location.

    Returns:
        0, as before.
    """
    if output_path is None:
        output_path = '/Users/John/Desktop/yelp_dataset_challenge_academic_dataset/review_text.txt'

    with io.open(path, 'r', encoding='utf-8') as review_file:
        lines = review_file.readlines()

    total = float(len(lines))
    with io.open(output_path, 'w', encoding='utf-8') as out:
        for flag, line in enumerate(lines, 1):
            if line.strip():
                # The input is external data, so it is decoded with the JSON
                # parser; evaluating it as Python code would execute whatever
                # the file contains.
                review = json.loads(line)
                if review['business_id'] in filter_business_id:
                    # Drop embedded line breaks so that every review occupies
                    # exactly one line of the output file.
                    text = review['text'].replace('\r', '').replace('\n', '')
                    out.write(text + '\n')
            print(flag / total)

    return 0

#            #
#  词性标注  #
#            #
def Tag_Word(path, output_path=None):
    """Part-of-speech tag every sentence of every review and save the result.

    Args:
        path: Path to the review text file, one review per line.
        output_path: Destination file. Defaults to the original hard-coded
            ``word_tagged_sentences.txt`` location.

    Returns:
        0, as before.

    Each review is split into sentences, each sentence is tokenized and then
    tagged. One sentence is written per output line as space-terminated
    ``word/TAG`` pairs, e.g. ``Excellent/JJ food/NN ./. ``. Sentences are
    kept on separate lines so that later feature mining can work sentence by
    sentence.
    """
    if output_path is None:
        output_path = '/Users/John/Desktop/yelp_dataset_challenge_academic_dataset/word_tagged_sentences.txt'

    with open(path, 'r') as review_file:
        lines = review_file.readlines()

    total = float(len(lines))
    with open(output_path, 'w') as out:
        for flag, text in enumerate(lines, 1):
            for sentence in nltk.sent_tokenize(text):
                # Tokenize first, then tag the token list of this sentence.
                word_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
                out.write(''.join(word + '/' + tag + ' ' for word, tag in word_tagged))
                out.write('\n')
            print('分词进度: ' + str(flag / total))

    return 0

#                     #
#  筛选 feature 词汇  #
#                     #
def Featuer_Word(path, window, output_path=None):
    """Mine candidate feature nouns that appear close to adjectives.

    Args:
        path: Path to the tagged-sentence file: one sentence per line made of
            whitespace-separated ``word/TAG`` tokens, e.g.
            ``Excellent/JJ food/NN ./. ``.
        window: Maximum distance, in tokens, searched on either side of an
            adjective.
        output_path: Destination file for the mined features, one per line.
            Defaults to the original hard-coded ``feature.txt`` location.

    Returns:
        The list of mined feature words, in the order they were found.

    For every adjective (JJ, JJR, JJS) the nearest noun (NN, NNS) within
    ``window`` tokens is taken as a feature. A feature equal to the one
    recorded immediately before it is skipped, because overlapping windows of
    neighbouring adjectives tend to hit the same noun.

    NOTE(review): the line-parsing step and the two append branches were
    missing from the block as received and have been reconstructed from the
    surrounding comments - confirm against the intended behaviour.
    """
    if output_path is None:
        output_path = '/Users/John/Desktop/yelp_dataset_challenge_academic_dataset/feature.txt'

    noun_tags = ('NN', 'NNS')
    adjective_tags = ('JJ', 'JJR', 'JJS')

    def Slip_Window_Func(tagged_sentence, i, window):
        """Return the noun nearest to index ``i`` within ``window``, or ''."""
        len_sentence = len(tagged_sentence)
        for k in range(1, window + 1):
            feature = ''
            if i - k >= 0 and tagged_sentence[i - k][1] in noun_tags:
                feature = tagged_sentence[i - k][0]
            # Checked second on purpose: when nouns sit at the same distance
            # on both sides, the one on the right wins.
            if i + k < len_sentence and tagged_sentence[i + k][1] in noun_tags:
                feature = tagged_sentence[i + k][0]
            if feature:
                return feature
        return ''

    with open(path, 'r') as tagged_file:
        lines = tagged_file.readlines()

    # Pre-processing: turn each line into a list of (word, tag) tuples.
    print('数据预处理进度: ')
    tagged_sentences = []
    for line in lines:
        tagged_sentence = []
        for item in line.split():
            # Split on the last '/', since the word itself may contain one.
            word, slash, tag = item.rpartition('/')
            if slash:
                tagged_sentence.append((word, tag))
        tagged_sentences.append(tagged_sentence)

    # Mining: look around every adjective for the closest noun.
    print('feature挖掘进度: ')
    feature_list = []
    for tagged_sentence in tagged_sentences:
        for i, tagged_word in enumerate(tagged_sentence):
            if tagged_word[1] not in adjective_tags:
                continue
            feature = Slip_Window_Func(tagged_sentence, i, window)
            if feature and (not feature_list or feature != feature_list[-1]):
                feature_list.append(feature)

    print('所有的feature:')
    print(feature_list)

    with open(output_path, 'w') as out:
        for item in feature_list:
            out.write(str(item) + '\n')

    return feature_list

#  对 feature 词汇进行再清洗
def Feature_Data_Cleaning(path, output_path=None):
    """Count how often each mined feature occurs and save the ranking.

    Args:
        path: Path to the feature file, one feature word per line.
        output_path: Destination file. Defaults to the original hard-coded
            ``feature_dict.txt`` location.

    Returns:
        0, as before.

    The output holds one ``(feature, count)`` tuple per line, ordered by
    descending count.
    """
    if output_path is None:
        output_path = '/Users/John/Desktop/yelp_dataset_challenge_academic_dataset/feature_dict.txt'

    with open(path, 'r') as feature_file:
        lines = feature_file.readlines()

    # Strip only the line terminator: slicing off the last character would
    # truncate a final line that has no trailing newline.
    feature_counts = Counter(line.rstrip('\n') for line in lines)
    feature_dict = feature_counts.most_common()  # [(feature, count), ...]

    print('原始feature数目: ' + str(len(lines)))
    print('放到dict中的数目:' + str(len(feature_dict)))

    with open(output_path, 'w') as out:
        for item in feature_dict:
            out.write(str(item) + '\n')

    return 0

#   只筛选 餐厅 类型的服务行业
# filter_business_id = Filter_Business('/Users/John/Desktop/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json',"Restaurants")
#   保存review
# Filter_Business_Review('/Users/John/Desktop/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json',filter_business_id)

#   词性标注
# Tag_Word('/Users/John/Desktop/yelp_dataset_challenge_academic_dataset/review_text.txt')
#   筛选feature词汇
# Featuer_Word('/Users/John/Desktop/yelp_dataset_challenge_academic_dataset/word_tagged_sentences.txt',window=5)

#  对 feature 词汇进行再清洗

