Project: Weibo Sentiment Analysis (NLTK)


1. Introduction to NLTK
Typical applications: sentiment analysis, text similarity, and text classification. NLTK provides classification and tokenization functionality and ships with corpora of real language data.
2. Installation
After installing the package, run import nltk and then nltk.download() to fetch the corpora. Corpora are accessed through the nltk.corpus module; a module index is available at www.nltk.org/py-modindex.html.
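A minimal setup sketch (assuming NLTK was installed with pip; the corpora downloaded here are only examples):

import nltk

# Download specific resources instead of opening the full nltk.download() chooser
nltk.download('punkt')        # tokenizer models
nltk.download('stopwords')    # stop-word lists
nltk.download('brown')        # an example corpus

from nltk.corpus import brown
print(brown.words()[:10])     # first ten words of the Brown corpus
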
3. Tokenization
Tokenization splits a sentence into words that carry semantic meaning. Chinese and English tokenization differ: Chinese needs a dedicated tokenizer such as jieba (结巴分词), whereas English words are already separated by spaces. Once tokens are obtained, the downstream processing for Chinese and English is largely the same. For English there is stemming, which reduces a word to its stem, and lemmatization, which merges the different inflected forms of a word into one (the pos argument specifies the part of speech); both normalize word forms.
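A short sketch of tokenization, stemming, and lemmatization (the example sentences are made up; assumes jieba is installed and the punkt and wordnet resources are downloaded):

import nltk
import jieba
from nltk.stem import PorterStemmer, WordNetLemmatizer

# English tokenization
tokens = nltk.word_tokenize('The leaves are falling from the trees')

# Chinese tokenization with jieba
zh_tokens = list(jieba.cut('今天天气真好'))

# Stemming: cut a word down to its stem
stemmer = PorterStemmer()
print(stemmer.stem('falling'))                    # -> 'fall'

# Lemmatization: merge inflected forms into one; pos specifies the part of speech
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('leaves', pos='n'))    # -> 'leaf'
print(lemmatizer.lemmatize('are', pos='v'))       # -> 'be'
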
4. Part-of-Speech Tagging and Stop Words
Tokenize with nltk.word_tokenize, then assign part-of-speech tags. To save storage space and improve efficiency, NLP pipelines automatically filter out certain words. Stop-word lists are compiled by hand rather than generated automatically, and collect modal particles and other low-information words. Commonly used Chinese stop-word lists include the general Chinese stop-word list, the Harbin Institute of Technology (HIT) stop-word list, and the Sichuan University stop-word list. In NLTK, stop words are removed with stopwords.words().
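A minimal sketch of POS tagging and stop-word removal in NLTK (English example; assumes the punkt, averaged_perceptron_tagger, and stopwords resources are downloaded):

import nltk
from nltk.corpus import stopwords

tokens = nltk.word_tokenize('This is only a simple example')

# Part-of-speech tagging
print(nltk.pos_tag(tokens))    # e.g. [('This', 'DT'), ('is', 'VBZ'), ...]

# Stop-word removal
filtered = [w for w in tokens if w.lower() not in stopwords.words('english')]
print(filtered)                # e.g. ['simple', 'example']

For the Chinese stop-word lists mentioned above, the list files are loaded by hand, as tools.py below does.
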
5. Typical Text-Processing Pipeline
Raw text -> tokenization -> word-form normalization -> stop-word removal -> processed word list
6. Sentiment Analysis
Convert natural language (text) into a form that a computer program can work with more easily: the string produced by preprocessing is vectorized (feature extraction).
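A toy illustration of vectorization: a preprocessed (tokenized) text is mapped onto a fixed vocabulary as a word-count vector. The vocabulary and sample below are invented; the project code later uses TF-IDF values instead of raw counts.

# Bag-of-words sketch over a fixed (hypothetical) vocabulary
vocab = ['开心', '难过', '讨厌', '天气']
text = '今天 天气 真 好 开心'.split(' ')      # a preprocessed sample

feat_vec = [text.count(word) for word in vocab]
print(feat_vec)   # [1, 0, 0, 1]
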
6.1 Simple Sentiment Analysis
Sentiment dictionary: build a dictionary by hand and match keywords against it. Crude but simple and practical; it scales poorly, however, when new or domain-specific words appear. The alternative is machine learning with nltk.classify.
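A toy sketch of the machine-learning route with nltk.classify (the training words and labels are invented for illustration):

from nltk.classify import NaiveBayesClassifier

# Each training sample is a (feature dict, label) pair; here the features are just the words themselves
def word_feats(words):
    return {word: True for word in words}

train_set = [
    (word_feats(['开心', '喜欢']), 'pos'),
    (word_feats(['难过', '讨厌']), 'neg'),
]

classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(word_feats(['喜欢'])))   # -> 'pos'
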
7. Text Similarity
Measures how similar two texts are. Represent each text by its word frequencies, so that the text becomes a vector, and compare the vectors with cosine similarity.
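A short sketch of cosine similarity over word-frequency vectors (the two example texts are made up):

import numpy as np

# Two tokenized texts
text_a = '今天 天气 很 好'.split(' ')
text_b = '今天 天气 不 好'.split(' ')

# Frequency vectors over the shared vocabulary
vocab = sorted(set(text_a) | set(text_b))
vec_a = np.array([text_a.count(w) for w in vocab], dtype=float)
vec_b = np.array([text_b.count(w) for w in vocab], dtype=float)

# Cosine similarity = dot product / (product of the norms)
cos_sim = vec_a.dot(vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
print(cos_sim)   # 0.75 for this pair
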
8.テキスト分類
TF-IDF(ワード周波数-逆ドキュメント周波数)TF,Term Frequency(ワード周波数)、あるワードがファイルに現れる回数.IDFは、ある語の普遍的な重要性を測定するために、逆ドキュメントの頻度です.TF-IDF=TF*IDF.TF=現在語が文書に出現する回数/文書中語の総数IDF=log(総文書個数/現在語が出現する文書個数)nlkt実装:TextCollection.tf_idf()
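A small sketch of the formulas and of TextCollection (the three toy documents are invented; tf_idf() is simply the product of the two terms above):

from nltk.text import TextCollection

# Three tiny tokenized documents
docs = [['今天', '天气', '好'],
        ['今天', '心情', '好'],
        ['讨厌', '下雨']]

corpus = TextCollection(docs)

# tf  = occurrences of the word in the document / number of words in the document
# idf = log(number of documents / number of documents containing the word)
print(corpus.tf('今天', docs[0]))        # 1/3
print(corpus.idf('今天'))                # log(3/2), natural logarithm in NLTK
print(corpus.tf_idf('今天', docs[0]))    # tf * idf
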
9. Case Study
Data: four emotion labels, 0 joy, 1 anger, 2 disgust, 3 depression. Steps: read the text, split it into training and test sets, extract features, then train the model and predict.
10. Code and Comments
main.py
# -*- coding: utf-8 -*-


import os
import pandas as pd
import nltk
from tools import proc_text, split_train_test, get_word_list_from_data, \
    extract_feat_from_data, cal_acc
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

dataset_path = './dataset'
text_filenames = ['0_simplifyweibo.txt', '1_simplifyweibo.txt',
                  '2_simplifyweibo.txt', '3_simplifyweibo.txt']

# csv file for the raw text data
output_text_filename = 'raw_weibo_text.csv'

# csv file for the cleaned text data
output_cln_text_filename = 'clean_weibo_text.csv'

# Whether to run the preprocessing step is controlled by the is_first_run flag:
# on the first run the raw text has to be processed and cleaned, so set it to True;
# on later runs the cleaned csv already exists, so set it to False to skip that step
is_first_run = True


def read_and_save_to_csv():
    """
                ,           csv
    """

    text_w_label_df_lst = []
    for text_filename in text_filenames:
        text_file = os.path.join(dataset_path, text_filename)

        # The label (0, 1, 2, 3) is the first character of the filename
        label = int(text_filename[0])

        # Read the text lines of this file
        with open(text_file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()

        labels = [label] * len(lines)

        text_series = pd.Series(lines)
        label_series = pd.Series(labels)

        # Combine labels and texts into a DataFrame
        text_w_label_df = pd.concat([label_series, text_series], axis=1)
        text_w_label_df_lst.append(text_w_label_df)

    result_df = pd.concat(text_w_label_df_lst, axis=0)

    # Save the combined result to a csv file
    result_df.columns = ['label', 'text']
    result_df.to_csv(os.path.join(dataset_path, output_text_filename),
                     index=None, encoding='utf-8')


def run_main():
    """
           
    """
    # 1. Read, process, clean and prepare the data
    if is_first_run:
        print('Processing and cleaning the raw text data...', end=' ')
        # On the first run the raw text has to be processed and cleaned

        # Read the raw text data and save labels and texts to a csv
        read_and_save_to_csv()

        # Read the saved csv back in to build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Clean the text; apply() runs proc_text on every row
        text_df['text'] = text_df['text'].apply(proc_text)

        # Drop rows whose cleaned text is empty
        text_df = text_df[text_df['text'] != '']

        # Save the cleaned text data
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('Done, results saved.')

    # 2. Split into training and test sets
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split the data into a training set and a test set
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Inspect the class distribution of both sets
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Number of most frequent words used as features
    n_common_words = 200

    # Collect all words of the training set and count their frequencies
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdisk = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} times'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features for the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('Done')
    print()

    print('Extracting features for the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('Done')

    # 4. Train the model (Gaussian Naive Bayes)
    print('Training the model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))

if __name__ == '__main__':
    run_main()


tools.py
# -*- coding: utf-8 -*-


import re
import jieba.posseg as pseg
import pandas as pd
import math
import numpy as np

# Load the commonly used stop-word lists
# (replace the placeholder filenames below with your own stop-word list files)
stopwords1 = [line.rstrip() for line in open('./stopwords_zh.txt', 'r', encoding='utf-8')]
# stopwords2 = [line.rstrip() for line in open('./stopwords_hit.txt', 'r', encoding='utf-8')]
# stopwords3 = [line.rstrip() for line in open('./stopwords_scu.txt', 'r', encoding='utf-8')]
# stopwords = stopwords1 + stopwords2 + stopwords3
stopwords = stopwords1


def proc_text(raw_line):
    """
                 
              
    """
    # 1.               
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)

    # 2.     +    
    words_lst = pseg.cut(chinese_only)

    # 3.      
    meaninful_words = []
    for word, flag in words_lst:
        # if (word not in stopwords) and (flag == 'v'):
            #             
        if word not in stopwords:
            meaninful_words.append(word)

    return ' '.join(meaninful_words)


def split_train_test(text_df, size=0.8):
    """
                 
    """
    #                           ,              
    train_text_df = pd.DataFrame()
    test_text_df = pd.DataFrame()

    labels = [0, 1, 2, 3]
    for label in labels:
        # Select the records with this label
        text_df_w_label = text_df[text_df['label'] == label]
        # Reset the index so each class is indexed from 0, which simplifies the split below
        text_df_w_label = text_df_w_label.reset_index()

        # Default split: 80% training, 20% test
        # For simplicity the first 80% of the rows go to the training set and the rest to the test set
        # (a random 80/20 split of the DataFrame would also work)

        # Number of rows for this class
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        text_df_w_label_train = text_df_w_label.iloc[:split_line_no, :]
        text_df_w_label_test = text_df_w_label.iloc[split_line_no:, :]

        # Add this class's rows to the overall training and test sets
        # (pd.concat is used here because DataFrame.append was removed in recent pandas versions)
        train_text_df = pd.concat([train_text_df, text_df_w_label_train])
        test_text_df = pd.concat([test_text_df, text_df_w_label_test])

    train_text_df = train_text_df.reset_index()
    test_text_df = test_text_df.reset_index()
    return train_text_df, test_text_df


def get_word_list_from_data(text_df):
    """
                        
    """
    word_list = []
    for _, r_data in text_df.iterrows():
        word_list += r_data['text'].split(' ')
    return word_list


def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """
            
    """
    #      TF-IDF      
    #                      

    n_sample = text_df.shape[0]
    n_feat = len(common_words_freqs)
    common_words = [word for word, _ in common_words_freqs]

    # Initialize the feature matrix and the label vector
    X = np.zeros([n_sample, n_feat])
    y = np.zeros(n_sample)

    print('Extracting features...')
    for i, r_data in text_df.iterrows():
        if (i + 1) % 5000 == 0:
            print('Features extracted for {} samples'.format(i + 1))

        text = r_data['text']

        feat_vec = []
        for word in common_words:
            if word in text:
                # The word is among the frequent words, so compute its TF-IDF value
                tf_idf_val = text_collection.tf_idf(word, text)
            else:
                tf_idf_val = 0

            feat_vec.append(tf_idf_val)

        # Store the feature vector and the label of this sample
        X[i, :] = np.array(feat_vec)
        y[i] = int(r_data['label'])

    return X, y


def cal_acc(true_labels, pred_labels):
    """
             
    """
    n_total = len(true_labels)
    correct_list = [true_labels[i] == pred_labels[i] for i in range(n_total)]

    acc = sum(correct_list) / n_total
    return acc