ニュースタイプ分類

29801 ワード

分類問題Demo
これは私が思いつく限り最も簡単な方法のはずです。私の理解では、ニュースの分類だけでなく、ラベル分類全般にも拡張できます。ここではニュースだけで実験しています。どれだけ多くのブログを参考にしたか分かりませんが、そのまま動かせるものは一つもありませんでした。参考にしたブログが多すぎて出典を探し出せず、申し訳ありません。
構想

異なる種類のニュースからキーワードを抽出する

gensimで学習して意味ベクトルを構築する

ベクトル対比による語意マッチング

データソース
クローラーで収集したニュースは自分で分類する必要がある=.=ので、手を抜いてSogouの2008年版データを使いました(http://www.sogou.com/labs/resource/cs.php)
三方ライブラリの使用

jieba

gensim

クローラー関連のライブラリはここでは省略します

コードロジック
Sogou Labs(搜狗実験室)で取得したデータを処理する
異なるタイプの文章をjiebaでキーワードを抽出し,キーワードを格納する
(ps: ダウンロードしたデータの文字コードはGB18030なので、手動でUTF-8に変換しました)
コード

import os
import jieba
from jieba import analyse
from bs4 import BeautifulSoup


def jieba_content(contnet):
    """Extract keywords from a piece of text.

    Hands ``contnet`` to ``analyse.extract_tags`` (no extra arguments) and
    returns whatever tags it yields as a new list.
    """
    return list(analyse.extract_tags(contnet))


def file_read(file_dir):
    """Extract keywords from every news document found under ``file_dir``.

    Each file is parsed as markup containing ``<doc>`` elements.  For every
    document whose ``<content>`` is non-empty, the ``<url>`` text is split on
    ``/`` and each segment that is a key of the module-level ``dicurl``
    mapping causes the document's keywords to be appended to that category's
    file through ``data_write_csv``.

    Args:
        file_dir: Root directory; it is walked recursively.
    """
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            # Build the path from the directory actually being visited rather
            # than a hard-coded "/a/" sub-folder, so any layout works.
            file_path2 = os.path.join(root, name)
            print(file_path2)
            # Read explicitly as UTF-8 instead of relying on the platform
            # default encoding.
            with open(file_path2, encoding="utf-8") as f2:
                markup = f2.read()
            # The markup is already decoded text, so from_encoding would be
            # ignored by BeautifulSoup; it is not passed.
            soup = BeautifulSoup(markup, "html.parser")
            for doc in soup.find_all('doc'):
                content_tag = doc.find('content')
                url_tag = doc.find('url')
                # Skip malformed documents instead of failing on None.
                if content_tag is None or url_tag is None:
                    continue
                content = content_tag.text.strip()
                if content == '':
                    continue
                for item in url_tag.text.split('/'):
                    if item in dicurl:
                        category = str(dicurl[item].strip())
                        keys = jieba_content(content)
                        data = {'type': category, 'key_list': keys}
                        data_write_csv(category, data)


def data_write_csv(filename, datas):
    """Append one document's keywords to the keyword file of its category.

    The category name is recorded in the module-level ``file_list`` the first
    time it is seen.  The keywords are appended to ``test/<filename>.txt`` as
    comma-separated values.

    Args:
        filename: Category name, used as the base name of the output file.
        datas: Mapping whose ``'key_list'`` entry is a list of keyword strings.
    """
    if filename not in file_list:
        file_list.append(filename)
    # Make sure the output directory exists before the first append.
    os.makedirs('test', exist_ok=True)
    filename = 'test/' + filename + '.txt'
    keywords = ','.join(datas['key_list'])
    # Without a separator, the last keyword already in the file and the first
    # keyword of this document would fuse into a single bogus token.
    needs_separator = os.path.isfile(filename) and os.path.getsize(filename) > 0
    with open(filename, 'a+', encoding='utf-8') as f:
        if keywords:
            f.write((',' if needs_separator else '') + keywords)


def write_file_list(file_list):
    """Write the collected category names to ``filename.txt``.

    Args:
        file_list: Iterable of category-name strings; they are written one per
            line.  A plain string is written unchanged.
    """
    # file.write() only accepts str; passing the list itself raised TypeError.
    if isinstance(file_list, str):
        text = file_list
    else:
        text = '\n'.join(file_list)
    with open('filename.txt', 'w', encoding='utf-8') as f:
        f.write(text)


if __name__ == "__main__":
    # Prepare jieba first: load the user dictionary and the stop-word list
    # used by keyword extraction.
    jieba.load_userdict("user.txt")
    analyse.set_stop_words("stopword.txt")

    # URL segment (host name) -> category label.
    dicurl = {
        'auto.sohu.com': 'qiche',
        'it.sohu.com': 'hulianwang',
        'health.sohu.com': 'jiankang',
        'sports.sohu.com': 'tiyu',
        'travel.sohu.com': 'lvyou',
        'learning.sohu.com': 'jiaoyu',
        'career.sohu.com': 'zhaopin',
        'cul.sohu.com': 'wenhua',
        'mil.news.sohu.com': 'junshi',
        'house.sohu.com': 'fangchan',
        'yule.sohu.com': 'yule',
        'women.sohu.com': 'shishang',
        'media.sohu.com': 'chuanmei',
        'gongyi.sohu.com': 'gongyi',
        '2008.sohu.com': 'aoyun',
        'business.sohu.com': 'shangye',
        'news.sohu.com': 'other',
    }

    # Category names seen so far; filled in by data_write_csv.
    file_list = []
    path = "SogouCS"

    file_read(path)
    write_file_list(file_list)

gensimトレーニングコーパス
前のステップで処理したデータによるコーパストレーニング
コード

import os
import jieba
from jieba import analyse
from collections import defaultdict
from gensim import corpora, models, similarities


def file_read(file_dir):
    """Load the per-category keyword files found under ``file_dir``.

    For every file, its name is appended to the module-level ``key_list`` and
    its comma-separated content is stored, split into a list, in the
    module-level ``dic`` under that file name.

    Args:
        file_dir: Directory to read; it is walked recursively.
    """
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            key_list.append(name)
            # Join with the directory currently being walked so files in
            # sub-directories resolve correctly.
            file_path2 = os.path.join(root, name)
            # Read explicitly as UTF-8 rather than the platform default, so
            # non-ASCII keywords decode the same on every system.
            with open(file_path2, encoding='utf-8') as f2:
                dic[name] = f2.read().split(',')


def jieba_content(contnet):
    """Return the keywords of ``contnet`` as a list.

    The text is handed unchanged to ``analyse.extract_tags``; its result is
    copied into a new list.
    """
    return list(analyse.extract_tags(contnet))


if __name__ == '__main__':
    dic = {}
    key_list = []

    file_read('test')
    # Materialise the dict view once so every pass below iterates the same
    # concrete list of token lists (one per category file).
    texts = list(dic.values())

    # Count how often each token occurs across all categories.
    # NOTE(review): the counts are not consumed further down in this script.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            # "+= 1" accumulates; the previous "= +1" reset every count to 1.
            frequency[token] += 1

    # Token <-> id mapping, saved for the classification step.
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary.txt')

    # One bag-of-words vector per category, stored in Matrix Market format.
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("XinYU.mm", corpus)

コーパスペアリングによるニュースのタイプの取得

コーパスを読み込む

新しいコンテンツからjiebaで対応するキーワードを取得する

このキーワード集合とコーパスとのマッチング率が最も高いタイプを取得する
コード

import os
from jieba import analyse
from gensim import corpora, models, similarities


def jieba_content(contnet):
    """Turn ``contnet`` into a list of keywords.

    Delegates to ``analyse.extract_tags`` and wraps the result in ``list``.
    """
    return list(analyse.extract_tags(contnet))


def load():
    """Load the saved dictionary and corpus plus the category file names.

    Returns:
        A ``(dictionary, corpus, file_list)`` tuple: the dictionary loaded from
        ``dictionary.txt``, the corpus opened from ``XinYU.mm``, and the file
        names reported by the first entry of ``os.walk('test')``.
    """
    dictionary = corpora.Dictionary.load('dictionary.txt')
    corpus = corpora.MmCorpus('XinYU.mm')

    # Collect the file-name list of every visited directory, then keep only
    # the first one (the top level of 'test').
    names_per_dir = []
    for _root, _dirs, names in os.walk('test'):
        names_per_dir.append(names)

    return dictionary, corpus, names_per_dir[0]



def get2(dictionary, corpus, content, file_list):
    """Return the label of the corpus entry most similar to ``content``.

    Args:
        dictionary: Dictionary used to turn keywords into a bag-of-words.
        corpus: Bag-of-words corpus the text is compared against.
        content: Raw text to classify.
        file_list: File names, indexed by the position of the best match in
            ``corpus``.

    Returns:
        The value stored in the module-level ``dic`` for the best-matching
        file name, or ``None`` when that name is not a key of ``dic``.
    """
    # Bag-of-words vector of the keywords extracted from the new text.
    query_bow = dictionary.doc2bow(jieba_content(content))

    # TF-IDF model fitted on the corpus, and a similarity index over it.
    tfidf_model = models.TfidfModel(corpus)
    vocab_size = len(dictionary.token2id)
    sim_index = similarities.SparseMatrixSimilarity(
        tfidf_model[corpus], num_features=vocab_size
    )

    # Similarity of the query against every corpus entry; take the first
    # position holding the maximum score.
    scores = list(sim_index[tfidf_model[query_bow]])
    best = scores.index(max(scores))
    return dic.get(file_list[best])


if __name__ == '__main__':

    # Keyword file name -> label reported for that category.
    # NOTE(review): most labels are whitespace-only here; presumably the
    # original non-ASCII text was lost. Confirm against the original script.
    dic = {'qiche.txt': '  ', 'hulianwang.txt': '   ', 'jiankang.txt': '  ',
           'tiyu.txt': '  ', 'lvyou.txt': '  ', 'jiaoyu.txt': '  ',
           'zhaopin.txt': '  ', 'wenhua.txt': '  ', 'junshi.txt': '  ',
           'fangchan.txt': '  ', 'yule.txt': '  ', 'shishang.txt': '  ',
           'chuanmei.txt': '  ', 'gongyi.txt': '  ', 'aoyun.txt': '  ',
           'shangye.txt': '  ', 'other.txt': 'other'}

    dictionary, corpus, file_list = load()

    # Sample text to classify.
    content = """
                         ，       ，      。        ，      、    、    、            。    """
    key = get2(dictionary, corpus, content, file_list)
    # The line break in the message has to be the "\n" escape: a raw newline
    # inside a single-quoted string literal is a SyntaxError.
    print('  : {}\n     ： {}    '.format(content, key))

Java Timer Example

org.apache.http.impl.client.DefaultHttpClient