Data preprocessing with jieba (word segmentation, stopword and punctuation filtering, word frequency, keyword extraction, etc.)
18,532 words
Clean the stopword list (drop blank lines and strip whitespace from both ends)
Word segmentation and stopword filtering (punctuation included)
Train word vectors (word2vec)
Extract keywords with TF-IDF and TextRank
Word frequency
Stopword table (1,893 entries), from: http://blog.csdn.net/shijiebei2009/article/details/39696571
#encoding=utf-8
# Clean the raw stopword list: drop empty lines and strip surrounding whitespace.
filename = "stop_words.txt"
f = open(filename, "r", encoding='utf-8')
result = list()
for line in f.readlines():
    line = line.strip()
    if not len(line):
        continue
    result.append(line)
f.close()

# Write one cleaned stopword per line.
with open("stop_words2.txt", "w", encoding='utf-8') as fw:
    for sentence in result:
        data = sentence.strip()
        if len(data) != 0:
            fw.write(data)
            fw.write("\n")
print("end")
Word segmentation and stopword filtering (punctuation included)
#encoding=utf-8
import jieba

filename = "../data/1000 2.txt"
stopwords_file = "../data/stop_words2.txt"

# Load the cleaned stopword list.
stop_f = open(stopwords_file, "r", encoding='utf-8')
stop_words = list()
for line in stop_f.readlines():
    line = line.strip()
    if not len(line):
        continue
    stop_words.append(line)
stop_f.close()
print(len(stop_words))

# Segment each line with jieba (accurate mode) and drop stopwords.
f = open(filename, "r", encoding='utf-8')
result = list()
for line in f.readlines():
    line = line.strip()
    if not len(line):
        continue
    outstr = ''
    seg_list = jieba.cut(line, cut_all=False)
    for word in seg_list:
        if word not in stop_words:
            if word != '\t':
                outstr += word
                outstr += " "
    result.append(outstr.strip())
f.close()

# Write the segmented, filtered corpus, one document per line.
with open("../data/test2.txt", "w", encoding='utf-8') as fw:
    for sentence in result:
        data = sentence.strip()
        if len(data) != 0:
            fw.write(data)
            fw.write("\n")
print("end")

'''
seg_list = jieba.cut(" 24 ", cut_all=False)
seg_list = "/".join(seg_list)
print(seg_list)
'''
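One design note: the script above tests `word not in stop_words` against a Python list, which is a linear scan per token. Converting the stopwords to a `set` makes each lookup O(1). A minimal sketch of the filtering step, on dummy tokens so it does not require jieba (the function name is ours, not from the original):

```python
def filter_tokens(tokens, stop_words):
    """Drop stopwords and tab characters, join the rest with single spaces."""
    stop_set = set(stop_words)  # O(1) membership tests instead of a list scan
    return " ".join(w for w in tokens if w not in stop_set and w != '\t')
```

For a corpus of ~18,000 words against ~1,900 stopwords, this change alone removes millions of string comparisons.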
Training word vectors (word2vec)
#encoding=utf-8
from gensim.models import word2vec

sentences = word2vec.Text8Corpus(u'e:/workspace/Word2Vec/data/test2.txt')
model = word2vec.Word2Vec(sentences, size=10)
# NOTE: in gensim >= 4.0 the parameter is `vector_size` and similarity
# queries live on `model.wv.most_similar(...)`.
# The query word below was lost from the original source; supply your own.
for i in model.most_similar(u" ", topn=20):
    print(i[0], i[1])
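What `most_similar` computes is a cosine-similarity ranking over the word vectors. A pure-Python sketch of the idea (this is conceptual, not gensim's actual implementation; the toy vectors are made up):

```python
import math

def cosine(a, b):
    """Cosine similarity between two equal-length vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(x * x for x in b))
    return dot / (na * nb)

def most_similar(query, vectors, topn=3):
    """Rank every other word by cosine similarity to the query word."""
    q = vectors[query]
    scores = [(w, cosine(q, v)) for w, v in vectors.items() if w != query]
    return sorted(scores, key=lambda t: t[1], reverse=True)[:topn]
```

Words whose vectors point in similar directions score near 1.0; unrelated words score near 0 or below.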
Keyword extraction with TF-IDF and TextRank:
from jieba import analyse

tfidf = analyse.extract_tags
textrank = analyse.textrank

filename = "e:/workspace/Word2Vec/data/test2.txt"
content = open(filename, 'r', encoding='utf-8').read()

# TF-IDF
#keywords = tfidf(content)
#print("keywords by tfidf:")
#for keyword in keywords:
#    print(keyword + "/")

# TextRank
print("\nkeywords by textrank:")
keywords = textrank(content)
for keyword in keywords:
    print(keyword)
print("end")
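The TF-IDF statistic behind `extract_tags` is itself simple: a word scores high when it is frequent in a document but rare across the corpus. A self-contained sketch using the natural-log IDF variant (jieba actually uses a precomputed IDF table, so the numbers differ):

```python
import math
from collections import Counter

def tfidf_scores(docs):
    """TF-IDF per (doc, word): tf = count / len(doc), idf = ln(N / df)."""
    n = len(docs)
    # Document frequency: in how many docs does each word appear?
    df = Counter(w for doc in docs for w in set(doc))
    scores = []
    for doc in docs:
        tf = Counter(doc)
        scores.append({w: (c / len(doc)) * math.log(n / df[w])
                       for w, c in tf.items()})
    return scores
```

A word that occurs in every document gets idf = ln(1) = 0, so ubiquitous words are automatically suppressed even without a stopword list.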
Word frequency
#coding=utf-8
import collections

filename = "e:/workspace/Word2Vec/data/test2.txt"
words_box = []
with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
        words_box.extend(line.strip().split())
print("total words: %s" % len(words_box))
print("word frequencies: %s" % collections.Counter(words_box))
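Printing the full `Counter` dumps every distinct word; usually only the top few matter, which `Counter.most_common` gives directly. A sketch on dummy tokens (the helper name is ours):

```python
from collections import Counter

def top_words(tokens, n=3):
    """Return the n most frequent tokens as (word, count) pairs, highest first."""
    return Counter(tokens).most_common(n)
```

This is the same frequency table as above, just truncated and sorted by count.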
Stopword table (1,893 entries), from: http://blog.csdn.net/shijiebei2009/article/details/39696571
!
"
#
$
%
&
'
(
)
*
+
,
-
--
.
..
...
......
...................
./
.
.
.
/
//
0
1
2
3
4
5
6
7
8
9
:
://
::
;
<
=
>
>>
?
@
A
Lex
[
\
]
^
_
`
exp
sub
sup
|
}
~
~~~~
·
×
×××
Δ
Ψ
γ
μ
φ
φ.
В
—
——
———
‘
’
’‘
“
”
”,
…
……
…………………………………………………③
′∈
′|
℃
Ⅲ
↑
→
∈[
∪φ∈
≈
①
②
②c
③
③]
④
⑤
⑥
⑦
⑧
⑨
⑩
──
■
▲
、
。
〈
〉
《
》
》),
」
『
』
【
】
〔
〕
〕〔
㈧
.
...
...
/
/
...
##
︿
!
#
$
%
&
'
(
)
)÷(1-
)、
*
+
+ξ
++
,
,
-
-β
--
-[*]-
.
/
0
0:2
1
1.
12%
2
2.3%
3
4
5
5:0
6
7
8
9
:
;
<
<±
<Δ
<λ
<φ
<<
=
=″
=☆
=(
=-
=[
={
>
>λ
?
@
A
LI
R.L.
ZXFITL
[
[①①]
[①②]
[①③]
[①④]
[①⑤]
[①⑥]
[①⑦]
[①⑧]
[①⑨]
[①A]
[①B]
[①C]
[①D]
[①E]
[①]
[①a]
[①c]
[①d]
[①e]
[①f]
[①g]
[①h]
[①i]
[①o]
[②
[②①]
[②②]
[②③]
[②④
[②⑤]
[②⑥]
[②⑦]
[②⑧]
[②⑩]
[②B]
[②G]
[②]
[②a]
[②b]
[②c]
[②d]
[②e]
[②f]
[②g]
[②h]
[②i]
[②j]
[③①]
[③⑩]
[③F]
[③]
[③a]
[③b]
[③c]
[③d]
[③e]
[③g]
[③h]
[④]
[④a]
[④b]
[④c]
[④d]
[④e]
[⑤]
[⑤]]
[⑤a]
[⑤b]
[⑤d]
[⑤e]
[⑤f]
[⑥]
[⑦]
[⑧]
[⑨]
[⑩]
[*]
[-
[]
]
]∧′=[
][
_
a]
b]
c]
e]
f]
ng
{
{-
|
}
}>
~
~±
~+
¥