jiebaを用いたデータ前処理(単語分割、ストップワードおよび句読点の除去、単語頻度の集計、キーワード抽出など)

18532 ワード

ストップワード一覧を整理する(空行と各行の前後の空白を取り除く)
# encoding=utf-8
# Normalise the stop-word list: strip surrounding whitespace from every line
# of stop_words.txt, drop lines that end up empty, and write the remaining
# entries to stop_words2.txt, one entry per line.
filename = "stop_words.txt"

# The context manager closes the file. The original `f.close` was missing its
# call parentheses, so the handle was never actually closed.
with open(filename, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    result = [line for line in stripped_lines if line]

with open("stop_words2.txt", "w", encoding="utf-8") as fw:
    # Every entry is already stripped and non-empty, so it is written as-is
    # followed by a newline.
    fw.writelines(entry + "\n" for entry in result)

print("end")

単語分割とストップワードの除去(句読点を含む)
# encoding=utf-8
# Segment every non-empty line of the corpus with jieba, drop tokens that are
# in the stop-word list as well as tab tokens, and write the surviving tokens
# of each line, separated by single spaces, to ../data/test2.txt.
import jieba

filename = "../data/1000   2.txt"
stopwords_file = "../data/stop_words2.txt"

# Context managers close the files. The original `stop_f.close` / `f.close`
# were missing their call parentheses and never closed anything.
with open(stopwords_file, "r", encoding="utf-8") as stop_f:
    stop_words = [word for word in (line.strip() for line in stop_f) if word]

# The list is kept so the printed count still includes duplicate entries;
# the set gives constant-time membership tests in the per-token filter below.
stop_word_set = set(stop_words)

print(len(stop_words))

result = []
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        kept = [
            word
            for word in jieba.cut(line, cut_all=False)
            if word not in stop_word_set and word != '\t'
        ]
        # The final strip removes whitespace-only tokens at either end.
        result.append(" ".join(kept).strip())

with open("../data/test2.txt", "w", encoding="utf-8") as fw:
    # Lines whose tokens were all filtered out are skipped.
    fw.writelines(sentence + "\n" for sentence in result if sentence)

print("end")

# Quick manual check of the segmenter (disabled):
# seg_list = jieba.cut(" 24 ", cut_all=False)
# seg_list = "/".join(seg_list)
# print(seg_list)

単語ベクトルの学習(word2vec)
# encoding=utf-8
# Train a small word2vec model on the segmented corpus and print the 20
# entries most similar to the query word, each followed by its score.
from gensim.models import word2vec
import sys

sentences = word2vec.Text8Corpus(u'e:/workspace/Word2Vec/data/test2.txt')
# `size` is the vector dimensionality in gensim < 4.0; gensim 4.0 renamed the
# parameter to `vector_size`.
model = word2vec.Word2Vec(sentences, size=10)
# Similarity queries belong to the KeyedVectors object (`model.wv`); calling
# most_similar directly on the model is deprecated.
# NOTE(review): the query string is whitespace-only here; presumably the
# original query word was lost - confirm before running.
for word, similarity in model.wv.most_similar(u"  ", topn=20):
    print(word, similarity)

TF-IDFとTextRankによるキーワード抽出:
from jieba import analyse

# Keyword-extraction entry points: TF-IDF based and TextRank based.
tfidf = analyse.extract_tags
textrank = analyse.textrank

filename = "e:/workspace/Word2Vec/data/test2.txt"
# The whole corpus is read as raw bytes, exactly as before. The context
# manager closes the handle, which the bare open(...).read() never did.
# NOTE(review): this relies on jieba accepting bytes input - confirm.
with open(filename, 'rb') as f:
    content = f.read()

# TF-IDF extraction (disabled):
# keywords = tfidf(content)
# print ("keywords by tfidf:")
# for keyword in keywords:
#     print (keyword + "/")

print("\nkeywords by textrank:")
# TextRank extraction: print one keyword per line.
keywords = textrank(content)
for keyword in keywords:
    print(keyword)
print("end")

単語頻度の集計:
import collections
# coding=utf-8
# Count how often each whitespace-separated token occurs in the segmented
# corpus, then print the total number of tokens and the per-token counts.
filename = "e:/workspace/Word2Vec/data/test2.txt"
with open(filename, 'rb') as f:
    # Tokens are split on the raw bytes (ASCII whitespace only) and each one is
    # decoded exactly once. The original also called line.decode("utf-8") and
    # threw the result away, then made a second pass to decode every token.
    words_box2 = [token.decode("utf-8") for line in f for token in line.split()]
# NOTE(review): both labels are whitespace-only here; presumably the original
# label text was lost - confirm.
print("     :%s" % len(words_box2))
print("    :%s" % collections.Counter(words_box2))


単語ベクトルの学習(word2vec):
# encoding=utf-8
# Train a small word2vec model on the segmented corpus and print the 20
# entries most similar to the query word, each followed by its score.
from gensim.models import word2vec
import sys

sentences = word2vec.Text8Corpus(u'e:/workspace/Word2Vec/data/test2.txt')
# `size` is the vector dimensionality in gensim < 4.0; gensim 4.0 renamed the
# parameter to `vector_size`.
model = word2vec.Word2Vec(sentences, size=10)
# Similarity queries belong to the KeyedVectors object (`model.wv`); calling
# most_similar directly on the model is deprecated.
# NOTE(review): the query string is whitespace-only here; presumably the
# original query word was lost - confirm before running.
for word, similarity in model.wv.most_similar(u"  ", topn=20):
    print(word, similarity)

ストップワード一覧(1893個):
出典: http://blog.csdn.net/shijiebei2009/article/details/39696571
!  
"  
#  
$  
%  
&  
'  
(  
)  
*  
+  
,  
-  
--  
.  
..  
...  
......  
...................  
./  
.   
.   
.   
/  
//  
0  
1  
2  
3  
4  
5  
6  
7  
8  
9  
:  
://  
::  
;  
<  
=  
>  
>>  
?  
@  
A  
Lex  
[  
\  
]  
^  
_  
`  
exp  
sub  
sup  
|  
}  
~  
~~~~  
·  
×  
×××  
Δ  
Ψ  
γ  
μ  
φ  
φ.  
В  
—  
——  
———  
‘  
’  
’‘  
“  
”  
”,  
…  
……  
…………………………………………………③  
′∈  
′|  
℃  
Ⅲ  
↑  
→  
∈[  
∪φ∈  
≈  
①  
②  
②c  
③  
③]  
④  
⑤  
⑥  
⑦  
⑧  
⑨  
⑩  
──  
■  
▲  
   
、  
。  
〈  
〉  
《  
》  
》),  
」  
『  
』  
【  
】  
〔  
〕  
〕〔  
㈧  
   
 .  
    
    
    
    
    
    
    
      
    
    
     
    
    
    
    
    
    
    
    
    
    
    
     
    
    
   
    
   
      
      
      
   
    
    
    
    
    
    
   
    
    
    
    
   
    
    
    
    
      
    
  ...    
     
      
    
    
  ...    
    
    
    
    
    
    
    
    
      
      
    
    
     
    
    
    
    
    
    
    
    
     
    
    
    
    
     
     
     
    
     
    
    
    
    
      
    
    
    
    
    
    
    
    
      
    
    
    
    
      
    
    
     
      
    
      
     
    
    
     
     
    
    
    
    
    
    
    
    
    
   
    
     
    
      
    
   
     
    
    
    
    
   
    
    
    
    
    
    
   
    
   
    
    
     
     
    
    
    
    
    
    
    
    
   
    
     
   
   
    
    
    
    
     
    
    
   
   
   
    
    
    
    
    
   
   
    
      
    
    
   
    
    
   
    
      
      
   
    
     
    
    
   
    
   
   
    
   
    
    
    
    
    
    
   
    
    
    
    
    
     
    
   
    
   
    
    
    
    
    
   
    
    
   
    
    
    
    
      
    
      
      
    
    
    
    
      
      
    
    
    
      
    
    
    
    
   
    
    
    
    
    
   
    
    
    
    
    
    
    
    
    
    
    
    
    
    
     
    
   
   
    
    
    
    
    
   
    
   
    
    
    
    
   
    
    
    
   
       
    
    
    
    
    
    
    
    
    
    
    
    
   
    
    
    
   
    
    
    
   
    
    
    
   
    
    
    
    
    
   
    
    
    
     
      
    
   
    
    
    
    
   
    
    
    
    
    
    
    
    
    
    
   
   
    
 /   
     
     
    
     
    
    
    
   
    
    
    
    
    
     
    
    
    
   
    
    
   
   
   
    
    
    
   
    
    
    
    
    
    
    
    
    
    
      
      
      
    
    
   
   
     
    
    
    
    
     
    
   
   
    
    
    
    
    
    
    
   
    
    
    
    
    
   
    
   
    
   
    
    
    
    
    
    
    
      
   
     
    
    
    
   
    
   
    
    
    
   
   
    
    
    
    
    
    
   
     
    
    
     
    
       
    
    
    
    
    
    
    
    
    
    
     
    
    
    
    
   
    
      
   
   
    
    
   
    
    
    
    
    
    
    
     
    
   
    
   
    
   
   
    
   
    
    
    
    
    
      
     
    
     
    
    
    
    
     
      
    
    
    
    
    
   
     
      
    
    
     
    
   
    
    
    
    
    
    
    
   
    
    
    
    
   
    
    
    
    
    
   
    
    
    
    
    
    
    
    
    
   
    
    
    
   
    
    
    
   
    
    
   
   
    
   
    
   
   
   
     
   
   
   
   
    
   
    
   
    
   
    
    
   
   
   
   
   
   
    
   
   
   
    
   
   
    
    
   
    
   
   
   
   
    
    
    
    
    
    
    
    
    
   
    
   
    
   
    
    
    
   
   
   
    
    
   
   
   
    
   
   
    
   
   
   
   
    
    
   
   
   
   
    
   
   
    
    
    
    
    
   
    
   
    
    
   
   
    
    
    
    
     
    
    
    
    
   
    
    
    
      
      
    
     
     
    
    
     
   
     
    
    
    
     
    
    
    
     
    
    
      
    
    
    
    
    
    
    
    
      
    
   
   
    
   
    
    
    
   
    
    
    
   
    
      
    
    
    
    
      
    
    
    
    
    
    
    
      
    
    
    
    
    
    
    
   
    
    
    
   
    
     
    
    
    
    
    
   
    
    
    
    
    
   
    
    
    
    
    
   
    
    
    
   
    
   
    
    
    
    
    
   
    
    
     
     
    
    
    
   
     
      
      
      
    
    
    
    
      
    
    
    
    
    
   
    
    
      
   
    
    
    
      
    
    
    
     
     
   
   
    
    
   
    
   
    
   
    
     
       
     
    
      
   
    
     
    
    
    
    
     
    
    
    
    
    
    
    
    
    
    
    
    
    
   
      
    
    
   
      
      
    
   
    
    
    
    
    
     
    
    
    
    
    
    
    
    
    
    
    
   
    
    
   
    
   
    
   
    
    
    
    
   
    
    
    
      
    
    
   
    
    
    
    
    
   
    
    
    
   
    
     
     
    
    
    
   
     
   
     
    
    
      
      
      
    
      
    
    
    
    
    
    
    
      
    
   
    
    
    
    
    
    
    
    
    
    
      
    
   
    
    
    
   
    
      
    
    
    
    
    
    
    
   
    
    
    
    
    
   
    
    
   
    
         
    
   
    
     
    
   
   
    
    
    
    
    
    
      
    
    
      
      
      
     
   
    
    
      
    
    
    
    
     
    
    
      
    
   
    
    
    
     
   
    
    
 /  
    
    
   
    
    
    
    
    
   
    
    
    
   
 ...   
    
    
    
      
    
    
    
    
    
    
    
    
   
     
    
    
    
    
    
    
    
    
     
    
   
    
    
      
   
    
   
    
   
    
    
    
    
    
    
   
    
    
    
    
    
    
    
    
    
    
     
    
    
   
   
    
 ##   
   
    
    
    
    
    
   
     
     
    
    
    
    
    
   
    
    
    
    
    
    
    
    
    
    
   
    
    
    
    
    
    
   
   
    
    
   
    
    
    
    
    
    
   
    
    
    
    
    
    
    
    
   
    
   
    
    
    
    
      
    
    
   
    
    
     
    
    
    
    
    
    
    
      
       
   
    
   
     
    
   
    
    
   
    
    
   
    
    
   
   
    
    
    
    
   
    
    
     
    
    
    
    
   
    
    
     
    
    
    
    
    
    
    
    
    
    
    
    
    
     
   
    
   
   
   
    
    
    
      
   
    
    
    
   
    
   
    
    
    
    
    
    
    
    
    
    
    
      
    
    
    
    
   
     
    
    
    
     
    
    
     
    
    
    
   
    
   
    
    
    
   
    
    
      
   
   
    
    
    
      
    
    
   
   
    
    
    
    
    
   
    
    
   
    
   
    
    
     
    
      
     
   
    
   
    
     
    
    
   
    
   
    
    
    
    
    
   
    
    
    
    
   
   
    
    
    
    
    
    
    
    
    
      
    
    
   
    
    
      
    
   
   
    
    
    
    
    
    
    
    
    
    
    
     
     
   
    
    
   
   
     
    
     
    
    
    
    
    
   
   
    
    
    
   
    
    
   
    
    
    
    
    
   
    
     
    
    
    
    
    
   
    
    
    
   
    
    
    
    
   
   
    
     
     
    
    
    
   
    
    
    
    
    
    
    
   
    
   
    
    
    
    
    
    
    
   
    
    
    
    
    
   
    
    
   
    
    
    
   
    
    
    
   
    
       
   
    
    
    
    
    
    
   
    
    
    
    
    
    
    
   
   
    
    
    
    
    
   
    
    
    
   
    
    
   
    
   
    
    
    
    
   
      
     
    
   
    
    
    
   
     
    
    
     
     
      
    
     
    
      
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
   
    
    
    
     
    
    
    
    
    
    
    
    
    
    
    
    
   
    
    
    
    
    
   
    
    
     
     
    
     
    
    
    
    
    
    
    
    
    
   
    
    
    
    
    
    
    
    
      
      
    
      
    
    
    
   
    
    
    
    
   
    
    
    
    
    
    
      
      
      
    
   
    
    
    
    
    
    
    
    
    
    
     
    
   
    
    
    
    
    
    
    
   
    
   
    
      
     
   
    
    
   
      
   
    
    
    
    
    
     
   
︿  
!  
#  
$  
%  
&  
'  
(  
)  
)÷(1-  
)、  
*  
+  
+ξ  
++  
,  
,   
-  
-β  
--  
-[*]-  
.  
/  
0  
0:2  
1  
1.  
12%  
2  
2.3%  
3  
4  
5  
5:0  
6  
7  
8  
9  
:  
;  
<  
<±  
<Δ  
<λ  
<φ  
<<  
=  
=″  
=☆  
=(  
=-  
=[  
={  
>  
>λ  
?  
@  
A  
LI  
R.L.  
ZXFITL  
[  
[①①]  
[①②]  
[①③]  
[①④]  
[①⑤]  
[①⑥]  
[①⑦]  
[①⑧]  
[①⑨]  
[①A]  
[①B]  
[①C]  
[①D]  
[①E]  
[①]  
[①a]  
[①c]  
[①d]  
[①e]  
[①f]  
[①g]  
[①h]  
[①i]  
[①o]  
[②  
[②①]  
[②②]  
[②③]  
[②④  
[②⑤]  
[②⑥]  
[②⑦]  
[②⑧]  
[②⑩]  
[②B]  
[②G]  
[②]  
[②a]  
[②b]  
[②c]  
[②d]  
[②e]  
[②f]  
[②g]  
[②h]  
[②i]  
[②j]  
[③①]  
[③⑩]  
[③F]  
[③]  
[③a]  
[③b]  
[③c]  
[③d]  
[③e]  
[③g]  
[③h]  
[④]  
[④a]  
[④b]  
[④c]  
[④d]  
[④e]  
[⑤]  
[⑤]]  
[⑤a]  
[⑤b]  
[⑤d]  
[⑤e]  
[⑤f]  
[⑥]  
[⑦]  
[⑧]  
[⑨]  
[⑩]  
[*]  
[-  
[]  
]  
]∧′=[  
][  
_  
a]  
b]  
c]  
e]  
f]  
ng   
{  
{-  
|  
}  
}>  
~  
~±  
~+  
¥