# Compute sentence vectors by averaging word vectors.
# (Corpus size: 1914 words.)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys
import codecs  # explicit-encoding file access (Python 2)
import numpy as np
reload(sys)  # Python 2 only: re-expose setdefaultencoding (hidden at interpreter startup)
sys.setdefaultencoding('utf-8')
import gensim
#model = gensim.models.Word2Vec.load("22620491.model")
model = gensim.models.KeyedVectors.load_word2vec_format('news_12g_baidubaike_20g_novel_90g_embedding_64.bin', binary=True)
word_vec = model.wv
del model # free the full model object; only the keyed word vectors are needed below
# Example single-word lookup: word_vec[u'some_word']
#
# Build one sentence vector per corpus line by averaging the 64-dim
# embeddings of its in-vocabulary tokens, append the line's integer
# label columns, and write "<id> <vector+labels>" rows to xlj_vec.txt
# for downstream SVM training.
f = codecs.open("xlj_fenci.txt", 'r', 'utf-8')  # UTF-8 word-segmented corpus
lines = f.readlines()
doc = open('xlj_vec.txt', 'w')
# enumerate instead of lines.index(line): index() is O(n) per call and
# returns the FIRST match, so duplicate lines would report wrong indices.
for idx, line in enumerate(lines):
    if idx % 100 == 0:  # lightweight progress report
        print(idx)
    vec = np.zeros(64)  # running sum of in-vocabulary word vectors
    words = line.split(" ", 1)[1].split(" ")
    count = 0
    for word in words:
        if word != "\r\n":  # skip the line-terminator token
            try:
                vec = word_vec[word] + vec
                count += 1
            except KeyError:  # out-of-vocabulary word: ignore it
                continue
    if count != 0:
        vec = vec / count  # average; stays all-zero if nothing matched
    vec = vec.tolist()  # plain Python list so labels can be appended (SVM input)
    # The first whitespace-delimited field carries the label columns;
    # sub-fields 2..5 are parsed as integers and appended to the vector.
    # NOTE(review): split(" ") on a field that contains no spaces yields a
    # single element — presumably the real delimiter survives in the data
    # file; the original split arguments are preserved verbatim.
    head = line.split(" ", 1)[0].split(" ", 5)
    for col in (2, 3, 4, 5):
        vec.append(int(float(head[col])))
    doc.write(line.split(" ", 2)[0] + " " + str(vec) + "\r\n")
print("end")
f.close()
doc.close()