Composing a sentence vector by averaging its word vectors

1914 words

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys
import codecs   # 
import numpy as np

reload(sys)               # Python 2 hack: reload re-exposes setdefaultencoding, which is removed from sys at startup
sys.setdefaultencoding('utf-8')  # force UTF-8 so implicit str<->unicode conversions below don't raise UnicodeDecodeError
import gensim

# Load pre-trained 64-dim Chinese word embeddings in word2vec binary format.
# The commented line below is the alternative loader for a native gensim model file.
#model = gensim.models.Word2Vec.load("22620491.model")
model = gensim.models.KeyedVectors.load_word2vec_format('news_12g_baidubaike_20g_novel_90g_embedding_64.bin', binary=True)
# NOTE(review): .wv on a KeyedVectors object is a deprecated self-alias in some
# gensim versions -- confirm it exists in the pinned gensim release.
word_vec = model.wv
del model     # release the full model object; only the keyed vectors (word_vec) are used below
# Dead example kept from the original source (Python 2 print of one word's vector):
'''
print word_vec[u' ']
'''


# Input: one segmented sentence per line, UTF-8 encoded.
# codecs.open yields unicode lines under Python 2, avoiding manual decoding.
f = codecs.open("xlj_fenci.txt",'r','utf-8')    # codecs.open handles the UTF-8 text file
lines = f.readlines()  # materialize all lines up front; fine for a modest corpus
doc = open('xlj_vec.txt', 'w')  # output: one "<id> <vector+labels>" line per input sentence




# Main loop: for each segmented sentence, average the 64-dim word vectors of
# its in-vocabulary tokens into a sentence vector, append four integer labels
# parsed from the line's header fields, and write "<id> <vector-list>" to the
# output file (one line per sentence, for downstream SVM training).
#
# NOTE(review): the source of this block was mangled by scraping; the exact
# field delimiters (single vs. double space) could not be fully recovered and
# are assumed below -- confirm against a sample line of xlj_fenci.txt.
for line_no, line in enumerate(lines):
    # enumerate replaces lines.index(line), which was O(n) per iteration and
    # returned the wrong index for duplicate lines.
    if line_no % 100 == 0:
        print(line_no)  # progress indicator

    vec_sum = np.zeros(64)  # accumulator; the embedding model is 64-dimensional

    # The header (text before the first double space) carries the sentence id
    # and label fields; the remainder is the space-separated token list.
    # partition() tolerates lines without the delimiter (split("  ",1)[1] raised).
    header, _, body = line.partition("  ")
    words = body.split(" ")

    count = 0  # number of tokens actually found in the vocabulary
    for word in words:
        if word == "\r\n":  # skip the trailing line-terminator token
            continue
        try:
            vec = word_vec[word]
        except KeyError:  # out-of-vocabulary token: ignore it (was a bare except)
            continue
        vec_sum = vec_sum + vec
        count += 1

    if count != 0:
        vec_sum = vec_sum / count  # average of the in-vocabulary word vectors

    out = vec_sum.tolist()  # plain Python list so integer labels can be appended
    fields = header.split(" ", 5)
    # fields[2..5] are four numeric labels; int(float(...)) tolerates "1.0"-style values
    for idx in range(2, 6):
        out.append(int(float(fields[idx])))

    # First header token is the sentence id.
    doc.write(header.split(" ", 1)[0] + " " + str(out) + "\r\n")

print("end")
f.close()
doc.close()