Rewriting gensim's word2vec text similarity matching function (wmdistance)


1. Why rewrite it
In Django, for reasons I never figured out, import gensim fails. That means from gensim.models import Word2Vec is impossible, and model.word2vec_model.wmdistance(sentence1, sentence2) cannot be called. I therefore took the parts of the code that depended on the gensim package and rewrote them, following gensim's original implementation.
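For reference, this is roughly the gensim call that the rewrite replaces; a sketch only, assuming a pre-4.0 gensim and a placeholder model path:

from gensim.models import Word2Vec

model = Word2Vec.load('word2vec.model')  # placeholder path
sentence1 = ['word', 'mover']            # tokenized input sentences
sentence2 = ['distance', 'example']
print(model.wv.wmdistance(sentence1, sentence2))  # Word Mover's Distance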
2. The rewritten code
import pyemd
import cPickle as pickle
import logging
from numpy import zeros, double, sqrt, sum as np_sum

logger = logging.getLogger(__name__)

import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: avoid unicode decode errors


def sentence_distance(document1, document2):
    # Word Mover's Distance between two tokenized documents, re-implemented
    # from gensim's wmdistance so that gensim itself need not be imported.
    len_pre_oov1 = len(document1)
    len_pre_oov2 = len(document2)

    # Map tokens to integer ids, dropping out-of-vocabulary words.
    document1 = [word_index.get(token) for token in document1 if token in word_index]
    document2 = [word_index.get(token) for token in document2 if token in word_index]

    diff1 = len_pre_oov1 - len(document1)
    diff2 = len_pre_oov2 - len(document2)
    if diff1 > 0 or diff2 > 0:
        print('Removed %d and %d OOV words from document 1 and 2 (respectively).'
              % (diff1, diff2))

    if len(document1) == 0 or len(document2) == 0:
        print('At least one of the documents had no words that were '
              'in the vocabulary. Aborting (returning inf).')
        return float('inf')

    # gensim's original: dictionary = Dictionary(documents=[document1, document2])
    # Replaced with a plain dict mapping consecutive ids to the unique tokens.
    unique_tokens = list(set(document1 + document2))
    dictionary = dict(enumerate(unique_tokens))

    vocab_len = len(dictionary)
    if vocab_len == 1:
        # Both documents consist of the same single unique token.
        return 0.0


    docset1 = set(document1)
    docset2 = set(document2)

    # Pairwise Euclidean distances between the word vectors of the two documents.
    # pyemd.emd() expects float64 input, hence dtype=double.
    distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            distance_matrix[i, j] = sqrt(np_sum((embedding[t1] - embedding[t2]) ** 2))

    if np_sum(distance_matrix) == 0.0:
        print('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    def nbow(document):
        # Normalized bag-of-words (nBOW) vector over the shared dictionary.
        d = zeros(vocab_len, dtype=double)
        bow = doc2bow(document, dictionary)  # Word frequencies keyed by dictionary id.
        doc_len = len(document)
        for idx, freq in bow.items():
            d[idx] = float(freq) / float(doc_len)  # Normalized word frequencies.
        return d

    def doc2bow(document, dictionary):
        # Count how often each token occurs in the document.
        freq_dic = dict()
        for token in document:
            freq_dic[token] = freq_dic.get(token, 0) + 1

        # Translate token frequencies to dictionary ids.
        return_freq = dict()
        for key, token in dictionary.items():
            if token in freq_dic:
                return_freq[key] = freq_dic[token]
        return return_freq


    d1 = nbow(document1)
    d2 = nbow(document2)

    # Earth Mover's Distance between the two nBOW histograms is the WMD.
    result = pyemd.emd(d1, d2, distance_matrix)
    print(result)
    return result


if __name__ == "__main__":
    # These two pickles are module-level globals used by sentence_distance().
    # The full paths and the example sentences were elided in the original post.
    embedding = pickle.load(open('/home/.../word2vec_save.pkl'))
    word_index = pickle.load(open('/home/...'))
    sentence_distance('...', '...')
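The code assumes two pickles prepared offline, in an environment where gensim imports cleanly: word_index maps each token to an integer id, and embedding maps each id to its word vector. A minimal sketch of how they might be built (assuming gensim < 4.0 and an already trained Word2Vec model; file names and paths are placeholders, not from the original post):

# build_pickles.py -- run outside Django, where gensim can be imported
import cPickle as pickle
from gensim.models import Word2Vec

model = Word2Vec.load('word2vec.model')  # placeholder path
words = model.wv.index2word  # vocabulary, in index order (gensim < 4.0)
word_index = dict((word, i) for i, word in enumerate(words))  # token -> id
embedding = dict((i, model.wv[word]) for i, word in enumerate(words))  # id -> vector

pickle.dump(embedding, open('word2vec_save.pkl', 'wb'))
pickle.dump(word_index, open('word_index.pkl', 'wb'))

With the vectors flattened into plain dicts like this, the Django process only ever needs numpy, pyemd, and cPickle.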