TF-IDF計算Python

1224 ワード

def ComputeFreq(wordlist, text):
    """Compute the relative frequency of each word within one document.

    Args:
        wordlist: Iterable of words (strings) to score.
        text: The document, as a single string.

    Returns:
        A list of dicts ``{'word': word, 'freq': freq}``, one per entry of
        ``wordlist`` and in the same order. ``freq`` is the number of
        non-overlapping substring occurrences of the word in ``text``
        (``str.count``) divided by the number of tokens returned by
        ``nltk.word_tokenize(text)``. It is 0.0 when the text yields no
        tokens.
    """
    words = list(wordlist)
    if not words:
        # Nothing to score, so skip tokenizing the text altogether.
        return []

    # The token count depends only on the text, so compute it once instead
    # of re-tokenizing the whole document for every word.
    token_count = len(nltk.word_tokenize(text))

    result = []
    for word in words:
        occurrences = text.count(word)
        # float() forces true division (int / int truncates to 0 under
        # Python 2); the guard avoids ZeroDivisionError on token-less text.
        freq = float(occurrences) / token_count if token_count else 0.0
        result.append({'word': word, 'freq': freq})
    return result

def Computetfidf(wordfreq, corpus):
    """Compute TF-IDF scores for words against a corpus of documents.

    Args:
        wordfreq: Iterable of dicts with keys ``'word'`` (string) and
            ``'freq'`` (term frequency), e.g. the output of ComputeFreq.
        corpus: Sized collection of document strings. It is scanned once
            per word.

    Returns:
        A list of dicts ``{'word': word, 'tfidf': score}`` sorted by score
        in descending order; entries with equal scores keep their input
        order. The score is ``tf * log10(N / (1 + df))`` where ``N`` is
        ``len(corpus)`` and ``df`` is the number of documents that contain
        the word as a substring.

    Raises:
        ValueError: If ``corpus`` is empty while ``wordfreq`` is not,
            because ``log10(0)`` is undefined.
    """
    doc_total = len(corpus)  # loop-invariant, so computed once

    result = []
    for item in wordfreq:
        word = item['word']
        tf = item['freq']
        # Starting at 1 smooths the denominator so a word absent from every
        # document does not cause a division by zero.
        doc_hits = 1 + sum(1 for line in corpus if word in line)
        # float() forces true division; under Python 2, int / int would
        # truncate the ratio before the logarithm is taken.
        idf = math.log10(float(doc_total) / doc_hits)
        result.append({'word': word, 'tfidf': tf * idf})

    # key= with reverse=True replaces the cmp-style comparator, which
    # Python 3 no longer accepts; the sort stays stable for equal scores.
    result.sort(key=lambda entry: entry['tfidf'], reverse=True)
    return result

最初の関数:各wordのtext内での出現頻度(TF、単語頻度)を計算します。
wordlistは単語(word)のlist、textは対応するdocumentで、Pythonのstring形式です。
2番目の関数:各wordのコーパス内でのTF-IDFを計算します。
wordfreqは最初の関数の出力結果であり、corpusはdocument(string)を格納したlistです。