TF-IDF計算Python
1224 ワード
def ComputeFreq(wordlist, text):
    """Compute the relative frequency of each word within a document.

    Args:
        wordlist: iterable of words (strings) to score.
        text: the document, as a single string.

    Returns:
        A list with one dict per entry of ``wordlist``, in input order:
        ``{'word': <word>, 'freq': <occurrences / token count>}``.
        ``freq`` is 0.0 when the document yields no tokens.

    NOTE(review): occurrences are counted with ``str.count`` (non-overlapping
    substring matches on the raw text) while the denominator is the number of
    NLTK tokens, so a word that appears inside longer words is counted too.
    This matches the original behaviour; confirm it is intended.
    """
    if not wordlist:
        # Nothing to score; avoid tokenising the document for no reason.
        return []

    # The token count depends only on the document, so compute it once
    # instead of re-tokenising for every word.
    token_total = len(nltk.word_tokenize(text))

    result = []
    for word in wordlist:
        occurrences = text.count(word)
        if token_total:
            # float() forces true division; plain int / int truncates to an
            # integer under Python 2 and would collapse most values to 0.
            freq = float(occurrences) / token_total
        else:
            # Empty document: define the frequency as 0 rather than raising
            # ZeroDivisionError.
            freq = 0.0
        result.append({'word': word, 'freq': freq})
    return result
def Computetfidf(wordfreq, corpus):
    """Score words by TF-IDF against a corpus, highest score first.

    For each word, ``idf = log10(N / (1 + df))`` where ``N`` is the number of
    documents in ``corpus`` and ``df`` is the number of documents that contain
    the word as a substring. The score is ``tf * idf``.

    Args:
        wordfreq: iterable of dicts with keys ``'word'`` and ``'freq'``
            (the shape produced by ``ComputeFreq``).
        corpus: sized collection of documents, each a string.

    Returns:
        A list of ``{'word': <word>, 'tfidf': <score>}`` dicts sorted by
        descending ``tfidf``; entries with equal scores keep their input
        order. With an empty corpus every score is 0.0.
    """
    doc_total = len(corpus)  # invariant across words, so computed once

    result = []
    for item in wordfreq:
        word = item['word']
        # Counting starts at 1 so the divisor below can never be zero, even
        # for a word that occurs in no document.
        doc_hits = 1 + sum(1 for line in corpus if word in line)
        if doc_total:
            # float() forces true division; integer division would floor the
            # ratio to 0 whenever doc_hits > doc_total and make log10 raise.
            idf = math.log10(float(doc_total) / doc_hits)
        else:
            # No documents: log10(0) is undefined, so treat idf as 0.
            idf = 0.0
        result.append({'word': word, 'tfidf': item['freq'] * idf})

    # key/reverse works on both Python 2 and 3 (the cmp-function form of
    # list.sort was removed in Python 3) and remains a stable sort.
    result.sort(key=lambda entry: entry['tfidf'], reverse=True)
    return result
最初の関数:wordlist内の各wordについて、text内での出現頻度(TF)を計算します。
wordlistはwordのlist、textは対応するdocumentで、Pythonのstring形式です。
2番目の関数:各wordのコーパス内でのTF-IDFを計算します。
wordfreqは最初の関数の出力結果であり、corpusはdocument(string)のlistです。