# Python: extracting TF and IDF values from text
#
# 1722 words

"""Python implementation of TF-IDF."""

import math
from collections import Counter

# 1. Example corpus: each string is one document.
corpus = [
    'this is the first document',
    'this is the second second document',
    'and the third one',
    'is this the first document',
]

# 2. Tokenize every document into a list of words.
# split(' ') splits on single spaces only, so consecutive spaces would
# produce empty-string tokens; the corpus above has none.
word_list = [document.split(' ') for document in corpus]
print('2-->', word_list)

# 3. Count how often each word occurs inside each document
# (one Counter per document, in corpus order).
countlist = [Counter(words) for words in word_list]
print('3  -->', countlist)


# 4. Building blocks of the TF-IDF computation.
def tf(word, count):
    """Return the term frequency of ``word`` within a single document.

    Args:
        word: The token to look up.
        count: Mapping of token -> number of occurrences in the document
            (a ``Counter`` or a plain ``dict``).

    Returns:
        The occurrences of ``word`` divided by the total number of token
        occurrences in ``count``, or ``0.0`` when that total is zero.
    """
    total = sum(count.values())
    if not total:
        # An empty document would otherwise raise ZeroDivisionError.
        return 0.0
    # .get() keeps plain dicts working for words they do not contain
    # (a Counter already yields 0 for missing keys).
    return count.get(word, 0) / total


# Document-frequency helper.
def n_containing(word, count_list):
    """Return how many entries of ``count_list`` contain ``word``.

    Args:
        word: The token to look for.
        count_list: Iterable of per-document containers (e.g. ``Counter``
            objects); membership is tested with ``in``.

    Returns:
        The number of entries for which ``word in entry`` is true.
    """
    matches = 0
    for doc_counts in count_list:
        if word in doc_counts:
            matches += 1
    return matches


# Inverse document frequency: len(count_list) is the number of documents,
# n_containing(word, count_list) the number of documents containing the word;
# the +1 in the denominator avoids a division by zero.
def idf(word, count_list):
    """Return the inverse document frequency of ``word``.

    Computes ``log(N / (1 + df))`` where ``N`` is ``len(count_list)`` and
    ``df`` is ``n_containing(word, count_list)``.

    Because of the ``+ 1`` smoothing, a word that occurs in every document
    yields ``log(N / (N + 1))``, which is negative.

    Args:
        word: The token to score.
        count_list: Sized collection of per-document word counts.

    Returns:
        The natural logarithm described above, as a float.

    Raises:
        ValueError: If ``count_list`` is empty. Previously this surfaced
            as an opaque "math domain error" from ``math.log(0.0)``.
    """
    total_docs = len(count_list)
    if total_docs == 0:
        raise ValueError('idf() requires a non-empty count_list')
    doc_freq = n_containing(word, count_list)
    return math.log(total_docs / (1 + doc_freq))


# TF-IDF score: term frequency weighted by inverse document frequency.
def tfidf(word, count, count_list):
    """Return the TF-IDF score of ``word``.

    Args:
        word: The token to score.
        count: Word counts of the document being scored (passed to ``tf``).
        count_list: Word counts of every document (passed to ``idf``).

    Returns:
        ``tf(word, count) * idf(word, count_list)``.
    """
    term_frequency = tf(word, count)
    inverse_document_frequency = idf(word, count_list)
    return term_frequency * inverse_document_frequency


# Merge the per-document counters into corpus-wide totals.
# A plain dict (not a Counter) is kept so the printed representation and the
# first-seen key order stay the same.
all_dict = {}
for doc_counts in countlist:
    for word, occurrences in doc_counts.items():
        # dict.get replaces the former bare ``except:``, which would have
        # swallowed any exception, not only the expected missing key.
        all_dict[word] = all_dict.get(word, 0) + occurrences
print('merge-->', all_dict)

# Write one "<word> <value>" line per distinct word:
#   tf.txt  - the corpus-wide occurrence count from all_dict
#             (the raw count, not the normalized tf() ratio)
#   idf.txt - the inverse document frequency over all documents
with open('tf.txt', 'w+') as tfin, open('idf.txt', 'w+') as idfin:
    for k in all_dict:
        tfin.write(k + ' ' + str(all_dict[k]) + '\n')
        k_idf = idf(k, countlist)
        idfin.write(k + ' ' + str(k_idf) + '\n')