データマイニングノート-似たような文章を探す-Python


関連する原理・知識は「データマイニングノート-似たような文章を探す-Java」という文章で紹介した。ここでは同じ処理をPython言語で実装した記録である。分詞器には結巴（jieba）分詞のPythonパッケージを使用している。
コード管理:https://github.com/fighting-one-piece/repository-datamining.git
class Doc:
    """A single document: its name, category, token list, feature weights
    (TF-IDF / CHI) and its similarity records against other documents.

    Plain data holder with Java-style accessors.  Attributes are created
    lazily by the setters, so calling a getter before the matching setter
    raises AttributeError.
    """

    def __init__(self, name):
        self._name = name

    # --- identity -----------------------------------------------------
    def setName(self, name):
        self._name = name

    def getName(self):
        return self._name

    def setCategory(self, category):
        self._category = category

    def getCategory(self):
        return self._category

    # --- tokens and feature weights -----------------------------------
    def setWords(self, words):
        self._words = words

    def getWords(self):
        return self._words

    def setTfidfWords(self, tfidfWords):
        self._tfidfWords = tfidfWords

    def getTfidfWords(self):
        return self._tfidfWords

    def getSortedTfidfWords(self):
        """(word, tfidf) pairs sorted by weight, descending.

        NOTE: the sorted list is wrapped in a one-element outer list —
        a quirk kept for compatibility with existing callers.
        """
        ranked = sorted(self._tfidfWords.items(), key=lambda kv: kv[1], reverse=True)
        return [ranked]

    def setCHIWords(self, chiWords):
        self._chiWords = chiWords

    def getCHIWords(self):
        return self._chiWords

    # --- similarity results -------------------------------------------
    def setSimilarities(self, similarities):
        self._similarities = similarities

    def getSimilarities(self):
        return self._similarities
    
class DocSimilarity:
    """Cosine-similarity record for one ordered pair of documents.

    Holds the two document names, the word-count vectors that were
    compared, and the resulting cosine value.  All attributes are set
    lazily through the setters.
    """

    # name of the first document of the pair
    def setName1(self, name1):
        self._name1 = name1

    def getName1(self):
        return self._name1

    # name of the second document of the pair
    def setName2(self, name2):
        self._name2 = name2

    def getName2(self):
        return self._name2

    # word-count vector of the first document
    def setVector1(self, vector1):
        self._vector1 = vector1

    def getVector1(self):
        return self._vector1

    # word-count vector of the second document
    def setVector2(self, vector2):
        self._vector2 = vector2

    def getVector2(self):
        return self._vector2

    # cosine of the angle between the two vectors
    def setCosine(self, cosine):
        self._cosine = cosine

    def getCosine(self):
        return self._cosine
    
        
class DocHelper:
    """Static utilities: build Doc objects from a directory tree and compute
    TF-IDF weights and pairwise cosine similarities over them."""

    @staticmethod
    def genDocs(path):
        """Recursively scan *path* and return a list of Doc objects, one per file."""
        docs = []
        DocHelper.genDocsIterator(path, docs)
        return docs

    @staticmethod
    def genDocsIterator(path, docs):
        """Walk the tree under *path*, appending one Doc per regular file.

        NOTE(review): path parsing assumes Windows separators ('\\');
        the file's parent directory name becomes the category.
        """
        if os.path.isdir(path):
            for subPathName in os.listdir(path):
                DocHelper.genDocsIterator(os.path.join(path, subPathName), docs)
        else:
            # file name without directory and extension
            name = path[path.rfind('\\') + 1 : path.rfind('.')]
            doc = Doc(name)
            doc.setCategory(path.split('\\')[-2])
            doc.setWords(WordUtils.splitFile(path))
            docs.append(doc)

    @staticmethod
    def docHasWord(doc, word):
        """Return True if *word* occurs in the document's word list."""
        return word in doc.getWords()

    @staticmethod
    def docWordsStatistics(doc):
        """Return a {word: occurrence count} dict for the document.

        (Renamed the local from 'map' — it shadowed the builtin.)
        """
        counts = {}
        for word in doc.getWords():
            counts[word] = counts.get(word, 0) + 1
        return counts

    @staticmethod
    def docCategorySplit(docs):
        """Group docs by category: {category: [docs...]}.

        Fixed: dict.has_key() was removed in Python 3; use setdefault instead.
        """
        docSplits = {}
        for doc in docs:
            docSplits.setdefault(doc.getCategory(), []).append(doc)
        return docSplits

    @staticmethod
    def docTopNWords(doc, n):
        """Return the *n* words with the highest TF-IDF weight in *doc*."""
        # sortWordValueMap wraps the sorted pairs in a one-element list.
        sortedWords = DocHelper.sortWordValueMap(doc.getTfidfWords())
        return [pair[0] for ranked in sortedWords for pair in ranked[:n]]

    @staticmethod
    def docWordsVector(doc, words):
        """Return the document's occurrence-count vector over *words* (0 if absent)."""
        docWords = DocHelper.docWordsStatistics(doc)
        return [docWords.get(word, 0) for word in words]

    @staticmethod
    def wordCategorySplit(category, docs):
        """Partition *docs* into (belonging to *category*, not belonging)."""
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            if category == doc.getCategory():
                belongDocs.append(doc)
            else:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs

    @staticmethod
    def wordInDocsStatistics(word, docs):
        """Number of documents that contain *word*.

        (Renamed accumulator from 'sum' — it shadowed the builtin.)
        """
        return sum(1 for doc in docs if DocHelper.docHasWord(doc, word))

    @staticmethod
    def wordNotInDocsStatistics(word, docs):
        """Number of documents that do NOT contain *word*."""
        return sum(1 for doc in docs if not DocHelper.docHasWord(doc, word))

    @staticmethod
    def calculateTFIDF(docs):
        """Compute TF-IDF for every word of every doc; result stored via setTfidfWords.

        The document frequency uses add-one smoothing and is capped at the
        corpus size so the idf log argument never drops below 1.
        """
        docTotalCount = float(len(docs))
        for doc in docs:
            wordTotalCount = len(doc.getWords())
            tfidfWords = {}
            wordCounts = DocHelper.docWordsStatistics(doc)
            for word, wordCount in wordCounts.items():
                tf = float(wordCount) / wordTotalCount
                docCount = DocHelper.wordInDocsStatistics(word, docs) + 1  # +1 smoothing
                docCount = min(docCount, docTotalCount)  # keep idf >= 0
                idf = math.log(docTotalCount / docCount)
                tfidfWords[word] = tf * idf
            doc.setTfidfWords(tfidfWords)

    @staticmethod
    def calculateSimilar(docs):
        """For every ordered pair of docs, build a DocSimilarity record with the
        cosine over the union of both docs' top-20 TF-IDF words; results are
        stored on each doc via setSimilarities."""
        # Hoisted: compute each doc's top-20 words once, not O(n^2) times.
        allTopWords = [DocHelper.docTopNWords(doc, 20) for doc in docs]
        for doc, topWords in zip(docs, allTopWords):
            similarities = []
            for odoc, otopWords in zip(docs, allTopWords):
                words = WordUtils.mergeAndRemoveRepeat(topWords, otopWords)
                v1 = DocHelper.docWordsVector(doc, words)
                v2 = DocHelper.docWordsVector(odoc, words)
                similarity = DocSimilarity()
                similarity.setName1(doc.getName())
                similarity.setName2(odoc.getName())
                similarity.setVector1(v1)
                similarity.setVector2(v2)
                # NOTE(review): DistanceUtils is not defined in this file —
                # presumably imported elsewhere; verify.
                similarity.setCosine(DistanceUtils.cosine(v1, v2))
                similarities.append(similarity)
            doc.setSimilarities(similarities)

    @staticmethod
    def sortWordValueMap(wordValueMap):
        """Return [(word, value) pairs sorted by value, descending] wrapped in a
        single-element outer list — callers iterate the wrapper, so the odd
        shape is kept for compatibility."""
        ranked = sorted(wordValueMap.items(), key=lambda kv: kv[1], reverse=True)
        return [ranked]
import jieba as ws

class WordUtils:
    """Tokenisation helpers built around the jieba segmenter."""

    @staticmethod
    def split(input):
        """Segment *input* with jieba (accurate mode); return the tokens as a list.

        (Parameter name 'input' shadows the builtin but is kept for
        interface compatibility with existing callers.)
        """
        return list(ws.cut(input, cut_all=False))

    @staticmethod
    def splitFile(path):
        """Tokenise every non-blank line of the file at *path* and return the
        tokens with stop words removed.

        Fixed: the file is now closed via a context manager even if
        tokenisation raises.
        """
        words = []
        # NOTE(review): opens with the platform default encoding — confirm
        # the corpus files match it.
        with open(path) as f:
            for line in f:
                line = line.strip()
                if line:
                    words.extend(WordUtils.split(line))
        return WordUtils.removeStopWords(words)

    @staticmethod
    def removeStopWords(words):
        """Return *words* with stop words (loaded from 'stopwords.dic') and
        whitespace-only tokens filtered out.

        Fixed: stop words are held in a set for O(1) membership instead of
        the original O(words x stopwords) nested scan; the dictionary file
        is closed via a context manager.
        """
        with open("stopwords.dic") as f:
            stopwords = set(line.strip() for line in f if line.strip())
        return [word for word in words
                if word not in stopwords and word.strip()]

    @staticmethod
    def mergeAndRemoveRepeat(w1, w2):
        """Union of the two word sequences with duplicates removed.

        Order is unspecified (set iteration order), as in the original.
        """
        return list(set(w1) | set(w2))
def testSimilarity():
    """Smoke test: build docs from a fixed directory, compute TF-IDF and
    pairwise similarities, and print every doc's similarity records.

    Fixed: Python-2-only print statements rewritten in the parenthesized
    form, which behaves identically on Python 2 and 3 here (single
    argument per call).
    """
    # NOTE(review): hard-coded Windows corpus path — parameterize for reuse.
    path = r'D:\resources\chinese'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    DocHelper.calculateSimilar(docs)
    for doc in docs:
        print('----------')
        for similarity in doc.getSimilarities():
            print('%s-%s-%s' % (similarity.getName1(),
                    similarity.getName2(), similarity.getCosine()))