ベイズアルゴリズムの例

21161 ワード

#-*- coding: utf-8 -*- #      
from numpy import *

# Sample data for the toy example
# Build the hand-written posts together with their class labels
def loadDataSet():
    """Return the hand-written toy corpus and its class labels.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        tokenized posts (each a list of words); ``classVec`` is a list with
        one integer label per post, in the same order.
    """
    # Each entry pairs a label with the post it describes.  Label 1 is the
    # class whose prior the training functions in this file name pAbusive.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    # Split the pairs back into the two parallel lists callers expect.
    postingList = [words for _, words in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

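# Build the vocabulary
# Input : dataSet - list of tokenized documents (each a list of words)
# Output: list of the distinct words found across all documents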
def createVocabList(dataSet):
    """Collect every distinct word that occurs in any document.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word exactly once.  The order comes
        from set iteration and is therefore not specified.
    """
    # set.union takes any number of iterables, so a single call merges all
    # documents; with no documents it simply yields the empty set.
    return list(set().union(*dataSet))

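#*** Set-of-words model: mark which vocabulary words occur in a document
#vocabList: vocabulary list
#inputSet : words of the document to encode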
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` integers, 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere.  Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Index the vocabulary once so each lookup is a dict hit instead of the
    # two linear scans ("in" + ".index") the list would need per word.
    # setdefault keeps the first position of a repeated word, which is what
    # list.index would have returned.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            # Single-argument call form: valid as both the Python 2 print
            # statement and the Python 3 print function.
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

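#==== Naive Bayes training, plain version: raw frequency ratios, no smoothing =====
#  trainMatrix: document vectors, one row per training document
#  trainCategory: class label of each training document
#  p0Vect: per-word frequencies over the documents whose label is not 1
#  p1Vect: per-word frequencies over the documents whose label is 1
#  pAbusive: sum of the labels divided by the number of documents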
def trainNB00(trainMatrix, trainCategory):
    """Estimate per-word frequencies for each class without any smoothing.

    Args:
        trainMatrix: sequence of equal-length word vectors, one per document.
        trainCategory: sequence of labels, one per document; a document
            counts towards class 1 when its label equals 1 and towards
            class 0 otherwise.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``.  ``p0Vect`` / ``p1Vect`` are arrays
        holding, per vocabulary position, the class's summed word counts
        divided by the total of all counts in that class.  ``pAbusive`` is
        the sum of the labels divided by the number of documents.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Split the documents by class with a boolean mask instead of walking
    # them one at a time; everything not labelled 1 falls into class 0.
    docs = array(trainMatrix)
    isClass1 = array(trainCategory) == 1
    class1Docs = docs[isClass1]
    class0Docs = docs[~isClass1]

    # Numerators: per-word totals (kept as floats by adding onto zeros).
    # Denominators: total of all word counts seen in the class.
    p1Num = zeros(numWords) + class1Docs.sum(axis=0)
    p0Num = zeros(numWords) + class0Docs.sum(axis=0)
    p1Denom = class1Docs.sum()
    p0Denom = class0Docs.sum()

    p1Vect = p1Num / p1Denom
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive


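#===== Naive Bayes training, smoothed version: counts start at 1, denominators at 2.0, logs returned =====
#  trainMatrix: document vectors, one row per training document
#  trainCategory: class label of each training document
#  p0Vect: log of the smoothed per-word frequencies over documents whose label is not 1
#  p1Vect: log of the smoothed per-word frequencies over documents whose label is 1
#  pAbusive: sum of the labels divided by the number of documents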
def trainNB0(trainMatrix, trainCategory):
    """Estimate smoothed per-word log-frequencies for each class.

    Word counts start at 1 and the per-class totals at 2.0, so no word ends
    up with a zero frequency; the logarithm is returned so that a classifier
    can add scores instead of multiplying many small numbers.

    Args:
        trainMatrix: sequence of equal-length word vectors, one per document.
        trainCategory: sequence of labels, one per document; a document
            counts towards class 1 when its label equals 1 and towards
            class 0 otherwise.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``.  ``p0Vect`` / ``p1Vect`` are arrays
        of ``log((1 + word count) / (2.0 + total count))`` for class 0 and
        class 1.  ``pAbusive`` is the sum of the labels divided by the
        number of documents.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Boolean mask selects the class-1 rows; its complement is class 0.
    docs = array(trainMatrix)
    isClass1 = array(trainCategory) == 1
    class1Docs = docs[isClass1]
    class0Docs = docs[~isClass1]

    # Smoothed numerators (start from ones) and denominators (start at 2.0).
    p1Num = ones(numWords) + class1Docs.sum(axis=0)
    p0Num = ones(numWords) + class0Docs.sum(axis=0)
    p1Denom = 2.0 + class1Docs.sum()
    p0Denom = 2.0 + class0Docs.sum()

    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

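#vec2Classify: word vector of the document to classify
#p0Vect: per-word log-probabilities for class 0
#p1Vect: per-word log-probabilities for class 1
#pClass1: prior probability of class 1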
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    """Pick the class with the larger naive Bayes log-score.

    Args:
        vec2Classify: word vector of the document, as an array.
        p0Vect: per-word log-probabilities for class 0.
        p1Vect: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    # Score = log prior + sum of the log-probabilities of the words present
    # (element-wise product with the word vector weights each word).
    scoreClass1 = log(pClass1) + sum(vec2Classify * p1Vect)
    scoreClass0 = log(1.0 - pClass1) + sum(vec2Classify * p0Vect)
    return 1 if scoreClass1 > scoreClass0 else 0

def testingNB():
    """Train on the toy corpus and print the predicted class of two samples.

    Builds the vocabulary and set-of-words training matrix from
    ``loadDataSet()``, trains with ``trainNB0`` and classifies two fixed
    word lists, printing one line per sample.  Returns nothing.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)

    # One presence vector per training post.
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    # The same encode -> classify -> report steps run for every sample.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        # Pre-format into one string so the line is identical whether print
        # is the Python 2 statement or the Python 3 function (the original
        # comma-separated statement put one space between the items).
        print('%s %s %s' % (testEntry, 'classified as: ', predicted))



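#*** Bag-of-words model: count how often each vocabulary word occurs in a document
#vocabList: vocabulary list
#inputSet : words of the document to encode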
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a vector of word counts over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document (repeats count).

    Returns:
        A list of ``len(vocabList)`` integers giving how many times each
        vocabulary word occurs in ``inputSet``.  Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Index the vocabulary once so each lookup is a dict hit instead of the
    # two linear scans ("in" + ".index") the list would need per word.
    # setdefault keeps the first position of a repeated word, which is what
    # list.index would have returned.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            # Single-argument call form: valid as both the Python 2 print
            # statement and the Python 3 print function.
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] += 1
    return returnVec


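# Text parsing: split a string into word tokens
# Tokens of two characters or fewer are dropped; the rest are lower-cased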
def textParse(bigString):
    """Split text into lower-cased word tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of the tokens, lower-cased, in order of appearance, keeping
        only those with more than two characters.
    """
    import re
    # Split on runs of one or more non-word characters.  The earlier
    # pattern r'\W*' can match the empty string; since Python 3.7 re.split
    # honours such matches and cuts between every character, which left no
    # token long enough to survive the length filter.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest(emailDir='D:/machine learning/python/bayes/email'):
    """Hold-out test of the classifier on the spam / ham e-mail files.

    Reads ``<emailDir>/spam/1.txt`` .. ``25.txt`` (label 1) and
    ``<emailDir>/ham/1.txt`` .. ``25.txt`` (label 0), sets 10 randomly
    chosen documents aside as the test set, trains ``trainNB0`` on the rest
    using bag-of-words vectors, and prints every misclassified document
    followed by the error rate.  Returns nothing.

    Args:
        emailDir: directory containing the ``spam`` and ``ham`` folders.
            The default reproduces the paths that were previously
            hard-coded.

    Raises:
        IOError / OSError: if one of the e-mail files cannot be opened.
    """
    docList = []    # tokenized e-mails
    classList = []  # label of each e-mail, parallel to docList
    for i in range(1, 26):
        # Same order as before: spam i first, then ham i.
        for folder, label in (('spam', 1), ('ham', 0)):
            # "with" closes the file even if reading or parsing fails.
            with open('%s/%s/%d.txt' % (emailDir, folder, i)) as emailFile:
                wordList = textParse(emailFile.read())
            docList.append(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)

    # Indices of all documents.  A real list is required because entries
    # are deleted below (a Python 3 range object does not support that).
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # "random" is the name supplied by the file's "from numpy import *";
        # uniform(0, n) is truncated to an index into the remaining list.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Training matrix and labels for the documents that were not held out.
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # Pre-formatted so the output is the same under the Python 2
            # print statement and the Python 3 print function.
            print('%s %s' % ("classification error", docList[docIndex]))

    print('%s %s' % ('the error rate is: ', float(errorCount) / len(testSet)))

if __name__ == "__main__":
    # Every print below passes one pre-formatted string, so the output is
    # the same whether print is the Python 2 statement or the Python 3
    # function ('%s %s' reproduces the single space the comma-separated
    # statement inserted between items).

    # --- Part 1: build the vocabulary, encode the posts, train -----------
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    print('%s %s' % ('myVocabList=', myVocabList))
    print('%s %s' % ('result=', setOfWords2Vec(myVocabList, listOPosts[0])))
    trainMat = []
    for postinDoc in listOPosts:
        # One presence vector per post.
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0Vect, p1Vect, pAbusive = trainNB0(trainMat, listClasses)
    print('p0Vect=')
    print(p0Vect)
    print('p1Vect=')
    print(p1Vect)
    print('pAbusive=')
    print(pAbusive)
    print('trainMatrix=')
    print(trainMat)
    print('%s %s' % ('listClasses=', listClasses))

    # --- Part 2: classify the two sample posts ----------------------------
    print('===================================')
    testingNB()

    # --- Part 3: spam filtering on the e-mail files -----------------------
    print('=======spam filtering=============')
    spamTest()