Naive Bayes Algorithm Example
# -*- coding: utf-8 -*-
from numpy import array, ones, zeros, log
import random

# Build the toy training set used throughout this example.
# Returns the tokenized posts and their class labels.
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Class labels: 1 = abusive post, 0 = normal post
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
# Build the vocabulary: all unique words that appear in the data set.
# Input:  dataSet - a list of tokenized documents
# Output: a list of the unique words across all documents
def createVocabList(dataSet):
    # Start from an empty set so duplicates are discarded automatically.
    vocabSet = set([])
    for document in dataSet:
        # Union the words of this document into the running vocabulary.
        vocabSet = vocabSet | set(document)
    return list(vocabSet)
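# Illustration (added; not part of the original listing): for the two
# documents ['my', 'dog'] and ['my', 'cat'], the running union gives the
# vocabulary {'my', 'dog', 'cat'} -- each unique word appears exactly once.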
# *** Set-of-words model: convert a document into a 0/1 vector over the vocabulary.
# vocabList: the vocabulary returned by createVocabList()
# inputSet : the tokenized document to convert
def setOfWords2Vec(vocabList, inputSet):
    # Start with a vector of zeros, one slot per vocabulary word.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # Mark the word as present; duplicates still map to 1.
            returnVec[vocabList.index(word)] = 1
        else:
            # Words outside the vocabulary are ignored (their slot stays 0).
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
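# Quick check (added; assumes the function above): with the vocabulary
# ['dog', 'my', 'stupid'], the input ['my', 'dog', 'my'] yields [1, 1, 0].
# The second 'my' changes nothing -- the set-of-words model records only
# presence, not frequency (contrast with bagOfWords2VecMN below).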
# ==== Naive version of the trainer (kept for comparison; no smoothing) ====
# trainMatrix:   matrix of document word vectors
# trainCategory: label vector (1 = abusive, 0 = normal)
# p0Vect:   p(word | class 0) for every vocabulary word
# p1Vect:   p(word | class 1) for every vocabulary word
# pAbusive: prior probability p(class 1)
def trainNB00(trainMatrix, trainCategory):
    # Number of training documents.
    numTrainDocs = len(trainMatrix)
    # Vocabulary size (length of each document vector).
    numWords = len(trainMatrix[0])
    # Prior: fraction of documents labeled abusive.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Numerators and denominators for p(word | class) = pXNum / pXDenom,
    # all initialized to zero.
    p0Num = zeros(numWords)
    p1Num = zeros(numWords)
    p0Denom = 0.0
    p1Denom = 0.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # Accumulate word counts and total word count for the abusive class.
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            # Accumulate word counts and total word count for the normal class.
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Per-word conditional probabilities.
    p1Vect = p1Num / p1Denom
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive
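# Why trainNB00 is not used in practice (note added, not in the original):
# any word that never appears in a class gets probability 0, and since the
# classifier multiplies the per-word probabilities together, a single zero
# wipes out the whole product. trainNB0 below fixes this with Laplace
# smoothing and log probabilities.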
# ===== Trainer with Laplace smoothing and log probabilities =====
# trainMatrix:   matrix of document word vectors
# trainCategory: label vector (1 = abusive, 0 = normal)
# p0Vect:   log p(word | class 0) for every vocabulary word
# p1Vect:   log p(word | class 1) for every vocabulary word
# pAbusive: prior probability p(class 1)
def trainNB0(trainMatrix, trainCategory):
    # Number of training documents.
    numTrainDocs = len(trainMatrix)
    # Vocabulary size.
    numWords = len(trainMatrix[0])
    # Prior: fraction of documents labeled abusive.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start every word count at 1 and every denominator
    # at 2 so that no conditional probability can ever be zero.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # Accumulate counts for the abusive class.
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            # Accumulate counts for the normal class.
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs so that multiplying many small p(word | class) terms becomes
    # a sum, which avoids floating-point underflow.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
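# Worked example of the smoothing (added; numbers are illustrative only):
# if 'stupid' occurs 3 times among 19 total words of class 1, then
#     p('stupid' | class 1) = (3 + 1) / (19 + 2) = 4/21 ~= 0.19
# and a word never seen in class 1 gets 1/21 instead of 0.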
# Classify one document vector with the trained model.
# vec2Classify: word vector of the document to classify
# p0Vect:       log p(word | class 0)
# p1Vect:       log p(word | class 1)
# pClass1:      prior probability of class 1
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    # Log posterior (up to a shared constant) for each class:
    # the sum of the log word probabilities plus the log prior.
    p1 = sum(vec2Classify * p1Vect) + log(pClass1)
    p0 = sum(vec2Classify * p0Vect) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
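# Decision rule in words (note added, not in the original): because log is
# monotonic, comparing log p(c) + sum(log p(w|c)) is equivalent to comparing
# the unnormalized posteriors p(c) * prod(p(w|c)); the shared evidence term
# p(document) cancels out and never needs to be computed.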
# Convenience test of the classifier on the toy data set.
def testingNB():
    # Load the training posts and their labels.
    listOPosts, listClasses = loadDataSet()
    # Build the vocabulary from all posts.
    myVocabList = createVocabList(listOPosts)
    # Convert every post into a word vector.
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    # Train the model.
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # === Test document (1): expected class 0 (normal)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    # === Test document (2): expected class 1 (abusive)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
# *** Bag-of-words model: like setOfWords2Vec, but counts occurrences.
# vocabList: the vocabulary
# inputSet : the tokenized document to convert
def bagOfWords2VecMN(vocabList, inputSet):
    # Start with a vector of zeros, one slot per vocabulary word.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # Increment the count each time the word appears.
            returnVec[vocabList.index(word)] += 1
        else:
            # Words outside the vocabulary are ignored (their slot stays 0).
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
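# Contrast with the set-of-words model (added; same example as above):
# with the vocabulary ['dog', 'my', 'stupid'], the input ['my', 'dog', 'my']
# now yields [1, 2, 0] -- the repeated 'my' is counted twice.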
# Tokenize a raw email string: split on non-word characters, lowercase,
# and drop tokens of two characters or fewer.
def textParse(bigString):
    import re
    # Note: the original pattern was r'\W*', which can match the empty
    # string; r'\W+' is the standard fix and is required on Python 3.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
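# Example (added; not in the original): textParse('Hi, BUY viagra now!!')
# returns ['buy', 'viagra', 'now'] -- 'Hi' is dropped by the length filter.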
# Spam filter test: train on 40 randomly chosen emails, test on the other 10.
def spamTest():
    docList = []     # tokenized documents
    classList = []   # labels (1 = spam, 0 = ham)
    fullText = []    # all tokens in order (kept for optional frequency analysis)
    for i in range(1, 26):
        # Read one spam email (adjust the path to your local copy of the data).
        wordList = textParse(open('D:/machine learning/python/bayes/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # Read one ham email.
        wordList = textParse(open('D:/machine learning/python/bayes/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # Build the vocabulary over all 50 emails.
    vocabList = createVocabList(docList)
    # Indices 0..49; list() is needed on Python 3, where range() is lazy.
    trainingSet = list(range(50))
    testSet = []
    # Hold out 10 emails at random for testing.
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        # Remove it from the training set so train and test stay disjoint.
        del trainingSet[randIndex]
    # Build the training matrix and label list from the remaining 40 emails.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # Train the model.
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    # Count misclassified test emails.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # Show which document was misclassified.
            print("classification error", docList[docIndex])
    # Error rate = misclassified / test set size.
    print('the error rate is: ', float(errorCount) / len(testSet))
    #return vocabList,fullText
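# Note on variance (added; not in the original): because the 10 test emails
# are drawn at random, the printed error rate changes from run to run. A
# common refinement is to run the test several times and average, e.g.:
#
#   for _ in range(10):
#       spamTest()
#
# or to modify spamTest() to return float(errorCount)/len(testSet) and
# average the returned values.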
if __name__ == "__main__":
    # ********************** Toy data set demo: start **********************
    # Load the posts and labels, then build the vocabulary.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    print('myVocabList=', myVocabList)
    print('result=', setOfWords2Vec(myVocabList, listOPosts[0]))
    # Convert every post to a word vector and train the model.
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0Vect, p1Vect, pAbusive = trainNB0(trainMat, listClasses)
    print('p0Vect=')
    print(p0Vect)
    print('p1Vect=')
    print(p1Vect)
    print('pAbusive=')
    print(pAbusive)
    print('trainMatrix=')
    print(trainMat)
    print('listClasses=', listClasses)
    # ********************** Toy data set demo: end ************************
    # Classify the two test documents.
    print('===================================')
    testingNB()
    # Spam filtering test on the email corpus.
    print('=======spam filtering=============')
    spamTest()