【データマイニング】ドキュメント分類の素朴ベイズアルゴリズム
2831 ワード
#!/usr/bin/python
import re
import math
def getwords(doc):
    """Extract word features from a document.

    Splits *doc* on runs of non-word characters, keeps tokens of length
    3-19, and lowercases them. Returns a dict mapping each word to 1
    (features are presence flags, not counts).
    """
    # \W+ (one or more), not \W*: the zero-width form also matches the
    # empty string between every pair of characters, so re.split would
    # shred the text into single letters.
    splitter = re.compile(r'\W+')
    words = [s.lower() for s in splitter.split(doc) if 2 < len(s) < 20]
    return {w: 1 for w in words}
# Demo training data: a handful of short documents labeled 'good' or 'bad'.
def sampletrain(cl):
    """Feed the classifier *cl* a small fixed corpus of labeled documents."""
    samples = [
        ('Nobody owns the water.', 'good'),
        ('the quick rabbit jumps fences', 'good'),
        ('buy pharmaceuticals now', 'bad'),
        ('make quick money at the online casino', 'bad'),
        ('the quick brown fox jumps', 'good'),
    ]
    for doc, label in samples:
        cl.train(doc, label)
# Generic base classifier: tracks how often each feature appears in each category.
class classifier:
    """Generic trainable classifier.

    Counts feature/category co-occurrences so subclasses (e.g. naive
    Bayes) can derive conditional probabilities from them.
    """

    def __init__(self, getfeatures, filename=None):
        # fc: feature -> {category: count}
        self.fc = {}
        # cc: category -> number of items trained with that label
        self.cc = {}
        # Callable that extracts a feature dict from an item.
        # Fixed: the parameter was misspelled 'getfeatrues' while the
        # assignment read 'getfeatures', raising NameError on construction.
        self.getfeatures = getfeatures

    def incf(self, f, cat):
        """Increment the count of feature f under category cat."""
        self.fc.setdefault(f, {})
        self.fc[f].setdefault(cat, 0)
        self.fc[f][cat] += 1

    def incc(self, cat):
        """Increment the number of items seen for category cat."""
        self.cc.setdefault(cat, 0)
        self.cc[cat] += 1

    def fcount(self, f, cat):
        """Times feature f has appeared in category cat (0.0 if unseen)."""
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0

    def catcount(self, cat):
        """Number of items trained under category cat (0.0 if unseen)."""
        if cat in self.cc:
            return float(self.cc[cat])
        return 0.0

    def totalcount(self):
        """Total number of items trained across all categories."""
        return sum(self.cc.values())

    def categories(self):
        """All known category labels."""
        return self.cc.keys()

    def train(self, item, cat):
        """Extract features from item and record them under category cat."""
        # Fixed: was self.getfeatrues(item) (typo -> AttributeError).
        features = self.getfeatures(item)
        for f in features:
            self.incf(f, cat)
        self.incc(cat)

    def fprob(self, f, cat):
        """P(feature|category): fraction of cat's items containing f."""
        # Fixed: was self.catcount[cat] — subscripting a bound method
        # raises TypeError; it must be called.
        if self.catcount(cat) == 0:
            return 0
        return self.fcount(f, cat) / self.catcount(cat)

    def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
        """Smoothed probability for feature f in category cat.

        Blends the raw probability prf(f, cat) with an assumed prior ap,
        weighted by how much evidence we have for f overall:
        (weight*ap + totals*basicprob) / (weight + totals).

        Fixed: method was misspelled 'wieghtprob' (the naivebayes caller
        uses 'weightedprob'), and 'prf' followed default parameters,
        which is a SyntaxError; prf now precedes the defaults.
        """
        basicprob = prf(f, cat)
        # Total occurrences of this feature across every category.
        totals = sum(self.fcount(f, c) for c in self.categories())
        return ((weight * ap) + (totals * basicprob)) / (weight + totals)
#
class naivebayes(classifier):
    """Naive Bayes document classifier built on the generic classifier."""

    def docprob(self, item, cat):
        """P(document|category): product of per-feature weighted
        probabilities, under the naive independence assumption."""
        product = 1
        for feature in self.getfeatures(item):
            product *= self.weightedprob(feature, cat, self.fprob)
        return product

    def prob(self, item, cat):
        """Unnormalized P(category|document) via Bayes' rule:
        P(category) * P(document|category)."""
        prior = self.catcount(cat) / self.totalcount()
        return prior * self.docprob(item, cat)
まとめ:
1) sampletrain でラベル付きサンプルから分類器を訓練する
2) 各 item（特徴）のカテゴリ別条件付き確率を計算する
3) ベイズの定理に基づいて doc がカテゴリに属する確率を計算する