『統計学習方法』第4章 ナイーブベイズ法(ベイズ推定)— 例題4.2のコード実装
10995 ワード
『統計学習方法』第4章 ナイーブベイズ法のベイズ推定、例題4.2のコード実装です(最尤推定のアルゴリズムが必要な場合は、別の記事を参照してください: http://blog.csdn.net/grinandbearit/article/details/79044065 )。ベイズ推定のアルゴリズムは最尤推定よりやや複雑です。要点は、分子と分母の両方を補正(平滑化)して、確率の積が0になってしまう現象を防ぐことです。
I know I can because I have a heart that beats
#-*- coding:utf-8 -*-
from numpy import *
def loadDataSet():
    """Return the 15 training samples and their class labels.

    Each sample is a two-element list ``[x1, x2]`` whose entries are both
    strings: the first feature is one of '1', '2', '3' and the second one
    of 'S', 'M', 'L'.  The numeric feature is stored as text so that both
    features share a single element type.

    Returns:
        (samples, labels): ``samples`` is a list of 15 ``[str, str]`` rows
        and ``labels`` is the parallel list of class labels (1 or -1).
    """
    first_feature = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
    second_feature = ['S', 'M', 'M', 'S', 'S', 'S', 'M', 'M',
                      'L', 'L', 'L', 'M', 'M', 'L', 'L']
    labels = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    # Pair the two feature columns row by row, turning the integer feature
    # into its string form.
    samples = [[str(x1), x2] for x1, x2 in zip(first_feature, second_feature)]
    return samples, labels
def calc_label(labels, alpha):
    """Compute the smoothed prior probability of every class label.

    For each distinct label ``c`` the prior is
    ``(labels.count(c) + alpha) / (len(labels) + K * alpha)``
    where ``K`` is the number of distinct labels.

    Args:
        labels: list of class labels, one per training sample.
        alpha: smoothing constant added to every class count.

    Returns:
        (labelRate, uniqueLabel): a dict mapping each label to its prior,
        and the distinct labels as a list (converted from a set, so the
        order is whatever the set yields).
    """
    distinct = set(labels)
    total = len(labels)
    class_count = len(distinct)
    priors = {
        tag: (labels.count(tag) + alpha) / float(total + class_count * alpha)
        for tag in distinct
    }
    return priors, list(distinct)
def calcVocaulary(dataset):
    """Collect every distinct feature value that occurs in ``dataset``.

    Args:
        dataset: iterable of samples, each an iterable of hashable values.

    Returns:
        A list of the distinct values found across all samples.  The list
        is built from a set, so its order is not meaningful; callers use
        it only as a value -> position lookup table.
    """
    seen = set()
    for row in dataset:
        seen |= set(row)
    return list(seen)
def calcVector(voca, vector):
    """Turn one sample into a count vector aligned with ``voca``.

    Args:
        voca: list of known values; position ``i`` of the result belongs
            to ``voca[i]``.
        vector: the sample, an iterable of values.

    Returns:
        A numpy float array of length ``len(voca)`` in which each entry is
        the number of times the corresponding value occurs in ``vector``.
        Values that are not in ``voca`` are ignored.
    """
    counts = zeros(len(voca))
    for token in vector:
        if token not in voca:
            continue
        counts[voca.index(token)] += 1
    # Hand back a fresh array object rather than the working buffer.
    return array(counts)
def calcUniqueValueNum(dataset, labels, label, voca):
    """Return, for every vocabulary entry, the size S_j of its feature's value set.

    In the smoothed estimate
    ``P(X_j = a | Y = c) = (count + alpha) / (N_c + S_j * alpha)``
    ``S_j`` is the number of distinct values feature ``j`` can take.  It
    must be counted over the whole training set, not only over the samples
    of class ``c``.  Otherwise a value that never occurs in class ``c``
    gets no smoothing term in its denominator, and the conditional
    probabilities of feature ``j`` no longer sum to one.

    Args:
        dataset: list of samples; every sample lists its feature values in
            the same column order.
        labels: class label of every sample.  Kept for interface
            compatibility; S_j does not depend on the class.
        label: the class being estimated.  Kept for interface
            compatibility; unused for the same reason.
        voca: list of all known feature values; fixes the output order.

    Returns:
        A numpy float array of length ``len(voca)`` whose ``i``-th entry is
        the number of distinct values of the feature column that
        ``voca[i]`` occurs in (0.0 if ``voca[i]`` does not occur in
        ``dataset`` at all).
    """
    value_space_size = {}
    # zip(*dataset) walks the samples column by column, one column per
    # feature; an empty dataset simply yields no columns.
    for column in zip(*dataset):
        distinct = set(column)
        for value in distinct:
            # NOTE: values are keyed without their column, so a value that
            # occurs in two different feature columns keeps the size of
            # the later column (the vocabulary itself has this limitation).
            value_space_size[value] = len(distinct)

    sizes = zeros(len(voca))
    for value, size in value_space_size.items():
        if value in voca:
            sizes[voca.index(value)] = float(size)
    return sizes
def Bayes(dataset, labels, uniqueLabel, voca, alpha):
    """Estimate the smoothed conditional probabilities for every class.

    For each label in ``uniqueLabel`` the count vectors (see
    ``calcVector``) of all samples carrying that label are summed on top
    of an initial ``alpha`` per vocabulary entry.  The sum is then divided
    element-wise by ``labels.count(label)`` plus ``alpha`` times the
    per-entry value returned by ``calcUniqueValueNum``.

    Args:
        dataset: list of training samples.
        labels: class label of every sample, parallel to ``dataset``.
        uniqueLabel: list of the distinct class labels.
        voca: list of all known feature values; fixes the vector layout.
        alpha: smoothing constant.

    Returns:
        A dict mapping each label to a numpy array of length
        ``len(voca)`` holding the estimated probability of every
        vocabulary entry given that label.
    """
    model = {}
    for tag in uniqueLabel:
        # Start every entry at alpha so that unseen values never yield 0.
        numerators = ones(len(voca)) * alpha
        for idx, sample in enumerate(dataset):
            if labels[idx] == tag:
                numerators += calcVector(voca, sample)
        smoothing = calcUniqueValueNum(dataset, labels, tag, voca) * alpha
        denominators = labels.count(tag) + smoothing
        model[tag] = numerators / denominators
    return model
# ,
def testFunction(testArray,voca,trainVecDict,labelRate):
result = -1;maxRate = -inf
for key in trainVecDict:
singleLabelRate=1.0
for word in testArray:
singleLabelRate *= trainVecDict[key][voca.index(word)] #
if singleLabelRate*labelRate[key] > maxRate:
result = key;maxRate =singleLabelRate*labelRate[key]
return result
# Train on the built-in sample data with smoothing constant alpha = 1,
# then classify the query sample (2, 'S').
dataSet, labels = loadDataSet()
labelRate, uniqueLabel = calc_label(labels, 1)
voca = calcVocaulary(dataSet)
trainVecDict = Bayes(dataSet, labels, uniqueLabel, voca, 1)
# array() stores the mixed int/str pair as strings, which matches the
# string-valued entries of the vocabulary.
testArray = array([2, 'S'])
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(testFunction(testArray, voca, trainVecDict, labelRate))
print(labelRate)
print(trainVecDict)
実行結果:
['1', 'S', '2', 'M', '3', 'L']
-1
{1: 0.5882352941176471, -1: 0.4117647058823529}
{1: array([ 0.25 , 0.16666667, 0.33333333, 0.41666667, 0.41666667,
0.41666667]), -1: array([ 0.44444444, 0.44444444, 0.33333333, 0.33333333, 0.22222222,
0.22222222])}
made by zcl at CUMT. I know I can because I have a heart that beats