Statistical Learning Methods, Chapter 4: Naive Bayes with Bayesian Estimation, Example 4.2 Code Practice


Code practice for Example 4.2 from Chapter 4 of Statistical Learning Methods, using Bayesian estimation for naive Bayes. (The maximum likelihood estimation version of the algorithm is very similar; it is covered in another post: http://blog.csdn.net/grinandbearit/article/details/79044065.) The Bayesian estimation algorithm is slightly more involved: the key point is that the numerator and denominator are corrected (smoothed) so that none of the probabilities multiplied together can become zero.
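For reference, the two smoothed estimates that the script implements are sketched below, with alpha as the smoothing constant (alpha = 1 is Laplace smoothing), N the number of training samples, K the number of classes, and S_j the number of distinct values of feature j. The helper names smoothed_prior and smoothed_conditional are only for illustration and do not appear in the script itself.

# A minimal sketch of the Bayesian (smoothed) estimates, separate from the script below.
# Prior:        P(Y=c)         = (count(Y=c) + alpha) / (N + K*alpha)
# Conditional:  P(X_j=a | Y=c) = (count(X_j=a and Y=c) + alpha) / (count(Y=c) + S_j*alpha)
def smoothed_prior(class_count, total_count, num_classes, alpha=1.0):
    return (class_count + alpha) / float(total_count + num_classes * alpha)

def smoothed_conditional(joint_count, class_count, num_values, alpha=1.0):
    return (joint_count + alpha) / float(class_count + num_values * alpha)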
#-*- coding:utf-8 -*-
from numpy import *

# Load the data set of Example 4.2. Note that array().transpose() turns the mixed int/str features into strings ('1', 'S', ...), so every feature value is handled as a string from here on.
def loadDataSet():
    dataSet=[[1,1,1,1,1,2,2,2,2,2,3,3,3,3,3],['S','M','M','S','S','S','M','M','L','L','L','M','M','L','L']]
    labels=[-1,-1,1,1,-1,-1,-1,1,1,1,1,1,1,1,-1]
    return array(dataSet).transpose().tolist(),labels
'''
Compute the prior probability of each class label with Bayesian (smoothed) estimation:
each label count gets + alpha in the numerator and the denominator gets
+ (number of distinct labels) * alpha, so no prior can ever be zero.
alpha is the smoothing constant (alpha = 1 gives Laplace smoothing).
'''
def calc_label(labels,alpha):
    m=len(labels)
    uniqueLabel=set(labels) # the distinct class labels
    diffLabelNum=len(uniqueLabel)
    labelRate={}
    for label in uniqueLabel:
        labelRate[label]=(labels.count(label)+alpha)/float(m+diffLabelNum*alpha) # smoothed prior: (count + alpha) / (N + K*alpha)
    return labelRate,list(uniqueLabel) # uniqueLabel is a set, so convert it to a list before returning
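
# For Example 4.2, 9 of the 15 samples carry label 1 and 6 carry label -1, so with alpha=1
# calc_label returns P(Y=1) = (9+1)/(15+2) = 10/17 ~= 0.588 and P(Y=-1) = 7/17 ~= 0.412,
# which is the labelRate dictionary printed at the end of this post.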

# Build the vocabulary: the set of all distinct feature values that appear anywhere in the data set
def calcVocaulary(dataset):
    voca=set()
    for content in dataset:
        voca = voca | set(content)
    return list(voca)
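
# For this data set the vocabulary is the six values {'1','2','3','S','M','L'} (everything is a
# string because loadDataSet pushes the features through numpy's array); the order of the
# returned list depends on set iteration order, and in the run shown at the end it came out
# as ['1', 'S', '2', 'M', '3', 'L'].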

# Turn one sample into a count vector over the vocabulary: add 1 at the position of every value that appears in the sample
def calcVector(voca,vector):
    n=len(voca)
    originVector=zeros(n)
    for word in vector:
        if word in voca:
            originVector[voca.index(word)] += 1
    return array(originVector) # return a numpy array so the sample vectors can be added element-wise
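
# Example (using the vocabulary order from the run shown at the end):
# calcVector(['1','S','2','M','3','L'], ['2','S']) -> array([0., 1., 1., 0., 0., 0.])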

# For the samples of one class, count how many distinct values each feature takes within that class (S_j), and write that count into the vocabulary position of every value of that feature
def calcUniqueValueNum(dataset,labels,label,voca):
    labelDataSet=[]
    for i in range(len(labels)):
        if labels[i]==label:
            labelDataSet.append(dataset[i])
    m,n=shape(labelDataSet)
    uniqueValueDict={}
    for i in range(n):
        uniqueValue = set(content[i] for content in labelDataSet) # the distinct values of feature i within this class
        for value in uniqueValue:
            uniqueValueDict[value] = len(uniqueValue) # every value of feature i maps to S_i, the number of distinct values
    a=len(voca)
    returnArray=zeros(a)
    for key in uniqueValueDict:
        returnArray[voca.index(key)] = float(uniqueValueDict[key])
    return returnArray
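
# Note that the distinct values are counted inside the class subset, while the book defines S_j
# over the whole training set; for Example 4.2 the two coincide, since both features take all
# three of their values in each class, so this function returns [3., 3., 3., 3., 3., 3.].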

# Bayesian estimation of the conditional probabilities: for each class build a vector over the vocabulary whose entries are P(feature value | class); return a dict keyed by class label
def Bayes(dataset,labels,uniqueLabel,voca,alpha):
    n=len(uniqueLabel);m=len(dataset)
    trainVecDict={}
    for i in range(n):
        labelVector=array(ones(len(voca)))*alpha # start every count at alpha: the smoothing term in the numerator
        for j in range(m):
            if labels[j]== uniqueLabel[i]:
                labelVector += calcVector(voca,dataset[j]) # accumulate the count vectors of the samples in this class

        labelVector /=  (labels.count(uniqueLabel[i])+calcUniqueValueNum(dataset,labels,uniqueLabel[i],voca)*alpha) # element-wise smoothed denominator: count(Y=c) + S_j*alpha
        trainVecDict[uniqueLabel[i]]=labelVector  # key: class label, value: vector of smoothed conditional probabilities
    return trainVecDict
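
# For label 1 (9 samples, S_j = 3, alpha = 1) the entry for '1' is (2+1)/(9+3) = 0.25 and the
# entry for 'S' is (1+1)/12 ~= 0.167, which is exactly the first vector in the trainVecDict
# printed at the end of this post.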

# Classify a test sample: return the label whose prior times the product of conditional probabilities (the unnormalized posterior) is largest
def testFunction(testArray,voca,trainVecDict,labelRate):
    result = -1;maxRate = -inf
    for key in trainVecDict:
        singleLabelRate=1.0
        for word in testArray:
            singleLabelRate *= trainVecDict[key][voca.index(word)] # multiply in the conditional probability of each feature value of the test sample
        if singleLabelRate*labelRate[key] > maxRate:
            result = key; maxRate = singleLabelRate*labelRate[key]
    return result
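
# For the test point ('2','S') the two unnormalized posteriors are
#   label  1: 10/17 * 1/3 * 1/6 ~= 0.033
#   label -1:  7/17 * 1/3 * 4/9 ~= 0.061
# so testFunction returns -1, the same answer as Example 4.2 in the book.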

dataSet,labels=loadDataSet()
labelRate,uniqueLabel=calc_label(labels,1)
voca=calcVocaulary(dataSet)
print voca
trainVecDict=Bayes(dataSet,labels,uniqueLabel,voca,1)
testArray=array([2,'S']) # becomes array(['2','S']); the values end up as strings, matching the vocabulary
print  testFunction(testArray,voca,trainVecDict,labelRate)
print labelRate
print trainVecDict
Output:

['1', 'S', '2', 'M', '3', 'L']
-1
{1: 0.5882352941176471, -1: 0.4117647058823529}
{1: array([ 0.25      ,  0.16666667,  0.33333333,  0.41666667,  0.41666667,
        0.41666667]), -1: array([ 0.44444444,  0.44444444,  0.33333333,  0.33333333,  0.22222222,
        0.22222222])}
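
One further note: setting alpha=0 in calc_label and Bayes turns both estimates back into plain relative frequencies, i.e. the maximum likelihood version of the algorithm that the post linked above walks through.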






made by zcl at CUMT
I know I can because I have a heart that beats