Statistical Learning Methods, Chapter 4 — a from-scratch implementation of the maximum-likelihood naive Bayes classifier (Example 4.1).


This post works through Example 4.1 of Chapter 4 of *Statistical Learning Methods*: a naive Bayes classifier whose parameters are fitted by maximum-likelihood estimation. For the derivation of the maximum-likelihood estimates used here, see my other article: http://blog.csdn.net/grinandbearit/article/details/79045143
The code is as follows:
#-*- coding:utf-8 -*-
from numpy import *

# Load Example 4.1's training set: two feature rows (x1 in {1,2,3}, x2 in
# {'S','M','L'}) and the class labels.  NOTE: building a numpy array from the
# mixed int/str rows coerces everything to str, so each returned sample is a
# pair of strings, e.g. ['1', 'S'] — the rest of the script relies on this.
def loadDataSet():
    features = [[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3],
                ['S', 'M', 'M', 'S', 'S', 'S', 'M', 'M', 'L', 'L', 'L', 'M', 'M', 'L', 'L']]
    labels = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    samples = array(features).transpose().tolist()
    return samples, labels

# Estimate the class priors P(Y = c) as relative frequencies of the labels.
# Returns (dict label -> prior, list of the distinct labels).  The set of
# labels is also returned as a list so callers can index it.
def calc_label(labels):
    total = float(len(labels))
    distinct = set(labels)
    priors = {lab: labels.count(lab) / total for lab in distinct}
    return priors, list(distinct)

# Build the vocabulary: every distinct feature value that appears in any
# sample of the data set, returned as a list.
def calcVocaulary(dataset):
    values = set()
    for sample in dataset:
        values.update(sample)
    return list(values)

# Turn one sample into a count vector aligned with `voca`: slot i holds how
# many times voca[i] occurs in `vector`.  Values absent from the vocabulary
# are ignored.  Returned as a float numpy array so callers can accumulate
# and divide element-wise.
def calcVector(voca,vector):
    slots = {term: i for i, term in enumerate(voca)}
    counts = zeros(len(voca))
    for term in vector:
        pos = slots.get(term)
        if pos is not None:
            counts[pos] += 1
    return array(counts)

# For every class, estimate per-vocabulary-term conditional frequencies:
# count(term, class) / count(class).  Returns a dict mapping each label to one
# frequency vector over `voca`.  For Example 4.1's data the two features draw
# from disjoint value sets, so each entry doubles as the maximum-likelihood
# estimate of P(feature value | class).
def Bayes(dataset,labels,uniqueLabel,voca):
    trainVecDict = {}
    width = len(voca)
    for label in uniqueLabel:
        freq = array(zeros(width))
        for sample, y in zip(dataset, labels):
            if y == label:
                freq += calcVector(voca, sample)  # accumulate term counts for this class
        freq /= float(labels.count(label))  # normalise counts by the class frequency
        trainVecDict[label] = freq
    return trainVecDict

#    ,               
def testFunction(testArray,voca,trainVecDict,labelRate):
    result = -1;maxRate = -inf
    for key in trainVecDict:
        singleLabelRate=1.0
        for word in testArray:
            singleLabelRate *= trainVecDict[key][voca.index(word)] #                              
        if singleLabelRate*labelRate[key] > maxRate:
                result = key;maxRate =singleLabelRate*labelRate[key]
    return result

# --- Example 4.1 driver: train on the toy data set, then classify (2, 'S') ---
# Fix: the original used Python 2 `print x` statements, which are a
# SyntaxError under Python 3; parenthesized `print(x)` runs under both.
dataSet, labels = loadDataSet()
labelRate, uniqueLabel = calc_label(labels)
voca = calcVocaulary(dataSet)
print(voca)
trainVecDict = Bayes(dataSet, labels, uniqueLabel, voca)
# array() coerces the mixed int/str pair to strings ('2', 'S'), matching the
# string-typed vocabulary built from the training data.
testArray = array([2, 'S'])
print(labelRate)
print(trainVecDict)
print(testFunction(testArray, voca, trainVecDict, labelRate))

['1', 'S', '2', 'M', '3', 'L']
{1: 0.6, -1: 0.4}
{1: array([ 0.22222222,  0.11111111,  0.33333333,  0.44444444,  0.44444444,
        0.44444444]), -1: array([ 0.5       ,  0.5       ,  0.33333333,  0.33333333,  0.16666667,
        0.16666667])}
-1
made by zcl at CUMT
I know I can because I have a heart that beats