機械学習実戦(一)——kNNアルゴリズム
最近、コンピュータ学部のパターン認識の授業を履修しており、そこで母集団分布に対する2種類のノンパラメトリック推定法であるParzen窓法とkNNアルゴリズムが取り上げられました。以前は『機械学習実戦』に沿ってkNNアルゴリズムで手書き数字認識を行いましたが、今日は引き続き、Parzen窓の原理に基づくPNN(確率的ニューラルネットワーク)アルゴリズムをPythonで実装し、両者の優劣を比較します。PNNアルゴリズムは、私が書いた中で初めて訓練を必要とする機械学習プログラムです。2016.3.15
———区切り線:以下は以前に書いた内容(kNNアルゴリズム)—————————————————————————
kNN.py
test_kNN.py
———区切り線:以下は以前に書いた内容(kNNアルゴリズム)—————————————————————————
kNN.py
# -*- coding: utf-8 -*-
###########
#KNN: the k nearest neighbours
###########
from numpy import *
import operator
import os
##############
#new_input : a matrix with [1,1024]
#dataset : a matrix with [num,1024]
#labels : 0:9
# k : the k in knn
def kNNClassify(new_input, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Neighbours are ranked by squared Euclidean distance; the square root is
    skipped because it does not change the ranking.

    Args:
        new_input: feature vector of the sample, shaped (d,) or (1, d).
        dataset: array of training samples, one row per sample, (num, d).
        labels: sequence of num labels; labels[i] belongs to dataset[i].
        k: number of neighbours that vote. Values larger than num simply
            use every training sample.

    Returns:
        The label with the most votes. On a tie, the tied label whose
        first voter is nearest to new_input wins.

    Raises:
        ValueError: if k < 1 or dataset has no rows.
    """
    num = dataset.shape[0]
    if k < 1:
        raise ValueError('k must be at least 1')
    if num < 1:
        raise ValueError('dataset must contain at least one sample')

    # Broadcasting subtracts new_input from every row without a tiled copy.
    diff = dataset - new_input
    sqDistance = (diff ** 2).sum(axis=1)
    # A stable sort keeps equally distant samples in dataset order, which
    # makes the result reproducible.
    sortedDistIndice = sqDistance.argsort(kind='mergesort')

    # Count votes, remembering the order in which labels first appear so
    # ties can be resolved without relying on dict iteration order.
    voteCount = {}
    firstSeen = []
    for idx in sortedDistIndice[:k]:
        voteLabel = labels[idx]
        if voteLabel not in voteCount:
            voteCount[voteLabel] = 0
            firstSeen.append(voteLabel)
        voteCount[voteLabel] += 1

    bestLabel = firstSeen[0]
    for candidate in firstSeen[1:]:
        if voteCount[candidate] > voteCount[bestLabel]:
            bestLabel = candidate
    return bestLabel
def img2Vector(filename, rows=32, cols=32):
    """Read a text-encoded digit image into a single row vector.

    The first `rows` lines of the file are read, and the first `cols`
    characters of each line are converted with int() and stored row-major.

    Args:
        filename: path of the text file to read.
        rows: number of lines to read (default 32).
        cols: number of characters taken from each line (default 32).

    Returns:
        A numpy array of shape (1, rows * cols).

    Raises:
        IndexError: if a line has fewer than `cols` characters.
        ValueError: if a character cannot be converted by int().
    """
    imgVector = zeros([1, rows * cols])
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as fileIn:
        for row in range(rows):
            lineStr = fileIn.readline()
            for col in range(cols):
                imgVector[0, row * cols + col] = int(lineStr[col])
    return imgVector
def _loadDigitDir(dirName):
    """Load every digit file in dirName and return (samples, labels)."""
    rows = 32
    cols = 32
    # Sorted so that sample order does not depend on the file system.
    fileNames = sorted(os.listdir(dirName))
    samples = zeros((len(fileNames), rows * cols))
    labels = []
    for i, fileName in enumerate(fileNames):
        samples[i, :] = img2Vector(dirName + fileName)
        # The label is the integer before the first '_' in the file name.
        labels.append(int(fileName.split('_')[0]))
    return samples, labels


def loadDataSet():
    """Load the training set from './1/' and the test set from './2/'.

    Every file in each directory is read with img2Vector, and its label is
    the integer prefix of the file name before the first '_'.

    Returns:
        (train_x, train_y, test_x, test_y), where the *_x values are arrays
        with one 1024-element row per file and the *_y values are lists of
        integer labels in the same order.

    Raises:
        ValueError: if a file name does not start with an integer prefix.
    """
    print('getting training set')
    train_x, train_y = _loadDigitDir('./1/')
    print('getting test set')
    test_x, test_y = _loadDigitDir('./2/')
    return train_x, train_y, test_x, test_y
def testHandWriting():
    """Run the kNN handwritten-digit evaluation and print the accuracy.

    Loads the data with loadDataSet, classifies every test sample with
    kNNClassify using k=3, and prints the share of correct predictions
    as a percentage with two decimals.
    """
    print('loading data')
    train_x, train_y, test_x, test_y = loadDataSet()
    # kNN has no fitting step; the message is kept so the output is unchanged.
    print('traning')
    print('testing')
    numTestSamples = test_x.shape[0]
    matchCount = 0
    for i in range(numTestSamples):
        predict = kNNClassify(test_x[i], train_x, train_y, 3)
        if predict == test_y[i]:
            matchCount += 1
    # Guard against an empty test directory instead of dividing by zero.
    if numTestSamples:
        accuracy = float(matchCount) / numTestSamples
    else:
        accuracy = 0.0
    print('show the result...\n')
    print('%.2f%%' % (accuracy * 100))
test_kNN.py
# Driver script: run the handwritten-digit kNN evaluation from kNN.py.
from kNN import testHandWriting

testHandWriting()