機械学習実戦(一)——kNNアルゴリズム

6514 ワード

最近、コンピューター学部でパターン認識の授業を履修しており、その中で、母集団分布に対する2種類のノンパラメトリック推定法であるParzen窓とkNNアルゴリズムが取り上げられました。以前は「機械学習実戦」の中でkNNアルゴリズムによる手書き数字認識を行いましたが、今日はParzen窓の原理に基づくPNN(確率的ニューラルネットワーク)アルゴリズムのPythonプログラムを書き、引き続き両者の優劣を比較します。PNNアルゴリズムは、私が書いた最初の「訓練が必要な」機械学習プログラムです。(2016.3.15)
———区切り線:以下は以前に書いた内容(kNNアルゴリズム)—————————————————————————
kNN.py
# -*- coding: utf-8 -*-
###########
#KNN: the k nearest neighbours

###########


from numpy import *
import operator
import os
def kNNClassify(new_input,dataset,labels,k):
    """Classify one sample by majority vote among its k nearest neighbours.

    new_input : the sample to classify; a vector (or 1-row matrix) with the
                same number of features as one row of dataset, e.g. [1,1024]
    dataset   : training samples, one per row, e.g. [num,1024]
    labels    : label of each row of dataset (0..9 for the digit data)
    k         : number of nearest neighbours that vote; a k larger than the
                number of training rows is clamped to that number

    Returns the label that received the most votes. When several labels
    tie, the first one met while iterating the vote dict wins.
    Raises ValueError if dataset has no rows or k < 1.
    """
    num = dataset.shape[0]
    if num == 0 or k < 1:
        raise ValueError('kNNClassify needs a non-empty dataset and k >= 1')
    k = min(k, num)

    # Squared Euclidean distance from new_input to every training row.
    # Broadcasting replaces the explicit tile() copy. The square root is
    # skipped on purpose: it is monotonic, so the neighbour ranking is the
    # same with or without it.
    diff = dataset - asarray(new_input)
    distance = (diff**2).sum(axis=1)

    # indices of the training rows, nearest first
    sortedDistIndice = argsort(distance)

    # count the labels of the k nearest rows
    voteCount = {}
    for i in range(k):
        voteLabel = labels[sortedDistIndice[i]]
        voteCount[voteLabel] = voteCount.get(voteLabel,0)+1

    # label with the highest vote count
    return max(voteCount, key=voteCount.get)


def img2Vector(filename):
    """Read a 32x32 text image into a [1,1024] row vector.

    filename : path of a text file whose first 32 lines each start with at
               least 32 single-digit characters (presumably 0/1 pixels for
               the handwriting data -- verify against the data files)

    Returns a numpy array of shape [1,1024]; text row r, column c is stored
    at index r*32+c. Raises IndexError if a line has fewer than 32
    characters and ValueError if a character is not a digit.
    """
    rows = 32
    cols = 32
    imgVector = zeros([1,rows*cols])
    # 'with' closes the file even when a malformed line raises below
    with open(filename) as fileIn:
        for row in range(rows):
            lineStr = fileIn.readline()
            start = row*cols
            imgVector[0,start:start+cols] = [int(lineStr[col]) for col in range(cols)]

    return imgVector

def _loadDigitDir(dirPath):
    """Load every image file found in dirPath.

    Returns (data_x, data_y): data_x has one [1,1024] image vector per row,
    data_y holds the integer label parsed from the part of each file name
    before the first '_'. Rows follow the order returned by os.listdir.
    """
    rows = 32
    cols = 32
    fileNames = os.listdir(dirPath)
    data_x = zeros((len(fileNames),rows*cols))
    data_y = []
    for i, fileName in enumerate(fileNames):
        data_x[i,:] = img2Vector(os.path.join(dirPath,fileName))
        data_y.append(int(fileName.split('_')[0]))

    return data_x,data_y


def loadDataSet():
    """Load the training set from './1/' and the test set from './2/'.

    Returns train_x,train_y,test_x,test_y where the *_x values are arrays
    with one 1024-element image vector per row and the *_y values are lists
    of integer labels taken from the file names.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3
    print('getting training set')
    train_x,train_y = _loadDigitDir('./1/')

    print('getting test set')
    test_x,test_y = _loadDigitDir('./2/')

    return train_x,train_y,test_x,test_y

def testHandWriting():
    """Run the kNN digit classifier (k=3) on the test set and print accuracy.

    Loads the data with loadDataSet(), classifies every test row against
    the whole training set with kNNClassify, and prints the share of
    correct predictions as a percentage with two decimals. Returns None.
    """
    print('loading data')
    train_x,train_y,test_x,test_y = loadDataSet()

    # nothing is fitted here: the classifier compares each test sample
    # directly against the stored training rows
    print('traning')

    print('testing')

    numTestSamples = test_x.shape[0]
    matchCount = 0
    for i in range(numTestSamples):
        predict = kNNClassify(test_x[i],train_x,train_y,3)
        if predict == test_y[i]:
            matchCount += 1

    # an empty test directory would otherwise divide by zero
    accuracy = float(matchCount)/numTestSamples if numTestSamples else 0.0

    # the string literal was split across two physical lines and the final
    # print had lost its indentation; both are restored here
    print('show the result...\n')
    print('%.2f%%' %(accuracy*100))

test_kNN.py
# test_kNN.py: driver script that runs the handwritten-digit kNN evaluation
# defined in kNN.py
from kNN import testHandWriting

testHandWriting()