PCA+KNNを用いたMNISTデータセットの手書き数字認識（Python）


ステップ1：公式サイト http://yann.lecun.com/exdb/mnist/ からMNISTデータベースをダウンロードして解凍します。
ステップ2：データを読み込んでテストします（参考ブログ[1]）。
ダウンロードしたファイルはバイナリ形式で次のとおりです.
[offset] [type]          [value]          [description]
0000     32 bit integer  0x00000803(2051) magic number
0004     32 bit integer  60000            number of images
0008     32 bit integer  28               number of rows
0012     32 bit integer  28               number of columns
0016     unsigned byte   ??               pixel
0017     unsigned byte   ??               pixel
........
xxxx     unsigned byte   ??               pixel
つまり、画素データを読む前に、まず32 bit integerを4つ読み込む必要があります。
テスト：最初の16個の数字画像を読み取り、同時にそのラベルを出力します。
# -*- coding: utf-8 -*-   
import struct  
import matplotlib.pyplot as plt  
import operator
import numpy as np

# Load the training images: read the whole IDX file into memory.
filename = 'train-images.idx3-ubyte'
with open(filename, 'rb') as binfile:  # close the handle once the data is read
    buf = binfile.read()
index = 0
# '>IIII': four big-endian unsigned 32-bit integers
# (magic number, image count, rows, columns).
magic, numImages, numRows, numColumns = struct.unpack_from('>IIII', buf, index)
index += struct.calcsize('>IIII')

# Load the training labels.
filename1 = 'train-labels.idx1-ubyte'
with open(filename1, 'rb') as binfile1:
    buf1 = binfile1.read()

index1 = 0
# '>II': two big-endian unsigned 32-bit integers (magic number, label count).
# The label header has to come from the label buffer (buf1), not from the
# image buffer, otherwise magic1/numLabels1 hold pixel data.
magic1, numLabels1 = struct.unpack_from('>II', buf1, index1)
index1 += struct.calcsize('>II')

# Show the first 16 images in a 4x4 grid and print the label of each one.
for i in range(16):
    # One image is 784 unsigned bytes, reshaped to 28x28 for display.
    im = struct.unpack_from('>784B', buf, index)
    index += struct.calcsize('>784B')
    im = np.array(im)
    im = im.reshape(28, 28)
    plt.subplot(4, 4, i + 1)
    plt.imshow(im)
    # One unsigned byte per label.
    numtemp = struct.unpack_from('1B', buf1, index1)
    label = numtemp[0]
    index1 += struct.calcsize('1B')
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(label)
plt.show()

画像は以下の通りです.
使用PCA + KNN对MNIST数据集进行手写数字识别 python_第1张图片
次に、PCAアルゴリズムとKNNアルゴリズムを加えます。この2つのアルゴリズムの原理はここでは繰り返しません。関数は書籍『機械学習実戦』（Machine Learning in Action）に由来します。完全なコードは以下の通りです。
# -*- coding: utf-8 -*-   
from numpy import *
import numpy as np  
import struct  
import matplotlib.pyplot as plt  
import operator

#                   PCA    
# Eigenvector basis kept by the most recent pca() call (one component per
# column). None until pca() has been called.
redEigVects = None


def pca(dataMat, topNfeat=9999999):
    """Project dataMat onto its leading principal components.

    Args:
        dataMat: 2-D data with one sample per row and one feature per column.
        topNfeat: number of leading components to keep; the default is large
            enough to keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data projected
        onto the kept components, and that projection mapped back to the
        original feature space with the mean added again.

    Side effects:
        Stores the kept eigenvectors (largest eigenvalue first) in the
        module-level ``redEigVects``.
    """
    global redEigVects
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature
    covMat = cov(meanRemoved, rowvar=0)
    # A covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues/eigenvectors, whereas the general-purpose eig may return
    # complex results caused by round-off.
    eigVals, eigVects = linalg.eigh(mat(covMat))
    eigValInd = argsort(eigVals)  # ascending order
    # Walk backwards from the end: indices of the topNfeat largest eigenvalues.
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    # mat() keeps '*' below a matrix product regardless of what eigh returned.
    redEigVects = mat(eigVects)[:, eigValInd]
    lowDDataMat = meanRemoved * redEigVects  # project into the reduced space
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
def KNN(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest rows of dataSet.

    Args:
        inX: the sample to classify; it is tiled to one copy per row of
            dataSet, so it must have as many features as dataSet has columns.
        dataSet: 2-D ndarray of reference samples, one per row.
        labels: sequence of labels, indexed like the rows of dataSet.
        k: number of nearest neighbours that vote; must not exceed the
            number of rows in dataSet.

    Returns:
        The label with the most votes among the k nearest neighbours
        (Euclidean distance).
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per reference row so the subtraction is row-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # sum over features, per row
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Load the training images: read the whole IDX file into memory.
filename = 'train-images.idx3-ubyte'
with open(filename, 'rb') as binfile:  # close the handle once the data is read
    buf = binfile.read()
index = 0
# '>IIII': four big-endian unsigned 32-bit integers
# (magic number, image count, rows, columns).
magic, numImages, numRows, numColumns = struct.unpack_from('>IIII', buf, index)
index += struct.calcsize('>IIII')

# Load the training labels.
filename1 = 'train-labels.idx1-ubyte'
with open(filename1, 'rb') as binfile1:
    buf1 = binfile1.read()

index1 = 0
# '>II': two big-endian unsigned 32-bit integers (magic number, label count).
# The label header has to come from the label buffer (buf1), not from the
# image buffer, otherwise magic1/numLabels1 hold pixel data.
magic1, numLabels1 = struct.unpack_from('>II', buf1, index1)
index1 += struct.calcsize('>II')

# Number of training samples to use.
trainingNumbers = 2500
# Number of principal components kept by PCA.
DD = 40
# Bytes per image: 28x28 pixels, one unsigned byte each.
imageSize = 28 * 28

# Raw training matrix, one flattened image per row. frombuffer reads every
# requested pixel in a single pass; astype(float) yields a writable float
# matrix, the same dtype the zeros()-based matrix used to have.
trainingMatO = np.frombuffer(buf, dtype=np.uint8,
                             count=trainingNumbers * imageSize, offset=index)
trainingMatO = trainingMatO.reshape(trainingNumbers, imageSize).astype(float)
index += trainingNumbers * imageSize

# Labels stay plain Python ints (one unsigned byte each), in image order.
trainingLabels = list(struct.unpack_from('>%dB' % trainingNumbers, buf1, index1))
index1 += trainingNumbers

# PCA: project the training images onto the DD leading components.
trainingMat, reconMat = pca(trainingMatO, DD)
'''
**************************************************
'''
# Load the test images: read the whole IDX file into memory.
filename3 = 't10k-images.idx3-ubyte'
with open(filename3, 'rb') as binfile3:  # close the handle once the data is read
    buf3 = binfile3.read()
index3 = 0
# '>IIII': four big-endian unsigned 32-bit integers
# (magic number, image count, rows, columns).
magic3, numImages3, numRows3, numColumns3 = struct.unpack_from('>IIII', buf3, index3)
index3 += struct.calcsize('>IIII')

# Load the test labels.
filename4 = 't10k-labels.idx1-ubyte'
with open(filename4, 'rb') as binfile4:
    buf4 = binfile4.read()

index4 = 0
# '>II': two big-endian unsigned 32-bit integers (magic number, label count).
magic4, numLabels4 = struct.unpack_from('>II', buf4, index4)
index4 += struct.calcsize('>II')

'''
**************************************************
'''
# Number of test samples to classify.
testNumbers = 300
# Number of misclassified samples.
errCount = 0
# Centre test samples with the training-set mean: the same per-pixel mean
# that pca() subtracted from the training data. Using each test image's own
# scalar mean would put test and training points in different coordinates.
meanVals = mean(trainingMatO, axis=0)
# KNN squares element-wise, so it needs plain ndarrays rather than matrices;
# the training view is loop-invariant and is taken once.
trainingArr = trainingMat.getA()
for i in range(testNumbers):
    # One image is 784 unsigned bytes.
    im3 = struct.unpack_from('>784B', buf3, index3)
    index3 += struct.calcsize('>784B')
    im3 = np.array(im3)

    meanRemoved = im3 - meanVals
    # Project the centred sample onto the basis found on the training set.
    testingMat = meanRemoved * redEigVects

    # One unsigned byte per label.
    numtemp4 = struct.unpack_from('1B', buf4, index4)
    label4 = numtemp4[0]
    index4 += struct.calcsize('1B')
    classifierResult = KNN(testingMat.getA(), trainingArr, trainingLabels, 10)
    print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, label4))
    # Compare by value; 'is not' tests object identity, which is unreliable
    # for integers.
    if classifierResult != label4:
        errCount = errCount + 1
print('the err rate is  %s' % (float(errCount) / testNumbers))


効果は以下の通りです.
使用PCA + KNN对MNIST数据集进行手写数字识别 python_第2张图片
参考ブログ・文献
[1] http://www.cnblogs.com/x1957/archive/2012/06/02/2531503.html PythonによるMNISTの読み込み
[2] http://blog.codinglabs.org/articles/pca-tutorial.html PCAの数学的原理
[3] 書籍『機械学習実戦』第2章 KNN、第13章 PCAによる次元削減