交差検証でKNNモデルのハイパーパラメータを調整する

1682 ワード

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#    
def inspect_data(file_root):
    """Load a CSV file and print a quick overview of its contents.

    Prints the ``DataFrame.info()`` summary, the row and column counts, and
    the first rows returned by ``head()``.

    Args:
        file_root: Path (or any other source accepted by ``pandas.read_csv``)
            of the CSV file to load.

    Returns:
        The ``pandas.DataFrame`` read from ``file_root``.
    """
    dataframe = pd.read_csv(file_root)
    # NOTE(review): the label strings below contain only whitespace; the
    # original wording appears to have been lost, so they are kept as-is.
    print("      :")
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    dataframe.info()
    n_rows, n_cols = dataframe.shape
    print("   %i ,%i " % (n_rows, n_cols))
    print("    :")
    print(dataframe.head())
    return dataframe
#      
def processing_missing_data(dataframe):
    """Return ``dataframe`` without rows that contain missing values.

    Args:
        dataframe: The ``pandas.DataFrame`` to clean.

    Returns:
        The very same object when it holds no null values; otherwise a new
        frame produced by ``dropna()`` with the incomplete rows removed.
    """
    has_missing = dataframe.isnull().values.any()
    if not has_missing:
        # Nothing to clean: hand back the caller's frame untouched.
        return dataframe
    return dataframe.dropna()
#    
# scikit-learn helpers used by the script below.
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the data set.
dataframe = pd.read_csv("H:/pythonfigure/voice.csv")
# Drop rows that contain missing values.
dataframe = processing_missing_data(dataframe)
# Encode the string labels as integers: "male" -> 1, "female" -> 0.
dataframe = dataframe.replace({"male": 1, "female": 0})
# Split into features (all columns but the last) and target (last column).
# Positional .iloc replaces DataFrame.ix, which was removed in pandas 1.0.
x = dataframe.iloc[:, :-1]
# Cast explicitly so the target is an integer column even when replace()
# leaves the column with object dtype.
y = dataframe.iloc[:, -1].astype("int64")
# Standardize every feature column.
# NOTE(review): scale() is applied to all rows before the train/test split, so
# test-set statistics influence the training features — consider fitting the
# scaler on the training split only.
x = preprocessing.scale(x)
# Hold out one third of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=1 / 3., random_state=5
)
# Score every candidate k with 10-fold cross-validation on the training set.
k_range = range(1, 31)
cv_score = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train, cv=10, scoring="accuracy")
    score_mean = scores.mean()
    cv_score.append(score_mean)
    print(k, score_mean)
# Look the winner up in k_range instead of assuming the range starts at 1
# with step 1 (which is what "argmax + 1" silently relied on).
best_k = k_range[int(np.argmax(cv_score))]
print("   k %i" % (best_k))
plt.plot(k_range, cv_score)
plt.xlabel("k")
plt.ylabel("score")
plt.show()
# Refit with the selected k on the full training split and report the
# accuracy on the held-out test set.
knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(x_train, y_train)
print("     :", knn_model.score(x_test, y_test))