GridSearchCV の実装に関するメモ


Sample Code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# In a Jupyter notebook, enable inline figures first with: %matplotlib inline

# Load the iris example dataset through seaborn (returns a DataFrame).
iris = sns.load_dataset('iris')

# The species column holds string labels; encode them as the integers 0-2 in
# a copy of the frame so that the original stays untouched.
# (One-hot encoding with pd.get_dummies(iris) would be the alternative.)
species_mapping = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
iris3 = iris.assign(species=iris['species'].map(species_mapping))

# Preview of the first rows; only displayed in an interactive session.
iris3.head(3)

# Annotated heatmap of the pairwise correlations between all columns of
# iris3 (the integer-encoded species column included).
corr_matrix = iris3.corr()

plt.figure(figsize=(5, 5))
sns.heatmap(corr_matrix, annot=True, linewidths=0.1, linecolor='white')
plt.xticks(rotation=30)  # tilt the x tick labels
plt.show()

# Scatter plot of sepal length against petal length, one color per species.
x = iris3['sepal_length']
y = iris3['petal_length']
target = iris3['species']

# Color lookup indexed by the integer species code stored in `target`.
species_colors = ['orange', 'green', 'blue']
point_colors = [species_colors[code] for code in target]

plt.figure(figsize=(5, 3))
plt.scatter(x=x, y=y, c=point_colors)
plt.show()

# Separate the label column from the model inputs (every other column).
target = iris3['species']
features = iris3.drop(columns='species')
feat_labels = features.columns  # column names, used to label the importance plot

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. random_state pins the shuffle so
# that every run produces the same split and the scores reported further
# down can be reproduced.
x_train, x_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=0)

from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split. All hyperparameters are left at
# their defaults except random_state, which seeds the forest's internal
# randomness so that importances and scores are reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)

# Importance score of each input column, as estimated by the fitted forest.
importances = rf.feature_importances_

# Horizontal bar chart of the feature importances. argsort orders them
# ascending and barh draws position 0 at the bottom, so the most important
# feature ends up at the top of the chart.
# (A vertical variant would sort descending with np.argsort(importances)[::-1]
# and use plt.bar / plt.xticks(..., rotation=30) instead.)
indices = np.argsort(importances)
bar_positions = range(len(indices))
sorted_importances = importances[indices]
sorted_labels = feat_labels[indices]

plt.figure(figsize=(5, 3))
plt.barh(y=bar_positions, width=sorted_importances)
plt.yticks(bar_positions, sorted_labels)
plt.title('Feature importances')
plt.show()

from sklearn.metrics import confusion_matrix, accuracy_score

# Score the fitted forest on the held-out validation split.
y_pred = rf.predict(x_val)
acc = accuracy_score(y_val, y_pred)
conf_mat = confusion_matrix(y_val, y_pred)

print('accuracy:', acc)

# Confusion matrix as an annotated heatmap: rows are the true labels,
# columns are the predicted labels.
plt.figure(figsize=(5, 5))
sns.heatmap(conf_mat, annot=True, linewidths=0.1, linecolor='white')
plt.title('Confusion matrix')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

# GridSearchCV lives in sklearn.model_selection (the same module that
# train_test_split is imported from). The old sklearn.grid_search module was
# deprecated in scikit-learn 0.18 and removed in 0.20, so importing from it
# fails on current releases.
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: every combination of the values below is evaluated
# (2 x 2 x 2 = 8 candidates). A single dict covers the same candidates as
# the former pair of dicts that differed only in 'criterion'.
param_grid = {
    'n_estimators': [10, 20],
    'max_depth': [2, 5],
    'criterion': ['gini', 'entropy'],
}

# 3-fold cross-validated search over the grid, scored by accuracy, run on
# the training split only so the validation split stays unseen.
gs = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3,
                  scoring='accuracy')
gs.fit(x_train, y_train)

print(gs.best_score_)   # mean cross-validated accuracy of the best candidate
print(gs.best_params_)  # parameter combination that achieved that score