nested k-fold cross validation


nested k-fold cross validationの勉強のため
https://axa.biopapyrus.jp/machine-learning/model-evaluation/nested-k-fold-cross-validation.html
のコードを使用させていただいた。ここに勉強した内容をまとめておく。

Google Colaboratoryで実行しようとすると、
コードの9行目の
from sklearn.grid_search import GridSearchCV
ではエラーが出た(sklearn.grid_search モジュールは scikit-learn 0.18 で非推奨となり、0.20 で削除されたため)。そこで
from sklearn.model_selection import GridSearchCV
と変更した。

他にも最後のところで少し誤記があったので修正し、下記のコードになった。

import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score


# Load the breast-cancer dataset that ships with scikit-learn.
cancer = datasets.load_breast_cancer()
x = cancer.data
y = cancer.target

print(x.shape)
## (569, 30)

# Hold out 20% of the samples. The seed is fixed so the split, and every
# score later computed from x_train / y_train, is the same on every run.
# NOTE(review): x_test / y_test are not used in the part of the script shown
# here; confirm whether a final hold-out evaluation is intended.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
print(x_train.shape)
## (455, 30)


# SVM model: standardise the features, reduce them with PCA
# (n_components=0.80), then classify with an SVC.
ppln_svm = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=0.80)),
    ('clf', SVC()),
])

# SVM model hyperparameters: RBF kernel, with C and gamma each taking
# 20 log-spaced values from 1e-5 to 1e5 (a 20 x 20 grid).
param_grid_svm = [{
    'clf__kernel': ['rbf'],
    **{
        f'clf__{name}': np.power(10, np.linspace(-5, 5, num=20))
        for name in ('C', 'gamma')
    },
}]

# Random Forest model: same scaling + PCA front end as the SVM pipeline,
# followed by a random forest classifier. The forest is seeded so that
# repeated runs on the same data yield the same fitted model and scores.
ppln_rf = Pipeline([
               ('scale', StandardScaler()),
               ('pca', PCA(0.80)),
               ('clf', RandomForestClassifier(random_state=0))
           ])

# Random Forest model hyperparameters: tree depth limits 2 through 8.
param_grid_rf = [
    {'clf__max_depth': list(range(2, 9))},
]


# Inner loop: one grid search per pipeline, each using 2-fold CV and
# F1 scoring. The SVM search is built first, then the random forest one.
gs_svm, gs_rf = (
    GridSearchCV(estimator=pipeline, param_grid=grid, scoring='f1', cv=2, n_jobs=1)
    for pipeline, grid in (
        (ppln_svm, param_grid_svm),
        (ppln_rf, param_grid_rf),
    )
)


# Outer loop: 10-fold cross-validation wrapped around each grid search,
# scored with F1 on the training portion. SVM is evaluated first, then RF.
scores_svm, scores_rf = (
    cross_val_score(search, x_train, y_train, scoring='f1', cv=10)
    for search in (gs_svm, gs_rf)
)

# Report mean and standard deviation of the outer-fold scores.
print('SVM: %.2f±%.2f' % (scores_svm.mean(), scores_svm.std()))
## SVM: 0.97±0.02

print('RF: %.2f±%.2f' % (scores_rf.mean(), scores_rf.std()))
## RF: 0.95±0.03

メモ