sklearnのモデル選択と評価のクロス検証

57142 ワード

sklearn

import numpy as np

クロス検証: 推定器(estimator)の性能を評価する

データをトレーニングセットとテストセットに分割

# Toy data: X holds the integers 0..9 reshaped to 5 rows x 2 columns,
# y is range(5) (one label per row).
X, y = np.arange(10).reshape((5, 2)), range(5)
print(X)
print(y)

[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
range(0, 5)

from sklearn.model_selection import train_test_split
# Split X/y into train and test parts with test_size=0.33; random_state=42
# is fixed so that the same split is produced on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.33,random_state=42)

# Show the training portion of the split.
print(X_train)
print(y_train)

[[4 5]
 [0 1]
 [6 7]]
[2, 0, 3]

# Show the held-out features and labels, separated by a newline.
# The separator must be the escaped "\n": a string literal that is physically
# split across two source lines is a SyntaxError.
print(X_test, "\n", y_test)

[[2 3]
 [8 9]] 
 [1, 4]

クロス検証の指標の計算

cross_val_score

from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score

# Use only the first 150 samples of the diabetes regression dataset.
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]

lasso = linear_model.Lasso()
# cv=3 is passed explicitly: the recorded output below has three scores,
# which was the default number of folds in older scikit-learn releases.
# Newer releases default to 5 folds, so without cv=3 the result would no
# longer match the shape of the output shown in this document.
cross_val_score(lasso, X, y, cv=3)

array([ 0.33150734,  0.08022311,  0.03531764])

cross_validate関数とマルチメトリック評価

cross_validate 関数では、複数の評価指標を指定して評価できます.

テストスコアに加えて、フィット時間(fit-times)、スコア計算時間(score-times)、および(オプションで)トレーニングスコアを含む辞書を返します.

クロス検証による予測の取得

from sklearn.model_selection import cross_val_predict
# Each sample's prediction comes from a model fitted on the folds that do not
# contain it. cv=3 is pinned because the recorded output was produced with
# the older 3-fold default; newer scikit-learn releases default to 5 folds.
y_pred = cross_val_predict(lasso, X, y, cv=3)
y_pred

array([ 174.26933996,  117.6539241 ,  164.60228641,  155.65049088,
        132.68647979,  128.49511245,  120.76146877,  141.069413  ,
        164.18904498,  182.37394949,  111.04181265,  127.94311443,
        135.0869234 ,  162.83066014,  135.3573514 ,  157.64516523,
        178.95843326,  163.3919841 ,  143.85237903,  144.29748882,
        133.58117218,  124.77928571,  132.90918003,  208.52927   ,
        153.61908967,  154.16616341,  118.95351821,  163.50467541,
        145.89406196,  168.3308101 ,  155.87411031,  123.45960148,
        185.70459144,  133.38468582,  117.2789469 ,  150.27895019,
        174.1541028 ,  160.03235091,  192.31389633,  161.58568256,
        154.2224809 ,  119.35517679,  146.15706413,  133.82056934,
        179.68118754,  137.96619936,  146.07788398,  126.77579723,
        123.32101099,  166.26710247,  146.41559964,  161.67261029,
        147.47731459,  138.44595305,  144.85421048,  113.77990664,
        185.54970402,  115.31624749,  142.23672103,  171.07792136,
        132.5394716 ,  177.80524864,  116.5616502 ,  134.25230846,
        142.88707475,  173.2830912 ,  154.31273504,  149.16680759,
        144.88238997,  121.97783103,  110.38457621,  180.25559631,
        199.06141058,  151.1195546 ,  161.14217698,  153.96960812,
        150.77179755,  113.30903579,  165.15755771,  115.85735727,
        174.19267171,  150.12027233,  115.47891783,  153.38967232,
        115.31573467,  156.49909623,   92.62211515,  178.15649994,
        131.59320715,  134.46166754,  116.97678633,  190.00790119,
        166.01173292,  126.25944471,  134.29256991,  144.71971963,
        190.9769591 ,  182.39199466,  154.45325308,  148.30325558,
        151.72036937,  124.12825466,  138.6011155 ,  137.75891286,
        123.0917243 ,  131.74735403,  112.07367481,  124.56956904,
        156.78432061,  128.63135591,   93.68260079,  130.54324394,
        131.8693231 ,  154.5708257 ,  179.81343019,  165.78130755,
        150.04779033,  162.37974736,  143.92996797,  143.15645843,
        125.20161377,  145.99590279,  155.3505536 ,  145.97574185,
        134.66120515,  163.92450638,  101.92329396,  139.33014324,
        122.71377023,  152.20573113,  153.36931089,  116.76545147,
        131.96936127,  109.74817383,  132.57453994,  159.38030328,
        109.31343881,  147.69926269,  156.3664255 ,  161.12509958,
        128.16523686,  156.78446286,  154.04375702,  124.83705022,
        143.85606595,  143.23651701,  147.76316913,  154.21572891,
        129.07895017,  157.79644923])

クロス検証反復

クロス検証反復器-データのループ

K分割(K-Fold)

from sklearn.model_selection import KFold
# Four samples with two features each, and one label per sample.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y= np.array([1, 2, 3, 4])

# Build a 2-fold splitter and query how many splits it will produce for X.
kf = KFold(n_splits=2)
kf.get_n_splits(X)

# Printing the splitter shows its configuration.
print(kf)

KFold(n_splits=2, random_state=None, shuffle=False)

# For every fold, show the train/test indices followed by the selected rows
# of X and the matching labels of y (train first, then test).
# The "\n" separators are written as escapes: a string literal that is
# physically split across two source lines is a SyntaxError.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    print(" ：", X[train_index], "\n", y[train_index])
    print(" ：", X[test_index], "\n", y[test_index])

TRAIN: [2 3] TEST: [0 1]
 ： [[1 2]
 [3 4]] 
 [3 4]
 ： [[1 2]
 [3 4]] 
 [1 2]
TRAIN: [0 1] TEST: [2 3]
 ： [[1 2]
 [3 4]] 
 [1 2]
 ： [[1 2]
 [3 4]] 
 [3 4]

繰り返しK分割(Repeated K-Fold)

from sklearn.model_selection import RepeatedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
# 2 splits repeated 2 times -> 4 train/test pairs in total (see output);
# random_state is fixed so the printed splits are reproducible.
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)

for train_index, test_index in rkf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Select the rows/labels that belong to this split.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [0 1] TEST: [2 3]
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]

1つ抜きクロス検証(Leave One Out, LOO)

from sklearn.model_selection import LeaveOneOut
# Two samples only, so the recorded output below shows two splits.
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 2])
loo = LeaveOneOut()
loo.get_n_splits(X)

# Bare expression: displays the splitter's repr when run in a notebook/REPL.
loo

LeaveOneOut()

# Each iteration holds out exactly one sample as the test set (see output).
for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [1] TEST: [0]
[[3 4]] [[1 2]] [2] [1]
TRAIN: [0] TEST: [1]
[[1 2]] [[3 4]] [1] [2]

P個抜きクロス検証(Leave P Out, LPO)

from sklearn.model_selection import LeavePOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
# Hold out p=2 samples per split; with 4 samples the recorded output
# lists 6 train/test pairs.
lpo = LeavePOut(2)
lpo.get_n_splits(X)
print(lpo)
for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Select the rows/labels that belong to this split.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

LeavePOut(p=2)
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 1] TEST: [2 3]

ランダム置換クロス検証(Shuffle & Split)

from sklearn.model_selection import ShuffleSplit
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
# 3 random splits, each holding out 25% of the samples (1 of 4) for testing;
# random_state=0 keeps the printed splits reproducible.
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)

for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [3 1 0] TEST: [2]
TRAIN: [2 1 3] TEST: [0]
TRAIN: [0 2 1] TEST: [3]

# Same splitter, but train_size=0.5 limits the training part to half of the
# samples (2 of 4), so one sample is left out of each split entirely.
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                  random_state=0)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [3 1] TEST: [2]
TRAIN: [2 1] TEST: [0]
TRAIN: [0 2] TEST: [3]

クラスラベルに基づいて、階層化されたクロス検証反復器

層化K分割(Stratified K-Fold)

各分割(フォールド)では、各クラスのサンプルの割合がデータセット全体とほぼ同じになります.

from sklearn.model_selection import StratifiedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
# Two classes with two samples each.
y = np.array([0, 0, 1, 1])
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)

# split() receives y as well, since the folds are built from the labels.
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]

層化シャッフル分割(Stratified Shuffle Split)

区分を作成しますが、区分内の各クラスの割合は、完全なデータセットと同じです.

from sklearn.model_selection import StratifiedShuffleSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
# 3 random splits, each using half of the samples for testing;
# random_state=0 keeps the printed splits reproducible.
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
sss.get_n_splits(X, y)

for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [1 2] TEST: [3 0]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 2] TEST: [3 1]

データのグループ化に使用されるクロス検証反復器

グループk-fold

from sklearn.model_selection import GroupKFold
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
# One group id per sample: samples 0-1 are in group 0, samples 2-3 in group 2.
groups = np.array([0, 0, 2, 2])
group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(X, y, groups)

# In the recorded output, samples of one group never appear on both sides
# of a split.
for train_index, test_index in group_kfold.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [0 1] TEST: [2 3]
[[1 2]
 [3 4]] [[5 6]
 [7 8]] [1 2] [3 4]
TRAIN: [2 3] TEST: [0 1]
[[5 6]
 [7 8]] [[1 2]
 [3 4]] [3 4] [1 2]

1グループ抜きクロス検証(Leave One Group Out)

from sklearn.model_selection import LeaveOneGroupOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
# One group id per sample: two groups (1 and 2) of two samples each.
groups = np.array([1, 1, 2, 2])
logo = LeaveOneGroupOut()
logo.get_n_splits(X, y, groups)

# The number of splits can also be queried from the groups alone.
logo.get_n_splits(groups=groups)

# Each iteration holds out one whole group as the test set (see output).
for train_index, test_index in logo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [2 3] TEST: [0 1]
[[5 6]
 [7 8]] [[1 2]
 [3 4]] [1 2] [1 2]
TRAIN: [0 1] TEST: [2 3]
[[1 2]
 [3 4]] [[5 6]
 [7 8]] [1 2] [1 2]

Pグループ抜きクロス検証(Leave P Groups Out)

from sklearn.model_selection import LeavePGroupsOut
X = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array([1, 2, 1])
# Every sample is its own group here (three groups in total).
groups = np.array([1, 2, 3])
# Hold out 2 groups per split.
lpgo = LeavePGroupsOut(n_groups=2)
lpgo.get_n_splits(X, y, groups)

# The number of splits can also be queried from the groups alone.
lpgo.get_n_splits(groups=groups)

for train_index, test_index in lpgo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [2] TEST: [0 1]
[[5 6]] [[1 2]
 [3 4]] [1] [1 2]
TRAIN: [1] TEST: [0 2]
[[3 4]] [[1 2]
 [5 6]] [2] [1 1]
TRAIN: [0] TEST: [1 2]
[[1 2]] [[3 4]
 [5 6]] [1] [2 1]

グループ分割(ランダム)

from sklearn.model_selection import GroupShuffleSplit

# Eight samples spread over four groups of two samples each.
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]

# Four random splits; test_size=0.5 and random_state=0 as in the original.
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    # Print the train indices and the test indices separated by one space.
    print(f"{train} {test}")

[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]

事前定義された分割(フォールド)/検証セット

時系列データへのクロス検証の適用

時系列分割(Time Series Split)

from sklearn.model_selection import TimeSeriesSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
# Splitter configured for 3 splits; printing it shows its configuration.
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)

TimeSeriesSplit(max_train_size=None, n_splits=3)

# In the recorded output the training indices grow with each split and the
# test index always comes after all training indices.
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]

Fiddlerの弱いネットテスト(2 G,3 G,4 G)

テーマ1000:計算a+b