sklearnのモデル選択と評価のクロス検証
57142 ワード
import numpy as np
クロス検証:推定器の性能を評価する
データをトレーニングセットとテストセットに分割
# Toy dataset: the integers 0..9 arranged as 5 rows of 2 columns,
# paired with the labels 0..4.
X = np.arange(10).reshape((5, 2))
y = range(5)
print(X)
print(y)
[[0 1]
[2 3]
[4 5]
[6 7]
[8 9]]
range(0, 5)
from sklearn.model_selection import train_test_split

# Split X / y into train and test parts with test_size=0.33;
# random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print(X_train)
print(y_train)
[[4 5]
[0 1]
[6 7]]
[2, 0, 3]
# The "\n" separator puts the test labels on their own line below the
# test features.
print(X_test, "\n", y_test)
[[2 3]
[8 9]]
[1, 4]
クロス検証の指標の計算
cross_val_score
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score

# Score a Lasso model on the first 150 samples returned by load_diabetes().
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
lasso = linear_model.Lasso()
# cv=3 is passed explicitly: the output shown below has three scores, but
# scikit-learn >= 0.22 uses 5 folds when cv is left at its default.
cross_val_score(lasso, X, y, cv=3)
array([ 0.33150734, 0.08022311, 0.03531764])
cross_validate関数とマルチメトリック評価
# Rebuild the toy dataset: a 5x2 array holding 0..9 and the labels 0..4.
X = np.arange(10).reshape((5, 2))
y = range(5)
print(X)
print(y)
[[0 1]
[2 3]
[4 5]
[6 7]
[8 9]]
range(0, 5)
from sklearn.model_selection import train_test_split

# Same split as before: test_size=0.33 with a fixed random_state.
splits = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = splits
print(X_train)
print(y_train)
[[4 5]
[0 1]
[6 7]]
[2, 0, 3]
# The "\n" separator puts the test labels on their own line below the
# test features.
print(X_test, "\n", y_test)
[[2 3]
[8 9]]
[1, 4]
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score

# Score a Lasso model on the first 150 samples returned by load_diabetes().
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
lasso = linear_model.Lasso()
# cv=3 is passed explicitly: the output shown below has three scores, but
# scikit-learn >= 0.22 uses 5 folds when cv is left at its default.
cross_val_score(lasso, X, y, cv=3)
array([ 0.33150734, 0.08022311, 0.03531764])
クロス検証による予測の取得
from sklearn.model_selection import cross_val_predict

# Each prediction comes from a model fitted on the folds that did not
# contain that sample. cv=3 is pinned so the result does not depend on the
# library's default fold count (3 before scikit-learn 0.22, 5 afterwards).
y_pred = cross_val_predict(lasso, X, y, cv=3)
y_pred
array([ 174.26933996, 117.6539241 , 164.60228641, 155.65049088,
132.68647979, 128.49511245, 120.76146877, 141.069413 ,
164.18904498, 182.37394949, 111.04181265, 127.94311443,
135.0869234 , 162.83066014, 135.3573514 , 157.64516523,
178.95843326, 163.3919841 , 143.85237903, 144.29748882,
133.58117218, 124.77928571, 132.90918003, 208.52927 ,
153.61908967, 154.16616341, 118.95351821, 163.50467541,
145.89406196, 168.3308101 , 155.87411031, 123.45960148,
185.70459144, 133.38468582, 117.2789469 , 150.27895019,
174.1541028 , 160.03235091, 192.31389633, 161.58568256,
154.2224809 , 119.35517679, 146.15706413, 133.82056934,
179.68118754, 137.96619936, 146.07788398, 126.77579723,
123.32101099, 166.26710247, 146.41559964, 161.67261029,
147.47731459, 138.44595305, 144.85421048, 113.77990664,
185.54970402, 115.31624749, 142.23672103, 171.07792136,
132.5394716 , 177.80524864, 116.5616502 , 134.25230846,
142.88707475, 173.2830912 , 154.31273504, 149.16680759,
144.88238997, 121.97783103, 110.38457621, 180.25559631,
199.06141058, 151.1195546 , 161.14217698, 153.96960812,
150.77179755, 113.30903579, 165.15755771, 115.85735727,
174.19267171, 150.12027233, 115.47891783, 153.38967232,
115.31573467, 156.49909623, 92.62211515, 178.15649994,
131.59320715, 134.46166754, 116.97678633, 190.00790119,
166.01173292, 126.25944471, 134.29256991, 144.71971963,
190.9769591 , 182.39199466, 154.45325308, 148.30325558,
151.72036937, 124.12825466, 138.6011155 , 137.75891286,
123.0917243 , 131.74735403, 112.07367481, 124.56956904,
156.78432061, 128.63135591, 93.68260079, 130.54324394,
131.8693231 , 154.5708257 , 179.81343019, 165.78130755,
150.04779033, 162.37974736, 143.92996797, 143.15645843,
125.20161377, 145.99590279, 155.3505536 , 145.97574185,
134.66120515, 163.92450638, 101.92329396, 139.33014324,
122.71377023, 152.20573113, 153.36931089, 116.76545147,
131.96936127, 109.74817383, 132.57453994, 159.38030328,
109.31343881, 147.69926269, 156.3664255 , 161.12509958,
128.16523686, 156.78446286, 154.04375702, 124.83705022,
143.85606595, 143.23651701, 147.76316913, 154.21572891,
129.07895017, 157.79644923])
クロス検証イテレータ
クロス検証イテレータ — データを反復処理する
K分割(KFold)
from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y= np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
kf.get_n_splits(X)
2
print(kf)
KFold(n_splits=2, random_state=None, shuffle=False)
# Walk over every fold and show the selected indices, then the
# corresponding rows of X with their labels (labels on a new line).
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    print(" :", X[train_index], "\n", y[train_index])
    print(" :", X[test_index], "\n", y[test_index])
TRAIN: [2 3] TEST: [0 1]
: [[1 2]
[3 4]]
[3 4]
: [[1 2]
[3 4]]
[1 2]
TRAIN: [0 1] TEST: [2 3]
: [[1 2]
[3 4]]
[1 2]
: [[1 2]
[3 4]]
[3 4]
繰り返しK分割(RepeatedKFold)
from sklearn.model_selection import RepeatedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
# 2 splits repeated 2 times; random_state is fixed for repeatable output.
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)
for train_index, test_index in rkf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Use the index arrays to slice out the actual train / test data.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1] TEST: [2 3]
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]
1個抜きクロス検証(Leave-One-Out, LOO)
from sklearn.model_selection import LeaveOneOut
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 2])
loo = LeaveOneOut()
loo.get_n_splits(X)
2
loo
LeaveOneOut()
# Show the index split and the sliced data for every LeaveOneOut iteration.
for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
TRAIN: [1] TEST: [0]
[[3 4]] [[1 2]] [2] [1]
TRAIN: [0] TEST: [1]
[[1 2]] [[3 4]] [1] [2]
P個抜きクロス検証(Leave-P-Out, LPO)
from sklearn.model_selection import LeavePOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
lpo = LeavePOut(2)
lpo.get_n_splits(X)
print(lpo)
# Show the train / test indices of each split and slice the data with them.
for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
LeavePOut(p=2)
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 1] TEST: [2 3]
ランダム置換クロス検証(ShuffleSplit)
from sklearn.model_selection import ShuffleSplit
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)
3
# Print the train / test indices of each ShuffleSplit iteration.
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
TRAIN: [3 1 0] TEST: [2]
TRAIN: [2 1 3] TEST: [0]
TRAIN: [0 2 1] TEST: [3]
# Same splitter, but with train_size given explicitly alongside test_size.
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                  random_state=0)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
TRAIN: [3 1] TEST: [2]
TRAIN: [2 1] TEST: [0]
TRAIN: [0 2] TEST: [3]
クラスラベルに基づく層化クロス検証イテレータ
層化K分割(StratifiedKFold)
各分割では、各クラスのサンプルの割合がデータセット全体とほぼ同じになる。
from sklearn.model_selection import StratifiedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)
2
# The labels y are passed to split() together with X.
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]
層化シャッフル分割(StratifiedShuffleSplit)
ランダムな分割を作成するが、各分割内のクラスの割合はデータセット全体と同じに保たれる。
from sklearn.model_selection import StratifiedShuffleSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
sss.get_n_splits(X, y)
3
# Print each split's indices and slice the data with them.
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
TRAIN: [1 2] TEST: [3 0]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 2] TEST: [3 1]
グループ化されたデータのためのクロス検証イテレータ
グループK分割(GroupKFold)
from sklearn.model_selection import GroupKFold
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
groups = np.array([0, 0, 2, 2])
group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(X, y, groups)
2
# The groups array is passed to split() together with X and y.
for train_index, test_index in group_kfold.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
TRAIN: [0 1] TEST: [2 3]
[[1 2]
[3 4]] [[5 6]
[7 8]] [1 2] [3 4]
TRAIN: [2 3] TEST: [0 1]
[[5 6]
[7 8]] [[1 2]
[3 4]] [3 4] [1 2]
1グループ抜きクロス検証(LeaveOneGroupOut)
from sklearn.model_selection import LeaveOneGroupOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
groups = np.array([1, 1, 2, 2])
logo = LeaveOneGroupOut()
logo.get_n_splits(X, y, groups)
logo.get_n_splits(groups=groups)
2
# Print each split's indices followed by the sliced train / test data.
for train_index, test_index in logo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
TRAIN: [2 3] TEST: [0 1]
[[5 6]
[7 8]] [[1 2]
[3 4]] [1 2] [1 2]
TRAIN: [0 1] TEST: [2 3]
[[1 2]
[3 4]] [[5 6]
[7 8]] [1 2] [1 2]
Pグループ抜きクロス検証(LeavePGroupsOut)
from sklearn.model_selection import LeavePGroupsOut
X = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array([1, 2, 1])
groups = np.array([1, 2, 3])
lpgo = LeavePGroupsOut(n_groups=2)
lpgo.get_n_splits(X, y, groups)
lpgo.get_n_splits(groups=groups)
3
# Print each split's indices followed by the sliced train / test data.
for train_index, test_index in lpgo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
TRAIN: [2] TEST: [0 1]
[[5 6]] [[1 2]
[3 4]] [1] [1 2]
TRAIN: [1] TEST: [0 2]
[[3 4]] [[1 2]
[5 6]] [2] [1 1]
TRAIN: [0] TEST: [1 2]
[[1 2]] [[3 4]
[5 6]] [1] [2 1]
グループシャッフル分割(GroupShuffleSplit)
from sklearn.model_selection import GroupShuffleSplit

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
# Print the train and test index arrays of every split.
for train, test in gss.split(X, y, groups=groups):
    print("%s %s" % (train, test))
[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]
事前定義されたフォールド分割/検証セット
時系列データへのクロス検証の適用
時系列分割(TimeSeriesSplit)
from sklearn.model_selection import TimeSeriesSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)
TimeSeriesSplit(max_train_size=None, n_splits=3)
# Print each split's indices and slice the data with them.
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]