sklearnのモデル選択と評価のクロス検証

57142 ワード

import numpy as np

クロス検証:推定器(学習器)のパフォーマンスを評価する


データをトレーニングセットとテストセットに分割

# Toy data: X holds the integers 0..9 reshaped to 5 rows x 2 columns,
# and y is range(5), i.e. one target value per row of X.
X, y = np.arange(10).reshape((5, 2)), range(5)
print(X)
print(y)
[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
range(0, 5)
from sklearn.model_selection import train_test_split
# Split X and y into train/test parts with test_size=0.33; random_state=42
# fixes the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.33,random_state=42)
print(X_train)
print(y_train)
[[4 5]
 [0 1]
 [6 7]]
[2, 0, 3]
# Show the held-out test split: the feature rows first, then the labels on a
# new line. The "\n" separator must stay escaped inside the string literal;
# a raw line break inside the quotes is a syntax error.
print(X_test, "\n", y_test)
[[2 3]
 [8 9]] 
 [1, 4]

クロス検証の指標の計算


cross_val_score

from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
diabetes = datasets.load_diabetes()
# Use only the first 150 samples of the diabetes data and targets.
X= diabetes.data[:150]
y = diabetes.target[:150]
lasso = linear_model.Lasso()
# Cross-validate the Lasso estimator with the default cv and scoring settings;
# the captured output below shows one score per fold.
cross_val_score(lasso,X,y)
array([ 0.33150734,  0.08022311,  0.03531764])

cross_validate関数とマルチメトリック評価

  • 評価に複数の指標を指定できる.
  • テストスコアに加えて、フィット時間(fit-times)、スコア計算時間(score-times)、および(オプションで)トレーニングスコアを含む辞書を返す.

    クロス検証による予測の取得

    from sklearn.model_selection import cross_val_predict
    # Collect cross-validated predictions for the same estimator and data;
    # the captured output below is an array of predicted target values.
    y_pred = cross_val_predict(lasso, X, y)
    y_pred
    
    array([ 174.26933996,  117.6539241 ,  164.60228641,  155.65049088,
            132.68647979,  128.49511245,  120.76146877,  141.069413  ,
            164.18904498,  182.37394949,  111.04181265,  127.94311443,
            135.0869234 ,  162.83066014,  135.3573514 ,  157.64516523,
            178.95843326,  163.3919841 ,  143.85237903,  144.29748882,
            133.58117218,  124.77928571,  132.90918003,  208.52927   ,
            153.61908967,  154.16616341,  118.95351821,  163.50467541,
            145.89406196,  168.3308101 ,  155.87411031,  123.45960148,
            185.70459144,  133.38468582,  117.2789469 ,  150.27895019,
            174.1541028 ,  160.03235091,  192.31389633,  161.58568256,
            154.2224809 ,  119.35517679,  146.15706413,  133.82056934,
            179.68118754,  137.96619936,  146.07788398,  126.77579723,
            123.32101099,  166.26710247,  146.41559964,  161.67261029,
            147.47731459,  138.44595305,  144.85421048,  113.77990664,
            185.54970402,  115.31624749,  142.23672103,  171.07792136,
            132.5394716 ,  177.80524864,  116.5616502 ,  134.25230846,
            142.88707475,  173.2830912 ,  154.31273504,  149.16680759,
            144.88238997,  121.97783103,  110.38457621,  180.25559631,
            199.06141058,  151.1195546 ,  161.14217698,  153.96960812,
            150.77179755,  113.30903579,  165.15755771,  115.85735727,
            174.19267171,  150.12027233,  115.47891783,  153.38967232,
            115.31573467,  156.49909623,   92.62211515,  178.15649994,
            131.59320715,  134.46166754,  116.97678633,  190.00790119,
            166.01173292,  126.25944471,  134.29256991,  144.71971963,
            190.9769591 ,  182.39199466,  154.45325308,  148.30325558,
            151.72036937,  124.12825466,  138.6011155 ,  137.75891286,
            123.0917243 ,  131.74735403,  112.07367481,  124.56956904,
            156.78432061,  128.63135591,   93.68260079,  130.54324394,
            131.8693231 ,  154.5708257 ,  179.81343019,  165.78130755,
            150.04779033,  162.37974736,  143.92996797,  143.15645843,
            125.20161377,  145.99590279,  155.3505536 ,  145.97574185,
            134.66120515,  163.92450638,  101.92329396,  139.33014324,
            122.71377023,  152.20573113,  153.36931089,  116.76545147,
            131.96936127,  109.74817383,  132.57453994,  159.38030328,
            109.31343881,  147.69926269,  156.3664255 ,  161.12509958,
            128.16523686,  156.78446286,  154.04375702,  124.83705022,
            143.85606595,  143.23651701,  147.76316913,  154.21572891,
            129.07895017,  157.79644923])
    

    クロス検証反復


    クロス検証反復器-データのループ


    K 分割(K-fold)

    from sklearn.model_selection import KFold
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    y= np.array([1, 2, 3, 4])
    
    kf = KFold(n_splits=2)
    kf.get_n_splits(X)
    
    2
    
    print(kf)
    
    KFold(n_splits=2, random_state=None, shuffle=False)
    
    # Iterate over the folds produced by kf and show, for each one, the index
    # arrays followed by the rows of X and values of y they select.
    for train_index,test_index in kf.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        # Training part of this fold: features, then targets on a new line.
        print(" :",X[train_index],"\n",y[train_index])
        # Test part of this fold, printed the same way.
        print(" :",X[test_index],"\n",y[test_index])
    TRAIN: [2 3] TEST: [0 1]
     : [[1 2]
     [3 4]] 
     [3 4]
     : [[1 2]
     [3 4]] 
     [1 2]
    TRAIN: [0 1] TEST: [2 3]
     : [[1 2]
     [3 4]] 
     [1 2]
     : [[1 2]
     [3 4]] 
     [3 4]
    

    繰り返し K 分割(Repeated K-Fold)

    from sklearn.model_selection import RepeatedKFold
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    y = np.array([0, 0, 1, 1])
    rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)
    
    for train_index, test_index in rkf.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    TRAIN: [0 1] TEST: [2 3]
    TRAIN: [2 3] TEST: [0 1]
    TRAIN: [1 2] TEST: [0 3]
    TRAIN: [0 3] TEST: [1 2]
    

    Leave One Out(LOO)クロス検証

    from sklearn.model_selection import LeaveOneOut
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 2])
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    
    2
    
    loo
    
    LeaveOneOut()
    
    for train_index, test_index in loo.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train, X_test, y_train, y_test)
    
    TRAIN: [1] TEST: [0]
    [[3 4]] [[1 2]] [2] [1]
    TRAIN: [0] TEST: [1]
    [[1 2]] [[3 4]] [1] [2]
    

    Leave P Out(LPO)クロス検証

    from sklearn.model_selection import LeavePOut
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 3, 4])
    lpo = LeavePOut(2)
    lpo.get_n_splits(X)
    print(lpo)
    for train_index, test_index in lpo.split(X):
       print("TRAIN:", train_index, "TEST:", test_index)
       X_train, X_test = X[train_index], X[test_index]
       y_train, y_test = y[train_index], y[test_index]
    
    LeavePOut(p=2)
    TRAIN: [2 3] TEST: [0 1]
    TRAIN: [1 3] TEST: [0 2]
    TRAIN: [1 2] TEST: [0 3]
    TRAIN: [0 3] TEST: [1 2]
    TRAIN: [0 2] TEST: [1 3]
    TRAIN: [0 1] TEST: [2 3]
    

    ランダム置換クロス検証(Shuffle & Split)

    from sklearn.model_selection import ShuffleSplit
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
    rs.get_n_splits(X)
    
    3
    
    for train_index, test_index in rs.split(X):
           print("TRAIN:", train_index, "TEST:", test_index)
    
    TRAIN: [3 1 0] TEST: [2]
    TRAIN: [2 1 3] TEST: [0]
    TRAIN: [0 2 1] TEST: [3]
    
    rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                      random_state=0)
    for train_index, test_index in rs.split(X):
           print("TRAIN:", train_index, "TEST:", test_index)
    
    TRAIN: [3 1] TEST: [2]
    TRAIN: [2 1] TEST: [0]
    TRAIN: [0 2] TEST: [3]
    

    クラスラベルに基づいて、階層化されたクロス検証反復器


    層化 K 分割(Stratified K-Fold)


    各分割(fold)に含まれる各クラスのサンプルの割合は、データセット全体での割合とほぼ同じになる.
    from sklearn.model_selection import StratifiedKFold
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    y = np.array([0, 0, 1, 1])
    skf = StratifiedKFold(n_splits=2)
    skf.get_n_splits(X, y)
    
    2
    
    for train_index, test_index in skf.split(X, y):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    TRAIN: [1 3] TEST: [0 2]
    TRAIN: [0 2] TEST: [1 3]
    

    層化シャッフル分割(Stratified Shuffle Split)


    分割を作成しますが、各分割内の各クラスの割合は、データセット全体での割合と同じになるように保たれます.
    from sklearn.model_selection import StratifiedShuffleSplit
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    y = np.array([0, 0, 1, 1])
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
    sss.get_n_splits(X, y)
    
    3
    
    for train_index, test_index in sss.split(X, y):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    TRAIN: [1 2] TEST: [3 0]
    TRAIN: [0 2] TEST: [1 3]
    TRAIN: [0 2] TEST: [3 1]
    

    データのグループ化に使用されるクロス検証反復器


    グループk-fold

    from sklearn.model_selection import GroupKFold
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 3, 4])
    groups = np.array([0, 0, 2, 2])
    group_kfold = GroupKFold(n_splits=2)
    group_kfold.get_n_splits(X, y, groups)
    
    2
    
    for train_index, test_index in group_kfold.split(X, y, groups):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train, X_test, y_train, y_test)
    
    TRAIN: [0 1] TEST: [2 3]
    [[1 2]
     [3 4]] [[5 6]
     [7 8]] [1 2] [3 4]
    TRAIN: [2 3] TEST: [0 1]
    [[5 6]
     [7 8]] [[1 2]
     [3 4]] [3 4] [1 2]
    

    Leave One Group Out クロス検証

    from sklearn.model_selection import LeaveOneGroupOut
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([1, 1, 2, 2])
    logo = LeaveOneGroupOut()
    logo.get_n_splits(X, y, groups)
    
    logo.get_n_splits(groups=groups)
    
    2
    
    for train_index, test_index in logo.split(X, y, groups):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train, X_test, y_train, y_test)
    
    TRAIN: [2 3] TEST: [0 1]
    [[5 6]
     [7 8]] [[1 2]
     [3 4]] [1 2] [1 2]
    TRAIN: [0 1] TEST: [2 3]
    [[1 2]
     [3 4]] [[5 6]
     [7 8]] [1 2] [1 2]
    

    Leave P Groups Out クロス検証

    from sklearn.model_selection import LeavePGroupsOut
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([1, 2, 1])
    groups = np.array([1, 2, 3])
    lpgo = LeavePGroupsOut(n_groups=2)
    lpgo.get_n_splits(X, y, groups)
    
    lpgo.get_n_splits(groups=groups)
    
    3
    
    for train_index, test_index in lpgo.split(X, y, groups):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train, X_test, y_train, y_test)
    
    TRAIN: [2] TEST: [0 1]
    [[5 6]] [[1 2]
     [3 4]] [1] [1 2]
    TRAIN: [1] TEST: [0 2]
    [[3 4]] [[1 2]
     [5 6]] [2] [1 1]
    TRAIN: [0] TEST: [1 2]
    [[1 2]] [[3 4]
     [5 6]] [1] [2 1]
    

    グループ分割(ランダム)

    from sklearn.model_selection import GroupShuffleSplit
    
    X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
    y = ["a", "b", "b", "b", "c", "c", "c", "a"]
    groups = [1, 1, 2, 2, 3, 3, 4, 4]
    gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
    for train, test in gss.split(X, y, groups=groups):
        print("%s %s" % (train, test))
    
    [0 1 2 3] [4 5 6 7]
    [2 3 6 7] [0 1 4 5]
    [2 3 4 5] [0 1 6 7]
    [4 5 6 7] [0 1 2 3]
    

    事前定義された分割(Predefined Fold-Splits)/検証セット


    時系列データへのクロス検証の適用


    時系列分割(Time Series Split)

    from sklearn.model_selection import TimeSeriesSplit
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    y = np.array([1, 2, 3, 4])
    tscv = TimeSeriesSplit(n_splits=3)
    print(tscv)  
    
    TimeSeriesSplit(max_train_size=None, n_splits=3)
    
    # Walk through the splits from tscv. In the captured output below each
    # training index set is a growing prefix ([0], [0 1], [0 1 2]) and the
    # test index is the sample that immediately follows it.
    for train_index, test_index in tscv.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        # Materialize the train/test subsets for this split.
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    TRAIN: [0] TEST: [1]
    TRAIN: [0 1] TEST: [2]
    TRAIN: [0 1 2] TEST: [3]