ワイン品質分類


  • 分類の精度を高める方法は?
  • 予測モデルを変更
  • 独立変数を削除する、または重みを付与する(ワインの品質にとって重要な特性はどれか?)
  • ハイパーパラメータ調整?
  • import pandas as pd
    %matplotlib inline
    import matplotlib.pyplot as plt
    import seaborn as sns
    import warnings
    warnings.filterwarnings(action='ignore')
    # Load the train and test splits from the local data/ directory.
    train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))
    1.簡単なEDA
    # Print the (rows, columns) shape of each split.
    print(train.shape, test.shape)
    (5497, 14) (1000, 13)
    # Show the first two training rows.
    train.head(2)
    index
    quality
    fixed acidity
    volatile acidity
    citric acid
    residual sugar
    chlorides
    free sulfur dioxide
    total sulfur dioxide
    density
    pH
    sulphates
    alcohol
    type
    0
    0
    5
    5.6
    0.695
    0.06
    6.8
    0.042
    9.0
    84.0
    0.99432
    3.44
    0.44
    10.2
    white
    1
    1
    5
    8.8
    0.610
    0.14
    2.4
    0.067
    10.0
    42.0
    0.99690
    3.19
    0.59
    9.5
    red
    test.head(2) # no quality column here.. so quality is the target?
    index
    fixed acidity
    volatile acidity
    citric acid
    residual sugar
    chlorides
    free sulfur dioxide
    total sulfur dioxide
    density
    pH
    sulphates
    alcohol
    type
    0
    0
    9.0
    0.31
    0.48
    6.6
    0.043
    11.0
    73.0
    0.9938
    2.90
    0.38
    11.6
    white
    1
    1
    13.3
    0.43
    0.58
    1.9
    0.070
    15.0
    40.0
    1.0004
    3.06
    0.49
    9.0
    red
    train.info()  # fortunately there are no missing values
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 5497 entries, 0 to 5496
    Data columns (total 14 columns):
     #   Column                Non-Null Count  Dtype
    ---  ------                --------------  -----
     0   index                 5497 non-null   int64
     1   quality               5497 non-null   int64
     2   fixed acidity         5497 non-null   float64
     3   volatile acidity      5497 non-null   float64
     4   citric acid           5497 non-null   float64
     5   residual sugar        5497 non-null   float64
     6   chlorides             5497 non-null   float64
     7   free sulfur dioxide   5497 non-null   float64
     8   total sulfur dioxide  5497 non-null   float64
     9   density               5497 non-null   float64
     10  pH                    5497 non-null   float64
     11  sulphates             5497 non-null   float64
     12  alcohol               5497 non-null   float64
     13  type                  5497 non-null   object
    dtypes: float64(11), int64(2), object(1)
    memory usage: 601.4+ KB
    # Heatmap of the pairwise correlations between the train columns.
    # `type` still holds the strings 'white'/'red' here (it is label-encoded
    # only later), and pandas >= 2.0 raises when DataFrame.corr() meets a
    # non-numeric column, so restrict the frame to numeric columns first.
    # Older pandas dropped such columns silently, so the plot is unchanged there.
    plt.figure(figsize=(12, 12))
    sns.heatmap(data=train.select_dtypes(include='number').corr(), annot=True);
    # Distribution of each train column at positions 1..12 (`quality` through
    # `alcohol`), one panel each on a 3x4 subplot grid.
    # sns.distplot is deprecated in seaborn; histplot with kde=True and
    # stat='density' draws the same density-scaled histogram plus KDE curve
    # (kde_kws cut=3 keeps the KDE extending past the data range as before).
    plt.figure(figsize=(12, 12))
    for position, column in enumerate(train.columns[1:13], start=1):
        plt.subplot(3, 4, position)
        sns.histplot(train[column], kde=True, stat='density', kde_kws=dict(cut=3))
    plt.tight_layout();
    # Q: why do the density values differ?
    # A: the y axis of a probability density function is the density;
    #    integrating that function gives 1.
    # Bar plot of every feature at positions 2..12 (`fixed acidity` through
    # `alcohol`) against `quality`, one figure per feature.
    for feature in train.columns[2:13]:
        plt.figure(figsize=(12, 6))
        sns.barplot(x='quality', y=feature, data=train)

    # Q: is the bar height the mean or the mode, and is the centre line a deviation?
    # Q: the error bar(?) - is it the variance?
    # NOTE(review): per the seaborn docs, barplot's default estimator is the
    # mean and the line is an error bar (a confidence interval by default),
    # not the variance - confirm against the installed seaborn version.











    2.データの前処理
    # `type` has two values, white and red; replace them with integer codes.
    # NOTE(review): LabelEncoder numbers classes in sorted order, so this
    # should yield red -> 0 and white -> 1 - confirm via enc.classes_.
    from sklearn.preprocessing import LabelEncoder

    # Fit the mapping on train only, then apply it to both splits.
    enc = LabelEncoder().fit(train['type'])
    train['type'], test['type'] = enc.transform(train['type']), enc.transform(test['type'])
    # Separate the target from the predictors and drop the columns not used as features.
    # Open question: is there any independent variable that could safely be
    # removed, and if so, how would we find out?
    train_y = train['quality']
    train_x = train.drop(columns=['index', 'quality'])
    test_x = test.drop(columns='index')
    train_x.shape, train_y.shape, test_x.shape
    ((5497, 12), (5497,), (1000, 12))
    3.モデリング
    # Disabled RandomForest baseline, kept for reference.
    # from sklearn.ensemble import RandomForestClassifier
    # # declare the model
    # model = RandomForestClassifier()
    
    # # train the model
    # model.fit(train_x, train_y)
    RandomForestClassifier()
    from xgboost import XGBClassifier
    from sklearn.metrics import roc_auc_score
    
    # Create the XGBClassifier object
    model = XGBClassifier(n_estimators=500, random_state=156) # 100 -> 500 (score went up by 0.03..)
    
    # Train the model.
    # NOTE(review): the original note said AUC is set as the evaluation metric,
    # but no eval_metric is passed here and roc_auc_score is not used in the
    # visible code.
    # NOTE(review): newer xgboost releases reportedly require class labels to
    # be 0..K-1; if `quality` does not start at 0 it may need encoding - confirm.
    model.fit(train_x, train_y)
    XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                  colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                  gamma=0, gpu_id=-1, importance_type=None,
                  interaction_constraints='', learning_rate=0.300000012,
                  max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                  monotone_constraints='()', n_estimators=500, n_jobs=8,
                  num_parallel_tree=1, objective='multi:softprob', predictor='auto',
                  random_state=156, reg_alpha=0, reg_lambda=1,
                  scale_pos_weight=None, subsample=1, tree_method='exact',
                  validate_parameters=1, verbosity=None)
    # Predict the test data with the trained model
    y_pred = model.predict(test_x)
    # Create the submission file: put the predictions in the sample file's
    # `quality` column and save it without the row index.
    submission = pd.read_csv('data/sample_submission.csv').assign(quality=y_pred)
    submission.to_csv('data/wine_quality_2.csv', index=False)