ワイン品質分類
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE: no category is given, so this also hides deprecation notices.
warnings.filterwarnings(action='ignore')
# Load the training and test splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
1.簡単なEDA
print(train.shape, test.shape)
(5497, 14) (1000, 13)
# Preview the first two rows of the training set.
train.head(2)
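| | index | quality | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5 | 5.6 | 0.695 | 0.06 | 6.8 | 0.042 | 9.0 | 84.0 | 0.99432 | 3.44 | 0.44 | 10.2 | white |
| 1 | 1 | 5 | 8.8 | 0.610 | 0.14 | 2.4 | 0.067 | 10.0 | 42.0 | 0.99690 | 3.19 | 0.59 | 9.5 | red |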
test.head(2) # no quality column here.. so quality is presumably the target?
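| | index | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 9.0 | 0.31 | 0.48 | 6.6 | 0.043 | 11.0 | 73.0 | 0.9938 | 2.90 | 0.38 | 11.6 | white |
| 1 | 1 | 13.3 | 0.43 | 0.58 | 1.9 | 0.070 | 15.0 | 40.0 | 1.0004 | 3.06 | 0.49 | 9.0 | red |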
train.info() # fortunately there are no missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5497 entries, 0 to 5496
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 index 5497 non-null int64
1 quality 5497 non-null int64
2 fixed acidity 5497 non-null float64
3 volatile acidity 5497 non-null float64
4 citric acid 5497 non-null float64
5 residual sugar 5497 non-null float64
6 chlorides 5497 non-null float64
7 free sulfur dioxide 5497 non-null float64
8 total sulfur dioxide 5497 non-null float64
9 density 5497 non-null float64
10 pH 5497 non-null float64
11 sulphates 5497 non-null float64
12 alcohol 5497 non-null float64
13 type 5497 non-null object
dtypes: float64(11), int64(2), object(1)
memory usage: 601.4+ KB
# Correlation heatmap of the training-set variables.
# Only numeric columns are passed to corr(): at this point `type` still holds
# strings, which pandas 2.x refuses to correlate (older versions silently
# skipped such columns). Selecting numeric dtypes gives the same matrix on
# every pandas version.
plt.figure(figsize=(12, 12))
numeric_train = train.select_dtypes(include='number')
sns.heatmap(data=numeric_train.corr(), annot=True);
# Distribution of each training column at positions 1-12
# (`quality` through `alcohol`), one panel each on a 3x4 grid.
plt.figure(figsize=(12, 12))
for panel, column in enumerate(train.columns[1:13], start=1):
    plt.subplot(3, 4, panel)
    # histplot(kde=True, stat='density') replaces the deprecated
    # sns.distplot: a density-normalised histogram with a KDE curve on top.
    sns.histplot(train[column], kde=True, stat='density')
plt.tight_layout();
# Author's question: why do the "Density" values differ between panels?
# The y-axis is the value of a probability density function; integrating
# that function over the x-axis gives 1.
# One bar chart per feature column (positions 2-12, `fixed acidity` through
# `alcohol`), grouped by the `quality` score. Each chart gets its own figure.
for feature in train.columns[2:13]:
    fig = plt.figure(figsize=(12, 6))
    sns.barplot(x='quality', y=feature, data=train)
# Author's questions: is the bar height the mean or the mode, and is the line
# in the middle of each bar a deviation / variance?
# By default seaborn's barplot draws the mean of each group, and the line is an
# error bar: a bootstrapped confidence interval around that mean, not the variance.
2.データの前処理
# Encode the wine `type` column (white / red) as integers.
# LabelEncoder numbers its classes in sorted order, so red -> 0 and white -> 1.
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
train['type'] = enc.fit_transform(train['type'])
# Reuse the mapping learned on train so both frames share a single encoding.
test['type'] = enc.transform(test['type'])
# Separate the target from the features and drop columns that are not inputs.
# `quality` is the label; `index` looks like a plain row identifier.
# Open question from the author: could any of the remaining features be
# dropped as well, and how would one decide?
train_y = train['quality']
train_x = train.drop(columns=['index', 'quality'])
test_x = test.drop(columns='index')
train_x.shape, train_y.shape, test_x.shape
((5497, 12), (5497,), (1000, 12))
3.モデリング
# from sklearn.ensemble import RandomForestClassifier
# # Declare the model
# model = RandomForestClassifier()
# # Train the model
# model.fit(train_x, train_y)
RandomForestClassifier()
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
# Build the XGBClassifier.
# Author's note: going from 100 to 500 estimators gave a 0.03 increase
# (presumably in the competition score; the metric is not stated).
xgb_params = {'n_estimators': 500, 'random_state': 156}
model = XGBClassifier(**xgb_params)
# Fit on the full training set. No eval_metric or validation set is passed,
# so no AUC evaluation happens here and roc_auc_score is unused in this cell.
# NOTE(review): `quality` labels are passed unmodified; recent xgboost releases
# expect class labels 0..n_classes-1 - confirm against the installed version.
model.fit(train_x, train_y)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=500, n_jobs=8,
num_parallel_tree=1, objective='multi:softprob', predictor='auto',
random_state=156, reg_alpha=0, reg_lambda=1,
scale_pos_weight=None, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
# Predict quality for the test features with the fitted model.
y_pred = model.predict(test_x)
# Fill the sample submission template with the predictions and save it.
submission = pd.read_csv('data/sample_submission.csv')
submission = submission.assign(quality=y_pred)
submission.to_csv('data/wine_quality_2.csv', index=False)
Reference
この問題について(ワイン品質分類), 我々は、より多くの情報をここで見つけました https://velog.io/@wltn39/와인-품질-분류
テキストは自由に共有またはコピーできます。ただし、このドキュメントのURLは参考URLとして残しておいてください。
Collection and Share based on the CC Protocol