machine learning/Predict survival on the Titanic- using scikit-learn
The Titanic dataset challenge
The goal is to predict passenger survival from attributes such as age, sex, passenger class, and port of embarkation.
Download train.csv and test.csv from the Kaggle Titanic challenge and save the two files in the dataset directory as titanic_train.csv and titanic_test.csv.
1. Load the data
import pandas as pd
train_data = pd.read_csv("./titanic_train.csv")
test_data = pd.read_csv("./titanic_test.csv")
2. Explore the data
Display train_data
train_data.head()
   PassengerId  Survived  Pclass                                                Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                             Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                              Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                            Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
Check for missing data
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
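The info() output already shows that Age, Cabin, and Embarked are incomplete; a minimal sketch to see the missing counts per column directly:
# Number of missing values in each column of the training set.
train_data.isnull().sum()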
Display summary statistics
train_data.describe()
       PassengerId    Survived      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
Verify that Survived (the learning target) takes only the values 0 and 1
train_data["Survived"].value_counts()
0 549
1 342
Name: Survived, dtype: int64
Identify the categorical attributes
train_data["Pclass"].value_counts()
3 491
1 216
2 184
Name: Pclass, dtype: int64
train_data["Sex"].value_counts()
male 577
female 314
Name: Sex, dtype: int64
train_data["Embarked"].value_counts()
S 644
C 168
Q 77
Name: Embarked, dtype: int64
The Embarked feature takes the values C = Cherbourg, Q = Queenstown, and S = Southampton.
3. Preprocessing pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import numpy as np
train_data.head()
(Output is the same head() table shown above.)
X_train = train_data.drop("Survived", axis=1)
y_train = train_data["Survived"].copy()
X_train.shape
(891, 11)
#train_data['RelativesOnboard'] = train_data['SibSp'] + train_data['Parch']+1
# train_data["AgeBucket"] = train_data["Age"] // 15 * 15
# train_data[["AgeBucket", "Survived"]].groupby(['AgeBucket']).mean()
X_train.values[:, 5].shape
(891,)
from sklearn.base import BaseEstimator, TransformerMixin

col_names = "SibSp", "Parch"
num_attribs = ['SibSp', 'Parch', 'Fare']
# column indices
SibSp_ix, Parch_ix = [num_attribs.index(c) for c in col_names]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):  # no *args or **kwargs
        pass
    def fit(self, X, y=None):
        return self  # nothing to fit
    def transform(self, X):
        # RelativesOnboard = SibSp + Parch + the passenger themself
        RelativesOnboard = X[:, SibSp_ix] + X[:, Parch_ix] + 1
        return np.c_[X, RelativesOnboard]

from sklearn.base import BaseEstimator, TransformerMixin
# train_data["AgeBucket"] = train_data["Age"] // 15 * 15
class AgetoCategory(BaseEstimator, TransformerMixin):
    def __init__(self):  # no *args or **kwargs
        pass
    def fit(self, X, y=None):
        return self  # nothing to fit
    def transform(self, X):
        # bucket ages into 15-year bins (0, 15, 30, ...)
        AgeBucket = X // 15 * 15
        return np.c_[AgeBucket]
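As a quick sanity check (a sketch that is not part of the original notebook; the sample arrays are made up), the two transformers can be exercised on tiny NumPy arrays:
import numpy as np  # already imported above

# Toy input with columns in num_attribs order: [SibSp, Parch, Fare]
sample = np.array([[1, 0, 7.25],
                   [0, 2, 53.10]])
print(CombinedAttributesAdder().transform(sample))
# -> appends RelativesOnboard = SibSp + Parch + 1 as a fourth column (2 and 3 here)

print(AgetoCategory().transform(np.array([[22.0], [38.0], [80.0]])))
# -> ages mapped to 15-year buckets: 15, 30, 75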
# Age pipeline: 1. impute missing values with the median,
# 2. bucket ages into categories, 3. one-hot encode the buckets
age_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("age_cat", AgetoCategory()),
    ("cat_encoder", OneHotEncoder(sparse=False))  # on scikit-learn >= 1.2 use sparse_output=False
])
# Categorical pipeline: 1. impute missing values with the most frequent value,
# 2. one-hot encode
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse=False))
])
# Numerical pipeline: 1. impute missing values with the median,
# 2. add the combined RelativesOnboard attribute
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder())
])
age_attribs = ["Age"]
num_attribs = ['SibSp', 'Parch', 'Fare']
cat_attribs = ['Pclass', 'Sex', 'Embarked']
preprocess_pipeline = ColumnTransformer([
    ("age", age_pipeline, age_attribs),
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])
X_train_prepared = preprocess_pipeline.fit_transform(X_train)
X_train_prepared.shape
(891, 18)
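The 18 columns break down as 6 one-hot age buckets (0, 15, 30, 45, 60, 75), the 3 numeric columns plus RelativesOnboard, and 8 one-hot categorical columns (3 Pclass + 2 Sex + 3 Embarked). A minimal sketch, assuming the fitted pipeline above, to confirm the per-transformer counts:
# Count the output columns produced by each fitted sub-pipeline.
for name, cols in [("age", age_attribs), ("num", num_attribs), ("cat", cat_attribs)]:
    fitted = preprocess_pipeline.named_transformers_[name]
    print(name, cols, "->", fitted.transform(X_train[cols]).shape[1], "columns")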
4. Model selection, training, and evaluation (cross-validation)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
log_reg = LogisticRegression(solver="liblinear" , random_state = 42)
log_reg.fit(X_train_prepared, y_train)
LogisticRegression(random_state=42, solver='liblinear')
cross_val_score(log_reg, X_train_prepared, y_train, cv=3, scoring="accuracy")
array([0.78114478, 0.79461279, 0.7979798 ])
y_predict_lr = cross_val_predict(log_reg, X_train_prepared, y_train, cv=3)
precision_score(y_train, y_predict_lr)
0.7516129032258064
recall_score(y_train, y_predict_lr)
0.6812865497076024
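For a like-for-like comparison with the models evaluated below, the same F1 and ROC AUC metrics can be computed for the logistic regression baseline (a sketch; outputs not reproduced here):
# F1 from the out-of-fold predictions, ROC AUC from decision-function scores.
print(f1_score(y_train, y_predict_lr))
y_score_lr = cross_val_predict(log_reg, X_train_prepared, y_train, cv=3,
                               method="decision_function")
print(roc_auc_score(y_train, y_score_lr))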
svm_clf = SVC(random_state=42)
svm_clf.fit(X_train_prepared, y_train)
SVC(random_state=42)
cross_val_score(svm_clf, X_train_prepared, y_train, cv=3, scoring="accuracy")
array([0.62962963, 0.69023569, 0.68013468])
knn_clf = KNeighborsClassifier(n_neighbors = 3)
knn_clf.fit(X_train_prepared, y_train)
KNeighborsClassifier(n_neighbors=3)
cross_val_score(knn_clf, X_train_prepared, y_train, cv=3, scoring="accuracy")
array([0.74410774, 0.78114478, 0.75084175])
sgd_clf = SGDClassifier(random_state = 42)
sgd_clf.fit(X_train_prepared, y_train)
SGDClassifier(random_state=42)
cross_val_score(sgd_clf, X_train_prepared, y_train, cv=3, scoring="accuracy")
array([0.72390572, 0.4006734 , 0.41077441])
y_predict_sgd = cross_val_predict(sgd_clf, X_train_prepared, y_train, cv=3)
precision_score(y_train, y_predict_sgd)
0.4340425531914894
recall_score(y_train, y_predict_sgd)
0.8947368421052632
f1_score(y_train, y_predict_sgd)
0.5845272206303724
y_score_sgd = cross_val_predict(sgd_clf, X_train_prepared, y_train, cv=3, method="decision_function")
roc_auc_score(y_train, y_score_sgd)
0.6755025085482377
forest_clf = RandomForestClassifier(random_state = 42)
forest_clf.fit(X_train_prepared, y_train)
RandomForestClassifier(random_state=42)
cross_val_score(forest_clf, X_train_prepared, y_train, cv=3, scoring="accuracy")
array([0.81481481, 0.7979798 , 0.82154882])
y_predict_forest = cross_val_predict(forest_clf, X_train_prepared, y_train, cv=3)
precision_score(y_train, y_predict_forest)
0.7636363636363637
recall_score(y_train, y_predict_forest)
0.7368421052631579
f1_score(y_train, y_predict_forest)
0.7499999999999999
y_score_forest = cross_val_predict(forest_clf, X_train_prepared, y_train, cv=3, method="predict_proba")
y_score_forest = y_score_forest[:, 1]  # probability of the positive class
roc_auc_score(y_train, y_score_forest)
0.8496522118897729
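The score vectors computed above for the SGD classifier and the random forest can also be plotted as ROC curves to see where the forest gains its advantage (a minimal sketch):
from sklearn.metrics import roc_curve

fpr_sgd, tpr_sgd, _ = roc_curve(y_train, y_score_sgd)    # decision-function scores
fpr_rf, tpr_rf, _ = roc_curve(y_train, y_score_forest)   # positive-class probabilities

plt.plot(fpr_sgd, tpr_sgd, label="SGD")
plt.plot(fpr_rf, tpr_rf, label="Random Forest")
plt.plot([0, 1], [0, 1], "k--")  # chance line
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.show()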
param_grid = [
    {'n_estimators': [100, 200, 300], 'max_features': [2, 4, 6, 8, 10, 12]}
]
grid_search = GridSearchCV(forest_clf, param_grid, cv=5, scoring="accuracy", n_jobs=1)
grid_search.fit(X_train_prepared, y_train)
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=1,
param_grid=[{'max_features': [2, 4, 6, 8, 10, 12],
'n_estimators': [100, 200, 300]}],
scoring='accuracy')
grid_search.best_params_
{'max_features': 8, 'n_estimators': 100}
grid_search.best_score_
0.8226727763480006
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
0.8125729709371665 {'max_features': 2, 'n_estimators': 100}
0.8125855250768941 {'max_features': 2, 'n_estimators': 200}
0.8137028435126483 {'max_features': 2, 'n_estimators': 300}
0.8103320569957944 {'max_features': 4, 'n_estimators': 100}
0.8136902893729208 {'max_features': 4, 'n_estimators': 200}
0.8148138848785388 {'max_features': 4, 'n_estimators': 300}
0.8170673529596384 {'max_features': 6, 'n_estimators': 100}
0.8193019898311468 {'max_features': 6, 'n_estimators': 200}
0.8170610758897746 {'max_features': 6, 'n_estimators': 300}
0.8226727763480006 {'max_features': 8, 'n_estimators': 100}
0.8204381394764925 {'max_features': 8, 'n_estimators': 200}
0.8170673529596384 {'max_features': 8, 'n_estimators': 300}
0.82045069361622 {'max_features': 10, 'n_estimators': 100}
0.8182035026049841 {'max_features': 10, 'n_estimators': 200}
0.8148327160881301 {'max_features': 10, 'n_estimators': 300}
0.8204444165463561 {'max_features': 12, 'n_estimators': 100}
0.8181909484652564 {'max_features': 12, 'n_estimators': 200}
0.8170736300295023 {'max_features': 12, 'n_estimators': 300}
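It is also worth asking which inputs the tuned forest relies on. The names below are reconstructed by hand from the order of the three sub-pipelines (age buckets, then numeric columns plus RelativesOnboard, then the one-hot categorical columns), so treat this mapping as an assumption rather than something the pipeline guarantees:
# Hand-built feature names matching the 18 prepared columns (assumed order).
feature_names = (
    ["Age_0", "Age_15", "Age_30", "Age_45", "Age_60", "Age_75"]
    + ["SibSp", "Parch", "Fare", "RelativesOnboard"]
    + ["Pclass_1", "Pclass_2", "Pclass_3", "Sex_female", "Sex_male",
       "Embarked_C", "Embarked_Q", "Embarked_S"]
)
importances = grid_search.best_estimator_.feature_importances_
for name, score in sorted(zip(feature_names, importances), key=lambda t: -t[1]):
    print(f"{name:>17}: {score:.3f}")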
final_model = grid_search.best_estimator_
X_test = preprocess_pipeline.transform(test_data)
y_pred = final_model.predict(X_test)
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.set_size_inches(12, 5)
sns.histplot(y_train, ax=ax1, bins=50)
ax1.set(title="y_train")
sns.histplot(y_pred, ax=ax2, bins=50)
ax2.set(title="y_pred")
[Text(0.5, 1.0, 'y_pred')]
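As a quick numeric complement to the two histograms (a sketch), the survival rate in the training labels can be compared with the rate among the test-set predictions:
# Fraction of survivors in the training labels vs. in the predictions.
print("train survival rate:    ", y_train.mean())
print("predicted survival rate:", y_pred.mean())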
5. Create the submission CSV
submission = pd.read_csv("./gender_submission.csv")
submission
submission["Survived"] = y_pred
print(submission.shape)
submission.head()
(418, 2)
ver = 2
submission.to_csv("./ver_{0}_submission.csv".format(ver), index=False)
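Here gender_submission.csv is only used as a template for the PassengerId column. An equivalent sketch builds the submission directly from the test set (the _alt filename is just for illustration):
# Build the submission DataFrame from test_data instead of the sample file.
submission_alt = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": y_pred,
})
submission_alt.to_csv("./ver_{0}_submission_alt.csv".format(ver), index=False)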
Reference
For more on this topic (machine learning/Predict survival on the Titanic- using scikit-learn), see https://velog.io/@bbkyoo/machine-learningPredict-survival-on-the-Titanic-using-scikit-learn. The text may be freely shared or copied, but please keep this URL as a reference.
Collection and Share based on the CC Protocol