Titanic Top 4% with ensemble modeling(2)
Filling missing values
Age
# Explore how Age varies with Sex, Pclass, Parch and SibSp using box plots.
# NOTE: `sns.factorplot` was renamed to `sns.catplot` in seaborn 0.9 and
# removed in 0.12; `catplot` accepts the same arguments used here.
g = sns.catplot(y="Age", x="Sex", data=dataset, kind="box")
g = sns.catplot(y="Age", x="Sex", hue="Pclass", data=dataset, kind="box")
g = sns.catplot(y="Age", x="Parch", data=dataset, kind="box")
g = sns.catplot(y="Age", x="SibSp", data=dataset, kind="box")

# Encode Sex numerically (male -> 0, female -> 1) so that it can take part in
# the correlation matrix below.
dataset["Sex"] = dataset["Sex"].map({"male": 0, "female": 1})

# Annotated correlation heatmap of Age against the candidate predictors.
g = sns.heatmap(
    dataset[["Age", "Sex", "SibSp", "Parch", "Pclass"]].corr(),
    cmap="BrBG",
    annot=True,
)
# Fill missing Age values.
# Each missing Age is replaced by the median Age of the rows that share the
# same Pclass, Parch and SibSp. When that group has no known Age at all, the
# median of the whole Age column (as filled so far) is used instead.

# Index labels of the rows whose Age is NaN (assumes index labels are unique).
index_NaN_age = list(dataset.index[dataset["Age"].isnull()])

for i in index_NaN_age:
    # Boolean mask of the rows with the same SibSp / Parch / Pclass as row i.
    similar_rows = (
        (dataset["SibSp"] == dataset.loc[i, "SibSp"])
        & (dataset["Parch"] == dataset.loc[i, "Parch"])
        & (dataset["Pclass"] == dataset.loc[i, "Pclass"])
    )
    age_pred = dataset.loc[similar_rows, "Age"].median()
    if np.isnan(age_pred):
        # No known Age among similar rows: fall back to the overall median.
        # Computed only when needed instead of on every iteration.
        age_pred = dataset["Age"].median()
    # Write with a single label-based .loc call. The former chained form
    # `dataset["Age"].iloc[i] = ...` may assign to a temporary copy (and never
    # updates `dataset` under pandas Copy-on-Write); it also used index labels
    # as positions.
    dataset.loc[i, "Age"] = age_pred
# Age distribution per Survived value in `train`, first as a box plot and then
# as a violin plot.
# NOTE: `sns.factorplot` was renamed to `sns.catplot` in seaborn 0.9 and
# removed in 0.12; `catplot` accepts the same arguments used here.
g = sns.catplot(x="Survived", y="Age", data=train, kind="box")
g = sns.catplot(x="Survived", y="Age", data=train, kind="violin")
Feature Engineering
Name/Title
dataset["Name"].head()

# Get Title from Name: the text between the first comma and the next period,
# with surrounding whitespace removed.
dataset_title = [name.split(",")[1].split(".")[0].strip() for name in dataset["Name"]]
# Assign the list directly (row order) rather than through pd.Series(...),
# which would align on a fresh 0..n-1 index and misplace values (or insert
# NaN) whenever `dataset` does not have that default index.
dataset["Title"] = dataset_title
dataset["Title"].head()

# Frequency of every raw title; tick labels rotated for readability.
g = sns.countplot(x="Title", data=dataset)
g = plt.setp(g.get_xticklabels(), rotation=45)

# Convert Title to categorical codes:
# uncommon titles are first merged into "Rare", then
# Master -> 0, Miss/Ms/Mme/Mlle/Mrs -> 1, Mr -> 2, Rare -> 3.
dataset["Title"] = dataset["Title"].replace(
    ["Lady", "the Countess", "Countess", "Capt", "Col", "Don", "Dr",
     "Major", "Rev", "Sir", "Jonkheer", "Dona"],
    "Rare",
)
dataset["Title"] = dataset["Title"].map(
    {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
)
# astype(int) raises if any title was not covered by the mapping above (NaN).
dataset["Title"] = dataset["Title"].astype(int)

# Frequency of the encoded titles. The column is passed by keyword because
# recent seaborn versions treat the first positional argument as `data`.
g = sns.countplot(x="Title", data=dataset)
g = g.set_xticklabels(["Master", "Miss/Ms/Mme/Mlle/Mrs", "Mr", "Rare"])

# Mean of Survived per encoded title.
# NOTE: `sns.factorplot` was renamed to `sns.catplot` in seaborn 0.9 and
# removed in 0.12; `catplot` accepts the same arguments used here.
g = sns.catplot(x="Title", y="Survived", data=dataset, kind="bar")
g = g.set_xticklabels(["Master", "Miss-Mrs", "Mr", "Rare"])
g = g.set_ylabels("survival probability")

# Drop Name variable now that Title has been extracted from it.
dataset.drop(labels=["Name"], axis=1, inplace=True)
Family Size
# Create a family size descriptor from SibSp and Parch
# (the + 1 counts the passenger themselves).
dataset["Fsize"] = dataset["SibSp"] + dataset["Parch"] + 1

# Mean of Survived per family size.
# NOTE: `sns.factorplot` was renamed to `sns.catplot` in seaborn 0.9 and
# removed in 0.12. `factorplot` defaulted to kind="point" whereas `catplot`
# defaults to a strip plot, so the kind is now given explicitly.
g = sns.catplot(x="Fsize", y="Survived", data=dataset, kind="point")
g = g.set_ylabels("Survival Probability")

# Create 0/1 indicator features for four family-size bands:
# Single (1), SmallF (2), MedF (3-4) and LargeF (5 or more).
dataset["Single"] = dataset["Fsize"].map(lambda s: 1 if s == 1 else 0)
dataset["SmallF"] = dataset["Fsize"].map(lambda s: 1 if s == 2 else 0)
dataset["MedF"] = dataset["Fsize"].map(lambda s: 1 if 3 <= s <= 4 else 0)
dataset["LargeF"] = dataset["Fsize"].map(lambda s: 1 if s >= 5 else 0)

# One bar plot of the mean of Survived per indicator, in the same order as
# the indicators were created.
for band in ("Single", "SmallF", "MedF", "LargeF"):
    g = sns.catplot(x=band, y="Survived", data=dataset, kind="bar")
    g = g.set_ylabels("Survival Probability")

# Convert Title and Embarked to indicator (one-hot) columns.
dataset = pd.get_dummies(dataset, columns=["Title"])
dataset = pd.get_dummies(dataset, columns=["Embarked"], prefix="Em")
Cabin
dataset["Cabin"].head()
dataset["Cabin"].describe()
dataset["Cabin"].isnull().sum()  # 1007 (count recorded by the original author)
dataset["Cabin"][dataset["Cabin"].notnull()].head()

# Replace the Cabin value by its first character, or 'X' when it is missing.
# The vectorised form keeps the DataFrame's own index; the former
# pd.Series([...]) assignment aligned on a fresh 0..n-1 index and would
# misplace values whenever `dataset` does not have that default index.
dataset["Cabin"] = dataset["Cabin"].str[0].fillna("X")

cabin_order = ["A", "B", "C", "D", "E", "F", "G", "T", "X"]

# Frequency of each cabin letter. The column is passed by keyword because
# recent seaborn versions treat the first positional argument as `data`.
g = sns.countplot(x="Cabin", data=dataset, order=cabin_order)

# Mean of Survived per cabin letter.
# NOTE: `sns.factorplot` was renamed to `sns.catplot` in seaborn 0.9 and
# removed in 0.12; `catplot` accepts the same arguments used here.
g = sns.catplot(y="Survived", x="Cabin", data=dataset, kind="bar", order=cabin_order)
g = g.set_ylabels("Survival Probability")

# One-hot encode the cabin letter.
dataset = pd.get_dummies(dataset, columns=["Cabin"], prefix="Cabin")
Ticket
dataset["Ticket"].head()

# Reduce every ticket to its prefix: dots and slashes are removed, the result
# is stripped, and the part before the first space is kept. Tickets made of
# digits only have no prefix and become "X".
Ticket = [
    "X" if raw.isdigit()
    else raw.replace(".", "").replace("/", "").strip().split(" ")[0]
    for raw in dataset.Ticket
]
dataset["Ticket"] = Ticket
dataset["Ticket"].head()

# One-hot encode the ticket prefix.
dataset = pd.get_dummies(dataset, prefix="T", columns=["Ticket"])

# Treat Pclass as a categorical variable and one-hot encode it as well.
dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset = pd.get_dummies(dataset, prefix="Pc", columns=["Pclass"])

# Drop useless variables.
dataset.drop(labels=["PassengerId"], axis=1, inplace=True)
dataset.head()
Reference
この問題について(Titanic Top 4% with ensemble modeling(2))、我々は、より多くの情報をここで見つけました: https://velog.io/@qsdcfd/Titanic-Top-4-with-ensemble-modeling2 テキストは自由に共有またはコピーできます。ただし、このドキュメントのURLは参考URLとして残しておいてください。
Collection and Share based on the CC Protocol