# ---- 1. Load the data ----
import pandas as pd

# Read the Boston housing dataset from the working directory.
data = pd.read_csv("boston_housing.csv")

# First few rows — quick sanity check of columns and values.
# print() makes the output visible when run as a plain script;
# bare expressions only display inside a notebook.
print(data.head())

# Column dtypes and non-null counts (info() writes to stdout itself
# and returns None, so it must not be wrapped in print()).
data.info()

# Missing-value count per column.
print(data.isnull().sum())

# Summary statistics for every numeric column.
print(data.describe())
# ---- 2. Univariate analysis ----
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# NOTE: '%matplotlib inline' is an IPython magic and a syntax error in a
# plain .py file; it is only needed inside a notebook, so it is omitted.
warnings.filterwarnings("ignore")

# Distribution of the target (median home value, in $1000s).
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is the modern equivalent.
plt.figure()
sns.histplot(data["MEDV"].values, bins=30, kde=True)
plt.xlabel('Median value of owner-occupied homes', fontsize=12)
plt.show()

# Price against row index — reveals the band of values capped at 50
# (the dataset's censoring limit).
plt.scatter(range(data.shape[0]), data["MEDV"].values, color='purple')
plt.title("Distribution of Price")
plt.show()

# Drop the censored rows where MEDV is clipped at 50.
data = data[data.MEDV < 50]
print(data.shape)

# CHAS is a dummy variable (presumably 1 = tract bounds the Charles
# River — standard Boston-housing encoding; confirm against the data).
# Modern seaborn requires keyword arguments for countplot.
sns.countplot(x=data["CHAS"])
plt.xlabel('Charles River')
plt.ylabel('Number of occurrences')
plt.show()
# ---- 3. Correlation between feature pairs ----
# Absolute Pearson correlation for every pair of features.
data_corr = data.corr().abs()

# Full annotated heatmap of all pairwise correlations.
plt.subplots(figsize=(13, 9))
sns.heatmap(data_corr, annot=True)

# Overlay that hides the weakly correlated cells.
# BUG FIX: the original mask was `data_corr < 1`, which hid every
# off-diagonal cell and left only the trivial self-correlations of 1.0
# visible — the opposite of "mask unimportant features". Masking below
# 0.5 keeps the strong pairs, matching the threshold used below.
sns.heatmap(data_corr, mask=data_corr < 0.5, cbar=False)
plt.savefig('house_coor.png')
plt.show()
# Column names, indexed in parallel with data_corr's rows/columns.
cols = data.columns

# Keep only pairs whose absolute correlation reaches this threshold.
threshold = 0.5

# Each entry is [correlation, row_index, col_index].
corr_list = []
size = data_corr.shape[0]

# Scan the upper triangle only (j > i) to skip the diagonal and avoid
# counting each pair twice. data_corr holds absolute values, so the
# original negative-correlation branch (v < 0 and v <= -threshold)
# could never fire and has been removed as dead code. The cell value
# is read once instead of three separate .iloc lookups.
for i in range(size):
    for j in range(i + 1, size):
        v = data_corr.iloc[i, j]
        if threshold <= v < 1:
            corr_list.append([v, i, j])

# Strongest correlations first.
s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))

# Report each highly correlated pair by column name.
for v, i, j in s_corr_list:
    print("%s and %s = %.2f" % (cols[i], cols[j], v))
# Scatter plot for every highly correlated pair found above.
# `size=` was deprecated in seaborn 0.9 and later removed; the
# parameter is now called `height=`. plt.show() belongs inside the
# loop so each pair gets its own rendered figure.
for v, i, j in s_corr_list:
    sns.pairplot(data, height=6, x_vars=cols[i], y_vars=cols[j])
    plt.show()