人工知能—データ探索操作ノート


1、データの読み取り

import pandas as pd
# Load the housing dataset from the CSV file in the working directory.
csv_path = "boston_housing.csv"
data = pd.read_csv(csv_path)
# Preview the first rows of the table.
data.head()
# Print column names, dtypes and non-null counts.
data.info()
# Count the missing values in each column.
data.isna().sum()
# Descriptive summary statistics per column.
data.describe()

2、単一変数分析

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline

# Silence library warnings so they do not clutter the output.
warnings.filterwarnings("ignore")

# Distribution of the MEDV column: 30-bin histogram with a KDE curve.
# histplot replaces the deprecated distplot; stat="density" keeps the
# y axis on a density scale so the histogram and the KDE stay comparable.
plt.figure()
sns.histplot(data.MEDV.values, bins=30, kde=True, stat="density")
plt.xlabel('Median value of owner-occupied homes', fontsize=12)
plt.show()

# Scatter of MEDV against row position. An explicit figure/show pair keeps
# this plot from sharing axes with the next one when run as a single script.
plt.figure()
plt.scatter(range(data.shape[0]), data["MEDV"].values, color='purple')
plt.title("Distribution of Price")
plt.show()

# Keep only the samples whose MEDV is below 50.
data = data[data.MEDV < 50]
# Shape after filtering -- index 0: number of rows, index 1: number of columns.
data.shape

# Number of samples for each value of the CHAS column.
# The column is passed as x= explicitly: in current seaborn the first
# positional parameter of countplot is `data`, so a bare positional Series
# is no longer interpreted as the categorical variable to count.
plt.figure()
sns.countplot(x=data.CHAS)
plt.xlabel('Charles River')
plt.ylabel('Number of occurrences')
plt.show()

3、特徴量ペア間の相関

# Absolute pairwise correlation between the columns of `data`
# (DataFrame.corr defaults to Pearson); abs() discards the sign.
data_corr = data.corr().abs()

# Annotated heatmap of the full matrix on a 13 x 9 inch figure.
fig, ax = plt.subplots(figsize=(13, 9))
sns.heatmap(data_corr, annot=True, ax=ax)

# Second pass on the same axes: cells where the mask is True are not drawn,
# so only entries that are not below 1 are painted again, and no second
# colour bar is added.
# NOTE(review): the original comment mentions 0.5 and "unimportant features",
# but the cut-off used here is 1 -- confirm which one is intended.
below_one = data_corr < 1
sns.heatmap(data_corr, mask=below_one, cbar=False, ax=ax)

# Save the figure to disk, then display it.
fig.savefig('house_coor.png')
plt.show()
# Take the labels from the correlation matrix itself so that the positional
# indices collected below always refer to the same columns (data.columns
# would drift out of step if corr() ever left a column out).
cols = data_corr.columns
# Minimum absolute correlation for a pair to be reported.
threshold = 0.5
# Each entry is [absolute_correlation, row_index, column_index].
corr_list = []
size = data_corr.shape[0]

# Walk the upper triangle (j > i) so every unordered pair is visited once.
# data_corr already holds absolute values, so a separate test for strongly
# negative correlations can never fire and is not needed; entries equal to 1
# are skipped.
for i in range(size):
    for j in range(i + 1, size):
        pair_corr = data_corr.iloc[i, j]
        if threshold <= pair_corr < 1:
            corr_list.append([pair_corr, i, j])

# Strongest correlations first (the sort is stable, so ties keep scan order).
s_corr_list = sorted(corr_list, key=lambda entry: entry[0], reverse=True)

# Report each selected pair with its absolute correlation.
for v, i, j in s_corr_list:
    print(f"{cols[i]} and {cols[j]} = {v:.2f}")

# Draw one scatter plot per selected pair, in the order of s_corr_list.
# `height` is the current name of pairplot's facet-size argument (the old
# `size` keyword is deprecated), and x_vars / y_vars are given as lists of
# column names.
for _, i, j in s_corr_list:
    sns.pairplot(data, height=6, x_vars=[cols[i]], y_vars=[cols[j]])
    plt.show()