Pythonデータ分析プロセス
13568 ワード
一.データ分析の手順:
1.データを確認して質問する
2.データ洗浄
3.コード作成、結果データ抽出、異常データの有無分析、コード修正
4.データに応じて適切なグラフを選択して表示する
5.グラフグループのディスカッションに基づいて最終的な結果を得る
二.環境と原始データの準備
Anaconda 2をインストールし、次のコマンドですべてのパッケージを最新バージョンに更新します: conda upgrade --all
first.zipファイルをダウンロードし、解凍します.
中には3つのcsvファイル(enrollments.csv、daily_engagements.csv、project_submission.csv)とIPythonのnotebookが含まれています.
cmdを起動し、解凍後のフォルダに移動して「jupyter notebook」と入力し、IPython notebookを起動します.
三.分析データ
1.csvからのデータのロード
2.データ型の修正
3.データのフォーマットを変更する問題
4.探索データ
5.問題データの特定
6.残りの問題を追跡する(データセットのテストアカウントを削除する)
7.最初の週にすでに有料で申し込んだ学生を取得する
8.学習時間が最も長い学生と時間を取得する
9.最初の週のアクセス数を特定
10.プロジェクトに合格した学生を区別する
11.2組の学生のデータを比較する
12.ヒストグラムの描画
13.グラフの改善と分析
1.データを確認して質問する
2.データ洗浄
3.コード作成、結果データ抽出、異常データの有無分析、コード修正
4.データに応じて適切なグラフを選択して表示する
5.グラフグループのディスカッションに基づいて最終的な結果を得る
二.環境と原始データの準備
Anaconda 2バージョンをインストールし、パッケージを更新して最新バージョンを更新 conda upgrade --all
first.zipファイルをダウンロードし、解凍します.
中には3枚のcsvファイルがあります.それぞれenrollments.csv、daily_engagements.csv,project_submission.csvとipythonのnotebook
cmd起動解凍後のファイル入力jupyter notebook起動ipythonノートパソコン
三.分析データ
1.csvからのデータのロード
import unicodecsv
def readcsv(filename):
    """Read the CSV file at *filename* and return its rows as a list.

    Each row is whatever unicodecsv.DictReader yields for that line
    (a dict keyed by the column headers).
    """
    # unicodecsv does its own decoding, so the file is opened in binary mode.
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows
# Load the three tables; each one becomes a list of row dicts.
# NOTE(review): the first two file names are hyphenated here - confirm they
# match the names of the extracted CSV files on disk.
daily_engagement = readcsv('daily-engagement.csv')
project_submissions = readcsv('project-submissions.csv')
enrollments = readcsv('enrollments.csv')

# Print the first row of each table as a quick sanity check.
print(daily_engagement[0])
print(project_submissions[0])
print(enrollments[0])
2.データ型の修正
from datetime import datetime as dt
def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    Returns None when the field is empty, i.e. '' or None (csv DictReader
    fills missing trailing fields with None), instead of raising.
    """
    if date is None or date == '':
        return None
    return dt.strptime(date, '%Y-%m-%d')
def parse_maybe_int(i):
    """Convert *i* to an int, or return None when the field is empty.

    Both '' and None count as empty. The emptiness test is explicit rather
    than a truthiness check so that a numeric 0 is still converted to 0.
    """
    if i is None or i == '':
        return None
    return int(i)
# Enrollments table: parse the two dates, the day count and the two
# 'True'/'False' flags in place.
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    # Any value other than the exact string 'True' becomes False.
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'

enrollments[0]

# Engagement table: parse the date; the three counters go through float()
# first and are then truncated to int; minutes stay as float.
for engagement_record in daily_engagement:
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])

daily_engagement[0]

# Submissions table: parse the creation and completion dates.
for submission in project_submissions:
    submission['creation_date'] = parse_date(submission['creation_date'])
    submission['completion_date'] = parse_date(submission['completion_date'])

project_submissions[0]
3.データのフォーマットを変更する問題
# Rename the "acct" column of daily_engagement to "account_key", the key
# name the other tables are read with.
for engagement_record in daily_engagement:
    # pop() moves the value and drops the old key in one step; the membership
    # test makes re-running this cell a no-op instead of a KeyError.
    if 'acct' in engagement_record:
        engagement_record['account_key'] = engagement_record.pop('acct')
4.探索データ
def unique_student_data(data):
    """Return the set of distinct 'account_key' values found in *data*.

    *data* is an iterable of dict-like rows, each with an 'account_key' entry.
    """
    return {row['account_key'] for row in data}
# For each table: total number of rows, then number of distinct account keys.
# The bare len(...) expressions only display a value when evaluated
# interactively (e.g. as the last expression of a notebook cell).
len(enrollments)
unique_enrolled_students = unique_student_data(enrollments)
len(unique_enrolled_students)
# Engagement table.
len(daily_engagement)
unique_daily_engagement = unique_student_data(daily_engagement)
len(unique_daily_engagement)
# Submissions table.
len(project_submissions)
unique_project_submissions = unique_student_data(project_submissions)
len(unique_project_submissions)
5.問題データの特定
# Count and print the enrollments whose account has no engagement record
# even though join_date and cancel_date differ.
num_problem_students = 0
for enrollment in enrollments:
    if enrollment['account_key'] in unique_daily_engagement:
        continue  # has engagement data, nothing surprising
    if enrollment['join_date'] == enrollment['cancel_date']:
        continue  # join and cancel dates are equal; excluded from the count
    num_problem_students += 1
    print(enrollment)
print(num_problem_students)
6.残りの問題を追跡する(データセットのテストアカウントを削除する)
# Collect the account keys of every enrollment flagged is_udacity.
udacity_test_account = set()
for enrollment in enrollments:
    if not enrollment['is_udacity']:
        continue
    udacity_test_account.add(enrollment['account_key'])

len(udacity_test_account)
def remove_udacity_account(data):
    """Return the rows of *data* whose account is not a Udacity test account.

    A row is dropped when its 'account_key' is a member of the module-level
    set udacity_test_account; row order is preserved.
    """
    return [row for row in data
            if row['account_key'] not in udacity_test_account]
# Strip the Udacity test accounts out of all three tables.
non_udacity_enrollments = remove_udacity_account(enrollments)
non_udacity_engagement = remove_udacity_account(daily_engagement)
non_udacity_submissions = remove_udacity_account(project_submissions)

# paid_students maps account_key -> latest join_date among the enrollments
# that were either never canceled or canceled after more than 7 days.
paid_students = {}
for enrollment in non_udacity_enrollments:
    # Skip enrollments canceled within the first 7 days.
    if enrollment['is_canceled'] and not enrollment['days_to_cancel'] > 7:
        continue
    account_key = enrollment['account_key']
    enrollment_date = enrollment['join_date']
    # Keep only the most recent qualifying enrollment per account.
    if account_key not in paid_students or enrollment_date > paid_students[account_key]:
        paid_students[account_key] = enrollment_date

len(paid_students)
7.最初の週にすでに有料で申し込んだ学生を取得する
def within_one_week(join_date, engagement_date):
    """Return True if *engagement_date* is within 7 days after *join_date*.

    The window is [join_date, join_date + 7 days): engagement on the join day
    counts, engagement before joining or 7 or more days later does not.
    Both arguments are datetime objects.
    """
    # Elapsed time is measured from joining to the engagement, using the
    # function's own parameters rather than any module-level variable.
    time_delta = engagement_date - join_date
    return 0 <= time_delta.days < 7
def remove_free_trial_cancels(data):
    """Return the rows of *data* that belong to paid students.

    A row is kept when its 'account_key' is a key of the module-level dict
    paid_students; row order is preserved.
    """
    return [row for row in data if row['account_key'] in paid_students]
# Restrict all three tables to the accounts present in paid_students.
paid_enrollment = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_project_missions = remove_free_trial_cancels(non_udacity_submissions)

print(len(paid_enrollment))
print(len(paid_engagement))
print(len(paid_project_missions))

# Keep only the engagement rows that within_one_week() accepts for the
# student's join date recorded in paid_students.
paid_engagement_in_first_week = []
for engagement_record in paid_engagement:
    join_date = paid_students[engagement_record['account_key']]
    engagement_record_date = engagement_record['utc_date']
    if within_one_week(join_date, engagement_record_date):
        paid_engagement_in_first_week.append(engagement_record)

len(paid_engagement_in_first_week)
from collections import defaultdict
import numpy as np
def group_data(data, key_name):
    """Group the rows of *data* by the value stored under *key_name*.

    Returns a defaultdict(list) mapping each distinct value to the list of
    rows that carry it, in their original order.
    """
    buckets = defaultdict(list)
    for row in data:
        buckets[row[key_name]].append(row)
    return buckets
def sum_grouped_items(grouped_data, field_name):
    """Total one field per group.

    *grouped_data* maps a key to a list of rows; the result is a plain dict
    mapping each key to the sum of row[field_name] over that key's rows
    (0 for an empty list).
    """
    return {
        key: sum(row[field_name] for row in rows)
        for key, rows in grouped_data.items()
    }
def describe_data(data):
    """Print the mean, standard deviation, minimum and maximum of *data*.

    *data* may be any iterable of numbers (a list, a dict view such as
    d.values(), a generator, ...). Raises ValueError from numpy when *data*
    is empty, because min/max of an empty sequence is undefined.
    """
    # Materialise once: numpy cannot reduce a dict view or generator directly.
    values = list(data)
    # Single-argument, %-formatted print calls produce the same text under
    # both the Python 2 print statement and the Python 3 print function.
    print('Mean: %s' % (np.mean(values),))
    print('Standard deviation: %s' % (np.std(values),))
    print('Minimum: %s' % (np.min(values),))
    print('Maximum: %s' % (np.max(values),))
8.学習時間が最も長い学生と時間を取得する
# Group the first-week engagement rows of paid students by account. This is
# built here because nothing earlier in the script defines
# engagement_by_account.
engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
total_minutes_by_account = sum_grouped_items(engagement_by_account, 'total_minutes_visited')

# Find the account with the largest first-week total.
student_with_max_minutes = None
max_minutes = 0
for student, total_nums in total_minutes_by_account.items():
    if total_nums > max_minutes:
        max_minutes = total_nums
        student_with_max_minutes = student
print(max_minutes)

# Print every first-week record of that student. The comparison uses
# student_with_max_minutes, not the loop variable `student`, which merely
# holds whichever account happened to be iterated last.
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] == student_with_max_minutes:
        print(engagement_record)
9.最初の週のアクセス数を特定
# Flag each paid engagement row with has_visited = 1 when at least one course
# was visited that day, else 0. The first-week list holds the same dict
# objects as paid_engagement, so its rows receive the flag as well.
for engagement_record in paid_engagement:
    engagement_record['has_visited'] = (
        1 if engagement_record['num_courses_visited'] > 0 else 0
    )

# Group the first-week rows by account here, since nothing earlier in the
# script defines engagement_by_account, then total the flags per student.
engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
days_visited_by_account = sum_grouped_items(engagement_by_account, 'has_visited')
# list(...) keeps the call valid where .values() returns a view, not a list.
describe_data(list(days_visited_by_account.values()))
10.プロジェクトに合格した学生を区別する
# Lesson keys that identify the subway project.
subway_project_lesson_keys = ['746169184', '3176718735']

# Account keys with a subway-project submission rated PASSED or DISTINCTION.
pass_subway_project = set()
for submission in paid_project_missions:
    project = submission['lesson_key']
    rating = submission['assigned_rating']
    if project in subway_project_lesson_keys and rating in ('PASSED', 'DISTINCTION'):
        pass_subway_project.add(submission['account_key'])

# Split the first-week engagement rows by whether the student passed.
passing_engagement = []
non_passing_engagement = []
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] in pass_subway_project:
        passing_engagement.append(engagement_record)
    else:
        non_passing_engagement.append(engagement_record)

print(len(passing_engagement))
print(len(non_passing_engagement))
11.2組の学生のデータを比較する
# Compare the two groups on three per-student first-week totals:
# minutes spent, lessons completed and days visited.
passing_engagement_by_account = group_data(passing_engagement, 'account_key')
non_passing_engagement_by_account = group_data(non_passing_engagement, 'account_key')

# Total minutes.
print('non-passing students')
non_passing_minute = sum_grouped_items(non_passing_engagement_by_account, 'total_minutes_visited')
describe_data(non_passing_minute.values())

print('passing students')
passing_minute = sum_grouped_items(passing_engagement_by_account, 'total_minutes_visited')
describe_data(passing_minute.values())

# Lessons completed.
print('non-passing lessons')
non_passing_lessons = sum_grouped_items(non_passing_engagement_by_account, 'lessons_completed')
describe_data(non_passing_lessons.values())

print('passing lessons')
passing_lessons = sum_grouped_items(passing_engagement_by_account, 'lessons_completed')
describe_data(passing_lessons.values())

# Days with at least one visit (sum of the has_visited flags).
print('non-passing visited')
non_passing_visited = sum_grouped_items(non_passing_engagement_by_account, 'has_visited')
describe_data(non_passing_visited.values())

print('passing visited')
passing_visited = sum_grouped_items(passing_engagement_by_account, 'has_visited')
describe_data(passing_visited.values())
12.ヒストグラムの描画
%pylab inline
import matplotlib.pyplot as plt
import numpy as np
def describe_data(data):
    """Print summary statistics of *data* and draw its histogram.

    Prints the mean, standard deviation, minimum and maximum, then calls
    plt.hist on the values. *data* may be any iterable of numbers (a list,
    a dict view such as d.values(), a generator, ...).
    """
    # Materialise once: numpy cannot reduce a dict view or generator directly.
    values = list(data)
    # Single-argument, %-formatted print calls produce the same text under
    # both the Python 2 print statement and the Python 3 print function.
    print('Mean: %s' % (np.mean(values),))
    print('Standard deviation: %s' % (np.std(values),))
    print('Minimum: %s' % (np.min(values),))
    print('Maximum: %s' % (np.max(values),))
    plt.hist(values)
# Summarise and plot total first-week minutes for each group.
# NOTE(review): describe_data draws with plt.hist; if both calls run in the
# same cell the two histograms may land on the same axes - confirm intended.
describe_data(passing_minute.values())
describe_data(non_passing_minute.values())
13.グラフの改善と分析
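## Import seaborn so that the plots pick up its styling.
## Extra arguments such as bins can be passed to hist() to adjust the histogram.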
%pylab inline
import seaborn as sns
sns.set(color_codes=True)

# Histogram of per-student total first-week minutes, non-passing group.
# list(...) keeps the call valid where .values() returns a view, not a list.
plt.hist(list(non_passing_minute.values()), bins=8)
plt.xlabel('Total minutes in the first week')
plt.title('Distribution of classroom visits in the first week ' +
          'for students who do not pass the subway project')

# Start a new figure so the second histogram is not drawn over the first.
plt.figure()
# Same plot for the passing group; the title now names the right group.
plt.hist(list(passing_minute.values()), bins=8)
plt.xlabel('Total minutes in the first week')
plt.title('Distribution of classroom visits in the first week ' +
          'for students who pass the subway project')