Pythonデータ分析プロセス

13568 ワード

一.データ分析の手順:
1.データを確認して質問する
2.データクレンジング(データのクリーニング)
3.コード作成、結果データ抽出、異常データの有無分析、コード修正
4.データに応じて適切なグラフを選択して表示する
5.グラフをもとにグループ間の違いを比較・検討し、最終的な結論を導く
 
二.環境と原始データの準備
Anaconda 2(Python 2系)をインストールし、次のコマンドですべてのパッケージを最新バージョンに更新します:  conda upgrade --all
first.zipファイルをダウンロードし、解凍します.
中には3つのcsvファイル(enrollments.csv、daily-engagement.csv、project-submissions.csv)とIPython Notebookのファイルが入っています.
cmd(コマンドプロンプト)を起動し、解凍後のフォルダに移動して jupyter notebook と入力し、IPython Notebookを起動します.
 
三.分析データ
1.csvからのデータのロード
import unicodecsv


def readcsv(filename):
    """Load a CSV file and return its rows as a list of dicts.

    The file is opened in binary mode and handed to
    ``unicodecsv.DictReader``; every row the reader yields is collected
    into a list before the file is closed.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows

 
# Load the three tables from their CSV files, then show the first row of each
# one to get a feel for the available columns.
daily_engagement = readcsv('daily-engagement.csv')
project_submissions = readcsv('project-submissions.csv')
enrollments = readcsv('enrollments.csv')

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(daily_engagement[0])
print(project_submissions[0])
print(enrollments[0])

 2.データ型の修正
from datetime import datetime as dt

def parse_date(date):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime`` object.

    Returns ``None`` when *date* is the empty string.
    """
    return dt.strptime(date, '%Y-%m-%d') if date != '' else None

    
def parse_maybe_int(i):
    """Convert a numeric string to ``int``; the empty string becomes ``None``."""
    if i != '':
        return int(i)
    return None

 
# Convert the enrollments columns from strings to proper Python types:
# the two dates become datetime (or None), days_to_cancel becomes int (or
# None), and the two flag columns become booleans.
for enrollment in enrollments:
    for date_field in ('cancel_date', 'join_date'):
        enrollment[date_field] = parse_date(enrollment[date_field])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    for flag_field in ('is_canceled', 'is_udacity'):
        # Only the exact text 'True' is treated as true.
        enrollment[flag_field] = enrollment[flag_field] == 'True'

enrollments[0]

# Convert the daily engagement columns from strings to proper Python types.
for engagement_record in daily_engagement:
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    # The counters go through float() first and are then truncated to int
    # (presumably because the CSV writes them like "1.0" -- confirm).
    for count_field in ('num_courses_visited', 'lessons_completed', 'projects_completed'):
        engagement_record[count_field] = int(float(engagement_record[count_field]))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])

daily_engagement[0]

# Convert the two date columns of the submissions table to datetime (or None).
for submission in project_submissions:
    for date_field in ('creation_date', 'completion_date'):
        submission[date_field] = parse_date(submission[date_field])

project_submissions[0]

3.データのフォーマットに関する問題を修正する
# Rename the "acct" column of daily_engagement to "account_key", the key name
# the other two tables are accessed with.
for engagement_record in daily_engagement:
    # Guard on the old key so that re-running this step (e.g. re-executing
    # the notebook cell) is a no-op instead of raising KeyError.
    if 'acct' in engagement_record:
        engagement_record['account_key'] = engagement_record.pop('acct')

 4.データの探索
def unique_student_data(data):
    """Return the set of distinct ``account_key`` values found in *data*.

    *data* is an iterable of dict-like records that each have an
    ``account_key`` entry.
    """
    return {record['account_key'] for record in data}
# For each table, compare the total number of rows with the number of
# distinct account keys.
# NOTE(review): in a notebook only the last bare expression of a cell is
# displayed, so these len(...) lines presumably lived in separate cells.

# Enrollments table.
len(enrollments)
unique_enrolled_students = unique_student_data(enrollments)
len(unique_enrolled_students)

# Daily engagement table.
len(daily_engagement)
unique_daily_engagement = unique_student_data(daily_engagement)
len(unique_daily_engagement)

# Project submissions table.
len(project_submissions)
unique_project_submissions = unique_student_data(project_submissions)
len(unique_project_submissions)

 5.問題データの特定
# Find "problem" enrollments: the account never appears in the engagement
# table even though join_date differs from cancel_date.
num_problem_students = 0
for enrollment in enrollments:
    if (enrollment['account_key'] not in unique_daily_engagement
            and enrollment['join_date'] != enrollment['cancel_date']):
        num_problem_students += 1
        # Single-argument print(...) works on both Python 2 and Python 3.
        # The running count is printed after every offending record.
        print(enrollment)
        print(num_problem_students)

 6.残りの問題を追跡する(データセットのテストアカウントを削除する)
# Collect the account keys of every enrollment flagged is_udacity into a set.
udacity_test_account = {
    enrollment['account_key']
    for enrollment in enrollments
    if enrollment['is_udacity']
}
len(udacity_test_account)


def remove_udacity_account(data):
    """Return the records of *data* that do not belong to a Udacity test account.

    A record is dropped when its ``account_key`` is contained in the
    module-level ``udacity_test_account`` set; the remaining records keep
    their original order.
    """
    return [
        record for record in data
        if record['account_key'] not in udacity_test_account
    ]

# Strip the Udacity test accounts out of all three tables.
(non_udacity_enrollments,
 non_udacity_engagement,
 non_udacity_submissions) = map(
    remove_udacity_account,
    (enrollments, daily_engagement, project_submissions)
)
# Build paid_students: account_key -> most recent join_date, keeping only
# enrollments that are either not canceled or whose days_to_cancel is
# greater than 7.
paid_students = {}
for enrollment in non_udacity_enrollments:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        # The same account can appear in several enrollments; keep the
        # latest join date.
        if (account_key not in paid_students
                or enrollment_date > paid_students[account_key]):
            paid_students[account_key] = enrollment_date
len(paid_students)

 7.最初の週にすでに有料で申し込んだ学生を取得する
def within_one_week(join_date, engagement_date):
    """Return True if *engagement_date* is within 7 days after *join_date*.

    Both arguments are ``datetime`` objects. The result is True when the
    engagement happened on the join date or on one of the six following
    days, and False when it happened before joining or a week or more later.
    """
    # Measure from the join date to the engagement date; a negative delta
    # means the engagement predates this enrollment.
    time_delta = engagement_date - join_date
    return 0 <= time_delta.days < 7
def remove_free_trial_cancels(data):
    """Return the records of *data* whose account is a paid student.

    A record is kept when its ``account_key`` is a key of the module-level
    ``paid_students`` dict; the order of the kept records is unchanged.
    """
    return [
        data_point for data_point in data
        if data_point['account_key'] in paid_students
    ]


# Restrict all three tables to paid students and report the row counts.
paid_enrollment = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_project_missions = remove_free_trial_cancels(non_udacity_submissions)

# Single-argument print(...) works on both Python 2 and Python 3.
print(len(paid_enrollment))
print(len(paid_engagement))
print(len(paid_project_missions))
# Keep only the engagement records of paid students that fall within one
# week (7 days) of the student's join date.
paid_engagement_in_first_week = []
for engagement_record in paid_engagement:
    join_date = paid_students[engagement_record['account_key']]
    engagement_record_date = engagement_record['utc_date']
    if within_one_week(join_date, engagement_record_date):
        paid_engagement_in_first_week.append(engagement_record)
len(paid_engagement_in_first_week)
from collections import defaultdict
import numpy as np
def group_data(data, key_name):
    """Group the records of *data* by the value stored under *key_name*.

    Returns a ``defaultdict(list)`` that maps each distinct value of
    ``record[key_name]`` to the list of records carrying that value, in
    their original order.
    """
    buckets = defaultdict(list)
    for record in data:
        buckets[record[key_name]].append(record)
    return buckets

def sum_grouped_items(grouped_data, field_name):
    """Sum one field over every group of records.

    *grouped_data* maps a group key to a list of records. The result is a
    plain dict that maps each group key to the sum of
    ``record[field_name]`` over that group's records (0 for an empty group).
    """
    return {
        group_key: sum(record[field_name] for record in records)
        for group_key, records in grouped_data.items()
    }

def describe_data(data):
    """Print the mean, standard deviation, minimum and maximum of *data*.

    *data* may be any iterable of numbers, including a ``dict.values()``
    view. Nothing is returned; the four statistics are written to stdout,
    one per line.
    """
    # Materialise first: on Python 3 dict.values() is a view object, which
    # NumPy would treat as one opaque object instead of a sequence.
    values = list(data)
    # '%s' formatting plus single-argument print(...) produces the same
    # text on Python 2 and Python 3.
    print('Mean: %s' % (np.mean(values),))
    print('Standard deviation: %s' % (np.std(values),))
    print('Minimum: %s' % (np.min(values),))
    print('Maximum: %s' % (np.max(values),))

 8.学習時間が最も長い学生と時間を取得する
# Group the first-week engagement records by student so that per-student
# totals can be computed.
# NOTE(review): engagement_by_account was used here without being defined
# anywhere in the visible file; grouping paid_engagement_in_first_week by
# account_key matches the section headings -- confirm.
engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
total_minutes_by_account = sum_grouped_items(engagement_by_account, 'total_minutes_visited')

# Find the student with the largest total; stays None if no total exceeds 0.
student_with_max_minutes = None
max_minutes = 0
for student, total_nums in total_minutes_by_account.items():
    if total_nums > max_minutes:
        max_minutes = total_nums
        student_with_max_minutes = student
print(max_minutes)

# Show that student's individual first-week records. Compare against
# student_with_max_minutes, not the leftover loop variable `student`, which
# merely holds whichever account was iterated last.
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] == student_with_max_minutes:
        print(engagement_record)

 9.最初の週のアクセス数を特定
# Flag every paid engagement record with has_visited = 1 when at least one
# course was visited that day (0 otherwise), then sum the flags per student
# and summarise the resulting per-student totals.
for engagement_record in paid_engagement:
    engagement_record['has_visited'] = (
        1 if engagement_record['num_courses_visited'] > 0 else 0
    )

days_visited_by_account = sum_grouped_items(engagement_by_account, 'has_visited')
describe_data(days_visited_by_account.values())

 10.プロジェクトに合格した学生を区別する
# Split the first-week engagement records into two lists: records of
# students who passed the subway project and records of everyone else.

subway_project_lesson_keys = ['746169184', '3176718735']

# Account keys of students with a subway-project submission rated PASSED or
# DISTINCTION.
pass_subway_project = set()
for submission in paid_project_missions:
    project = submission['lesson_key']
    rating = submission['assigned_rating']
    if project in subway_project_lesson_keys and rating in ('PASSED', 'DISTINCTION'):
        pass_subway_project.add(submission['account_key'])

passing_engagement = []
non_passing_engagement = []
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] in pass_subway_project:
        passing_engagement.append(engagement_record)
    else:
        non_passing_engagement.append(engagement_record)

# Single-argument print(...) works on both Python 2 and Python 3.
print(len(passing_engagement))
print(len(non_passing_engagement))

 11.2組の学生のデータを比較する
# Compare the two groups of students (passed vs. did not pass the subway
# project) on three per-student first-week totals: minutes spent, lessons
# completed and days visited.
passing_engagement_by_account = group_data(passing_engagement, 'account_key')
non_passing_engagement_by_account = group_data(non_passing_engagement, 'account_key')

# print(...) with one argument works on both Python 2 and Python 3, and
# list(...) turns the Python 3 dict view into a sequence NumPy can reduce.
print('non-passing students')
non_passing_minute = sum_grouped_items(non_passing_engagement_by_account, 'total_minutes_visited')
describe_data(list(non_passing_minute.values()))
print('passing students')
passing_minute = sum_grouped_items(passing_engagement_by_account, 'total_minutes_visited')
describe_data(list(passing_minute.values()))

print('non-passing lessons')
non_passing_lessons = sum_grouped_items(non_passing_engagement_by_account, 'lessons_completed')
describe_data(list(non_passing_lessons.values()))
print('passing lessons')
passing_lessons = sum_grouped_items(passing_engagement_by_account, 'lessons_completed')
describe_data(list(passing_lessons.values()))

print('non-passing visited')
non_passing_visited = sum_grouped_items(non_passing_engagement_by_account, 'has_visited')
describe_data(list(non_passing_visited.values()))
print('passing visited')
passing_visited = sum_grouped_items(passing_engagement_by_account, 'has_visited')
describe_data(list(passing_visited.values()))

 12.ヒストグラムの描画
%pylab inline
import matplotlib.pyplot as plt
import numpy as np

def describe_data(data):
    """Print summary statistics for *data* and draw its histogram.

    *data* may be any iterable of numbers, including a ``dict.values()``
    view. The mean, standard deviation, minimum and maximum are written to
    stdout, then a histogram of the values is drawn on a new figure.
    """
    # Materialise first: on Python 3 dict.values() is a view object, which
    # NumPy/matplotlib would treat as one opaque object, not a sequence.
    values = list(data)
    # '%s' formatting plus single-argument print(...) produces the same
    # text on Python 2 and Python 3.
    print('Mean: %s' % (np.mean(values),))
    print('Standard deviation: %s' % (np.std(values),))
    print('Minimum: %s' % (np.min(values),))
    print('Maximum: %s' % (np.max(values),))
    # Start a fresh figure so that consecutive calls do not pile their
    # histograms onto the same axes.
    plt.figure()
    plt.hist(values)


describe_data(passing_minute.values())
describe_data(non_passing_minute.values())

 13.グラフの改善と分析
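## Make a more polished version of one of the earlier histograms: import
## seaborn for nicer default styling, add an axis label and a title, and
## try changing one or more arguments of hist().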
%pylab inline
import seaborn as sns
sns.set(color_codes=True)
# Both histograms plot the per-student total of 'total_minutes_visited', so
# the axis label and titles name that quantity. Each plot gets its own
# figure; otherwise the second hist() and title() would land on the first
# plot's axes and overwrite its title.

# Students who did NOT pass the subway project.
plt.figure()
plt.hist(list(non_passing_minute.values()), bins=8)
plt.xlabel('Total minutes visited')
plt.title('Distribution of total minutes visited in the first week ' +
          'for students who do not pass the subway project')

# Students who passed the subway project.
plt.figure()
plt.hist(list(passing_minute.values()), bins=8)
plt.xlabel('Total minutes visited')
plt.title('Distribution of total minutes visited in the first week ' +
          'for students who pass the subway project')