クレジットカードユーザ遅延予測EDA Part 3


7.変数値の変更

  • 性別変数では、信用格付け2を除いて、女性顧客の信用格付けは男性顧客より高い。
    女性を1、それ以外(男性)を0に指定します。
  • 自動車や不動産については、所有する場合は1に指定します.
  • 子供の数については、4人以上の場合は4に指定することが望ましい。
  • 年齢層については、30歳未満から60歳以上まで、年齢の低い順に0〜4の数値を割り当てます。
  • 7.1女性値の指定

    # Encode gender as a 0/1 indicator: 1.0 where gender == 'F', 0.0 otherwise.
    # Built in one vectorized assignment instead of the chained form
    # `train['women'][mask] = 1`, which raises SettingWithCopyWarning and does
    # not write back to `train` when pandas Copy-on-Write is enabled.
    # astype(float) keeps the float dtype that np.zeros() produced before.
    train['women'] = (train['gender'] == 'F').astype(float)

    7.2 不動産と自動車の値の指定

    # Ownership indicators, written as single vectorized assignments rather
    # than chained `train[col][mask] = 1`, which is unreliable (and a no-op
    # under pandas Copy-on-Write). Values stay float, as with np.zeros().
    # Car: 1.0 where car == 'Y', 0.0 otherwise.
    train['yesCar'] = (train['car'] == 'Y').astype(float)
    # Real estate: 1.0 where reality == 'Y', 0.0 otherwise.
    train['yesReality'] = (train['reality'] == 'Y').astype(float)

    7.2.1変数の削除

    # Remove the raw gender / car / reality columns from train in place.
    train.drop(['gender', 'car', 'reality'], axis='columns', inplace=True)

    7.3 4人または4人以上の子供に対する処理

    # Cap the number of children: every value above 3 becomes 4.
    # .loc does the selection and the write in one indexing operation on
    # `train`; the chained `train['child_num'][mask] = 4` may write to a
    # temporary copy and is a no-op under pandas Copy-on-Write.
    train.loc[train['child_num'] > 3, 'child_num'] = 4

    7.4年齢グループの数値化

    # Convert age into an ordinal age group:
    #   0 = under 30, 1 = 30-39, 2 = 40-49, 3 = 50-59, 4 = 60 and over.
    # Rows whose age matches none of the masks (e.g. missing age) keep their
    # current age_group value.
    # NOTE(review): assumes train['age_group'] already exists - it is not
    # created in this section; confirm against the earlier part of the notebook.
    # .loc replaces the chained `train['age_group'][mask] = k`, which may write
    # to a temporary copy and is a no-op under pandas Copy-on-Write.
    train_age = train['age']
    train.loc[train_age < 30, 'age_group'] = 0
    train.loc[(train_age >= 30) & (train_age < 40), 'age_group'] = 1
    train.loc[(train_age >= 40) & (train_age < 50), 'age_group'] = 2
    train.loc[(train_age >= 50) & (train_age < 60), 'age_group'] = 3
    train.loc[train_age >= 60, 'age_group'] = 4
    # Final expression of the cell: summary statistics for every column.
    train.describe(include='all')

    8.テストセットデータの変換


    8.1テストセットの読み込み

    # Upload the file through the notebook's `files` helper, then read
    # test.csv from the working directory.
    # NOTE(review): `files` is presumably google.colab.files - confirm.
    myfile2 = files.upload()
    test = pd.read_csv('test.csv')
    # Drop the columns that are not needed.
    test.drop(
        ['index', 'FLAG_MOBIL', 'phone', 'email', 'work_phone', 'edu_type'],
        axis='columns',
        inplace=True,
    )

    8.2 所得による変数の指定

    # Assign each test row to an income quartile / quintile / decile.
    # The bin edges come from the TRAINING income distribution, not from test,
    # so both data sets share the same boundaries.
    #
    # Changes from the previous version:
    #   * no hard-coded np.zeros(10000): the columns follow the actual length
    #     of `test`;
    #   * no chained `test[col][mask] = k` assignments, which may write to a
    #     temporary copy and are a no-op under pandas Copy-on-Write;
    #   * each set of train quantiles is computed once.
    income = test['income_total']
    for column, n_bins in (('income_quartile', 4),
                           ('income_quintile', 5),
                           ('income_decile', 10)):
        # Interior cut points, e.g. 0.25 / 0.5 / 0.75 for quartiles.
        probs = [k / n_bins for k in range(1, n_bins)]
        edges = train['income_total'].quantile(probs).to_numpy()
        # side='right' counts the edges that are <= income, so a value equal
        # to an edge lands in the upper bin (same as the former
        # `>= lower & < upper` masks); +1 makes the labels run 1..n_bins.
        labels = np.searchsorted(edges, income.to_numpy(), side='right') + 1
        # Rows with a missing income keep 0, as before; dtype stays float.
        test[column] = np.where(income.isna(), 0, labels).astype(float)

    8.3年齢層の設定

    # Create age: negate DAYS_BIRTH, floor-divide by 365, then add 1.
    test['age'] = (test['DAYS_BIRTH'] * (-1)) // 365 + 1
    # Create age_group, sized from the frame itself rather than a hard-coded
    # 10000 so it works for any number of test rows.
    test['age_group'] = np.zeros(len(test))
    # Bins: 0 = under 30, 1 = 30-39, 2 = 40-49, 3 = 50-59, 4 = 60 and over.
    # (Under-30 rows simply keep the initial 0.)
    # .loc replaces the chained `test['age_group'][mask] = k`, which may write
    # to a temporary copy and is a no-op under pandas Copy-on-Write.
    test_age = test['age']
    test.loc[(test_age >= 30) & (test_age < 40), 'age_group'] = 1
    test.loc[(test_age >= 40) & (test_age < 50), 'age_group'] = 2
    test.loc[(test_age >= 50) & (test_age < 60), 'age_group'] = 3
    test.loc[test_age >= 60, 'age_group'] = 4

    8.4クレジットカード使用年数

    # Years of credit-card use: negate begin_month, then floor-divide by 12.
    test['used_years'] = (-test['begin_month']) // 12

    8.5 勤務年数の指定

    # worked_year starts as the negated DAYS_EMPLOYED.
    test['worked_year'] = test['DAYS_EMPLOYED'] * (-1)

    # Rows that are negative after negation are treated as not employed and
    # set to -365, so the floor division below maps them to -1.
    # .loc replaces the chained `test['worked_year'][mask] = -365`, which may
    # write to a temporary copy and is a no-op under pandas Copy-on-Write.
    test.loc[test['worked_year'] < 0, 'worked_year'] = -365

    # Whole years worked; rows without employment end up at -1.
    test['worked_year'] = test['worked_year'] // 365

    # Label every row without employment as 'Unempolyed'. The earlier separate
    # assignment for (worked_year == -1) & (income_type == 'Pensioner') is
    # folded into this one: its mask is a subset of this mask and it wrote the
    # same value.
    # NOTE(review): the 'Unempolyed' spelling is kept byte-for-byte; it
    # presumably has to match the label used for the training set - confirm
    # before correcting it.
    test.loc[test['worked_year'] == -1, 'occyp_type'] = 'Unempolyed'

    # Remove rows that still contain missing values.
    # NOTE(review): this drops rows from the test set; confirm that a shorter
    # test set is acceptable downstream.
    test.dropna(axis=0, inplace=True)

    8.6 性別・自動車・不動産の値の変更と元の列の削除

    # 0/1 indicators written as single vectorized assignments rather than
    # chained `test[col][mask] = 1`, which may write to a temporary copy and is
    # a no-op under pandas Copy-on-Write. Values stay float, as with np.zeros().
    # Gender: 1.0 where gender == 'F', 0.0 otherwise.
    test['women'] = (test['gender'] == 'F').astype(float)
    # Car: 1.0 where car == 'Y', 0.0 otherwise.
    test['yesCar'] = (test['car'] == 'Y').astype(float)
    # Real estate: 1.0 where reality == 'Y', 0.0 otherwise.
    test['yesReality'] = (test['reality'] == 'Y').astype(float)
    # Drop the raw columns now that the indicators exist.
    test.drop(columns=['gender', 'car', 'reality'], inplace=True)

    8.7 4人または4人以上の子供を処理する

    # Cap the number of children: every value above 3 becomes 4.
    # .loc does the selection and the write in one indexing operation on
    # `test`; the chained `test['child_num'][mask] = 4` may write to a
    # temporary copy and is a no-op under pandas Copy-on-Write.
    test.loc[test['child_num'] > 3, 'child_num'] = 4

    9.トレーニングおよびテストセットの保存

    # Write both processed frames to comma-separated files, with missing
    # values written as the text 'NaN'.
    for frame, csv_path in ((train, "newTrain.csv"), (test, "newTest.csv")):
        frame.to_csv(csv_path, sep=',', na_rep='NaN')