Pythonで変数のIV値を計算する

4183 ワード

機械学習の二値分類問題では、IV値(Information Value)は主に入力変数のエンコードと予測能力の評価に用いられます。特徴変数のIV値の大きさは、その変数の予測能力の強さを表します。大量の変数を扱う場合は、各変数のIV値を計算し、IV値が一定の閾値より大きい変数だけをモデルに投入することで、特徴が持つ情報量を保ちつつモデルの効率を高めることができ、顧客への説明や報告もしやすくなります。

2. IV値の計算(Pythonコードは以下の通り)

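    ######### Compute the IV of a single variable
    ## Xvar: feature (independent variable) values
    ## Yvar: binary target values (0 / 1)
    ## NOTE: the code below uses `np` and `pd`; it needs `import numpy as np` and `import pandas as pd`.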
    def CalcIV_Single(Xvar, Yvar):
        """Compute the Information Value (IV) of one discrete variable.

        For every distinct level of ``Xvar`` the share of ``Yvar == 0`` rows
        and the share of ``Yvar == 1`` rows falling into that level are
        compared; the level contributes
        ``(p0 - p1) * log(p0 / p1)`` to the total.

        Parameters
        ----------
        Xvar : array-like
            Values of the (already discretised) feature. Accepts a pandas
            Series, a NumPy array or a plain sequence.
        Yvar : array-like
            Binary target of the same length, coded as 0 / 1. Rows are
            matched to ``Xvar`` by position.

        Returns
        -------
        float or int
            The summed IV. Levels that contain no 0-rows or no 1-rows are
            skipped (they contribute nothing), so the result is ``0`` when
            no level contains both classes.
        """
        x = np.asarray(Xvar)
        y = np.asarray(Yvar)

        # Class masks and class totals are loop-invariant: compute them once.
        is_0 = y == 0
        is_1 = y == 1
        N_0 = np.sum(is_0)
        N_1 = np.sum(is_1)

        iv = 0
        # np.unique is evaluated a single time instead of on every iteration.
        for level in np.unique(x):
            in_level = x == level
            n_0 = np.sum(in_level & is_0)
            n_1 = np.sum(in_level & is_1)
            # A level missing one of the classes would give log(0) or a
            # division by zero; such levels are skipped.
            if n_0 == 0 or n_1 == 0:
                continue
            p_0 = n_0 / N_0
            p_1 = n_1 / N_1
            iv = iv + (p_0 - p_1) * np.log(p_0 / p_1)
        return iv
     ####### Locate the interval a value falls into
     # Return the index of the interval in vec that contains num
    def rangeMark(num, vec):
        """Return the index of the first interval of ``vec`` that holds ``num``.

        ``vec`` is read as a list of consecutive edges; interval ``k`` is the
        closed range ``[vec[k], vec[k + 1]]``. The lowest matching ``k`` is
        returned, so a value sitting exactly on an inner edge is assigned to
        the interval on its left.

        Returns ``0`` when no interval contains ``num`` (including when
        ``vec`` has fewer than two edges).
        """
        matches = (
            k
            for k in range(len(vec) - 1)
            if num >= vec[k] and num <= vec[k + 1]
        )
        # Fall back to 0 when the scan finds nothing.
        return next(matches, 0)
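    ######## Bin a continuous variable
    
        ## Xvar: values of the continuous variable, n: number of groups
        ## Split Xvar into n percentile-based groups and return each value's group index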
    
    def cut_group(Xvar, n):
        """Assign every value of ``Xvar`` to one of ``n`` percentile bins.

        The ``n + 1`` bin edges are the 0th, (100/n)th, ..., 100th
        percentiles of ``Xvar``. Each value is then looked up with
        ``rangeMark`` to get the index of the bin it falls into.

        Parameters
        ----------
        Xvar : iterable of numbers
            Values to discretise.
        n : int
            Number of bins.

        Returns
        -------
        list
            One bin index per element of ``Xvar``, in the original order.
        """
        edges = [np.percentile(Xvar, (1 / n) * k * 100) for k in range(n + 1)]
        return [rangeMark(value, edges) for value in Xvar]
    
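        ## Compute the IV of categorical variables
        ##  df_data: DataFrame holding the data
        ##  var_lst: list of variable (column) names
        ##  Yvar: target values
        ##  Returns a DataFrame of each variable's IV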
    def CalcIV_DataFrame_char(df_data, var_lst, Yvar):
        """Compute the IV of several discrete columns and rank them.

        Parameters
        ----------
        df_data : pandas.DataFrame
            Table containing the columns named in ``var_lst``.
        var_lst : iterable of str
            Names of the columns to evaluate.
        Yvar : array-like
            Target values passed unchanged to ``CalcIV_Single`` for every
            column.

        Returns
        -------
        pandas.DataFrame
            Two columns, ``varName`` then ``iv``, one row per variable,
            sorted by ``iv`` in descending order.
        """
        var_names = list(var_lst)
        iv_res = [CalcIV_Single(df_data[varName], Yvar) for varName in var_names]
        # Fix the column order explicitly instead of reshuffling afterwards.
        df_iv = pd.DataFrame(
            {"varName": var_names, "iv": iv_res},
            columns=["varName", "iv"],
        )
        return df_iv.sort_values(by="iv", ascending=False)
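        ## df_data: DataFrame holding the data;
        ## var_lst_num : list of continuous variable (column) names
        ## Yvar : target values
        ## n: number of groups for binning each continuous variable
        ## Returns a DataFrame of each continuous variable's IV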
     def CalcIV_DataFrame_num(df_data, var_lst_num, Yvar, n):
         """Compute the IV of several continuous columns after binning them.

         Each listed column is discretised with ``cut_group`` and the binned
         columns are then scored with ``CalcIV_DataFrame_char``.

         Parameters
         ----------
         df_data : pandas.DataFrame
             Table containing the columns named in ``var_lst_num``. It is
             left unmodified.
         var_lst_num : iterable of str
             Names of the continuous columns to evaluate.
         Yvar : array-like
             Target values, forwarded to ``CalcIV_DataFrame_char``.
         n : int
             Number of bins passed to ``cut_group`` for every column.

         Returns
         -------
         The result of ``CalcIV_DataFrame_char`` on the binned columns.
         """
         var_names = list(var_lst_num)
         # Bin a copy so the caller's raw values are not overwritten with
         # bin labels.
         binned = df_data[var_names].copy()
         for var_name in var_names:
             binned[var_name] = cut_group(binned[var_name], n)
         return CalcIV_DataFrame_char(binned, var_names, Yvar)
    
    ############ Example usage ################
    
        if __name__ == '__main__':
            # Example run: load the CSV and compute IV values against the
            # `left` column.
            inputfile = 'D:/PycharmProjects/lessonOnLine/data/HR2.csv'
            data = pd.read_csv(inputfile)

            # IV of the single column `salary`.
            res = CalcIV_Single(data['salary'], data['left'])

            # Bin one continuous column into n percentile groups.
            Xvar = data['average_monthly_hours']
            n = 10
            res_cut = cut_group(Xvar, n)

            # Columns treated as continuous (binned before scoring).
            var_lst_num = [
                'satisfaction_level',
                'last_evaluation',
                'average_monthly_hours',
            ]
            # Columns treated as discrete (scored as-is).
            var_lst_char = [
                'department',
                'salary',
                'number_project',
                'time_spend_company',
                'Work_accident',
                'promotion_last_5years',
            ]

            df_data = data
            Yvar = data['left']
            n = 10  # number of bins for each continuous column

            # IV of the continuous columns.
            df_iv_num = CalcIV_DataFrame_num(df_data, var_lst_num, Yvar, n)
            # IV of the discrete columns.
            df_iv_char = CalcIV_DataFrame_char(df_data, var_lst_char, Yvar)
       ```
    参考:https://www.cnblogs.com/bigdatafengkong/p/9079934.html
               https://www.jianshu.com/p/cc4724a373f8