Pythonで変数のIV値を計算する
4183 ワード
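機械学習の二値分類問題において、IV値(Information Value)は主に入力変数の符号化と予測能力の評価に用いられます。特徴変数のIV値の大きさは、その変数の予測能力の強さを表します。変数が大量にある場合は、各変数のIV値を計算し、IV値が一定の閾値より大きい変数だけをモデルに投入することで、特徴が持つ情報量を保ちつつモデルの効率を高めることができ、顧客への説明や報告にも役立ちます。

変数の各グループ $i$ について、$p_{0,i}$ を「Y=0 のうちグループ $i$ に属する割合」、$p_{1,i}$ を「Y=1 のうちグループ $i$ に属する割合」とすると、IV値は $\mathrm{IV} = \sum_i (p_{0,i} - p_{1,i}) \ln\dfrac{p_{0,i}}{p_{1,i}}$ で求められます。

2. IV値の計算(Pythonコードは以下の通り)

```python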
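######### IV of a single variable
## NOTE: the code below uses `np` and `pd`, but this listing shows no imports;
## `import numpy as np` and `import pandas as pd` are needed at the top of the file.
## Xvar: values of the variable (each distinct value is treated as one group)
## Yvar: binary target, coded 0 / 1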
def CalcIV_Single(Xvar, Yvar):
    """Return the Information Value (IV) of one discrete variable.

    Every distinct value of ``Xvar`` is treated as a group.  For each group
    the share of all ``Yvar == 0`` rows and the share of all ``Yvar == 1``
    rows that fall into it are compared, and
    ``(p0 - p1) * log(p0 / p1)`` is summed over the groups.

    Args:
        Xvar: Array-like of group values (list, NumPy array or pandas
            Series).
        Yvar: Array-like of the same length holding the binary target,
            coded 0 / 1.  Values are paired with ``Xvar`` by position.

    Returns:
        The IV as a float.  Groups that contain no 0-rows or no 1-rows
        contribute nothing, so the result is 0 when either class is absent
        from ``Yvar`` or when the input is empty.
    """
    x = np.asarray(Xvar)
    y = np.asarray(Yvar)

    # Class masks and totals are loop-invariant: compute them once.
    is_0 = y == 0
    is_1 = y == 1
    N_0 = np.sum(is_0)
    N_1 = np.sum(is_1)

    iv = 0.0
    for level in np.unique(x):
        in_group = x == level
        n_0 = np.sum(in_group & is_0)
        n_1 = np.sum(in_group & is_1)
        # An empty cell would give log(0) or a division by zero; such groups
        # are skipped.  This also guarantees N_0 and N_1 are non-zero below.
        if n_0 == 0 or n_1 == 0:
            continue
        p_0 = n_0 / N_0
        p_1 = n_1 / N_1
        iv += (p_0 - p_1) * np.log(p_0 / p_1)
    return iv
####### Locate the interval that contains a number
# num: the value to place; vec: the sequence of interval edges
def rangeMark(num, vec):
    """Return the index of the first interval of ``vec`` that contains ``num``.

    Consecutive entries of ``vec`` are read as closed intervals
    ``[vec[i], vec[i + 1]]``.  The index ``i`` of the first interval with
    ``vec[i] <= num <= vec[i + 1]`` is returned; when no interval matches
    (or ``vec`` has fewer than two entries) the result is 0.
    """
    hits = (
        i
        for i in range(len(vec) - 1)
        if num >= vec[i] and num <= vec[i + 1]
    )
    # Only the first match matters; 0 is the fall-back when nothing matches.
    return next(hits, 0)
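######## Bin a numeric variable
## Xvar: numeric values, n: number of bins
## Splits Xvar into n percentile-based bins and returns the bin label of each value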
def cut_group(Xvar, n):
    """Assign every value of ``Xvar`` to one of ``n`` percentile-based bins.

    The bin edges are the 0th, (100/n)th, ..., 100th percentiles of
    ``Xvar``.  A value gets the index of the first bin whose closed interval
    ``[edge[i], edge[i + 1]]`` contains it, which is the same rule
    ``rangeMark`` applies; a value that lies in no bin (for example NaN)
    gets label 0.

    Args:
        Xvar: Array-like of numeric values.
        n: Number of bins.

    Returns:
        A list of int bin labels, one per element of ``Xvar``.
    """
    values = np.asarray(Xvar)

    # 100.0 * i / n reaches exactly 100 for i == n.  The earlier form
    # (1 / n) * i * 100 can round to 99.99999999999999 (e.g. n == 49), which
    # puts the last edge just below the maximum and sends the maximum to bin 0.
    quantiles = [100.0 * i / n for i in range(n + 1)]
    edges = np.percentile(values, quantiles)

    # Index of the first edge that is >= the value; the containing bin is the
    # one just before it (clamped to 0 for values equal to the first edge).
    first_ge = np.searchsorted(edges, values, side="left")
    labels = np.maximum(first_ge - 1, 0)
    # Values past the last edge (and NaN, which sorts last) match no bin.
    labels[first_ge >= len(edges)] = 0
    return labels.tolist()
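## IV of several discrete variables
## df_data: DataFrame holding the variables
## var_lst: names of the columns to score
## Yvar: binary target values (0 / 1)
## Returns a DataFrame of variable names and IV values, sorted by IV in descending order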
def CalcIV_DataFrame_char(df_data, var_lst, Yvar):
    """Compute the IV of several discrete columns and rank them.

    Args:
        df_data: DataFrame that holds the columns named in ``var_lst``.
        var_lst: Names of the columns to score.
        Yvar: Binary target values, passed unchanged to ``CalcIV_Single``.

    Returns:
        A DataFrame with the columns ``varName`` and ``iv``, sorted by
        ``iv`` in descending order (the original row index is kept).
    """
    scores = [CalcIV_Single(df_data[name], Yvar) for name in var_lst]
    table = pd.DataFrame({"varName": var_lst, "iv": scores})
    ranked = table.sort_values(by='iv', ascending=False)

    # Put 'varName' first and keep the remaining columns in their order.
    remaining = [col for col in ranked.columns if col != 'varName']
    return ranked.loc[:, ['varName'] + remaining]
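## IV of several numeric variables in a DataFrame
## var_lst_num: names of the numeric columns to score
## Yvar: binary target values (0 / 1)
## n: number of percentile-based bins for each variable
## Returns a DataFrame of variable names and IV values, sorted by IV in descending order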
def CalcIV_DataFrame_num(df_data, var_lst_num, Yvar, n):
    """Compute the IV of several numeric columns after binning them.

    Each column named in ``var_lst_num`` is turned into bin labels with
    ``cut_group`` and the labels are then scored by
    ``CalcIV_DataFrame_char``.

    Args:
        df_data: DataFrame that holds the columns named in ``var_lst_num``.
        var_lst_num: Names of the numeric columns to score.
        Yvar: Binary target values, passed on to ``CalcIV_DataFrame_char``.
        n: Number of bins for ``cut_group``.

    Returns:
        The DataFrame produced by ``CalcIV_DataFrame_char`` for the binned
        columns.
    """
    # Bin into a separate frame (same index, no columns yet) so the caller's
    # numeric columns are not overwritten with bin labels.
    binned = df_data.iloc[:, :0].copy()
    for var_name in var_lst_num:
        binned[var_name] = cut_group(df_data[var_name], n)
    return CalcIV_DataFrame_char(binned, var_lst_num, Yvar)
############ Example usage ################
if __name__ == '__main__':
    # Load the data set.
    inputfile = 'D:/PycharmProjects/lessonOnLine/data/HR2.csv'
    data = pd.read_csv(inputfile)
    df_data = data

    # Target column and number of bins for the numeric variables.
    Yvar = data['left']
    n = 10

    # Columns scored after binning (numeric).
    var_lst_num = [
        'satisfaction_level',
        'last_evaluation',
        'average_monthly_hours',
    ]
    # Columns scored as-is (discrete).
    var_lst_char = [
        'department',
        'salary',
        'number_project',
        'time_spend_company',
        'Work_accident',
        'promotion_last_5years',
    ]

    # IV of the single column 'salary'.
    res = CalcIV_Single(data['salary'], Yvar)

    # Bin labels of one numeric column.
    Xvar = data['average_monthly_hours']
    res_cut = cut_group(Xvar, n)

    # IV table of the numeric columns.
    df_iv_num = CalcIV_DataFrame_num(df_data, var_lst_num, Yvar, n)
    # IV table of the discrete columns.
    df_iv_char = CalcIV_DataFrame_char(df_data, var_lst_char, Yvar)
```
参考:
https://www.cnblogs.com/bigdatafengkong/p/9079934.html
https://www.jianshu.com/p/cc4724a373f8