等頻度分割（ビニング）のコード
5304 ワード
次のコードは、前処理済みの変数分析に必要なデータを読み込むだけです。実行する前に、目的変数（Y値）のフィールド名を「15A」に変更してください。
# -*- coding: utf-8 -*-
"""
"""
import pandas as pd
import numpy as np
import xlrd
#from pandasql import sqldf
#
def xlsxread(self) -> pd.DataFrame:
    """Load an Excel workbook through xlrd and return it as a DataFrame.

    Args:
        self: Location of the workbook, passed straight to
            ``xlrd.open_workbook``. Despite its name this is an ordinary
            function argument, not an instance; the name is kept so that
            existing callers are unaffected.

    Returns:
        The DataFrame that ``pandas.read_excel`` builds from the opened
        workbook (default sheet selection).
    """
    # The override previously read 'gdk', which is not a codec Python knows;
    # 'gbk' (the Chinese code page) is the intended value. xlrd uses the
    # override when it has to decode byte strings stored in the workbook.
    content = xlrd.open_workbook(self, encoding_override='gbk')
    df = pd.read_excel(content, engine='xlrd')
    return df
# Load the source workbook. data15A is a second name for the same DataFrame
# object (no copy is made).
# NOTE(review): the path literal looks like it lost its backslashes when the
# listing was extracted - confirm the real location before running.
data = xlsxread(r"C:a.xls")
data15A = data

# One row per column of `data`: its dtype ('type') and its name ('title1').
# The column label is passed as a list: recent pandas releases refuse a set
# here ("columns cannot be a set").
df_new_dtypes = pd.DataFrame(data.dtypes, columns=['type'])
df_new_dtypes['title1'] = df_new_dtypes.index

# Numeric (int64 / float64) columns. The last numeric column is dropped;
# the original comment marks it as "y", presumably the target - confirm.
df_type_int = df_new_dtypes[
    (df_new_dtypes['type'] == 'int64') | (df_new_dtypes['type'] == 'float64')
]
df_type_int = df_type_int[:-1]
cols = list(df_type_int.index)

# Names of the object-dtype columns.
type_vec = list(df_new_dtypes[df_new_dtypes['type'] == 'object'].index)
#
# cutby_bfw: derive a new column named `<var>_new` from column `var` of `data`.
# NOTE(review): this listing lost its indentation and the end of this function
# (see the truncation note below), so only the visible steps are described.
#   data     - DataFrame to process; the function works on copies of it.
#   list_cut - percentile positions handed to np.percentile.
#   var      - name of the column to process.
def cutby_bfw(data,list_cut,var):
# df (rows with a missing `var` removed) is used to inspect the values;
# df1 is the copy that receives the new column.
df=data.copy()
df1=data.copy()
df=df.dropna(subset=[var])
percentiles=list_cut
new_box=var+'_new'
# Number of distinct non-missing values of `var`.
num=pd.DataFrame(df[var].unique()).shape[0]
# With at most 9 distinct values no cut points are computed: missing values
# are filled with -999 and the column is copied unchanged into the new column.
if num<=9:
df1[var]=df1[var].fillna(-999)
df1[new_box]=df1[var]
else:
# Otherwise compute the values of `var` at the requested percentiles.
l_bin =list(np.percentile(df[var], percentiles))
# Print each computed cut point together with its position.
for i in range(len(l_bin)):
print(i,l_bin[i])
# NOTE(review): the source is truncated here. The next line is the tail of a
# longer expression whose beginning is missing, and everything between it and
# the statements that follow (the rest of this function, plus the definitions
# of data1, cols_bin, writer and numindex used later) is not visible.
if i0 else 0)
# Overwrites column '15A' of data1 with column '15A_1'; data1 is defined in
# the missing part of the file.
data1['15A']=data1['15A_1']
# For every column named in cols_bin, build a WOE / IV / KS summary table
# against the target column '15A' and stack the tables one below another on
# the '15A' sheet of `writer`.
# NOTE(review): cols_bin, writer and numindex are not defined in the visible
# part of the file - presumably they come from the section that was lost.
# The 'bad' count is the sum of '15A', so '15A' is assumed to be a 0/1 flag
# with 1 meaning "bad" - confirm.
for i in cols_bin:
    # Rows where the feature is missing form their own bucket, listed first.
    data1 = data15A[data15A[i].isnull()]
    data2 = data15A[data15A[i].notnull()]
    missing_total = data1['15A'].count()
    missing_bad = data1['15A'].sum()
    data_z = pd.DataFrame(
        [[' ', missing_total, missing_bad, missing_total - missing_bad]],
        columns=[i, 'total', 'bad', 'good'],
    )

    # Per-bucket counts for the non-missing rows, in one aggregation pass.
    data3 = data2.groupby(i)['15A'].agg(total='count', bad='sum')
    data3['good'] = data3['total'] - data3['bad']
    data_Iv = pd.concat([data_z, data3.reset_index()])

    # Share of rows per bucket and bad rate per bucket. These two columns
    # used to share the blank label ' ', so the second assignment overwrote
    # the first and the summary row ended up with duplicate labels; they now
    # have distinct names.
    data_Iv['total%'] = data_Iv['total'] / data_Iv['total'].sum()
    data_Iv['bad_rate'] = data_Iv['bad'] / data_Iv['total']

    # Weight of evidence and information value per bucket.
    bad_share = data_Iv['bad'] / data_Iv['bad'].sum()
    good_share = data_Iv['good'] / data_Iv['good'].sum()
    data_Iv['WOE'] = np.log(bad_share / good_share)
    data_Iv['IV'] = data_Iv['WOE'] * (bad_share - good_share)
    # Buckets with no bad or no good rows yield +/-inf; report them as 0.
    # (np.inf rather than np.Inf: the capitalised alias is gone in NumPy 2.)
    data_Iv.replace([-np.inf, np.inf], 0, inplace=True)

    # Cumulative distributions and the KS distance between them.
    data_Iv['total_good'] = data_Iv['good'].cumsum()
    data_Iv['total_bad'] = data_Iv['bad'].cumsum()
    data_Iv['total_good%'] = data_Iv['total_good'] / data_Iv['good'].sum()
    data_Iv['total_bad%'] = data_Iv['total_bad'] / data_Iv['bad'].sum()
    data_Iv['KS_value'] = abs(data_Iv['total_bad%'] - data_Iv['total_good%'])

    # Summary row: totals, overall bad rate, total IV and the maxima of the
    # cumulative / KS columns. WOE has no meaningful total and is set to 0.
    data_Iv_z = pd.DataFrame(
        [[
            ' ',
            data_Iv['total'].sum(),
            data_Iv['bad'].sum(),
            data_Iv['good'].sum(),
            data_Iv['total%'].sum(),
            data_Iv['bad'].sum() / data_Iv['total'].sum(),
            0,
            data_Iv['IV'].sum(),
            data_Iv['total_good'].max(),
            data_Iv['total_bad'].max(),
            data_Iv['total_good%'].max(),
            data_Iv['total_bad%'].max(),
            data_Iv['KS_value'].max(),
        ]],
        columns=[
            i, 'total', 'bad', 'good', 'total%', 'bad_rate', 'WOE', 'IV',
            'total_good', 'total_bad', 'total_good%', 'total_bad%',
            'KS_value',
        ],
    )
    data_new = pd.concat([data_Iv, data_Iv_z])

    # Write this table below the previous one, leaving a gap before the next.
    data_new.to_excel(writer, sheet_name='15A', startrow=numindex)
    numindex = numindex + data_new.shape[0] + 3
writer.close()