高校の合格点数の分析
9066 ワード
高校の合格点数の分析と予測
import pandas as pd
import os
import re
#moveDocLocation 15\16\17
def avg_moveDocLocation(historydata,year=2018):
historydata=historydata.fillna(-1)
historydata["matricDiffYear"]=historydata["matricDiffYear"].astype('int64')
historydata=historydata.sort_values(by=["collegeCode","order","matricDiffYear"])
collegecode=historydata["collegeCode"]
unique_collegecode = collegecode.unique()
result=pd.DataFrame()
for code in unique_collegecode:
temp=historydata.loc[historydata.collegeCode==code,["collegeCode","collegeName","order","matricDiffYear","moveDocLocation","averageLocation","moveDocGrade"]]
name= temp.loc[0,"collegeName"]
tt_rst=pd.DataFrame()
for order in temp["order"].unique():
temp_order=temp.loc[temp.order==order,:]
temp_year1 = temp_order.loc[temp_order.matricDiffYear>=(year-3),:]
temp_year2=temp_year1.loc[temp_year1.matricDiffYear<=(year-1),:]
temp_year3=temp_year2.loc[temp_year2.moveDocLocation>0,:]
temp_year4=temp_year3.loc[temp_year3.averageLocation>0,:]
tmp_value=temp_order.loc[temp_order.matricDiffYear==year,"moveDocGrade"]
if len(tmp_value)==0:
tmp_moveDocGrade=0
else:
tmp_moveDocGrade=int(tmp_value)
mean_mov_avg=list(temp_year4[["moveDocLocation","averageLocation"]].mean())
temp_result = pd.DataFrame([code,name,order,mean_mov_avg[0],mean_mov_avg[1],tmp_moveDocGrade],
index = ["collegecode","collegeName","order","moveDocLocation","averageLocation","moveDocGrade"])
temp_result=temp_result.transpose()
tt_rst=pd.concat([tt_rst,temp_result])
result=pd.concat([result,tt_rst])
result[["moveDocLocation","averageLocation"]]=result[["moveDocLocation","averageLocation"]].astype('float32')
return result
#
def get_recruittotal(df_source,df_input,year):
df_source=df_source.loc[df_source["year"].isin([year,year-1]),:]
df_source=df_source[["collegeCode","order","year","recruitTotal"]]
df_source=df_source.drop_duplicates()
df_source1=df_source.loc[df_source["year"].isin([year-1]),:]
df_source2=df_source.loc[df_source["year"].isin([year]),:]
# for i in range(len(df_input)):
# collegercode = df_input.loc[i,"collegeCode"]
# order = df_input.loc[i,"order"]
# recruitTotal =
result1=pd.merge(df_input,df_source1,how="inner",left_on=["collegecode","order"],right_on=["collegeCode","order"])
result2=pd.merge(df_input,df_source2,how="inner",left_on=["collegecode","order"],right_on=["collegeCode","order"])
if len(df_source1)==0:
result = result2
else:
result = pd.merge(result1,result2,how="inner",left_on=["collegecode","order"],right_on=["collegecode","order"])
return result
#
def get_score(df_source,df_input):
df_source.index = range(len(df_source))
df_source=df_source.fillna(999999999)
df_source["moveScore"]=0
df_source["avgScore"]=0
df_source["midScore"]=0
for i in range(len(df_source)):
moveDocLocation=df_source.loc[i,"moveDocLocation"]
averageLocation=df_source.loc[i,"averageLocation"]
flag1=0
flag2=0
for j in range(len(df_input)):
if flag1==0:
if j==(len(df_input)-1):
moveDocLocationscore=0
if flag2==0:
averageLocationscore=0
continue
if moveDocLocation=df_input.iloc[j,2]:
moveDocLocationscore=df_input.iloc[j,0]
flag1=1
if moveDocLocation =df_input.iloc[j,2]:
averageLocationscore=df_input.iloc[j,0]
flag2=1
if averageLocation df_object.loc[i,"2018 "]:
df_object.loc[i,"add5score"]=5
df_object = df_object[[" "," ","add5score"]]
return df_object
#
def result_socre(df_object):
df_object.index = range(len(df_object))
df_object["moveScore5"]=0
df_object["avgScore5"]=0
df_object["midScore5"]=0
for i in range(len(df_object)):
if int(df_object.loc[i,"moveScore"])<=0:
t=0
else:
t=df_object.loc[i,'add5score']
df_object.loc[i,"moveScore5"]=int(df_object.loc[i,"moveScore"])+t
df_object.loc[i,"avgScore5"]=int(df_object.loc[i,"avgScore"])+t
df_object.loc[i,"midScore5"]=int(df_object.loc[i,"midScore"])+t
return df_object
#####################################################
pd.set_option('display.float_format', lambda x: '%.3f' % x)
work_directory = r'D:\Nemo\ \data'
os.chdir(work_directory)
filename0 = "2018 ( ).xls"
filename1 = ' CollegePlan.xlsx'
filename2 = ' .xlsx'
filename3 = ' .xlsx'
filename4 = ' .xls'
#
art_cp=pd.read_excel(filename4)
science_cp = pd.read_excel(filename3)
#
art_score=pd.read_excel(filename0,sheetname=" ")
science_score = pd.read_excel(filename0,sheetname=" ")
# “ ”
for i in range(len(science_score)):
science_score.iloc[i,0]=re.sub(re.compile(" "),"",science_score.iloc[i,0])
for i in range(len(art_score)):
art_score.iloc[i,0]=re.sub(re.compile(" "),"",art_score.iloc[i,0])
#art_CollegePlan=pd.read_excel(filename1,sheetname="CollegePlan ")
#science_CollegePlan=pd.read_excel(filename1,sheetname="CollegePlan ")
#
art_historydata=pd.read_excel(filename2,sheetname=" ")
science_historydata=pd.read_excel(filename2,sheetname=" ")
#
sicence_avg_moveDocLocation = avg_moveDocLocation(science_historydata)
#science_avg=get_recruittotal(science_CollegePlan,sicence_avg_moveDocLocation,2018)
science_final=get_score(sicence_avg_moveDocLocation,science_score)
art_avg_moveDocLocation = avg_moveDocLocation(art_historydata)
#art_avg=get_recruittotal(art_CollegePlan,art_avg_moveDocLocation,2018)
art_final=get_score(art_avg_moveDocLocation,art_score)
#
art_cp = compare_cp(art_cp)
science_cp = compare_cp(science_cp)
#
art_result = pd.merge(art_final,art_cp,how="inner",left_on=["collegeName","order"],right_on=[" "," "])
science_result = pd.merge(science_final,art_cp,how="inner",left_on=["collegeName","order"],right_on=[" "," "])
# +5 or not,
art_result_score = result_socre(art_result)
science_result_score = result_socre(science_result)
def compare_rslt(df_obj):
df_obj["moveScore"]=df_obj["moveScore"].astype('int64')
df_obj["avgScore"]=df_obj["avgScore"].astype('int64')
df_obj["moveScore5 "]=df_obj.moveScore5>df_obj.moveDocGrade
df_obj["avgScore5 "]=df_obj.avgScore5>df_obj.moveDocGrade
df_obj["midScore5 "]=df_obj.midScore5>df_obj.moveDocGrade
df_obj.loc[df_obj.moveScore5 ==True,"moveScore5 "]='√'
df_obj.loc[df_obj.moveScore5 ==False,"moveScore5 "]='×'
df_obj.loc[df_obj.avgScore5 ==True,"avgScore5 "]='√'
df_obj.loc[df_obj.avgScore5 ==False,"avgScore5 "]='×'
df_obj.loc[df_obj.midScore5 ==True,"midScore5 "]='√'
df_obj.loc[df_obj.midScore5 ==False,"midScore5 "]='×'
df_obj=df_obj.drop(labels=[' ', ' '],axis=1)
return df_obj
art_result_score = compare_rslt(art_result)
science_result_score = compare_rslt(science_result)
#
writer = pd.ExcelWriter(' result.xlsx')
art_result_score.to_excel(writer,' ')
science_result_score.to_excel(writer,' ')
writer.save()