高校の合格点数の分析

9066 ワード

高校の合格点数の分析と予測
import pandas as pd
import os
import re

#moveDocLocation       15\16\17    
def avg_moveDocLocation(historydata,year=2018):
    historydata=historydata.fillna(-1)
    historydata["matricDiffYear"]=historydata["matricDiffYear"].astype('int64')
    historydata=historydata.sort_values(by=["collegeCode","order","matricDiffYear"])
    collegecode=historydata["collegeCode"]
    unique_collegecode = collegecode.unique()
    result=pd.DataFrame()
    for code in unique_collegecode:
        temp=historydata.loc[historydata.collegeCode==code,["collegeCode","collegeName","order","matricDiffYear","moveDocLocation","averageLocation","moveDocGrade"]]
        name= temp.loc[0,"collegeName"]
        tt_rst=pd.DataFrame()
        for order in temp["order"].unique():
            temp_order=temp.loc[temp.order==order,:]
            temp_year1 = temp_order.loc[temp_order.matricDiffYear>=(year-3),:]
            temp_year2=temp_year1.loc[temp_year1.matricDiffYear<=(year-1),:]
            temp_year3=temp_year2.loc[temp_year2.moveDocLocation>0,:]
            temp_year4=temp_year3.loc[temp_year3.averageLocation>0,:]
            tmp_value=temp_order.loc[temp_order.matricDiffYear==year,"moveDocGrade"]
            if len(tmp_value)==0:
                tmp_moveDocGrade=0
            else:
                tmp_moveDocGrade=int(tmp_value)
            mean_mov_avg=list(temp_year4[["moveDocLocation","averageLocation"]].mean())
            temp_result = pd.DataFrame([code,name,order,mean_mov_avg[0],mean_mov_avg[1],tmp_moveDocGrade],
                                       index = ["collegecode","collegeName","order","moveDocLocation","averageLocation","moveDocGrade"])
            temp_result=temp_result.transpose()
            tt_rst=pd.concat([tt_rst,temp_result])
        result=pd.concat([result,tt_rst])
        result[["moveDocLocation","averageLocation"]]=result[["moveDocLocation","averageLocation"]].astype('float32')
    return result


#      
def get_recruittotal(df_source,df_input,year):
    df_source=df_source.loc[df_source["year"].isin([year,year-1]),:]
    df_source=df_source[["collegeCode","order","year","recruitTotal"]]
    df_source=df_source.drop_duplicates()
    df_source1=df_source.loc[df_source["year"].isin([year-1]),:]
    df_source2=df_source.loc[df_source["year"].isin([year]),:]
#    for i in range(len(df_input)):
#        collegercode = df_input.loc[i,"collegeCode"]
#        order = df_input.loc[i,"order"]
#        recruitTotal = 
    result1=pd.merge(df_input,df_source1,how="inner",left_on=["collegecode","order"],right_on=["collegeCode","order"])
    result2=pd.merge(df_input,df_source2,how="inner",left_on=["collegecode","order"],right_on=["collegeCode","order"])
    if len(df_source1)==0:
        result = result2
    else:
        result = pd.merge(result1,result2,how="inner",left_on=["collegecode","order"],right_on=["collegecode","order"])
    return result

#       
def get_score(df_source,df_input):
    df_source.index = range(len(df_source))
    df_source=df_source.fillna(999999999)
    df_source["moveScore"]=0
    df_source["avgScore"]=0
    df_source["midScore"]=0
    for i in range(len(df_source)):
        moveDocLocation=df_source.loc[i,"moveDocLocation"]
        averageLocation=df_source.loc[i,"averageLocation"]
        flag1=0
        flag2=0
        for j in range(len(df_input)):
            if flag1==0:
                if  j==(len(df_input)-1):
                    moveDocLocationscore=0
                    if flag2==0:
                        averageLocationscore=0
                    continue
                if moveDocLocation=df_input.iloc[j,2]:
                    moveDocLocationscore=df_input.iloc[j,0]
                    flag1=1
                if  moveDocLocation =df_input.iloc[j,2]:
                    averageLocationscore=df_input.iloc[j,0]
                    flag2=1
                if  averageLocation df_object.loc[i,"2018   "]:
            df_object.loc[i,"add5score"]=5
    df_object = df_object[["    ","  ","add5score"]]
    return df_object

#         
def result_socre(df_object):
    df_object.index = range(len(df_object))
    df_object["moveScore5"]=0
    df_object["avgScore5"]=0
    df_object["midScore5"]=0
    for i in range(len(df_object)):
        if int(df_object.loc[i,"moveScore"])<=0:
            t=0
        else:
            t=df_object.loc[i,'add5score']
        df_object.loc[i,"moveScore5"]=int(df_object.loc[i,"moveScore"])+t
        df_object.loc[i,"avgScore5"]=int(df_object.loc[i,"avgScore"])+t
        df_object.loc[i,"midScore5"]=int(df_object.loc[i,"midScore"])+t
    return df_object


#####################################################
pd.set_option('display.float_format', lambda x: '%.3f' % x)

work_directory = r'D:\Nemo\    \data'
os.chdir(work_directory)

filename0 = "2018           (  ).xls"
filename1 = '    CollegePlan.xlsx'
filename2 = '        .xlsx'
filename3 = '      .xlsx'
filename4 = '      .xls'

#        
art_cp=pd.read_excel(filename4)
science_cp = pd.read_excel(filename3)

#         
art_score=pd.read_excel(filename0,sheetname="  ")
science_score = pd.read_excel(filename0,sheetname="  ")
#        “ ” 
for i in range(len(science_score)):
    science_score.iloc[i,0]=re.sub(re.compile(" "),"",science_score.iloc[i,0])
for i in range(len(art_score)):
    art_score.iloc[i,0]=re.sub(re.compile(" "),"",art_score.iloc[i,0])


#art_CollegePlan=pd.read_excel(filename1,sheetname="CollegePlan  ")
#science_CollegePlan=pd.read_excel(filename1,sheetname="CollegePlan  ")
#      
art_historydata=pd.read_excel(filename2,sheetname="  ")
science_historydata=pd.read_excel(filename2,sheetname="  ")

#          
sicence_avg_moveDocLocation = avg_moveDocLocation(science_historydata)
#science_avg=get_recruittotal(science_CollegePlan,sicence_avg_moveDocLocation,2018)
science_final=get_score(sicence_avg_moveDocLocation,science_score)

art_avg_moveDocLocation = avg_moveDocLocation(art_historydata)
#art_avg=get_recruittotal(art_CollegePlan,art_avg_moveDocLocation,2018)
art_final=get_score(art_avg_moveDocLocation,art_score)

#      
art_cp = compare_cp(art_cp)
science_cp = compare_cp(science_cp)

#             
art_result = pd.merge(art_final,art_cp,how="inner",left_on=["collegeName","order"],right_on=["    ","  "])
science_result = pd.merge(science_final,art_cp,how="inner",left_on=["collegeName","order"],right_on=["    ","  "])

#   +5 or not,       
art_result_score = result_socre(art_result)
science_result_score = result_socre(science_result)
def compare_rslt(df_obj):
    df_obj["moveScore"]=df_obj["moveScore"].astype('int64')
    df_obj["avgScore"]=df_obj["avgScore"].astype('int64')
    df_obj["moveScore5     "]=df_obj.moveScore5>df_obj.moveDocGrade
    df_obj["avgScore5     "]=df_obj.avgScore5>df_obj.moveDocGrade
    df_obj["midScore5     "]=df_obj.midScore5>df_obj.moveDocGrade
    df_obj.loc[df_obj.moveScore5     ==True,"moveScore5     "]='√'
    df_obj.loc[df_obj.moveScore5     ==False,"moveScore5     "]='×'
    df_obj.loc[df_obj.avgScore5     ==True,"avgScore5     "]='√'
    df_obj.loc[df_obj.avgScore5     ==False,"avgScore5     "]='×'
    df_obj.loc[df_obj.midScore5     ==True,"midScore5     "]='√'
    df_obj.loc[df_obj.midScore5     ==False,"midScore5     "]='×'
    df_obj=df_obj.drop(labels=['    ', '  '],axis=1)
    return df_obj

art_result_score = compare_rslt(art_result)
science_result_score = compare_rslt(science_result)
#       
writer = pd.ExcelWriter('    result.xlsx')
art_result_score.to_excel(writer,'  ')
science_result_score.to_excel(writer,'  ')
writer.save()