pythonデータ洗浄データテーブル処理


カスタムディレクトリのタイトルをここに書きます
  • 1.共通ライブラリ
  • 2.データ読み出し
  • 3. データはファイル
  • に保存される
  • 4.データフィルタ
  • 4.1データ表示
  • 4.2単純インデックス
  • 4.3 locとilocの使用
  • 5.データの追加と削除
  • 5.1列
  • を追加
  • 5.2削除
  • 5.3データ修正と検索
  • 5.4データ整理
  • 4.5階層化インデックス

  • 1.共通ライブラリ
    import pandas as pd
    import numpy as np
    import xlrd  #Excel    
    import os
    #      
    os.chdir(r'E:\python\  ')
    

    2.データ読み出し
    一般的にutf−8、gbk、gbk 2312、およびgb 18030などが符号化される.
    #  csv  
    df = pd.read_csv('baby_trade_history.csv', encoding='utf-8',dtype={'user_id':str})
    df1 = pd.read_csv('sam_tianchi_mum_baby.csv',encoding = 'utf-8',dtype =str)  #           
    #  excel  
    import xlrd
    workbook = xlrd.open_workbook('meal_order_detail.xlsx')
    sheet_name = workbook.sheet_names() #    sheet   
    order1 = pd.read_excel('meal_order_detail.xlsx',sheet_name ='meal_order_detail1',engine="python",encoding = "gbk")   #            ,  engine="python"
    
    open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None)  #     ,  ,  ,    ,      ,   (        ),  closefd False                     。
    """
       
    r--    (  )
    w--      ,      
    x--               
    a--      ,      ,        
    b--     
    t--    (  )
    +--          (   )
    u--       (   )
    """
    

    3.データをファイルに保存
    df.to_csv(‘df.csv’,index=False)
    df.to_excel('a1.xlsx', sheet_name=‘sheet1', index= False,encoding='utf-8")
    

    module'pandas'has no attribute'to_Excel’は、to_Excel()関数は、Dataframesタイプで使用する必要があります.解決策:
    data = pd.DataFrame(data)
    writer = pd.ExcelWriter("       .xlsx")
    data.to_excel(writer,'Sheet1')
    writer.save()
    

    4.データフィルタ
    4.1データの表示
    df.info()
    df.head(5) #   5 
    df.tail(5) #   5 
    df.columns #      
    

    4.2単純索引
    df['user_id']
    df['user_id'][1:5]    #        (       )
    df[['user_id','buy_mount','day']][:5]   #      
    

    4.3 locとilocの使用
    df.loc[3:4]    #        ,      3 4  
    df.loc[:,['user_id','buy_mount']]    #     
    df.loc[1:3,['user_id','buy_mount']]    #loc            
    df.loc[df.user_id =='786295544',['user_id','buy_mount','day']]
    df.loc[(df.user_id =='786295544') | (df.user_id =='444069173'),['user_id','buy_mount','day']]    #       
    
    #  iloc   
    df.iloc[:,1:4]    #              
    df.iloc[:,[0,2]]    #         1   3 
    df.iloc[3,[1,2]]    #   4 , 2   3   ,    3             
    df.iloc[2:7,[1,2]]    #   3   7 , 2   3   
    
    #  loc iloc   
    df.loc[2:7]     #      2,3,4,5,6,7  
    df.iloc[2:7]     #   3   7 
    

    5.データの追加と削除
    5.1 1列追加
    #    ,   ,     3   ,  3   
    df['   '] = np.where(df['buy_mount'] >3,' ',' ')
    #     dataframe    ,          
    #     append    dataframe    
    
    """    insert  
    df.insert(  ,    , )
     auction_id   ,    """
    #         ,     auction_id,           ,       
    auction_id = df['auction_id']
    del df['auction_id']
    df.insert(0, 'auction_id', auction_id)
    df.head(5)
    

    5.2削除
    #      , inplace           ,1        
    #         ,        
    #   inplace =True,          ,         ,           
    # labels       , axis     ,inplace=True          ,
    # axis=0    (   ,       ), axis=1    
    df.drop(labels = ['property', '   '],axis = 1,inplace=True) #     , inplace           , 1        
    #      
    df.drop(labels = [3,4],inplace = True,axis= 0) #       3 4    
    df.drop(labels= range(6,11),axis=0,inplace=True)  #      1 10,  range       1 10
    df     #   
    

    5.3データの修正と検索
    df1 = pd.read_csv('sam_tianchi_mum_baby.csv',encoding = 'utf-8',dtype =str)
    df1.head(5)
    #  gender 0     ,1    ,2    
    df1.loc[df['gender'] =='0','gender'] ='  '
    df1.loc[df['gender'] =='1','gender'] ='  '
    df1.loc[df['gender'] =='2','gender'] ='  '
    df1.head(10)
    #     
    basic.rename(columns={},index={})
    #            
    df1.rename(columns = {'user_id':'  ID','birthday':'    ','gender':'  '},inplace = True)
    df1.rename(index = {1:'one',10:'ten' },inplace = True) #       
    df1.reset_index(drop=True,inplace=True)#     
    df1.head(10)
    #  
    #     
    df[df.buy_mount > 3] #      
    df[~(df.buy_mount > 3)] # ~   
    df[ (df.buy_mount > 3) &  (df.day > 20140101)] #      
    #  between,inclusive=True    
    df[ df['buy_mount'].between(4,10,inclusive=True)]
    #   pd.isin()  
    #   
    df[df['auction_id'].isin([41098319944, 17916191097,21896936223])]
    

    5.4データ整理
    横積みはデータ洗浄では一般的ではありませんが、縦積みは異なるテーブル、フィールド名と同じように理解できます.統合
    import xlrd
    workbook = xlrd.open_workbook('meal_order_detail.xlsx')
    sheet_name = workbook.sheet_names() #    sheet   
    sheet_name  #     
    order1 = pd.read_excel('meal_order_detail.xlsx',sheet_name ='meal_order_detail1')
    order2 = pd.read_excel('meal_order_detail.xlsx',sheet_name ='meal_order_detail2')
    order3 = pd.read_excel('meal_order_detail.xlsx',sheet_name ='meal_order_detail3')
    order = pd.concat([order1,order2,order3],axis=0,ignore_index=False)#        
    order1.shape
    
    #           
    basic = pd.DataFrame()
    for i in sheet_name:
        basic_i = pd.read_excel('meal_order_detail.xlsx', header = 0,sheet_name=i,encoding='utf-8')
        basic = pd.concat([basic,basic_i],axis=0)
    basic.shape
    
    #  ,          
    df = pd.read_csv('baby_trade_history.csv', encoding='utf-8',dtype={'user_id':str})#     
    df1 = pd.read_csv('sam_tianchi_mum_baby.csv',encoding = 'utf-8',dtype =str)#    
    df2 = pd.merge(left = df, right=df1,  how='inner',  left_on='user_id', right_on = 'user_id')#    
    df2.head(10)
    

    4.5階層化インデックス
    df = pd.read_csv('baby_trade_history.csv', encoding='utf-8',dtype={'user_id':str},index_col=[3,0])#    4   1     
    df.loc[28] #     
    df.loc[28].loc[[82830661,532110457]]#     
    
    #      
    df3.loc[(a,b),:] #a b              
    
    #  tuple
    df.loc[(28,[82830661,532110457]),:]#        ,    
    df.loc[(28,[82830661,532110457]),['auction_id','cat_id']]#        ,  2   
    df.loc[([28,50014815])] #      28 50014815