pythonデータ洗浄データテーブル処理

6691 ワード

pythonコード python基礎文法ビジネス分析

カスタムディレクトリのタイトルをここに書きます

1.共通ライブラリ

2.データ読み出し

3. データはファイル

に保存される

4.データフィルタ

4.1データ表示

4.2単純インデックス

4.3 locとilocの使用

5.データの追加と削除

5.1列

を追加

5.2削除

5.3データ修正と検索

5.4データ整理

4.5階層化インデックス

1.共通ライブラリ

import pandas as pd
import numpy as np
import xlrd  #Excel    
import os
#      
os.chdir(r'E:\python\  ')

2.データ読み出し
一般的にutf−8、gbk、gbk 2312、およびgb 18030などが符号化される.

#  csv  
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8',dtype={'user_id':str})
df1 = pd.read_csv('sam_tianchi_mum_baby.csv',encoding = 'utf-8',dtype =str)  #           
#  excel  
import xlrd
workbook = xlrd.open_workbook('meal_order_detail.xlsx')
sheet_name = workbook.sheet_names() #    sheet   
order1 = pd.read_excel('meal_order_detail.xlsx',sheet_name ='meal_order_detail1',engine="python",encoding = "gbk")   #            ，  engine="python"

open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None)  #     ，  ，  ，    ，      ，   （        ），  closefd False                     。
"""
   
r--    (  )
w--      ，      
x--               
a--      ，      ，        
b--     
t--    (  )
+--          (   )
u--       (   )
"""

3.データをファイルに保存

df.to_csv(‘df.csv’,index=False)
df.to_excel('a1.xlsx', sheet_name=‘sheet1', index= False,encoding='utf-8")

module'pandas'has no attribute'to_Excel’は、to_Excel()関数は、Dataframesタイプで使用する必要があります.解決策:

data = pd.DataFrame(data)
writer = pd.ExcelWriter("       .xlsx")
data.to_excel(writer,'Sheet1')
writer.save()

4.データフィルタ
4.1データの表示

df.info()
df.head(5) #   5 
df.tail(5) #   5 
df.columns #

4.2単純索引

df['user_id']
df['user_id'][1:5]    #        （       ）
df[['user_id','buy_mount','day']][:5]   #

4.3 locとilocの使用

df.loc[3:4]    #        ，      3 4  
df.loc[:,['user_id','buy_mount']]    #     
df.loc[1:3,['user_id','buy_mount']]    #loc            
df.loc[df.user_id =='786295544',['user_id','buy_mount','day']]
df.loc[(df.user_id =='786295544') | (df.user_id =='444069173'),['user_id','buy_mount','day']]    #       

#  iloc   
df.iloc[:,1:4]    #              
df.iloc[:,[0,2]]    #         1   3 
df.iloc[3,[1,2]]    #   4 ， 2   3   ,    3             
df.iloc[2:7,[1,2]]    #   3   7 ， 2   3   

#  loc iloc   
df.loc[2:7]     #      2,3,4,5,6,7  
df.iloc[2:7]     #   3   7

5.データの追加と削除
5.1 1列追加

#    ,   ,     3   ，  3   
df['   '] = np.where(df['buy_mount'] >3,' ',' ')
#     dataframe    ，          
#     append    dataframe    

"""    insert  
df.insert(  ,    ， )
 auction_id   ，    """
#         ，     auction_id,           ，       
auction_id = df['auction_id']
del df['auction_id']
df.insert(0, 'auction_id', auction_id)
df.head(5)

5.2削除

#      , inplace           ,1        
#         ，        
#   inplace =True,          ,         ，           
# labels       , axis     ，inplace=True          ,
# axis=0    （   ，       ）, axis=1    
df.drop(labels = ['property', '   '],axis = 1,inplace=True) #     , inplace           , 1        
#      
df.drop(labels = [3,4],inplace = True,axis= 0) #       3 4    
df.drop(labels= range(6,11),axis=0,inplace=True)  #      1 10,  range       1 10
df     #

5.3データの修正と検索

df1 = pd.read_csv('sam_tianchi_mum_baby.csv',encoding = 'utf-8',dtype =str)
df1.head(5)
#  gender 0     ，1    ，2    
df1.loc[df['gender'] =='0','gender'] ='  '
df1.loc[df['gender'] =='1','gender'] ='  '
df1.loc[df['gender'] =='2','gender'] ='  '
df1.head(10)
#     
basic.rename(columns={},index={})
#            
df1.rename(columns = {'user_id':'  ID','birthday':'    ','gender':'  '},inplace = True)
df1.rename(index = {1:'one',10:'ten' },inplace = True) #       
df1.reset_index(drop=True,inplace=True)#     
df1.head(10)
#  
#     
df[df.buy_mount > 3] #      
df[~(df.buy_mount > 3)] # ~   
df[ (df.buy_mount > 3) &  (df.day > 20140101)] #      
#  between,inclusive=True    
df[ df['buy_mount'].between(4,10,inclusive=True)]
#   pd.isin()  
#   
df[df['auction_id'].isin([41098319944, 17916191097,21896936223])]

5.4データ整理
横積みはデータ洗浄では一般的ではありませんが、縦積みは異なるテーブル、フィールド名と同じように理解できます.統合

import xlrd
workbook = xlrd.open_workbook('meal_order_detail.xlsx')
sheet_name = workbook.sheet_names() #    sheet   
sheet_name  #     
order1 = pd.read_excel('meal_order_detail.xlsx',sheet_name ='meal_order_detail1')
order2 = pd.read_excel('meal_order_detail.xlsx',sheet_name ='meal_order_detail2')
order3 = pd.read_excel('meal_order_detail.xlsx',sheet_name ='meal_order_detail3')
order = pd.concat([order1,order2,order3],axis=0,ignore_index=False)#        
order1.shape

#           
basic = pd.DataFrame()
for i in sheet_name:
    basic_i = pd.read_excel('meal_order_detail.xlsx', header = 0,sheet_name=i,encoding='utf-8')
    basic = pd.concat([basic,basic_i],axis=0)
basic.shape

#  ，          
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8',dtype={'user_id':str})#     
df1 = pd.read_csv('sam_tianchi_mum_baby.csv',encoding = 'utf-8',dtype =str)#    
df2 = pd.merge(left = df, right=df1,  how='inner',  left_on='user_id', right_on = 'user_id')#    
df2.head(10)

4.5階層化インデックス

df = pd.read_csv('baby_trade_history.csv', encoding='utf-8',dtype={'user_id':str},index_col=[3,0])#    4   1     
df.loc[28] #     
df.loc[28].loc[[82830661,532110457]]#     

#      
df3.loc[(a,b),:] #a b              

#  tuple
df.loc[(28,[82830661,532110457]),:]#        ，    
df.loc[(28,[82830661,532110457]),['auction_id','cat_id']]#        ，  2   
df.loc[([28,50014815])] #      28 50014815

【Python】爬虫類アンドロイド偽装コード

複数NICマシンでのルーティング設定（static route）