python pandas(一)

7050 ワード

#coding=utf-8
import numpy as np
import pandas as pd

df1 = pd.DataFrame(pd.read_csv('./house_data/all/test.csv',header=1))   #   .csv  
print df1.shape

df2 = pd.DataFrame({"id":[1002,1001,1003,1004,1005,1006],
 "date":pd.date_range('20130102', periods=6),
  "city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
 "age":[23,44,54,32,34,32],
 "category":['100-A','100-B','110-A','110-C','210-A','130-F'],
  "price":[1200,np.nan,2133,5433,np.nan,4432]},
  columns =['id','date','city','category','age','price'])

df3=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006,1007,1008],
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y'],
"m-point":[10,12,20,40,40,40,30,20]})
print df3.info

'''
print df2.shape  #     
print df2.info   #        (  、   、    、     )
print df2.dtypes  #           

print df2["city"].dtype  # object
print df2.isnull()
print df2['price'].unique()  #           ,    
print  df2.values  #        
print '=============='
print df2.head(3) #   10   
print df2.tail(3)    #   10    
'''

#      
df3 = df2.fillna(value=0)  #    0    
print df3.info

df2['price'] = df2['price'].fillna(df2['price'].mean())  #    prince    NA      ,       
print df2.info
df2['city']=df2['city'].map(str.strip)  #     
print df2

df2['city']=df2['city'].str.lower()  #      
print df2.info
print df2.dtypes  #           
# df2['price'] = df2['price'].astype(int)  #         
# print df2.dtypes  #           

df2=df2.rename(columns={'category': 'category-size'})  #       ,      df2  
print df2.info
# df2['city'] = df2['city'].drop_duplicates()   #          
# print df2.info

df2['city'] = df2['city'].drop_duplicates(keep='last')  #          :
print df2.info

df2['city'] = df2['city'].replace('sh', 'shanghai')  #     
print df2.info
print '======='

''''
df_inner=pd.merge(df2,df3,how='inner')  #     ,  
print 'inner:'
print df_inner.info

df_outer=pd.merge(df2,df3,how='outer')  #  
print 'outer:'
print df_outer.info

print '========='
df_left=pd.merge(df2,df3,how='left')
print 'left:'
print df_left

print '========='
df_rigth=pd.merge(df2,df3,how='right')
print 'right:'
print df_rigth
'''

df2 = df2.set_index('id')  #   id   
df2 = df2.sort_index()   #        
print df2

df2 = df2.sort_values(by=['age'])
print df2

df2['group'] = np.where(df2['price'] > 3000,'high','low')  #    group 
print df2

# df2['price'] = np.where(df2['price'] > 3000,'high','low')  #    group 
# print df2
print df2.dtypes
df2.loc[(df2['city'] == 'beijing') & (df2['price'] >= 4000), 'sign']=1
print df2

print '====='
print df2.loc[df2['city']== 'beijing']  # loc          
print '===='
print df2.iloc[0:2]  #           

df2 = df2.reset_index()  #     
df2=df2.set_index('date')  #        
print '****'
print df2[:'2013-01-03']
print df2.iloc[:3,:2]  #                  ,         , 0  ,   ,   。

print df2.iloc[[0,2,5],[2]]  #    0、2、5 ,4、5     !!!!          ,        
print df2

print df2.ix[:'2013-01-03',:4] # 2013-01-03   ,         ix              
print df2['city'].isin(['beijing'])  #        False True  ,  city        
print df2['city'].isin(['beijing','shanghai'])  #   True  False  ,  city      beijing shanghai
print df2.loc[df2['city'].isin(['beijing','shanghai'])]  #   city      beijing shanghai,     
print df2
print pd.DataFrame(df2['city'].str[:3])  #   city      ,      ,    city 

#    、 、         、  、         ,        。

#          
print df2.loc[(df2['age'] > 25) & (df2['city'] == 'beijing'), ['id','city','age','category-size','price']]

#         
print ' :',df2.loc[(df2['age'] > 25) | (df2['city'] == 'beijing'), ['id','city','age','category-size','price']]

#         ,  id    ,city.count()     ,  4
print df2.loc[(df2['city'] != 'beijing'), ['id','city','age','category','gender']].sort_values(['id']).city.count()

#   query      
print df2.query('city == ["beijing","shanghai"]')

#         prince    
print df2.query('city == ["beijing","shanghai"]').price.sum()  # 11031.0
print '*****'
print df2
print df2.groupby('city').count()  #               
print '*****'
print df2.groupby('city').id.count()  #     id       
print df2.groupby(['city','age']).id.count()  #            ,   city age   ,    
print df2.groupby('city')['price'].agg([len,np.sum, np.mean]) #  city      ,     prince      
'''
           len     sum    mean
city
beijing    1.0  4432.0  4432.0
guangzhou  1.0  2133.0  2133.0
shanghai   2.0  6599.0  3299.5
shenzhen   1.0  5433.0  5433.0

'''
df = df2.sample(n=3)  #   3   
print df
print '===='
weights = [0.8, 0, 0, 0, 0.1, 0.1]
print df2.sample(n=2, weights=weights)

print df2.describe().round(2).T  #      
'''
      count    mean      std     min      25%     50%      75%     max
id       6.0  1003.5     1.87  1001.0  1002.25  1003.5  1004.75  1006.0
age      6.0    36.5    10.88    23.0    32.00    33.0    41.50    54.0
price    6.0  3299.5  1523.35  1200.0  2424.62  3299.5  4148.88  5433.0
sign     1.0     1.0      NaN     1.0     1.00     1.0     1.00     1.0
'''
print df2['price'].std() #            1523.35163373
print df2['price'].cov(df2['age'])  #        
print df2.cov() #              
'''
        id     age      price  sign
id        3.5    -4.9     1526.1   NaN
age      -4.9   118.3    -1353.5   NaN
price  1526.1 -1353.5  2320600.2   NaN
sign      NaN     NaN        NaN   NaN
'''
#           
print df2['price'].corr(df2['age']) #      -1 1  ,  1    ,  -1    ,0      -0.0816894035549328
#           
print df2.corr()

df2.to_csv('./excel_to_python.csv') #   CSV
df2.to_excel('./excel_to_python.xlsx', sheet_name='bluewhale_cc')  #   Excel