python pandas(一)
7050 ワード
#coding=utf-8
import numpy as np
import pandas as pd
df1 = pd.DataFrame(pd.read_csv('./house_data/all/test.csv',header=1)) # .csv
print df1.shape
df2 = pd.DataFrame({"id":[1002,1001,1003,1004,1005,1006],
"date":pd.date_range('20130102', periods=6),
"city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age":[23,44,54,32,34,32],
"category":['100-A','100-B','110-A','110-C','210-A','130-F'],
"price":[1200,np.nan,2133,5433,np.nan,4432]},
columns =['id','date','city','category','age','price'])
df3=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006,1007,1008],
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y'],
"m-point":[10,12,20,40,40,40,30,20]})
print df3.info
'''
print df2.shape #
print df2.info # ( 、 、 、 )
print df2.dtypes #
print df2["city"].dtype # object
print df2.isnull()
print df2['price'].unique() # ,
print df2.values #
print '=============='
print df2.head(3) # 10
print df2.tail(3) # 10
'''
#
df3 = df2.fillna(value=0) # 0
print df3.info
df2['price'] = df2['price'].fillna(df2['price'].mean()) # prince NA ,
print df2.info
df2['city']=df2['city'].map(str.strip) #
print df2
df2['city']=df2['city'].str.lower() #
print df2.info
print df2.dtypes #
# df2['price'] = df2['price'].astype(int) #
# print df2.dtypes #
df2=df2.rename(columns={'category': 'category-size'}) # , df2
print df2.info
# df2['city'] = df2['city'].drop_duplicates() #
# print df2.info
df2['city'] = df2['city'].drop_duplicates(keep='last') # :
print df2.info
df2['city'] = df2['city'].replace('sh', 'shanghai') #
print df2.info
print '======='
''''
df_inner=pd.merge(df2,df3,how='inner') # ,
print 'inner:'
print df_inner.info
df_outer=pd.merge(df2,df3,how='outer') #
print 'outer:'
print df_outer.info
print '========='
df_left=pd.merge(df2,df3,how='left')
print 'left:'
print df_left
print '========='
df_rigth=pd.merge(df2,df3,how='right')
print 'right:'
print df_rigth
'''
df2 = df2.set_index('id') # id
df2 = df2.sort_index() #
print df2
df2 = df2.sort_values(by=['age'])
print df2
df2['group'] = np.where(df2['price'] > 3000,'high','low') # group
print df2
# df2['price'] = np.where(df2['price'] > 3000,'high','low') # group
# print df2
print df2.dtypes
df2.loc[(df2['city'] == 'beijing') & (df2['price'] >= 4000), 'sign']=1
print df2
print '====='
print df2.loc[df2['city']== 'beijing'] # loc
print '===='
print df2.iloc[0:2] #
df2 = df2.reset_index() #
df2=df2.set_index('date') #
print '****'
print df2[:'2013-01-03']
print df2.iloc[:3,:2] # , , 0 , , 。
print df2.iloc[[0,2,5],[2]] # 0、2、5 ,4、5 !!!! ,
print df2
print df2.ix[:'2013-01-03',:4] # 2013-01-03 , ix
print df2['city'].isin(['beijing']) # False True , city
print df2['city'].isin(['beijing','shanghai']) # True False , city beijing shanghai
print df2.loc[df2['city'].isin(['beijing','shanghai'])] # city beijing shanghai,
print df2
print pd.DataFrame(df2['city'].str[:3]) # city , , city
# 、 、 、 、 , 。
#
print df2.loc[(df2['age'] > 25) & (df2['city'] == 'beijing'), ['id','city','age','category-size','price']]
#
print ' :',df2.loc[(df2['age'] > 25) | (df2['city'] == 'beijing'), ['id','city','age','category-size','price']]
# , id ,city.count() , 4
print df2.loc[(df2['city'] != 'beijing'), ['id','city','age','category','gender']].sort_values(['id']).city.count()
# query
print df2.query('city == ["beijing","shanghai"]')
# prince
print df2.query('city == ["beijing","shanghai"]').price.sum() # 11031.0
print '*****'
print df2
print df2.groupby('city').count() #
print '*****'
print df2.groupby('city').id.count() # id
print df2.groupby(['city','age']).id.count() # , city age ,
print df2.groupby('city')['price'].agg([len,np.sum, np.mean]) # city , prince
'''
len sum mean
city
beijing 1.0 4432.0 4432.0
guangzhou 1.0 2133.0 2133.0
shanghai 2.0 6599.0 3299.5
shenzhen 1.0 5433.0 5433.0
'''
df = df2.sample(n=3) # 3
print df
print '===='
weights = [0.8, 0, 0, 0, 0.1, 0.1]
print df2.sample(n=2, weights=weights)
print df2.describe().round(2).T #
'''
count mean std min 25% 50% 75% max
id 6.0 1003.5 1.87 1001.0 1002.25 1003.5 1004.75 1006.0
age 6.0 36.5 10.88 23.0 32.00 33.0 41.50 54.0
price 6.0 3299.5 1523.35 1200.0 2424.62 3299.5 4148.88 5433.0
sign 1.0 1.0 NaN 1.0 1.00 1.0 1.00 1.0
'''
print df2['price'].std() # 1523.35163373
print df2['price'].cov(df2['age']) #
print df2.cov() #
'''
id age price sign
id 3.5 -4.9 1526.1 NaN
age -4.9 118.3 -1353.5 NaN
price 1526.1 -1353.5 2320600.2 NaN
sign NaN NaN NaN NaN
'''
#
print df2['price'].corr(df2['age']) # -1 1 , 1 , -1 ,0 -0.0816894035549328
#
print df2.corr()
df2.to_csv('./excel_to_python.csv') # CSV
df2.to_excel('./excel_to_python.xlsx', sheet_name='bluewhale_cc') # Excel