Pythonの定番ライブラリ: pandas

10028 ワード

Python 3

pandasはNumPyをベースにしたライブラリで、Pythonでのデータ処理によく使われます。

Series

DataFrame

Index

1. Series

>>> import pandas as pd
>>> import numpy as np
>>> s = pd.Series([7,'beijing',2.1,2,'happy'])
>>> s
0          7
1    beijing
2        2.1
3          2
4      happy
dtype: object
>>> s = pd.Series([7,'beijing',2.1,2,'happy'],index = ['A','B','C','D','E']) # indexを指定する
>>> s
A          7
B    beijing
C        2.1
D          2
E      happy
dtype: object
>>> type(s)
<class 'pandas.core.series.Series'>
>>> cities = {'beijing':55000,'shanghai':60000,'shenzhen':40000,'guangzhou':25000}
>>> cities
{'beijing': 55000, 'shanghai': 60000, 'shenzhen': 40000, 'guangzhou': 25000}
>>> apts = pd.Series(cities)    # dictからSeriesを作成できる。Seriesはkey-valueペアの集まりとみなせる
>>> apts
beijing      55000
guangzhou    25000
shanghai     60000
shenzhen     40000
dtype: int64
>>> apts[['beijing','shenzhen']]
beijing     55000
shenzhen    40000
dtype: int64
>>> apts[apts<50000]
guangzhou    25000
shenzhen     40000
dtype: int64
>>> 'beijing' in apts        # キーがSeriesのindexに含まれるかを判定
True

>>> apts[apts.isnull()]       # valueがnullの要素を抽出
Series([], dtype: int64)

>>> apts[apts.notnull()]    # valueがnullでない要素を抽出
beijing      55000
guangzhou    25000
shanghai     60000
shenzhen     40000
dtype: int64

2. DataFrame
DataFrameは1枚の表です。Seriesが1次元の配列を表すのに対し、DataFrameは2次元の配列を表します。


>>> import pandas as pd
>>> import numpy as np
>>> data = {'cities':['beijing','shanghai','guangzhou','shenzhen'],'years':[2014,2015,2016,2017],'population':[1000,2000,3000,4000]}
>>> type(pd.DataFrame(data))
<class 'pandas.core.frame.DataFrame'>
>>> pd.DataFrame(data)                    # dictからDataFrameを作成
      cities  population  years
0    beijing        1000   2014
1   shanghai        2000   2015
2  guangzhou        3000   2016
3   shenzhen        4000   2017
>>> pd.DataFrame(data,columns=['years','cities','population'])# 列の順序を指定
   years     cities  population
0   2014    beijing        1000
1   2015   shanghai        2000
2   2016  guangzhou        3000
3   2017   shenzhen        4000
>>> pd.DataFrame(data,columns=['years','cities','population'],index=['one','two','three','foue'])# 行のindexを指定（'foue'は'four'の誤記と思われる）
       years     cities  population
one     2014    beijing        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2 = pd.DataFrame(data,columns=['years','cities','population'],index=['one','two','three','foue'])
>>> frame2
       years     cities  population
one     2014    beijing        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2['cities']# 列を取得
one        beijing
two       shanghai
three    guangzhou
foue      shenzhen
Name: cities, dtype: object
>>> frame2.cities
one        beijing
two       shanghai
three    guangzhou
foue      shenzhen
Name: cities, dtype: object
>>> frame2.ix['three']# 行を取得（注: .ixはpandas 1.0で削除された。現在はラベル指定に.loc、位置指定に.ilocを使う）
years              2016
cities        guangzhou
population         3000
Name: three, dtype: object
>>> frame2.ix[2]
years              2016
cities        guangzhou
population         3000
Name: three, dtype: object
>>> frame2['cities']['one'] = 'zhuhai'# 要素の値を変更（連鎖代入のため下記の警告が出る。frame2.loc['one','cities'] = 'zhuhai' が推奨）

Warning (from warnings module):
  File "__main__", line 1
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
>>> frame2
       years     cities  population
one     2014     zhuhai        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2['population'] = 1# 列全体に同じ値を代入
>>> frame2
       years     cities  population
one     2014     zhuhai           1
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
>>> frame2.ix['five'] = 1# 行を追加
>>> frame2
       years     cities  population
one     2014     zhuhai           1
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
five       1          1           1
>>> frame2.ix['one'] = 2
>>> frame2
       years     cities  population
one        2          2           2
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
five       1          1           1
>>> frame2.years = np.arange(5)# 列に配列を代入
>>> frame2
       years     cities  population
one        0          2           2
two        1   shanghai           1
three      2  guangzhou           1
foue       3   shenzhen           1
five       4          1           1
>>> val = pd.Series([200,300,500],index=['two','three','five'])
>>> frame2['population'] = val# Seriesを列に代入（indexが一致しない行はNaNになる）
>>> frame2
       years     cities  population
one        0          2         NaN
two        1   shanghai       200.0
three      2  guangzhou       300.0
foue       3   shenzhen         NaN
five       4          1       500.0
>>> frame2.columns
Index(['years', 'cities', 'population'], dtype='object')
>>> frame2.index
Index(['one', 'two', 'three', 'foue', 'five'], dtype='object')
>>> frame2.T# 転置
            one       two      three      foue five
years         0         1          2         3    4
cities        2  shanghai  guangzhou  shenzhen    1
population  NaN       200        300       NaN  500
>>> frame2['cities'][1:2]# 列の一部をスライスで取得
two    shanghai
Name: cities, dtype: object
>>>

3. Index
indexに関するいくつかの操作を紹介します。


>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.Series(range(3))
>>> obj
0    0
1    1
2    2
dtype: int64
>>> obj = pd.Series(range(3),index=['a','b','c'])
>>> obj
a    0
b    1
c    2
dtype: int64
>>> obj[[0,2]]
a    0
c    2
dtype: int64
>>> obj[0:2]
a    0
b    1
dtype: int64
>>> obj['a':'c']
a    0
b    1
c    2
dtype: int64
>>> obj['a':'c'] = 3
>>> obj
a    3
b    3
c    3
dtype: int64
>>> frame = pd.DataFrame(np.arange(9).reshape(3,3),index = ['a','b','c'],columns=['beijing','shanghai','guangzhou'])
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.ix['a':'c']
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.ix[['a','c'],['beijing','guangzhou']]
   beijing  guangzhou
a        0          2
c        6          8
>>> frame.ix[:,'beijing':'guangzhou']
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.reindex(['e','f','g','h'])
   beijing  shanghai  guangzhou
e      NaN       NaN        NaN
f      NaN       NaN        NaN
g      NaN       NaN        NaN
h      NaN       NaN        NaN
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.drop('a')
   beijing  shanghai  guangzhou
b        3         4          5
c        6         7          8
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame = frame.drop('a')
>>> frame
   beijing  shanghai  guangzhou
b        3         4          5
c        6         7          8
>>> data = pd.Series(np.random.randn(10),index=[['a','a','a','b','b','c','c','c','d','d'],[1,2,3,1,2,1,2,3,1,2]])
>>> data
a  1   -0.060544
   2   -1.680403
   3    0.408582
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>> data.index
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 2, 3, 3], [0, 1, 2, 0, 1, 0, 1, 2, 0, 1]])
>>> data['b':'d']
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>> data[1:4]
a  2   -1.680403
   3    0.408582
b  1    1.001766
dtype: float64
>>> data.unstack()
          1         2         3
a -0.060544 -1.680403  0.408582
b  1.001766  1.320155       NaN
c -1.125726  1.508404  0.640139
d  0.824988  0.148888       NaN
>>> data.unstack().stack()
a  1   -0.060544
   2   -1.680403
   3    0.408582
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>>

Pythonでファイル拡張子を一括変更する実装

64ビットのUbuntu 14.10システムにPython 2.7.9をインストールします。