Pythonでよく使われるライブラリ:pandas

10028 ワード

pandasはNumPyをベースにしたライブラリで、Pythonでのデータ処理によく使われます。本記事では次の3つを扱います。
  • Series
  • DataFrame
  • Index

    1. Series
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series([7,'beijing',2.1,2,'happy'])
    >>> s
    0          7
    1    beijing
    2        2.1
    3          2
    4      happy
    dtype: object
    >>> s = pd.Series([7,'beijing',2.1,2,'happy'],index = ['A','B','C','D','E']) # indexを指定する
    >>> s
    A          7
    B    beijing
    C        2.1
    D          2
    E      happy
    dtype: object
    >>> type(s)
    <class 'pandas.core.series.Series'>
    >>> cities = {'beijing':55000,'shanghai':60000,'shenzhen':40000,'guangzhou':25000}
    >>> cities
    {'beijing': 55000, 'shanghai': 60000, 'shenzhen': 40000, 'guangzhou': 25000}
    >>> apts = pd.Series(cities)    # dictからSeriesを作成できる。Seriesはkeyとvalueのペアの集まりと見なせる
    >>> apts
    beijing      55000
    guangzhou    25000
    shanghai     60000
    shenzhen     40000
    dtype: int64
    >>> apts[['beijing','shenzhen']]
    beijing     55000
    shenzhen    40000
    dtype: int64
    >>> apts[apts<50000]
    guangzhou    25000
    shenzhen     40000
    dtype: int64
    >>> 'beijing' in apts        # あるindexがSeriesに含まれるかを判定する
    True
    
    >>> apts[apts.isnull()]       # valueがnullの要素を取り出す
    Series([], dtype: int64)
    
    >>> apts[apts.notnull()]    # valueがnullでない要素を取り出す
    	 
    beijing      55000
    guangzhou    25000
    shanghai     60000
    shenzhen     40000
    dtype: int64

    2. DataFrame
    DataFrameは1枚の表に相当します。Seriesが1次元の配列を表すのに対し、DataFrameは2次元の配列を表します。
    
    >>> import pandas as pd
    >>> import numpy as np
    >>> data = {'cities':['beijing','shanghai','guangzhou','shenzhen'],'years':[2014,2015,2016,2017],'population':[1000,2000,3000,4000]}
    >>> type(pd.DataFrame(data))
    <class 'pandas.core.frame.DataFrame'>
    >>> pd.DataFrame(data)                    # dictからDataFrameを作成する
          cities  population  years
    0    beijing        1000   2014
    1   shanghai        2000   2015
    2  guangzhou        3000   2016
    3   shenzhen        4000   2017
    >>> pd.DataFrame(data,columns=['years','cities','population'])# 列の順序を指定する
       years     cities  population
    0   2014    beijing        1000
    1   2015   shanghai        2000
    2   2016  guangzhou        3000
    3   2017   shenzhen        4000
    >>> pd.DataFrame(data,columns=['years','cities','population'],index=['one','two','three','foue'])# 行のindexを指定する
           years     cities  population
    one     2014    beijing        1000
    two     2015   shanghai        2000
    three   2016  guangzhou        3000
    foue    2017   shenzhen        4000
    >>> frame2 = pd.DataFrame(data,columns=['years','cities','population'],index=['one','two','three','foue'])
    >>> frame2
           years     cities  population
    one     2014    beijing        1000
    two     2015   shanghai        2000
    three   2016  guangzhou        3000
    foue    2017   shenzhen        4000
    >>> frame2['cities']# 列を取り出す
    one        beijing
    two       shanghai
    three    guangzhou
    foue      shenzhen
    Name: cities, dtype: object
    >>> frame2.cities
    one        beijing
    two       shanghai
    three    guangzhou
    foue      shenzhen
    Name: cities, dtype: object
    >>> frame2.ix['three']# 行を取り出す(注: .ixはpandas 1.0で削除された。現在は.loc / .ilocを使う)
    years              2016
    cities        guangzhou
    population         3000
    Name: three, dtype: object
    >>> frame2.ix[2]
    years              2016
    cities        guangzhou
    population         3000
    Name: three, dtype: object
    >>> frame2['cities']['one'] = 'zhuhai'# 要素に値を代入する(連鎖インデックスによる代入のため、下のSettingWithCopyWarningが出る)
    
    Warning (from warnings module):
      File "__main__", line 1
    SettingWithCopyWarning: 
    A value is trying to be set on a copy of a slice from a DataFrame
    
    See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
    >>> frame2
           years     cities  population
    one     2014     zhuhai        1000
    two     2015   shanghai        2000
    three   2016  guangzhou        3000
    foue    2017   shenzhen        4000
    >>> frame2['population'] = 1# 列全体に同じ値を代入する
    >>> frame2
           years     cities  population
    one     2014     zhuhai           1
    two     2015   shanghai           1
    three   2016  guangzhou           1
    foue    2017   shenzhen           1
    >>> frame2.ix['five'] = 1# 存在しないindexに代入して行を追加する
    >>> frame2
           years     cities  population
    one     2014     zhuhai           1
    two     2015   shanghai           1
    three   2016  guangzhou           1
    foue    2017   shenzhen           1
    five       1          1           1
    >>> frame2.ix['one'] = 2
    >>> frame2
           years     cities  population
    one        2          2           2
    two     2015   shanghai           1
    three   2016  guangzhou           1
    foue    2017   shenzhen           1
    five       1          1           1
    >>> frame2.years = np.arange(5)# 配列を列に代入する
    >>> frame2
           years     cities  population
    one        0          2           2
    two        1   shanghai           1
    three      2  guangzhou           1
    foue       3   shenzhen           1
    five       4          1           1
    >>> val = pd.Series([200,300,500],index=['two','three','five'])
    >>> frame2['population'] = val# Seriesを列に代入する(indexが一致しない行はNaNになる)
    >>> frame2
           years     cities  population
    one        0          2         NaN
    two        1   shanghai       200.0
    three      2  guangzhou       300.0
    foue       3   shenzhen         NaN
    five       4          1       500.0
    >>> frame2.columns
    Index(['years', 'cities', 'population'], dtype='object')
    >>> frame2.index
    Index(['one', 'two', 'three', 'foue', 'five'], dtype='object')
    >>> frame2.T# 転置
                one       two      three      foue five
    years         0         1          2         3    4
    cities        2  shanghai  guangzhou  shenzhen    1
    population  NaN       200        300       NaN  500
    >>> frame2['cities'][1:2]# 列の一部をスライスで取り出す
    two    shanghai
    Name: cities, dtype: object
    >>> 

    3. Index
    indexのいくつかの操作について
    
    >>> import pandas as pd
    >>> import numpy as np
    >>> obj = pd.Series(range(3))
    >>> obj
    0    0
    1    1
    2    2
    dtype: int64
    >>> obj = pd.Series(range(3),index=['a','b','c'])
    >>> obj
    a    0
    b    1
    c    2
    dtype: int64
    >>> obj[[0,2]]
    a    0
    c    2
    dtype: int64
    >>> obj[0:2]
    a    0
    b    1
    dtype: int64
    >>> obj['a':'c']
    a    0
    b    1
    c    2
    dtype: int64
    >>> obj['a':'c'] = 3
    >>> obj
    a    3
    b    3
    c    3
    dtype: int64
    >>> frame = pd.DataFrame(np.arange(9).reshape(3,3),index = ['a','b','c'],columns=['beijing','shanghai','guangzhou'])
    >>> frame
       beijing  shanghai  guangzhou
    a        0         1          2
    b        3         4          5
    c        6         7          8
    >>> frame.ix['a':'c']
       beijing  shanghai  guangzhou
    a        0         1          2
    b        3         4          5
    c        6         7          8
    >>> frame.ix[['a','c'],['beijing','guangzhou']]
       beijing  guangzhou
    a        0          2
    c        6          8
    >>> frame.ix[:,'beijing':'guangzhou']
       beijing  shanghai  guangzhou
    a        0         1          2
    b        3         4          5
    c        6         7          8
    >>> frame.reindex(['e','f','g','h'])
       beijing  shanghai  guangzhou
    e      NaN       NaN        NaN
    f      NaN       NaN        NaN
    g      NaN       NaN        NaN
    h      NaN       NaN        NaN
    >>> frame
       beijing  shanghai  guangzhou
    a        0         1          2
    b        3         4          5
    c        6         7          8
    >>> frame.drop('a')
       beijing  shanghai  guangzhou
    b        3         4          5
    c        6         7          8
    >>> frame
       beijing  shanghai  guangzhou
    a        0         1          2
    b        3         4          5
    c        6         7          8
    >>> frame = frame.drop('a')
    >>> frame
       beijing  shanghai  guangzhou
    b        3         4          5
    c        6         7          8
    >>> data = pd.Series(np.random.randn(10),index=[['a','a','a','b','b','c','c','c','d','d'],[1,2,3,1,2,1,2,3,1,2]])
    >>> data
    a  1   -0.060544
       2   -1.680403
       3    0.408582
    b  1    1.001766
       2    1.320155
    c  1   -1.125726
       2    1.508404
       3    0.640139
    d  1    0.824988
       2    0.148888
    dtype: float64
    >>> data.index
    MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
               labels=[[0, 0, 0, 1, 1, 2, 2, 2, 3, 3], [0, 1, 2, 0, 1, 0, 1, 2, 0, 1]])
    >>> data['b':'d']
    b  1    1.001766
       2    1.320155
    c  1   -1.125726
       2    1.508404
       3    0.640139
    d  1    0.824988
       2    0.148888
    dtype: float64
    >>> data[1:4]
    a  2   -1.680403
       3    0.408582
    b  1    1.001766
    dtype: float64
    >>> data.unstack()
              1         2         3
    a -0.060544 -1.680403  0.408582
    b  1.001766  1.320155       NaN
    c -1.125726  1.508404  0.640139
    d  0.824988  0.148888       NaN
    >>> data.unstack().stack()
    a  1   -0.060544
       2   -1.680403
       3    0.408582
    b  1    1.001766
       2    1.320155
    c  1   -1.125726
       2    1.508404
       3    0.640139
    d  1    0.824988
       2    0.148888
    dtype: float64
    >>>