Pythonデータ分析ライブラリpandasの実践(上)田超凡

25088 ワード

        :    20190410

CSDNブログ:https://blog.csdn.net/qq_30056341
 
Seriesの作成と属性の使用
 
# coding: utf-8

# In[10]:


from pandas import Series,DataFrame;
import pandas as pd;
import numpy as np;

# Raw values shared by the Series examples in this cell.
arr = np.array([1, 2, 3, 4, 5])

# 1. Build a Series straight from the ndarray (no explicit index is given).
series1 = Series(arr)
series1

# Print the element dtype, the index object and the underlying value array.
print(f"series     ==>{series1.dtype}")
print(f"series    ==>{series1.index}")
print(f"series   ==>{series1.values}")

# Build the Series again, this time with explicit labels and an explicit dtype.
series1 = Series(arr, index=list("ABCDE"), dtype=np.int32)
series1


# In[12]:


# 2. Build a Series from a dict: the keys become the index labels and the
# values become the data.
# The mapping is deliberately not called `dict`: rebinding that name would hide
# the builtin type from every statement that runs afterwards in this session.
label_to_value = {
    "A": 1,
    "B": 2,
    "C": 3,
}

series1 = Series(label_to_value)
series1


# In[15]:


# 3. Naming a Series
# Both the Series itself and its index can carry a name.
series1.name = "      "
series1.index.name = "    "
series1.name


# In[31]:


# 4. Series attributes
# Axis labels (for a Series: a list holding the single index)
series1.axes

# Element dtype
series1.dtype

# True when the Series holds no elements
series1.empty

# Number of dimensions
series1.ndim

# Number of elements
series1.size

# Values as an ndarray
series1.values

# First n elements
series1.head(2)

# Last n elements
series1.tail(2)


# In[43]:


# Selecting elements of a Series
series1

# Single label
series1["A"]
# Label slice: both end points are included
series1["A":"C"]
# Positional slice: spelled with .iloc so it cannot be mistaken for a label
# slice; the end position is excluded
series1.iloc[0:2]


Series数学計算
# coding: utf-8

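# Series arithmetic
# A Series supports element-wise arithmetic: an operation with a scalar is applied to every element. ![image.png](attachment:image.png)
# For numerical work a Series behaves much like a NumPy ndarray, so most ndarray operations and NumPy functions can be applied to a Series directly. ![image.png](attachment:image.png)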

# In[13]:


from pandas import Series,DataFrame
import pandas as pd;
import numpy as np;

# Arithmetic between a Series and a scalar
arr = np.array([3, 8, 7, 5, 56, 72.5, 48.5, 120])
series_index = list("ABCDEFGH")
series = Series(arr, index=series_index, dtype=np.float64)
series

# Each operation is applied element by element and returns a new Series.
print(series.add(3))
print("======================")
print(series.sub(3))
print("======================")
print(series.mul(3))
print("======================")
print(series.div(3))
print("======================")
print(series.pow(3))
print("======================")
# NumPy functions accept a Series directly.
print(np.fabs(series))
print("======================")
print(np.square(series))


# In[27]:


# Arithmetic between two Series

# Values are combined label by label. A label that exists in only one of the
# two Series has no partner, so the result for that label is NaN.
arr1 = np.random.randint(1, 9, 5)
arr2 = np.random.randint(1, 9, 4)
series1 = Series(arr1, index=list("ABCDE"))
series2 = Series(arr2, index=list("ACDE"))

print(arr1)
print("======================")
print(arr2)
print("======================")
print(series1.add(series2))


# In[37]:


# Filtering missing values
# isnull marks NaN entries with True; notnull marks NaN entries with False.
add_series = series1.add(series2)
add_series

add_series.isnull()
add_series.notnull()

# Keep only the missing entries
print(add_series[add_series.isnull()])
print("=============================================")

# Keep only the non-missing entries
print(add_series[add_series.notnull()])


 
DataFrameの作成と使用
 
# coding: utf-8

#     Series  DataFrame  ,      。(DataFrame        ,DataFrame           )
#                   (eg:     )
#         Series、DataFrame                  
#  Series  DataFrame               

# In[12]:


import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;

# 1. Build a DataFrame from a two-dimensional ndarray
# DataFrame(data, index, columns, dtype, copy)
# `data` may be, among others, an ndarray, a dict or a Series.
# `index` labels the rows and `columns` labels the columns.


arr = np.array(
    [
        ["Tom", " ", "18"],
        ["Jim", " ", "19"],
        ["Cindy", " ", "17"],
    ]
)

data_frame = DataFrame(arr, index=[1, 2, 3], columns=["name", "sex", "age"])
data_frame

# Print the row index, the column index and the raw values.
print("=============================================")
print(f"   ==>{data_frame.index}")
print(f"   ==>{data_frame.columns}")
print(f"  ==>{data_frame.values}")

data_frame


# In[20]:


# Build a DataFrame from a dict
# Every value list must have the same length; each key becomes a column name
# and its list becomes that column's data.
# The mapping is deliberately not called `dict`, so the builtin type stays
# usable for the rest of the session.
columns_to_values = {
    "name": ["Tom", "Jim", "Cindy"],
    "sex": [" ", " ", " "],
    "age": [18, 19, 17],
}
data_frame = DataFrame(columns_to_values, index=[1, 2, 3])
data_frame


# In[41]:


# Build a DataFrame from zipped sequences
# zip pairs up the elements at the same position of each input and yields the
# pairs as tuples; zip(*pairs) reverses that and gives the sequences back.
a = [1, 2, 3]
b = [4, 5, 6]
zipped = zip(a, b)
data = [*zipped]  # [(1, 4), (2, 5), (3, 6)]; this exhausts the iterator

# Unzip again: one tuple per original sequence, i.e. one row per sequence.
unzipped = [*zip(*data)]
data_frame = DataFrame(np.array(unzipped))
data_frame


# In[2]:


# set_index: use existing columns as the row index
# The frame holds random integers, so the tables below only illustrate the
# layout of each result.
data_frame = DataFrame(np.random.randint(1, 9, size=(3, 4)))
data_frame

#    0  1  2  3
# 0  4  7  3  3
# 1  6  8  8  3
# 2  8  7  7  6
print(data_frame)
print("==============================")

# A single column becomes the index; it is removed from the data columns.
#    0  1  3
# 2
# 3  4  7  3
# 8  6  8  3
# 7  8  7  6
df2 = data_frame.set_index(keys=2)
print(df2)
print("==============================")

# Several columns become a hierarchical (multi-level) index.
#      0  1
# 2 3
# 3 3  4  7
# 8 3  6  8
# 7 6  8  7
df3 = data_frame.set_index(keys=[2, 3])
print(df3)
print("==============================")

# reset_index: go back to the default integer index
# drop=True discards the old index instead of inserting it as a column.
#    0  1  2  3
# 0  4  7  3  3
# 1  6  8  8  3
# 2  8  7  7  6
df4 = data_frame.reset_index(drop=True)
print(df4)


# In[1]:





DataFrameデータ操作
 
# coding: utf-8

# In[140]:


import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;

# DataFrame data operations
# 1. Selecting data from a DataFrame
arr = np.arange(10, 34).reshape((4, 6))
df1 = DataFrame(arr, columns=list("ABCDEF"), index=["O", "P", "Q", "R"])

#     A   B   C   D   E   F
# O  10  11  12  13  14  15
# P  16  17  18  19  20  21
# Q  22  23  24  25  26  27
# R  28  29  30  31  32  33

# Selecting one column returns a Series:
# O    12
# P    18
# Q    24
# R    30
# Name: C, dtype: int32
# print(df1, end="\n\n")
# print(df1["C"], end="\n\n")

# Row selection: use loc[start_label:end_label] (label based, end included)
# or iloc[start:end] (position based). The old .ix indexer no longer exists.
df1.loc["Q":"R"]

# Selecting several columns at once takes a list of column names.
df1[["A", "C", "E"]]

# 2. Modifying a DataFrame
# Overwrite a whole column.
df1["C"] = np.arange(1, 5)
df1

# Delete a column.
del df1["F"]
#     A   B  C   D   E
# O  10  11  1  13  14
# P  16  17  2  19  20
# Q  22  23  3  25  26
# R  28  29  4  31  32
df1

# Add a column by assigning to a new name.
#     A   B  C   D   E  F
# O  10  11  1  13  14  5
# P  16  17  2  19  20  6
# Q  22  23  3  25  26  7
# R  28  29  4  31  32  8
df1["F"] = np.arange(5, 9)
df1

# loc with a row slice and a column slice selects a rectangular region.
df2 = df1.loc["O":"P", "A":"C"]
df2

# Overwrite one row.
df1.loc["P"] = np.arange(6, 12)
df1

# Overwrite several rows at once (the value must have shape rows x columns).
#     A   B  C   D   E   F
# O  10  11  1  13  14   5
# P   1   2  3   4   5   6
# Q   7   8  9  10  11  12
# R  28  29  4  31  32   8
df1.loc["P":"Q"] = np.arange(1, 13).reshape(2, 6)
df1

# Delete a row.
#     A   B  C   D   E   F
# P   1   2  3   4   5   6
# Q   7   8  9  10  11  12
# R  28  29  4  31  32   8
df1 = df1.drop("O")
df1


# In[192]:


# Combining a DataFrame with a Series
# Adding a row: either df.loc[label] = series, or concatenate the Series as a
# one-row frame. The Series name becomes the row label and the Series index
# must match the column names.
#           A   B   C   D   E   F
# P         1   2   3   4   5   6
# Q         7   8   9  10  11  12
# R        28  29   4  31  32   8
# index_1  50  51  52  53  54  55
series1 = Series(np.arange(50, 56), index=["A", "B", "C", "D", "E", "F"])
series1.name = "index_1"
series1

# df1.loc["index_1"] = series1
# DataFrame.append was removed in pandas 2.0; pd.concat with the Series turned
# into a single-row frame produces the same result.
df2 = pd.concat([df1, series1.to_frame().T])
df2

df3 = df2.drop(series1.name)
df3

# Adding a column from a Series
# The Series index is matched against the row labels of the frame.
series2 = Series(np.arange(13, 16), index=["P", "Q", "R"])
series2.name = "index_2"
df1[series2.name] = series2
df1

del df1[series2.name]
df1


# In[209]:


# Combining a DataFrame with another DataFrame
# Adding rows: the column names of both frames must match.
df1

new_df = DataFrame(
    np.arange(13, 31).reshape(3, 6),
    index=["R", "S", "T"],
    columns=list("ABCDEF"),
)
new_df

# Rows are stacked, not merged: df1 already has a row "R", so the result
# contains two rows labelled "R".
#     A   B   C   D   E   F
# P   1   2   3   4   5   6
# Q   7   8   9  10  11  12
# R  28  29   4  31  32   8
# R  13  14  15  16  17  18
# S  19  20  21  22  23  24
# T  25  26  27  28  29  30
df2 = pd.concat([df1, new_df])
df2

# Adding columns: the row labels of both frames must match.
df3 = df2.drop(["R", "S", "T"])
df3

# Assign the single column of each helper frame explicitly as a Series.
new_df = DataFrame(np.arange(7, 9), index=["P", "Q"], columns=["G"])
df3["G"] = new_df["G"]

new_df = DataFrame(np.arange(13, 15), index=["P", "Q"], columns=["H"])
df3["H"] = new_df["H"]

#    A  B  C   D   E   F  G   H
# P  1  2  3   4   5   6  7  13
# Q  7  8  9  10  11  12  8  14
df3

データファイル、テキストファイルの読み取りと格納
# coding: utf-8

# In[24]:


import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
import json;

# The pandas read_xxx functions load an external file and return a DataFrame.
# The encoding passed in must match the file: this example assumes that
# data_test.csv is GBK encoded (a UTF-8 file needs encoding="utf-8").

# 1. read_csv reads a delimited text file
# pd.read_csv(file_path, sep, header, encoding): path, field separator, row to
# use as the header (None = the file has no header row), text encoding.

df = pd.read_csv("data_test.csv", sep=",", header=None, encoding="gbk")
df

# Writing a DataFrame to CSV
df2 = DataFrame(
    np.array(
        [
            ["Tom", "100", " "],
            ["Gina", "100", " "],
            ["Cindy", "100", " "],
        ]
    ),
    index=[0, 1, 2],
    columns=["name", "score", "sex"],
)
df2

df2.to_csv("data01.csv", sep=",", encoding="gbk")


# In[36]:


# Writing a DataFrame to Excel and reading it back
# No `encoding` argument is passed: current pandas versions do not accept it
# for to_excel or read_excel.
df2.to_excel("data.xlsx")

df3 = pd.read_excel("data.xlsx")
df3

# Writing a DataFrame to JSON and reading it back
df2.to_json("data.json")
print("OK")

df4 = pd.read_json("data.json")
df4


Pandas数学統計関数
# coding: utf-8

# In[34]:


import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;

# Descriptive statistics in pandas
# On a Series each statistic is a single value; on a DataFrame it is computed
# column by column unless another axis is requested.
df = DataFrame(np.arange(3, 27).reshape(4, 6))
df

series = Series(np.arange(1, 11))
series

# Columns 1, 3, 4 and 5 are about to receive NaN. Cast them to float first so
# the assignments do not depend on implicit int -> float upcasting.
df = df.astype({1: float, 3: float, 4: float, 5: float})
df.loc[0, 1] = np.nan
df.loc[0, 3] = np.nan
df.loc[1, 1] = np.nan
df.loc[3, 3:5] = np.nan  # label slice: columns 3, 4 and 5
df

# NaN entries are skipped by the statistics below.
print("count           ==>\n" + str(df.count()))
df
print("=============================")
print("describe ==>\n" + str(df.describe()))
print("=============================")
print("max ==>\n" + str(df.max()))
print("=============================")
print("min ==>\n" + str(df.min()))
print("=============================")
print("idxmax ==>\n" + str(df.idxmax()))
print("=============================")
print("idxmin ==>\n" + str(df.idxmin()))
print("=============================")
print("quantile ( ) ==>\n" + str(df.quantile()))
print("=============================")
print("sum ==>\n" + str(df.sum()))
print("=============================")
print("mean ==>\n" + str(df.mean()))
print("=============================")
print("median ==>\n" + str(df.median()))
df


# In[45]:


# Dispersion and cumulative statistics, computed after replacing NaN with 0
df = df.replace(np.nan, 0)
print("===================================================================")
# DataFrame.mad was removed in pandas 2.0; the mean absolute deviation is the
# mean of the absolute differences from each column's mean.
print("mad ==>\n" + str((df - df.mean()).abs().mean()))
print("=============================")
print("var ==>\n" + str(df.var()))
print("=============================")
print("std ==>\n" + str(df.std()))
print("=============================")
print("cumsum ==>\n" + str(df.cumsum()))
print("=============================")
print("cummin ==>\n" + str(df.cummin()))
print("=============================")
print("cummax ==>\n" + str(df.cummax()))
print("=============================")
print("cumprod ==>\n" + str(df.cumprod()))
print("=============================")
print("pct_change ==>\n" + str(df.pct_change()))
df

Pandasアルゴリズム演算関数とデータ整列
 
# coding: utf-8

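# Correlation and covariance: measuring how two variables change together
# Correlation coefficient: a normalised measure (between -1 and 1) of the strength and direction of the linear relationship between two variables. It is the covariance (COV) divided by the product of the two standard deviations, which removes the effect of the variables' scales.
# Covariance (COV): measures whether two variables tend to move in the same direction (positive) or in opposite directions (negative).
# ![image.png](attachment:image.png)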

# In[1]:


import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;

# Covariance and correlation in pandas
# Covariance measures how two variables vary together. For n paired
# observations (x_i, y_i), pandas computes the sample covariance
#     cov(X, Y) = sum((x_i - mean(X)) * (y_i - mean(Y))) / (n - 1)
#
# Reading the result P = cov(X, Y):
#   P > 0: X and Y tend to move in the same direction
#   P < 0: X and Y tend to move in opposite directions
# The magnitude of P depends on the scale of the data, so it is not a
# normalised measure of how strong the relationship is.

# In[80]:


df = DataFrame(np.arange(21, 41).reshape(4, 5))
series = Series(np.arange(33, 38))
series.name = 4
# DataFrame.append was removed in pandas 2.0. Turn the Series into a one-row
# frame (row label = Series name) and concatenate it instead.
df2 = pd.concat([df, series.to_frame().T])
df2

# df.cov() returns the covariance matrix of all column pairs.
# A.cov(B) on two Series returns the covariance of A and B; the two Series are
# aligned on their index first, so only labels present in both are used.
# Covariance of columns 3 and 4
p = df2[3].cov(df[4])
p

df2.cov()

# Covariance of rows 0 and 1
p = df2.loc[0].cov(df2.loc[1])
p

df2

# NumPy equivalent: every row of both inputs is treated as one variable.
np.cov(df, df2)

# df.cov()
df2.cov()


# Correlation coefficient: corr()
# The correlation coefficient describes how strongly X and Y are linearly
# related. Unlike the covariance it is normalised:
#   - the covariance can take any value between -inf and +inf, the correlation
#     coefficient always lies between -1 and +1
#   - +1: perfect positive linear relationship
#   -  0: no linear relationship
#   - -1: perfect negative linear relationship
# ![image.png](attachment:image.png)
#
# Formula (Pearson):
#     R = cov(X, Y) / (std(X) * std(Y))
#     -1 <= R <= 1

インデックス、選択、およびデータフィルタ
# coding: utf-8

# In[14]:


import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;

# Selecting part of a DataFrame returns a new DataFrame.
df = DataFrame(np.arange(100, 150).reshape(5, 10))
df

# Columns 1-3
#      1    2    3
# 0  101  102  103
# 1  111  112  113
# 2  121  122  123
# 3  131  132  133
# 4  141  142  143
df2 = df[[1, 2, 3]]
df2

# Rows from label 2 onwards, columns 4-6 (loc slices include both ends)
#      4    5    6
# 2  124  125  126
# 3  134  135  136
# 4  144  145  146
df3 = df.loc[2:, 4:6]
df3

# RangeIndex(start=2, stop=10, step=1)
df.columns[2:]


# In[34]:


# Handling missing data (applies to both Series and DataFrame):
# drop the affected rows/columns, or fill the gaps.

# Three cells are set to NaN. The affected columns (1, 3 and 9) are cast to
# float first so the assignments do not depend on implicit upcasting.
#      0      1    2      3    4    5    6    7    8      9
# 0  100  101.0  102  103.0  104  105  106  107  108  109.0
# 1  110  111.0  112  113.0  114  115  116  117  118  119.0
# 2  120    NaN  122  123.0  124  125  126  127  128  129.0
# 3  130  131.0  132    NaN  134  135  136  137  138  139.0
# 4  140  141.0  142  143.0  144  145  146  147  148    NaN
df = df.astype({1: float, 3: float, 9: float})
df.loc[2, 1] = np.nan
df.loc[3, 3] = np.nan
df.loc[4, 9] = np.nan
df

# 1. dropna removes rows / columns that contain missing values

# df.dropna() drops every row containing a NaN; only rows 0 and 1 remain.
# df2 = df.dropna()

# df.dropna(axis=1) drops every column containing a NaN; columns 0, 2, 4, 5,
# 6, 7 and 8 remain.
# df2 = df.dropna(axis=1)

# how="all" drops a row/column only when all of its values are missing, so
# nothing is removed from this frame.
df2 = df.dropna(how="all", axis=1)
df2


# In[93]:


# 2. fillna fills the missing values
df2

# Fill with a constant
df3 = df2.fillna(0)
df3

# Fill from a neighbouring row
# ffill copies the last valid value above the gap downwards;
# bfill copies the next valid value below the gap upwards.
# (ffill()/bfill() replace the deprecated fillna(method=...) spelling.)
# After ffill: (2, 1) -> 111.0, (3, 3) -> 123.0, (4, 9) -> 139.0
df3 = df2.ffill()
df3

# After bfill: (2, 1) -> 131.0, (3, 3) -> 143.0; (4, 9) stays NaN because
# there is no row below it.
df3 = df2.bfill()
df3

df2

# Fill per column with a dict {column label: fill value}. Column 1 is not
# listed and keeps its NaN; columns 3 and 9 receive the given strings.
df3 = df2.fillna({3: "33333", 9: "99999"})
df3

# df.replace swaps a given value for another one, here NaN for the string "0".
df3 = df.replace(np.nan, "0")
df3

# pd.isnull(df) returns a boolean DataFrame: True where the value is missing.
# True at (2, 1), (3, 3) and (4, 9); False everywhere else.
df3 = pd.isnull(df)
df3

# pd.notnull(df) is the inverse of isnull: False where the value is missing.
df3 = pd.notnull(df2)
df3


ユニークな値、値のカウント、メンバーシップ、インデックスおよびソート
# coding: utf-8

# In[109]:


import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;

# 1. Unique values: unique() returns the distinct values of a Series.
# A DataFrame has no unique(); select one row or column (a Series) first.
df = DataFrame(
    np.random.randint(1, 9, (5, 5)),
    index=list("ABCDE"),
    columns=list("abcde"),
)
series = Series(np.arange(11, 16), index=list("ABCDE"))
series.name = "index_one"

# pd.unique(df)
print(df)
print(pd.unique(df.loc["D"]))
print(df.loc["D"].unique())

# 2. Value counts: value_counts() counts how often each value occurs in a
# Series and returns the counts as a new Series, most frequent value first.
# For a DataFrame, select the row or column to count first.
# The Series method is used because the top-level pd.value_counts function is
# deprecated.
df.loc["D"].value_counts()
df["a"].value_counts()
df.loc["A"].value_counts()

# 3. Membership: isin() tests every element of a DataFrame/Series against the
# given values and returns an object of the same shape holding True where the
# element is one of them and False otherwise.
df.isin([1, 2, 3])
series.isin([14, 15])


# In[110]:


# Hierarchical (multi-level) indexing in pandas
# 1. A Series with a hierarchical index: pass one list of labels per level.
series = Series(
    np.arange(11, 16),
    index=[
        ["2011", "2012", "2013", "2014", "2015"],  # level 0
        list("ABCDE"),  # level 1
        list("abcde"),  # level 2
        list("12345"),  # level 3
    ],
)
series

# Select by the first level
series["2011"]
# Select by the first two levels
series["2011"]["A"]
# Select by all levels: step by step, or with all keys at once
series["2011"]["A"]["a"]["1"]
series["2011", "A", "a", "1"]

# swaplevel() exchanges two index levels
# series.swaplevel()

df = DataFrame(
    {
        "name": ["zhangsan", "lisi", "wangwu"],
        "sex": [" ", " ", " "],
        "age": [18, 20, 25],
    }
)
df

# Series.unstack() moves the innermost index level into the columns and
# returns a DataFrame.
df2 = series.unstack()
df2

# DataFrame.stack() moves the columns into the innermost index level and
# returns a Series.
series = df.stack()
series


# In[132]:


# A DataFrame with a hierarchical index
# set_index([...]) turns several columns into index levels; the first column
# listed becomes the outermost level.
df = df.reset_index()
df = df.set_index(["name", "sex"])
df

# Aggregate per index level. The `level=` argument of sum/mean/min was removed
# in pandas 2.0; grouping by the level gives the same result.
df.groupby(level="sex").sum()
df["age"].groupby(level="sex").mean()
df["age"].groupby(level="sex").min()
df

df3 = df.reset_index()
df3

df3 = df3.set_index(["name", "sex"])
df3

df3.groupby(level="sex").mean()
df3


# In[170]:


# Sorting by index, shown on a Series with a three-level index
series = Series(
    np.arange(15, 18).reshape(3),
    index=[
        ["A", "B", "C"],  # level 0
        ["A2", "B2", "C2"],  # level 1
        ["A3", "B3", "C3"],  # level 2
    ],
)
series

# unstack() moves one index level into the columns; with no `level` argument
# the innermost level is used.
df = series.unstack()
df

# sort_index() sorts a Series or DataFrame by its labels.
# For a DataFrame, `axis` picks which labels are sorted: 0 sorts by the row
# labels, 1 (or "columns") sorts by the column labels.
# `ascending` chooses the direction and defaults to True.
series.sort_index(ascending=True)
df.sort_index(axis="columns", ascending=False)


# In[226]:


# Sorting by value
# sort_values() sorts a Series or DataFrame by its values.
df5 = DataFrame(np.arange(300, 321).reshape(3, 7), index=["A", "B", "C"])
df5

# `by` names the label(s) whose values drive the sort; a list gives several
# keys, applied in order.
# With axis=0, `by` holds column labels and the rows are reordered; with
# axis=1 ("columns"), `by` holds row labels and the columns are reordered.
df6 = df5.sort_values(by="A", axis="columns", ascending=True)
df6

df6 = df6.sort_values(by=["B", "C"], axis="columns", ascending=False)
df6


# In[286]:


# Ranking with rank(): ranks start at 1 for the smallest value.
# Equal values (ties) share a rank; `method` decides which rank they receive.

# rank() on a Series
series2 = Series(
    np.arange(15, 17),
    index=[
        ["A", "B"],  # level 0
        ["A2", "B2"],  # level 1
        ["A3", "B3"],  # level 2
    ],
)
series2

series3 = Series(
    np.arange(17, 21),
    index=[
        ["C", "D", "E", "F"],  # level 0
        ["C2", "D2", "E2", "F2"],  # level 1
        ["C3", "D3", "E3", "F3"],  # level 2
    ],
)
series3

# Series.append was removed in pandas 2.0; pd.concat joins the two Series.
series4 = pd.concat([series2, series3])
series4

# Set the elements at positions 2 and 4 to 14 to create a tie.
# A single .loc call with the full key writes into series4 itself. Chained
# indexing (series4["C"]["C2"]["C3"] = 14) assigns into an intermediate object
# and may leave series4 unchanged, e.g. with copy-on-write enabled.
series4.loc[("C", "C2", "C3")] = 14
series4.loc[("E", "E2", "E3")] = 14
series4

# Ranks of the two tied values (both 14, the smallest values):
series4.rank()  # default "average": both get 1.5
series4.rank(method="min")  # both get the lowest rank, 1.0
series4.rank(method="max")  # both get the highest rank, 2.0

# rank() on a DataFrame
df8 = DataFrame(np.arange(21, 36).reshape(3, 5))
df8

# Set columns 2-4 of row 1 to 25 (loc slices include both ends).
df8.loc[1, 2:4] = 25
df8

# A DataFrame is ranked along one axis: axis=0 (default) ranks the values
# within each column, axis=1 ranks the values within each row.
# Ties are resolved as for a Series: "average" (default) gives the mean of the
# tied ranks, "min" the lowest and "max" the highest.
df8.rank()
df8.loc[0, 3] = 25

df8.rank()
df8.rank(method="average")
df8.rank(method="min")
df8.rank(method="max")
df8.rank(axis=1)
df8.rank(axis=1, method="min")
df8.rank(axis=1, method="max")


Pandas時系列
# coding: utf-8

# In[23]:


import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;

# Time series in pandas
# pandas.date_range(start=None, end=None, periods=None, freq='D') builds a
# fixed-frequency sequence of timestamps and returns it as a DatetimeIndex.
# start:   string or datetime-like, default None; first timestamp of the
#          range, e.g. in the form yyyy-MM-dd HH:mm:ss.
# end:     string or datetime-like, default None; last timestamp of the range,
#          same format as start.
#          With both start and end given, the range runs from start to end at
#          the chosen frequency.
# periods: integer or None, default None; number of timestamps to generate.
#          It is used together with only one of start / end.
# freq:    string or DateOffset, default 'D' (calendar day); the spacing of
#          the timestamps. Multiples are allowed, e.g. '5H' means every 5 hours.

# A daily range given by its two end points:
# DatetimeIndex(['2019-04-01', '2019-04-02', '2019-04-03', '2019-04-04',
#                '2019-04-05', '2019-04-06', '2019-04-07', '2019-04-08',
#                '2019-04-09', '2019-04-10'],
#               dtype='datetime64[ns]', freq='D')
pd.date_range(start="20190401", end="20190410")

# freq sets the spacing and periods the number of timestamps; the range is
# counted forwards from start or backwards from end.
pd.date_range(start="20190401", periods=5, freq="D")
pd.date_range(end="20190410 21:00", periods=5, freq="h")