Pythonデータ分析ライブラリpandasの実践(上)田超凡
: 20190410
CSDNブログ:https://blog.csdn.net/qq_30056341
Seriesの作成と属性の使用
# coding: utf-8
# In[10]:
from pandas import Series,DataFrame;
import pandas as pd;
import numpy as np;
# 1. Build a Series from a NumPy array; with no index given, pandas labels
#    the entries 0..n-1.
arr = np.array([1, 2, 3, 4, 5])
series1 = Series(arr)
series1
# Show the element dtype, the index object and the underlying ndarray.
for attribute in (series1.dtype, series1.index, series1.values):
    print("series ==>" + str(attribute))
# Rebuild the Series with explicit string labels and a fixed int32 dtype.
labels = list("ABCDE")
series1 = Series(arr, index=labels, dtype=np.int32)
series1
# In[12]:
# 2. Build a Series from a mapping: the keys become the index labels and the
#    values become the data.
# The mapping is deliberately NOT called `dict`, so the built-in type stays
# usable for the rest of the session.
label_to_value = {
    "A": 1,
    "B": 2,
    "C": 3,
}
series1 = Series(label_to_value)
series1
# In[15]:
# 3. Naming a Series
# Both the Series itself and its index can carry a name.
series1.name = " "
series1.index.name = " "
series1.name
# In[31]:
# 4. Frequently used Series attributes and helpers
# List containing the row-axis labels.
series1.axes
# Data type of the elements.
series1.dtype
# True when the Series holds no elements.
series1.empty
# Number of dimensions (always 1 for a Series).
series1.ndim
# Number of elements.
series1.size
# The data as a NumPy ndarray.
series1.values
# First n elements.
series1.head(2)
# Last n elements.
series1.tail(2)
# In[43]:
# Selecting values from a Series
series1
# A single label returns the scalar stored under it.
series1["A"]
# A label slice includes BOTH end points.
series1["A":"C"]
# An integer slice is positional and excludes the stop position.
series1[0:2]
Series数学計算
# coding: utf-8
# series
# Series , ![image.png](attachment:image.png)
# Series , Series NumPy ndarray 。ndarray Series 。![image.png](attachment:image.png)
# In[13]:
from pandas import Series,DataFrame
import pandas as pd;
import numpy as np;
# Series arithmetic with scalars
# Arithmetic operators and NumPy ufuncs are applied element by element and
# the index labels are carried over to the result.
arr = np.array([3, 8, 7, 5, 56, 72.5, 48.5, 120])
series_index = list("ABCDEFGH")
series = Series(arr, index=series_index, dtype=np.float64)
series
separator = "======================"
results = (
    series + 3,          # addition
    series - 3,          # subtraction
    series * 3,          # multiplication
    series / 3,          # true division
    series ** 3,         # power
    np.fabs(series),     # absolute value
    np.square(series),   # square
)
# Print every result, with a separator line between consecutive results.
for position, result in enumerate(results):
    if position:
        print(separator)
    print(result)
# In[27]:
# Arithmetic between two Series
# The operands are aligned on their index labels first. A label present in
# only one of the two Series produces NaN in the result.
arr1 = np.random.randint(1, 9, 5)
arr2 = np.random.randint(1, 9, 4)
series1 = Series(arr1, index=list("ABCDE"))
series2 = Series(arr2, index=list("ACDE"))
short_rule = "======================"
print(arr1)
print(short_rule)
print(arr2)
print(short_rule)
print(series1 + series2)
# In[37]:
# Detecting missing values
# pd.isnull returns True where a value is NaN; pd.notnull is its complement.
add_series = series1 + series2
add_series
pd.isnull(add_series)
pd.notnull(add_series)
# Boolean-mask selection: only the missing entries ...
missing_mask = pd.isnull(add_series)
print(add_series[missing_mask])
print("=============================================")
# ... and only the entries that hold a value.
present_mask = pd.notnull(add_series)
print(add_series[present_mask])
DataFrameの作成と使用
# coding: utf-8
# Series DataFrame , 。(DataFrame ,DataFrame )
# (eg: )
# Series、DataFrame
# Series DataFrame
# In[12]:
import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
# 1. Creating a DataFrame
# Constructor: DataFrame(data, index, columns, dtype, copy)
# `data` may be, among other things, a 2-D ndarray, a dict of equal-length
# lists, or Series objects.
# (a) From a 2-D array, with explicit row labels and column names.
arr = np.array([
    ["Tom", " ", "18"],
    ["Jim", " ", "19"],
    ["Cindy", " ", "17"],
])
data_frame = DataFrame(arr, index=[1, 2, 3], columns=["name", "sex", "age"])
data_frame
print("=============================================")
print(" ==>" + str(data_frame.index))
print(" ==>" + str(data_frame.columns))
print(" ==>" + str(data_frame.values))
data_frame
# In[20]:
# (b) From a dict: every key becomes a column name and every value (a list)
#     becomes that column's data; all lists must have the same length.
# The mapping is deliberately NOT called `dict`, so the built-in type is not
# shadowed.
columns_by_name = {
    "name": ["Tom", "Jim", "Cindy"],
    "sex": [" ", " ", " "],
    "age": [18, 19, 17],
}
data_frame = DataFrame(columns_by_name, index=[1, 2, 3])
data_frame
# In[41]:
# (c) From zipped sequences.
# zip(a, b) pairs the elements up; zip(*data) undoes the pairing again, so
# the resulting frame has one row per original list.
a = [1, 2, 3]
b = [4, 5, 6]
zipped = zip(a, b)
data = list(zipped)
data_frame = DataFrame(np.array(list(zip(*data))))
data_frame
# In[2]:
# set_index: promote one or more existing columns to the row index.
# The sample tables below are illustrative only -- the data is random, so an
# actual run prints different numbers.
data_frame = DataFrame(np.random.randint(1, 9, (3, 4)))
data_frame
rule = "=============================="
#    0  1  2  3
# 0  4  7  3  3
# 1  6  8  8  3
# 2  8  7  7  6
print(data_frame)
print(rule)
# A single column label: column 2 becomes the index and leaves the columns.
#    0  1  3
# 2
# 3  4  7  3
# 8  6  8  3
# 7  8  7  6
df2 = data_frame.set_index(2)
print(df2)
print(rule)
# A list of labels: columns 2 and 3 together form a two-level index.
#      0  1
# 2 3
# 3 3  4  7
# 8 3  6  8
# 7 6  8  7
df3 = data_frame.set_index([2, 3])
print(df3)
print(rule)
# reset_index: replace the current index with the default 0..n-1 index.
# drop=True discards the old index instead of inserting it as a column.
#    0  1  2  3
# 0  4  7  3  3
# 1  6  8  8  3
# 2  8  7  7  6
df4 = data_frame.reset_index(drop=True)
print(df4)
# In[1]:
DataFrameデータ操作
# coding: utf-8
# In[140]:
import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
# DataFrame data operations
# 1. Selecting data
arr = np.arange(10, 34).reshape((4, 6))
df1 = DataFrame(arr, columns=list("ABCDEF"), index=["O", "P", "Q", "R"])
# df1:
#     A   B   C   D   E   F
# O  10  11  12  13  14  15
# P  16  17  18  19  20  21
# Q  22  23  24  25  26  27
# R  28  29  30  31  32  33
# df1["C"]:
# O    12
# P    18
# Q    24
# R    30
# Name: C (integer dtype; the exact width depends on the platform)
# print(df1, end="\n")
# print(df1["C"], end="\n")
# Row selection: .loc slices by label and includes both end points; .iloc is
# the positional counterpart.
df1.loc["Q":"R"]
# A list of column labels selects several columns at once.
df1[["A", "C", "E"]]
# Overwriting, deleting and adding columns
df1["C"] = np.arange(1, 5)
df1
del df1["F"]
#     A   B  C   D   E
# O  10  11  1  13  14
# P  16  17  2  19  20
# Q  22  23  3  25  26
# R  28  29  4  31  32
df1
# Assigning to a label that does not exist yet appends a new column.
#     A   B  C   D   E  F
# O  10  11  1  13  14  5
# P  16  17  2  19  20  6
# Q  22  23  3  25  26  7
# R  28  29  4  31  32  8
df1["F"] = np.arange(5, 9)
df1
# .loc with a row slice and a column slice selects a rectangular region.
df2 = df1.loc["O":"P", "A":"C"]
df2
# Overwriting rows
# One row takes a 1-D array with one value per column.
df1.loc["P"] = np.arange(6, 12)
df1
# Several rows take an r*c array whose shape matches the selected region.
#     A   B  C   D   E   F
# O  10  11  1  13  14   5
# P   1   2  3   4   5   6
# Q   7   8  9  10  11  12
# R  28  29  4  31  32   8
df1.loc["P":"Q"] = np.arange(1, 13).reshape(2, 6)
df1
# drop returns a new frame without the given row label.
#     A   B  C   D   E   F
# P   1   2  3   4   5   6
# Q   7   8  9  10  11  12
# R  28  29  4  31  32   8
df1 = df1.drop("O")
df1
# In[192]:
# Adding a Series to a DataFrame
# As a row: either df.loc[label] = series, or concatenate the Series as a
# one-row frame. The Series name becomes the new row label and the Series
# index is matched against the column names.
#           A   B   C   D   E   F
# P         1   2   3   4   5   6
# Q         7   8   9  10  11  12
# R        28  29   4  31  32   8
# index_1  50  51  52  53  54  55
series1 = Series(np.arange(50, 56), index=["A", "B", "C", "D", "E", "F"])
series1.name = "index_1"
series1
# df1.loc["index_1"] = series1
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# to_frame().T turns the Series into a single row labelled with its name.
df2 = pd.concat([df1, series1.to_frame().T])
df2
df3 = df2.drop(series1.name)
df3
# As a column: the Series index is matched against the row labels of the
# frame, and the Series name is used as the column label.
series2 = Series(np.arange(13, 16), index=["P", "Q", "R"])
series2.name = "index_2"
df1[series2.name] = series2
df1
del df1[series2.name]
df1
# In[209]:
# Adding a DataFrame to a DataFrame
# Rows: concatenation stacks the second frame below the first. Labels are
# NOT de-duplicated, so "R" occurs twice in the result.
df1
new_df = DataFrame(np.arange(13, 31).reshape(3, 6), index=["R", "S", "T"], columns=list("ABCDEF"))
new_df
df2 = pd.concat([df1, new_df])
#     A   B   C   D   E   F
# P   1   2   3   4   5   6
# Q   7   8   9  10  11  12
# R  28  29   4  31  32   8
# R  13  14  15  16  17  18
# S  19  20  21  22  23  24
# T  25  26  27  28  29  30
df2
# Columns: assign a column of the other frame; rows are matched by label.
# Dropping "R" removes both rows carrying that label.
df3 = df2.drop(["R", "S", "T"])
df3
new_df = DataFrame(np.arange(7, 9), index=["P", "Q"], columns=["G"])
df3["G"] = new_df["G"]
new_df = DataFrame(np.arange(13, 15), index=["P", "Q"], columns=["H"])
df3["H"] = new_df["H"]
#    A  B  C   D   E   F  G   H
# P  1  2  3   4   5   6  7  13
# Q  7  8  9  10  11  12  8  14
df3
データファイル、テキストファイルの読み取りと格納
# coding: utf-8
# In[24]:
import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
import json;
# pandas provides one read_xxx function per file format; each of them returns
# a DataFrame.
# NOTE(review): the files here are read and written as gbk -- confirm that
# this matches how data_test.csv was actually saved.
# 1. read_csv
# pd.read_csv(path, sep, header, encoding): file path, field separator,
# header row (None = the file has no header row) and text encoding.
df = pd.read_csv("data_test.csv", sep=",", header=None, encoding="gbk")
df
# Writing a DataFrame to a CSV file
df2 = DataFrame(
    np.array([
        ["Tom", "100", " "],
        ["Gina", "100", " "],
        ["Cindy", "100", " "],
    ]),
    index=[0, 1, 2],
    columns=["name", "score", "sex"],
)
df2
df2.to_csv("data01.csv", sep=",", encoding="gbk")
# In[36]:
# Writing a DataFrame to an Excel workbook and reading it back.
# .xlsx files carry their own encoding: to_excel lost its `encoding` keyword
# in pandas 2.0 and read_excel does not accept one, so none is passed here.
df2.to_excel("data.xlsx")
df3 = pd.read_excel("data.xlsx")
df3
# Writing a DataFrame to a JSON file and reading it back.
df2.to_json("data.json")
print("OK")
df4 = pd.read_json("data.json")
df4
Pandas数学統計関数
# coding: utf-8
# In[34]:
import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
# pandas descriptive statistics
# Every method below exists on Series as well; on a DataFrame it works column
# by column and skips NaN values by default.
df = DataFrame(np.arange(3, 27).reshape(4, 6))
df
series = Series(np.arange(1, 11))
series
# Columns 1, 3, 4 and 5 receive NaN below. Integer columns cannot hold NaN,
# so they are cast to float explicitly instead of relying on implicit
# upcasting during assignment.
df = df.astype({1: float, 3: float, 4: float, 5: float})
df.loc[0, 1] = np.nan
df.loc[0, 3] = np.nan
df.loc[1, 1] = np.nan
# Label slices include both ends: this covers columns 3, 4 and 5.
df.loc[3, 3:5] = np.nan
df
separator = "============================="
summary = (
    ("count ==>\n", df.count()),              # non-NaN values per column
    ("describe ==>\n", df.describe()),        # summary table
    ("max ==>\n", df.max()),
    ("min ==>\n", df.min()),
    ("idxmax ==>\n", df.idxmax()),            # row label of the maximum
    ("idxmin ==>\n", df.idxmin()),            # row label of the minimum
    ("quantile ( ) ==>\n", df.quantile()),    # default q=0.5
    ("sum ==>\n", df.sum()),
    ("mean ==>\n", df.mean()),
    ("median ==>\n", df.median()),
)
# One statistic per section, with a separator line between sections.
for position, (label, value) in enumerate(summary):
    if position:
        print(separator)
    print(label + str(value))
df
# In[45]:
# Dispersion and cumulative statistics, computed after replacing NaN with 0.
df = df.replace(np.nan, 0)
print("===================================================================")
spread = (
    # DataFrame.mad() was removed in pandas 2.0; this is the same quantity,
    # the mean absolute deviation around the column mean.
    ("mad ==>\n", (df - df.mean()).abs().mean()),
    ("var ==>\n", df.var()),                  # sample variance
    ("std ==>\n", df.std()),                  # sample standard deviation
    ("cumsum ==>\n", df.cumsum()),            # running sum
    ("cummin ==>\n", df.cummin()),            # running minimum
    ("cummax ==>\n", df.cummax()),            # running maximum
    ("cumprod ==>\n", df.cumprod()),          # running product
    ("pct_change ==>\n", df.pct_change()),    # relative change from the previous row
)
for position, (label, value) in enumerate(spread):
    if position:
        print(separator)
    print(label + str(value))
df
Pandasアルゴリズム演算関数とデータ整列
# coding: utf-8
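# Algorithm functions: covariance and correlation coefficient
# Correlation coefficient: measures the strength and direction of the linear relationship between two Series / DataFrame columns. It is the covariance (COV) divided by both standard deviations, so it does not depend on the units of the data.
# Covariance (Covariance, COV): measures how two Series / DataFrame columns vary together.
# ![image.png](attachment:image.png)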
# In[1]:
import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
# Covariance with pandas
# Covariance describes how two variables X and Y vary together:
#   COV(X, Y) = SUM([Xi - MEAN(X)] * [Yi - MEAN(Y)]) / (n - 1)
# Each deviation is taken from the variable's OWN mean. pandas divides by
# n - 1 (sample covariance).
# COV > 0: X and Y tend to move in the same direction.
# COV < 0: X and Y tend to move in opposite directions.
# In[80]:
df = DataFrame(np.arange(21, 41).reshape(4, 5))
series = Series(np.arange(33, 38))
series.name = 4
# DataFrame.append was removed in pandas 2.0; concatenate the Series as a
# one-row frame instead. Its name (4) becomes the new row label.
df2 = pd.concat([df, series.to_frame().T])
df2
# A.cov(B) returns the covariance of the two Series A and B.
# Columns 3 and 4: both are taken from df2 so the appended row is included
# (mixing in df[4] would silently drop it during index alignment).
p = df2[3].cov(df2[4])
p
df2.cov()
# Rows 0 and 1
p = df2.loc[0].cov(df2.loc[1])
p
df2
np.cov(df, df2)
# DataFrame.cov() returns the covariance matrix of all column pairs.
df2.cov()
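# Correlation coefficient: corr()
# The correlation coefficient of X and Y measures how strongly, and in which direction, X and Y are linearly related. Compared with the covariance it has two advantages:
# 1. It is dimensionless, so it does not depend on the units of X and Y.
# 2. It is normalised, so values obtained for different pairs of variables can be compared directly.
# Range:
# The covariance can take any value between -\infty and +\infty, whereas the correlation coefficient always lies between -1 and +1.
# A value of 1 means a perfect positive linear relationship,
# a value of 0 means no linear relationship,
# a value of -1 means a perfect negative linear relationship.
# ![image.png](attachment:image.png)
# Formula:
# First compute the covariance:
# P=SUM({[X1-MEAN(X)]*[Y1-MEAN(Y)]} + {[X2-MEAN(X)]*[Y2-MEAN(Y)]} +...+{[Xn-MEAN(X)]*[Yn-MEAN(Y)]})/(n-1)
# Then divide by the standard deviation of each variable in turn:
# R1=P/STD(X)
# R=R1/STD(Y)=P/(STD(X)*STD(Y))
# -1<=R<=1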
インデックス、選択、およびデータフィルタ
# coding: utf-8
# In[14]:
import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
# Selecting rows and columns of a DataFrame by label
df = DataFrame(np.arange(100, 150).reshape(5, 10))
df
# A list of labels selects columns 1-3:
#      1    2    3
# 0  101  102  103
# 1  111  112  113
# 2  121  122  123
# 3  131  132  133
# 4  141  142  143
df2 = df[[1, 2, 3]]
df2
# Rows from label 2 onwards, columns 4-6 (.loc slices include both ends):
#      4    5    6
# 2  124  125  126
# 3  134  135  136
# 4  144  145  146
df3 = df.loc[2:, 4:6]
df3
# RangeIndex(start=2, stop=10, step=1)
df.columns[2:]
# In[34]:
# Handling missing data (works the same way for Series and DataFrame):
# missing values can either be dropped or filled.
# Columns 1, 3 and 9 receive NaN below. Integer columns cannot hold NaN, so
# they are cast to float explicitly instead of relying on implicit upcasting
# during assignment.
#      0      1    2      3    4    5    6    7    8      9
# 0  100  101.0  102  103.0  104  105  106  107  108  109.0
# 1  110  111.0  112  113.0  114  115  116  117  118  119.0
# 2  120    NaN  122  123.0  124  125  126  127  128  129.0
# 3  130  131.0  132    NaN  134  135  136  137  138  139.0
# 4  140  141.0  142  143.0  144  145  146  147  148    NaN
df = df.astype({1: float, 3: float, 9: float})
df.loc[2, 1] = np.nan
df.loc[3, 3] = np.nan
df.loc[4, 9] = np.nan
df
# 1. dropna: drop rows / columns that contain missing values
# df.dropna()        drops every row containing a NaN    -> rows 0 and 1 remain
# df.dropna(axis=1)  drops every column containing a NaN -> columns 0, 2, 4, 5, 6, 7, 8 remain
# how="all" only drops a row / column when ALL of its values are NaN. No
# column here is entirely NaN, so df2 has the same contents as df.
df2 = df.dropna(how="all", axis=1)
df2
# In[93]:
# 2. fillna: fill missing values
df2
# Fill every NaN with 0.
df3 = df2.fillna(0)
df3
# Directional filling (fillna(method=...) is deprecated in pandas 2.x; the
# dedicated methods do the same job):
# ffill copies the previous value in the same column downwards:
#   (2, 1) -> 111.0, (3, 3) -> 123.0, (4, 9) -> 139.0
df3 = df2.ffill()
df3
# bfill copies the next value in the same column upwards:
#   (2, 1) -> 131.0, (3, 3) -> 143.0; (4, 9) has no following row and stays NaN
df3 = df2.bfill()
df3
df2
# A dict {column: value} fills each listed column with its own value;
# columns that are not listed keep their NaN.
#   (3, 3) -> "33333", (4, 9) -> "99999"; (2, 1) stays NaN
df3 = df2.fillna({3: "33333", 9: "99999"})
df3
# df.replace substitutes one value for another; here every NaN becomes "0".
df3 = df.replace(np.nan, "0")
df3
# pd.isnull(df) returns a boolean DataFrame of the same shape: True only at
# the three NaN positions (2, 1), (3, 3) and (4, 9).
df3 = pd.isnull(df)
df3
# pd.notnull(df) is the exact opposite of isnull: False only at those three
# positions.
df3 = pd.notnull(df2)
df3
ユニークな値、値のカウント、メンバーシップ、インデックスおよびソート
# coding: utf-8
# In[109]:
import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
# 1. Unique values: unique() works on a Series.
# A DataFrame has no unique(); pick a single row or column (a Series) first.
df = DataFrame(np.random.randint(1, 9, (5, 5)), index=list("ABCDE"), columns=list("abcde"))
series = Series(np.arange(11, 16), index=list("ABCDE"))
series.name = "index_one"
# pd.unique(df)   # not supported: the argument must be one-dimensional
print(df)
print(pd.unique(df.loc["D"]))
print(df.loc["D"].unique())
# 2. Value counts: value_counts() reports how often each value occurs,
#    most frequent value first.
# The method form is used throughout; the top-level pd.value_counts function
# is deprecated in recent pandas releases.
df.loc["D"].value_counts()
df["a"].value_counts()
df.loc["A"].value_counts()
# 3. Membership: isin() returns a boolean object of the same shape that is
#    True where the element is one of the given values and False elsewhere.
df.isin([1, 2, 3])
series.isin([14, 15])
# In[110]:
# Hierarchical (multi-level) indexing
# 1. A Series with a hierarchical index: passing a list of equal-length lists
#    as `index` creates one index level per inner list.
series = Series(np.arange(11, 16), index=[
    ["2011", "2012", "2013", "2014", "2015"],  # level 0
    list("ABCDE"),                             # level 1
    list("abcde"),                             # level 2
    list("12345"),                             # level 3
])
series
# Selecting on the outermost level returns the matching sub-Series.
series["2011"]
# Chained lookups descend one level at a time ...
series["2011"]["A"]
series["2011"]["A"]["a"]["1"]
# ... and a tuple of labels addresses one element directly.
series["2011", "A", "a", "1"]
# swaplevel() exchanges two index levels.
# series.swaplevel()
df = DataFrame({
    "name": ["zhangsan", "lisi", "wangwu"],
    "sex": [" ", " ", " "],
    "age": [18, 20, 25],
})
df
# Series -> unstack(): pivots the innermost index level into columns,
# giving a DataFrame.
df2 = series.unstack()
df2
# DataFrame -> stack(): pivots the columns into an inner index level,
# giving a Series.
series = df.stack()
series
# In[132]:
# A DataFrame with a hierarchical index
# set_index([...]) with several columns builds a multi-level row index.
# drop=True discards the default 0..n-1 index; without it that index would
# be inserted as an extra numeric "index" column and end up in every
# aggregate below.
df = df.reset_index(drop=True)
df = df.set_index(["name", "sex"])
df
# Aggregating per index level.
# sum/mean/min(level=...) were removed in pandas 2.0; grouping by the level
# is the replacement.
df.groupby(level="sex").sum()
df["age"].groupby(level="sex").mean()
df["age"].groupby(level="sex").min()
df
# reset_index() turns the index levels back into ordinary columns.
df3 = df.reset_index()
df3
df3 = df3.set_index(["name", "sex"])
df3
df3.groupby(level="sex").mean()
df3
# In[170]:
# Sorting by index
# A Series with a three-level index, one inner list per level.
index_levels = [
    ["A", "B", "C"],        # level 0
    ["A2", "B2", "C2"],     # level 1
    ["A3", "B3", "C3"],     # level 2
]
series = Series(np.arange(15, 18), index=index_levels)
series
# unstack() with no argument pivots the innermost level into columns.
df = series.unstack()
df
# sort_index() orders a Series or DataFrame by its labels.
# For a DataFrame, axis=0 sorts by row labels and axis=1 by column labels.
# ascending=True (the default) gives ascending order.
series.sort_index(ascending=True)
df.sort_index(ascending=False, axis=1)
# In[226]:
# Sorting by value
# sort_values() orders a Series or DataFrame by the data it holds; a Series
# needs no `by` argument.
row_labels = list("ABC")
df5 = DataFrame(np.arange(300, 321).reshape(3, 7), index=row_labels)
df5
# `by` names the label(s) whose values drive the ordering.
# With axis=0, `by` is a column label and the rows are reordered;
# with axis=1, `by` is a row label and the columns are reordered.
df6 = df5.sort_values(axis=1, by="A", ascending=True)
df6
# Several keys: ties on "B" are broken by "C".
sort_keys = ["B", "C"]
df6 = df6.sort_values(axis=1, by=sort_keys, ascending=False)
df6
# In[286]:
# Ranking with rank()
# rank() replaces each value by its position in ascending order, starting at
# 1. Equal values (ties) are resolved according to `method`.
# Ranking a Series
series2 = Series(np.arange(15, 17), index=[
    ["A", "B"],        # level 0
    ["A2", "B2"],      # level 1
    ["A3", "B3"],      # level 2
])
series2
series3 = Series(np.arange(17, 21), index=[
    ["C", "D", "E", "F"],          # level 0
    ["C2", "D2", "E2", "F2"],      # level 1
    ["C3", "D3", "E3", "F3"],      # level 2
])
series3
# Series.append was removed in pandas 2.0; pd.concat is the replacement.
series4 = pd.concat([series2, series3])
series4
# Create a tie: set the "C" and "E" entries to 14, the smallest value.
# A single .loc assignment with the full label tuple writes into series4
# itself; chained indexing (series4["C"]["C2"]["C3"] = 14) assigns to a
# temporary object and may leave series4 unchanged.
series4.loc[("C", "C2", "C3")] = 14
series4.loc[("E", "E2", "E3")] = 14
series4
# The two tied values occupy positions 1 and 2:
series4.rank()              # default "average": both get 1.5
series4.rank(method="min")  # both get the lowest position, 1.0
series4.rank(method="max")  # both get the highest position, 2.0
# Ranking a DataFrame
df8 = DataFrame(np.arange(21, 36).reshape(3, 5))
df8
# Row 1, columns 2-4 (label slice, both ends included) are all set to 25.
df8.loc[1, 2:4] = 25
df8
# A DataFrame is ranked along an axis: axis=0 (default) ranks the values
# within each column, axis=1 ranks the values within each row.
# Ties follow the same rule as for a Series: for two values tied on positions
# 1 and 2, "average" gives 1.5, "min" gives 1.0 and "max" gives 2.0.
df8.rank()
df8.loc[0, 3] = 25
df8.rank()
df8.rank(method="average")
df8.rank(method="min")
df8.rank(method="max")
df8.rank(axis=1)
df8.rank(axis=1, method="min")
df8.rank(axis=1, method="max")
Pandas時系列
# coding: utf-8
# In[23]:
import numpy as np;
from pandas import Series,DataFrame;
import pandas as pd;
# pandas time series
# pandas.date_range(start=None, end=None, periods=None, freq='D') builds a
# fixed-frequency DatetimeIndex.
# start:   string or datetime-like, default None -- first timestamp; a
#          string such as "yyyy-MM-dd HH:mm:ss" is parsed.
# end:     string or datetime-like, default None -- last timestamp, same
#          accepted formats as start.
#          With both start and end given, every timestamp from start to end
#          (inclusive) is generated.
# periods: integer, default None -- number of timestamps to generate; used
#          together with only one of start / end.
# freq:    string or DateOffset, default 'D' (calendar day); a multiple such
#          as '5H' means every 5 hours.
# Every calendar day between two dates:
# DatetimeIndex(['2019-04-01', '2019-04-02', '2019-04-03', '2019-04-04',
#                '2019-04-05', '2019-04-06', '2019-04-07', '2019-04-08',
#                '2019-04-09', '2019-04-10'],
#               dtype='datetime64[ns]', freq='D')
pd.date_range(start="20190401", end="20190410")
# freq sets the step, periods the number of steps to generate, counted
# forwards from start ...
pd.date_range(freq="D", periods=5, start="20190401")
# ... or backwards from end (here: five hourly timestamps ending at 21:00).
pd.date_range(freq="h", periods=5, end="20190410 21:00")