11-数値計算と統計の基礎
7900 ワード
常用数学、統計方法
import numpy as np
import pandas as pd
# Demo: DataFrame.mean() and its `axis` / `skipna` parameters.
# np.nan marks a missing value; 'key3' mixes ints and strings, so pandas
# stores it as an object column.
df = pd.DataFrame(
    {
        'key1': [4, 5, 3, np.nan, 2],
        'key2': [1, 2, np.nan, 4, 5],
        'key3': [1, 2, 3, 'j', 'k'],
    },
    index=['a', 'b', 'c', 'd', 'e'],
)
print("1".center(40,'*'))
print(df)
print("2".center(40,'*'))
print(df['key1'].dtype, df['key2'].dtype, df['key3'].dtype)

# numeric_only=True restricts the reduction to the numeric columns.
# Without it, pandas >= 2.0 tries to average the object column 'key3'
# (which contains 'j' and 'k') and raises TypeError; older versions
# silently dropped that column, which is the behaviour this demo relies on.
m1 = df.mean(numeric_only=True)
print("3".center(40,'*'))
print(m1, type(m1))
# Mean of a single column; NaN entries are skipped by default.
print(' :',df['key2'].mean())

# axis: 0 (default) reduces down each column, axis=1 reduces across each row.
m2 = df.mean(axis=1, numeric_only=True)
print("4".center(40,'*'))
print(m2)

# skipna: True (default) ignores NaN; with False, any column that contains
# a NaN yields NaN as its result.
m3 = df.mean(skipna=False, numeric_only=True)
print("5".center(40,'*'))
print(m3)
*******************1********************
key1 key2 key3
a 4.0 1.0 1
b 5.0 2.0 2
c 3.0 NaN 3
d NaN 4.0 j
e 2.0 5.0 k
*******************2********************
float64 float64 object
*******************3********************
key1 3.5
key2 3.0
dtype: float64
: 3.0
*******************4********************
a 2.5
b 3.5
c 3.0
d 4.0
e 3.5
dtype: float64
*******************5********************
key1 NaN
key2 NaN
dtype: float64
# Common descriptive statistics, usable on both Series and DataFrame (part 1).
# The '\n' escapes in the labels below had been expanded into physical line
# breaks, which left the string literals unterminated; they are restored here.
df = pd.DataFrame({'key1': np.arange(10),
                   'key2': np.random.rand(10) * 10})
print(df)
print('-----')

# count: number of non-NA values per column.
print(df.count(), '→ count Na\n')
# min / max: smallest and largest value.
print(df.min(), '→ min\n', df['key2'].max(), '→ max\n')
# quantile: value at the given quantile q (here the 75th percentile).
print(df.quantile(q=0.75), '→ quantile , q\n')
# sum: column totals.
print(df.sum(), '→ sum\n')
# mean: arithmetic mean.
print(df.mean(), '→ mean\n')
# median: the 50% quantile.
print(df.median(), '→ median ,50%\n')
# std / var: standard deviation and variance.
print(df.std(), '\n', df.var(), '→ std,var ,\n')
# skew: skewness of each column.
print(df.skew(), '→ skew\n')
# kurt: kurtosis of each column.
print(df.kurt(), '→ kurt\n')
key1 key2
0 0 3.738954
1 1 3.832567
2 2 6.699210
3 3 4.084607
4 4 7.456708
5 5 8.323144
6 6 9.040738
7 7 5.164880
8 8 0.094538
9 9 7.399022
-----
key1 10
key2 10
dtype: int64 → count Na
key1 0.000000
key2 0.094538
dtype: float64 → min
9.040737765606417 → max
key1 6.750000
key2 7.442286
Name: 0.75, dtype: float64 → quantile , q
key1 45.000000
key2 55.834368
dtype: float64 → sum
key1 4.500000
key2 5.583437
dtype: float64 → mean
key1 4.500000
key2 5.932045
dtype: float64 → median ,50%
key1 3.027650
key2 2.718797
dtype: float64
key1 9.166667
key2 7.391858
dtype: float64 → std,var ,
key1 0.000000
key2 -0.722995
dtype: float64 → skew
key1 -1.200000
key2 0.285023
dtype: float64 → kurt
# Cumulative statistics, usable on both Series and DataFrame (part 2).
# Operates on the `df` built in the previous cell (columns key1 / key2).
# The '\n' escapes in the labels below had been expanded into physical line
# breaks, which left the string literals unterminated; they are restored here.

# cumsum: running total, stored in new *_s columns.
df['key1_s'] = df['key1'].cumsum()
df['key2_s'] = df['key2'].cumsum()
print(df, '→ cumsum\n')

# cumprod: running product, stored in new *_p columns.
df['key1_p'] = df['key1'].cumprod()
df['key2_p'] = df['key2'].cumprod()
print(df, '→ cumprod\n')

# cummax / cummin: running maximum / minimum. They return new frames in which
# every column (including key1 and key2) is replaced by its running extreme;
# `df` itself is not modified.
print(df.cummax(), '\n', df.cummin(), '→ cummax,cummin ,\n')
key1 key2 key1_s key2_s
0 0 3.738954 0 3.738954
1 1 3.832567 1 7.571522
2 2 6.699210 3 14.270731
3 3 4.084607 6 18.355338
4 4 7.456708 10 25.812046
5 5 8.323144 15 34.135191
6 6 9.040738 21 43.175928
7 7 5.164880 28 48.340808
8 8 0.094538 36 48.435346
9 9 7.399022 45 55.834368 → cumsum
key1 key2 key1_s key2_s key1_p key2_p
0 0 3.738954 0 3.738954 0 3.738954e+00
1 1 3.832567 1 7.571522 0 1.432979e+01
2 2 6.699210 3 14.270731 0 9.599830e+01
3 3 4.084607 6 18.355338 0 3.921153e+02
4 4 7.456708 10 25.812046 0 2.923889e+03
5 5 8.323144 15 34.135191 0 2.433595e+04
6 6 9.040738 21 43.175928 0 2.200150e+05
7 7 5.164880 28 48.340808 0 1.136351e+06
8 8 0.094538 36 48.435346 0 1.074280e+05
9 9 7.399022 45 55.834368 0 7.948625e+05 → cumprod
key1 key2 key1_s key2_s key1_p key2_p
0 0.0 3.738954 0.0 3.738954 0.0 3.738954e+00
1 1.0 3.832567 1.0 7.571522 0.0 1.432979e+01
2 2.0 6.699210 3.0 14.270731 0.0 9.599830e+01
3 3.0 6.699210 6.0 18.355338 0.0 3.921153e+02
4 4.0 7.456708 10.0 25.812046 0.0 2.923889e+03
5 5.0 8.323144 15.0 34.135191 0.0 2.433595e+04
6 6.0 9.040738 21.0 43.175928 0.0 2.200150e+05
7 7.0 9.040738 28.0 48.340808 0.0 1.136351e+06
8 8.0 9.040738 36.0 48.435346 0.0 1.136351e+06
9 9.0 9.040738 45.0 55.834368 0.0 1.136351e+06
key1 key2 key1_s key2_s key1_p key2_p
0 0.0 3.738954 0.0 3.738954 0.0 3.738954
1 0.0 3.738954 0.0 3.738954 0.0 3.738954
2 0.0 3.738954 0.0 3.738954 0.0 3.738954
3 0.0 3.738954 0.0 3.738954 0.0 3.738954
4 0.0 3.738954 0.0 3.738954 0.0 3.738954
5 0.0 3.738954 0.0 3.738954 0.0 3.738954
6 0.0 3.738954 0.0 3.738954 0.0 3.738954
7 0.0 3.738954 0.0 3.738954 0.0 3.738954
8 0.0 0.094538 0.0 3.738954 0.0 3.738954
9 0.0 0.094538 0.0 3.738954 0.0 3.738954 → cummax,cummin ,
# Distinct values: .unique()
s = pd.Series([*'asdvasdcfgg'])
sq = s.unique()

print(f"{'1':*^40}")
print(s)

# .unique() hands back an array of the distinct values in order of first
# appearance, not a Series.
print(f"{'2':*^40}")
print(sq, type(sq))

# Wrap the array in pd.Series to get a Series again.
print(f"{'3':*^40}")
print(pd.Series(sq))

# Sort the distinct values in place.
sq.sort()
print(f"{'4':*^40}")
print(sq)
*******************1********************
0 a
1 s
2 d
3 v
4 a
5 s
6 d
7 c
8 f
9 g
10 g
dtype: object
*******************2********************
['a' 's' 'd' 'v' 'c' 'f' 'g']
*******************3********************
0 a
1 s
2 d
3 v
4 c
5 f
6 g
dtype: object
*******************4********************
['a' 'c' 'd' 'f' 'g' 's' 'v']
# Frequency count: .value_counts() returns a new Series whose index holds the
# distinct values of `s` and whose values are their occurrence counts.
# sort: whether to order the result by count (defaults to True); False is
# used here to skip that ordering.
# NOTE(review): the top-level form pd.value_counts(s, sort=False) also works
# on older pandas but is deprecated in recent releases; prefer the method.
sc = s.value_counts(sort=False)
print(sc)
c 1
d 2
v 1
g 2
s 2
f 1
a 2
dtype: int64
# Membership test: .isin()
s = pd.Series(np.arange(10, 15))
df = pd.DataFrame(
    {
        'key1': list('asdcbvasd'),
        'key2': np.arange(4, 13),
    }
)

print(f"{'1':*^40}")
print(s)
print(df)

# Pass the candidate values as a list ([]); the result is a boolean object of
# the same shape, True wherever the element equals one of the candidates.
# Available on both Series and DataFrame.
print(f"{'2':*^40}")
print(s.isin([5, 14]))
print(df.isin(['a', 'bc', '10', 8]))
*******************1********************
0 10
1 11
2 12
3 13
4 14
dtype: int32
key1 key2
0 a 4
1 s 5
2 d 6
3 c 7
4 b 8
5 v 9
6 a 10
7 s 11
8 d 12
*******************2********************
0 False
1 False
2 False
3 False
4 True
dtype: bool
key1 key2
0 True False
1 False False
2 False False
3 False False
4 False True
5 False False
6 True False
7 False False
8 False False