11-数値計算と統計の基礎

7900 ワード

常用数学、統計方法
import numpy as np
import pandas as pd

# Demo frame: key1/key2 are numeric with one NaN each; key3 mixes ints and
# strings, so pandas stores it with dtype object (see the dtypes printed below).
df = pd.DataFrame({'key1':[4,5,3,np.nan,2],
                 'key2':[1,2,np.nan,4,5],
                 'key3':[1,2,3,'j','k']},
                 index = ['a','b','c','d','e'])
print("1".center(40,'*'))
print(df)
print("2".center(40,'*'))
print(df['key1'].dtype,df['key2'].dtype,df['key3'].dtype)


# Column-wise mean. numeric_only=True is spelled out because key3 holds
# strings: older pandas silently skipped such columns, while pandas 2.x
# raises TypeError when asked to average them.
m1 = df.mean(numeric_only=True)
print("3".center(40,'*'))
print(m1,type(m1))
# A single column can be averaged on its own by selecting it first.
print('      :',df['key2'].mean())
# np.nan marks a missing value; .mean() skips it by default (skipna=True).

# axis=1 averages across the columns of each row instead of down each column
# (the default is axis=0).
m2 = df.mean(axis=1, numeric_only=True)
print("4".center(40,'*'))
print(m2)

# skipna=False stops ignoring NaN: any column containing a NaN yields NaN.
m3 = df.mean(skipna=False, numeric_only=True)
print("5".center(40,'*'))
print(m3)
# Output:
*******************1********************
   key1  key2 key3
a   4.0   1.0    1
b   5.0   2.0    2
c   3.0   NaN    3
d   NaN   4.0    j
e   2.0   5.0    k
*******************2********************
float64 float64 object
*******************3********************
key1    3.5
key2    3.0
dtype: float64 <class 'pandas.core.series.Series'>
      : 3.0
*******************4********************
a    2.5
b    3.5
c    3.0
d    4.0
e    3.5
dtype: float64
*******************5********************
key1   NaN
key2   NaN
dtype: float64
# Main descriptive statistics, usable on both Series and DataFrame (1)

# key1 is 0..9; key2 is ten random floats scaled into [0, 10).
df = pd.DataFrame({'key1':np.arange(10),
                  'key2':np.random.rand(10)*10})
print(df)
print('-----')

# Each print shows one statistic followed by a short label; the trailing
# '\n' in the label leaves a blank line between the results.
# count: number of non-NA entries per column
print(df.count(),'→ count   Na    \n')
# min per column, and max of a single column
print(df.min(),'→ min\n',df['key2'].max(),'→ max\n')
# quantile: q selects the quantile (0.75 = 75th percentile)
print(df.quantile(q=0.75),'→ quantile , q\n')
# sum per column
print(df.sum(),'→ sum\n')
# arithmetic mean per column
print(df.mean(),'→ mean\n')
# median, i.e. the 50% quantile
print(df.median(),'→ median ,50%\n')
# standard deviation and variance per column
print(df.std(),'\n',df.var(),'→ std,var ,\n')
# sample skewness per column
print(df.skew(),'→ skew\n')
# sample kurtosis per column
print(df.kurt(),'→ kurt\n')
# Output (key2 is random, so the numbers differ on every run):
#    key1      key2
# 0     0  3.738954
# 1     1  3.832567
# 2     2  6.699210
# 3     3  4.084607
# 4     4  7.456708
# 5     5  8.323144
# 6     6  9.040738
# 7     7  5.164880
# 8     8  0.094538
# 9     9  7.399022
# -----
# key1    10
# key2    10
# dtype: int64 → count Na
# key1    0.000000
# key2    0.094538
# dtype: float64 → min
#  9.040737765606417 → max
# key1    6.750000
# key2    7.442286
# Name: 0.75, dtype: float64 → quantile , q
# key1    45.000000
# key2    55.834368
# dtype: float64 → sum
# key1    4.500000
# key2    5.583437
# dtype: float64 → mean
# key1    4.500000
# key2    5.932045
# dtype: float64 → median ,50%
# key1    3.027650
# key2    2.718797
# dtype: float64
#  key1    9.166667
# key2    7.391858
# dtype: float64 → std,var ,
# key1    0.000000
# key2   -0.722995
# dtype: float64 → skew
# key1   -1.200000
# key2    0.285023
# dtype: float64 → kurt
# Main cumulative statistics, usable on both Series and DataFrame (2)

# Running sums of key1/key2 are stored as new columns of the same frame.
df['key1_s'] = df['key1'].cumsum()
df['key2_s'] = df['key2'].cumsum()
print(df,'→ cumsum      \n')
# Running products, likewise stored as new columns.
df['key1_p'] = df['key1'].cumprod()
df['key2_p'] = df['key2'].cumprod()
print(df,'→ cumprod\n')
# cummax/cummin are applied to the whole frame, so every column (key1 and
# key2 as well as the derived ones) is replaced by its running max / min.
print(df.cummax(),'\n',df.cummin(),'→ cummax,cummin ,\n')
# Output (key2 is random, so the numbers differ on every run):
#    key1      key2  key1_s     key2_s
# 0     0  3.738954       0   3.738954
# 1     1  3.832567       1   7.571522
# 2     2  6.699210       3  14.270731
# 3     3  4.084607       6  18.355338
# 4     4  7.456708      10  25.812046
# 5     5  8.323144      15  34.135191
# 6     6  9.040738      21  43.175928
# 7     7  5.164880      28  48.340808
# 8     8  0.094538      36  48.435346
# 9     9  7.399022      45  55.834368 → cumsum
#    key1      key2  key1_s     key2_s  key1_p        key2_p
# 0     0  3.738954       0   3.738954       0  3.738954e+00
# 1     1  3.832567       1   7.571522       0  1.432979e+01
# 2     2  6.699210       3  14.270731       0  9.599830e+01
# 3     3  4.084607       6  18.355338       0  3.921153e+02
# 4     4  7.456708      10  25.812046       0  2.923889e+03
# 5     5  8.323144      15  34.135191       0  2.433595e+04
# 6     6  9.040738      21  43.175928       0  2.200150e+05
# 7     7  5.164880      28  48.340808       0  1.136351e+06
# 8     8  0.094538      36  48.435346       0  1.074280e+05
# 9     9  7.399022      45  55.834368       0  7.948625e+05 → cumprod
#    key1      key2  key1_s     key2_s  key1_p        key2_p
# 0   0.0  3.738954     0.0   3.738954     0.0  3.738954e+00
# 1   1.0  3.832567     1.0   7.571522     0.0  1.432979e+01
# 2   2.0  6.699210     3.0  14.270731     0.0  9.599830e+01
# 3   3.0  6.699210     6.0  18.355338     0.0  3.921153e+02
# 4   4.0  7.456708    10.0  25.812046     0.0  2.923889e+03
# 5   5.0  8.323144    15.0  34.135191     0.0  2.433595e+04
# 6   6.0  9.040738    21.0  43.175928     0.0  2.200150e+05
# 7   7.0  9.040738    28.0  48.340808     0.0  1.136351e+06
# 8   8.0  9.040738    36.0  48.435346     0.0  1.136351e+06
# 9   9.0  9.040738    45.0  55.834368     0.0  1.136351e+06
#     key1      key2  key1_s    key2_s  key1_p    key2_p
# 0   0.0  3.738954     0.0  3.738954     0.0  3.738954
# 1   0.0  3.738954     0.0  3.738954     0.0  3.738954
# 2   0.0  3.738954     0.0  3.738954     0.0  3.738954
# 3   0.0  3.738954     0.0  3.738954     0.0  3.738954
# 4   0.0  3.738954     0.0  3.738954     0.0  3.738954
# 5   0.0  3.738954     0.0  3.738954     0.0  3.738954
# 6   0.0  3.738954     0.0  3.738954     0.0  3.738954
# 7   0.0  3.738954     0.0  3.738954     0.0  3.738954
# 8   0.0  0.094538     0.0  3.738954     0.0  3.738954
# 9   0.0  0.094538     0.0  3.738954     0.0  3.738954 → cummax,cummin ,
# Distinct values: .unique()

s = pd.Series(list('asdvasdcfgg'))
sq = s.unique()

# Banner and value are emitted by one print call, separated by a newline.
print("1".center(40, '*'), s, sep='\n')

# unique() returns the distinct values as an array (its type is printed
# next to the values), in order of first appearance.
print("2".center(40, '*'))
print(sq, type(sq))

# Wrapping the array in pd.Series turns it back into a Series.
print("3".center(40, '*'), pd.Series(sq), sep='\n')

# Sort the distinct values in place, then show the reordered array.
sq.sort()
print("4".center(40, '*'), sq, sep='\n')
# Output:
*******************1********************
0     a
1     s
2     d
3     v
4     a
5     s
6     d
7     c
8     f
9     g
10    g
dtype: object
*******************2********************
['a' 's' 'd' 'v' 'c' 'f' 'g'] <class 'numpy.ndarray'>
*******************3********************
0    a
1    s
2    d
3    v
4    c
5    f
6    g
dtype: object
*******************4********************
['a' 'c' 'd' 'f' 'g' 's' 'v']
# Count how often each distinct value occurs in s.
# NOTE(review): the original note offered the function form as an
# alternative but passed `sc`; presumably pd.value_counts(s, sort=False)
# was meant — confirm, and check whether that function still exists in
# the pandas version in use.
sc = s.value_counts(sort=False)
print(sc)
# The result is a new Series: index = distinct values, values = their counts.
# sort parameter: whether to order the result by count; True by default
# according to the original note.
# Output:
c    1
d    2
v    1
g    2
s    2
f    1
a    2
dtype: int64
# Membership test: .isin()

s = pd.Series(np.arange(10, 15))
df = pd.DataFrame(
    {
        'key1': list('asdcbvasd'),
        'key2': np.arange(4, 13),
    }
)
# One print call per section: banner first, then the objects, one per line.
print("1".center(40, '*'), s, df, sep='\n')

# The candidate values are passed as a list ([]).
# The result is a boolean Series / DataFrame of the same shape, True where
# the element equals one of the candidates.
print(
    "2".center(40, '*'),
    s.isin([5, 14]),
    df.isin(['a', 'bc', '10', 8]),
    sep='\n',
)
# Output:
*******************1********************
0    10
1    11
2    12
3    13
4    14
dtype: int32
  key1  key2
0    a     4
1    s     5
2    d     6
3    c     7
4    b     8
5    v     9
6    a    10
7    s    11
8    d    12
*******************2********************
0    False
1    False
2    False
3    False
4     True
dtype: bool
    key1   key2
0   True  False
1  False  False
2  False  False
3  False  False
4  False   True
5  False  False
6   True  False
7  False  False
8  False  False