Pythonデータ分析——pandasの使い方

63452 ワード

pandasは構造化データを分析するための強力なツールセットであり、NumPyを基盤としています。データマイニングやデータ解析に使用され、データクレンジング機能も提供します。pandasのデータ構造は次元ごとに、一次元:Series、二次元:DataFrame、三次元:Panel、四次元:Panel4D、N次元:PanelNDと分類されます(Panel系のデータ構造は新しいバージョンのpandasでは削除されています)。ただし、pandasの主要なデータ構造はSeries(一次元データ)とDataFrame(二次元データ)であり、この2つのデータ構造で金融、統計、社会科学、工学などの分野における典型的なユースケースの大半を扱うことができます。
ツールその1:Series
Seriesは一次元配列に似たオブジェクトで、一組のデータ(各種NumPyデータ型)と、それに関連付けられたデータラベル(すなわちインデックス)から構成されます。一組のデータだけでも簡単なSeriesオブジェクトを生成できます。Seriesはpandasの一次元データ構造で、PythonのリストやNumPyのndarrayに似ています。違いは、Seriesは異なる型のデータを格納でき、各要素に対応するインデックスを持つ点です。1、Seriesの作成

import pandas as pd
import numpy as np
import string
# Show which pandas release is installed.
print(pd.__version__)

# 1. Build a Series from a plain Python list.
array = ['  ', '  ', '  ']
# No index is given, so pandas assigns a default integer index starting at 0.
s1 = pd.Series(data=array)
print(s1)
# Supply custom row labels through the ``index`` argument.
ss1 = pd.Series(data=array, index=['A', 'B', 'C'])
print(ss1)

# 2. Build a Series from a NumPy ndarray.
n = np.random.randn(5)
print(n)
s2 = pd.Series(data=n)
print(s2)
print(s2.dtype)
# Convert the element type. The ``np.int`` alias no longer exists in current
# NumPy releases; the builtin ``int`` is the drop-in replacement.
ss2 = s2.astype(int)
print(ss2)

# 3. Build a Series from a dictionary: the keys become the index labels and
#    the values become the data.
# The mapping is deliberately not called ``dict`` so the builtin stays usable.
letter_positions = {
    letter: position
    for position, letter in enumerate(string.ascii_lowercase[:10])
}
print(letter_positions)
s3 = pd.Series(letter_positions)
print(s3)

Seriesの基本操作(番号/属性またはメソッド/説明): 1 axes:行軸ラベルのリストを返します。2 dtype:オブジェクトのデータ型(dtype)を返します。3 empty:Seriesが空であればTrueを返します。4 ndim:基礎となるデータの次元数を返します。定義上は1です。5 size:基礎となるデータの要素数を返します。6 values:Seriesをndarrayとして返します。7 head():先頭のn行を返します。8 tail():末尾のn行を返します。

import pandas as pd
import numpy as np
import string

array = ['1', 2, 3]
s1 = pd.Series(data=array)
print(s1)
print(s1.axes)  # [RangeIndex(start=0, stop=3, step=1)]
print(s1.dtype)
print(s1.empty)
print(s1.ndim)
print(s1.values)
print(s1.size)   # number of elements

# 1. Inspect the index, then replace it with custom labels.
print(s1.index)
s1.index = ['A', 'B', 'C']
print(s1)

# 2. Concatenate two Series.
array = ["  ", "  ", "westos"]
# No index is given, so this Series gets the default integer index from 0.
s2 = pd.Series(data=array)
# Series.append() was removed in pandas 2.0; pd.concat() is the replacement
# and keeps each input's own index labels.
s3 = pd.concat([s1, s2])
print(s3)

# 3. Remove an element by its index label.
s3 = s3.drop('C')  # drop the entry labelled 'C'
print(s3)

# 4. Look up and overwrite an element by its index label.
print(s3['B'])
s3['B'] = np.nan  # np.nan is the marker pandas uses for a missing value
print(s3)

# 5. Slicing by position. ``.iloc`` makes it explicit that these are
#    positional slices, since the index mixes string and integer labels.
print(s3.iloc[:2])    # first two elements
print(s3.iloc[::-1])  # all elements in reverse order
print(s3.iloc[-2:])   # last two elements

pandasのwhereメソッド

import pandas as pd
import numpy as np
import string

# Demonstrates Series.where() and Series.mask() with one shared boolean mask.
s1 = pd.Series(data=np.arange(5), index=list('abcde'))
above_three = s1 > 3

# Plain boolean indexing keeps only the entries where the mask is True.
print(s1[above_three])
print(type(s1))

# where(): entries with a True mask keep their value; the rest become NaN.
print(s1.where(above_three))

# where() with a second argument: entries with a False mask are replaced by 10.
print(s1.where(above_three, 10))

# mask() is the inverse: entries with a True mask are replaced
# (by NaN when no replacement is given, by 10 otherwise).
print(s1.mask(above_three))
print(s1.mask(above_three, 10))

ツールその2:DataFrame
DataFrameはpandasの表形式のデータ構造であり、順序付けられた列の集まりを含みます。各列は異なる値の型(数値、文字列、ブール型など)を持つことができます。DataFrameは行インデックスと列インデックスの両方を持ち、Seriesからなる辞書と見なすこともできます。注意:Seriesは行インデックスしか持ちませんが、DataFrameオブジェクトは行インデックスと列インデックスの両方を持ちます。行インデックスは各行を示す横方向のインデックスでindexと呼ばれ、列インデックスは各列を示す縦方向のインデックスでcolumnsと呼ばれます。
DataFrameの作成: Seriesは行インデックスのみを持ちますが、DataFrameオブジェクトは行インデックスと列インデックスの両方を持ちます。行インデックスは各行を示す横方向のインデックスでindexと呼ばれ、列インデックスは各列を示す縦方向のインデックスでcolumnsと呼ばれます。

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Approach 1: build a DataFrame from a nested list.
li = [
    [1, 2, 3, 4],
    [2, 3., 4, 5]
]
# A DataFrame has two axes: rows (axis=0) and columns (axis=1). With no
# index/columns arguments both axes get default integer labels.
# d1 = pd.DataFrame(data=li, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
d1 = pd.DataFrame(data=li)
print(d1)
print(d1.dtypes)

# Approach 2: build a DataFrame from a NumPy array.
narr = np.arange(8).reshape(2, 4)
d2 = pd.DataFrame(data=narr, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d2)

# Approach 3: build a DataFrame from a dictionary of columns.
# The mapping is deliberately not called ``dict`` so the builtin stays usable.
column_data = {
    'views': [1, 2, ],
    'loves': [2, 3, ],
    'comments': [3, 4, ]
}
d3 = pd.DataFrame(data=column_data, index=['  ', '  '])
print(d3)

# Date ranges with pd.date_range(): explicit start and end, one entry
# every two days.
dates = pd.date_range(start='1/1/2020', end='1/08/2020', freq='2D')
print(dates)

# Six entries starting today, two days apart.
dates = pd.date_range(start='today', periods=6, freq='2D')
print(dates)
print(type(dates))
# Use the dates as the row index and A-D as the column labels.
columns = ['A', 'B', 'C', 'D']
d4 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=columns)
print(d4)

# A date range can equally serve as the index of a Series
# (a daily range covering 2021-01-01 through 2021-01-03).
dates = pd.date_range(start='1/1/2021', end='1/3/2021', freq='D')
print(dates)
s1 = pd.Series([1, 2, 3], index=dates)
print(s1)

DataFrameの基礎的な属性と全体的な状況のクエリ
a)基本属性: df.shape:行数と列数、df.dtypes:各列のデータ型、df.ndim:データの次元数、df.index:行インデックス、df.columns:列インデックス、df.values:オブジェクトの値(二次元ndarray配列)。b)全体状況の確認: df.head()/df.tail():先頭/末尾の数行を表示します。df.info():行数、列数、インデックス、各列の非NULL値の個数、列の型、メモリ使用量の概要を表示します。df.describe():数値列の要約統計量を表示します。

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

narr = np.arange(8).reshape(2, 4)
# A DataFrame has two axes: rows (axis=0) and columns (axis=1).
d2 = pd.DataFrame(data=narr, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d2)

# ---------------- 1) Basic attributes ----------------
print(d2.shape)    # (number of rows, number of columns)
print(d2.dtypes)   # data type of each column
print(d2.ndim)     # number of dimensions
print(d2.index)    # row index
print(d2.columns)  # column index
print(d2.values, type(d2.values))  # underlying values as an ndarray

# ---------------- 2) Overall overview ----------------
print(d2.head(1))  # first n rows (n=1 here)
print(d2.tail(1))  # last n rows (n=1 here)

print("*" * 10)
# info() writes its report to stdout itself and returns None, so it is
# called directly instead of being wrapped in print(), which would append
# a stray "None".
print("info:")
d2.info()

print("  ".center(50, '*'))
# Summary statistics for each numeric column.
print(d2.describe())

# ---------------- 3) Transposition ----------------
print("d2: \n", d2)
# print("d2 T: \n", d2.transpose())
print("d2 T: \n", d2.T)
# print("d2 T: \n", d2.swapaxes(1, 0))

# ---------------- 4) Sorting ----------------
print(d2)
# Sort by 'views' first and 'tranfers' second, both in descending order.
print(d2.sort_values(by=["views", 'tranfers'], ascending=False))

# ---------------- 5) Slicing and column selection ----------------
print(d2)
print(d2[:2])                     # slice rows by position
print('1:\n', d2['views'])        # select one column by label
print('2:\n', d2.views)           # same column through attribute access
print(d2[['views', 'comments']])  # select several columns at once

# ---------------- 6) Selecting rows ----------------
# iloc selects by integer position, loc selects by label.
# print(d2[0])    # would fail: 0 is not a column label
print(d2.iloc[0])
print(d2.iloc[-1])

# print(d2['A'])  # would fail: 'A' is a row label, not a column label
print(d2)
print(d2.loc['A'])

# ---------------- 7) Writing missing values ----------------
# NaN cannot be stored in an integer column, so the frame is cast to float
# explicitly rather than relying on pandas upcasting silently on assignment.
d2 = d2.astype(float)
d2.loc['A'] = np.nan
print(d2)

d2.info()

pandasのCSVファイルの読み書き操作

import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# pandas can read and write csv, excel, json, ...
# 1) Writing a CSV file

df = pd.DataFrame(
    {'province': ['  ', '  ', '  ', '  ', '  '],
     'city': ['  ', '  ', '  ', '  ', '  '],
     'count1': [1, 2, 3, 4, 5],
     'count2': [1, 2, 33, 4, 5]
     }
)

print(df)

# Make sure the output directory exists before anything is written into it.
os.makedirs('doc', exist_ok=True)
filename = os.path.join('doc', 'csvFile.csv')
# to_csv() options used in the call below:
#   index=True/False   whether the row index is written
#   mode='w'           file mode: 'w' overwrites the file, 'a' appends to it
#   header=True/False  whether the column names are written
# df.to_csv(filename, index=False, mode='a', header=False, sep=' ')
# print("csv file written")

# # 2) Reading a CSV file
# df2 = pd.read_csv('doc/csvFile.csv')
# print(df2)

# 3) Writing an Excel file
# The path is built with os.path.join, like the CSV path above: the original
# "doc\excelFile.xlsx" literal contained an invalid "\e" escape and only
# named a file inside doc/ on Windows.
excel_filename = os.path.join('doc', 'excelFile.xlsx')
df.to_excel(excel_filename, sheet_name="    ", index=False)
print("excel      ")

グループ化と集約操作: pandasは柔軟かつ効率的なgroupby機能を提供します。1)データセットに対して、自然な方法でスライス、ダイシング、要約などの操作ができます。2)1つまたは複数のキー(関数、配列、またはDataFrameの列名でもよい)に従ってpandasオブジェクトを分割します。3)グループごとの要約統計量を計算します。たとえば、カウント、平均、標準偏差、またはユーザー定義関数などです。

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Sample table: two label columns (province, city) and two numeric columns.
df = pd.DataFrame(
    {
        'province': ['  ', '  ', '  ', '  ', '  '],
        'city': ['  ', '  ', '  ', '  ', '  '],
        'count1': [1, 2, 3, 4, 5],
        'count2': [1, 2, 33, 4, 5],
    }
)

print(df)

# Every grouping below aggregates the same numeric column.
count1 = df['count1']

# Group by a single key (province) and summarise each group.
grouped = count1.groupby(df['province'])
print(grouped.describe())
print(grouped.median())

# Group by city and report the largest count1 per group.
grouped = count1.groupby(df['city'])
print(grouped.max())

# Group by two keys at once (province, then city).
grouped = count1.groupby([df['province'], df['city']])
for summary in (grouped.max(), grouped.sum(), grouped.count()):
    print(summary)

# unstack() pivots the inner key (city) of the result into columns.
print(grouped.max().unstack())

pandasでの文字列データの処理

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Text data with one missing entry; the .str methods leave NaN untouched.
series1 = pd.Series(['$A:1', '$B:2', '$C:3', np.nan, '$cat:3'])
print(series1)

# Vectorised string methods are all reached through the .str accessor.
str_ops = series1.str

# Lower-case every string.
print(str_ops.lower())

# Upper-case every string.
print(str_ops.upper())

# Split every string on ':' into a list of parts.
print(str_ops.split(":"))

# Remove '$' characters from both ends of every string.
print(str_ops.strip('$'))

実例一:商品データ分析

"""
    :           :     ,     ,     ，        ，      
  1：
    1).            ;      csv  ? to_csv
    2).             ；    dataframe         ? df['  '], df.   
    3）.            ，   ，      df    ? d2.sort_values(by=["      "], ascending=True)
          20       mosthighPrice.xlsx   ;       df  20      ? df.head(20)   df1.to_csv(xxxxxx)
  2：
    1).    [item_name]          ，     
            (           -   5   )
    2).     [odrder_id]   ，            。
    3).                        。
"""


import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Requirement 1 (reference solution, kept disabled):
#   1) load the order data and take a first look at it;
#   2) read a single column, either as df['col'] or as df.col.
# goodsInfo = pd.read_csv('doc/chipo.csv')
# print(goodsInfo.head())
# print(goodsInfo.tail())
# print(goodsInfo.info())
# print(goodsInfo.describe())
# print("item names: \n", goodsInfo['item_name'].head())
# print("item names: \n", goodsInfo.item_name.head())
#
#   3) strip the '$' from item_price, sort by price in descending order and
#      export the 20 most expensive rows to mostHighPrice.xlsx.
#      (``np.float`` no longer exists in current NumPy; use builtin ``float``.)
# goodsInfo.item_price = goodsInfo.item_price.str.strip('$').astype(float)
# highPriceData = goodsInfo.sort_values('item_price', ascending=False).head(20)
# print(highPriceData.head(5))
# filename = 'doc/mostHighPrice.xlsx'
# highPriceData.to_excel(filename)
# print("export finished .......")


# Requirement 2:
#   1) count the rows per item_name and keep the five most frequent items.
# Forward slash instead of 'doc\chipo.csv': the backslash form contained an
# invalid "\c" escape and only resolved to a file inside doc/ on Windows.
goodsInfo = pd.read_csv('doc/chipo.csv')
# count() yields, per item_name, the number of non-null values in every
# other column; the 'Unnamed: 0' column is used as the per-item tally.
# NOTE(review): presumably 'Unnamed: 0' is the CSV's unnamed first column -
# confirm against the data file.
newInfo = goodsInfo.groupby('item_name').count()
mostRaiseGoods = newInfo.sort_values('Unnamed: 0', ascending=False)['Unnamed: 0'].head(5)
print(mostRaiseGoods)       # a Series indexed by item_name


# Item names (the index of the result).
x = mostRaiseGoods.index
# Row count for each of those items.
y = mostRaiseGoods.values

# Optional bar chart of the result (disabled):
# from pyecharts import Bar
#
# bar = Bar("           ")
# bar.add("", x, y)
# bar.render()

ケース2:消費金額とチップの関係
ファイルの内容:総消費金額、チップの金額、性別、喫煙の有無、日付、時間、曜日。要件:- 喫煙客と非喫煙客それぞれについて、消費金額とチップの関係を示す散布図を作成する。- 女性と男性の喫煙客・非喫煙客それぞれについて、消費金額とチップの関係を示す散布図を作成する。

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

filename = 'doc/tips.csv'
data = pd.read_csv(filename)
print(data.head())
print(data.info())

# Boolean masks shared by every subset below.
is_smoker = data['smoker'] == 'Yes'
is_female = data['sex'] == 'Female'

# Smokers only: total bill on x, tip on y.
smoker = data[is_smoker]
x_total_bill1 = smoker['total_bill']
y_tip1 = smoker['tip']

# Non-smoker counterpart and the two-series chart (disabled):
# no_smoker = data[data['smoker'] != 'Yes']
# x_total_bill2 = no_smoker['total_bill']
# y_tip2 = no_smoker['tip']
#
# from pyecharts import  Scatter
# scatter = Scatter("  /                  ")
# scatter.add("  ", x_total_bill1, y_tip1)
# scatter.add("   ", x_total_bill2, y_tip2)
# scatter.render()

# Split the rows four ways by sex and smoking status.
female_smoker = data[is_female & is_smoker]
female_no_smoker = data[is_female & ~is_smoker]
male_smoker = data[~is_female & is_smoker]
male_no_smoker = data[~is_female & ~is_smoker]

# Draw one scatter series per subset (total bill vs. tip).
# NOTE(review): this import path and the Scatter(title) / add(name, x, y)
# calls look like the legacy pyecharts 0.5.x API - confirm the installed
# version.
from pyecharts import  Scatter
scatter = Scatter("             ")
for label, subset in (
    ("     ", female_smoker),
    ("      ", female_no_smoker),
    ("     ", male_smoker),
    ("      ", male_no_smoker),
):
    scatter.add(label, subset['total_bill'], subset['tip'])

scatter.render()

linuxネットワークデバイスにおいて重要なデータ構造