Pandas DataFrame/Seriesの継承


TL;DR

  • pandasのDataFrameやSeriesを普通に継承すると、Viewの取得時などにpandas本来のDataFrame/Seriesに戻ってしまうなどの不都合が起こる
  • だから以下のようにする
  • 公式ドキュメントはこちら
from __future__ import print_function
import pandas

# 誤った実装方法
class MyWrongSeries(pandas.Series):
    """Naive ``pandas.Series`` subclass, kept as a counter-example.

    Only ``__init__`` is overridden: the class defines neither
    ``_constructor`` nor ``_metadata``/``_internal_names``, so it does not
    tell pandas which class to build for derived objects or how to treat
    the two custom attributes set below.
    """

    def __init__(self, *args, **kwargs):
        """Build the series, then attach the two demo attributes.

        All positional and keyword arguments are forwarded unchanged to
        the ``pandas.Series`` initializer.
        """
        super(MyWrongSeries, self).__init__(*args, **kwargs)
        self.my_temporal_attribute = 'value of my_temporal_attribute'
        self.my_attribute = 'value of my attribute'

# 正しい実装方法
class MySeries(pandas.Series):
    """``pandas.Series`` subclass that survives pandas operations.

    * ``_constructor`` makes pandas build ``MySeries`` objects (instead of
      plain ``pandas.Series``) for derived results such as slices.
    * ``my_attribute`` is a persistent attribute: it is listed in
      ``_metadata`` so its current value is carried over to results.
    * ``my_temporal_attribute`` is a temporary attribute: it is listed in
      ``_internal_names`` and is not carried over, so a derived object
      gets the default assigned in ``__init__``.
    """

    # Temporary attributes (not passed on to manipulation results).
    _internal_names = pandas.Series._internal_names + ['my_temporal_attribute']
    # Extend the parent's set instead of rebuilding it from the list, so
    # any entries the parent set holds beyond ``_internal_names`` are kept.
    _internal_names_set = (
        pandas.Series._internal_names_set | set(['my_temporal_attribute'])
    )

    # Persistent attributes (passed on to manipulation results).
    # Extend, rather than replace, the inherited list: Series keeps its own
    # bookkeeping (its name) in ``_metadata``, and overwriting the list would
    # stop that from propagating to derived objects.
    _metadata = pandas.Series._metadata + ['my_attribute']

    @property
    def _constructor(self):
        """Return the class pandas should use to build derived objects."""
        return MySeries

    def __init__(self, *args, **kwargs):
        """Build the series, then set both custom attributes to defaults.

        All positional and keyword arguments are forwarded unchanged to
        the ``pandas.Series`` initializer.
        """
        pandas.Series.__init__(self, *args, **kwargs)
        self.my_attribute = 'value of my_attribute'
        self.my_temporal_attribute = 'value of my_temporal_attribute'

if __name__ == '__main__':
    # Demo: build each subclass from the same values and compare how the
    # type and the custom attributes behave when a slice is taken.
    values = [1, 2, 3, 4, 5]

    print('Wrong Implmentation')
    naive = MyWrongSeries(values)
    print(type(naive))       # __main__.MyWrongSeries
    print(type(naive[0:1]))  # pandas.Series -- the subclass is lost
    print()

    print('Right Implmentation')
    series = MySeries(values)
    print(type(series))       # __main__.MySeries
    print(type(series[0:1]))  # __main__.MySeries -- the subclass is kept

    # Freshly constructed object: both attributes hold their defaults.
    for attr_name in ('my_attribute', 'my_temporal_attribute'):
        print(getattr(series, attr_name))

    # Change both attributes on the original object ...
    series.my_attribute = 'changed value of my_attribute'
    series.my_temporal_attribute = 'changed value of my_temporal_attribute'

    # ... then inspect a slice of it.
    view = series[0:1]
    print(view.my_attribute)           # 'changed value of my_attribute'
    print(view.my_temporal_attribute)  # 'value of my_temporal_attribute'