マルチプロセスで複数のファイルを読み込み、メモリ使用量を削減する


import pandas as pd
def memory_usage_mb(df, *args, **kwargs):
    """Return the total memory usage of ``df`` in megabytes.

    Any extra positional or keyword arguments are forwarded unchanged to
    ``df.memory_usage`` (for example ``deep=True`` or ``index=False``).
    """
    bytes_per_mb = 1024**2
    total_bytes = df.memory_usage(*args, **kwargs).sum()
    return total_bytes / bytes_per_mb


def reduce_memory_usage(df, deep=True, verbose=True, categories=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Every column whose dtype is int16, int32, int64 or float64 is passed
    through ``pd.to_numeric`` with the matching ``downcast`` option. Columns
    of any other dtype are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        deep: Forwarded to ``memory_usage_mb`` for the before/after report.
        verbose: When true, print each dtype conversion and a summary of the
            memory saved.
        categories: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        The same ``df`` object that was passed in.
    """
    # All types that we want to change for "lighter" ones.
    # int8 and float16 are not included because we cannot reduce
    # those data types.
    # float32 is not included because float16 has too low precision.
    numeric2reduce = ["int16", "int32", "int64", "float64"]
    start_mem = 0
    if verbose:
        start_mem = memory_usage_mb(df, deep=deep)

    # Series.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that works on both old and new pandas versions.
    for col, col_type in df.dtypes.items():
        best_type = None
        # Deliberately disabled: object columns are NOT converted to category.
#         if col_type == "object":
#             df[col] = df[col].astype("category")
#             best_type = "category"
        if col_type in numeric2reduce:
            downcast = "integer" if "int" in str(col_type) else "float"
            df[col] = pd.to_numeric(df[col], downcast=downcast)
            best_type = df[col].dtype.name
        # Log the conversion performed.
        if verbose and best_type is not None and best_type != str(col_type):
            print(f"Column '{col}' converted from {col_type} to {best_type}")

    if verbose:
        end_mem = memory_usage_mb(df, deep=deep)
        diff_mem = start_mem - end_mem
        # Guard against a zero baseline so the report never divides by zero.
        percent_mem = 100 * diff_mem / start_mem if start_mem else 0.0
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")
    return df

 
%%time
# CPU times: user 3.44 s, sys: 4 s, total: 7.45 s
# Wall time: 4min 34s

import datatable as dt
import multiprocessing
# Input CSVs; the order here must match the order in which the results of
# pool.map are unpacked.
files = [
    '../input/train_transaction.csv',
    '../input/test_transaction.csv',
    '../input/train_identity.csv',
    '../input/test_identity.csv',
    '../input/sample_submission.csv',
]
def load_data(file):
    """Read ``file`` with datatable, index it by TransactionID and shrink it.

    The file is parsed with ``dt.fread`` and converted to a pandas DataFrame,
    the ``TransactionID`` column becomes the index, and the frame is returned
    after being passed through ``reduce_memory_usage``.
    """
    frame = dt.fread(file).to_pandas()
    # Set the index in place on the freshly loaded frame.
    frame.set_index(["TransactionID"], inplace=True)
    shrunk = reduce_memory_usage(frame)
    return shrunk

# Load every file in its own worker process; pool.map returns the frames in
# the same order as `files`, so they can be unpacked positionally.
with multiprocessing.Pool() as pool:
    (
        train_transaction,
        test_transaction,
        train_identity,
        test_identity,
        sub,
    ) = pool.map(load_data, files)

 
 
上記のコードのうち、次の部分はコメントアウトしたままにしておく必要があります。
#         if col_type == "object":
#             df[col] = df[col].astype("category")
#             best_type = "category"
そうしないと、object 型の列が category 型に変換され、fillna() 関数で欠損値を埋められなくなります。
 
 
 
 
もう一つの書き方は次のとおりです。
%%time
# CPU times: user 3.22 s, sys: 7 s, total: 10.2 s
# Wall time: 49 s

import pandas as pd
# Directory that holds the competition CSV files.
path = "/kaggle/input/ieee-fraud-detection/"


import multiprocessing

# Full paths of the files to load; the order must match the order in which
# the results of pool.map are unpacked.
lists = [
    path + csv_name
    for csv_name in (
        "train_identity.csv",
        "test_identity.csv",
        "test_transaction.csv",
        "train_transaction.csv",
    )
]
def file_read(thing):
    """Load the CSV given by ``thing`` into a pandas DataFrame."""
    frame = pd.read_csv(thing)
    return frame
# Read each CSV in a separate worker process; pool.map keeps the order of
# `lists`, so the frames can be unpacked positionally.
with multiprocessing.Pool() as pool:
    (
        train_identity,
        test_identity,
        test_transaction,
        train_transaction,
    ) = pool.map(file_read, lists)