マルチプロセスで複数のファイルを読み込み、メモリ使用量を削減する
3256 ワード
import pandas as pd
def memory_usage_mb(df, *args, **kwargs):
    """Return the total memory usage of a DataFrame in MB (bytes / 1024**2).

    Any extra positional or keyword arguments are forwarded unchanged to
    ``df.memory_usage`` (for example ``deep=True`` or ``index=False``).
    """
    return df.memory_usage(*args, **kwargs).sum() / 1024**2
def reduce_memory_usage(df, deep=True, verbose=True, categories=True):
    """Downcast the numeric columns of ``df`` in place to lighter dtypes.

    Columns whose dtype is int16, int32, int64 or float64 are passed through
    ``pd.to_numeric(..., downcast=...)``. int8 and float16 are not candidates
    because they cannot be reduced any further, and float32 is not a
    candidate because float16 has too low precision.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        deep: Forwarded to ``memory_usage_mb`` for the before/after report.
        verbose: When true, print every conversion and a memory summary.
        categories: Currently unused; kept so existing callers keep working.
            The object -> category conversion it would control is disabled
            (see the note inside the loop).

    Returns:
        The same ``df`` object that was passed in.
    """
    # All types that we want to change for "lighter" ones.
    numeric2reduce = {"int16", "int32", "int64", "float64"}

    start_mem = memory_usage_mb(df, deep=deep) if verbose else 0

    # Series.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that works on old and new versions alike.
    for col, col_type in df.dtypes.items():
        type_name = str(col_type)

        # NOTE: object columns are deliberately NOT converted to "category".
        # With categorical columns, fillna() can no longer be used to fill
        # in missing values, so this stays disabled:
        # if col_type == "object":
        #     df[col] = df[col].astype("category")
        #     best_type = "category"
        if type_name not in numeric2reduce:
            continue

        downcast = "integer" if "int" in type_name else "float"
        df[col] = pd.to_numeric(df[col], downcast=downcast)
        best_type = df[col].dtype.name

        # Log the conversion performed.
        if verbose and best_type != type_name:
            print(f"Column '{col}' converted from {col_type} to {best_type}")

    if verbose:
        end_mem = memory_usage_mb(df, deep=deep)
        diff_mem = start_mem - end_mem
        # Guard against a zero-byte frame so the report never divides by 0.
        percent_mem = 100 * diff_mem / start_mem if start_mem else 0.0
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")

    return df
%%time
# CPU times: user 3.44 s, sys: 4 s, total: 7.45 s
# Wall time: 4min 34s
import datatable as dt
import multiprocessing
# Input CSVs to load. The order matters: the result of pool.map() is unpacked
# positionally into train_transaction, test_transaction, train_identity,
# test_identity and sub.
files = ['../input/train_transaction.csv',
'../input/test_transaction.csv',
'../input/train_identity.csv',
'../input/test_identity.csv',
'../input/sample_submission.csv']
def load_data(file):
    """Read one CSV with datatable and return it as a reduced DataFrame.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A pandas DataFrame indexed by ``TransactionID`` after being passed
        through ``reduce_memory_usage``.
    """
    file_df = dt.fread(file).to_pandas()
    file_df.set_index(["TransactionID"], inplace=True)
    return reduce_memory_usage(file_df)
# Load each file in a worker process. pool.map() returns results in the same
# order as `files`, which the positional unpacking below relies on.
with multiprocessing.Pool() as pool:
    (
        train_transaction,
        test_transaction,
        train_identity,
        test_identity,
        sub,
    ) = pool.map(load_data, files)
上記のコード:
# if col_type == "object": # df[col] = df[col].astype("category") # best_type = "category"
この部分はコメントアウトしたままにしておく必要があります。そうしないと、fillna() 関数で欠損値を埋めることができません。
もう一つの書き方は、
%%time
# CPU times: user 3.22 s, sys: 7 s, total: 10.2 s
# Wall time: 49 s
import pandas as pd
import multiprocessing

# Directory that holds the competition CSVs.
path = "/kaggle/input/ieee-fraud-detection/"

# Full paths of the files to read; the order must match the unpacking of the
# pool.map() result.
lists = [
    path + csv_name
    for csv_name in (
        "train_identity.csv",
        "test_identity.csv",
        "test_transaction.csv",
        "train_transaction.csv",
    )
]
def file_read(thing):
    """Read a CSV into a pandas DataFrame.

    Args:
        thing: Anything ``pd.read_csv`` accepts as its first argument, such
            as a file path or a file-like object.

    Returns:
        The DataFrame produced by ``pd.read_csv(thing)``.
    """
    return pd.read_csv(thing)
# Read each CSV in a worker process. pool.map() returns results in the same
# order as `lists`, which the positional unpacking below relies on.
with multiprocessing.Pool() as pool:
    (
        train_identity,
        test_identity,
        test_transaction,
        train_transaction,
    ) = pool.map(file_read, lists)