NLPタスクにおけるトレーニングセットとテストセットの分割
データセット全体をトレーニングセットとテストセットに直接分割する関数を定義します。
ここで、pathはデータセットのパス、sizeはテストセットに割り当てる割合、sepはテキストとラベルを区切るセパレータタグです。
def nlp_split(path, size=0.3, sep='__label__'):
    """Split a labelled text dataset into train and test files.

    Each non-blank line of ``path`` is expected to look like
    ``<text><sep><label>``. The samples are shuffled and split with
    scikit-learn's ``train_test_split`` (fixed ``random_state=42``), then
    appended to ``train.txt`` and ``test.txt`` in the current working
    directory, one ``<text>\\t<sep><label>`` record per line.

    Args:
        path: Path of the UTF-8 encoded dataset file to read.
        size: Value forwarded to ``train_test_split`` as ``test_size``.
        sep: Marker that separates the text from its label on each line.

    Raises:
        IndexError: If a non-blank line does not contain ``sep``.
    """
    from sklearn.model_selection import train_test_split

    texts, labels = _read_samples(path, sep)
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=size, random_state=42
    )
    # NOTE(review): both files are opened in append mode, so calling this
    # function repeatedly accumulates records -- confirm this is intended.
    _write_samples('train.txt', X_train, y_train, sep)
    _write_samples('test.txt', X_test, y_test, sep)


def _read_samples(path, sep):
    """Parse ``path`` into parallel lists of texts and labels."""
    texts = []
    labels = []
    with open(path, 'r', encoding='utf8') as file:
        for line_no, raw_line in enumerate(file, start=1):
            # Drop the line terminator here so the label never carries it;
            # the writer adds exactly one newline per record instead.
            line = raw_line.rstrip('\n')
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) hold no sample.
                continue
            parts = line.split(sep)
            if len(parts) < 2:
                raise IndexError(
                    'line {} of {!r} does not contain the separator {!r}'.format(
                        line_no, path, sep
                    )
                )
            texts.append(parts[0].strip('\t'))
            labels.append(parts[1])
    return texts, labels


def _write_samples(out_path, texts, labels, sep):
    """Append one ``<text>\\t<sep><label>`` line per sample to ``out_path``."""
    with open(out_path, 'a', encoding='utf8') as file:
        file.writelines(
            text + '\t' + sep + label + '\n'
            for text, label in zip(texts, labels)
        )
内