XGBoost-Based Text Classification: A Python Implementation


1. Background: the dataset contains 10,000 documents across 10 categories, split evenly into a training set of 5,000 and a test set of 5,000. Development environment: Python 3.6 + Windows + PyCharm.
2. Preprocessing: beforehand, the 10,000 documents were fetched, word-segmented, stripped of stop words, and stored in a MySQL database.
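The preprocessing itself isn't shown in the post. Assuming a Chinese corpus (the stop_words_ch helper used later suggests so), a minimal sketch using jieba for segmentation might look like the following; the 'stopwords.txt' path is a hypothetical placeholder:

import jieba

# hypothetical: a plain-text stop-word list, one word per line
with open('stopwords.txt', encoding='utf-8') as f:
    STOPWORDS = set(line.strip() for line in f)

def segment_and_filter(text):
    # segment the text and drop stop words and whitespace tokens
    return [w for w in jieba.cut(text) if w.strip() and w not in STOPWORDS]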
3. Reading the data back from the database
def train_corpus_generator():
    # Yield one batch of [topic, title, content] rows per topic.
    # (The original mixed `return` with `yield`; a value returned from a
    # generator is silently discarded, so every batch is yielded instead.)
    global db
    for topic in topics:
        print(topic)  # progress: which category is being loaded
        yield [[topic, item[1], item[2]]
               for item in db.query(topic, 0, config["trainset_num"])]
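The snippet leans on three module-level names that never appear in the post: topics, config, and db. A hedged sketch of what they plausibly look like (the table and column names are hypothetical; 500 rows per class follows from 5,000 training documents over 10 categories):

import pymysql

# hypothetical: the 10 category names; the real list is not shown in the post
topics = ["topic_%d" % n for n in range(10)]

# 5,000 train / 5,000 test over 10 classes -> 500 rows per class each
config = {"trainset_num": 500, "testset_num": 500}

class CorpusDB:
    """Hypothetical wrapper matching the db.query(topic, start, end) calls above."""
    def __init__(self):
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="secret", database="corpus",
                                    charset="utf8mb4")

    def query(self, topic, start, end):
        # assumed schema: docs(id, title, words, topic), words = segmented text
        with self.conn.cursor() as cur:
            cur.execute("SELECT id, title, words FROM docs "
                        "WHERE topic = %s LIMIT %s OFFSET %s",
                        (topic, end - start, start))
            return cur.fetchall()

db = CorpusDB()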

4. Fetch the training and test sets and convert them to the required format
    # load the training corpus
    train = train_corpus_generator()
    train = tuple(train)
    train_opinion, train_content = stop_words_ch(train)
    train_opinion = transLabel(train_opinion)  # map category names to integer labels
    train_opinion = np.array(train_opinion)
    print("train data load finished")

    # load the test corpus the same way
    test = test_corpus_generator()
    test = tuple(test)
    test_opinion, test_content = stop_words_ch(test)
    test_opinion = transLabel(test_opinion)  # map category names to integer labels
    test_opinion = np.array(test_opinion)
    print("test data load finished")

Converting string categories to numeric labels
def transLabel(labels):
    # replace each category name with its index in topics
    return [topics.index(label) for label in labels]
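For instance, with topics = ["sports", "finance", "tech"], transLabel(["tech", "sports"]) returns [2, 0].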

5. Computing TF-IDF with sklearn
    # compute the TF-IDF matrix over the training set
    vectorizer = CountVectorizer()
    tf_idf_transformer = TfidfTransformer()
    tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(train_content))
    weight = tf_idf.toarray()
    words = vectorizer.get_feature_names()  # the learned vocabulary, one word per column
    print(tf_idf.shape)

    # wrap the training features in XGBoost's DMatrix
    dtrain = xgb.DMatrix(weight, label=train_opinion)

    # vectorize the test set with the fitted transformers (transform only, no refit)
    test_tf_idf = tf_idf_transformer.transform(vectorizer.transform(test_content))
    test_weight = test_tf_idf.toarray()
    print(test_weight.shape)
    dtest = xgb.DMatrix(test_weight, label=test_opinion)

    # cache the test DMatrix so a saved model can be evaluated without redoing this
    dtest.save_binary('dtest.buffer')
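Two small simplifications are available here. sklearn's TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer, and xgb.DMatrix accepts SciPy sparse matrices directly, so the memory-hungry toarray() densification can be dropped. A sketch:

from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

vectorizer = TfidfVectorizer()
train_tf_idf = vectorizer.fit_transform(train_content)   # sparse CSR matrix
test_tf_idf = vectorizer.transform(test_content)
dtrain = xgb.DMatrix(train_tf_idf, label=train_opinion)  # sparse input is fine
dtest = xgb.DMatrix(test_tf_idf, label=test_opinion)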

6. Parameter setup and training
def xgboo_train():

    # training parameters
    param = {
        'booster': 'gbtree',
        'objective': 'multi:softmax',
        'num_class': 10,  # number of classes, required by multi:softmax
        'max_depth': 20,
        'eta': 0.4,
        'eval_metric': 'merror',
        'silent': 1,
    }

    dtrain, dtest = Get_data()
    evallist = [(dtrain, 'train'), (dtest, 'test')]
    num_round = 200

    # train, stopping early if the test merror fails to improve for 10 rounds
    print("training started!")
    bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)

    # persist the trained model
    bst.save_model('train.model')
    return dtest, bst
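One caveat with early stopping: xgb.train() returns the booster from the last round it ran, not necessarily the best one. In the classic XGBoost API this Python 3.6 setup uses, the best round is recorded on the booster, so prediction can be pinned to it (attribute names changed in later releases):

# predict with the best early-stopped round (classic XGBoost API)
preds = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)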

7. Testing and computing precision, recall, and the F-measure
def xgboo_test(dtest, bst):

    print("testing started!")
    preds = bst.predict(dtest)

    TP = [0 for n in range(10)]
    FP = [0 for n in range(10)]
    FN = [0 for n in range(10)]

    for i, pre in enumerate(preds):
        real_type = int(i / config["testset_num"])  # test rows are grouped by class
        pre_type = int(pre)
        print("real,pre", real_type, pre_type)
        if real_type == pre_type:
            TP[real_type] += 1   # correct: true positive for the real class
        else:
            FN[real_type] += 1   # the real class was missed
            FP[pre_type] += 1    # the predicted class got a spurious hit

    ACC_SUM = 0
    for i in range(0, 10):
        ACC = TP[i] / (TP[i] + FP[i])                  # precision
        REC = TP[i] / (TP[i] + FN[i])                  # recall
        FM = 2 * TP[i] / (2 * TP[i] + FP[i] + FN[i])   # F1 measure
        ACC_SUM += ACC
        print(i, " precision:", ACC, " recall:", REC, " F-measure:", FM)
    print("macro-averaged precision:", float('%.3f' % (ACC_SUM / 10)))

8. Testing directly with a previously trained model
    bst = xgb.Booster(model_file='T_30.model')  # load a previously trained model
    dtest = xgb.DMatrix('dtest.buffer')         # reload the cached test DMatrix

    xgboo_test(dtest, bst)

Forgetting to publish this draft for half a year was a blunder on my part.