PyTorchによる単語埋め込みを用いた簡単なモデルの実装

19434 ワード

NLP Python 深層学習 機械学習

この記事は主にGitHub上のあるコードを転載したものです。筆者自身、word2vecの事前学習済み単語ベクトルを埋め込み層に読み込む方法にあまり慣れていなかったため、この簡単なモデルのコードを通じて理解を深めました。必要としている方の助けになれば幸いです。

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import gensim

1. テスト用テキストの定義

# Number of preceding words used as context when predicting the next word
# (two context words + one target word per training sample).
CONTEXT_SIZE = 2

# Training corpus: Shakespeare's Sonnet 2, kept verbatim.
_SONNET_2 = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold."""

# Whitespace tokenisation; punctuation stays attached to the words.
test_sentence = _SONNET_2.split()

2.切り出した単語をエンコードする

# Build the vocabulary: every distinct token gets an integer id so that it can
# be looked up in a word-embedding table.
vocab = set(test_sentence)
# Ids are assigned in sorted order so they are reproducible across runs; plain
# set iteration order changes with Python's per-process string hash seed.
# Real words are numbered from 1 because id 0 is reserved below.
word_to_idx = {word: i for i, word in enumerate(sorted(vocab), start=1)}
# Id 0 is the "unknown word" slot, keyed by the empty string.
# NOTE(review): the key was presumably a token such as '<unk>' that was lost
# when the article was extracted -- confirm before relying on it.
word_to_idx[''] = 0
# Inverse mapping, derived from word_to_idx so the two can never disagree.
idx_to_word = {i: word for word, i in word_to_idx.items()}

# Training samples: each pair of consecutive words is the context and the word
# that follows them is the prediction target.
trigram = [((first, second), target)
           for first, second, target in zip(test_sentence,
                                            test_sentence[1:],
                                            test_sentence[2:])]

3. 事前学習済み単語ベクトルの読み込みと利用

# Load pre-trained word vectors stored in the word2vec *text* format.
wvmodel = gensim.models.KeyedVectors.load_word2vec_format('/Users/wyw/Documents/vectors/word2vec/word2vec.6B.100d.txt', binary=False, encoding='utf-8')

vocab_size = len(word_to_idx)
# Must equal the dimensionality of the vectors in the file above
# (the file name suggests 100-d -- confirm if the file is swapped).
embed_size = 100
# One row per vocabulary id; words absent from the pre-trained model keep an
# all-zero row.
weight = torch.zeros(vocab_size, embed_size)

# Walk our own (small) vocabulary and copy over the vectors that exist in the
# pre-trained model.  This avoids scanning the whole pre-trained vocabulary,
# avoids a bare `except`, and does not depend on the `index2word` attribute,
# which gensim 4 removed in favour of `index_to_key`.
for word, index in word_to_idx.items():
    if word in wvmodel:
        weight[index, :] = torch.from_numpy(wvmodel.get_vector(word))

4.モデルの定義

class NgramModel(nn.Module):
    """Feed-forward n-gram language model on top of pre-trained embeddings.

    The ids of the context words are embedded, the embeddings are
    concatenated into a single row, and two linear layers map that row to
    log-probabilities over the vocabulary.

    Args:
        vocb_size: Number of entries in the vocabulary; also the size of the
            output layer.
        context_size: Number of context words fed to the model per sample.
        n_dim: Dimensionality of each word embedding.
        pretrained_weight: Optional 2-D float tensor holding the initial
            embedding matrix.  When omitted, the module-level ``weight``
            tensor is used, which keeps the original three-argument call
            working unchanged.
    """

    def __init__(self, vocb_size, context_size, n_dim, pretrained_weight=None):
        super().__init__()
        self.n_word = vocb_size

        if pretrained_weight is None:
            # Backward-compatible default: the pre-trained matrix built at
            # module level before this class is instantiated.
            pretrained_weight = weight

        # Initialise the embedding table from the pre-trained vectors.
        # freeze=False leaves requires_grad=True, so the vectors are
        # fine-tuned during training.  To train embeddings from scratch
        # instead, use nn.Embedding(self.n_word, n_dim) here.
        self.embedding = nn.Embedding.from_pretrained(pretrained_weight,
                                                      freeze=False)

        self.linear1 = nn.Linear(context_size * n_dim, 128)
        self.linear2 = nn.Linear(128, self.n_word)

    def forward(self, x):
        """Return log-probabilities of the next word for one context.

        Args:
            x: LongTensor of word ids for a single sample; its embeddings
                are flattened into one row, so it must contain exactly
                ``context_size`` ids.

        Returns:
            Tensor of shape ``(1, vocb_size)`` with log-probabilities.
        """
        # Concatenate the context embeddings into a single row vector.
        emb = self.embedding(x).view(1, -1)
        hidden = F.relu(self.linear1(emb))
        # dim=1 normalises over the vocabulary axis explicitly; relying on
        # the implicit dimension is deprecated and emits a warning.
        return F.log_softmax(self.linear2(hidden), dim=1)

# Size the model from the same constants used to build the pre-trained
# `weight` matrix, so the embedding width cannot drift from a second
# hard-coded 100.
ngrammodel = NgramModel(vocab_size, CONTEXT_SIZE, embed_size)
# The model outputs log-probabilities, which is what NLLLoss expects.
criterion = nn.NLLLoss()
optimizer = optim.SGD(ngrammodel.parameters(), lr=1e-3)

訓練を開始し、合計300エポック実行します。各エポックでは、wordが予測対象の単語の直前の2語を、labelが予測対象の単語を表します。wordをネットワークに入力して出力を得て、損失関数でlossを計算し、逆伝播によってパラメータを更新します。この例では、事前学習済みの単語ベクトルを使用すると、使用しない場合に比べて訓練中の収束がはるかに遅くなります。これはサンプルが小さすぎるため、事前学習済みの単語ベクトルを使用しない場合のほうが早く過学習に達し、損失値がより速く減少するからです。self.embedding.weight.requires_grad = False に変更して、訓練中に単語ベクトルを微調整するかどうかが結果に影響するか試してみてもよいでしょう。ただし、サンプルが小さすぎるため、影響は見られないと思います。

# Encode every (context, target) sample as index tensors once, up front;
# the encoding does not change between epochs.
encoded_samples = [
    (torch.LongTensor([word_to_idx[token] for token in context]),
     torch.LongTensor([word_to_idx[target]]))
    for context, target in trigram
]

for epoch in range(300):
    print('epoch: {}'.format(epoch+1))
    print('*'*10)
    running_loss = 0.0
    for word, label in encoded_samples:
        # forward
        out = ngrammodel(word)
        loss = criterion(out, label)
        # .item() extracts the Python float; indexing a 0-dim tensor with
        # loss.data[0] raises an error on current PyTorch versions.
        running_loss += loss.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Report the mean loss per training sample (not per vocabulary entry).
    # max(..., 1) only guards against an empty sample list.
    print('Loss: {:.6f}'.format(running_loss / max(len(encoded_samples), 1)))

5. モデルの効果の確認

# Sanity check on a single training sample: feed its two context words to the
# trained model and compare the most likely next word with the real one.
context_words, label = trigram[3]
word = torch.LongTensor([word_to_idx[token] for token in context_words])
out = ngrammodel(word)
# torch.max over dim 1 returns (values, indices); only the index is needed.
predict_label = torch.max(out, 1)[1]
predict_word = idx_to_word[predict_label.item()]
print('real word is {}, predict word is {}'.format(label, predict_word))

転載元: https://github.com/atnlp/torchtext-summary/blob/master/Language-Model.ipynb 　参照先: https://blog.csdn.net/nlpuser/article/details/83627709

JPA注記の紹介

jpaのPagingAndSortingRepositoryページクエリ