スペイン語
92065 ワード
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals #
from sklearn.model_selection import train_test_split
import tensorflow.compat.v1 as tf
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import numpy as np
import unicodedata
import re #
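### .      matches any single character except a newline
### \w     matches a word character (letter / digit / underscore / other Unicode letters)
### \s     matches any whitespace character
### \d     matches a digit
### \b     matches a word boundary (the start or end of a word)
### ^      matches the start of the string
### $      matches the end of the string
### *      repeats the preceding item zero or more times
### +      repeats the preceding item one or more times
### ?      repeats the preceding item zero or one time
### {n}    repeats the preceding item exactly n times
### {n,}   repeats the preceding item n or more times
### {n,m}  repeats the preceding item between n and m times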
###
### str=" s s s f re "
### print(re.findall("^ ",str)) # , , list
### [' ']
###
### str=" s s s f re "
### print(re.findall("[^a-z]",str)) # , , list
### [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
###
### str=" s s s f re "
### print(re.findall(" $",str)) # , , list
### [' ']
###
### str=" s s s f re "
### print(re.findall(" *",str)) # 0 , list
### print(re.findall(" *",str)) # 0 , list
### ['', '', '', '', ' ', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ', '', '', ' ', '']
### [' ', ' ']
###
### str=" s s s f re "
### print(re.findall(" +",str)) # 1 , list
### print(re.findall(" +",str)) # 1 , list
### [' ', ' ', ' ']
### [' ', ' ']
###
### str=" s s s f re "
### print(re.findall(" ?",str)) # 0 1 , list
### print(re.findall(" ?",str)) # 0 1 , list
### ['', '', '', '', ' ', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ', ' ', '', '', ' ', ' ', ' ', '']
### [' ', ' ']
###
### str=" s s s f re "
### print(re.findall(" {2}",str)) # 2 , list
### print(re.findall(" {1,2}",str)) # 1-2 , list
### [' ', ' ']
### [' ', ' ']
###
### str=" s s s f re "
### print(re.findall(" [s,f] ",str)) # , [] , list
### [' s ', ' f ']
###
### str=" s s 4 s 3f re 2 "
### print(re.findall("\d",str)) # , list
### ['4', '3', '2']
###
### str=" s s 455 s 3f re 2 "
### print(re.findall("\d+",str)) # , list
### ['455', '3', '2']
###
### str=" s s 455 s 3f re 2 "
### print(re.findall("\D",str)) # , list
### [' ', ' ', 's', ' ', ' ', ' ', 's', ' ', ' ', ' ', ' ', ' ', ' ', 's', ' ', ' ', 'f', ' ', ' ', ' ', 'r', 'e', ' ', ' ', ' ']
###
### str=" s s\n\t \f \v455 s 3f re 2 "
### print(re.findall("\s",str)) # matches whitespace characters (\t\n\r\f\v), returns a list
### [' ', ' ', ' ', '\n', ' ', '\t', ' ', '\x0c', ' ', '\x0b']
###
### str=" s s\n\t \f \v455 "
### print(re.findall("\S",str)) # matches non-whitespace characters (anything except \t\n\r\f\v), returns a list
### [' ', ' ', 's', ' ', ' ', ' ', 's', ' ', ' ', ' ', ' ', '4', '5', '5', ' ']
###
### str=" s s _ S -455 "
### print(re.findall("\w",str)) # , , , , list
### [' ', ' ', 's', ' ', ' ', ' ', 's', ' ', '_', ' ', 'S', ' ', ' ', '4', '5', '5', ' ']
###
### str=" s s _ S -455 "
### print(re.findall("\W",str)) # , , , , list
### [' ', ' ', '-']
###
### str="a3a3ddd"
### print(re.search("(a3)+",str).group()) # a3
### a3a3
###
### str="a3 a3d dd"
### print(re.findall(r" | +",str)) # |
### [' ', ' ']
###
### str="hello egon bcd egon lge egon acd 19"
### r=re.match("h\w+",str) #match, , , None, , ,
### print(r.group()) # ,
### print(r.groups()) # ,
### print(r.groupdict()) # , key
### hello
### ()
### {}
###
### r2=re.match("h(\w+)",str) #match, , , None
### print(r2.group())
### print(r2.groups())
### print(r2.groupdict())
### hello
### ('ello',)
### {}
###
### r3=re.match("(?P<n1>h)(?P<n2>\w+)",str) # ?P<name> gives the group a key (name); the text inside <> is the key that groupdict() reports
### print(r3.group())
### print(r3.groups())
### print(r3.groupdict())
### hello
### ('h', 'ello')
### {'n1': 'h', 'n2': 'ello'}
###
### str="hello egon bcd egon lge egon acd 19"
### r=re.search("h\w+",str) #match, , , None, , ,
### print(r.group()) # ,
### print(r.groups()) # ,
### print(r.groupdict()) # , key
### hello
### ()
### {}
###
### r2=re.search("h(\w+)",str) #match, , , None
### print(r2.group())
### print(r2.groups())
### print(r2.groupdict())
### hello
### ('ello',)
### {}
###
### r3=re.search("(?P<n1>h)(?P<n2>\w+)",str) # ?P<name> gives the group a key (name); the text inside <> is the key that groupdict() reports
### print(r3.group())
### print(r3.groups())
### print(r3.groupdict())
### hello
### ('h', 'ello')
### {'n1': 'h', 'n2': 'ello'}
###
### r=re.findall("\d+\w\d+","a2b3c4d5") # , ,
### print(r)
### ['2b3', '4d5'] # , , 3c4
###
### r=re.findall("","a2b3c4d5") # , ,
### print(r)
### ['', '', '', '', '', '', '', '', ''] # , , , 8 , 9
###
### r=re.findall("(ca)*","ca2b3caa4d5") # , ,
### print(r)
### ['ca', '', '', '', 'ca', '', '', '', '', '']# *
###
### r=re.findall("a\w+","ca2b3 caa4d5") # , ,
### print(r)
### ['a2b3', 'aa4d5']# ,
###
### r=re.findall("a(\w+)","ca2b3 caa4d5") # : ,
### print(r)
### ['2b3', 'a4d5']#
###
### r=re.findall("(a)(\w+)","ca2b3 caa4d5") # : , ,
### print(r)
### [('a', '2b3'), ('a', 'a4d5')]#
###
### r=re.findall("(a)(\w+(b))","ca2b3 caa4b5") # : , , , , , ,
### print(r)
### [('a', '2b', 'b'), ('a', 'a4b', 'b')]#
###
### r=re.findall("a(?:\w+)","a2b3 a4b5 edd") #?: , , , ?: findall()
### print(r)
### ['a2b3', 'a4b5']
###
### r=re.split("a\w","sdfadfdfadsfsfafsff")
### print(r)
### r2=re.split("a\w","sdfadfdfadsfsfafsff",maxsplit=2)
### print(r2)
### ['sdf', 'fdf', 'sfsf', 'sff']
### ['sdf', 'fdf', 'sfsfafsff']
###
### r=re.sub("a\w"," ","sdfadfdfadsfsfafsff")
### print(r)
### sdf fdf sfsf sff
###
### a,b=re.subn("a\w"," ","sdfadfdfadsfsfafsff") # , ,
### print(a) #
### print(b) #
### sdf fdf sfsf sff
import os
import io
import time
# TensorFlow is imported through the compat.v1 API: TF2 behaviour is switched
# off first and eager execution is then switched on explicitly.
# NOTE(review): these two calls look like they pull in opposite directions and
# their order matters — confirm this combination is intended.
tf.disable_v2_behavior()
tf.enable_eager_execution()
### Download the Spanish-English sentence-pair archive; extract=True asks get_file to unpack it.
path_to_zip = tf.keras.utils.get_file('spa-eng.zip', origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',extract=True)
# The corpus text file is looked up at spa-eng/spa.txt next to the downloaded archive.
path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"
###
def unicode_to_ascii(s):
    """Return *s* with its combining marks (accents) removed.

    The text is first decomposed with NFD, which splits an accented letter
    into its base character followed by combining marks. Every code point
    whose Unicode category is 'Mn' (nonspacing mark) is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
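### normalize(): 'NFC' composes characters into single precomposed code points where possible; 'NFD' decomposes them into a base character followed by combining marks.
### s1 = 'Spicy Jalape\u00f1o'
### s2 = 'Spicy Jalapen\u0303o'
### s1
### 'Spicy Jalapeño'
### s2
### 'Spicy Jalapeño'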
### s1 == s2
### False
### len(s1)
### 14
### len(s2)
### 15
### t1 = unicodedata.normalize('NFC', s1)
### t2 = unicodedata.normalize('NFC', s2)
### t1 == t2
### True
### print(ascii(t1))
### 'Spicy Jalape\xf1o'
### t3 = unicodedata.normalize('NFD', s1)
### t4 = unicodedata.normalize('NFD', s2)
### t3 == t4
### True
### print(ascii(t3))
### 'Spicy Jalapen\u0303o'
def preprocess_sentence(w):
    """Normalise one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps: lower-case and trim, strip accents via unicode_to_ascii, put
    spaces around punctuation, collapse runs of spaces/double quotes, and
    replace every run of characters outside ``a-z A-Z ? . ! ,`` and the
    inverted question mark with a single space.

    The surrounding markers give the tokenizer a start and an end token for
    the decoder to be seeded with and to stop on. In the previous text both
    markers were empty strings, so no such tokens ever reached the vocabulary.
    """
    w = unicode_to_ascii(w.lower().strip())
    # Surround each punctuation mark with spaces so it becomes its own token.
    # \u00bf is the inverted question mark; it had degraded into a second '?'.
    w = re.sub(r"([?.!,\u00bf])", r" \1 ", w)
    # Collapse runs of spaces (and double quotes) into one space.
    w = re.sub(r'[" "]+', " ", w)
    # Anything that is not a letter or one of the kept punctuation marks
    # becomes a single space.
    w = re.sub(r"[^a-zA-Z?.!,\u00bf]+", " ", w)
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w
# Quick sanity check of preprocess_sentence on one English and one Spanish sentence.
en_sentence = u"May I borrow this book?"
# \u00bf is the inverted question mark that opens a Spanish question; it had
# been degraded to a plain '?' by an encoding round-trip.
sp_sentence = u"\u00bfPuedo tomar prestado este libro?"
print(" : ",preprocess_sentence(en_sentence))
# The Spanish result is encoded to UTF-8 bytes before being printed.
print(" :",preprocess_sentence(sp_sentence).encode('utf-8'))
###
def create_dataset(path, num_examples):
    """Read a tab-separated parallel corpus and return its preprocessed columns.

    Args:
        path: Text file with one example per line, columns separated by tabs.
        num_examples: Number of leading lines to keep; ``None`` keeps all.

    Returns:
        ``zip(*word_pairs)``: one tuple per column, each holding the
        preprocessed sentences of that column in file order.
    """
    # A context manager closes the file handle as soon as it has been read.
    with io.open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')
    word_pairs = [
        [preprocess_sentence(w) for w in line.split('\t')]
        for line in lines[:num_examples]
    ]
    return zip(*word_pairs)
# Preprocess the whole corpus (num_examples=None keeps every line) and show
# the last sentence of each column. Unpacking into two names requires that
# create_dataset yields exactly two columns.
en, sp = create_dataset(path_to_file, None)
print(" : ",en[-1])
print(" :",sp[-1])
def max_length(tensor):
    """Return the length of the longest sequence in *tensor*.

    Returns 0 when *tensor* holds no sequences, instead of letting ``max``
    raise ValueError on an empty iterable.
    """
    lengths = [len(t) for t in tensor]
    return max(lengths) if lengths else 0
def tokenize(lang):
    """Fit a Keras tokenizer on *lang* and return its padded id sequences.

    Args:
        lang: Iterable of preprocessed sentences (strings).

    Returns:
        ``(tensor, lang_tokenizer)``: the sentences as integer id sequences
        padded at the end (``padding='post'``), and the fitted tokenizer.
    """
    # filters='' turns off the tokenizer's character filtering so that
    # punctuation tokens produced by preprocessing are kept.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    sequences = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    return tensor, lang_tokenizer
def load_dataset(path, num_examples=None):
    """Load the corpus at *path* and tokenize both of its columns.

    The first column returned by create_dataset is treated as the target
    language and the second as the input language.

    Args:
        path: Corpus file passed on to create_dataset.
        num_examples: Number of leading lines to use; ``None`` uses all.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)``.
    """
    targ_lang, inp_lang = create_dataset(path, num_examples)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer
# Only the first 30000 corpus lines are used.
num_examples = 30000
# load_dataset returns (input ids, target ids, input tokenizer, target tokenizer).
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)
# Longest sequence on each side; evaluate() uses these to pad its input and
# to size the attention matrix.
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)
# Hold out 20% of the pairs for validation (test_size=0.2).
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
# Report the sizes of the four resulting splits.
print(" :",len(input_tensor_train), " :",len(target_tensor_train), " :",len(input_tensor_val), " :",len(target_tensor_val))
def convert(lang, tensor):
    """Print every non-zero id in *tensor* next to the word it maps to.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of integer ids; zeros are skipped.
    """
    for t in tensor:
        if t != 0:
            print("%s ---------> %s" % (fixedlen(str(t)), lang.index_word[t]))
def fixedlen(inputstr):
    """Right-align *inputstr* in a field of at least 10 characters.

    Strings shorter than 10 characters are left-padded with spaces; longer
    strings are returned unchanged.
    """
    # str.rjust pads on the left and never truncates, matching the manual
    # " " * (10 - len) + inputstr padding it replaces.
    return inputstr.rjust(10)
# Show the id -> word mapping of the first training example on the input side...
print (" , ")
convert(inp_lang, input_tensor_train[0])
print ()
# ...and on the target side.
print (" , ")
convert(targ_lang, target_tensor_train[0])
# Shuffle buffer as large as the training set.
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of full batches per epoch (integer division).
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 on top of the number of words in each tokenizer's word_index
# (presumably to leave room for padding id 0 — confirm).
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True discards a final batch smaller than BATCH_SIZE.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
# Keep one batch around for the shape checks performed further down.
example_input_batch, example_target_batch = next(iter(dataset))
# Bare expression: the two shapes are evaluated and discarded.
example_input_batch.shape, example_target_batch.shape
class Encoder(tf.keras.Model):
    """Embedding layer followed by a GRU that returns every step and its final state."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the layers.

        Args:
            vocab_size: Size of the input vocabulary for the embedding.
            embedding_dim: Width of each embedding vector.
            enc_units: Number of GRU units.
            batch_sz: Batch size used when building the zero initial state.
        """
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences / return_state: the GRU hands back both the
        # per-step outputs and its final state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed *x* and run the GRU from *hidden*; return ``(output, state)``."""
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_sz, enc_units)``."""
        return tf.zeros((self.batch_sz, self.enc_units))
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: ``score = V(tanh(W1(values) + W2(query)))``."""

    def __init__(self, units):
        """Create the two projection layers of width *units* and the 1-unit scorer."""
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Attend over *values* using *query*.

        Assumes *query* is ``(batch, hidden)`` and *values* is
        ``(batch, seq_len, hidden)``, as produced by the Encoder in this
        file — confirm for other callers.

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            *values* over axis 1, and the softmax weights over axis 1.
        """
        # Insert an axis at position 1 so the query broadcasts against every
        # step of `values` in the addition below.
        hidden_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
class Decoder(tf.keras.Model):
    """One-step decoder: attention context + embedded token -> GRU -> vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the layers.

        Args:
            vocab_size: Size of the target vocabulary (embedding input and
                width of the output layer).
            embedding_dim: Width of each embedding vector.
            dec_units: Number of GRU units, also used for the attention layer.
            batch_sz: Batch size; stored on the instance.
        """
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: Ids of the current input token, one column per batch row.
            hidden: Decoder state, used here as the attention query.
            enc_output: Encoder outputs that are attended over.

        Returns:
            ``(logits, state, attention_weights)``.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        # Prepend the context vector to the embedded token along the feature axis.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # NOTE(review): the GRU is called without initial_state, so `hidden`
        # only influences this step through attention — confirm this is intended.
        output, state = self.gru(x)
        # Flatten the leading axes so the Dense layer sees one row per step.
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights
def loss_function(real, pred):
    """Return the mean of the per-element loss with padding positions zeroed.

    Positions where *real* equals 0 are multiplied by zero before the mean is
    taken. The mean still divides by the total number of elements, zeroed
    ones included. Relies on the module-level ``loss_object``.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    # Cast the boolean mask to the loss dtype so the two can be multiplied.
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
@tf.function  # wrap the Python function so TensorFlow can trace it
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced training step on a batch and apply the gradients.

    Uses the module-level ``encoder``, ``decoder``, ``optimizer``,
    ``targ_lang`` and ``BATCH_SIZE``.

    Args:
        inp: Batch of input id sequences.
        targ: Batch of target id sequences; column 0 is expected to hold the
            ``<start>`` token.
        enc_hidden: Initial encoder state.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # Seed the decoder with the '<start>' token for every row of the
        # batch. The lookup key had been reduced to an empty string, which
        # the tokenizer never holds.
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            # Teacher forcing: the true token at step t is the next input.
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
def evaluate(sentence):
    """Greedily translate *sentence* and record the attention weights.

    Uses the module-level ``encoder``, ``decoder``, tokenizers and maximum
    lengths. Every token of the preprocessed sentence must be present in
    ``inp_lang.word_index``; an unknown token raises KeyError.

    Returns:
        ``(result, sentence, attention_plot)``: the predicted words joined by
        spaces (with a trailing space), the preprocessed input sentence, and
        a ``(max_length_targ, max_length_inp)`` array with one row of
        attention weights per decoding step.
    """
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    # Batch of one: a single zero initial state.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    # Decoding starts from the '<start>' token (the literal had been reduced
    # to an empty string).
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        # Flatten the weights into one row of the plot matrix.
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = targ_lang.index_word[predicted_id]
        result += predicted_word + ' '
        # Stop as soon as the model emits the end-of-sentence marker.
        if predicted_word == '<end>':
            return result, sentence, attention_plot
        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot
def plot_attention(attention, sentence, predicted_sentence):
    """Show the attention matrix as a heat map.

    Args:
        attention: 2-D array of weights passed to ``matshow``.
        sentence: List of input tokens used as x-axis labels.
        predicted_sentence: List of output tokens used as y-axis labels.
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    fontdict = {'fontsize': 14}
    # An empty label is prepended to each token list before it is used as
    # the tick labels.
    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    # One major tick per token on both axes.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()
def translate(sentence):
    """Translate *sentence*, print input and output, and plot the attention."""
    result, sentence, attention_plot = evaluate(sentence)
    print(' :%s' % (sentence))
    print(' :{}'.format(result))
    output_tokens = result.split(' ')
    input_tokens = sentence.split(' ')
    # Crop the matrix to the number of tokens on each side before plotting.
    attention_plot = attention_plot[:len(output_tokens), :len(input_tokens)]
    plot_attention(attention_plot, input_tokens, output_tokens)
# Build the encoder and push the example batch through it to check the output shapes.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print (' : ( , , ) {}'.format(sample_output.shape))
print (' : ( , ) {}'.format(sample_hidden.shape))
# Same shape check for a stand-alone attention layer (10 units, used only here).
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print(" :( , ) {}".format(attention_result.shape))
print(" :( , , 1) {}".format(attention_weights.shape))
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
# The dummy decoder input uses BATCH_SIZE rather than a literal 64 so it
# keeps matching sample_hidden / sample_output if the batch size changes.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),sample_hidden, sample_output)
print (' : ( , ) {}'.format(sample_decoder_output.shape))
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps the per-element losses so loss_function can mask them.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
checkpoint_dir = './machinetranslation.checkpoints'
checkpoint_prefix= os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,encoder=encoder,decoder=decoder)
# max_to_keep=1: only the most recent checkpoint is kept.
checkpointmanager= tf.train.CheckpointManager(checkpoint, directory=checkpoint_dir, checkpoint_name='ckpt', max_to_keep=1)
# Restore the latest checkpoint in checkpoint_dir when the directory is not empty.
if os.listdir(checkpoint_dir):
    print("-------------------- --------------------")
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
# State of the interactive loop that follows.
TrainYES=False  # True while training should continue without showing the menu
# NOTE(review): one-minute pause before the menu clears the screen, presumably
# so the shape report above can be read — confirm.
time.sleep(60)
STRpredicted=""  # summary of the last finished epoch, shown next to the Train entry
INTinitloss=100  # lowest average epoch loss seen so far; a checkpoint is saved when it improves
epoch=0
# Interactive console: train further epochs, translate sentences, or quit.
while True:
    # While TrainYES is set the menu is skipped and the previous command
    # (TRAIN) runs again, so the first five epochs chain automatically.
    if not TrainYES:
        os.system("clear")
        print(" : <<>> ")
        print(" Train.............. ",STRpredicted)
        print(" Translate.......... ")
        print(" Quit............... ")
        STRinput=input(" >>>>>>>> :")
        STRinput=STRinput.upper()  # commands are matched case-insensitively
    if STRinput=="TRAIN":
        STRstarttime=">>>>>>>>>> :"+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print(STRstarttime)
        start = time.time()
        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0
        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_loss += batch_loss
            if batch % 10 == 0:
                print(' :{:>4d}, :{:>10d}, :{:.10f}'.format(epoch + 1,batch,batch_loss.numpy()))
        # Save a checkpoint only when the average epoch loss improves on the
        # best value so far.
        # NOTE(review): placed after the batch loop because it compares the
        # epoch average — confirm against the original layout.
        if total_loss / steps_per_epoch < INTinitloss:
            INTinitloss=total_loss / steps_per_epoch
            checkpointmanager.save()
        ### checkpoint.save(file_prefix = checkpoint_prefix)
        # The closing quote of this label was missing; it now mirrors the
        # start-time banner above.
        STRendtime="<<<<<<<<<< :"+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print(STRendtime)
        print(' :{:>4d}, :{:.10f}, :{:.10f}\n'.format(epoch + 1, total_loss / steps_per_epoch, time.time() - start ))
        STRpredicted="..... :"+str(epoch + 1)+", :"+str(total_loss / steps_per_epoch)+", :"+str(time.time() - start)+" "
        epoch+=1
        # Keep training automatically until five epochs have run, then go
        # back to the menu after every epoch.
        TrainYES = epoch < 5
    elif STRinput=="TRANS" or STRinput=="TRANSLATE":
        # Reload the most recent checkpoint before translating.
        if os.listdir(checkpoint_dir):
            print("-------------------- --------------------")
            checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        while True:
            # \u00bf is the inverted question mark, which had been degraded to '?'.
            print(" :hace mucho frio aqui. esta es mi vida. \u00bftodavia estan en casa?")
            STRinput1=input(" (Exit ):")
            if STRinput1.upper()=="EXIT" or STRinput1.upper()=="E":
                break
            elif len(STRinput1)==0:
                # Empty input falls back to the sample sentence shown above.
                STRinput1=u'hace mucho frio aqui. esta es mi vida. \u00bftodavia estan en casa?'
            else:
                STRinput1=u"'"+STRinput1+"'"
            translate(STRinput1)
    elif STRinput=="QUIT" or STRinput=="Q":
        break
    else:
        continue