Spanish

92065 words

# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals   # Python 2/3 compatibility
from sklearn.model_selection import train_test_split
import tensorflow.compat.v1 as tf
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import numpy as np
import unicodedata
import re   # regular expressions
###   Regular-expression quick reference:
###   .       any character except a newline
###   \w      a word character (letter, digit, or underscore)
###   \s      a whitespace character
###   \d      a digit
###   \b      a word boundary
###   ^       matches at the start of the string
###   $       matches at the end of the string
###   *       zero or more of the preceding item (greedy)
###   +       one or more of the preceding item (greedy)
###   ?       zero or one of the preceding item
###   {n}     exactly n repetitions
###   {n,}    at least n repetitions
###   {n,m}   between n and m repetitions
###
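### A few illustrative calls (the sample strings here are chosen only for demonstration):
###   re.findall(r"\d{2,3}", "a4bc455d23")   -> ['455', '23']
###   re.findall(r"^a\w+", "abc abc")        -> ['abc']    # anchored at the start of the string
###   re.findall(r"\w+$", "abc abc!")        -> []         # '!' at the end is not a word character
###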
### str="  s   s      s  f   re   "
### print(re.findall("^  ",str)) #                        ,     ,    list
### ['  ']
###
### str="  s   s      s  f   re   "
### print(re.findall("[^a-z]",str)) #  ,          ,    list
### [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
### 
### str="  s   s      s  f   re   "
### print(re.findall(" $",str)) #              ,     ,    list
### [' ']
### 
### str="  s   s      s  f   re   "
### print(re.findall(" *",str)) #            0    ,    list
### print(re.findall("  *",str)) #            0    ,    list
### ['', '', '', '', ' ', '', '', '', '', '', '', '', '', '', '', '', '', '', '  ', '', '', '   ', '']
### ['  ', '   ']
### 
### str="  s   s      s  f   re   "
### print(re.findall(" +",str)) #            1    ,    list
### print(re.findall("  +",str)) #             1    ,    list
### [' ', '  ', '   ']
### ['  ', '   ']
###
### str="  s   s      s  f   re   "
### print(re.findall(" ?",str)) #            0  1 ,    list
### print(re.findall("  ?",str)) #            0  1 ,    list
### ['', '', '', '', ' ', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ', ' ', '', '', ' ', ' ', ' ', '']
### ['  ', '  ']
### 
### str="  s   s      s  f   re   "
### print(re.findall(" {2}",str)) #       2 ,    list
### print(re.findall("  {1,2}",str)) #       1-2 ,    list
### ['  ', '  ']
### ['  ', '   ']
### 
### str="  s   s      s  f   re   "
### print(re.findall("  [s,f]  ",str)) #     ,    []       ,    list
### ['  s  ', '  f  ']
### 
### str="  s   s    4  s  3f   re  2 "
### print(re.findall("\d",str)) #          ,    list
### ['4', '3', '2']
### 
### str="  s   s    455  s  3f   re  2 "
### print(re.findall("\d+",str)) #             ,    list
### ['455', '3', '2']
### 
### str="  s   s    455  s  3f   re  2 "
### print(re.findall("\D",str)) #         ,    list
### [' ', ' ', 's', ' ', ' ', ' ', 's', ' ', ' ', ' ', ' ', ' ', ' ', 's', ' ', ' ', 'f', ' ', ' ', ' ', 'r', 'e', ' ', ' ', ' ']
### 
### str="  s   s       
\t \f \v455 s 3f re 2 "
### print(re.findall("\s",str)) # (\t
\r\f\v), list
### [' ', ' ', ' ', '
', ' ', '\t', ' ', '\x0c', ' ', '\x0b']
### ### str=" s s
\t \f \v455 "
### print(re.findall("\S",str)) # (\t
\r\f\v), list
### [' ', ' ', 's', ' ', ' ', ' ', 's', ' ', ' ', ' ', ' ', '4', '5', '5', ' '] ### ### str=" s s _ S -455 " ### print(re.findall("\w",str)) # , , , , list ### [' ', ' ', 's', ' ', ' ', ' ', 's', ' ', '_', ' ', 'S', ' ', ' ', '4', '5', '5', ' '] ### ### str=" s s _ S -455 " ### print(re.findall("\W",str)) # , , , , list ### [' ', ' ', '-'] ### ### str="a3a3ddd" ### print(re.search("(a3)+",str).group()) # a3 ### a3a3 ### ### str="a3 a3d dd" ### print(re.findall(r" | +",str)) # | ### [' ', ' '] ### ### str="hello egon bcd egon lge egon acd 19" ### r=re.match("h\w+",str) #match, , , None, , , ### print(r.group()) # , ### print(r.groups()) # , ### print(r.groupdict()) # , key ### hello ### () ### {} ### ### r2=re.match("h(\w+)",str) #match, , , None ### print(r2.group()) ### print(r2.groups()) ### print(r2.groupdict()) ### hello ### ('ello',) ### {} ### ### r3=re.match("(?Ph)(?P\w+)",str) #?P<> key( ),<> key , ### print(r3.group()) ### print(r3.groups()) ### print(r3.groupdict()) ### hello ### ('h', 'ello') ### {'n1': 'h', 'n2': 'ello'} ### ### str="hello egon bcd egon lge egon acd 19" ### r=re.search("h\w+",str) #match, , , None, , , ### print(r.group()) # , ### print(r.groups()) # , ### print(r.groupdict()) # , key ### hello ### () ### {} ### ### r2=re.search("h(\w+)",str) #match, , , None ### print(r2.group()) ### print(r2.groups()) ### print(r2.groupdict()) ### hello ### ('ello',) ### {} ### ### r3=re.search("(?Ph)(?P\w+)",str) #?P<> key( ),<> key , ### print(r3.group()) ### print(r3.groups()) ### print(r3.groupdict()) ### hello ### ('h', 'ello') ### {'n1': 'h', 'n2': 'ello'} ### ### r=re.findall("\d+\w\d+","a2b3c4d5") # , , ### print(r) ### ['2b3', '4d5'] # , , 3c4 ### ### r=re.findall("","a2b3c4d5") # , , ### print(r) ### ['', '', '', '', '', '', '', '', ''] # , , , 8 , 9 ### ### r=re.findall("(ca)*","ca2b3caa4d5") # , , ### print(r) ### ['ca', '', '', '', 'ca', '', '', '', '', '']# * ### ### r=re.findall("a\w+","ca2b3 caa4d5") # , , ### print(r) ### ['a2b3', 'aa4d5']# , ### ### r=re.findall("a(\w+)","ca2b3 caa4d5") # : , ### print(r) ### ['2b3', 'a4d5']# ### ### r=re.findall("(a)(\w+)","ca2b3 caa4d5") # : , , ### print(r) ### [('a', '2b3'), ('a', 'a4d5')]# ### ### r=re.findall("(a)(\w+(b))","ca2b3 caa4b5") # : , , , , , , ### print(r) ### [('a', '2b', 'b'), ('a', 'a4b', 'b')]# ### ### r=re.findall("a(?:\w+)","a2b3 a4b5 edd") #?: , , , ?: findall() ### print(r) ### ['a2b3', 'a4b5'] ### ### r=re.split("a\w","sdfadfdfadsfsfafsff") ### print(r) ### r2=re.split("a\w","sdfadfdfadsfsfafsff",maxsplit=2) ### print(r2) ### ['sdf', 'fdf', 'sfsf', 'sff'] ### ['sdf', 'fdf', 'sfsfafsff'] ### ### r=re.sub("a\w"," ","sdfadfdfadsfsfafsff") ### print(r) ### sdf fdf sfsf sff ### ### a,b=re.subn("a\w"," ","sdfadfdfadsfsfafsff") # , , ### print(a) # ### print(b) # ### sdf fdf sfsf sff import os import io import time tf.disable_v2_behavior() tf.enable_eager_execution() ### path_to_zip = tf.keras.utils.get_file('spa-eng.zip', origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',extract=True) path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt" ### def unicode_to_ascii(s): return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn') ### normalize() ,NFC , NFD 。 ### s1 = 'Spicy Jalape\u00f1o' ### s2 = 'Spicy Jalapen\u0303o' ### s1 ### 'Spicy Jalape?o' ### s2 ### 'Spicy Jalape?o' ### s1 == s2 ### False ### len(s1) ### 14 ### len(s2) ### 15 ### t1 = unicodedata.normalize('NFC', s1) ### t2 = unicodedata.normalize('NFC', s2) ### t1 == t2 ### 
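### As a quick illustration (sample words chosen only for demonstration), unicode_to_ascii()
### drops the combining accents that NFD splits off:
###   unicode_to_ascii(u'Jalapeño')  -> 'Jalapeno'
###   unicode_to_ascii(u'¿Puedo?')   -> '¿Puedo?'   # '¿' is punctuation, not a combining mark, so it survives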
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" \1 ", w)       # put a space between each word and its punctuation
    w = re.sub(r'[" "]+', " ", w)              # collapse repeated spaces
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)     # replace everything except letters and ".", "?", "!", ","
    w = w.rstrip().strip()                     # drop leading and trailing whitespace
    w = '<start> ' + w + ' <end>'              # add the start and end tokens
    return w

en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print("English: ", preprocess_sentence(en_sentence))
print("Spanish: ", preprocess_sentence(sp_sentence).encode('utf-8'))

### Read the file, clean every sentence, and return (English, Spanish) sentence tuples.
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]
    return zip(*word_pairs)

en, sp = create_dataset(path_to_file, None)
print("English sample: ", en[-1])
print("Spanish sample: ", sp[-1])

def max_length(tensor):
    return max(len(t) for t in tensor)

### Map every sentence to a padded sequence of word indices.
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

def load_dataset(path, num_examples=None):
    targ_lang, inp_lang = create_dataset(path, num_examples)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
print("training inputs:", len(input_tensor_train),
      " training targets:", len(target_tensor_train),
      " validation inputs:", len(input_tensor_val),
      " validation targets:", len(target_tensor_val))

def convert(lang, tensor):
    for t in tensor:
        if t != 0:
            print("%s ---------> %s" % (fixedlen(str(t)), lang.index_word[t]))

def fixedlen(inputstr):
    if len(inputstr) < 10:
        inputstr = " " * (10 - len(inputstr)) + inputstr
    return inputstr

print("Input language: index to word mapping")
convert(inp_lang, input_tensor_train[0])
print()
print("Target language: index to word mapping")
convert(targ_lang, target_tensor_train[0])

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
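### The layer above is Bahdanau-style additive attention; in summary (shapes inferred
### from the code, with query = decoder hidden state and values = encoder outputs):
###   score             = V( tanh( W1*values + W2*query ) )        # (batch, seq_len, 1)
###   attention_weights = softmax(score, axis=1)                   # (batch, seq_len, 1)
###   context_vector    = sum over t of attention_weights[t] * values[t]   # (batch, enc_units)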
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)   # attention over the encoder outputs

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))   # ignore the padding positions
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

@tf.function   # compile the Python function into a TensorFlow graph
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)   # teacher forcing: feed the true target token
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)   # feed the prediction back into the decoder
    return result, sentence, attention_plot

def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    fontdict = {'fontsize': 14}
    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()

def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))
    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))

# restoring the latest checkpoint in checkpoint_dir
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch size, sequence length, 1) {}".format(attention_weights.shape))

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((64, 1)), sample_hidden, sample_output)
print('Decoder output shape: (batch size, vocab size) {}'.format(sample_decoder_output.shape))

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

checkpoint_dir = './machinetranslation.checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
checkpointmanager = tf.train.CheckpointManager(checkpoint, directory=checkpoint_dir, checkpoint_name='ckpt', max_to_keep=1)
if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):   # guard against a missing directory on the first run
    print("-------------------- restoring the saved model --------------------")
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

TrainYES = False
time.sleep(60)
STRpredicted = ""
INTinitloss = 100
epoch = 0
while True:
    if not TrainYES:
        os.system("clear")
        print("Menu: <<Spanish-to-English machine translation>>")
        print("  Train.............. ", STRpredicted)
        print("  Translate.......... ")
        print("  Quit............... ")
        STRinput = input(" >>>>>>>> enter a command:")
        STRinput = STRinput.upper()   # normalize the command
    if STRinput == "TRAIN":
        STRstarttime = ">>>>>>>>>> training started: " + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print(STRstarttime)
        start = time.time()
        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0
        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_loss += batch_loss
            if batch % 10 == 0:
                print('epoch:{:>4d}, batch:{:>10d}, batch loss:{:.10f}'.format(epoch + 1, batch, batch_loss.numpy()))
        if total_loss / steps_per_epoch < INTinitloss:
            INTinitloss = total_loss / steps_per_epoch
            checkpointmanager.save()
            ### checkpoint.save(file_prefix=checkpoint_prefix)
        STRendtime = "<<<<<<<<<< training finished: " + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print(STRendtime)
        print('epoch:{:>4d}, loss:{:.10f}, seconds taken:{:.10f}\n'.format(epoch + 1, total_loss / steps_per_epoch, time.time() - start))
        STRpredicted = "..... epoch:" + str(epoch + 1) + ", loss:" + str(total_loss / steps_per_epoch) + ", seconds:" + str(time.time() - start) + " "
        epoch += 1
        if epoch < 5:
            TrainYES = True
        else:
            TrainYES = False
    elif STRinput == "TRANS" or STRinput == "TRANSLATE":
        if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
            print("-------------------- restoring the saved model --------------------")
            checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        while True:
            print("Example: hace mucho frio aqui. esta es mi vida. ¿todavia estan en casa?")
            STRinput1 = input("Enter a Spanish sentence (Exit to return):")
            if STRinput1.upper() == "EXIT" or STRinput1.upper() == "E":
                break
            elif len(STRinput1) == 0:
                STRinput1 = u'hace mucho frio aqui. esta es mi vida. ¿todavia estan en casa?'
            else:
                STRinput1 = u"'" + STRinput1 + "'"
            translate(STRinput1)
    elif STRinput == "QUIT" or STRinput == "Q":
        break
    else:
        continue
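### Once the TRAIN branch has saved a checkpoint, the TRANSLATE branch restores it and
### calls translate() on the entered sentence; the same call can also be made directly,
### for example (illustrative input sentence):
###   translate(u'hace mucho frio aqui.')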