# CN-EN-Translation-BERT

迷南。 2023-07-02 05:25

# ![20191009191333910.png][]

# [日萌社][Link 1]

# [Artificial Intelligence (AI): Keras PyTorch MXNet TensorFlow PaddlePaddle deep learning in practice (updated from time to time)][AI_Keras PyTorch MXNet TensorFlow PaddlePaddle]

--------------------

# **Download the code from GitHub**

[https://github.com/dragen1860/Deep-Learning-with-TensorFlow-book][https_github.com_dragen1860_Deep-Learning-with-TensorFlow-book]

[https://github.com/dragen1860/TensorFlow-2.x-Tutorials][https_github.com_dragen1860_TensorFlow-2.x-Tutorials]

--------------------

# **attention.py**

```python
import tensorflow as tf


def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights


# ## Multi-head attention
class MultiHeadAttention(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights


def main():
    temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
    y = tf.random.uniform((1, 60, 768))  # (batch_size, encoder_sequence, 768), i.e. the BERT hidden size
    q = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
    out, attn = temp_mha(y, k=y, q=q, mask=None)
    print(out.shape, attn.shape)


if __name__ == '__main__':
    main()
```
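As a quick sanity check of `scaled_dot_product_attention` above, here is a minimal sketch (not part of the original files) that runs it on a tiny hand-built example and shows how a mask value of 1 pushes the corresponding attention weight to (almost) zero:

```python
import tensorflow as tf

from attention import scaled_dot_product_attention

# one query attending over four keys; the keys are near one-hot so the
# attention weights are easy to read off
temp_k = tf.constant([[10, 0, 0],
                      [0, 10, 0],
                      [0, 0, 10],
                      [0, 0, 10]], dtype=tf.float32)   # (4, 3)
temp_v = tf.constant([[1, 0],
                      [10, 0],
                      [100, 5],
                      [1000, 6]], dtype=tf.float32)    # (4, 2)
temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)   # (1, 3)

out, attn = scaled_dot_product_attention(temp_q, temp_k, temp_v, mask=None)
print(attn.numpy())  # weight is concentrated on the second key
print(out.numpy())   # so the output is close to [10, 0]

# mask == 1 means "do not attend here"; shape (seq_len_q, seq_len_k)
mask = tf.constant([[0, 1, 0, 0]], dtype=tf.float32)
out_m, attn_m = scaled_dot_product_attention(temp_q, temp_k, temp_v, mask)
print(attn_m.numpy())  # the masked position now receives ~0 weight
```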
--------------------

# **attlayer.py**

```python
import tensorflow as tf

from attention import MultiHeadAttention
from utils import positional_encoding


def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])


# ## Encoder Layer and Decoder Layer
class EncoderLayer(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2


# Each decoder layer consists of three sublayers:
#
# 1. Masked multi-head attention (with look-ahead mask and padding mask)
#
# 2. Multi-head attention (with padding mask). V (value) and K (key) receive the encoder
#    output as inputs. Q (query) receives the output from the masked multi-head attention
#    sublayer.
#
# 3. Point-wise feed forward network
#
# Each of these sublayers has a residual connection around it followed by a layer
# normalization. The output of each sublayer is LayerNorm(x + Sublayer(x)). The
# normalization is done on the d_model (last) axis.
class DecoderLayer(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2


def main():
    sample_encoder_layer = EncoderLayer(512, 8, 2048)
    sample_encoder_layer_output = sample_encoder_layer(
        tf.random.uniform((64, 43, 512)), False, None)
    print(sample_encoder_layer_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_decoder_layer = DecoderLayer(512, 8, 2048)
    sample_encoder_output = tf.random.uniform((64, 128, 768))
    sample_decoder_layer_output, _, _ = sample_decoder_layer(
        tf.random.uniform((64, 50, 512)), sample_encoder_output, False, None, None)
    print(sample_decoder_layer_output.shape)  # (batch_size, target_seq_len, d_model)


if __name__ == '__main__':
    main()
```
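To make the mask plumbing of `DecoderLayer` concrete, here is a small sketch (not from the original files) that passes a real look-ahead mask from `utils.create_look_ahead_mask` and prints the shapes of the two attention-weight blocks returned above:

```python
import tensorflow as tf

from attlayer import DecoderLayer
from utils import create_look_ahead_mask

sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

tar = tf.random.uniform((64, 50, 512))        # (batch_size, target_seq_len, d_model)
enc_out = tf.random.uniform((64, 128, 512))   # (batch_size, input_seq_len, d_model)
look_ahead_mask = create_look_ahead_mask(50)  # (target_seq_len, target_seq_len)

out, block1, block2 = sample_decoder_layer(
    tar, enc_out, False, look_ahead_mask, None)

print(out.shape)     # (64, 50, 512)   LayerNorm(x + Sublayer(x)) keeps d_model
print(block1.shape)  # (64, 8, 50, 50)  masked self-attention over the target
print(block2.shape)  # (64, 8, 50, 128) cross-attention over the encoder output
```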
--------------------

# **bert_train.py**

```python
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from tokenizer import get_tokenizer
from bertmodel import Transformer, Config
from utils import CustomSchedule, create_masks
from test import Translator

BUFFER_SIZE = 50000
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 128

train_dataset, val_dataset, tokenizer_en, tokenizer_zh = \
    get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE)

config = Config(num_layers=6, d_model=256, dff=1024, num_heads=8)
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1

MODEL_DIR = "chinese_L-12_H-768_A-12"
bert_config_file = os.path.join(MODEL_DIR, "bert_config.json")
bert_ckpt_file = os.path.join(MODEL_DIR, "bert_model.ckpt")

transformer = Transformer(config=config,
                          target_vocab_size=target_vocab_size,
                          bert_config_file=bert_config_file)

inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
fn_out, _ = transformer(inp, tar_inp, True,
                        enc_padding_mask=None,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(tar_inp.shape)  # (batch_size, tar_seq_len)
print(fn_out.shape)   # (batch_size, tar_seq_len, target_vocab_size)

# init bert pre-trained weights
transformer.restore_encoder(bert_ckpt_file)
transformer.summary()

learning_rate = CustomSchedule(config.d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

checkpoint_path = "./zh-en/bert"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')


@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)


# Chinese is used as the input language and English is the target language.
translator = Translator(tokenizer_zh, tokenizer_en, transformer, MAX_SEQ_LENGTH)

for epoch in range(4):
    res = translator.do('虽然继承了祖荫,但朴槿惠已经证明了自己是个机敏而老练的政治家。')

    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> chinese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 500 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 1 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                            ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
```
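The `loss_function` above averages the cross-entropy over all positions but first zeroes out the contribution of padded targets (id 0). A small standalone sketch with made-up values shows the effect:

```python
import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

# two real target tokens followed by two padding positions (id 0)
real = tf.constant([[3, 1, 0, 0]])
# logits for a 5-word vocabulary, shape (batch, seq_len, vocab)
pred = tf.random.uniform((1, 4, 5))

per_token = loss_object(real, pred)                      # (1, 4) per-position loss
mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)),
               per_token.dtype)                          # [[1., 1., 0., 0.]]

print(per_token.numpy())
print((per_token * mask).numpy())  # padded positions contribute 0 to the mean
```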
--------------------

# **bertmodel.py**

```python
import tensorflow as tf

from bert import BertModelLayer
from bert.loader import StockBertConfig, load_stock_weights
from bert.loader import map_to_stock_variable_name

from utils import positional_encoding
from attlayer import DecoderLayer


class Config(object):

    def __init__(self, num_layers, d_model, dff, num_heads):
        self.num_layers = num_layers
        self.d_model = d_model
        self.dff = dff
        self.num_heads = num_heads


def build_encoder(config_file):
    with tf.io.gfile.GFile(config_file, "r") as reader:
        stock_params = StockBertConfig.from_json_string(reader.read())
        bert_params = stock_params.to_bert_model_layer_params()

    return BertModelLayer.from_params(bert_params, name="bert")


class Decoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights


class Transformer(tf.keras.Model):

    def __init__(self, config, target_vocab_size, bert_config_file,
                 bert_training=False, rate=0.1, name='transformer'):
        super(Transformer, self).__init__(name=name)

        self.encoder = build_encoder(config_file=bert_config_file)
        self.encoder.trainable = bert_training

        self.decoder = Decoder(config.num_layers, config.d_model, config.num_heads,
                               config.dff, target_vocab_size, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def load_stock_weights(self, bert: BertModelLayer, ckpt_file):
        assert isinstance(bert, BertModelLayer), "Expecting a BertModelLayer instance as first argument"
        assert tf.compat.v1.train.checkpoint_exists(ckpt_file), "Checkpoint does not exist: {}".format(ckpt_file)
        ckpt_reader = tf.train.load_checkpoint(ckpt_file)

        bert_prefix = 'transformer/bert'

        weights = []
        for weight in bert.weights:
            stock_name = map_to_stock_variable_name(weight.name, bert_prefix)

            if ckpt_reader.has_tensor(stock_name):
                value = ckpt_reader.get_tensor(stock_name)
                weights.append(value)
            else:
                raise ValueError("No value for:[{}], i.e.:[{}] in:[{}]".format(
                    weight.name, stock_name, ckpt_file))

        bert.set_weights(weights)
        print("Done loading {} BERT weights from: {} into {} (prefix:{})".format(
            len(weights), ckpt_file, bert, bert_prefix))

    def restore_encoder(self, bert_ckpt_file):
        # loading the original pre-trained weights into the BERT layer:
        self.load_stock_weights(self.encoder, bert_ckpt_file)

    def call(self, inp, tar, training, enc_padding_mask,
             look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training=self.encoder.trainable)  # (batch_size, inp_seq_len, 768)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights
```
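One detail worth noting: the BERT encoder emits 768-dimensional vectors while the decoder runs at `config.d_model` (256 in bert_train.py). The mismatch is harmless because `MultiHeadAttention` projects `v`, `k` and `q` through `Dense(d_model)` before splitting heads. A quick shape check, not part of the original code:

```python
import tensorflow as tf

from attention import MultiHeadAttention

mha = MultiHeadAttention(d_model=256, num_heads=8)

enc_output = tf.random.uniform((2, 128, 768))  # BERT hidden states
dec_state = tf.random.uniform((2, 50, 256))    # decoder hidden states

out, attn = mha(enc_output, enc_output, dec_state, mask=None)  # (v, k, q, mask)
print(out.shape)   # (2, 50, 256)  -> projected back to the decoder width
print(attn.shape)  # (2, 8, 50, 128)
```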
--------------------

# **test.py**

```python
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from utils import create_masks


class Translator:

    def __init__(self, tokenizer_zh, tokenizer_en, model, MAX_SEQ_LENGTH):
        self.tokenizer_zh = tokenizer_zh
        self.tokenizer_en = tokenizer_en
        self.model = model
        self.MAX_SEQ_LENGTH = MAX_SEQ_LENGTH

    def encode_zh(self, zh):
        tokens_zh = self.tokenizer_zh.tokenize(zh)
        lang1 = self.tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + tokens_zh + ['[SEP]'])
        return lang1

    def evaluate(self, inp_sentence):
        # normalize input sentence
        inp_sentence = self.encode_zh(inp_sentence)
        encoder_input = tf.expand_dims(inp_sentence, 0)

        # as the target is english, the first word to the transformer should be the
        # english start token.
        decoder_input = [self.tokenizer_en.vocab_size]
        output = tf.expand_dims(decoder_input, 0)

        for i in range(self.MAX_SEQ_LENGTH):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                encoder_input, output)

            # predictions.shape == (batch_size, seq_len, vocab_size)
            predictions, attention_weights = self.model(encoder_input,
                                                        output,
                                                        False,
                                                        enc_padding_mask,
                                                        combined_mask,
                                                        dec_padding_mask)

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            # return the result if the predicted_id is equal to the end token
            if tf.equal(predicted_id, self.tokenizer_en.vocab_size + 1):
                return tf.squeeze(output, axis=0), attention_weights

            # concatenate the predicted_id to the output, which is given to the
            # decoder as its input.
            output = tf.concat([output, predicted_id], axis=-1)

        return tf.squeeze(output, axis=0), attention_weights

    def plot_attention_weights(self, attention, sentence, result, layer):
        fig = plt.figure(figsize=(16, 8))

        sentence_ids = self.encode_zh(sentence)

        attention = tf.squeeze(attention[layer], axis=0)

        for head in range(attention.shape[0]):
            ax = fig.add_subplot(2, 4, head + 1)

            # plot the attention weights
            ax.matshow(attention[head][:-1, :], cmap='viridis')

            fontdict = {'fontsize': 10, 'family': 'DFKai-SB'}

            ax.set_xticks(range(len(sentence_ids)))
            ax.set_yticks(range(len(result)))

            ax.set_ylim(len(result) - 1.5, -0.5)

            ax.set_xticklabels(
                self.tokenizer_zh.convert_ids_to_tokens(sentence_ids),
                fontdict=fontdict, rotation=90)

            ax.set_yticklabels([self.tokenizer_en.decode([i]) for i in result
                                if i < self.tokenizer_en.vocab_size],
                               fontdict=fontdict)

            ax.set_xlabel('Head {}'.format(head + 1))

        plt.tight_layout()
        plt.show()

    def do(self, sentence, plot=''):
        result, attention_weights = self.evaluate(sentence)

        predicted_sentence = self.tokenizer_en.decode([i for i in result
                                                       if i < self.tokenizer_en.vocab_size])

        print('Chinese src: {}'.format(sentence))
        print('Translated : {}'.format(predicted_sentence))

        if plot:
            self.plot_attention_weights(attention_weights, sentence, result, plot)


def main():
    # NOTE: the snippets below are leftovers from the original notebook; they assume
    # objects such as `transformer`, `config`, `tokenizer_zh`, `translate`, `inp` and
    # `tar_inp` created elsewhere (see bert_train.py), so main() is not runnable as a
    # standalone script.
    sentence_ids = encode_zh("我爱你啊")
    print(tokenizer_zh.convert_ids_to_tokens(sentence_ids))

    translate(transformer,
              '虽然继承了祖荫,但朴槿惠已经证明了自己是个机敏而老练的政治家——她历经20年才爬上韩国大国家党最高领导层并成为全国知名人物。')
    print(
        'Real translation: While Park derives some of her power from her family pedigree, she has proven to be an astute and seasoned politician – one who climbed the Grand National Party’s leadership ladder over the last two decades to emerge as a national figure.')

    translate(transformer, "我爱你是一件幸福的事情。")

    # ## Save weights
    transformer.save_weights('bert_nmt_ckpt')

    new_transformer = Transformer(config=config,
                                  target_vocab_size=target_vocab_size,
                                  bert_config_file=bert_config_file)

    fn_out, _ = new_transformer(inp, tar_inp, True,
                                enc_padding_mask=None,
                                look_ahead_mask=None,
                                dec_padding_mask=None)

    new_transformer.load_weights('bert_nmt_ckpt')

    translate(new_transformer, '我爱你')


if __name__ == '__main__':
    main()
```
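Usage-wise, `Translator.do` can also render the cross-attention maps collected by the decoder: the `plot` argument is the key of one attention block as named in `Decoder.call`, e.g. `'decoder_layer6_block2'` for the last of the 6 decoder layers configured in bert_train.py. A hedged example, assuming the tokenizers and the trained `transformer` from bert_train.py are in scope:

```python
from test import Translator

# assumes tokenizer_zh, tokenizer_en, transformer and MAX_SEQ_LENGTH from bert_train.py
translator = Translator(tokenizer_zh, tokenizer_en, transformer, MAX_SEQ_LENGTH)

# translate and plot the cross-attention of the last decoder layer
translator.do('今天天气不错额。', plot='decoder_layer6_block2')
```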
--------------------

# **tokenizer.py**

```python
import tensorflow as tf
import tensorflow_datasets as tfds
import collections
import unicodedata
import os, sys
import numpy as np


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output


class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)
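# Illustrative sketch (not part of the original file): for Chinese text the basic
# tokenizer splits every CJK character, so
#   FullTokenizer('chinese_L-12_H-768_A-12/vocab.txt').tokenize('今天天气不错额。')
# yields single-character pieces such as ['今', '天', '天', '气', '不', '错', '额', '。'],
# which convert_tokens_to_ids() then maps to BERT vocabulary ids.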
class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """

        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
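# Worked example (a sketch, not part of the original file) of the greedy
# longest-match-first WordPiece algorithm above, on a toy vocabulary:
#   vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
#   WordpieceTokenizer(vocab).tokenize("unaffable")   -> ["un", "##aff", "##able"]
#   WordpieceTokenizer(vocab).tokenize("unknownword") -> ["[UNK]"]
# The second word cannot be fully covered by vocabulary pieces, so the whole
# token falls back to the unknown token.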
def get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE):
    # ## Setup input pipeline
    # Use TFDS to load the wmt2019 zh-en translation dataset.
    if not os.path.exists('chinese_L-12_H-768_A-12'):
        # get_ipython().system('wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip')
        # get_ipython().system('unzip chinese_L-12_H-768_A-12')
        print('download pretrained first!')
        sys.exit()

    config = tfds.translate.wmt.WmtConfig(
        description="WMT 2019 translation task dataset.",
        version="0.0.3",
        language_pair=("zh", "en"),
        subsets={
            tfds.Split.TRAIN: ["newscommentary_v13"],
            tfds.Split.VALIDATION: ["newsdev2017"],
        }
    )

    builder = tfds.builder("wmt_translate", config=config)
    print(builder.info.splits)
    builder.download_and_prepare()
    datasets = builder.as_dataset(as_supervised=True)
    print('datasets is {}'.format(datasets))

    train_examples = datasets['train']
    val_examples = datasets['validation']

    for zh, en in train_examples.take(1):
        # print((zh))
        print(tf.compat.as_text(zh.numpy()))
        print(tf.compat.as_text(en.numpy()))

    # Create a custom subwords tokenizer from the training dataset for the decoder.
    vocab_file = 'vocab_en'
    if os.path.isfile(vocab_file + '.subwords'):
        tokenizer_en = tfds.features.text.SubwordTextEncoder.load_from_file(vocab_file)
    else:
        tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            (en.numpy() for zh, en in train_examples), target_vocab_size=2 ** 13)
        tokenizer_en.save_to_file('vocab_en')

    sample_string = 'Transformer is awesome.'
    tokenized_string = tokenizer_en.encode(sample_string)
    for ts in tokenized_string:
        print('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

    # The encoder uses the BERT tokenizer.
    tokenizer_zh = FullTokenizer(
        vocab_file='chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True)

    test_tokens = tokenizer_zh.tokenize('今天天气不错额。')
    test_ids = tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + test_tokens + ['[SEP]'])
    print('tokens:', test_tokens)
    print('ids:', test_ids)
    print('convert_ids_to_tokens:', tokenizer_zh.convert_ids_to_tokens(test_ids))

    def encode(zh, en, seq_length=MAX_SEQ_LENGTH):
        tokens_zh = tokenizer_zh.tokenize(tf.compat.as_text(zh.numpy()))
        lang1 = tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + tokens_zh + ['[SEP]'])
        if len(lang1) < seq_length:
            lang1 = lang1 + list(np.zeros(seq_length - len(lang1), 'int32'))

        # insert SOS and EOS
        lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
            tf.compat.as_text(en.numpy())) + [tokenizer_en.vocab_size + 1]
        if len(lang2) < seq_length:
            lang2 = lang2 + list(np.zeros(seq_length - len(lang2), 'int32'))
        return lang1, lang2

    def filter_max_length(x, y, max_length=MAX_SEQ_LENGTH):
        return tf.logical_and(tf.size(x) <= max_length,
                              tf.size(y) <= max_length)

    train_dataset = train_examples.map(
        lambda zh, en: tf.py_function(encode, [zh, en], [tf.int32, tf.int32]))
    train_dataset = train_dataset.filter(filter_max_length)
    # cache the dataset to memory to get a speedup while reading from it.
    train_dataset = train_dataset.cache()
    train_dataset = train_dataset.shuffle(20000).padded_batch(
        BATCH_SIZE, padded_shapes=([-1], [-1]), drop_remainder=True)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    val_dataset = val_examples.map(
        lambda zh, en: tf.py_function(encode, [zh, en], [tf.int32, tf.int32]))
    val_dataset = val_dataset.filter(filter_max_length)
    val_dataset = val_dataset.padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))

    return train_dataset, val_dataset, tokenizer_en, tokenizer_zh


if __name__ == '__main__':
    get_tokenizer(100, 64)
```

--------------------

# **transformer.py**

```python
import tensorflow as tf
import numpy as np

from utils import positional_encoding
from attlayer import EncoderLayer, DecoderLayer


class Encoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(input_vocab_size, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)


"""### Decoder

The `Decoder` consists of:
1. Output Embedding
2. Positional Encoding
3. N decoder layers

The target is put through an embedding which is summed with the positional encoding.
The output of this summation is the input to the decoder layers. The output of the
decoder is the input to the final linear layer.
"""


class Decoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights


"""## Create the Transformer

The Transformer consists of the encoder, the decoder and a final linear layer.
The output of the decoder is the input to the linear layer and its output is returned.
"""


class Transformer(tf.keras.Model):

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights


if __name__ == '__main__':
    sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8,
                             dff=2048, input_vocab_size=8500)
    sample_encoder_output = sample_encoder(tf.random.uniform((64, 62)),
                                           training=False, mask=None)
    print(sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8,
                             dff=2048, target_vocab_size=8000)
    output, attn = sample_decoder(tf.random.uniform((64, 26)),
                                  enc_output=sample_encoder_output,
                                  training=False,
                                  look_ahead_mask=None,
                                  padding_mask=None)
    print(output.shape, attn['decoder_layer2_block2'].shape)

    sample_transformer = Transformer(
        num_layers=2, d_model=512, num_heads=8, dff=2048,
        input_vocab_size=8500, target_vocab_size=8000)

    temp_input = tf.random.uniform((64, 62))
    temp_target = tf.random.uniform((64, 26))

    fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
                                   enc_padding_mask=None,
                                   look_ahead_mask=None,
                                   dec_padding_mask=None)
    print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size)
```

--------------------

# **transformer_train.py**

```python
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from tokenizer import get_tokenizer
from transformer import Transformer
from utils import CustomSchedule, create_masks
from test import Translator

BUFFER_SIZE = 20000
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 128

train_dataset, val_dataset, tokenizer_en, tokenizer_zh = \
    get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE)

# Chinese -> English translation
input_vocab_size = 21128
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1
num_layers = 4
d_model = 512
dff = 2048
num_heads = 8

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
fn_out, _ = transformer(inp, tar_inp, True,
                        enc_padding_mask=None,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(tar_inp.shape)  # (batch_size, tar_seq_len)
print(fn_out.shape)   # (batch_size, tar_seq_len, target_vocab_size)

transformer.summary()

learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

checkpoint_path = "./zh-en/transformer"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')


@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)


# Chinese is used as the input language and English is the target language.
translator = Translator(tokenizer_zh, tokenizer_en, transformer, MAX_SEQ_LENGTH)

for epoch in range(20):
    # sample one validation pair and show the current model's translation
    (cn_code, en_code) = next(iter(val_dataset))
    cn_code, en_code = cn_code[epoch].numpy(), en_code[epoch].numpy()
    en = tokenizer_en.decode([i for i in en_code if i < tokenizer_en.vocab_size])
    # drop special and padding token ids before converting back to Chinese text
    cn_code = [int(i) for i in cn_code if (i != 101 and i != 102 and i != 1 and i != 0)]
    cn = tokenizer_zh.convert_ids_to_tokens(cn_code)
    cn = "".join(cn)
    translator.do(cn)
    print('Real:', en)
    print('\n')

    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> chinese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 50 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 3 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                            ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
```
--------------------

# **utils.py**

```python
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    sines = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    cosines = np.cos(angle_rads[:, 1::2])

    pos_encoding = np.concatenate([sines, cosines], axis=-1)

    pos_encoding = pos_encoding[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


# ## Masking
# Mask all the pad tokens in the batch of sequences. This ensures that the model
# does not treat padding as input. The mask indicates where the pad value 0 is
# present: it outputs a 1 at those locations, and a 0 otherwise.
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # add extra dimensions so that we can add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


# The look-ahead mask is used to mask the future tokens in a sequence. In other
# words, the mask indicates which entries should not be used.
#
# This means that to predict the third word, only the first and second word will
# be used. Similarly, to predict the fourth word, only the first, second and
# third word will be used, and so on.
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)


def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask


# def create_masks2(inp, tar):
#     # Used in the 2nd attention block in the decoder.
#     # This padding mask is used to mask the encoder outputs.
#     dec_padding_mask = create_padding_mask(inp)
#
#     # Used in the 1st attention block in the decoder.
#     # It is used to pad and mask future tokens in the input received by
#     # the decoder.
#     look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
#     dec_target_padding_mask = create_padding_mask(tar)
#     combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
#
#     return combined_mask, dec_padding_mask


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


def main():
    # plot the warmup schedule (d_model=512, as used in transformer_train.py)
    temp_learning_rate_schedule = CustomSchedule(512)

    plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
    plt.ylabel("Learning Rate")
    plt.xlabel("Train Step")
    plt.show()


if __name__ == '__main__':
    main()
```
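Two small worked examples for the pieces above (a sketch, not part of utils.py): the look-ahead and padding masks for a length-4 sequence, and the warmup formula implemented by `CustomSchedule`, lrate = d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5)).

```python
import tensorflow as tf

from utils import create_look_ahead_mask, create_padding_mask, CustomSchedule

print(create_look_ahead_mask(4).numpy())
# [[0. 1. 1. 1.]
#  [0. 0. 1. 1.]
#  [0. 0. 0. 1.]
#  [0. 0. 0. 0.]]   1 = "blocked": position i may only attend to positions <= i

print(create_padding_mask(tf.constant([[7, 6, 0, 0]])).numpy())
# [[[[0. 0. 1. 1.]]]]  shape (1, 1, 1, 4); 1 marks the zero-padded positions

schedule = CustomSchedule(d_model=512, warmup_steps=4000)
print(schedule(tf.constant(1.0)).numpy())     # tiny learning rate at the start of warmup
print(schedule(tf.constant(4000.0)).numpy())  # peak around step == warmup_steps
```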
[20191009191333910.png]: https://img-blog.csdnimg.cn/20191009191333910.png
[Link 1]: http://www.rimengshe.com/
[AI_Keras PyTorch MXNet TensorFlow PaddlePaddle]: https://blog.csdn.net/zimiao552147572/article/details/88867161
[https_github.com_dragen1860_Deep-Learning-with-TensorFlow-book]: https://github.com/dragen1860/Deep-Learning-with-TensorFlow-book
[https_github.com_dragen1860_TensorFlow-2.x-Tutorials]: https://github.com/dragen1860/TensorFlow-2.x-Tutorials