# CN-EN-Translation-BERT

迷南。 2023-07-02 05:25

# ![20191009191333910.png][]

# [日萌社][Link 1]

# [Artificial Intelligence (AI): Keras PyTorch MXNet TensorFlow PaddlePaddle deep learning in practice (updated from time to time)][AI_Keras PyTorch MXNet TensorFlow PaddlePaddle]

--------------------

# **Download the code from GitHub**

[https://github.com/dragen1860/Deep-Learning-with-TensorFlow-book][https_github.com_dragen1860_Deep-Learning-with-TensorFlow-book]

[https://github.com/dragen1860/TensorFlow-2.x-Tutorials][https_github.com_dragen1860_TensorFlow-2.x-Tutorials]

--------------------

# **attention.py**

```python
import tensorflow as tf


def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights


# ## Multi-head attention
class MultiHeadAttention(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights


def main():
    temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
    y = tf.random.uniform((1, 60, 768))  # (batch_size, encoder_sequence, 768), i.e. the BERT hidden size
    q = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
    out, attn = temp_mha(y, k=y, q=q, mask=None)
    print(out.shape, attn.shape)


if __name__ == '__main__':
    main()
```
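As a quick sanity check of `scaled_dot_product_attention` above, here is a minimal sketch (not part of the original files) that runs it on a tiny hand-built example and shows how a mask value of 1 pushes the corresponding attention weight to (almost) zero:

```python
import tensorflow as tf

from attention import scaled_dot_product_attention

# one query attending over four keys; the keys are near one-hot so the
# attention weights are easy to read off
temp_k = tf.constant([[10, 0, 0],
                      [0, 10, 0],
                      [0, 0, 10],
                      [0, 0, 10]], dtype=tf.float32)   # (4, 3)
temp_v = tf.constant([[1, 0],
                      [10, 0],
                      [100, 5],
                      [1000, 6]], dtype=tf.float32)    # (4, 2)
temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)   # (1, 3)

out, attn = scaled_dot_product_attention(temp_q, temp_k, temp_v, mask=None)
print(attn.numpy())  # weight is concentrated on the second key
print(out.numpy())   # so the output is close to [10, 0]

# mask == 1 means "do not attend here"; shape (seq_len_q, seq_len_k)
mask = tf.constant([[0, 1, 0, 0]], dtype=tf.float32)
out_m, attn_m = scaled_dot_product_attention(temp_q, temp_k, temp_v, mask)
print(attn_m.numpy())  # the masked position now receives ~0 weight
```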
--------------------

# **attlayer.py**

```python
import tensorflow as tf

from attention import MultiHeadAttention
from utils import positional_encoding


def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])


# ## Encoder Layer and Decoder Layer
class EncoderLayer(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2


# Each decoder layer consists of three sublayers:
#
# 1. Masked multi-head attention (with look-ahead mask and padding mask)
#
# 2. Multi-head attention (with padding mask). V (value) and K (key) receive the encoder
#    output as inputs. Q (query) receives the output from the masked multi-head attention
#    sublayer.
#
# 3. Point-wise feed forward network
#
# Each of these sublayers has a residual connection around it followed by a layer
# normalization. The output of each sublayer is LayerNorm(x + Sublayer(x)). The
# normalization is done on the d_model (last) axis.
class DecoderLayer(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2


def main():
    sample_encoder_layer = EncoderLayer(512, 8, 2048)
    sample_encoder_layer_output = sample_encoder_layer(
        tf.random.uniform((64, 43, 512)), False, None)
    print(sample_encoder_layer_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_decoder_layer = DecoderLayer(512, 8, 2048)
    sample_encoder_output = tf.random.uniform((64, 128, 768))
    sample_decoder_layer_output, _, _ = sample_decoder_layer(
        tf.random.uniform((64, 50, 512)), sample_encoder_output, False, None, None)
    print(sample_decoder_layer_output.shape)  # (batch_size, target_seq_len, d_model)


if __name__ == '__main__':
    main()
```
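To make the mask plumbing of `DecoderLayer` concrete, here is a small sketch (not from the original files) that passes a real look-ahead mask from `utils.create_look_ahead_mask` and prints the shapes of the two attention-weight blocks returned above:

```python
import tensorflow as tf

from attlayer import DecoderLayer
from utils import create_look_ahead_mask

sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

tar = tf.random.uniform((64, 50, 512))        # (batch_size, target_seq_len, d_model)
enc_out = tf.random.uniform((64, 128, 512))   # (batch_size, input_seq_len, d_model)
look_ahead_mask = create_look_ahead_mask(50)  # (target_seq_len, target_seq_len)

out, block1, block2 = sample_decoder_layer(
    tar, enc_out, False, look_ahead_mask, None)

print(out.shape)     # (64, 50, 512)   LayerNorm(x + Sublayer(x)) keeps d_model
print(block1.shape)  # (64, 8, 50, 50)  masked self-attention over the target
print(block2.shape)  # (64, 8, 50, 128) cross-attention over the encoder output
```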
--------------------

# **bert_train.py**

```python
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from tokenizer import get_tokenizer
from bertmodel import Transformer, Config
from utils import CustomSchedule, create_masks
from test import Translator

BUFFER_SIZE = 50000
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 128

train_dataset, val_dataset, tokenizer_en, tokenizer_zh = \
    get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE)

config = Config(num_layers=6, d_model=256, dff=1024, num_heads=8)
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1

MODEL_DIR = "chinese_L-12_H-768_A-12"
bert_config_file = os.path.join(MODEL_DIR, "bert_config.json")
bert_ckpt_file = os.path.join(MODEL_DIR, "bert_model.ckpt")

transformer = Transformer(config=config,
                          target_vocab_size=target_vocab_size,
                          bert_config_file=bert_config_file)

inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
fn_out, _ = transformer(inp, tar_inp, True,
                        enc_padding_mask=None,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(tar_inp.shape)  # (batch_size, tar_seq_len)
print(fn_out.shape)   # (batch_size, tar_seq_len, target_vocab_size)

# init bert pre-trained weights
transformer.restore_encoder(bert_ckpt_file)
transformer.summary()

learning_rate = CustomSchedule(config.d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

checkpoint_path = "./zh-en/bert"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')


@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)


# Chinese is used as the input language and English is the target language.
translator = Translator(tokenizer_zh, tokenizer_en, transformer, MAX_SEQ_LENGTH)

for epoch in range(4):
    res = translator.do('虽然继承了祖荫,但朴槿惠已经证明了自己是个机敏而老练的政治家。')

    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> chinese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 500 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 1 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                            ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
```
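The `loss_function` above averages the cross-entropy over all positions but first zeroes out the contribution of padded targets (id 0). A small standalone sketch with made-up values shows the effect:

```python
import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

# two real target tokens followed by two padding positions (id 0)
real = tf.constant([[3, 1, 0, 0]])
# logits for a 5-word vocabulary, shape (batch, seq_len, vocab)
pred = tf.random.uniform((1, 4, 5))

per_token = loss_object(real, pred)                      # (1, 4) per-position loss
mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)),
               per_token.dtype)                          # [[1., 1., 0., 0.]]

print(per_token.numpy())
print((per_token * mask).numpy())  # padded positions contribute 0 to the mean
```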
--------------------

# **bertmodel.py**

```python
import tensorflow as tf

from bert import BertModelLayer
from bert.loader import StockBertConfig, load_stock_weights
from bert.loader import map_to_stock_variable_name

from utils import positional_encoding
from attlayer import DecoderLayer


class Config(object):

    def __init__(self, num_layers, d_model, dff, num_heads):
        self.num_layers = num_layers
        self.d_model = d_model
        self.dff = dff
        self.num_heads = num_heads


def build_encoder(config_file):
    with tf.io.gfile.GFile(config_file, "r") as reader:
        stock_params = StockBertConfig.from_json_string(reader.read())
        bert_params = stock_params.to_bert_model_layer_params()

    return BertModelLayer.from_params(bert_params, name="bert")


class Decoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights


class Transformer(tf.keras.Model):

    def __init__(self, config, target_vocab_size, bert_config_file,
                 bert_training=False, rate=0.1, name='transformer'):
        super(Transformer, self).__init__(name=name)

        self.encoder = build_encoder(config_file=bert_config_file)
        self.encoder.trainable = bert_training

        self.decoder = Decoder(config.num_layers, config.d_model, config.num_heads,
                               config.dff, target_vocab_size, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def load_stock_weights(self, bert: BertModelLayer, ckpt_file):
        assert isinstance(bert, BertModelLayer), "Expecting a BertModelLayer instance as first argument"
        assert tf.compat.v1.train.checkpoint_exists(ckpt_file), "Checkpoint does not exist: {}".format(ckpt_file)
        ckpt_reader = tf.train.load_checkpoint(ckpt_file)

        bert_prefix = 'transformer/bert'

        weights = []
        for weight in bert.weights:
            stock_name = map_to_stock_variable_name(weight.name, bert_prefix)

            if ckpt_reader.has_tensor(stock_name):
                value = ckpt_reader.get_tensor(stock_name)
                weights.append(value)
            else:
                raise ValueError("No value for:[{}], i.e.:[{}] in:[{}]".format(
                    weight.name, stock_name, ckpt_file))

        bert.set_weights(weights)
        print("Done loading {} BERT weights from: {} into {} (prefix:{})".format(
            len(weights), ckpt_file, bert, bert_prefix))

    def restore_encoder(self, bert_ckpt_file):
        # loading the original pre-trained weights into the BERT layer:
        self.load_stock_weights(self.encoder, bert_ckpt_file)

    def call(self, inp, tar, training, enc_padding_mask,
             look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training=self.encoder.trainable)  # (batch_size, inp_seq_len, 768)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights
```
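One detail worth noting: the BERT encoder emits 768-dimensional vectors while the decoder runs at `config.d_model` (256 in bert_train.py). The mismatch is harmless because `MultiHeadAttention` projects `v`, `k` and `q` through `Dense(d_model)` before splitting heads. A quick shape check, not part of the original code:

```python
import tensorflow as tf

from attention import MultiHeadAttention

mha = MultiHeadAttention(d_model=256, num_heads=8)

enc_output = tf.random.uniform((2, 128, 768))  # BERT hidden states
dec_state = tf.random.uniform((2, 50, 256))    # decoder hidden states

out, attn = mha(enc_output, enc_output, dec_state, mask=None)  # (v, k, q, mask)
print(out.shape)   # (2, 50, 256)  -> projected back to the decoder width
print(attn.shape)  # (2, 8, 50, 128)
```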
--------------------

# **test.py**

```python
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from utils import create_masks


class Translator:

    def __init__(self, tokenizer_zh, tokenizer_en, model, MAX_SEQ_LENGTH):
        self.tokenizer_zh = tokenizer_zh
        self.tokenizer_en = tokenizer_en
        self.model = model
        self.MAX_SEQ_LENGTH = MAX_SEQ_LENGTH

    def encode_zh(self, zh):
        tokens_zh = self.tokenizer_zh.tokenize(zh)
        lang1 = self.tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + tokens_zh + ['[SEP]'])
        return lang1

    def evaluate(self, inp_sentence):
        # normalize input sentence
        inp_sentence = self.encode_zh(inp_sentence)
        encoder_input = tf.expand_dims(inp_sentence, 0)

        # as the target is english, the first word to the transformer should be the
        # english start token.
        decoder_input = [self.tokenizer_en.vocab_size]
        output = tf.expand_dims(decoder_input, 0)

        for i in range(self.MAX_SEQ_LENGTH):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                encoder_input, output)

            # predictions.shape == (batch_size, seq_len, vocab_size)
            predictions, attention_weights = self.model(encoder_input,
                                                        output,
                                                        False,
                                                        enc_padding_mask,
                                                        combined_mask,
                                                        dec_padding_mask)

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            # return the result if the predicted_id is equal to the end token
            if tf.equal(predicted_id, self.tokenizer_en.vocab_size + 1):
                return tf.squeeze(output, axis=0), attention_weights

            # concatenate the predicted_id to the output, which is given to the
            # decoder as its input.
            output = tf.concat([output, predicted_id], axis=-1)

        return tf.squeeze(output, axis=0), attention_weights

    def plot_attention_weights(self, attention, sentence, result, layer):
        fig = plt.figure(figsize=(16, 8))

        sentence_ids = self.encode_zh(sentence)

        attention = tf.squeeze(attention[layer], axis=0)

        for head in range(attention.shape[0]):
            ax = fig.add_subplot(2, 4, head + 1)

            # plot the attention weights
            ax.matshow(attention[head][:-1, :], cmap='viridis')

            fontdict = {'fontsize': 10, 'family': 'DFKai-SB'}

            ax.set_xticks(range(len(sentence_ids)))
            ax.set_yticks(range(len(result)))

            ax.set_ylim(len(result) - 1.5, -0.5)

            ax.set_xticklabels(
                self.tokenizer_zh.convert_ids_to_tokens(sentence_ids),
                fontdict=fontdict, rotation=90)

            ax.set_yticklabels([self.tokenizer_en.decode([i]) for i in result
                                if i < self.tokenizer_en.vocab_size],
                               fontdict=fontdict)

            ax.set_xlabel('Head {}'.format(head + 1))

        plt.tight_layout()
        plt.show()

    def do(self, sentence, plot=''):
        result, attention_weights = self.evaluate(sentence)

        predicted_sentence = self.tokenizer_en.decode([i for i in result
                                                       if i < self.tokenizer_en.vocab_size])

        print('Chinese src: {}'.format(sentence))
        print('Translated : {}'.format(predicted_sentence))

        if plot:
            self.plot_attention_weights(attention_weights, sentence, result, plot)


def main():
    # NOTE: the snippets below are leftovers from the original notebook; they assume
    # objects such as `transformer`, `config`, `tokenizer_zh`, `translate`, `inp` and
    # `tar_inp` created elsewhere (see bert_train.py), so main() is not runnable as a
    # standalone script.
    sentence_ids = encode_zh("我爱你啊")
    print(tokenizer_zh.convert_ids_to_tokens(sentence_ids))

    translate(transformer,
              '虽然继承了祖荫,但朴槿惠已经证明了自己是个机敏而老练的政治家——她历经20年才爬上韩国大国家党最高领导层并成为全国知名人物。')
    print(
        'Real translation: While Park derives some of her power from her family pedigree, she has proven to be an astute and seasoned politician – one who climbed the Grand National Party’s leadership ladder over the last two decades to emerge as a national figure.')

    translate(transformer, "我爱你是一件幸福的事情。")

    # ## Save weights
    transformer.save_weights('bert_nmt_ckpt')

    new_transformer = Transformer(config=config,
                                  target_vocab_size=target_vocab_size,
                                  bert_config_file=bert_config_file)

    fn_out, _ = new_transformer(inp, tar_inp, True,
                                enc_padding_mask=None,
                                look_ahead_mask=None,
                                dec_padding_mask=None)

    new_transformer.load_weights('bert_nmt_ckpt')

    translate(new_transformer, '我爱你')


if __name__ == '__main__':
    main()
```
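Usage-wise, `Translator.do` can also render the cross-attention maps collected by the decoder: the `plot` argument is the key of one attention block as named in `Decoder.call`, e.g. `'decoder_layer6_block2'` for the last of the 6 decoder layers configured in bert_train.py. A hedged example, assuming the tokenizers and the trained `transformer` from bert_train.py are in scope:

```python
from test import Translator

# assumes tokenizer_zh, tokenizer_en, transformer and MAX_SEQ_LENGTH from bert_train.py
translator = Translator(tokenizer_zh, tokenizer_en, transformer, MAX_SEQ_LENGTH)

# translate and plot the cross-attention of the last decoder layer
translator.do('今天天气不错额。', plot='decoder_layer6_block2')
```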
--------------------

# **tokenizer.py**

```python
import tensorflow as tf
import tensorflow_datasets as tfds
import collections
import unicodedata
import os, sys
import numpy as np


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output


class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)
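# Illustrative sketch (not part of the original file): for Chinese text the basic
# tokenizer splits every CJK character, so
#   FullTokenizer('chinese_L-12_H-768_A-12/vocab.txt').tokenize('今天天气不错额。')
# yields single-character pieces such as ['今', '天', '天', '气', '不', '错', '额', '。'],
# which convert_tokens_to_ids() then maps to BERT vocabulary ids.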
class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """

        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
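# Worked example (a sketch, not part of the original file) of the greedy
# longest-match-first WordPiece algorithm above, on a toy vocabulary:
#   vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
#   WordpieceTokenizer(vocab).tokenize("unaffable")   -> ["un", "##aff", "##able"]
#   WordpieceTokenizer(vocab).tokenize("unknownword") -> ["[UNK]"]
# The second word cannot be fully covered by vocabulary pieces, so the whole
# token falls back to the unknown token.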
def get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE):
    # ## Setup input pipeline
    # Use TFDS to load the wmt2019 zh-en translation dataset.
    if not os.path.exists('chinese_L-12_H-768_A-12'):
        # get_ipython().system('wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip')
        # get_ipython().system('unzip chinese_L-12_H-768_A-12')
        print('download pretrained first!')
        sys.exit()

    config = tfds.translate.wmt.WmtConfig(
        description="WMT 2019 translation task dataset.",
        version="0.0.3",
        language_pair=("zh", "en"),
        subsets={
            tfds.Split.TRAIN: ["newscommentary_v13"],
            tfds.Split.VALIDATION: ["newsdev2017"],
        }
    )

    builder = tfds.builder("wmt_translate", config=config)
    print(builder.info.splits)
    builder.download_and_prepare()
    datasets = builder.as_dataset(as_supervised=True)
    print('datasets is {}'.format(datasets))

    train_examples = datasets['train']
    val_examples = datasets['validation']

    for zh, en in train_examples.take(1):
        # print((zh))
        print(tf.compat.as_text(zh.numpy()))
        print(tf.compat.as_text(en.numpy()))

    # Create a custom subwords tokenizer from the training dataset for the decoder.
    vocab_file = 'vocab_en'
    if os.path.isfile(vocab_file + '.subwords'):
        tokenizer_en = tfds.features.text.SubwordTextEncoder.load_from_file(vocab_file)
    else:
        tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            (en.numpy() for zh, en in train_examples), target_vocab_size=2 ** 13)
        tokenizer_en.save_to_file('vocab_en')

    sample_string = 'Transformer is awesome.'
    tokenized_string = tokenizer_en.encode(sample_string)
    for ts in tokenized_string:
        print('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

    # The encoder uses the BERT tokenizer.
    tokenizer_zh = FullTokenizer(
        vocab_file='chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True)

    test_tokens = tokenizer_zh.tokenize('今天天气不错额。')
    test_ids = tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + test_tokens + ['[SEP]'])
    print('tokens:', test_tokens)
    print('ids:', test_ids)
    print('convert_ids_to_tokens:', tokenizer_zh.convert_ids_to_tokens(test_ids))

    def encode(zh, en, seq_length=MAX_SEQ_LENGTH):
        tokens_zh = tokenizer_zh.tokenize(tf.compat.as_text(zh.numpy()))
        lang1 = tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + tokens_zh + ['[SEP]'])
        if len(lang1) < seq_length:
            lang1 = lang1 + list(np.zeros(seq_length - len(lang1), 'int32'))

        # insert SOS and EOS
        lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
            tf.compat.as_text(en.numpy())) + [tokenizer_en.vocab_size + 1]
        if len(lang2) < seq_length:
            lang2 = lang2 + list(np.zeros(seq_length - len(lang2), 'int32'))
        return lang1, lang2

    def filter_max_length(x, y, max_length=MAX_SEQ_LENGTH):
        return tf.logical_and(tf.size(x) <= max_length,
                              tf.size(y) <= max_length)

    train_dataset = train_examples.map(
        lambda zh, en: tf.py_function(encode, [zh, en], [tf.int32, tf.int32]))
    train_dataset = train_dataset.filter(filter_max_length)
    # cache the dataset to memory to get a speedup while reading from it.
    train_dataset = train_dataset.cache()
    train_dataset = train_dataset.shuffle(20000).padded_batch(
        BATCH_SIZE, padded_shapes=([-1], [-1]), drop_remainder=True)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    val_dataset = val_examples.map(
        lambda zh, en: tf.py_function(encode, [zh, en], [tf.int32, tf.int32]))
    val_dataset = val_dataset.filter(filter_max_length)
    val_dataset = val_dataset.padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))

    return train_dataset, val_dataset, tokenizer_en, tokenizer_zh


if __name__ == '__main__':
    get_tokenizer(100, 64)
```

--------------------

# **transformer.py**

```python
import tensorflow as tf
import numpy as np

from utils import positional_encoding
from attlayer import EncoderLayer, DecoderLayer


class Encoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(input_vocab_size, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)


"""### Decoder

The `Decoder` consists of:
1. Output Embedding
2. Positional Encoding
3. N decoder layers

The target is put through an embedding which is summed with the positional encoding.
The output of this summation is the input to the decoder layers. The output of the
decoder is the input to the final linear layer.
"""


class Decoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights


"""## Create the Transformer

The Transformer consists of the encoder, the decoder and a final linear layer.
The output of the decoder is the input to the linear layer and its output is returned.
"""


class Transformer(tf.keras.Model):

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights


if __name__ == '__main__':
    sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8,
                             dff=2048, input_vocab_size=8500)
    sample_encoder_output = sample_encoder(tf.random.uniform((64, 62)),
                                           training=False, mask=None)
    print(sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8,
                             dff=2048, target_vocab_size=8000)
    output, attn = sample_decoder(tf.random.uniform((64, 26)),
                                  enc_output=sample_encoder_output,
                                  training=False,
                                  look_ahead_mask=None,
                                  padding_mask=None)
    print(output.shape, attn['decoder_layer2_block2'].shape)

    sample_transformer = Transformer(
        num_layers=2, d_model=512, num_heads=8, dff=2048,
        input_vocab_size=8500, target_vocab_size=8000)

    temp_input = tf.random.uniform((64, 62))
    temp_target = tf.random.uniform((64, 26))

    fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
                                   enc_padding_mask=None,
                                   look_ahead_mask=None,
                                   dec_padding_mask=None)
    print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size)
```

--------------------

# **transformer_train.py**

```python
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from tokenizer import get_tokenizer
from transformer import Transformer
from utils import CustomSchedule, create_masks
from test import Translator

BUFFER_SIZE = 20000
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 128

train_dataset, val_dataset, tokenizer_en, tokenizer_zh = \
    get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE)

# Chinese -> English translation
input_vocab_size = 21128
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1
num_layers = 4
d_model = 512
dff = 2048
num_heads = 8

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
fn_out, _ = transformer(inp, tar_inp, True,
                        enc_padding_mask=None,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(tar_inp.shape)  # (batch_size, tar_seq_len)
print(fn_out.shape)   # (batch_size, tar_seq_len, target_vocab_size)

transformer.summary()

learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

checkpoint_path = "./zh-en/transformer"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')


@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)


# Chinese is used as the input language and English is the target language.
translator = Translator(tokenizer_zh, tokenizer_en, transformer, MAX_SEQ_LENGTH)

for epoch in range(20):
    # sample one validation pair and show the current model's translation
    (cn_code, en_code) = next(iter(val_dataset))
    cn_code, en_code = cn_code[epoch].numpy(), en_code[epoch].numpy()
    en = tokenizer_en.decode([i for i in en_code if i < tokenizer_en.vocab_size])
    # drop special and padding token ids before converting back to Chinese text
    cn_code = [int(i) for i in cn_code if (i != 101 and i != 102 and i != 1 and i != 0)]
    cn = tokenizer_zh.convert_ids_to_tokens(cn_code)
    cn = "".join(cn)
    translator.do(cn)
    print('Real:', en)
    print('\n')

    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> chinese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 50 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 3 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                            ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
```
--------------------

# **utils.py**

```python
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    sines = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    cosines = np.cos(angle_rads[:, 1::2])

    pos_encoding = np.concatenate([sines, cosines], axis=-1)

    pos_encoding = pos_encoding[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


# ## Masking
# Mask all the pad tokens in the batch of sequences. This ensures that the model
# does not treat padding as input. The mask indicates where the pad value 0 is
# present: it outputs a 1 at those locations, and a 0 otherwise.
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # add extra dimensions so that we can add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


# The look-ahead mask is used to mask the future tokens in a sequence. In other
# words, the mask indicates which entries should not be used.
#
# This means that to predict the third word, only the first and second word will
# be used. Similarly, to predict the fourth word, only the first, second and
# third word will be used, and so on.
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)


def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask


# def create_masks2(inp, tar):
#     # Used in the 2nd attention block in the decoder.
#     # This padding mask is used to mask the encoder outputs.
#     dec_padding_mask = create_padding_mask(inp)
#
#     # Used in the 1st attention block in the decoder.
#     # It is used to pad and mask future tokens in the input received by
#     # the decoder.
#     look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
#     dec_target_padding_mask = create_padding_mask(tar)
#     combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
#
#     return combined_mask, dec_padding_mask


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


def main():
    # plot the warmup schedule (d_model=512, as used in transformer_train.py)
    temp_learning_rate_schedule = CustomSchedule(512)

    plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
    plt.ylabel("Learning Rate")
    plt.xlabel("Train Step")
    plt.show()


if __name__ == '__main__':
    main()
```
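Two small worked examples for the pieces above (a sketch, not part of utils.py): the look-ahead and padding masks for a length-4 sequence, and the warmup formula implemented by `CustomSchedule`, lrate = d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5)).

```python
import tensorflow as tf

from utils import create_look_ahead_mask, create_padding_mask, CustomSchedule

print(create_look_ahead_mask(4).numpy())
# [[0. 1. 1. 1.]
#  [0. 0. 1. 1.]
#  [0. 0. 0. 1.]
#  [0. 0. 0. 0.]]   1 = "blocked": position i may only attend to positions <= i

print(create_padding_mask(tf.constant([[7, 6, 0, 0]])).numpy())
# [[[[0. 0. 1. 1.]]]]  shape (1, 1, 1, 4); 1 marks the zero-padded positions

schedule = CustomSchedule(d_model=512, warmup_steps=4000)
print(schedule(tf.constant(1.0)).numpy())     # tiny learning rate at the start of warmup
print(schedule(tf.constant(4000.0)).numpy())  # peak around step == warmup_steps
```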
[20191009191333910.png]: https://img-blog.csdnimg.cn/20191009191333910.png
[Link 1]: http://www.rimengshe.com/
[AI_Keras PyTorch MXNet TensorFlow PaddlePaddle]: https://blog.csdn.net/zimiao552147572/article/details/88867161
[https_github.com_dragen1860_Deep-Learning-with-TensorFlow-book]: https://github.com/dragen1860/Deep-Learning-with-TensorFlow-book
[https_github.com_dragen1860_TensorFlow-2.x-Tutorials]: https://github.com/dragen1860/TensorFlow-2.x-Tutorials