Commit 4b5dcaed by libei

add model hparams instruction for transformer and transformer_dla

parent 3492c676
...@@ -807,6 +807,7 @@ def transformer_base_v2():
  hparams.relu_dropout = 0.1
  return hparams
@registry.register_hparams
def transformer_base_rpr_dropout1():
  hparams = transformer_base()
...@@ -834,14 +835,35 @@ def transformer_big_multistep2():
  hparams.optimizer = "MultistepAdam"
  hparams.optimizer_multistep_accumulate_steps = 2
  hparams.batch_size = 2048
  hparams.attention_dropout = 0.1
  #hparams.relu_dropout = 0.1
  return hparams
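# A minimal sketch (plain numpy with a user-supplied grad_fn; this is NOT the
# MultistepAdam implementation) of what optimizer_multistep_accumulate_steps
# does above: gradients of several mini-batches are accumulated before a single
# parameter update, so with batch_size = 2048 and accumulate_steps = 2 the
# effective update batch is roughly 2048 * 2 = 4096 tokens per GPU.
import numpy as np

def accumulated_sgd_update(param, grad_fn, mini_batches, accumulate_steps, lr=1e-3):
  """Apply one (SGD-style) update per `accumulate_steps` mini-batches."""
  acc = np.zeros_like(param)
  for i, batch in enumerate(mini_batches, start=1):
    acc += grad_fn(param, batch)                   # accumulate, do not update yet
    if i % accumulate_steps == 0:
      param = param - lr * acc / accumulate_steps  # one update with the averaged gradient
      acc = np.zeros_like(param)
  return param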
@registry.register_hparams
def transformer_before_shared():
  # new model uses the MultistepAdam optimizer
  hparams = transformer_before()
  hparams.shared_decoder_input_and_softmax_weights = int(True)
  return hparams
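# A small illustrative sketch (plain numpy; the names below are ours, not part
# of the repo) of what shared_decoder_input_and_softmax_weights = int(True)
# means conceptually: the decoder input embedding matrix is reused as the
# softmax projection, so no separate output weight matrix is trained.
import numpy as np

vocab_size, hidden_size = 32000, 512
embedding = np.random.randn(vocab_size, hidden_size).astype(np.float32)

def decoder_logits(decoder_states):
  """decoder_states: [batch, hidden_size]; logits reuse the embedding weights."""
  return decoder_states @ embedding.T   # [batch, vocab_size]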
@registry.register_hparams
def transformer_before_shared25():
  # new model uses the MultistepAdam optimizer to train the deep Transformer-pre-norm
  # we found that the model achieves better performance when we set the true update batch to 8192 and set
  # the learning rate to 0.4. The maximum learning rate is then nearly 0.002; note that the real learning
  # rate varies with the global step and is affected by "learning_rate" and "learning_rate_warmup_steps".
  hparams = transformer_before_shared()
  hparams.learning_rate = 0.4
  hparams.learning_rate_warmup_steps = 8000
  hparams.optimizer = "MultistepAdam"
  hparams.optimizer_multistep_accumulate_steps = 4
  # training the deep Transformer-pre-norm with batch_size 4096 is likely to run out of memory (OOM)
  hparams.batch_size = 2048
  return hparams
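# The "real learning rate" described above can be reproduced with a small
# sketch of the legacy noam schedule from upstream tensor2tensor, which this
# fork appears to follow (the helper name is ours, not part of the repo; the
# 5000.0 and 0.002-for-Adam constants come from tensor2tensor's
# legacy_learning_rate_schedule):
def sketch_noam_learning_rate(step, learning_rate=0.4, warmup_steps=8000,
                              hidden_size=512, adam_correction=0.002):
  """Linear warmup up to step == warmup_steps, then inverse-sqrt decay."""
  schedule = 5000.0 * hidden_size ** -0.5 * min(
      (step + 1) * warmup_steps ** -1.5, (step + 1) ** -0.5)
  return schedule * adam_correction * learning_rate

# sketch_noam_learning_rate(8000) -> ~0.00198, i.e. the "nearly 0.002" peak
# quoted above; with batch_size = 2048 and
# optimizer_multistep_accumulate_steps = 4 the true update batch is
# 2048 * 4 = 8192 tokens per GPU, matching the comment.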
# you can define your own hparams like above to fit the target task
# @registry.register_hparams
# def transformer_new****():
# return hparams
\ No newline at end of file
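# For instance, a hypothetical variant (the name and values below are only an
# illustration of the pattern above, not settings from this repo) could look like:
# @registry.register_hparams
# def transformer_before_shared_my_task():
#   hparams = transformer_before_shared()
#   hparams.learning_rate_warmup_steps = 16000
#   hparams.batch_size = 4096
#   return hparams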
...@@ -437,6 +437,9 @@ def transformer_dla():
@registry.register_hparams
def transformer_dla_base():
  # we implement the dynamic linear combination of layers in the Transformer, choosing the "learnable dense" schema, and
  # we revise the learning rate to 0.4 and the warmup steps to 8000 based on Transformer-Pre-Norm. Since the model holds much more representation
  # than the baseline, we reset the batch size to 2048 but use four steps per update instead.
  hparams = transformer_dla()
  hparams.encoder_layers = 6
  hparams.decoder_layers = 6
...@@ -452,9 +455,9 @@ def transformer_dla_base():
  return hparams
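# A minimal numpy sketch (our own illustration, not the repo's implementation;
# layer normalization is omitted for brevity) of the "learnable dense" schema
# mentioned above: the input of layer l is a learned weighted combination of
# the outputs of all previous layers, rather than only the output of layer l-1.
import numpy as np

def dynamic_linear_combination(layer_outputs, layer_weights):
  """layer_outputs: list of l arrays of shape [batch, length, hidden];
  layer_weights: learned 1-D array of length l (one scalar per previous layer)."""
  stacked = np.stack(layer_outputs, axis=0)             # [l, batch, length, hidden]
  return np.tensordot(layer_weights, stacked, axes=1)   # weighted sum over the layer axis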
@registry.register_hparams
def transformer_dla_base25_shared():
  # share the decoder input embedding and the softmax weights
  hparams = transformer_dla_base()
  hparams.shared_decoder_input_and_softmax_weights = int(True)
  hparams.encoder_layers = 25
...@@ -462,7 +465,33 @@ def transformer_dla_base25_shared():
@registry.register_hparams
def transformer_dla_base30_shared():
  hparams = transformer_dla_base()
  hparams.shared_decoder_input_and_softmax_weights = int(True)
  hparams.encoder_layers = 30
  return hparams
@registry.register_hparams
def transformer_dla_base25_shared_filter4096():
  # only enlarge the filter size
  hparams = transformer_dla_base25_shared()
  hparams.filter_size = 4096
  return hparams
@registry.register_hparams
def transformer_dla_base25_shared_big():
  # enlarge the hidden size and filter size, keeping the relation filter_size = 4 * hidden_size
  hparams = transformer_dla_base25_shared()
  hparams.hidden_size = 768
  hparams.filter_size = 3072
  return hparams
@registry.register_hparams
def transformer_dla_base_v2():
  # in our paper, we found that a larger warmup can perform even better on the En-De task
  hparams = transformer_dla_base()
  hparams.learning_rate = 0.4 * (2**0.5)
  hparams.learning_rate_warmup_steps = 16000
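  # Why scale the learning rate by sqrt(2) while doubling the warmup (our own
  # reading of the settings above, assuming the noam-style schedule sketched
  # earlier): the peak learning rate scales with learning_rate / warmup_steps**0.5,
  # so 0.4 * 2**0.5 / 16000**0.5 == 0.4 / 8000**0.5, i.e. the ~0.002 peak of
  # transformer_dla_base is kept unchanged and is only reached 8000 steps later.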
...