Commit 90010cd3 by libei

Revise bugs

parent fb9ee9e7
......@@ -3,9 +3,8 @@
<component name="ChangeListManager">
<list default="true" id="7d6d9926-f879-4708-ad8e-442bac96b62a" name="Default" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change beforePath="$PROJECT_DIR$/tensor2tensor/models/common_hparams.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/common_hparams.py" />
<change beforePath="$PROJECT_DIR$/tensor2tensor/models/transformer.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/transformer.py" />
<change beforePath="$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py" afterPath="$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py" />
<change beforePath="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
......@@ -16,11 +15,11 @@
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file leaf-file-name="transformer.py" pinned="false" current-in-tab="true">
<file leaf-file-name="transformer.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="541">
<caret line="364" column="28" lean-forward="true" selection-start-line="364" selection-start-column="28" selection-end-line="364" selection-end-column="28" />
<state relative-caret-position="352">
<caret line="294" column="0" lean-forward="false" selection-start-line="294" selection-start-column="0" selection-end-line="294" selection-end-column="0" />
<folding />
</state>
</provider>
......@@ -29,8 +28,8 @@
<file leaf-file-name="common_hparams.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_hparams.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235">
<caret line="30" column="11" lean-forward="true" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
<state relative-caret-position="540">
<caret line="30" column="11" lean-forward="false" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
<folding />
</state>
</provider>
......@@ -39,8 +38,8 @@
<file leaf-file-name="trainer_utils.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="142">
<caret line="194" column="30" lean-forward="true" selection-start-line="194" selection-start-column="30" selection-end-line="194" selection-end-column="30" />
<state relative-caret-position="243">
<caret line="197" column="42" lean-forward="false" selection-start-line="197" selection-start-column="42" selection-end-line="197" selection-end-column="42" />
<folding>
<element signature="e#18286#18629#1" expanded="false" />
<element signature="e#18684#18904#0" expanded="false" />
......@@ -64,11 +63,11 @@
</provider>
</entry>
</file>
<file leaf-file-name="transformer_dla.py" pinned="false" current-in-tab="false">
<file leaf-file-name="transformer_dla.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="74">
<caret line="92" column="43" lean-forward="false" selection-start-line="92" selection-start-column="43" selection-end-line="92" selection-end-column="43" />
<state relative-caret-position="-1723">
<caret line="209" column="0" lean-forward="false" selection-start-line="209" selection-start-column="0" selection-end-line="209" selection-end-column="0" />
<folding>
<element signature="e#738#776#0" expanded="true" />
</folding>
......@@ -79,7 +78,7 @@
<file leaf-file-name="layer_history.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/layer_history.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="432">
<state relative-caret-position="378">
<caret line="16" column="0" lean-forward="false" selection-start-line="16" selection-start-column="0" selection-end-line="16" selection-end-column="0" />
<folding>
<element signature="e#0#23#0" expanded="true" />
......@@ -110,11 +109,11 @@
<list>
<option value="$PROJECT_DIR$/tensor2tensor/models/layer_history.py" />
<option value="$PROJECT_DIR$/tensor2tensor/models/common_layers.py" />
<option value="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" />
<option value="$PROJECT_DIR$/tensor2tensor/models/libei.py" />
<option value="$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py" />
<option value="$PROJECT_DIR$/tensor2tensor/models/common_hparams.py" />
<option value="$PROJECT_DIR$/tensor2tensor/models/transformer.py" />
<option value="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" />
</list>
</option>
</component>
......@@ -124,7 +123,7 @@
<detection-done>true</detection-done>
<sorting>DEFINITION_ORDER</sorting>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<component name="ProjectFrameBounds" extendedState="7">
<option name="x" value="22" />
<option name="y" value="5" />
<option name="width" value="1909" />
......@@ -148,8 +147,6 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
......@@ -178,13 +175,15 @@
<select />
</subPane>
</pane>
<pane id="Scope" />
<pane id="Scratches" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../DeepTransformer-v4" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
......@@ -220,23 +219,22 @@
<servers />
</component>
<component name="ToolWindowManager">
<frame x="-8" y="-8" width="1936" height="1056" extended-state="6" />
<editor active="true" />
<frame x="-8" y="-8" width="1936" height="1056" extended-state="7" />
<layout>
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="11" side_tool="false" content_ui="tabs" />
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="true" content_ui="tabs" />
<window_info id="File Transfer" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.20457019" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="File Transfer" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.20457019" sideWeight="0.5" order="12" side_tool="false" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.1609808" sideWeight="0.5" order="1" side_tool="false" content_ui="combo" />
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16044776" sideWeight="0.5" order="1" side_tool="false" content_ui="combo" />
<window_info id="Docker" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
<window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="SciView" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="true" content_ui="tabs" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="9" side_tool="false" content_ui="tabs" />
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
......@@ -363,16 +361,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/layer_history.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="432">
<caret line="16" column="0" lean-forward="false" selection-start-line="16" selection-start-column="0" selection-end-line="16" selection-end-column="0" />
<folding>
<element signature="e#0#23#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tensor2tensor/utils/multistep_optimizer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="459">
......@@ -381,18 +369,29 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/libei.py">
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/libei.py" />
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_hparams.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="398">
<caret line="410" column="32" lean-forward="true" selection-start-line="409" selection-start-column="20" selection-end-line="410" selection-end-column="32" />
<state relative-caret-position="540">
<caret line="30" column="11" lean-forward="false" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/layer_history.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="378">
<caret line="16" column="0" lean-forward="false" selection-start-line="16" selection-start-column="0" selection-end-line="16" selection-end-column="0" />
<folding>
<element signature="e#0#23#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="142">
<caret line="194" column="30" lean-forward="true" selection-start-line="194" selection-start-column="30" selection-end-line="194" selection-end-column="30" />
<state relative-caret-position="243">
<caret line="197" column="42" lean-forward="false" selection-start-line="197" selection-start-column="42" selection-end-line="197" selection-end-column="42" />
<folding>
<element signature="e#18286#18629#1" expanded="false" />
<element signature="e#18684#18904#0" expanded="false" />
......@@ -413,29 +412,21 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="74">
<caret line="92" column="43" lean-forward="false" selection-start-line="92" selection-start-column="43" selection-end-line="92" selection-end-column="43" />
<folding>
<element signature="e#738#776#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_hparams.py">
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235">
<caret line="30" column="11" lean-forward="true" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
<state relative-caret-position="352">
<caret line="294" column="0" lean-forward="false" selection-start-line="294" selection-start-column="0" selection-end-line="294" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
<entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="541">
<caret line="364" column="28" lean-forward="true" selection-start-line="364" selection-start-column="28" selection-end-line="364" selection-end-column="28" />
<folding />
<state relative-caret-position="-1723">
<caret line="209" column="0" lean-forward="false" selection-start-line="209" selection-start-column="0" selection-end-line="209" selection-end-column="0" />
<folding>
<element signature="e#738#776#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
......
......@@ -194,6 +194,9 @@ def transformer_encoder(encoder_input,
broadcast_dims=residual_dropout_broadcast_dims)
x = residual + x
x = may_be_layernorm(x, hparams, after=True)
if hparams.normalize_before:
x = may_be_layernorm(x, hparams, before=True, name="norm_top")
return x
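For context, the sub-layer wiring this hunk (and the matching encoder/decoder hunks below) converges on is the standard pre-norm/post-norm residual pattern. A minimal runnable sketch, with plain callables standing in for common_layers.layer_norm, dropout_with_broadcast_dims, and the attention/FFN sub-layers; only the control flow mirrors the diff, and the helper names here are illustrative, not from the repo:

def residual_sublayer(x, sublayer, hparams, layer_norm, dropout):
    # One encoder/decoder sub-layer: optional pre-norm, the sub-layer itself,
    # residual dropout, the residual add, then optional post-norm.
    residual = x
    if hparams.normalize_before:        # pre-norm: normalize the sub-layer input
        x = layer_norm(x)
    x = sublayer(x)
    x = dropout(x, 1.0 - hparams.residual_dropout)
    x = residual + x
    if not hparams.normalize_before:    # post-norm: normalize after the residual add
        x = layer_norm(x)
    return x

def finish_stack(x, hparams, layer_norm):
    # With normalize_before=True, the stack output is normalized once at the top,
    # which is what the added "norm_top" call in this hunk does.
    return layer_norm(x) if hparams.normalize_before else x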
......
......@@ -33,7 +33,6 @@ from tensor2tensor.models import common_hparams
from tensor2tensor.models import common_layers
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model
from tensor2tensor.models import layer_history
import tensorflow as tf
......@@ -57,9 +56,6 @@ class TransformerDLA(t2t_model.T2TModel):
(decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder(
targets, hparams)
def residual_fn(x, y, dropout_broadcast_dims=None):
return common_layers.layer_norm(x + common_layers.dropout_with_broadcast_dims(
y, 1.0 - hparams.residual_dropout, broadcast_dims=dropout_broadcast_dims))
# encoder_input = tf.squeeze(encoder_input, 2)
# decoder_input = tf.squeeze(decoder_input, 2)
......@@ -68,16 +64,19 @@ class TransformerDLA(t2t_model.T2TModel):
encoder_layer = layer_history.CreateLayerHistory(self._hparams, True, name="encoder")
encoder_output = transformer_encoder(encoder_input, residual_fn,
encoder_output = transformer_encoder(encoder_input,
encoder_attention_bias, hparams, encoder_layer)
decoder_layer = layer_history.CreateLayerHistory(self._hparams, False, name="decoder")
decoder_output = transformer_decoder(
decoder_input, encoder_output, residual_fn, decoder_self_attention_bias,
decoder_input, encoder_output, decoder_self_attention_bias,
encoder_attention_bias, hparams, decoder_layer)
decoder_output = tf.expand_dims(decoder_output, 2)
return decoder_output
......@@ -132,8 +131,14 @@ def transformer_prepare_decoder(targets, hparams):
return (decoder_input, decoder_self_attention_bias)
def may_be_layernorm(input, hparams, before=False, after=False, name=None):
assert before ^ after
if after ^ hparams.normalize_before:
return common_layers.layer_norm(input, name=name)
else:
return input
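As a quick sanity check of the before/after XOR above, a self-contained stand-in (pure Python, with a tagged tuple in place of common_layers.layer_norm) behaves as follows:

def _maybe_norm(x, normalize_before, before=False, after=False):
    # Mirrors may_be_layernorm: exactly one of before/after must be set, and the
    # norm fires only in the position selected by normalize_before.
    assert before ^ after
    return ("norm", x) if (after ^ normalize_before) else x

# post-norm (normalize_before=False): only the `after` call normalizes
assert _maybe_norm("x", False, before=True) == "x"
assert _maybe_norm("x", False, after=True) == ("norm", "x")
# pre-norm (normalize_before=True): only the `before` call normalizes
assert _maybe_norm("x", True, before=True) == ("norm", "x")
assert _maybe_norm("x", True, after=True) == "x"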
def transformer_encoder(encoder_input,
residual_fn,
encoder_self_attention_bias,
hparams,
encoder_layer,
......@@ -163,11 +168,15 @@ def transformer_encoder(encoder_input,
# Summaries don't work in multi-problem setting yet.
summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
with tf.variable_scope(name):
for layer in xrange(hparams.num_hidden_layers):
if hparams.use_emb:
encoder_layer.add(x)
for layer in xrange(hparams.encoder_layers):
with tf.variable_scope("layer_%d" % layer):
x = residual_fn(
x,
common_attention.multihead_attention(
# self-attention network
residual = x
x = may_be_layernorm(x, hparams, before=True)
x = common_attention.multihead_attention(
x,
None,
encoder_self_attention_bias,
......@@ -180,16 +189,30 @@ def transformer_encoder(encoder_input,
max_relative_length=hparams.max_relative_length,
dropout_broadcast_dims=attention_dropout_broadcast_dims,
summaries=False,
name="encoder_self_attention"),
dropout_broadcast_dims=residual_dropout_broadcast_dims)
x = residual_fn(x, transformer_ffn_layer(x, hparams),
dropout_broadcast_dims=residual_dropout_broadcast_dims)
name="encoder_self_attention")
x = common_layers.dropout_with_broadcast_dims(x,
1.0 - hparams.residual_dropout,
broadcast_dims=residual_dropout_broadcast_dims)
x = residual + x
x = may_be_layernorm(x, hparams, after=True)
# feed-forward network
residual = x
x = may_be_layernorm(x, hparams, before=True)
x = transformer_ffn_layer(x, hparams)
x = common_layers.dropout_with_broadcast_dims(x,
1.0 - hparams.residual_dropout,
broadcast_dims=residual_dropout_broadcast_dims)
x = residual + x
x = may_be_layernorm(x, hparams, after=True)
if hparams.normalize_before:
x = may_be_layernorm(x, hparams, before=True, name="norm_top")
return x
def transformer_decoder(decoder_input,
encoder_output,
residual_fn,
decoder_self_attention_bias,
encoder_decoder_attention_bias,
hparams,
......@@ -223,11 +246,12 @@ def transformer_decoder(decoder_input,
# Summaries don't work in multi-problem setting yet.
summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
with tf.variable_scope(name):
for layer in xrange(hparams.num_hidden_layers):
for layer in xrange(hparams.decoder_layers):
with tf.variable_scope("layer_%d" % layer):
x = residual_fn(
x,
common_attention.multihead_attention(
# self-attention network
residual = x
x = may_be_layernorm(x, hparams, before=True)
x = common_attention.multihead_attention(
x,
None,
decoder_self_attention_bias,
......@@ -240,11 +264,17 @@ def transformer_decoder(decoder_input,
max_relative_length=hparams.max_relative_length,
dropout_broadcast_dims=attention_dropout_broadcast_dims,
summaries=False,
name="decoder_self_attention"),
dropout_broadcast_dims=residual_dropout_broadcast_dims)
x = residual_fn(
x,
common_attention.multihead_attention(
name="decoder_self_attention")
x = common_layers.dropout_with_broadcast_dims(x,
1.0 - hparams.residual_dropout,
broadcast_dims=residual_dropout_broadcast_dims)
x = residual + x
x = may_be_layernorm(x, hparams, after=True)
# encoder-decoder-attention network
residual = x
x = may_be_layernorm(x, hparams, before=True)
x = common_attention.multihead_attention(
x,
encoder_output,
encoder_decoder_attention_bias,
......@@ -255,10 +285,24 @@ def transformer_decoder(decoder_input,
hparams.attention_dropout,
dropout_broadcast_dims=attention_dropout_broadcast_dims,
summaries=False,
name="encdec_attention"),
dropout_broadcast_dims=residual_dropout_broadcast_dims)
x = residual_fn(x, transformer_ffn_layer(x, hparams),
dropout_broadcast_dims=residual_dropout_broadcast_dims)
name="encdec_attention")
x = common_layers.dropout_with_broadcast_dims(x,
1.0 - hparams.residual_dropout,
broadcast_dims=residual_dropout_broadcast_dims)
x = residual + x
x = may_be_layernorm(x, hparams, after=True)
# feed-forward network
residual = x
x = may_be_layernorm(x, hparams, before=True)
x = transformer_ffn_layer(x, hparams)
x = common_layers.dropout_with_broadcast_dims(x,
1.0 - hparams.residual_dropout,
broadcast_dims=residual_dropout_broadcast_dims)
x = residual + x
x = may_be_layernorm(x, hparams, after=True)
if hparams.normalize_before:
x = may_be_layernorm(x, hparams, before=True, name="norm_top")
return x
......@@ -332,7 +376,8 @@ def transformer_base():
hparams.learning_rate = 0.1
hparams.learning_rate_warmup_steps = 4000
hparams.initializer_gain = 1.0
hparams.num_hidden_layers = 6
hparams.encoder_layers = 6
hparams.decoder_layers = 6
hparams.initializer = "uniform_unit_scaling"
hparams.weight_decay = 0.0
hparams.optimizer_adam_beta1 = 0.9
......@@ -370,6 +415,7 @@ def transformer_base():
hparams.add_hparam("attention_dropout_broadcast_dims", "0,1") # batch, heads
hparams.add_hparam("relu_dropout_broadcast_dims", "1") # length
hparams.add_hparam("residual_dropout_broadcast_dims", "1") # length
hparams.add_hparam("normalize_before", False)
return hparams
......@@ -386,187 +432,49 @@ def transformer_big():
@registry.register_hparams
def transformer_big_single_gpu():
"""HParams for transformer big model for single gpu."""
hparams = transformer_big()
hparams.residual_dropout = 0.1
hparams.learning_rate_warmup_steps = 16000
hparams.optimizer_adam_beta2 = 0.998
hparams.batching_mantissa_bits = 3
return hparams
@registry.register_hparams
def transformer_base_single_gpu():
"""HParams for transformer base model for single gpu."""
def transformer_before():
"""HParams for transfomer big model on WMT."""
hparams = transformer_base()
hparams.batch_size = 8192
hparams.learning_rate_warmup_steps = 16000
hparams.batching_mantissa_bits = 2
hparams.normalize_before = True
hparams.relu_dropout = 0.1
hparams.attention_dropout = 0.1
hparams.learning_rate = 0.2
hparams.learning_rate_warmup_steps = 8000
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.997
return hparams
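A hedged usage sketch of the new flags: further hparams sets can be derived from transformer_before in the same style as the rest of this file (relying on the registry import and transformer_before definition already present here). The function below, transformer_before_deep, is hypothetical and not part of this commit; it only illustrates how normalize_before and the split encoder_layers/decoder_layers compose.

@registry.register_hparams
def transformer_before_deep():
  """Hypothetical example: a deeper pre-norm encoder on top of transformer_before."""
  hparams = transformer_before()
  hparams.encoder_layers = 12  # deepen only the encoder
  hparams.decoder_layers = 6
  hparams.learning_rate_warmup_steps = 16000
  return hparams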
@registry.register_hparams
def transformer_parsing_base():
"""Hparams for parsing on wsj only."""
hparams = transformer_base()
hparams.attention_dropout = 0.2
hparams.residual_dropout = 0.2
hparams.max_length = 512
hparams.learning_rate_warmup_steps = 16000
def transformer_before_big():
"""HParams for transfomer big model on WMT."""
hparams = transformer_before()
hparams.hidden_size = 1024
hparams.learning_rate = 0.05
hparams.shared_embedding_and_softmax_weights = int(False)
hparams.filter_size = 4096
hparams.num_heads = 16
hparams.batching_mantissa_bits = 2
hparams.residual_dropout = 0.3
return hparams
@registry.register_hparams
def transformer_parsing_big():
"""HParams for parsing on wsj semi-supervised."""
def transformer_big_single_gpu():
"""HParams for transformer big model for single gpu."""
hparams = transformer_big()
hparams.max_length = 512
hparams.shared_source_target_embedding = int(False)
hparams.learning_rate_warmup_steps = 4000
hparams.residual_dropout = 0.1
hparams.batch_size = 2048
hparams.learning_rate = 0.05
return hparams
@registry.register_hparams
def transformer_parsing_ice():
"""Hparams for parsing Icelandic text."""
hparams = transformer_base_single_gpu()
hparams.batch_size = 4096
hparams.shared_embedding_and_softmax_weights = int(False)
return hparams
@registry.register_hparams
def transformer_tiny():
hparams = transformer_base()
hparams.hidden_size = 64
hparams.filter_size = 128
hparams.num_heads = 4
return hparams
@registry.register_hparams
def transformer_l2():
hparams = transformer_base()
hparams.num_hidden_layers = 2
return hparams
@registry.register_hparams
def transformer_l4():
hparams = transformer_base()
hparams.num_hidden_layers = 4
return hparams
@registry.register_hparams
def transformer_l8():
hparams = transformer_base()
hparams.num_hidden_layers = 8
return hparams
@registry.register_hparams
def transformer_h1():
hparams = transformer_base()
hparams.num_heads = 1
return hparams
@registry.register_hparams
def transformer_h4():
hparams = transformer_base()
hparams.num_heads = 4
return hparams
@registry.register_hparams
def transformer_h16():
hparams = transformer_base()
hparams.num_heads = 16
return hparams
@registry.register_hparams
def transformer_h32():
hparams = transformer_base()
hparams.num_heads = 32
return hparams
@registry.register_hparams
def transformer_k128():
hparams = transformer_base()
hparams.attention_key_channels = 128
return hparams
@registry.register_hparams
def transformer_k256():
hparams = transformer_base()
hparams.attention_key_channels = 256
return hparams
@registry.register_hparams
def transformer_ff1024():
hparams = transformer_base()
hparams.filter_size = 1024
return hparams
@registry.register_hparams
def transformer_ff4096():
hparams = transformer_base()
hparams.filter_size = 4096
return hparams
@registry.register_hparams
def transformer_dr0():
hparams = transformer_base()
hparams.residual_dropout = 0.0
return hparams
@registry.register_hparams
def transformer_dr2():
hparams = transformer_base()
hparams.residual_dropout = 0.2
return hparams
@registry.register_hparams
def transformer_ls0():
hparams = transformer_base()
hparams.label_smoothing = 0.0
return hparams
@registry.register_hparams
def transformer_ls2():
hparams = transformer_base()
hparams.label_smoothing = 0.2
return hparams
@registry.register_hparams
def transformer_hs256():
hparams = transformer_base()
hparams.hidden_size = 256
hparams.learning_rate_warmup_steps = 16000
hparams.optimizer_adam_beta2 = 0.998
hparams.batching_mantissa_bits = 3
return hparams
@registry.register_hparams
def transformer_hs1024():
def transformer_base_single_gpu():
"""HParams for transformer base model for single gpu."""
hparams = transformer_base()
hparams.hidden_size = 1024
hparams.batch_size = 8192
hparams.learning_rate_warmup_steps = 16000
hparams.batching_mantissa_bits = 2
return hparams
......@@ -598,36 +506,6 @@ def transformer_big_dr2():
@registry.register_hparams
def transformer_parameter_attention_a():
hparams = transformer_base()
hparams.ffn_layer = "parameter_attention"
hparams.filter_size = 1536
return hparams
@registry.register_hparams
def transformer_parameter_attention_b():
hparams = transformer_base()
hparams.ffn_layer = "parameter_attention"
hparams.filter_size = 512
hparams.parameter_attention_key_channels = 1024
hparams.parameter_attention_value_channels = 1024
hparams.num_heads = 16
return hparams
@registry.register_ranged_hparams("transformer_big_single_gpu")
def transformer_range1(rhp):
"""Small range of hyperparameters."""
hparams = transformer_big_single_gpu()
common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp)
rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)
rhp.set_float("initializer_gain", 0.5, 2.0)
rhp.set_float("optimizer_adam_beta2", 0.97, 0.99)
rhp.set_float("weight_decay", 0.0, 2.0)
@registry.register_hparams
def transformer_base_ldcd():
"""Set of hyperparameters."""
hparams = transformer_base()
......@@ -663,14 +541,6 @@ def transformer_base_ldcd_n1():
hparams.learning_rate_warmup_steps = 4000
return hparams
@registry.register_hparams
def transformer_base_nosmooth_dropout1():
"""Set of hyperparameters."""
hparams = transformer_base()
hparams.label_smoothing = 0.0
hparams.relu_dropout = 0.1
hparams.attention_dropout = 0.1
return hparams
@registry.register_hparams
def transformer_base_amsgrad():
......@@ -722,7 +592,6 @@ def transformer_base_ldrestart_n3():
@registry.register_hparams
def transformer_base_powersign():
"""Set of hyperparameters."""
......@@ -817,15 +686,6 @@ def transformer_big_adafactor():
hparams.optimizer_adafactor_beta2 = 0.997
return hparams
@registry.register_hparams
def transformer_base_debug():
hparams = transformer_big_adafactor()
hparams.num_hidden_layers = 3
hparams.fused_inner_hidden = 128
hparams.hidden_size = 64
hparams.filter_size = 128
hparams.batch_size = 128
return hparams
@registry.register_hparams
def transformer_base_v2():
......@@ -848,7 +708,7 @@ def transformer_base_rpr_dropout1():
@registry.register_hparams
def transformer_base_v2_filter4096():
def transformer_base_v3():
"""Set of hyperparameters.
Set filter_size to 4096.
"""
......@@ -856,42 +716,7 @@ def transformer_base_v2_filter4096():
hparams.filter_size = 4096
return hparams
@registry.register_hparams
def transformer_base_filter4096():
"""Set of hyperparameters.
set filter as 4096
"""
hparams = transformer_base()
hparams.filter_size = 4096
return hparams
@registry.register_hparams
def transformer_base_v2_dropout2_filter4096():
"""Set of hyperparameters.
set relu_dropout and attention_dropout as 0.2
"""
hparams = transformer_base()
hparams.attention_dropout = 0.2
hparams.relu_dropout = 0.2
hparams.filter_size = 4096
return hparams
@registry.register_hparams
def transformer_base_lr2():
"""Set of hyperparameters.
set relu_dropout and attention_dropout as 0.2
"""
hparams = transformer_base()
hparams.learning_rate = 0.2
return hparams
@registry.register_hparams
def transformer_base_multistep2():
# new model use optimizer MultistepAdam
hparams = transformer_base()
hparams.optimizer = "MultistepAdam"
hparams.optimizer_multistep_accumulate_steps = 2
return hparams
@registry.register_hparams
def transformer_big_multistep2():
......@@ -914,14 +739,3 @@ def transformer_big_adafactor_test():
hparams.optimizer_adafactor_beta2 = 0.999
return hparams
@registry.register_hparams
def transformer_mobile():
# new model use optimizer MultistepAdam
hparams = transformer_base()
hparams.hidden_size = 256
hparams.num_hidden_layers = 4
hparams.residual_dropout = 0.1
hparams.dropout = 0.1
return hparams