Commit 90010cd3 by libei

revise bugs

parent fb9ee9e7
@@ -3,9 +3,8 @@
   <component name="ChangeListManager">
     <list default="true" id="7d6d9926-f879-4708-ad8e-442bac96b62a" name="Default" comment="">
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
-      <change beforePath="$PROJECT_DIR$/tensor2tensor/models/common_hparams.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/common_hparams.py" />
       <change beforePath="$PROJECT_DIR$/tensor2tensor/models/transformer.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/transformer.py" />
-      <change beforePath="$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py" afterPath="$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py" />
+      <change beforePath="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" />
     </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
     <option name="TRACKING_ENABLED" value="true" />
@@ -16,11 +15,11 @@
   </component>
   <component name="FileEditorManager">
     <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
-      <file leaf-file-name="transformer.py" pinned="false" current-in-tab="true">
+      <file leaf-file-name="transformer.py" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="541">
-              <caret line="364" column="28" lean-forward="true" selection-start-line="364" selection-start-column="28" selection-end-line="364" selection-end-column="28" />
+            <state relative-caret-position="352">
+              <caret line="294" column="0" lean-forward="false" selection-start-line="294" selection-start-column="0" selection-end-line="294" selection-end-column="0" />
               <folding />
             </state>
           </provider>
@@ -29,8 +28,8 @@
       <file leaf-file-name="common_hparams.py" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_hparams.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="235">
-              <caret line="30" column="11" lean-forward="true" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
+            <state relative-caret-position="540">
+              <caret line="30" column="11" lean-forward="false" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
               <folding />
             </state>
           </provider>
@@ -39,8 +38,8 @@
       <file leaf-file-name="trainer_utils.py" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="142">
-              <caret line="194" column="30" lean-forward="true" selection-start-line="194" selection-start-column="30" selection-end-line="194" selection-end-column="30" />
+            <state relative-caret-position="243">
+              <caret line="197" column="42" lean-forward="false" selection-start-line="197" selection-start-column="42" selection-end-line="197" selection-end-column="42" />
              <folding>
                <element signature="e#18286#18629#1" expanded="false" />
                <element signature="e#18684#18904#0" expanded="false" />
@@ -64,11 +63,11 @@
           </provider>
         </entry>
       </file>
-      <file leaf-file-name="transformer_dla.py" pinned="false" current-in-tab="false">
+      <file leaf-file-name="transformer_dla.py" pinned="false" current-in-tab="true">
         <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="74">
-              <caret line="92" column="43" lean-forward="false" selection-start-line="92" selection-start-column="43" selection-end-line="92" selection-end-column="43" />
+            <state relative-caret-position="-1723">
+              <caret line="209" column="0" lean-forward="false" selection-start-line="209" selection-start-column="0" selection-end-line="209" selection-end-column="0" />
              <folding>
                <element signature="e#738#776#0" expanded="true" />
              </folding>
@@ -79,7 +78,7 @@
       <file leaf-file-name="layer_history.py" pinned="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/tensor2tensor/models/layer_history.py">
           <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="432">
+            <state relative-caret-position="378">
              <caret line="16" column="0" lean-forward="false" selection-start-line="16" selection-start-column="0" selection-end-line="16" selection-end-column="0" />
              <folding>
                <element signature="e#0#23#0" expanded="true" />
@@ -110,11 +109,11 @@
       <list>
         <option value="$PROJECT_DIR$/tensor2tensor/models/layer_history.py" />
         <option value="$PROJECT_DIR$/tensor2tensor/models/common_layers.py" />
-        <option value="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" />
         <option value="$PROJECT_DIR$/tensor2tensor/models/libei.py" />
         <option value="$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py" />
         <option value="$PROJECT_DIR$/tensor2tensor/models/common_hparams.py" />
         <option value="$PROJECT_DIR$/tensor2tensor/models/transformer.py" />
+        <option value="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" />
       </list>
     </option>
   </component>
@@ -124,7 +123,7 @@
     <detection-done>true</detection-done>
     <sorting>DEFINITION_ORDER</sorting>
   </component>
-  <component name="ProjectFrameBounds" extendedState="6">
+  <component name="ProjectFrameBounds" extendedState="7">
     <option name="x" value="22" />
     <option name="y" value="5" />
     <option name="width" value="1909" />
@@ -148,8 +147,6 @@
       <foldersAlwaysOnTop value="true" />
     </navigator>
     <panes>
-      <pane id="Scratches" />
-      <pane id="Scope" />
       <pane id="ProjectPane">
         <subPane>
           <expand>
@@ -178,13 +175,15 @@
           <select />
         </subPane>
       </pane>
+      <pane id="Scope" />
+      <pane id="Scratches" />
     </panes>
   </component>
   <component name="PropertiesComponent">
     <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
     <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
     <property name="WebServerToolWindowFactoryState" value="false" />
-    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$/../DeepTransformer-v4" />
   </component>
   <component name="RecentsManager">
     <key name="CopyFile.RECENT_KEYS">
@@ -220,23 +219,22 @@
     <servers />
   </component>
   <component name="ToolWindowManager">
-    <frame x="-8" y="-8" width="1936" height="1056" extended-state="6" />
+    <frame x="-8" y="-8" width="1936" height="1056" extended-state="7" />
-    <editor active="true" />
     <layout>
       <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="11" side_tool="false" content_ui="tabs" />
       <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="true" content_ui="tabs" />
-      <window_info id="File Transfer" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.20457019" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
+      <window_info id="File Transfer" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.20457019" sideWeight="0.5" order="12" side_tool="false" content_ui="tabs" />
-      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
       <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
       <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
+      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
       <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
-      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.1609808" sideWeight="0.5" order="1" side_tool="false" content_ui="combo" />
+      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16044776" sideWeight="0.5" order="1" side_tool="false" content_ui="combo" />
       <window_info id="Docker" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
       <window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
       <window_info id="SciView" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
       <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
-      <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
       <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="true" content_ui="tabs" />
+      <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
       <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="9" side_tool="false" content_ui="tabs" />
       <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
       <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
@@ -363,16 +361,6 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/layer_history.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="432">
-          <caret line="16" column="0" lean-forward="false" selection-start-line="16" selection-start-column="0" selection-end-line="16" selection-end-column="0" />
-          <folding>
-            <element signature="e#0#23#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
     <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/multistep_optimizer.py">
       <provider selected="true" editor-type-id="text-editor">
         <state relative-caret-position="459">
@@ -381,18 +369,29 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/libei.py">
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/libei.py" />
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_hparams.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="398">
-          <caret line="410" column="32" lean-forward="true" selection-start-line="409" selection-start-column="20" selection-end-line="410" selection-end-column="32" />
+        <state relative-caret-position="540">
+          <caret line="30" column="11" lean-forward="false" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
          <folding />
        </state>
      </provider>
    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/layer_history.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="378">
+          <caret line="16" column="0" lean-forward="false" selection-start-line="16" selection-start-column="0" selection-end-line="16" selection-end-column="0" />
+          <folding>
+            <element signature="e#0#23#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
     <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="142">
-          <caret line="194" column="30" lean-forward="true" selection-start-line="194" selection-start-column="30" selection-end-line="194" selection-end-column="30" />
+        <state relative-caret-position="243">
+          <caret line="197" column="42" lean-forward="false" selection-start-line="197" selection-start-column="42" selection-end-line="197" selection-end-column="42" />
          <folding>
            <element signature="e#18286#18629#1" expanded="false" />
            <element signature="e#18684#18904#0" expanded="false" />
@@ -413,29 +412,21 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="74">
-          <caret line="92" column="43" lean-forward="false" selection-start-line="92" selection-start-column="43" selection-end-line="92" selection-end-column="43" />
-          <folding>
-            <element signature="e#738#776#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_hparams.py">
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="235">
-          <caret line="30" column="11" lean-forward="true" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
+        <state relative-caret-position="352">
+          <caret line="294" column="0" lean-forward="false" selection-start-line="294" selection-start-column="0" selection-end-line="294" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="541">
-          <caret line="364" column="28" lean-forward="true" selection-start-line="364" selection-start-column="28" selection-end-line="364" selection-end-column="28" />
-          <folding />
+        <state relative-caret-position="-1723">
+          <caret line="209" column="0" lean-forward="false" selection-start-line="209" selection-start-column="0" selection-end-line="209" selection-end-column="0" />
+          <folding>
+            <element signature="e#738#776#0" expanded="true" />
+          </folding>
        </state>
      </provider>
    </entry>
@@ -194,6 +194,9 @@ def transformer_encoder(encoder_input,
               broadcast_dims=residual_dropout_broadcast_dims)
           x = residual + x
           x = may_be_layernorm(x, hparams, after=True)
+    if hparams.normalize_before:
+      x = may_be_layernorm(x, hparams, before=True, name="norm_top")
   return x
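The two added lines give transformer.py the same switch the rest of this commit builds: with normalize_before off, every sublayer keeps the original post-norm order (sublayer, dropout, residual add, LayerNorm); with it on, LayerNorm moves in front of each sublayer and one extra "norm_top" LayerNorm is applied to the stack output just before return x. A minimal sketch of the two orderings; layer_norm, sublayer and dropout are illustrative stand-ins for the corresponding tensor2tensor ops, not the project's actual names:

    # Sketch only: the two residual-block orderings selected by normalize_before.
    # layer_norm, sublayer and dropout are stand-ins for the real t2t ops.
    def residual_block(x, sublayer, layer_norm, dropout, normalize_before):
      if normalize_before:
        # pre-norm: norm -> sublayer -> dropout -> residual add
        return x + dropout(sublayer(layer_norm(x)))
      # post-norm: sublayer -> dropout -> residual add -> norm
      return layer_norm(x + dropout(sublayer(x)))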
@@ -33,7 +33,6 @@ from tensor2tensor.models import common_hparams
 from tensor2tensor.models import common_layers
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
-from tensor2tensor.models import layer_history
 import tensorflow as tf
@@ -57,9 +56,6 @@ class TransformerDLA(t2t_model.T2TModel):
     (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder(
         targets, hparams)
-    def residual_fn(x, y, dropout_broadcast_dims=None):
-      return common_layers.layer_norm(x + common_layers.dropout_with_broadcast_dims(
-          y, 1.0 - hparams.residual_dropout, broadcast_dims=dropout_broadcast_dims))
     # encoder_input = tf.squeeze(encoder_input, 2)
     # decoder_input = tf.squeeze(decoder_input, 2)
@@ -68,16 +64,19 @@
     encoder_layer = layer_history.CreateLayerHistory(self._hparams, True, name="encoder")
-    encoder_output = transformer_encoder(encoder_input, residual_fn,
+    encoder_output = transformer_encoder(encoder_input,
                                          encoder_attention_bias, hparams, encoder_layer)
     decoder_layer = layer_history.CreateLayerHistory(self._hparams, False, name="decoder")
     decoder_output = transformer_decoder(
-        decoder_input, encoder_output, residual_fn, decoder_self_attention_bias,
+        decoder_input, encoder_output, decoder_self_attention_bias,
         encoder_attention_bias, hparams, decoder_layer)
     decoder_output = tf.expand_dims(decoder_output, 2)
     return decoder_output
@@ -132,8 +131,14 @@ def transformer_prepare_decoder(targets, hparams):
   return (decoder_input, decoder_self_attention_bias)
+def may_be_layernorm(input, hparams, before=False, after=False, name=None):
+  assert before ^ after
+  if after ^ hparams.normalize_before:
+    return common_layers.layer_norm(input, name=name)
+  else:
+    return input
 def transformer_encoder(encoder_input,
-                        residual_fn,
                         encoder_self_attention_bias,
                         hparams,
                         encoder_layer,
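may_be_layernorm is called at both ends of every sublayer, but the XOR against hparams.normalize_before makes exactly one of the two calls normalize: the after=True call fires in the post-norm configuration, the before=True call fires in the pre-norm one, so the tensor is never normalized twice. A standalone illustration of that selection logic (apply_norm is a hypothetical stand-in, with strings instead of tensors, not project code):

    # Illustration of the before/after XOR used by may_be_layernorm; not project code.
    def apply_norm(x, normalize_before, before=False, after=False):
      assert before ^ after              # each call site names exactly one position
      if after ^ normalize_before:       # post-norm: the "after" call fires; pre-norm: the "before" call fires
        return "LayerNorm(%s)" % x
      return x

    assert apply_norm("x", False, after=True) == "LayerNorm(x)"   # post-norm: normalize after the residual add
    assert apply_norm("x", True, before=True) == "LayerNorm(x)"   # pre-norm: normalize before the sublayer
    assert apply_norm("x", True, after=True) == "x"               # the other call becomes a no-op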
@@ -163,11 +168,15 @@ def transformer_encoder(encoder_input,
   # Summaries don't work in multi-problem setting yet.
   summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
   with tf.variable_scope(name):
-    for layer in xrange(hparams.num_hidden_layers):
+    if hparams.use_emb:
+      encoder_layer.add(x)
+    for layer in xrange(hparams.encoder_layers):
       with tf.variable_scope("layer_%d" % layer):
-        x = residual_fn(
-            x,
-            common_attention.multihead_attention(
+        #self-attention network
+        residual = x
+        x = may_be_layernorm(x, hparams, before=True)
+        x = common_attention.multihead_attention(
             x,
             None,
             encoder_self_attention_bias,
@@ -180,16 +189,30 @@ def transformer_encoder(encoder_input,
             max_relative_length=hparams.max_relative_length,
             dropout_broadcast_dims=attention_dropout_broadcast_dims,
             summaries=False,
-            name="encoder_self_attention"),
-            dropout_broadcast_dims=residual_dropout_broadcast_dims)
-        x = residual_fn(x, transformer_ffn_layer(x, hparams),
-                        dropout_broadcast_dims=residual_dropout_broadcast_dims)
+            name="encoder_self_attention")
+        x = common_layers.dropout_with_broadcast_dims(x,
+            1.0 - hparams.residual_dropout,
+            broadcast_dims=residual_dropout_broadcast_dims)
+        x = residual + x
+        x = may_be_layernorm(x, hparams, after=True)
+        # feed-forward network
+        residual = x
+        x = may_be_layernorm(x, hparams, before=True)
+        x = transformer_ffn_layer(x, hparams)
+        x = common_layers.dropout_with_broadcast_dims(x,
+            1.0 - hparams.residual_dropout,
+            broadcast_dims=residual_dropout_broadcast_dims)
+        x = residual + x
+        x = may_be_layernorm(x, hparams, after=True)
+    if hparams.normalize_before:
+      x = may_be_layernorm(x, hparams, before=True, name="norm_top")
   return x
 def transformer_decoder(decoder_input,
                         encoder_output,
-                        residual_fn,
                         decoder_self_attention_bias,
                         encoder_decoder_attention_bias,
                         hparams,
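With residual_fn gone, the dropout and the residual addition that it used to bundle are now written out inside each encoder layer, which is what lets the two may_be_layernorm calls bracket them. Per sublayer the new code is roughly the following paraphrase (shortened stand-in names, not a verbatim excerpt from the diff):

    # Paraphrase of one rewritten encoder sublayer; all names here are stand-ins.
    def sublayer_step(x, hparams, sublayer, dropout_with_broadcast_dims,
                      may_be_layernorm, broadcast_dims):
      residual = x
      x = may_be_layernorm(x, hparams, before=True)   # no-op unless normalize_before
      x = sublayer(x)                                 # self-attention or the ffn layer
      x = dropout_with_broadcast_dims(x, 1.0 - hparams.residual_dropout,
                                      broadcast_dims=broadcast_dims)
      x = residual + x                                # residual add, formerly inside residual_fn
      return may_be_layernorm(x, hparams, after=True) # no-op when normalize_before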
@@ -223,11 +246,12 @@ def transformer_decoder(decoder_input,
   # Summaries don't work in multi-problem setting yet.
   summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
   with tf.variable_scope(name):
-    for layer in xrange(hparams.num_hidden_layers):
+    for layer in xrange(hparams.decoder_layers):
       with tf.variable_scope("layer_%d" % layer):
-        x = residual_fn(
-            x,
-            common_attention.multihead_attention(
+        # self-attention network
+        residual = x
+        x = may_be_layernorm(x, hparams, before=True)
+        x = common_attention.multihead_attention(
             x,
             None,
             decoder_self_attention_bias,
@@ -240,11 +264,17 @@ def transformer_decoder(decoder_input,
             max_relative_length=hparams.max_relative_length,
             dropout_broadcast_dims=attention_dropout_broadcast_dims,
             summaries=False,
-            name="decoder_self_attention"),
-            dropout_broadcast_dims=residual_dropout_broadcast_dims)
-        x = residual_fn(
-            x,
-            common_attention.multihead_attention(
+            name="decoder_self_attention")
+        x = common_layers.dropout_with_broadcast_dims(x,
+            1.0 - hparams.residual_dropout,
+            broadcast_dims=residual_dropout_broadcast_dims)
+        x = residual + x
+        x = may_be_layernorm(x, hparams, after=True)
+        # encoder-decoder-attention network
+        residual = x
+        x = may_be_layernorm(x, hparams, before=True)
+        x = common_attention.multihead_attention(
             x,
             encoder_output,
             encoder_decoder_attention_bias,
@@ -255,10 +285,24 @@ def transformer_decoder(decoder_input,
             hparams.attention_dropout,
             dropout_broadcast_dims=attention_dropout_broadcast_dims,
             summaries=False,
-            name="encdec_attention"),
-            dropout_broadcast_dims=residual_dropout_broadcast_dims)
-        x = residual_fn(x, transformer_ffn_layer(x, hparams),
-                        dropout_broadcast_dims=residual_dropout_broadcast_dims)
+            name="encdec_attention")
+        x = common_layers.dropout_with_broadcast_dims(x,
+            1.0 - hparams.residual_dropout,
+            broadcast_dims=residual_dropout_broadcast_dims)
+        x = residual + x
+        x = may_be_layernorm(x, hparams, after=True)
+        # feed-forward network
+        residual = x
+        x = may_be_layernorm(x, hparams, before=True)
+        x = transformer_ffn_layer(x, hparams)
+        x = common_layers.dropout_with_broadcast_dims(x,
+            1.0 - hparams.residual_dropout,
+            broadcast_dims=residual_dropout_broadcast_dims)
+        x = residual + x
+        x = may_be_layernorm(x, hparams, after=True)
+    if hparams.normalize_before:
+      x = may_be_layernorm(x, hparams, before=True, name="norm_top")
   return x
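Each decoder layer now repeats the same residual/LayerNorm bracket for its three sublayers, in this order (the names are the ones that appear in the diff above):

    # Decoder layer under the new wrapping: three sublayers, each bracketed the same way.
    DECODER_SUBLAYERS = (
        "decoder_self_attention",   # masked self-attention over the target prefix
        "encdec_attention",         # attention over encoder_output
        "transformer_ffn_layer",    # position-wise feed-forward
    )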
@@ -332,7 +376,8 @@ def transformer_base():
   hparams.learning_rate = 0.1
   hparams.learning_rate_warmup_steps = 4000
   hparams.initializer_gain = 1.0
-  hparams.num_hidden_layers = 6
+  hparams.encoder_layers = 6
+  hparams.decoder_layers = 6
   hparams.initializer = "uniform_unit_scaling"
   hparams.weight_decay = 0.0
   hparams.optimizer_adam_beta1 = 0.9
@@ -370,6 +415,7 @@ def transformer_base():
   hparams.add_hparam("attention_dropout_broadcast_dims", "0,1") # batch, heads
   hparams.add_hparam("relu_dropout_broadcast_dims", "1") # length
   hparams.add_hparam("residual_dropout_broadcast_dims", "1") # length
+  hparams.add_hparam("normalize_before", False)
   return hparams
@@ -386,187 +432,49 @@ def transformer_big():
 @registry.register_hparams
-def transformer_big_single_gpu():
-  """HParams for transformer big model for single gpu."""
-  hparams = transformer_big()
-  hparams.residual_dropout = 0.1
-  hparams.learning_rate_warmup_steps = 16000
-  hparams.optimizer_adam_beta2 = 0.998
-  hparams.batching_mantissa_bits = 3
-  return hparams
-@registry.register_hparams
-def transformer_base_single_gpu():
-  """HParams for transformer base model for single gpu."""
+def transformer_before():
+  """HParams for transfomer big model on WMT."""
   hparams = transformer_base()
-  hparams.batch_size = 8192
-  hparams.learning_rate_warmup_steps = 16000
-  hparams.batching_mantissa_bits = 2
+  hparams.normalize_before = True
+  hparams.relu_dropout = 0.1
+  hparams.attention_dropout = 0.1
+  hparams.learning_rate = 0.2
+  hparams.learning_rate_warmup_steps = 8000
+  hparams.optimizer_adam_beta1 = 0.9
+  hparams.optimizer_adam_beta2 = 0.997
   return hparams
 @registry.register_hparams
-def transformer_parsing_base():
-  """Hparams for parsing on wsj only."""
-  hparams = transformer_base()
-  hparams.attention_dropout = 0.2
-  hparams.residual_dropout = 0.2
-  hparams.max_length = 512
-  hparams.learning_rate_warmup_steps = 16000
-  hparams.hidden_size = 1024
-  hparams.learning_rate = 0.05
-  hparams.shared_embedding_and_softmax_weights = int(False)
-  hparams.batching_mantissa_bits = 2
-  hparams.residual_dropout = 0.3
+def transformer_before_big():
+  """HParams for transfomer big model on WMT."""
+  hparams = transformer_before()
+  hparams.hidden_size = 1024
+  hparams.filter_size = 4096
+  hparams.num_heads = 16
   return hparams
 @registry.register_hparams
-def transformer_parsing_big():
-  """HParams for parsing on wsj semi-supervised."""
+def transformer_big_single_gpu():
+  """HParams for transformer big model for single gpu."""
   hparams = transformer_big()
-  hparams.max_length = 512
-  hparams.shared_source_target_embedding = int(False)
-  hparams.learning_rate_warmup_steps = 4000
   hparams.residual_dropout = 0.1
-  hparams.batch_size = 2048
-  hparams.learning_rate = 0.05
-  return hparams
-@registry.register_hparams
-def transformer_parsing_ice():
-  """Hparams for parsing Icelandic text."""
-  hparams = transformer_base_single_gpu()
-  hparams.batch_size = 4096
-  hparams.shared_embedding_and_softmax_weights = int(False)
-  return hparams
-@registry.register_hparams
-def transformer_tiny():
-  hparams = transformer_base()
-  hparams.hidden_size = 64
-  hparams.filter_size = 128
-  hparams.num_heads = 4
-  return hparams
-@registry.register_hparams
-def transformer_l2():
-  hparams = transformer_base()
-  hparams.num_hidden_layers = 2
-  return hparams
-@registry.register_hparams
-def transformer_l4():
-  hparams = transformer_base()
-  hparams.num_hidden_layers = 4
-  return hparams
-@registry.register_hparams
-def transformer_l8():
-  hparams = transformer_base()
-  hparams.num_hidden_layers = 8
-  return hparams
-@registry.register_hparams
-def transformer_h1():
-  hparams = transformer_base()
-  hparams.num_heads = 1
-  return hparams
-@registry.register_hparams
-def transformer_h4():
-  hparams = transformer_base()
-  hparams.num_heads = 4
-  return hparams
-@registry.register_hparams
-def transformer_h16():
-  hparams = transformer_base()
-  hparams.num_heads = 16
-  return hparams
-@registry.register_hparams
-def transformer_h32():
-  hparams = transformer_base()
-  hparams.num_heads = 32
-  return hparams
-@registry.register_hparams
-def transformer_k128():
-  hparams = transformer_base()
-  hparams.attention_key_channels = 128
-  return hparams
-@registry.register_hparams
-def transformer_k256():
-  hparams = transformer_base()
-  hparams.attention_key_channels = 256
-  return hparams
-@registry.register_hparams
-def transformer_ff1024():
-  hparams = transformer_base()
-  hparams.filter_size = 1024
-  return hparams
-@registry.register_hparams
-def transformer_ff4096():
-  hparams = transformer_base()
-  hparams.filter_size = 4096
-  return hparams
-@registry.register_hparams
-def transformer_dr0():
-  hparams = transformer_base()
-  hparams.residual_dropout = 0.0
-  return hparams
-@registry.register_hparams
-def transformer_dr2():
-  hparams = transformer_base()
-  hparams.residual_dropout = 0.2
-  return hparams
-@registry.register_hparams
-def transformer_ls0():
-  hparams = transformer_base()
-  hparams.label_smoothing = 0.0
-  return hparams
-@registry.register_hparams
-def transformer_ls2():
-  hparams = transformer_base()
-  hparams.label_smoothing = 0.2
-  return hparams
-@registry.register_hparams
-def transformer_hs256():
-  hparams = transformer_base()
-  hparams.hidden_size = 256
+  hparams.learning_rate_warmup_steps = 16000
+  hparams.optimizer_adam_beta2 = 0.998
+  hparams.batching_mantissa_bits = 3
   return hparams
 @registry.register_hparams
-def transformer_hs1024():
+def transformer_base_single_gpu():
+  """HParams for transformer base model for single gpu."""
   hparams = transformer_base()
-  hparams.hidden_size = 1024
+  hparams.batch_size = 8192
+  hparams.learning_rate_warmup_steps = 16000
+  hparams.batching_mantissa_bits = 2
   return hparams
@@ -598,36 +506,6 @@ def transformer_big_dr2():
 @registry.register_hparams
-def transformer_parameter_attention_a():
-  hparams = transformer_base()
-  hparams.ffn_layer = "parameter_attention"
-  hparams.filter_size = 1536
-  return hparams
-@registry.register_hparams
-def transformer_parameter_attention_b():
-  hparams = transformer_base()
-  hparams.ffn_layer = "parameter_attention"
-  hparams.filter_size = 512
-  hparams.parameter_attention_key_channels = 1024
-  hparams.parameter_attention_value_channels = 1024
-  hparams.num_heads = 16
-  return hparams
-@registry.register_ranged_hparams("transformer_big_single_gpu")
-def transformer_range1(rhp):
-  """Small range of hyperparameters."""
-  hparams = transformer_big_single_gpu()
-  common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp)
-  rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)
-  rhp.set_float("initializer_gain", 0.5, 2.0)
-  rhp.set_float("optimizer_adam_beta2", 0.97, 0.99)
-  rhp.set_float("weight_decay", 0.0, 2.0)
-@registry.register_hparams
 def transformer_base_ldcd():
   """Set of hyperparameters."""
   hparams = transformer_base()
@@ -663,14 +541,6 @@ def transformer_base_ldcd_n1():
   hparams.learning_rate_warmup_steps = 4000
   return hparams
-@registry.register_hparams
-def transformer_base_nosmooth_dropout1():
-  """Set of hyperparameters."""
-  hparams = transformer_base()
-  hparams.label_smoothing = 0.0
-  hparams.relu_dropout = 0.1
-  hparams.attention_dropout = 0.1
-  return hparams
 @registry.register_hparams
 def transformer_base_amsgrad():
@@ -722,7 +592,6 @@ def transformer_base_ldrestart_n3():
 @registry.register_hparams
 def transformer_base_powersign():
   """Set of hyperparameters."""
@@ -817,15 +686,6 @@ def transformer_big_adafactor():
   hparams.optimizer_adafactor_beta2 = 0.997
   return hparams
-@registry.register_hparams
-def transformer_base_debug():
-  hparams = transformer_big_adafactor()
-  hparams.num_hidden_layers = 3
-  hparams.fused_inner_hidden = 128
-  hparams.hidden_size = 64
-  hparams.filter_size = 128
-  hparams.batch_size = 128
-  return hparams
 @registry.register_hparams
 def transformer_base_v2():
@@ -848,7 +708,7 @@ def transformer_base_rpr_dropout1():
 @registry.register_hparams
-def transformer_base_v2_filter4096():
+def transformer_base_v3():
   """Set of hyperparameters.
   set filter as 4096
   """
@@ -856,42 +716,7 @@ def transformer_base_v2_filter4096():
   hparams.filter_size = 4096
   return hparams
-@registry.register_hparams
-def transformer_base_filter4096():
-  """Set of hyperparameters.
-  set filter as 4096
-  """
-  hparams = transformer_base()
-  hparams.filter_size = 4096
-  return hparams
-@registry.register_hparams
-def transformer_base_v2_dropout2_filter4096():
-  """Set of hyperparameters.
-  set relu_dropout and attention_dropout as 0.2
-  """
-  hparams = transformer_base()
-  hparams.attention_dropout = 0.2
-  hparams.relu_dropout = 0.2
-  hparams.filter_size = 4096
-  return hparams
-@registry.register_hparams
-def transformer_base_lr2():
-  """Set of hyperparameters.
-  set relu_dropout and attention_dropout as 0.2
-  """
-  hparams = transformer_base()
-  hparams.learning_rate = 0.2
-  return hparams
-@registry.register_hparams
-def transformer_base_multistep2():
-  # new model use optimizer MultistepAdam
-  hparams = transformer_base()
-  hparams.optimizer = "MultistepAdam"
-  hparams.optimizer_multistep_accumulate_steps = 2
-  return hparams
 @registry.register_hparams
 def transformer_big_multistep2():
@@ -914,14 +739,3 @@ def transformer_big_adafactor_test():
   hparams.optimizer_adafactor_beta2 = 0.999
   return hparams
-@registry.register_hparams
-def transformer_mobile():
-  # new model use optimizer MultistepAdam
-  hparams = transformer_base()
-  hparams.hidden_size = 256
-  hparams.num_hidden_layers = 4
-  hparams.residual_dropout = 0.1
-  hparams.dropout = 0.1
-  return hparams