add fast decoding

8ab393f2 · libei · 81667eab · 8ab393f2 · 8ab393f2 · 8ab393f2
Commit 8ab393f2 authored Feb 21, 2019 by libei
--- a/.idea/WMT19.iml
+++ b/.idea/WMT19.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
+    <orderEntry type="jdk" jdkName="Python 3.6 (1)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">

--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
+<component name="ProjectCodeStyleConfiguration">
+  <state>
+    <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
+  </state>
+</component>
\ No newline at end of file
--- a/.idea/deployment.xml
+++ b/.idea/deployment.xml
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="PublishConfigData" autoUpload="Always" serverName="39.104.93.174">
+  <component name="PublishConfigData" autoUpload="Always" serverName="39.104.62.93">
    <serverData>
+      <paths name="39.104.62.93">
+        <serverdata>
+          <mappings>
+            <mapping deploy="/WMT19" local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
      <paths name="39.104.93.174">
        <serverdata>
          <mappings>

--- a/.idea/dictionaries/LiBei.xml
+++ b/.idea/dictionaries/LiBei.xml
+<component name="ProjectDictionaryState">
+  <dictionary name="LiBei" />
+</component>
\ No newline at end of file
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (1)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
--- a/.idea/webServers.xml
+++ b/.idea/webServers.xml
@@ -10,6 +10,14 @@
          <option name="port" value="22" />
        </fileTransfer>
      </webServer>
+      <webServer id="73ec0a32-7aa0-46b4-9d59-59f5be3898b1" name="39.104.62.93" url="http://39.104.62.93">
+        <fileTransfer host="39.104.62.93" port="22" rootFolder="/wmt/libei" accessType="SFTP">
+          <advancedOptions>
+            <advancedOptions dataProtectionLevel="Private" />
+          </advancedOptions>
+          <option name="port" value="22" />
+        </fileTransfer>
+      </webServer>
    </option>
  </component>
 </project>
\ No newline at end of file
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,9 +2,21 @@
 <project version="4">
  <component name="ChangeListManager">
    <list default="true" id="7d6d9926-f879-4708-ad8e-442bac96b62a" name="Default" comment="">
+      <change beforePath="" afterPath="$PROJECT_DIR$/.idea/codeStyles/codeStyleConfig.xml" />
+      <change beforePath="" afterPath="$PROJECT_DIR$/.idea/dictionaries/LiBei.xml" />
+      <change beforePath="" afterPath="$PROJECT_DIR$/.idea/misc.xml" />
+      <change beforePath="" afterPath="$PROJECT_DIR$/tensor2tensor/utils/beam_search_slow.py" />
+      <change beforePath="$PROJECT_DIR$/.idea/WMT19.iml" afterPath="$PROJECT_DIR$/.idea/WMT19.iml" />
+      <change beforePath="$PROJECT_DIR$/.idea/deployment.xml" afterPath="$PROJECT_DIR$/.idea/deployment.xml" />
+      <change beforePath="$PROJECT_DIR$/.idea/webServers.xml" afterPath="$PROJECT_DIR$/.idea/webServers.xml" />
      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
-      <change beforePath="$PROJECT_DIR$/tensor2tensor/models/models.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/models.py" />
+      <change beforePath="$PROJECT_DIR$/tensor2tensor/models/common_attention.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/common_attention.py" />
-      <change beforePath="$PROJECT_DIR$/tensor2tensor/models/transformer_dropout.py" afterPath="" />
+      <change beforePath="$PROJECT_DIR$/tensor2tensor/models/common_layers.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/common_layers.py" />
+      <change beforePath="$PROJECT_DIR$/tensor2tensor/models/transformer.py" afterPath="$PROJECT_DIR$/tensor2tensor/models/transformer.py" />
+      <change beforePath="$PROJECT_DIR$/tensor2tensor/utils/beam_search.py" afterPath="$PROJECT_DIR$/tensor2tensor/utils/beam_search.py" />
+      <change beforePath="$PROJECT_DIR$/tensor2tensor/utils/beam_search_test.py" afterPath="$PROJECT_DIR$/tensor2tensor/utils/beam_search_test.py" />
+      <change beforePath="$PROJECT_DIR$/tensor2tensor/utils/modality.py" afterPath="$PROJECT_DIR$/tensor2tensor/utils/modality.py" />
+      <change beforePath="$PROJECT_DIR$/tensor2tensor/utils/t2t_model.py" afterPath="$PROJECT_DIR$/tensor2tensor/utils/t2t_model.py" />
    </list>
    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
    <option name="TRACKING_ENABLED" value="true" />
@@ -18,28 +30,108 @@
      <file leaf-file-name="transformer.py" pinned="false" current-in-tab="false">
        <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="162">
+            <state relative-caret-position="92">
-              <caret line="40" column="12" lean-forward="true" selection-start-line="40" selection-start-column="12" selection-end-line="40" selection-end-column="12" />
+              <caret line="130" column="2" lean-forward="false" selection-start-line="130" selection-start-column="2" selection-end-line="135" selection-end-column="30" />
+              <folding>
+                <element signature="e#738#776#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="problem_hparams.py" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/data_generators/problem_hparams.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="-54">
+              <caret line="463" column="58" lean-forward="false" selection-start-line="463" selection-start-column="57" selection-end-line="463" selection-end-column="58" />
              <folding />
            </state>
          </provider>
        </entry>
      </file>
-      <file leaf-file-name="transformer_dla.py" pinned="false" current-in-tab="false">
+      <file leaf-file-name="wmt.py" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/data_generators/wmt.py">
          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="216">
+            <state relative-caret-position="471">
-              <caret line="35" column="46" lean-forward="true" selection-start-line="35" selection-start-column="46" selection-end-line="35" selection-end-column="46" />
+              <caret line="433" column="0" lean-forward="false" selection-start-line="433" selection-start-column="0" selection-end-line="433" selection-end-column="0" />
              <folding />
            </state>
          </provider>
        </entry>
      </file>
-      <file leaf-file-name="models.py" pinned="false" current-in-tab="true">
+      <file leaf-file-name="common_attention.py" pinned="false" current-in-tab="false">
-        <entry file="file://$PROJECT_DIR$/tensor2tensor/models/models.py">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_attention.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="184">
+              <caret line="419" column="2" lean-forward="false" selection-start-line="419" selection-start-column="2" selection-end-line="470" selection-end-column="12" />
+              <folding />
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="beam_search.py" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/beam_search.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="180">
+              <caret line="354" column="60" lean-forward="false" selection-start-line="354" selection-start-column="40" selection-end-line="354" selection-end-column="60" />
+              <folding>
+                <element signature="e#658#696#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="trainer_utils.py" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="309">
+              <caret line="486" column="30" lean-forward="false" selection-start-line="486" selection-start-column="30" selection-end-line="486" selection-end-column="30" />
+              <folding>
+                <element signature="e#19415#19927#0" expanded="false" />
+                <element signature="e#22668#23415#0" expanded="false" />
+                <element signature="e#23535#23889#0" expanded="false" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="beam_search_slow.py" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/beam_search_slow.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="406">
+              <caret line="36" column="69" lean-forward="false" selection-start-line="36" selection-start-column="69" selection-end-line="36" selection-end-column="69" />
+              <folding />
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="t2t_model.py" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/t2t_model.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="440">
+              <caret line="507" column="0" lean-forward="false" selection-start-line="507" selection-start-column="0" selection-end-line="507" selection-end-column="0" />
+              <folding>
+                <element signature="e#618#656#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="modalities.py" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/models/modalities.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="487">
+              <caret line="90" column="43" lean-forward="false" selection-start-line="90" selection-start-column="43" selection-end-line="90" selection-end-column="43" />
+              <folding />
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="common_layers.py" pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_layers.py">
          <provider selected="true" editor-type-id="text-editor">
-            <state relative-caret-position="408">
+            <state relative-caret-position="-612">
-              <caret line="37" column="46" lean-forward="false" selection-start-line="37" selection-start-column="46" selection-end-line="37" selection-end-column="46" />
+              <caret line="1605" column="74" lean-forward="false" selection-start-line="1605" selection-start-column="74" selection-end-line="1605" selection-end-column="74" />
              <folding />
            </state>
          </provider>
@@ -64,6 +156,17 @@
      <find>assertEqual</find>
      <find>transformer_alt</find>
      <find>registry</find>
+      <find>shared_embedding_and_softmax_weights</find>
+      <find>self_attention_type</find>
+      <find>fast</find>
+      <find>partial_targets</find>
+      <find>comm</find>
+      <find>single</find>
+      <find>_decode_hparams</find>
+      <find>shape_list</find>
+      <find>initial</find>
+      <find>print</find>
+      <find>tf.Print</find>
    </findStrings>
    <dirStrings>
      <dir>C:\Users\LiBei\Desktop\WMT19</dir>
@@ -76,15 +179,19 @@
    <option name="CHANGED_PATHS">
      <list>
        <option value="$PROJECT_DIR$/tensor2tensor/models/layer_history.py" />
-        <option value="$PROJECT_DIR$/tensor2tensor/models/common_layers.py" />
        <option value="$PROJECT_DIR$/tensor2tensor/models/libei.py" />
        <option value="$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py" />
        <option value="$PROJECT_DIR$/tensor2tensor/models/common_hparams.py" />
-        <option value="$PROJECT_DIR$/tensor2tensor/models/transformer.py" />
        <option value="$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py" />
        <option value="$PROJECT_DIR$/tensor2tensor/models/transformer_libei.py" />
        <option value="$PROJECT_DIR$/tensor2tensor/models/__init__.py" />
        <option value="$PROJECT_DIR$/tensor2tensor/models/models.py" />
+        <option value="$PROJECT_DIR$/tensor2tensor/utils/beam_search.py" />
+        <option value="$PROJECT_DIR$/tensor2tensor/utils/modality.py" />
+        <option value="$PROJECT_DIR$/tensor2tensor/models/common_layers.py" />
+        <option value="$PROJECT_DIR$/tensor2tensor/models/transformer.py" />
+        <option value="$PROJECT_DIR$/tensor2tensor/models/common_attention.py" />
+        <option value="$PROJECT_DIR$/tensor2tensor/utils/t2t_model.py" />
      </list>
    </option>
  </component>
@@ -94,11 +201,26 @@
    <detection-done>true</detection-done>
    <sorting>DEFINITION_ORDER</sorting>
  </component>
-  <component name="ProjectFrameBounds" extendedState="6">
+  <component name="ProjectFrameBounds" extendedState="7">
-    <option name="x" value="22" />
+    <option name="x" value="6" />
-    <option name="y" value="5" />
    <option name="width" value="1909" />
-    <option name="height" value="1042" />
+    <option name="height" value="1047" />
+  </component>
+  <component name="ProjectInspectionProfilesVisibleTreeState">
+    <entry key="Project Default">
+      <profile-state>
+        <expanded-state>
+          <State>
+            <id />
+          </State>
+        </expanded-state>
+        <selected-state>
+          <State>
+            <id>AngularJS</id>
+          </State>
+        </selected-state>
+      </profile-state>
+    </entry>
  </component>
  <component name="ProjectLevelVcsManager" settingsEditedManually="true">
    <ConfirmationsSetting value="2" id="Add" />
@@ -134,6 +256,12 @@
              <item name="WMT19" type="b2602c69:ProjectViewProjectNode" />
              <item name="WMT19" type="462c0819:PsiDirectoryNode" />
              <item name="tensor2tensor" type="462c0819:PsiDirectoryNode" />
+              <item name="docs" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="WMT19" type="b2602c69:ProjectViewProjectNode" />
+              <item name="WMT19" type="462c0819:PsiDirectoryNode" />
+              <item name="tensor2tensor" type="462c0819:PsiDirectoryNode" />
              <item name="models" type="462c0819:PsiDirectoryNode" />
            </path>
            <path>
@@ -154,10 +282,11 @@
    <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
    <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
    <property name="WebServerToolWindowFactoryState" value="false" />
-    <property name="last_opened_file_path" value="$PROJECT_DIR$/../DeepTransformer-v4" />
+    <property name="last_opened_file_path" value="D:/NMT/tensor2tensor-1.6.5" />
  </component>
  <component name="RecentsManager">
    <key name="CopyFile.RECENT_KEYS">
+      <recent name="C:\Users\LiBei\Desktop\WMT19\tensor2tensor\utils" />
      <recent name="C:\Users\LiBei\Desktop\WMT19\tensor2tensor\models" />
    </key>
  </component>
@@ -190,17 +319,16 @@
    <servers />
  </component>
  <component name="ToolWindowManager">
-    <frame x="-8" y="-8" width="1936" height="1056" extended-state="6" />
+    <frame x="-8" y="-8" width="1936" height="1056" extended-state="7" />
-    <editor active="true" />
    <layout>
      <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="11" side_tool="false" content_ui="tabs" />
      <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="true" content_ui="tabs" />
-      <window_info id="File Transfer" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.20457019" sideWeight="0.5" order="12" side_tool="false" content_ui="tabs" />
+      <window_info id="File Transfer" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.25680086" sideWeight="0.5" order="12" side_tool="false" content_ui="tabs" />
      <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
      <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
      <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
-      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.16044776" sideWeight="0.5" order="1" side_tool="false" content_ui="combo" />
+      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.2563966" sideWeight="0.5" order="1" side_tool="false" content_ui="combo" />
      <window_info id="Docker" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
      <window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
      <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
@@ -231,7 +359,9 @@
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="0">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
-          <folding />
+          <folding>
+            <element signature="e#738#776#0" expanded="true" />
+          </folding>
        </state>
      </provider>
    </entry>
@@ -247,7 +377,9 @@
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="2484">
          <caret line="92" column="43" lean-forward="false" selection-start-line="92" selection-start-column="43" selection-end-line="92" selection-end-column="43" />
-          <folding />
+          <folding>
+            <element signature="e#738#776#0" expanded="false" />
+          </folding>
        </state>
      </provider>
    </entry>
@@ -292,13 +424,6 @@
        </state>
      </provider>
    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/beam_search.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="411">
-          <caret line="413" column="0" lean-forward="true" selection-start-line="413" selection-start-column="0" selection-end-line="413" selection-end-column="0" />
-        </state>
-      </provider>
-    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/multistep_optimizer.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="459">
@@ -308,14 +433,6 @@
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/libei.py" />
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_hparams.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="540">
-          <caret line="30" column="11" lean-forward="false" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
-          <folding />
-        </state>
-      </provider>
-    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/layer_history.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="378">
@@ -326,22 +443,6 @@
        </state>
      </provider>
    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_layers.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="298">
-          <caret line="428" column="9" lean-forward="false" selection-start-line="428" selection-start-column="9" selection-end-line="428" selection-end-column="9" />
-          <folding />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_attention.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="-162">
-          <caret line="781" column="33" lean-forward="true" selection-start-line="781" selection-start-column="33" selection-end-line="782" selection-end-column="28" />
-          <folding />
-        </state>
-      </provider>
-    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_relative_pos.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="378">
@@ -350,14 +451,6 @@
        </state>
      </provider>
    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/modality.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="207">
-          <caret line="47" column="38" lean-forward="false" selection-start-line="47" selection-start-column="38" selection-end-line="47" selection-end-column="38" />
-          <folding />
-        </state>
-      </provider>
-    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_test.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="356">
@@ -366,21 +459,6 @@
        </state>
      </provider>
    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="384">
-          <caret line="482" column="32" lean-forward="false" selection-start-line="482" selection-start-column="32" selection-end-line="482" selection-end-column="32" />
-          <folding>
-            <element signature="e#18286#18629#1" expanded="false" />
-            <element signature="e#18684#18904#0" expanded="false" />
-            <element signature="e#18909#18935#0" expanded="false" />
-            <element signature="e#19415#19927#0" expanded="false" />
-            <element signature="e#22668#23415#0" expanded="false" />
-            <element signature="e#23535#23889#0" expanded="false" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/lstm.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="-324">
@@ -405,14 +483,6 @@
        </state>
      </provider>
    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/t2t_model.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="207">
-          <caret line="125" column="31" lean-forward="false" selection-start-line="125" selection-start-column="31" selection-end-line="125" selection-end-column="31" />
-          <folding />
-        </state>
-      </provider>
-    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/registry.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="-2425">
@@ -453,14 +523,6 @@
        </state>
      </provider>
    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="162">
-          <caret line="40" column="12" lean-forward="true" selection-start-line="40" selection-start-column="12" selection-end-line="40" selection-end-column="12" />
-          <folding />
-        </state>
-      </provider>
-    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_alternative.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="302">
@@ -474,7 +536,7 @@
        <state relative-caret-position="-972">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding>
-            <element signature="e#608#646#0" expanded="true" />
+            <element signature="e#608#646#0" expanded="false" />
          </folding>
        </state>
      </provider>
@@ -484,7 +546,7 @@
        <state relative-caret-position="567">
          <caret line="45" column="27" lean-forward="true" selection-start-line="45" selection-start-column="27" selection-end-line="45" selection-end-column="27" />
          <folding>
-            <element signature="e#719#757#0" expanded="true" />
+            <element signature="e#719#757#0" expanded="false" />
          </folding>
        </state>
      </provider>
@@ -497,18 +559,146 @@
        </state>
      </provider>
    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/models.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="516">
+          <caret line="41" column="0" lean-forward="true" selection-start-line="41" selection-start-column="0" selection-end-line="41" selection-end-column="0" />
+          <folding>
+            <element signature="e#654#692#0" expanded="false" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/expert_utils.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="181">
+          <caret line="158" column="10" lean-forward="true" selection-start-line="158" selection-start-column="10" selection-end-line="158" selection-end-column="10" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/modalities_test.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="235">
+          <caret line="49" column="25" lean-forward="false" selection-start-line="49" selection-start-column="25" selection-end-line="49" selection-end-column="25" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer_dla.py">
      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="216">
+        <state relative-caret-position="541">
-          <caret line="35" column="46" lean-forward="true" selection-start-line="35" selection-start-column="46" selection-end-line="35" selection-end-column="46" />
+          <caret line="110" column="47" lean-forward="false" selection-start-line="110" selection-start-column="47" selection-end-line="110" selection-end-column="47" />
+          <folding>
+            <element signature="e#738#776#0" expanded="false" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_hparams.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="540">
+          <caret line="30" column="11" lean-forward="false" selection-start-line="30" selection-start-column="11" selection-end-line="30" selection-end-column="11" />
          <folding />
        </state>
      </provider>
    </entry>
-    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/models.py">
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/modality.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="-107">
+          <caret line="69" column="6" lean-forward="false" selection-start-line="69" selection-start-column="6" selection-end-line="69" selection-end-column="6" />
+          <folding>
+            <element signature="e#2651#2689#0" expanded="false" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/trainer_utils.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="309">
+          <caret line="486" column="30" lean-forward="false" selection-start-line="486" selection-start-column="30" selection-end-line="486" selection-end-column="30" />
+          <folding>
+            <element signature="e#19415#19927#0" expanded="false" />
+            <element signature="e#22668#23415#0" expanded="false" />
+            <element signature="e#23535#23889#0" expanded="false" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/beam_search_slow.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="406">
+          <caret line="36" column="69" lean-forward="false" selection-start-line="36" selection-start-column="69" selection-end-line="36" selection-end-column="69" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/modalities.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="487">
+          <caret line="90" column="43" lean-forward="false" selection-start-line="90" selection-start-column="43" selection-end-line="90" selection-end-column="43" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/beam_search.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="180">
+          <caret line="354" column="60" lean-forward="false" selection-start-line="354" selection-start-column="40" selection-end-line="354" selection-end-column="60" />
+          <folding>
+            <element signature="e#658#696#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_attention.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="184">
+          <caret line="419" column="2" lean-forward="false" selection-start-line="419" selection-start-column="2" selection-end-line="470" selection-end-column="12" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/utils/t2t_model.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="440">
+          <caret line="507" column="0" lean-forward="false" selection-start-line="507" selection-start-column="0" selection-end-line="507" selection-end-column="0" />
+          <folding>
+            <element signature="e#618#656#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/transformer.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="92">
+          <caret line="130" column="2" lean-forward="false" selection-start-line="130" selection-start-column="2" selection-end-line="135" selection-end-column="30" />
+          <folding>
+            <element signature="e#738#776#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/data_generators/problem_hparams.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="-54">
+          <caret line="463" column="58" lean-forward="false" selection-start-line="463" selection-start-column="57" selection-end-line="463" selection-end-column="58" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/data_generators/wmt.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="471">
+          <caret line="433" column="0" lean-forward="false" selection-start-line="433" selection-start-column="0" selection-end-line="433" selection-end-column="0" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/tensor2tensor/models/common_layers.py">
      <provider selected="true" editor-type-id="text-editor">
-        <state relative-caret-position="408">
+        <state relative-caret-position="-612">
-          <caret line="37" column="46" lean-forward="false" selection-start-line="37" selection-start-column="46" selection-end-line="37" selection-end-column="46" />
+          <caret line="1605" column="74" lean-forward="false" selection-start-line="1605" selection-start-column="74" selection-end-line="1605" selection-end-column="74" />
          <folding />
        </state>
      </provider>

--- a/tensor2tensor/models/common_attention.py
+++ b/tensor2tensor/models/common_attention.py
@@ -27,6 +27,12 @@ import tensorflow as tf
 def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+  length = common_layers.shape_list(x)[1]
+  channels = common_layers.shape_list(x)[2]
+  signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+  return x + signal
+def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
  """Adds a bunch of sinusoids of different frequencies to a Tensor.
  Each channel of the input Tensor is incremented by a sinusoid of a different
@@ -54,8 +60,7 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
  Returns:
    a Tensor the same shape as x.
  """
-  length = tf.shape(x)[1]
-  channels = tf.shape(x)[2]
  position = tf.to_float(tf.range(length))
  num_timescales = channels // 2
  log_timescale_increment = (
@@ -67,7 +72,7 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
  signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
  signal = tf.reshape(signal, [1, length, channels])
-  return x + signal
+  return signal
 def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4):
@@ -204,7 +209,7 @@ def attention_bias_ignore_padding(memory_padding):
  return tf.expand_dims(tf.expand_dims(ret, 1), 1)
-def split_last_dimension(x, n):
+def split_last_dimension1(x, n):
  """Reshape x so that the last dimension becomes two dimensions.
  The first of these two dimensions is n.
@@ -223,6 +228,23 @@ def split_last_dimension(x, n):
  ret.set_shape(new_shape)
  return ret
+def split_last_dimension(x, n):
+  """Splits the trailing axis of `x` into a pair of axes `[n, m // n]`.
+  Args:
+    x: a Tensor with shape [..., m]
+    n: an integer, the size of the first of the two new axes.
+  Returns:
+    a Tensor with shape [..., n, m // n]
+  """
+  dims = common_layers.shape_list(x)
+  leading, last = dims[:-1], dims[-1]
+  # Divisibility is only asserted when both sizes are plain Python ints.
+  both_static = isinstance(last, int) and isinstance(n, int)
+  if both_static:
+    assert last % n == 0
+  return tf.reshape(x, leading + [n, last // n])
 def combine_last_two_dimensions(x):
  """Reshape x so that the last two dimension become one.
@@ -409,21 +431,32 @@ def multihead_attention(query_antecedent,
      q, k, v = tf.split(
          combined, [total_key_depth, total_key_depth, total_value_depth],
          axis=2)
+      k = split_heads(k, num_heads)
+      v = split_heads(v, num_heads)
+      if cache is not None:
+        k = cache["k"] = tf.concat([cache["k"], k], axis=2)
+        v = cache["v"] = tf.concat([cache["v"], v], axis=2)
    else:
      q = common_layers.conv1d(
          query_antecedent, total_key_depth, 1, name="q_transform")
+      if cache is not None:
+        k = cache["k_encdec"]
+        v = cache["v_encdec"]
+      else:
        combined = common_layers.conv1d(
          memory_antecedent,
          total_key_depth + total_value_depth,
          1,
          name="kv_transform")
        k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2)
-    q = split_heads(q, num_heads)
        k = split_heads(k, num_heads)
        v = split_heads(v, num_heads)
+    q = split_heads(q, num_heads)
    key_depth_per_head = total_key_depth // num_heads
    q *= key_depth_per_head**-0.5
    if attention_type == "dot_product":
        x = dot_product_attention(
            q, k, v, bias, dropout_rate, summaries, image_shapes, dropout_broadcast_dims=dropout_broadcast_dims)

--- a/tensor2tensor/models/common_layers.py
+++ b/tensor2tensor/models/common_layers.py
@@ -1669,3 +1669,6 @@ def Linear(input, output_dim, name, activation=None, bias=True):
      bias: a boolean to choose if use bias
    """
    return tf.layers.dense(input, output_dim, name=name, activation=activation, use_bias=bias)
+def log_prob_from_logits(logits, reduce_axis=-1):
+  """Normalizes `logits` into log-probabilities along `reduce_axis`.
+  Args:
+    logits: a Tensor of unnormalized scores.
+    reduce_axis: axis over which the log-sum-exp normalizer is computed.
+  Returns:
+    a Tensor with the same shape as `logits`.
+  """
+  log_normalizer = tf.reduce_logsumexp(
+      logits, axis=reduce_axis, keepdims=True)
+  return logits - log_normalizer
\ No newline at end of file
--- a/tensor2tensor/models/transformer.py
+++ b/tensor2tensor/models/transformer.py
@@ -33,6 +33,7 @@ from tensor2tensor.models import common_hparams
 from tensor2tensor.models import common_layers
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
+from tensor2tensor.utils import beam_search
 import tensorflow as tf
@@ -48,30 +49,374 @@ class Transformer(t2t_model.T2TModel):
    inputs = features.get("inputs")
    target_space = features.get("target_space_id")
-    inputs = common_layers.flatten4d3d(inputs)
+    encoder_output, encoder_attention_bias = self.encode(inputs,
+                                                         target_space,
+                                                         hparams)
    targets = common_layers.flatten4d3d(targets)
+    decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
+      targets, hparams)
+    decoder_output = self.decode(decoder_input,
+                                 encoder_output,
+                                 encoder_attention_bias,
+                                 decoder_self_attention_bias,
+                                 hparams)
+    decoder_output = tf.expand_dims(decoder_output, 2)
+    return decoder_output
+  def encode(self,
+             inputs,
+             target_space,
+             hparams,
+             features=None):
+    """Runs the encoder side of the model.
+    Flattens `inputs` with `common_layers.flatten4d3d`, prepares them with
+    `transformer_prepare_encoder`, applies residual dropout and then runs
+    `transformer_encoder`.
+    Args:
+      inputs: source-side input Tensor (flattened from 4-D to 3-D here).
+      target_space: forwarded unchanged to `transformer_prepare_encoder`.
+      hparams: hyperparameters; `residual_dropout` is read in this method.
+      features: not used by this method.
+    Returns:
+      A tuple `(encoder_output, encoder_attention_bias)`.
+    """
+    inputs = common_layers.flatten4d3d(inputs)
     (encoder_input, encoder_attention_bias, _) = (transformer_prepare_encoder(
         inputs, target_space, hparams))
-    (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder(
+    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout)
-        targets, hparams)
+    encoder_output = transformer_encoder(encoder_input,
+                                         encoder_attention_bias,
+                                         hparams)
+    return encoder_output, encoder_attention_bias
+  def decode(self,
+             decoder_input,
+             encoder_output,
+             encoder_attention_bias,
+             decoder_self_attention_bias,
+             hparams,
+             cache=None):
+    """Runs the decoder side of the model.
+    Applies residual dropout to `decoder_input` and passes it, together with
+    the encoder results and attention biases, to `transformer_decoder`.
+    Args:
+      decoder_input: prepared decoder input Tensor.
+      encoder_output: output of `encode`.
+      encoder_attention_bias: bias for encoder-decoder attention.
+      decoder_self_attention_bias: bias for decoder self-attention.
+      hparams: hyperparameters; `residual_dropout` is read in this method.
+      cache: optional decoding cache, forwarded to `transformer_decoder`.
+    Returns:
+      The Tensor returned by `transformer_decoder`, without any extra
+      dimension added.
+    """
-    # encoder_input = tf.squeeze(encoder_input, 2)
-    # decoder_input = tf.squeeze(decoder_input, 2)
-    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout)
     decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout)
-    encoder_output = transformer_encoder(encoder_input,
-                                         encoder_attention_bias, hparams)
-    decoder_output = transformer_decoder(
+    decoder_output = transformer_decoder(decoder_input,
-        decoder_input, encoder_output, decoder_self_attention_bias,
+                                         encoder_output,
-        encoder_attention_bias, hparams)
+                                         decoder_self_attention_bias,
-    decoder_output = tf.expand_dims(decoder_output, 2)
+                                         encoder_attention_bias,
+                                         hparams,
+                                         cache=cache)
     return decoder_output
+  def _beam_decode(self, features, decode_length, beam_size, top_beams, last_position_only, alpha):
+    """Beam search decoding, delegated entirely to `_fast_decode`.
+    Args:
+      features: a map of string to `Tensor`
+      decode_length: an integer.  How many additional timesteps to decode.
+      beam_size: number of beams.
+      top_beams: an integer. How many of the beams to return.
+      last_position_only: accepted for interface compatibility; not used here.
+      alpha: Float that controls the length penalty. larger the alpha, stronger
+        the preference for longer translations.
+    Returns:
+      A dict of decoding results {
+          "outputs": integer `Tensor` of decoded ids of shape
+              [batch_size, <= decode_length] if beam_size == 1 or
+              [batch_size, top_beams, <= decode_length]
+          "scores": decoding log probs from the beam search,
+              None if using greedy decoding (beam_size=1)
+      }
+    """
+    # `last_position_only` is deliberately left out of the forwarded arguments.
+    decode_args = (features, decode_length, beam_size, top_beams, alpha)
+    return self._fast_decode(*decode_args)
+  def _fast_decode(self,
+                   features,
+                   decode_length,
+                   beam_size=1,
+                   top_beams=1,
+                   alpha=1.0):
+    """Fast decoding.
+    Implements both greedy and beam search decoding, uses beam search iff
+    beam_size > 1, otherwise beam search related arguments are ignored.
+    The encoder is run once up front; each decoding step then feeds only the
+    most recent symbol through `self.decode` together with a cache.
+    Args:
+      features: a map of string to model features. "inputs" and
+        "target_space_id" are read; "decode_length" is read if present.
+      decode_length: an integer.  How many additional timesteps to decode.
+        It is added to the source length to obtain the step limit.
+      beam_size: number of beams.
+      top_beams: an integer. How many of the beams to return.
+      alpha: Float that controls the length penalty. larger the alpha, stronger
+        the preference for longer translations.
+    Returns:
+      A dict of decoding results {
+          "outputs": integer `Tensor` of decoded ids of shape
+              [batch_size, <= decode_length] if beam_size == 1 or
+              [batch_size, top_beams, <= decode_length]
+          "scores": decoding log probs from the beam search,
+              None if using greedy decoding (beam_size=1)
+      }
+    Raises:
+      NotImplementedError: If there are multiple data shards.
+    """
+    if self._num_datashards != 1:
+      raise NotImplementedError("Fast decoding only supports a single shard.")
+    dp = self._data_parallelism
+    hparams = self._hparams
+    target_modality = self._problem_hparams.target_modality
+    inputs = features["inputs"]
+    # Step limit = source length + the "decode_length" feature when present,
+    # otherwise + the decode_length argument.
+    decode_length = (common_layers.shape_list(inputs)[1] + features.get(
+      "decode_length", decode_length))
+    # Insert a singleton axis at position 1, pad to rank 5 if necessary, then
+    # fold axes 0 and 1 back together so the modality sees a 4-D tensor.
+    inputs = tf.expand_dims(inputs, axis=1)
+    if len(inputs.shape) < 5:
+      inputs = tf.expand_dims(inputs, axis=4)
+    s = common_layers.shape_list(inputs)
+    batch_size = s[0]
+    inputs = tf.reshape(inputs, [s[0] * s[1], s[2], s[3], s[4]])
+    # _shard_features called to ensure that the variable names match
+    inputs = self._shard_features({"inputs": inputs})["inputs"]
+    input_modality = self._problem_hparams.input_modality["inputs"]
+    with tf.variable_scope(input_modality.name):
+      inputs = input_modality.bottom_sharded(inputs, dp)
+    with tf.variable_scope("body"):
+      encoder_output, encoder_decoder_attention_bias = dp(
+        self.encode,
+        inputs,
+        features["target_space_id"],
+        hparams)
+    # Only one shard is allowed (checked above), so take its single result.
+    # NOTE(review): presumably dp returns one entry per shard -- confirm.
+    encoder_output = encoder_output[0]
+    encoder_decoder_attention_bias = encoder_decoder_attention_bias[0]
+    # The timing signal is built once for decode_length + 1 positions and
+    # sliced per step inside preprocess_targets.
+    if hparams.pos == "timing":
+      timing_signal = common_attention.get_timing_signal_1d(
+          decode_length + 1, hparams.attention_key_channels or hparams.hidden_size)
+    def preprocess_targets(targets, i):
+      """Performs preprocessing steps on the targets to prepare for the decoder.
+      This includes:
+        - Embedding the ids.
+        - Flattening to 3D tensor.
+        - Optionally adding timing signals.
+      Args:
+        targets: inputs ids to the decoder. [batch_size, 1]
+        i: scalar, Step number of the decoding loop.
+      Returns:
+        Processed targets [batch_size, 1, hidden_dim]
+      """
+      # _shard_features called to ensure that the variable names match
+      targets = self._shard_features({"targets": targets})["targets"]
+      with tf.variable_scope(target_modality.name):
+        targets = target_modality.targets_bottom_sharded(targets, dp)[0]
+      targets = common_layers.flatten4d3d(targets)
+      # TODO(llion): Explain! Is this even needed?
+      # At step 0 the embedded target is replaced by zeros.
+      targets = tf.cond(
+          tf.equal(i, 0), lambda: tf.zeros_like(targets), lambda: targets)
+      if hparams.pos == "timing":
+        targets += timing_signal[:, i:i + 1]
+      return targets
+    decoder_self_attention_bias = (
+        common_attention.attention_bias_lower_triangle(decode_length))
+    def symbols_to_logits_fn(ids, i, cache):
+      """Go from ids to logits for next symbol."""
+      # Only the most recent symbol is fed to the decoder at each step.
+      ids = ids[:, -1:]
+      targets = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3)
+      targets = preprocess_targets(targets, i)
+      # Row i of the lower-triangular bias, limited to columns 0..i.
+      bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
+      with tf.variable_scope("body"):
+        body_outputs = dp(
+            self.decode,
+            targets,
+            cache.get("encoder_output"),
+            cache.get("encoder_decoder_attention_bias"),
+            bias,
+            hparams,
+            cache)
+      with tf.variable_scope(target_modality.name):
+        logits= target_modality.top_sharded_logits(body_outputs, targets, dp)[0]
+      # NOTE(review): tf.squeeze without `axis` removes every size-1 dimension,
+      # which would also drop a batch dimension of size 1 -- confirm intended.
+      ret = tf.squeeze(logits)
+      return ret, cache
+    ret = fast_decode(
+        encoder_output=encoder_output,
+        encoder_decoder_attention_bias=encoder_decoder_attention_bias,
+        symbols_to_logits_fn=symbols_to_logits_fn,
+        hparams=hparams,
+        decode_length=decode_length,
+        vocab_size=target_modality.top_dimensionality,
+        beam_size=beam_size,
+        top_beams=top_beams,
+        alpha=alpha,
+        batch_size=batch_size,
+        force_decode_length=False)
+    return ret
+def fast_decode(encoder_output,
+                encoder_decoder_attention_bias,
+                symbols_to_logits_fn,
+                hparams,
+                decode_length,
+                vocab_size,
+                beam_size=1,
+                top_beams=1,
+                alpha=1.0,
+                eos_id=beam_search.EOS_ID,
+                batch_size=None,
+                force_decode_length=False):
+  """Given encoder output and a symbols to logits function, does fast decoding.
+  Implements both greedy and beam search decoding, uses beam search iff
+  beam_size > 1, otherwise beam search related arguments are ignored.
+  A per-layer cache is built that holds the growing self-attention keys and
+  values and, when `encoder_output` is given, the encoder-decoder keys and
+  values computed once from it.
+  Args:
+    encoder_output: Output from encoder.
+    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
+      attention
+    symbols_to_logits_fn: Incremental decoding; function mapping triple
+      `(ids, step, cache)` to symbol logits.
+    hparams: run hyperparameters
+    decode_length: an integer.  Maximum number of decoding steps.
+    vocab_size: Output vocabulary size.
+    beam_size: number of beams.
+    top_beams: an integer. How many of the beams to return.
+    alpha: Float that controls the length penalty. larger the alpha, stronger
+      the preference for longer translations.
+    eos_id: End-of-sequence symbol in beam search.
+    batch_size: an integer scalar - must be passed if there is no input;
+      overridden by the leading dimension of `encoder_output` when given.
+    force_decode_length: bool, whether to force the full decode length, or if
+      False, stop when all beams hit eos_id.
+  Returns:
+      A dict of decoding results {
+          "outputs": integer `Tensor` of decoded ids of shape
+              [batch_size, <= decode_length] if top_beams == 1 or
+              [batch_size, top_beams, <= decode_length] otherwise
+          "scores": decoding log probs; summed per-step log probs of the
+              chosen ids when using greedy decoding (beam_size=1)
+      }
+  """
+  # Imported locally so that `nest`, needed for the while_loop shape
+  # invariants below, is always defined inside this function.
+  from tensorflow.python.util import nest  # pylint: disable=g-import-not-at-top
+  if encoder_output is not None:
+    batch_size = common_layers.shape_list(encoder_output)[0]
+  key_channels = hparams.attention_key_channels or hparams.hidden_size
+  value_channels = hparams.attention_value_channels or hparams.hidden_size
+  num_layers = hparams.decoder_layers
+  # Self-attention keys/values start with length 0 and grow by one per step.
+  cache = {
+      "layer_%d" % layer: {
+          "k":
+              common_attention.split_heads(
+                  tf.zeros([batch_size, 0, key_channels]), hparams.num_heads),
+          "v":
+              common_attention.split_heads(
+                  tf.zeros([batch_size, 0, value_channels]), hparams.num_heads)
+      } for layer in range(num_layers)
+  }
+  if encoder_output is not None:
+    for layer in range(num_layers):
+      layer_name = "layer_%d" % layer
+      with tf.variable_scope(
+          "body/decoder/%s/encdec_attention" % layer_name):
+        # Must match the "kv_transform" built by multihead_attention, whose
+        # width is key depth + value depth (not twice the key depth).
+        combined = common_layers.conv1d(
+          encoder_output,
+          key_channels + value_channels,
+          1,
+          name="kv_transform")
+        k_encdec, v_encdec = tf.split(
+            combined, [key_channels, value_channels], axis=2)
+        k_encdec = common_attention.split_heads(k_encdec, hparams.num_heads)
+        v_encdec = common_attention.split_heads(v_encdec, hparams.num_heads)
+      cache[layer_name]["k_encdec"] = k_encdec
+      cache[layer_name]["v_encdec"] = v_encdec
+    cache["encoder_output"] = encoder_output
+    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias
+  if beam_size > 1:  # Beam Search
+    initial_ids = tf.zeros([batch_size], dtype=tf.int32)
+    decoded_ids, scores = beam_search.beam_search(
+        symbols_to_logits_fn,
+        initial_ids,
+        beam_size,
+        decode_length,
+        vocab_size,
+        alpha,
+        states=cache,
+        eos_id=eos_id,
+        stop_early=(top_beams == 1))
+    # Position 0 of every beam holds the initial id, so it is sliced off.
+    if top_beams == 1:
+      decoded_ids = decoded_ids[:, 0, 1:]
+      scores = scores[:, 0]
+    else:
+      decoded_ids = decoded_ids[:, :top_beams, 1:]
+      scores = scores[:, :top_beams]
+  else:  # Greedy
+    def inner_loop(i, hit_eos, next_id, decoded_ids, cache, log_prob):
+      """One step of greedy decoding."""
+      logits, cache = symbols_to_logits_fn(next_id, i, cache)
+      log_probs = common_layers.log_prob_from_logits(logits)
+      temperature = (0.0 if hparams.sampling_method == "argmax" else
+                     hparams.sampling_temp)
+      next_id = common_layers.sample_with_temperature(logits, temperature)
+      hit_eos |= tf.equal(next_id, eos_id)
+      # Accumulate the log prob of the id chosen for each batch entry.
+      log_prob_indices = tf.stack(
+          [tf.range(tf.to_int64(batch_size)), next_id], axis=1)
+      log_prob += tf.gather_nd(log_probs, log_prob_indices)
+      next_id = tf.expand_dims(next_id, axis=1)
+      decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
+      return i + 1, hit_eos, next_id, decoded_ids, cache, log_prob
+    def is_not_finished(i, hit_eos, *_):
+      finished = i >= decode_length
+      if not force_decode_length:
+        finished |= tf.reduce_all(hit_eos)
+      return tf.logical_not(finished)
+    decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64)
+    hit_eos = tf.fill([batch_size], False)
+    next_id = tf.zeros([batch_size, 1], dtype=tf.int64)
+    initial_log_prob = tf.zeros([batch_size], dtype=tf.float32)
+    _, _, _, decoded_ids, _, log_prob = tf.while_loop(
+        is_not_finished,
+        inner_loop, [
+            tf.constant(0), hit_eos, next_id, decoded_ids, cache,
+            initial_log_prob
+        ],
+        shape_invariants=[
+            tf.TensorShape([]),
+            tf.TensorShape([None]),
+            tf.TensorShape([None, None]),
+            tf.TensorShape([None, None]),
+            # Cache tensors grow along their middle dims between iterations.
+            nest.map_structure(beam_search.get_state_shape_invariants, cache),
+            tf.TensorShape([None]),
+        ])
+    scores = log_prob
+  return {"outputs": decoded_ids, "scores": scores}
 def transformer_prepare_encoder(inputs, target_space, hparams):
  """Prepare one shard of the model for the encoder.
@@ -205,6 +550,7 @@ def transformer_decoder(decoder_input,
                        decoder_self_attention_bias,
                        encoder_decoder_attention_bias,
                        hparams,
+                        cache=None,
                        name="decoder"):
  """A stack of transformer layers.
@@ -234,7 +580,10 @@ def transformer_decoder(decoder_input,
  # Summaries don't work in multi-problem setting yet.
  summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
  with tf.variable_scope(name):
    for layer in xrange(hparams.decoder_layers):
+      layer_name = "layer_%d" % layer
+      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope("layer_%d" % layer):
        # self-attention network
        residual = x
@@ -251,6 +600,7 @@ def transformer_decoder(decoder_input,
                attention_type=hparams.attention_type,
                max_relative_length=hparams.max_relative_length,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
+                cache=layer_cache,
                summaries=False,
                name="decoder_self_attention")
        x = common_layers.dropout_with_broadcast_dims(x,
@@ -272,6 +622,7 @@ def transformer_decoder(decoder_input,
                hparams.num_heads,
                hparams.attention_dropout,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
+                cache=layer_cache,
                summaries=False,
                name="encdec_attention")
        x = common_layers.dropout_with_broadcast_dims(x,

--- a/tensor2tensor/utils/__pycache__/beam_search.cpython-35.pyc
+++ b/tensor2tensor/utils/__pycache__/beam_search.cpython-35.pyc
--- a/tensor2tensor/utils/__pycache__/beam_search.cpython-36.pyc
+++ b/tensor2tensor/utils/__pycache__/beam_search.cpython-36.pyc
--- a/tensor2tensor/utils/beam_search.py
+++ b/tensor2tensor/utils/beam_search.py
-# Copyright 2017 The Tensor2Tensor Authors.
+# coding=utf-8
+# Copyright 2018 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,28 +12,81 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Implementation of beam search with penalties."""
-"""Implemetation of beam seach with penalties."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from tensor2tensor.models import  common_layers
-# Dependency imports
 import tensorflow as tf
+from tensorflow.python.util import nest
 # Assuming EOS_ID is 1
 EOS_ID = 1
 # Default value for INF
 INF = 1. * 1e7
-def log_prob_from_logits(logits):
+def _merge_beam_dim(tensor):
-  return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
+  """Collapses the leading two axes of `tensor` into a single axis.
+  Args:
+    tensor: Tensor to reshape of shape [A, B, ...]
+  Returns:
+    Reshaped tensor of shape [A*B, ...]
+  """
+  dims = common_layers.shape_list(tensor)
+  # [A, B, rest...] -> [A * B, rest...]
+  merged_dims = [dims[0] * dims[1]] + dims[2:]
+  return tf.reshape(tensor, merged_dims)
+def _unmerge_beam_dim(tensor, batch_size, beam_size):
+  """Splits the leading axis of `tensor` back into batch and beam axes.
+  Args:
+    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
+    batch_size: Tensor, original batch size.
+    beam_size: int, original beam size.
+  Returns:
+    Reshaped tensor of shape [batch_size, beam_size, ...]
+  """
+  trailing_dims = common_layers.shape_list(tensor)[1:]
+  return tf.reshape(tensor, [batch_size, beam_size] + trailing_dims)
+def _expand_to_beam_size(tensor, beam_size):
+  """Repeats `tensor` `beam_size` times along a new axis at position 1.
+  Args:
+    tensor: tensor to tile [batch_size, ...]
+    beam_size: How much to tile the tensor by.
+  Returns:
+    Tiled tensor [batch_size, beam_size, ...]
+  """
+  expanded = tf.expand_dims(tensor, axis=1)
+  # Tile only the freshly inserted axis; every other axis keeps multiple 1.
+  multiples = [beam_size if axis == 1 else 1
+               for axis in range(expanded.shape.ndims)]
+  return tf.tile(expanded, multiples)
+def get_state_shape_invariants(tensor):
+  """Returns the static shape of `tensor` with every middle dim set to None.
+  The first and last dimensions keep their static values.
+  """
+  dims = tensor.shape.as_list()
+  last_axis = len(dims) - 1
+  relaxed = [
+      dim if axis in (0, last_axis) else None for axis, dim in enumerate(dims)
+  ]
+  return tf.TensorShape(relaxed)
 def compute_batch_indices(batch_size, beam_size):
-  """Computes the i'th coodinate that contains the batch index for gathers.
+  """Computes the i'th coordinate that contains the batch index for gathers.
  Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. It says which
  batch the beam item is in. This will create the i of the i,j coordinate
@@ -50,13 +104,20 @@ def compute_batch_indices(batch_size, beam_size):
 def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
-                                beam_size, batch_size):
+                                beam_size, batch_size, prefix="default",
+                                states_to_gather=None):
  """Given sequences and scores, will gather the top k=beam size sequences.
  This function is used to grow alive, and finished. It takes sequences,
  scores, and flags, and returns the top k from sequences, scores_to_gather,
  and flags based on the values in scores.
+  This method permits easy introspection using tfdbg.  It adds three named ops
+  that are prefixed by `prefix`:
+    - _topk_seq: the tensor for topk_seq returned by this method.
+    - _topk_flags: the tensor for topk_finished_flags returned by this method.
+    - _topk_scores: the tensor for topk_gathered_scores returned by this method.
  Args:
    sequences: Tensor of sequences that we need to gather from.
      [batch_size, beam_size, seq_length]
@@ -66,11 +127,13 @@ def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
      [batch_size, beam_size]. We will return the gathered scores from here.
      Scores to gather is different from scores because for grow_alive, we will
      need to return log_probs, while for grow_finished, we will need to return
-      the length penalized scors.
+      the length penalized scores.
    flags: Tensor of bools for sequences that say whether a sequence has reached
      EOS or not
    beam_size: int
    batch_size: int
+    prefix: string that will prefix unique names for the ops run.
+    states_to_gather: dict (possibly nested) of decoding states.
  Returns:
    Tuple of
    (topk_seq [batch_size, beam_size, decode_length],
@@ -90,11 +153,20 @@ def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
  # last dimension contains the i,j gathering coordinates.
  top_coordinates = tf.stack([batch_pos, topk_indexes], axis=2)
-  # Gather up the highest scoring sequences
+  # Gather up the highest scoring sequences.  For each operation added, give it
-  topk_seq = tf.gather_nd(sequences, top_coordinates)
+  # a concrete name to simplify observing these operations with tfdbg.  Clients
-  topk_flags = tf.gather_nd(flags, top_coordinates)
+  # can capture these tensors by watching these node names.
-  topk_gathered_scores = tf.gather_nd(scores_to_gather, top_coordinates)
+  def gather(tensor, name):
-  return topk_seq, topk_gathered_scores, topk_flags
+    return tf.gather_nd(tensor, top_coordinates, name=(prefix + name))
+  topk_seq = gather(sequences, "_topk_seq")
+  topk_flags = gather(flags, "_topk_flags")
+  topk_gathered_scores = gather(scores_to_gather, "_topk_scores")
+  if states_to_gather:
+    topk_gathered_states = nest.map_structure(
+        lambda state: gather(state, "_topk_states"), states_to_gather)
+  else:
+    topk_gathered_states = states_to_gather
+  return topk_seq, topk_gathered_scores, topk_flags, topk_gathered_states
 def beam_search(symbols_to_logits_fn,
@@ -103,14 +175,35 @@ def beam_search(symbols_to_logits_fn,
                decode_length,
                vocab_size,
                alpha,
-                eos_id=EOS_ID):
+                states=None,
+                eos_id=EOS_ID,
+                stop_early=True):
  """Beam search with length penalties.
-  Uses an interface specific to the sequence cnn models;
+  Requires a function that can take the currently decoded symbols and return
-  Requires a function that can take the currently decoded sybmols and return
  the logits for the next symbol. The implementation is inspired by
  https://arxiv.org/abs/1609.08144.
+  When running, the beam search steps can be visualized by using tfdbg to watch
+  the operations generating the output ids for each beam step.  These operations
+  have the pattern:
+    (alive|finished)_topk_(seq,scores)
+  Operations marked `alive` represent the new beam sequences that will be
+  processed in the next step.  Operations marked `finished` represent the
+  completed beam sequences, which may be padded with 0s if no beams finished.
+  Operations marked `seq` store the full beam sequence for the time step.
+  Operations marked `scores` store the sequence's final log scores.
+  The beam search steps will be processed sequentially in order, so when
+  capturing tensors observed from these operations, clients can make
+  assumptions about which step is being recorded.
+  WARNING: Assumes 2nd dimension of tensors in `states` is not invariant, this
+  means that the shape of the 2nd dimension of these tensors will not be
+  available (i.e. set to None) inside symbols_to_logits_fn.
  Args:
    symbols_to_logits_fn: Interface to the model, to provide logits.
        Shoud take [batch_size, decoded_ids] and return [batch_size, vocab_size]
@@ -122,27 +215,34 @@ def beam_search(symbols_to_logits_fn,
    vocab_size: Size of the vocab, must equal the size of the logits returned by
        symbols_to_logits_fn
    alpha: alpha for length penalty.
+    states: dict (possibly nested) of decoding states.
    eos_id: ID for end of sentence.
+    stop_early: a boolean - stop once best sequence is provably determined.
  Returns:
    Tuple of
    (decoded beams [batch_size, beam_size, decode_length]
-     decoding probablities [batch_size, beam_size])
+     decoding probabilities [batch_size, beam_size])
  """
-  batch_size = tf.shape(initial_ids)[0]
+  batch_size = common_layers.shape_list(initial_ids)[0]
  # Assume initial_ids are prob 1.0
  initial_log_probs = tf.constant([[0.] + [-float("inf")] * (beam_size - 1)])
  # Expand to beam_size (batch_size, beam_size)
  alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
-  # Expand each batch to beam_size
+  # Expand each batch and state to beam_size
-  alive_seq = tf.tile(tf.expand_dims(initial_ids, 1), [1, beam_size])
+  alive_seq = _expand_to_beam_size(initial_ids, beam_size)
-  alive_seq = tf.expand_dims(alive_seq, 2)  # (batch_size, beam_size, 1)
+  alive_seq = tf.expand_dims(alive_seq, axis=2)  # (batch_size, beam_size, 1)
+  if states:
+    states = nest.map_structure(
+        lambda state: _expand_to_beam_size(state, beam_size), states)
+  else:
+    states = {}
  # Finished will keep track of all the sequences that have finished so far
  # Finished log probs will be negative infinity in the beginning
  # finished_flags will keep track of booleans
-  finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
+  finished_seq = tf.zeros(common_layers.shape_list(alive_seq), tf.int32)
  # Setting the scores of the initial to negative infinity.
  finished_scores = tf.ones([batch_size, beam_size]) * -INF
  finished_flags = tf.zeros([batch_size, beam_size], tf.bool)
@@ -184,9 +284,9 @@ def beam_search(symbols_to_logits_fn,
    curr_finished_flags = tf.concat([finished_flags, curr_finished], axis=1)
    return compute_topk_scores_and_seq(
        curr_finished_seq, curr_finished_scores, curr_finished_scores,
-        curr_finished_flags, beam_size, batch_size)
+        curr_finished_flags, beam_size, batch_size, "grow_finished")
-  def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished):
+  def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, states):
    """Given sequences and scores, will gather the top k=beam size sequences.
    Args:
@@ -197,6 +297,7 @@ def beam_search(symbols_to_logits_fn,
        [batch_size, beam_size]
      curr_finished: Finished flags for each of these sequences.
        [batch_size, beam_size]
+      states: dict (possibly nested) of decoding states.
    Returns:
      Tuple of
        (Topk sequences based on scores,
@@ -207,10 +308,11 @@ def beam_search(symbols_to_logits_fn,
    # values
    curr_scores += tf.to_float(curr_finished) * -INF
    return compute_topk_scores_and_seq(curr_seq, curr_scores, curr_log_probs,
-                                       curr_finished, beam_size, batch_size)
+                                       curr_finished, beam_size, batch_size,
+                                       "grow_alive", states)
-  def grow_topk(i, alive_seq, alive_log_probs):
+  def grow_topk(i, alive_seq, alive_log_probs, states):
-    r"""Inner beam seach loop.
+    r"""Inner beam search loop.
    This function takes the current alive sequences, and grows them to topk
    sequences where k = 2*beam. We use 2*beam because, we could have beam_size
@@ -226,36 +328,45 @@ def beam_search(symbols_to_logits_fn,
      i: loop index
      alive_seq: Topk sequences decoded so far [batch_size, beam_size, i+1]
      alive_log_probs: probabilities of these sequences. [batch_size, beam_size]
+      states: dict (possibly nested) of decoding states.
    Returns:
      Tuple of
        (Topk sequences extended by the next word,
         The log probs of these sequences,
         The scores with length penalty of these sequences,
-         Flags indicating which of these sequences have finished decoding)
+         Flags indicating which of these sequences have finished decoding,
+         dict of transformed decoding states)
    """
    # Get the logits for all the possible next symbols
    flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])
    # (batch_size * beam_size, decoded_length)
+    if states:
+      flat_states = nest.map_structure(_merge_beam_dim, states)
+      flat_logits, flat_states = symbols_to_logits_fn(flat_ids, i, flat_states)
+      states = nest.map_structure(
+          lambda t: _unmerge_beam_dim(t, batch_size, beam_size), flat_states)
+    else:
      flat_logits = symbols_to_logits_fn(flat_ids)
-    logits = tf.reshape(flat_logits, (batch_size, beam_size, -1))
+    logits = tf.reshape(flat_logits, [batch_size, beam_size, -1])
    # Convert logits to normalized log probs
-    candidate_log_probs = log_prob_from_logits(logits)
+    candidate_log_probs = common_layers.log_prob_from_logits(logits)
-    # Multiply the probabilites by the current probabilites of the beam.
+    # Multiply the probabilities by the current probabilities of the beam.
    # (batch_size, beam_size, vocab_size) + (batch_size, beam_size, 1)
    log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
    length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)
    curr_scores = log_probs / length_penalty
-    # Flatten out (beam_size, vocab_size) probs in to a list of possibilites
+    # Flatten out (beam_size, vocab_size) probs in to a list of possibilities
    flat_curr_scores = tf.reshape(curr_scores, [-1, beam_size * vocab_size])
    topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2)
-    # Recovering the log probs becuase we will need to send them back
+    # Recovering the log probs because we will need to send them back
    topk_log_probs = topk_scores * length_penalty
    # Work out what beam the top probs are in.
@@ -263,7 +374,7 @@ def beam_search(symbols_to_logits_fn,
    topk_ids %= vocab_size  # Unflatten the ids
    # The next three steps are to create coordinates for tf.gather_nd to pull
-    # out the correct seqences from id's that we need to grow.
+    # out the correct sequences from id's that we need to grow.
    # We will also use the coordinates to gather the booleans of the beam items
    # that survived.
    batch_pos = compute_batch_indices(batch_size, beam_size * 2)
@@ -276,17 +387,20 @@ def beam_search(symbols_to_logits_fn,
    # Gather up the most probable 2*beams both for the ids and finished_in_alive
    # bools
    topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
+    if states:
+      states = nest.map_structure(
+          lambda state: tf.gather_nd(state, topk_coordinates), states)
    # Append the most probable alive
    topk_seq = tf.concat([topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
    topk_finished = tf.equal(topk_ids, eos_id)
-    return topk_seq, topk_log_probs, topk_scores, topk_finished
+    return topk_seq, topk_log_probs, topk_scores, topk_finished, states
  def inner_loop(i, alive_seq, alive_log_probs, finished_seq, finished_scores,
-                 finished_flags):
+                 finished_flags, states):
-    """Inner beam seach loop.
+    """Inner beam search loop.
    There are three groups of tensors, alive, finished, and topk.
    The alive group contains information about the current alive sequences
@@ -317,6 +431,7 @@ def beam_search(symbols_to_logits_fn,
        [batch_size, beam_size]
      finished_flags: finished bools for each of these sequences.
        [batch_size, beam_size]
+      states: dict (possibly nested) of decoding states.
    Returns:
      Tuple of
@@ -325,30 +440,31 @@ def beam_search(symbols_to_logits_fn,
         Log probs of the alive sequences,
         New finished sequences,
         Scores of the new finished sequences,
-         Flags inidicating which sequence in finished as reached EOS)
+         Flags indicating which sequence in finished has reached EOS,
+         dict of final decoding states)
    """
    # Each inner loop, we carry out three steps:
    # 1. Get the current topk items.
    # 2. Extract the ones that have finished and haven't finished
    # 3. Recompute the contents of finished based on scores.
-    topk_seq, topk_log_probs, topk_scores, topk_finished = grow_topk(
+    topk_seq, topk_log_probs, topk_scores, topk_finished, states = grow_topk(
-        i, alive_seq, alive_log_probs)
+        i, alive_seq, alive_log_probs, states)
-    alive_seq, alive_log_probs, _ = grow_alive(topk_seq, topk_scores,
+    alive_seq, alive_log_probs, _, states = grow_alive(
-                                               topk_log_probs, topk_finished)
+        topk_seq, topk_scores, topk_log_probs, topk_finished, states)
-    finished_seq, finished_scores, finished_flags = grow_finished(
+    finished_seq, finished_scores, finished_flags, _ = grow_finished(
        finished_seq, finished_scores, finished_flags, topk_seq, topk_scores,
        topk_finished)
    return (i + 1, alive_seq, alive_log_probs, finished_seq, finished_scores,
-            finished_flags)
+            finished_flags, states)
  def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
-                   finished_scores, finished_in_finished):
+                   finished_scores, finished_in_finished, unused_states):
    """Checking termination condition.
    We terminate when we decoded up to decode_length or the lowest scoring item
-    in finished has a greater score that the higest prob item in alive divided
+    in finished has a greater score than the highest prob item in alive divided
    by the max length penalty
    Args:
@@ -362,41 +478,38 @@ def beam_search(symbols_to_logits_fn,
    Returns:
      Bool.
    """
+    if not stop_early:
+      return tf.less(i, decode_length)
    max_length_penalty = tf.pow(((5. + tf.to_float(decode_length)) / 6.), alpha)
-    # The best possible score of the most likley alive sequence
+    # The best possible score of the most likely alive sequence.
    lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty
    # Now to compute the lowest score of a finished sequence in finished
    # If the sequence isn't finished, we multiply it's score by 0. since
    # scores are all -ve, taking the min will give us the score of the lowest
    # finished item.
-    lowest_score_of_fininshed_in_finished = tf.reduce_min(
+    lowest_score_of_finished_in_finished = tf.reduce_min(
        finished_scores * tf.to_float(finished_in_finished), axis=1)
    # If none of the sequences have finished, then the min will be 0 and
    # we have to replace it by -ve INF if it is. The score of any seq in alive
    # will be much higher than -ve INF and the termination condition will not
    # be met.
-    lowest_score_of_fininshed_in_finished += (
+    lowest_score_of_finished_in_finished += (
        (1. - tf.to_float(tf.reduce_any(finished_in_finished, 1))) * -INF)
    bound_is_met = tf.reduce_all(
-        tf.greater(lowest_score_of_fininshed_in_finished,
+        tf.greater(lowest_score_of_finished_in_finished,
                   lower_bound_alive_scores))
    return tf.logical_and(
        tf.less(i, decode_length), tf.logical_not(bound_is_met))
-  """
-  (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
-  finished_flags) = inner_loop(tf.constant(0), alive_seq, alive_log_probs, finished_seq,
-           finished_scores, finished_flags)
-  """
  (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
-   finished_flags) = tf.while_loop(
+   finished_flags, _) = tf.while_loop(
       _is_finished,
       inner_loop, [
           tf.constant(0), alive_seq, alive_log_probs, finished_seq,
-           finished_scores, finished_flags
+           finished_scores, finished_flags, states
       ],
       shape_invariants=[
           tf.TensorShape([]),
@@ -404,7 +517,8 @@ def beam_search(symbols_to_logits_fn,
           alive_log_probs.get_shape(),
           tf.TensorShape([None, None, None]),
           finished_scores.get_shape(),
-           finished_flags.get_shape()
+           finished_flags.get_shape(),
+           nest.map_structure(get_state_shape_invariants, states),
       ],
       parallel_iterations=1,
       back_prop=False)

--- a/tensor2tensor/utils/beam_search_slow.py
+++ b/tensor2tensor/utils/beam_search_slow.py
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Implementation of beam search with penalties."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# Dependency imports
+import tensorflow as tf
+# Assuming EOS_ID is 1
+EOS_ID = 1
+# Default value for INF
+INF = 1. * 1e7
+def log_prob_from_logits(logits):
+  """Normalizes logits into log probabilities along axis 2.
+  Subtracts the log-sum-exp taken over axis 2 (kept as a size-1 dimension so
+  it broadcasts) from the logits.
+  Args:
+    logits: Tensor of unnormalized scores; must have an axis 2.
+  Returns:
+    Tensor with the same shape as logits.
+  """
+  return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
+def compute_batch_indices(batch_size, beam_size):
+  """Computes the i'th coordinate that contains the batch index for gathers.
+  Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. It says which
+  batch the beam item is in. This will create the i of the i,j coordinate
+  needed for the gather.
+  Args:
+    batch_size: Batch size
+    beam_size: Size of the beam.
+  Returns:
+    batch_pos: [batch_size, beam_size] tensor of ids
+  """
+  # Integer division maps each flat position to the batch row it belongs to.
+  batch_pos = tf.range(batch_size * beam_size) // beam_size
+  batch_pos = tf.reshape(batch_pos, [batch_size, beam_size])
+  return batch_pos
+def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
+                                beam_size, batch_size):
+  """Given sequences and scores, will gather the top k=beam size sequences.
+  This function is used to grow alive, and finished. It takes sequences,
+  scores, and flags, and returns the top k from sequences, scores_to_gather,
+  and flags based on the values in scores.
+  Args:
+    sequences: Tensor of sequences that we need to gather from.
+      [batch_size, beam_size, seq_length]
+    scores: Tensor of scores for each sequence in sequences.
+      [batch_size, beam_size]. We will use these to compute the topk.
+    scores_to_gather: Tensor of scores for each sequence in sequences.
+      [batch_size, beam_size]. We will return the gathered scores from here.
+      Scores to gather is different from scores because for grow_alive, we will
+      need to return log_probs, while for grow_finished, we will need to return
+      the length penalized scores.
+    flags: Tensor of bools for sequences that say whether a sequence has reached
+      EOS or not
+    beam_size: int
+    batch_size: int
+  Returns:
+    Tuple of
+    (topk_seq [batch_size, beam_size, decode_length],
+     topk_gathered_scores [batch_size, beam_size],
+     topk_finished_flags[batch_size, beam_size])
+  """
+  _, topk_indexes = tf.nn.top_k(scores, k=beam_size)
+  # The next three steps are to create coordinates for tf.gather_nd to pull
+  # out the topk sequences from sequences based on scores.
+  # batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. It says which
+  # batch the beam item is in. This will create the i of the i,j coordinate
+  # needed for the gather
+  batch_pos = compute_batch_indices(batch_size, beam_size)
+  # top coordinates will give us the actual coordinates to do the gather.
+  # stacking will create a tensor of dimension batch * beam * 2, where the
+  # last dimension contains the i,j gathering coordinates.
+  top_coordinates = tf.stack([batch_pos, topk_indexes], axis=2)
+  # Gather up the highest scoring sequences
+  topk_seq = tf.gather_nd(sequences, top_coordinates)
+  topk_flags = tf.gather_nd(flags, top_coordinates)
+  topk_gathered_scores = tf.gather_nd(scores_to_gather, top_coordinates)
+  return topk_seq, topk_gathered_scores, topk_flags
+def beam_search(symbols_to_logits_fn,
+                initial_ids,
+                beam_size,
+                decode_length,
+                vocab_size,
+                alpha,
+                eos_id=EOS_ID):
+  """Beam search with length penalties.
+  Uses an interface specific to the sequence cnn models;
+  Requires a function that can take the currently decoded symbols and return
+  the logits for the next symbol. The implementation is inspired by
+  https://arxiv.org/abs/1609.08144.
+  Args:
+    symbols_to_logits_fn: Interface to the model, to provide logits.
+        Should take [batch_size, decoded_ids] and return [batch_size, vocab_size]
+    initial_ids: Ids to start off the decoding, this will be the first thing
+        handed to symbols_to_logits_fn (after expanding to beam size)
+        [batch_size]
+    beam_size: Size of the beam.
+    decode_length: Number of steps to decode for.
+    vocab_size: Size of the vocab, must equal the size of the logits returned by
+        symbols_to_logits_fn
+    alpha: alpha for length penalty.
+    eos_id: ID for end of sentence.
+  Returns:
+    Tuple of
+    (decoded beams [batch_size, beam_size, decode_length]
+     decoding probabilities [batch_size, beam_size])
+  """
+  batch_size = tf.shape(initial_ids)[0]
+  # Assume initial_ids are prob 1.0
+  # Only the first beam starts at log prob 0; the others start at -inf.
+  initial_log_probs = tf.constant([[0.] + [-float("inf")] * (beam_size - 1)])
+  # Expand to beam_size (batch_size, beam_size)
+  alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
+  # Expand each batch to beam_size
+  alive_seq = tf.tile(tf.expand_dims(initial_ids, 1), [1, beam_size])
+  alive_seq = tf.expand_dims(alive_seq, 2)  # (batch_size, beam_size, 1)
+  # Finished will keep track of all the sequences that have finished so far
+  # Finished log probs will be negative infinity in the beginning
+  # finished_flags will keep track of booleans
+  finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
+  # Setting the scores of the initial to negative infinity.
+  finished_scores = tf.ones([batch_size, beam_size]) * -INF
+  finished_flags = tf.zeros([batch_size, beam_size], tf.bool)
+  def grow_finished(finished_seq, finished_scores, finished_flags, curr_seq,
+                    curr_scores, curr_finished):
+    """Given sequences and scores, will gather the top k=beam size sequences.
+    Args:
+      finished_seq: Current finished sequences.
+        [batch_size, beam_size, current_decoded_length]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_flags: finished bools for each of these sequences.
+        [batch_size, beam_size]
+      curr_seq: current topk sequence that has been grown by one position.
+        [batch_size, beam_size, current_decoded_length]
+      curr_scores: scores for each of these sequences. [batch_size, beam_size]
+      curr_finished: Finished flags for each of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+        (Topk sequences based on scores,
+         log probs of these sequences,
+         Finished flags of these sequences)
+    """
+    # First append a column of 0'ids to finished to make the same length with
+    # finished scores
+    finished_seq = tf.concat(
+        [finished_seq,
+         tf.zeros([batch_size, beam_size, 1], tf.int32)], axis=2)
+    # Set the scores of the unfinished seq in curr_seq to large negative
+    # values
+    curr_scores += (1. - tf.to_float(curr_finished)) * -INF
+    # concatenating the sequences and scores along beam axis
+    curr_finished_seq = tf.concat([finished_seq, curr_seq], axis=1)
+    curr_finished_scores = tf.concat([finished_scores, curr_scores], axis=1)
+    curr_finished_flags = tf.concat([finished_flags, curr_finished], axis=1)
+    return compute_topk_scores_and_seq(
+        curr_finished_seq, curr_finished_scores, curr_finished_scores,
+        curr_finished_flags, beam_size, batch_size)
+  def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished):
+    """Given sequences and scores, will gather the top k=beam size sequences.
+    Args:
+      curr_seq: current topk sequence that has been grown by one position.
+        [batch_size, beam_size, i+1]
+      curr_scores: scores for each of these sequences. [batch_size, beam_size]
+      curr_log_probs: log probs for each of these sequences.
+        [batch_size, beam_size]
+      curr_finished: Finished flags for each of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+        (Topk sequences based on scores,
+         log probs of these sequences,
+         Finished flags of these sequences)
+    """
+    # Set the scores of the finished seq in curr_seq to large negative
+    # values
+    curr_scores += tf.to_float(curr_finished) * -INF
+    return compute_topk_scores_and_seq(curr_seq, curr_scores, curr_log_probs,
+                                       curr_finished, beam_size, batch_size)
+  def grow_topk(i, alive_seq, alive_log_probs):
+    r"""Inner beam search loop.
+    This function takes the current alive sequences, and grows them to topk
+    sequences where k = 2*beam. We use 2*beam because, we could have beam_size
+    number of sequences that might hit <EOS> and there will be no alive
+    sequences to continue. With 2*beam_size, this will not happen. This relies
+    on the assumption the vocab size is > beam size. If this is true, we'll
+    have at least beam_size non <EOS> extensions if we extract the next top
+    2*beam words.
+    Length penalty is given by ((5 + len(decode)) / 6) ^ \alpha, and the scores
+    are the log probs divided by it. Pls refer to
+    https://arxiv.org/abs/1609.08144.
+    Args:
+      i: loop index
+      alive_seq: Topk sequences decoded so far [batch_size, beam_size, i+1]
+      alive_log_probs: probabilities of these sequences. [batch_size, beam_size]
+    Returns:
+      Tuple of
+        (Topk sequences extended by the next word,
+         The log probs of these sequences,
+         The scores with length penalty of these sequences,
+         Flags indicating which of these sequences have finished decoding)
+    """
+    # Get the logits for all the possible next symbols
+    flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])
+    # (batch_size * beam_size, decoded_length)
+    flat_logits = symbols_to_logits_fn(flat_ids)
+    logits = tf.reshape(flat_logits, (batch_size, beam_size, -1))
+    # Convert logits to normalized log probs
+    candidate_log_probs = log_prob_from_logits(logits)
+    # Multiply the probabilities by the current probabilities of the beam.
+    # (batch_size, beam_size, vocab_size) + (batch_size, beam_size, 1)
+    log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
+    length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)
+    curr_scores = log_probs / length_penalty
+    # Flatten out (beam_size, vocab_size) probs in to a list of possibilities
+    flat_curr_scores = tf.reshape(curr_scores, [-1, beam_size * vocab_size])
+    topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2)
+    # Recovering the log probs because we will need to send them back
+    topk_log_probs = topk_scores * length_penalty
+    # Work out what beam the top probs are in.
+    topk_beam_index = topk_ids // vocab_size
+    topk_ids %= vocab_size  # Unflatten the ids
+    # The next three steps are to create coordinates for tf.gather_nd to pull
+    # out the correct sequences from id's that we need to grow.
+    # We will also use the coordinates to gather the booleans of the beam items
+    # that survived.
+    batch_pos = compute_batch_indices(batch_size, beam_size * 2)
+    # top beams will give us the actual coordinates to do the gather.
+    # stacking will create a tensor of dimension batch * beam * 2, where the
+    # last dimension contains the i,j gathering coordinates.
+    topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)
+    # Gather up the most probable 2*beams both for the ids and finished_in_alive
+    # bools
+    topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
+    # Append the most probable alive
+    topk_seq = tf.concat([topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
+    topk_finished = tf.equal(topk_ids, eos_id)
+    return topk_seq, topk_log_probs, topk_scores, topk_finished
+  def inner_loop(i, alive_seq, alive_log_probs, finished_seq, finished_scores,
+                 finished_flags):
+    """Inner beam search loop.
+    There are three groups of tensors, alive, finished, and topk.
+    The alive group contains information about the current alive sequences
+    The topk group contains information about alive + topk current decoded words
+    the finished group contains information about finished sentences, that is,
+    the ones that have decoded to <EOS>. These are what we return.
+    The general beam search algorithm is as follows:
+    While we haven't terminated (pls look at termination condition)
+      1. Grow the current alive to get beam*2 topk sequences
+      2. Among the topk, keep the top beam_size ones that haven't reached EOS
+      into alive
+      3. Among the topk, keep the top beam_size ones have reached EOS into
+      finished
+    Repeat
+    To make things simple with using fixed size tensors, we will end
+    up inserting unfinished sequences into finished in the beginning. To stop
+    that we add -ve INF to the score of the unfinished sequence so that when a
+    true finished sequence does appear, it will have a higher score than all the
+    unfinished ones.
+    Args:
+      i: loop index
+      alive_seq: Topk sequences decoded so far [batch_size, beam_size, i+1]
+      alive_log_probs: probabilities of the beams. [batch_size, beam_size]
+      finished_seq: Current finished sequences.
+        [batch_size, beam_size, i+1]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_flags: finished bools for each of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+        (Incremented loop index
+         New alive sequences,
+         Log probs of the alive sequences,
+         New finished sequences,
+         Scores of the new finished sequences,
+         Flags indicating which sequence in finished has reached EOS)
+    """
+    # Each inner loop, we carry out three steps:
+    # 1. Get the current topk items.
+    # 2. Extract the ones that have finished and haven't finished
+    # 3. Recompute the contents of finished based on scores.
+    topk_seq, topk_log_probs, topk_scores, topk_finished = grow_topk(
+        i, alive_seq, alive_log_probs)
+    alive_seq, alive_log_probs, _ = grow_alive(topk_seq, topk_scores,
+                                               topk_log_probs, topk_finished)
+    finished_seq, finished_scores, finished_flags = grow_finished(
+        finished_seq, finished_scores, finished_flags, topk_seq, topk_scores,
+        topk_finished)
+    return (i + 1, alive_seq, alive_log_probs, finished_seq, finished_scores,
+            finished_flags)
+  def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
+                   finished_scores, finished_in_finished):
+    """Checking termination condition.
+    We terminate when we decoded up to decode_length or the lowest scoring item
+    in finished has a greater score than the highest prob item in alive divided
+    by the max length penalty
+    Args:
+      i: loop index
+      alive_log_probs: probabilities of the beams. [batch_size, beam_size]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_in_finished: finished bools for each of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Bool.
+    """
+    max_length_penalty = tf.pow(((5. + tf.to_float(decode_length)) / 6.), alpha)
+    # The best possible score of the most likely alive sequence
+    lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty
+    # Now to compute the lowest score of a finished sequence in finished
+    # If the sequence isn't finished, we multiply its score by 0. since
+    # scores are all -ve, taking the min will give us the score of the lowest
+    # finished item.
+    lowest_score_of_fininshed_in_finished = tf.reduce_min(
+        finished_scores * tf.to_float(finished_in_finished), axis=1)
+    # If none of the sequences have finished, then the min will be 0 and
+    # we have to replace it by -ve INF if it is. The score of any seq in alive
+    # will be much higher than -ve INF and the termination condition will not
+    # be met.
+    lowest_score_of_fininshed_in_finished += (
+        (1. - tf.to_float(tf.reduce_any(finished_in_finished, 1))) * -INF)
+    bound_is_met = tf.reduce_all(
+        tf.greater(lowest_score_of_fininshed_in_finished,
+                   lower_bound_alive_scores))
+    return tf.logical_and(
+        tf.less(i, decode_length), tf.logical_not(bound_is_met))
+  # NOTE(review): the string literal below is a disabled single-step call of
+  # inner_loop; it is a no-op expression statement, not a comment.
+  """
+  (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
+  finished_flags) = inner_loop(tf.constant(0), alive_seq, alive_log_probs, finished_seq,
+           finished_scores, finished_flags)
+  """
+  # The sequence tensors grow by one position per step, so their shape
+  # invariants are left fully unknown.
+  (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
+   finished_flags) = tf.while_loop(
+       _is_finished,
+       inner_loop, [
+           tf.constant(0), alive_seq, alive_log_probs, finished_seq,
+           finished_scores, finished_flags
+       ],
+       shape_invariants=[
+           tf.TensorShape([]),
+           tf.TensorShape([None, None, None]),
+           alive_log_probs.get_shape(),
+           tf.TensorShape([None, None, None]),
+           finished_scores.get_shape(),
+           finished_flags.get_shape()
+       ],
+       parallel_iterations=1,
+       back_prop=False)
+  alive_seq.set_shape((None, beam_size, None))
+  finished_seq.set_shape((None, beam_size, None))
+  # Accounting for corner case: It's possible that no sequence in alive for a
+  # particular batch item ever reached EOS. In that case, we should just copy
+  # the contents of alive for that batch item. tf.reduce_any(finished_flags, 1)
+  # if 0, means that no sequence for that batch index had reached EOS. We need
+  # to do the same for the scores as well.
+  finished_seq = tf.where(
+      tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
+  finished_scores = tf.where(
+      tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
+  return finished_seq, finished_scores
--- a/tensor2tensor/utils/beam_search_test.py
+++ b/tensor2tensor/utils/beam_search_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 # Dependency imports
 import numpy as np
-from tensor2tensor.utils import beam_search
+from tensor2tensor.utils import beam_search_slow
 import tensorflow as tf
@@ -40,7 +40,7 @@ class BeamSearchTest(tf.test.TestCase):
      # Just return random logits
      return tf.random_uniform((batch_size * beam_size, vocab_size))
-    final_ids, final_probs = beam_search.beam_search(
+    final_ids, final_probs = beam_search_slow.beam_search(
        symbols_to_logits, initial_ids, beam_size, decode_length, vocab_size,
        0.)
@@ -60,7 +60,7 @@ class BeamSearchTest(tf.test.TestCase):
    flags = tf.constant([[True, False, False, True],
                         [False, False, False, True]])
-    topk_seq, topk_scores, topk_flags = beam_search.compute_topk_scores_and_seq(
+    topk_seq, topk_scores, topk_flags = beam_search_slow.compute_topk_scores_and_seq(
        sequences, scores, scores, flags, beam_size, batch_size)
    with self.test_session():
@@ -115,7 +115,7 @@ class BeamSearchTest(tf.test.TestCase):
      logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
      return logits
-    final_ids, final_probs = beam_search.beam_search(
+    final_ids, final_probs = beam_search_slow.beam_search(
        symbols_to_logits,
        initial_ids,
        beam_size,
@@ -146,7 +146,7 @@ class BeamSearchTest(tf.test.TestCase):
      logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
      return logits
-    final_ids, final_probs = beam_search.beam_search(
+    final_ids, final_probs = beam_search_slow.beam_search(
        symbols_to_logits,
        initial_ids,
        beam_size,
@@ -175,7 +175,7 @@ class BeamSearchTest(tf.test.TestCase):
      logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
      return logits
-    final_ids, final_probs = beam_search.beam_search(
+    final_ids, final_probs = beam_search_slow.beam_search(
        symbols_to_logits,
        initial_ids,
        beam_size,
@@ -215,7 +215,7 @@ class BeamSearchTest(tf.test.TestCase):
      logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
      return logits
-    final_ids, final_scores = beam_search.beam_search(
+    final_ids, final_scores = beam_search_slow.beam_search(
        symbols_to_logits,
        initial_ids,
        beam_size,
@@ -258,7 +258,7 @@ class BeamSearchTest(tf.test.TestCase):
      return logits
    # Disable early stopping
-    final_ids, final_scores = beam_search.beam_search(
+    final_ids, final_scores = beam_search_slow.beam_search(
        symbols_to_logits,
        initial_ids,
        beam_size,

--- a/tensor2tensor/utils/modality.py
+++ b/tensor2tensor/utils/modality.py
@@ -156,3 +156,24 @@ class Modality(object):
        weights_fn=weights_fn)
    loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den))
    return sharded_logits, loss
+  def top_sharded_logits(self,
+                  sharded_body_output,
+                  sharded_targets,
+                  data_parallelism):
+    """Compute the logits for all shards, without computing a loss.
+    Classes with cross-shard interaction will override this function.
+    Args:
+      sharded_body_output: A list of Tensors.
+      sharded_targets: A list of Tensors.
+      data_parallelism: a expert_utils.Parallelism object.
+    Returns:
+      sharded_logits: the value returned by calling data_parallelism with
+        self.top, sharded_body_output and sharded_targets. No loss is
+        returned.
+    """
+    sharded_logits = data_parallelism(self.top, sharded_body_output,
+                                      sharded_targets)
+    return sharded_logits
--- a/tensor2tensor/utils/t2t_model.py
+++ b/tensor2tensor/utils/t2t_model.py
@@ -16,6 +16,7 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import numpy as np
 import copy
 import time
@@ -25,11 +26,11 @@ import time
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
-from tensor2tensor.utils import beam_search
+from tensor2tensor.utils import beam_search_slow
 from tensor2tensor.utils import expert_utils as eu
 from tensor2tensor.utils import modality
 from tensor2tensor.utils import registry
+from tensorflow.python.layers import base
 import tensorflow as tf
@@ -233,7 +234,7 @@ class T2TModel(object):
    vocab_size = target_modality.top_dimensionality
    # Setting decode length to input length + decode_length
    decode_length = tf.shape(features["inputs"])[1] + tf.constant(decode_length)
-    ids, scores = beam_search.beam_search(symbols_to_logits_fn, initial_ids,
+    ids, scores = beam_search_slow.beam_search(symbols_to_logits_fn, initial_ids,
                                               beam_size, decode_length, vocab_size,
                                               alpha)
@@ -490,6 +491,8 @@ class T2TModel(object):
    """
    raise NotImplementedError("Abstract Method")
  @property
  def hparams(self):
    return self._hparams