re-submit

65c39211 · xiaotong · 1764d5c6 · 65c39211 · 65c39211
Commit 65c39211 authored Nov 18, 2019 by xiaotong
--- a/Section06-Neural-Machine-Translation/section06-test.tex
+++ b/Section06-Neural-Machine-Translation/section06-test.tex
@@ -28,6 +28,7 @@

 \usetikzlibrary{calc,intersections}
 \usetikzlibrary{matrix}
+\usetikzlibrary{patterns}
 \usetikzlibrary{arrows,decorations.pathreplacing}
 \usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
 \usetikzlibrary{shadows.blur}
@@ -144,30 +145,170 @@
 \subsection{注意力机制}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 如何定义注意力函数
-\begin{frame}{计算注意力权重 - 注意力函数}
+%%% 解码 - beam search
+\begin{frame}{推断 - Beam Search}
 \begin{itemize}
-\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化：\\
-\vspace{-0.3em}
-\begin{displaymath}
-\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
-\end{displaymath}
-
-\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$和$h$之间的\alert{相似性}，这也可以被看作是目标语表示和源语言表示的一种``统一化''，即把源语言和目标语表示在同一个语义空间，进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式：}
-    
-    \visible<3->{
-    \begin{displaymath}
-    a(s,h) =  \left\{ \begin{array}{ll}
-    s h^T & \textrm{向量乘} \\
-    \textrm{cos}(s, h) & \textrm{向量夹角} \\
-    s \textbf{W} h^T & \textrm{线性模型} \\
-    \textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}
-    \end{array}
-    \right.
-    \end{displaymath}
-    $\textbf{W}$和$\textbf{v}$是可学习参数
-    }
+\item \textbf{Greedy Search}: 目标语每一个位置，输出层的Softmax可以得到所有单词的概率，然后选择一个概率最大的单词输出，下一个位置的预测就基于这一步输出的单词
+\item \textbf{Beam Search}: 为了避免贪婪方法造成的错误累加，可以每次对$b$个单词进行扩展，而不是只使用一个单词，其中$b$称做束的宽度 - 这样可以搜索更多可能的译文
 \end{itemize}
+
+\vspace{-0.3em}
+\visible<2->{
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
+\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];
+
+
+\visible<3->{
+\node [rnnnode,anchor=west,fill=green!20] (t1) at (0,0) {\tiny{$e_y()$}};
+}
+\visible<7->{
+\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=2.2em]t1.east) {\tiny{$e_y()$ ($\times 3$)}};
+}
+\visible<8->{
+\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=2.2em]t2.east) {\tiny{$e_y()$ ($\times 3$)}};
+\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\tiny{...}};
+}
+\visible<3->{
+\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
+}
+\visible<7->{
+\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$ ($\times 3$)}};
+}
+\visible<8->{
+\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$ ($\times 3$)}};
+\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\tiny{...}};
+}
+\visible<3->{
+\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
+}
+\visible<7->{
+\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax ($\times 3$)}};
+}
+\visible<8->{
+\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax ($\times 3$)}};
+\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
+}
+
+\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
+
+\visible<6->{
+\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
+\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\tiny{Have}};
+\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\tiny{Have}};
+}
+
+\visible<8->{
+\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
+\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\tiny{you}};
+\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\tiny{you}};
+}
+
+\visible<5->{
+\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\tiny{Have}};
+\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\tiny{Have}};
+\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\tiny{Have}};
+}
+
+\visible<8->{
+\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\tiny{you}};
+\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\tiny{you}};
+\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\tiny{you}};
+}
+
+\visible<8->{
+\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\tiny{learned}};
+\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\tiny{learned}};
+\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\tiny{learned}};
+}
+
+\visible<3->{
+\foreach \x in {1}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+   
+}
+}
+
+\visible<5->{
+ \draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top-3}};
+ }
+
+\visible<7->{
+\foreach \x in {2}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
+}
+}
+
+\visible<8->{
+\foreach \x in {3}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
+}
+}
+
+\visible<3->{
+\draw [->] ([xshift=-0.5em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left,inner sep=1pt] {\tiny{0}};
+}
+\visible<7->{
+\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
+}
+\visible<8->{
+\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
+}
+
+\visible<6->{
+\draw [->,very thick,dotted] (wo1.east) .. controls +(east:0.6) and +(west:0.8) ..(wt2copy2.west);
+}
+\visible<8->{
+\draw [->,very thick,dotted] (wo2.east) .. controls +(east:0.6) and +(west:0.8) ..(wt3copy2.west);
+}
+
+\visible<7->{
+\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\tiny{$\textbf{C}_2$}};
+\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\tiny{$\textbf{C}_2$}};
+\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\tiny{$\textbf{C}_2$}};
+\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
+\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
+}
+
+\visible<8->{
+\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\tiny{$\textbf{C}_3$}};
+\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\tiny{$\textbf{C}_3$}};
+\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\tiny{$\textbf{C}_3$}};
+\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
+\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
+}
+
+\visible<3->{
+\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
+\node [anchor=south] (vocablabel) at (vocab.north) {\tiny{单词的概率分布}};
+\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east); 
+}
+
+\visible<4->{
+\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\tiny{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
+\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) ..  ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\tiny{top-3}};
+
+\visible<4->{
+\node [anchor=north] (cap) at (vocab.south east) {\scriptsize{\textbf{束搜索($b=3$)}}};
+}
+}
+
+
+\end{scope}
+\end{tikzpicture}
+\end{center}
+}
+
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex
@@ -28,6 +28,7 @@

 \usetikzlibrary{calc,intersections}
 \usetikzlibrary{matrix}
+\usetikzlibrary{patterns}
 \usetikzlibrary{arrows,decorations.pathreplacing}
 \usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
 \usetikzlibrary{shadows.blur}
@@ -469,9 +470,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \begin{itemize}
 \item 一个简单的例子：基于循环神经网络的翻译过程
        \begin{itemize}
-        \item 顺序处理源语言单词
-        \item 源语言句子信息被表示在最后一个循环单元的输出中
-        \item 逐词生成目标语译文
+        \item<1-> \textbf{编码器}顺序处理源语言单词
+        \item<5-> 源语言句子信息被表示在最后一个循环单元的输出中
+        \item<6-> \textbf{解码器}利用源语言句子信息逐词生成目标语译文
        \end{itemize}
 \end{itemize}
 %%% 运行实例的图
@@ -480,7 +481,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
        \setlength{\base}{0.6cm}

        \tikzstyle{rnnnode} = [minimum size=\base,inner sep=0pt,rounded corners=1pt,draw]
-        \tikzstyle{wordnode} = [font=\normalsize]
+        \tikzstyle{wordnode} = [font=\normalsize,align=center]

        \begin{scope}
            \visible<1->{
@@ -504,63 +505,76 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
                \node[wordnode,below=0pt of emb3] (word3) {好};
                \draw[-latex'] (emb3.north) to (rnn3.south);
                \draw[-latex'] (rnn2.east) to (rnn3.west);
-
+                
                \node[rnnnode,fill=blue!30!white,right=\base of rnn3] (rnn4) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn4] (emb4) {};
-                \node[wordnode,below=0pt of emb4] (word4) {$\langle$eos$\rangle$};
+                \node[wordnode,below=0pt of emb4] (word4) {EOS};
                \draw[-latex'] (emb4.north) to (rnn4.south);
                \draw[-latex'] (rnn3.east) to (rnn4.west);
            }
-            \visible<4>{
-                \node[rnnnode,fill=purple] (repr) at (rnn4) {};
-                \node[wordnode,above=\base of repr] (label) {句子表示};
-                \draw[->,dashed] (label.south) to (rnn4.north);
+            \visible<4->{
+                \draw[decoration={mirror,brace},decorate] (word1.south west) to node [auto,anchor=north,align=center] {编码器} ([yshift=-0.2em]word4.south east);
            }
            \visible<5->{
-                \node[rnnnode,fill=red!30!white,above=\base of rnn4] (softmax1) {};
-                \node[wordnode,above=0pt of softmax1] (out1) {I};
-                \draw[-latex'] (rnn4.north) to (softmax1.south);
+                \node[rnnnode,fill=purple] (repr) at (rnn4) {};
+                \node[wordnode,above=\base of rnn2] (label) {源语言句子信息};
+                \draw[->,dashed,thick] (label.east) .. controls +(east:\base) and +(north:\base) .. (rnn4.north);
            }
            \visible<6->{
                \node[rnnnode,fill=blue!30!white,right=\base of rnn4] (rnn5) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn5] (emb5) {};
-                \node[rnnnode,fill=red!30!white,above=\base of rnn5] (softmax2) {};
-                \ExtractX{$(emb5)$}
-                \ExtractY{$(word4.base)$}
-                \node[wordnode,anchor=base] (word5) at (\XCoord,\YCoord) {I};
-                \ExtractX{$(emb5)$}
-                \ExtractY{$(out1.base)$}
-                \node[wordnode,anchor=base] (out2) at (\XCoord,\YCoord) {am};
+                \node[wordnode,below=0pt of emb5] (word5) {SOS};
                \draw[-latex'] (emb5.north) to (rnn5.south);
                \draw[-latex'] (rnn4.east) to (rnn5.west);
-                \draw[-latex'] (rnn5.north) to (softmax2.south);
+                \node[rnnnode,fill=red!30!white,above=\base of rnn5] (softmax1) {};
+                \node[wordnode,above=0pt of softmax1] (out1) {I};
+                \draw[-latex'] (rnn5.north) to (softmax1.south);
            }
            \visible<7->{
                \node[rnnnode,fill=blue!30!white,right=\base of rnn5] (rnn6) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn6] (emb6) {};
-                \node[rnnnode,fill=red!30!white,above=\base of rnn6] (softmax3) {};
+                \node[rnnnode,fill=red!30!white,above=\base of rnn6] (softmax2) {};
                \ExtractX{$(emb6)$}
                \ExtractY{$(word4.base)$}
-                \node[wordnode,anchor=base] (word6) at (\XCoord,\YCoord) {am};
+                \node[wordnode,anchor=base] (word6) at (\XCoord,\YCoord) {I};
                \ExtractX{$(emb6)$}
                \ExtractY{$(out1.base)$}
-                \node[wordnode,anchor=base] (out3) at (\XCoord,\YCoord) {fine};
+                \node[wordnode,anchor=base] (out2) at (\XCoord,\YCoord) {am};
                \draw[-latex'] (emb6.north) to (rnn6.south);
                \draw[-latex'] (rnn5.east) to (rnn6.west);
-                \draw[-latex'] (rnn6.north) to (softmax3.south);
-
+                \draw[-latex'] (rnn6.north) to (softmax2.south);
+            }
+            \visible<8->{
                \node[rnnnode,fill=blue!30!white,right=\base of rnn6] (rnn7) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn7] (emb7) {};
-                \node[rnnnode,fill=red!30!white,above=\base of rnn7] (softmax4) {};
+                \node[rnnnode,fill=red!30!white,above=\base of rnn7] (softmax3) {};
                \ExtractX{$(emb7)$}
                \ExtractY{$(word4.base)$}
-                \node[wordnode,anchor=base] (word7) at (\XCoord,\YCoord) {fine};
+                \node[wordnode,anchor=base] (word7) at (\XCoord,\YCoord) {am};
                \ExtractX{$(emb7)$}
                \ExtractY{$(out1.base)$}
-                \node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
+                \node[wordnode,anchor=base] (out3) at (\XCoord,\YCoord) {fine};
                \draw[-latex'] (emb7.north) to (rnn7.south);
                \draw[-latex'] (rnn6.east) to (rnn7.west);
-                \draw[-latex'] (rnn7.north) to (softmax4.south);
+                \draw[-latex'] (rnn7.north) to (softmax3.south);
+
+                \node[rnnnode,fill=blue!30!white,right=\base of rnn7] (rnn8) {};
+                \node[rnnnode,fill=green!30!white,below=\base of rnn8] (emb8) {};
+                \node[rnnnode,fill=red!30!white,above=\base of rnn8] (softmax4) {};
+                \ExtractX{$(emb8)$}
+                \ExtractY{$(word4.base)$}
+                \node[wordnode,anchor=base] (word8) at (\XCoord,\YCoord) {fine};
+                \ExtractX{$(emb8)$}
+                \ExtractY{$(out1.base)$}
+                \node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {EOS};
+                \draw[-latex'] (emb8.north) to (rnn8.south);
+                \draw[-latex'] (rnn7.east) to (rnn8.west);
+                \draw[-latex'] (rnn8.north) to (softmax4.south);
+            }
+            \visible<9->{
+                \ExtractX{$(word8.east)$}
+                \ExtractY{$(word5.south)$}
+                \draw[decoration={mirror,brace},decorate] ([yshift=-0.2em]word5.south west) to node [auto,anchor=north,align=center] {解码器} (\XCoord,\YCoord-0.2em);
            }
        \end{scope}
    \end{tikzpicture}
@@ -639,22 +653,23 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \textbf{入门：循环网络翻译模型及注意力机制} \\
 \small{1. 起源} \\
 \small{2. 模型结构} \\
-\small{3. 注意力机制}
+\small{3. 注意力机制} \\
+\small{4. 训练和推断}
 }
 \end{tcolorbox}

-\vspace{0.5em}
+\vspace{0.2em}

 \begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
 {\large
 \textbf{热门：Transformer} \\
-\small{1. 多头自注意力模型} \\
-\small{2. 训练及推断} \\
-\small{3. 深层网络翻译模型}
+\small{1. 自注意力模型} \\
+\small{2. 多头注意力和层正则化} \\
+\small{3. 更深、更宽的模型}
 }
 \end{tcolorbox}

-\vspace{0.5em}
+\vspace{0.2em}

 \begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
 {\large
@@ -957,8 +972,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \item<2-> \textbf{核心}：如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中，有三个步骤
    \begin{enumerate}
    \item 输入的单词用分布式表示，如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$，同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$
-    \item 源语言句子被一个RNN编码为一个表示$C$，如前面的例子中是一个实数向量
-    \item 目标端解码用另一个RNN，因此生成$y_j$时只考虑前一个状态$s_{j-1}$（这里，$s_{j-1}$表示RNN第$j-1$步骤的隐层状态）
+    \item 源语言句子被一个RNN编码为一个表示$\textbf{C}$，如前面的例子中是一个实数向量
+    \item 目标端解码用另一个RNN，因此生成$y_j$时只考虑前一个状态$\textbf{s}_{j-1}$（这里，$\textbf{s}_{j-1}$表示RNN第$j-1$步骤的隐层状态）
    \end{enumerate}

 \end{itemize}
@@ -985,9 +1000,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 					\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
 				\foreach \x in {1,2,...,3}
 					\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
-			        \node[] (enclabel1) at (enc1) {\tiny{$h_{m-2}$}};
-			        \node[] (enclabel2) at (enc2) {\tiny{$h_{m-1}$}};
-			        \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$h_{m}$}};
+			        \node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
+			        \node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
+			        \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
 				\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
 				\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};

@@ -999,7 +1014,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 				\foreach \x in {1,2,...,3}
 					\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
 				\foreach \x in {1,2,...,3}
-					\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$s_\x$}}};
+					\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
 				\foreach \x in {1,2,...,3}
 					\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
 				\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
@@ -1043,7 +1058,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 				\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
 				
 				\visible<2->{
-				\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$s_i$}};
+				\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_i$}};
 				\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
 				\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常，用Softmax函数}};
 				\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
@@ -1060,7 +1075,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 				\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east)  {\scriptsize{源语编码器最后一个}};
 				\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
 				\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
-				\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$C$}};
+				\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\textbf{C}$}};
 				}
 				
 				\begin{pgfonlayer}{background}
@@ -1096,7 +1111,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \item 可以重新定义\\
 \vspace{-0.8em}
 \begin{displaymath}
-\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|s_{j-1}, C)
+\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|\textbf{s}_{j-1}, \textbf{C})
 \end{displaymath}

 对于上图中的模型，进一步化简为：\\
@@ -1105,8 +1120,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \begin{displaymath}
 \textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
    \begin{matrix}
-        \textrm{P}(y_j|C)\ \ \ \  & j = 1 \\
-        \textrm{P}(y_j|s_{j-1}) & j > 1
+        \textrm{P}(y_j|\textbf{C})\ \ \ \  & j = 1 \\
+        \textrm{P}(y_j|\textbf{s}_{j-1}) & j > 1
    \end{matrix} \right.
 \end{displaymath}

@@ -1203,7 +1218,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
                \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.5\base]dec\x.north) {};

            % Decoder input words
-            \node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
+            \node[wordnode,below=0pt of demb1] (decwordin) {EOS};
            \ExtractX{$(demb2.south)$}
            \ExtractY{$(decwordin.base)$}
            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
@@ -1260,7 +1275,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Station};
            \ExtractX{$(softmax10.north)$}
            \ExtractY{$(decwordout.base)$}
-            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
+            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {EOS};

            % Connections
            \draw[-latex'] (init.east) to (enc1.west);
@@ -1341,7 +1356,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
            \node[wordnode,below=0pt of eemb7] () {怎么};
            \node[wordnode,below=0pt of eemb8] () {走};
            \node[wordnode,below=0pt of eemb9] () {吗};
-            \node[wordnode,below=0pt of eemb10] () {$\langle$eos$\rangle$};
+            \node[wordnode,below=0pt of eemb10] () {EOS};

            % RNN Decoder
            \foreach \x in {1,2,...,10}
@@ -1411,7 +1426,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Station};
            \ExtractX{$(softmax10.north)$}
            \ExtractY{$(decwordout.base)$}
-            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
+            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {EOS};

            % Connections
            \draw[-latex'] (init1.east) to (enc11.west);
@@ -1454,12 +1469,6 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 一些变种
-\begin{frame}{改进 - fine-tuning}
-%%% 图
-\end{frame}
-
-%%%------------------------------------------------------------------------------------------------------------
 \subsection{注意力机制}

 %%%------------------------------------------------------------------------------------------------------------
@@ -1542,6 +1551,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
    \begin{itemize}
    \item 关注的顺序：大狗的帽子 $\to$ 大狗 $\to$ 小狗的帽子 $\to$ 小狗
    \end{itemize}
+\item 人往往不是``均匀地''看图像中的所有区域，翻译是一个道理，生成一个目标语单词时参考的源语单词不会太多
 \end{itemize}

 \begin{center}
@@ -1549,7 +1559,6 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \end{center}

 \begin{itemize}
-\item<2-> 人往往不是``均匀地''看图像中的所有区域，翻译是一个道理，生成一个目标语单词时参考的源语单词不会太多
 \item<2-> \alert{注意力机制}在机器翻译中已经成功应用，经典的论文\\
 \textbf{Neural Machine Translation by Jointly Learning to Align and Translate}\\
 \textbf{Bahdanau et al., 2015, In Proc of ICLR}
@@ -1563,7 +1572,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \begin{itemize}
 \item 在注意力机制中，每个目标语单词的生成会使用一个动态的源语表示，而非一个统一的固定表示
    \begin{itemize}
-    \item 这里$C_i$表示第$i$个目标语单词所使用的源语表示
+    \item 这里$\textbf{C}_i$表示第$i$个目标语单词所使用的源语表示
    \end{itemize}
 \end{itemize}

@@ -1645,9 +1654,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\

 \draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);

-\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$C_1$}} -- ([yshift=3em]t1.north) ;
-\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$C_2$}} -- ([yshift=3em]t2.north) ;
-\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$C_i$}} -- ([yshift=3.5em]t4.north) ;
+\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$\textbf{C}_1$}} -- ([yshift=3em]t1.north) ;
+\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$\textbf{C}_2$}} -- ([yshift=3em]t2.north) ;
+\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$\textbf{C}_i$}} -- ([yshift=3.5em]t4.north) ;
 \node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...};
 \node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...};

@@ -1662,15 +1671,15 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\

 %%%------------------------------------------------------------------------------------------------------------
 %%% C_i的定义
-\begin{frame}{上下文向量$C_i$}
+\begin{frame}{上下文向量$\textbf{C}_i$}
 \begin{itemize}
-\item 对于目标语位置$i$，$C_i$是目标语$i$使用的上下文向量
+\item 对于目标语位置$i$，$\textbf{C}_i$是目标语$i$使用的上下文向量
 	\begin{itemize}
-	\item $h_j$表示编码器第$j$个位置的隐层状态
-	\item $s_i$表示解码器第$i$个位置的隐层状态
+	\item $\textbf{h}_j$表示编码器第$j$个位置的隐层状态
+	\item $\textbf{s}_i$表示解码器第$i$个位置的隐层状态
 	\item<2-> $\alpha_{i,j}$表示注意力权重，表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小
-	\item<2-> $a(\cdot)$表示注意力函数，计算$s_{i-1}$和$h_j$之间的相关性
-	\item<3-> $C_i$是所有源语编码表示$\{h_j\}$的加权求和，权重为$\{\alpha_{i,j}\}$
+	\item<2-> $a(\cdot)$表示注意力函数，计算$\textbf{s}_{i-1}$和$\textbf{h}_j$之间的相关性
+	\item<3-> $\textbf{C}_i$是所有源语编码表示$\{\textbf{h}_j\}$的加权求和，权重为$\{\alpha_{i,j}\}$
 	\end{itemize}
 \end{itemize}

@@ -1679,17 +1688,17 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\

 \begin{scope}

-\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$h_1$}};
-\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$h_2$}};
+\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\textbf{h}_1$}};
+\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\textbf{h}_2$}};
 \node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
-\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$h_n$}};
+\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_n$}};

 \node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
 \draw [thick,-,ublue] (sum.north) -- (sum.south);
 \draw [thick,-,ublue] (sum.west) -- (sum.east);

-\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$s_{i-1}$}};
-\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$s_{i}$}};
+\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{i-1}$}};
+\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{i}$}};

 \draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) ..  (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}};
 \draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) ..  (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}};
@@ -1698,7 +1707,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
 \draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
 \draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
-\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) ..  ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$C_{i}$}};
+\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) ..  ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{i}$}};

 \node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}};
 \draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
@@ -1710,11 +1719,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$4$)}};

 \visible<2->{
-\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$C_i = \sum_{j} \alpha_{i,j} h_j \ \ $};
+\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_i = \sum_{j} \alpha_{i,j} \textbf{h}_j \ \ $};
 }
 \visible<3->{
 \node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
-\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(s_{i-1}, h_j)$};
+\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{i-1}, \textbf{h}_j)$};
 }

 \begin{pgfonlayer}{background}
@@ -1820,7 +1829,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\

 \visible<3->{
 % alignment bars 2
-\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn21) at ([xshift=2.3\hnode,yshift=-0.0\hnode]alignment2.east) {};
+\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn21) at ([xshift=2.3\hnode,yshift=0.5\hnode]alignment2.east) {};
 \node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn22) at ([xshift=1pt]attn21.south east) {};
 \node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn23) at ([xshift=1pt]attn22.south east) {};
 \node[probnode,anchor=south west,minimum height=0.1\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.1$}] (attn24) at ([xshift=1pt]attn23.south east) {};
@@ -1840,12 +1849,14 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\

 \visible<3->{
 % coverage score formula node
-\node[anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-2.5\hnode]attn11.south) {\small{不同$C_i$所对应的源语言词的权重是不同的}};
+\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
+\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
+\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + ..$}};
 }

 \visible<3->{
 % matrix -> attn2
-\draw[->,red] ([xshift=0.1em,yshift=2.3em]alignment2.east).. controls +(east:1.9cm) and +(west:1.0cm) ..([xshift=-0.15\hnode,yshift=-0.0\hnode]attn21.north west);
+\draw[->,red] ([xshift=0.1em,yshift=2.3em]alignment2.east).. controls +(east:1.9cm) and +(west:1.0cm) ..([xshift=-0.15\hnode,yshift=-1em]attn21.north west);
 }

 \visible<2->{
@@ -1854,7 +1865,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\

 \visible<3->{
 % attn2 -> cov2
-\draw[->] ([xshift=0.2\hnode,yshift=0.0\hnode]attn26.east)--([xshift=0.7\hnode,yshift=0.0\hnode]attn26.east) node[pos=0.5,above] (sum2) {\small{$\sum$}}; % 0.3 - 0.5 height of the
+\draw[->] ([xshift=0.2\hnode,yshift=0.0\hnode]attn26.east)--([xshift=0.7\hnode,yshift=0]attn26.east) node[pos=0.5,above] (sum2) {\small{$\sum$}}; % 0.3 - 0.5 height of the
 }

 \visible<2->{
@@ -1864,11 +1875,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\

 % coverage score for each source word
 \visible<2->{
-\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$C_1 = \sum_{i=1}^{8} \alpha_{i1} h_{i}$};
+\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$\textbf{C}_1 = \sum_{i=1}^{8} \alpha_{i1} \textbf{h}_{i}$};
 }

 \visible<3->{
-\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$C_2 = \sum_{i=1}^{8} \alpha_{i2} h_{i}$};
+\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$\textbf{C}_2 = \sum_{i=1}^{8} \alpha_{i2} \textbf{h}_{i}$};
 }

 \end{tikzpicture}
@@ -1883,8 +1894,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 {\small
 \begin{tabular}{l | l}
 引入注意力机制以前 & 引入注意力机制以后 \\ \hline
-$\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C_1})$ \\
-$\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C_2})$
+$\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{\textbf{C}})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{\textbf{C}_1})$ \\
+$\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}_2})$
 \end{tabular}
 }
 \end{center}
@@ -1899,19 +1910,19 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
 \item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化：\\
 \vspace{-0.3em}
 \begin{displaymath}
-\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
+\alpha_{i,j} = \frac{\exp(a(\textbf{s}_{i-1}, \textbf{h}_j))}{\sum_{j'} \exp(a(\textbf{s}_{i-1}, \textbf{h}_{j'}))}
 \end{displaymath}

-\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$和$h$之间的\alert{相似性}，这也可以被看作是目标语表示和源语言表示的一种``统一化''，即把源语言和目标语表示在同一个语义空间，进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式：}
+\item<2-> 注意力函数$a(\textbf{s},\textbf{h})$的目的是捕捉$\textbf{s}$和$\textbf{h}$之间的\alert{相似性}，这也可以被看作是目标语表示和源语言表示的一种``统一化''，即把源语言和目标语表示在同一个语义空间，进而语义相近的内容有更大的相似性。\visible<3->{定义$a(\textbf{s},\textbf{h})$的方式：}
    \vspace{-1em}

    \visible<3->{
    \begin{displaymath}
-    a(s,h) =  \left\{ \begin{array}{ll}
-    s h^T & \textrm{向量乘} \\
-    \textrm{cos}(s, h) & \textrm{向量夹角} \\
-    s \textbf{W} h^T & \textrm{线性模型} \\
-    \textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}[s,h]+\textrm{单层网络}
+    a(\textbf{s},\textbf{h}) =  \left\{ \begin{array}{ll}
+    \textbf{s} \textbf{h}^{\textrm{T}} & \textrm{向量乘} \\
+    \textrm{cos}(\textbf{s}, \textbf{h}) & \textrm{向量夹角} \\
+    \textbf{s} \textbf{W} \textbf{h}^{\textrm{T}} & \textrm{线性模型} \\
+    \textrm{TanH}(\textbf{W}[\textbf{s},\textbf{h}])\textbf{v}^{\textrm{T}} & \textrm{拼接}[\textbf{s},\textbf{h}]+\textrm{单层网络}
    \end{array}
    \right.
    \end{displaymath}
@@ -1922,6 +1933,524 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
+%%% 注意力模型的效果 - 热图
+\begin{frame}{真实的实例}
+\begin{itemize}
+\item 注意力的权重符合双语对应的规律
+\end{itemize}
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 如何进一步理解注意力机制
+\begin{frame}{重新解释注意力机制}
+\begin{itemize}
+\item 换一个问题，假设有若干key-value单元，其中key是这个单元的索引表示，value是这个单元的值。对于任意一个query，可以找到匹配的key，并输出其对应的value
+\end{itemize}
+
+\vspace{-0.8em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+
+\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
+
+\node [rnode,anchor=south west,fill=blue!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
+\node [rnode,anchor=south west,fill=blue!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
+\node [rnode,anchor=south west,fill=blue!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
+
+\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
+\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
+\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
+\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
+\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
+\node [fill=white,inner sep=1pt] (key1label) at (key2) {\scriptsize{key$_2$}};
+\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
+\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
+
+\node [rnode,anchor=east,pattern=horizontal lines] (query) at ([xshift=-3em]key1.west) {};
+\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
+
+\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north) node [pos=0.5,below,yshift=0.2em] {\scriptsize{匹配}};
+
+\node [anchor=north] (result) at (value3.south) {\scriptsize{\alert{返回结果}}};
+
+\end{scope}
+\end{tikzpicture}
+\end{center}
+
+\vspace{-0.7em}
+
+\begin{itemize}
+\item<2-> 注意力机制也可以被看做对key-value单元的查询，但是所有key和query之间都有一种匹配程度，返回结果是对所有value的加权
+\end{itemize}
+
+\visible<2->{
+\vspace{-0.5em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+
+\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
+
+\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
+
+\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
+\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
+\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
+\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
+\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
+\node [fill=white,inner sep=1pt] (key1label) at (key2) {\scriptsize{key$_2$}};
+\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
+\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
+
+\node [rnode,anchor=east,pattern=vertical lines] (query) at ([xshift=-3em]key1.west) {};
+\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
+
+\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
+\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
+\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
+\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
+\node [anchor=south east] (alpha1) at (key1.north east) {\scriptsize{$\alpha_1$}};
+\node [anchor=south east] (alpha2) at (key2.north east) {\scriptsize{$\alpha_2$}};
+\node [anchor=south east] (alpha3) at (key3.north east) {\scriptsize{$\alpha_3$}};
+\node [anchor=south east] (alpha4) at (key4.north east) {\scriptsize{$\alpha_4$}};
+
+\node [anchor=north] (result) at ([xshift=-1.5em]value2.south east) {\scriptsize{\alert{返回结果}=$\alpha_1 \cdot \textrm{value}_1 + \alpha_2 \cdot \textrm{value}_2 + \alpha_3 \cdot \textrm{value}_3 + \alpha_4 \cdot \textrm{value}_4$}};
+
+\end{scope}
+\end{tikzpicture}
+\end{center}
+}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 如何进一步理解注意力机制 - 回到机器翻译任务
+\begin{frame}{重新解释注意力机制(续)}
+\begin{itemize}
+\item 回到机器翻译，如果把目标语状态$\textbf{s}_{i-1}$看做query，而把源语言所有位置的最上层RNN表示$\textbf{h}_{j}$看做{\color{ugreen} \textbf{key}}和{\color{red} \textbf{value}}
+\end{itemize}
+
+\vspace{-1.5em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+
+\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
+
+\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};
+
+\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
+\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
+\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
+\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};
+
+\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\textbf{s}(\textrm{``you''})$}};
+\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
+
+\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
+\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
+\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
+\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
+\node [anchor=south east] (alpha1) at ([xshift=1em]key1.north east) {\scriptsize{$\alpha_1=.4$}};
+\node [anchor=south east] (alpha2) at ([xshift=1em]key2.north east) {\scriptsize{$\alpha_2=.4$}};
+\node [anchor=south east] (alpha3) at ([xshift=1em]key3.north east) {\scriptsize{$\alpha_3=0$}};
+\node [anchor=south east] (alpha4) at ([xshift=1em]key4.north east) {\scriptsize{$\alpha_4=.1$}};
+
+\end{scope}
+\end{tikzpicture}
+\end{center}
+
+\vspace{-2.5em}
+\begin{eqnarray}
+\textbf{C}_3 & = & 0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) + \nonumber \\
+             &   & 0 \times \textbf{h}(\textrm{``也''}) + 0.1 \times \textbf{h}(\textrm{``没''}) \nonumber
+\end{eqnarray}
+
+\vspace{-0.5em}
+\begin{itemize}
+\item<2-> 注意力机制也可以被看做是一个重新生成value的过程：对于一组value值，注意力模型对他们加权求和，并得到一个新的value。而这个新的value实际上就是query所对应查询结果，在机器翻译中被看做是目标语所对应的源语言上下文表示。
+\end{itemize}
+
+\end{frame}
+
+\subsection{训练及推断}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 训练
+\begin{frame}{训练}
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 解码
+\begin{frame}{推断}
+\begin{itemize}
+\item 使用NMT时，对于源语言句子$\textbf{x}$，需要得到最优译文$\hat{\textbf{y}}$
+
+\vspace{-1.5em}
+\begin{displaymath}
+\hat{\textbf{y}} = \argmax_{\textbf{y}} \log\textrm{P}(\textbf{y}|\textbf{x}) = \argmax_{\textbf{y}} \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
+\end{displaymath}
+
+\item<2-> 由于$y_j$的生成需要依赖$y_{j-1}$，因此无法同时生成$\{y_1,...,y_n\}$。常用的方法是自左向右逐个单词生成
+
+\end{itemize}
+
+\vspace{-0.8em}
+\visible<3->{
+\vspace{-0.5em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
+
+\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\textbf{h}_1$}};
+\node [anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{...}};
+\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\textbf{h}_m$}};
+\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
+\node [anchor=west] (e2) at ([xshift=1em]e1.east) {\tiny{...}};
+\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
+\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{你}};
+\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.8em]e2.south) {\tiny{...}};
+\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
+
+\draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
+\draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
+\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
+\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
+\draw [->] ([xshift=0.1em]h1.east) -- ([xshift=-0.1em]h2.west);
+\draw [->] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]h3.west);
+\draw [->] ([xshift=-0.8em]h1.west) -- ([xshift=-0.1em]h1.west) node [pos=0,left,inner sep=2pt] {\tiny{0}};
+\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};
+
+\visible<5->{
+\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]h3.east) {\tiny{$e_y()$}};
+}
+\visible<8->{
+\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.5em]t1.east) {\tiny{$e_y()$}};
+}
+\visible<9->{
+\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.5em]t2.east) {\tiny{$e_y()$}};
+\node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.5em]t3.east) {\tiny{$e_y()$}};
+\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
+}
+\visible<5->{
+\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
+}
+\visible<8->{
+\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$}};
+}
+\visible<9->{
+\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$}};
+\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\textbf{s}_4$}};
+\node [anchor=west,inner sep=2pt] (s5) at ([xshift=0.3em]s4.east) {\tiny{...}};
+}
+\visible<5->{
+\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
+\node [anchor=east] (decoder) at ([xshift=-0.5em]o1.north west) {\scriptsize{\textbf{解码器}}};
+}
+\visible<8->{
+\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax}};
+}
+\visible<9->{
+\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax}};
+\node [rnnnode,anchor=south,fill=blue!20] (o4) at ([yshift=1em]s4.north) {\tiny{softmax}};
+\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
+}
+\visible<4->{
+\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
+}
+\visible<7->{
+\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{Have}};
+}
+\visible<9->{
+\node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
+\node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.6em]t4.south) {\tiny{learned}};
+}
+\visible<5->{
+\node [anchor=center,inner sep=2pt] (wo1) at ([yshift=1.2em]o1.north) {\tiny{Have}};
+}
+\visible<4->{
+\node [anchor=south,inner sep=2pt] (wos1) at (wo1.north) {\tiny{\textbf{[step 1]}}};
+}
+\visible<8->{
+\node [anchor=center,inner sep=2pt] (wo2) at ([yshift=1.2em]o2.north) {\tiny{you}};
+}
+\visible<7->{
+\node [anchor=south,inner sep=2pt] (wos2) at (wo2.north) {\tiny{\textbf{[step 2]}}};
+}
+\visible<9->{
+\node [anchor=center,inner sep=2pt] (wo3) at ([yshift=1.2em]o3.north) {\tiny{learned}};
+\node [anchor=south,inner sep=2pt] (wos3) at (wo3.north) {\tiny{\textbf{[step 3]}}};
+\node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{nothing}};
+\node [anchor=south,inner sep=2pt] (wos4) at (wo4.north) {\tiny{\textbf{[step 4]}}};
+}
+
+\visible<5->{
+\foreach \x in {1}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
+}
+}
+
+\visible<8->{
+\foreach \x in {2}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
+}
+}
+
+\visible<9->{
+\foreach \x in {3,4}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
+}
+}
+
+\visible<5->{
+\draw [->] ([xshift=-0.8em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left] {\tiny{0}};
+}
+\visible<8->{
+\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
+}
+\visible<9->{
+\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
+\draw [->] ([xshift=0.1em]s3.east) -- ([xshift=-0.1em]s4.west);
+}
+
+\visible<7->{
+\draw [->,thick,dotted] (wo1.east) .. controls +(east:1.0) and +(west:1.0) ..(wt2.west);
+}
+\visible<9->{
+\draw [->,thick,dotted] (wo2.east) .. controls +(east:1.3) and +(west:1.1) ..(wt3.west);
+\draw [->,thick,dotted] (wo3.east) .. controls +(east:0.9) and +(west:0.9) ..(wt4.west);
+}
+
+\visible<6->{
+\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c2) at ([yshift=2em]h2.north) {\tiny{$\textbf{C}_2$}};
+\node [anchor=south] (c2label) at (c2.north) {\tiny{\textbf{注意力机制：上下文}}};
+\node [anchor=south] (c2more) at ([yshift=-1.5em]c2.south) {\tiny{...}};
+\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c2.250);
+\draw [->] (h3.north) .. controls +(north:0.6) and +(290:0.9) .. (c2.290);
+\draw [->] ([yshift=-0.3em]s1.west) .. controls +(west:2) and +(-50:0.3) .. (c2.-40);
+}
+\visible<8->{
+\draw [->] (c2.0) -- ([xshift=1.358in]c2.0) -- ([yshift=0.3em,xshift=-1.2em]s2.west) -- ([yshift=0.3em,xshift=-0.1em]s2.west);
+}
+
+\visible<9->{
+\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\textbf{C}_3$}};
+\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
+\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
+}
+
+\visible<9->{
+\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\textbf{C}_4$}};
+\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
+\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s4.west);
+}
+
+\end{scope}
+\end{tikzpicture}
+\end{center}
+}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 解码 - beam search
+\begin{frame}{推断 - Beam Search}
+\begin{itemize}
+\item \textbf{Greedy Search}: 目标语每一个位置，输出层的Softmax可以得到所有单词的概率，然后选择一个概率最大单词输出，下一个位置的预测就基于这一步输出的单词
+\item \textbf{Beam Search}: 为了避免贪婪方法造成的错误累加，可以每次对$b$个单词进行扩展，而不是只使用一个单词，其中$b$称做束的宽度 - 这样可以搜索更多可能的译文
+\end{itemize}
+
+\vspace{-0.3em}
+\visible<2->{
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
+\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];
+
+
+\visible<3->{
+\node [rnnnode,anchor=west,fill=green!20] (t1) at (0,0) {\tiny{$e_y()$}};
+}
+\visible<7->{
+\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=2.2em]t1.east) {\tiny{$e_y()$ ($\times 3$)}};
+}
+\visible<8->{
+\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=2.2em]t2.east) {\tiny{$e_y()$ ($\times 3$)}};
+\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\tiny{...}};
+}
+\visible<3->{
+\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
+}
+\visible<7->{
+\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$ ($\times 3$)}};
+}
+\visible<8->{
+\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$ ($\times 3$)}};
+\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\tiny{...}};
+}
+\visible<3->{
+\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
+}
+\visible<7->{
+\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax ($\times 3$)}};
+}
+\visible<8->{
+\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax ($\times 3$)}};
+\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
+}
+
+\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
+
+\visible<6->{
+\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
+\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\tiny{Have}};
+\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\tiny{Have}};
+}
+
+\visible<8->{
+\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
+\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\tiny{you}};
+\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\tiny{you}};
+}
+
+\visible<5->{
+\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\tiny{Have}};
+\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\tiny{Have}};
+\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\tiny{Have}};
+}
+
+\visible<8->{
+\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\tiny{you}};
+\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\tiny{you}};
+\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\tiny{you}};
+}
+
+\visible<8->{
+\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\tiny{learned}};
+\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\tiny{learned}};
+\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\tiny{learned}};
+}
+
+\visible<3->{
+\foreach \x in {1}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+   
+}
+}
+
+\visible<5->{
+ \draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top-3}};
+ }
+
+\visible<7->{
+\foreach \x in {2}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
+}
+}
+
+\visible<8->{
+\foreach \x in {3}{
+    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
+    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
+    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
+    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
+}
+}
+
+\visible<3->{
+\draw [->] ([xshift=-0.5em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left,inner sep=1pt] {\tiny{0}};
+}
+\visible<7->{
+\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
+}
+\visible<8->{
+\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
+}
+
+\visible<6->{
+\draw [->,very thick,dotted] (wo1.east) .. controls +(east:0.6) and +(west:0.8) ..(wt2copy2.west);
+}
+\visible<8->{
+\draw [->,very thick,dotted] (wo2.east) .. controls +(east:0.6) and +(west:0.8) ..(wt3copy2.west);
+}
+
+\visible<7->{
+\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\tiny{$\textbf{C}_2$}};
+\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\tiny{$\textbf{C}_2$}};
+\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\tiny{$\textbf{C}_2$}};
+\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
+\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
+}
+
+\visible<8->{
+\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\tiny{$\textbf{C}_3$}};
+\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\tiny{$\textbf{C}_3$}};
+\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\tiny{$\textbf{C}_3$}};
+\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
+\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
+}
+
+\visible<3->{
+\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
+\node [anchor=south] (vocablabel) at (vocab.north) {\tiny{单词的概率分布}};
+\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east); 
+}
+
+\visible<4->{
+\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\tiny{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
+\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) ..  ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\tiny{top-3}};
+
+\visible<4->{
+\node [anchor=north] (cap) at (vocab.south east) {\scriptsize{\textbf{束搜索($b=3$)}}};
+}
+}
+
+
+\end{scope}
+\end{tikzpicture}
+\end{center}
+}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 实验结果
+\begin{frame}{效果}
+%% 使用注意力机制带来的提升
+%% 各大评测比赛没有不使用注意力机制的系统，已经成为标配
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% GNMT
+\begin{frame}{成功案例 - GNMT}
+%% GNMT的图和几句话说它多牛
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
 \section{Transformer}

 %%%------------------------------------------------------------------------------------------------------------