new pages of attention models

a147039a · xiaotong · 9974fde7 · a147039a · a147039a
Commit a147039a authored Nov 14, 2019 by xiaotong
--- a/Section06-Neural-Machine-Translation/section06-test.tex
+++ b/Section06-Neural-Machine-Translation/section06-test.tex
@@ -85,6 +85,11 @@
 \newlength{\mystep}
 \newlength{\base}

+\newlength{\wseg}
+\newlength{\hseg}
+\newlength{\wnode}
+\newlength{\hnode}
+
 \usefonttheme[onlylarge]{structurebold}

 \IfFileExists{C:/WINDOWS/win.ini}
@@ -139,176 +144,30 @@
 \subsection{注意力机制}

 %%%------------------------------------------------------------------------------------------------------------
-%%% NMT的数学描述
-\begin{frame}{数学建模}
+%%% How to define the attention function
+\begin{frame}{计算注意力权重 - 注意力函数}
 \begin{itemize}
-\item 对于源语言序列$\textbf{x} = \{x_1,x_2,...,x_m\}$，生成目标语序列$\textbf{y} = \{y_1,y_2,...,y_n\}$的概率可以被描述为
-
-\begin{displaymath}
-\log\textrm{P}(\textbf{y}|\textbf{x}) = \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
-\end{displaymath}
-
-根据源于句子$\textbf{x}$和已生成的译文$\textbf{y}_{<j} = \{y_1,y_2,...,y_{j-1}\}$生成第$j$个译文$y_j$
-
-\item<2-> \textbf{核心}：如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中，有三个步骤
-    \begin{enumerate}
-    \item 输入的单词用分布式表示，如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$，同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$
-    \item 源语言句子被一个RNN编码为一个表示$C$，如前面的例子中是一个实数向量
-    \item 目标端解码用另一个RNN，因此生成$y_j$时只考虑前一个状态$s_{j-1}$（这里，$s_{j-1}$表示RNN第$j-1$步骤的隐层状态）
-    \end{enumerate}
-    
-\end{itemize}
-\end{frame}
-
-%%%------------------------------------------------------------------------------------------------------------
-%%% 各部分的解释
-\begin{frame}{数学建模(续)}
-
-\vspace{-1.5em}
-	\begin{center}
-		% \hspace*{-1.5cm}
-		\begin{tikzpicture}
-			\setlength{\base}{0.9cm}
-	
-			\tikzstyle{rnnnode} = [rounded corners=1pt,minimum height=0.5\base,minimum width=1\base,draw,inner sep=0pt,outer sep=0pt]
-			\tikzstyle{wordnode} = [font=\tiny]
-	
-			% RNN translation model
-			\begin{scope}[local bounding box=RNNMT]
-				% RNN Encoder
-				\coordinate (eemb0) at (0,0);
-				\foreach \x [count=\y from 0] in {1,2,...,3}
-					\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
-				\foreach \x in {1,2,...,3}
-					\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
-			        \node[] (enclabel1) at (enc1) {\tiny{$h_{m-2}$}};
-			        \node[] (enclabel2) at (enc2) {\tiny{$h_{m-1}$}};
-			        \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$h_{m}$}};
-				\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
-				\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
-
-				\node[wordnode,below=0pt of eemb1] () {走};
-				\node[wordnode,below=0pt of eemb2] () {吗};
-				\node[wordnode,below=0pt of eemb3] () {$\langle$eos$\rangle$};
-	
-				% RNN Decoder
-				\foreach \x in {1,2,...,3}
-					\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
-				\foreach \x in {1,2,...,3}
-					\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$s_\x$}}};
-				\foreach \x in {1,2,...,3}
-					\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
-				\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
-				\node[wordnode,right=0.4\base of dec3] (end2) {$\cdots$};
-				\node[wordnode,right=0.4\base of softmax3] (end3) {$\cdots$};
-
-				% Decoder input words
-				\node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
-				\ExtractX{$(demb2.south)$}
-				\ExtractY{$(decwordin.base)$}
-				\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
-				\ExtractX{$(demb3.south)$}
-				\ExtractY{$(decwordin.base)$}
-				\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
-
-				% Decoder output words
-				\node[wordnode,above=0pt of softmax1] (decwordout) {Do};
-				\ExtractX{$(softmax2.north)$}
-				\ExtractY{$(decwordout.base)$}
-				\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
-				\ExtractX{$(softmax3.north)$}
-				\ExtractY{$(decwordout.base)$}
-				\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {know};
-	
-				% Connections
-				\draw[-latex'] (init1.east) to (enc1.west);
-				\draw[-latex'] (dec3.east) to (end2.west);
-				\foreach \x in {1,2,...,3}
-					\draw[-latex'] (eemb\x) to (enc\x);
-				\foreach \x in {1,2,...,3}
-					\draw[-latex'] (demb\x) to (dec\x);
-				\foreach \x in {1,2,...,3}
-					\draw[-latex'] (dec\x.north) to (softmax\x.south);
-				\foreach \x [count=\y from 2] in {1,2}
-				{
-					\draw[-latex'] (enc\x.east) to (enc\y.west);
-					\draw[-latex'] (dec\x.east) to (dec\y.west);
-				}
-	
-				\coordinate (bridge) at ([yshift=0.4\base]enc2.north west);
-				\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
-				
-				\visible<2->{
-				\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$s_i$}};
-				\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
-				\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常，用Softmax函数}};
-				\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
-				}
-				
-				\visible<3->{
-				\node [anchor=north west] (line11) at ([yshift=-1.8em]line4.west) {\scriptsize{每个词的one-hot}};
-				\node [anchor=north west] (line12) at ([yshift=0.3em]line11.south west) {\scriptsize{离散化表示都被转化为}};
-				\node [anchor=north west] (line13) at ([yshift=0.3em]line12.south west) {\scriptsize{实数向量，即词嵌入}};
-				\node [anchor=north west] (line14) at ([yshift=0.3em]line13.south west) {\scriptsize{($e_x()$和$e_y()$函数)}};
-				}
-				
-				\visible<4->{
-				\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east)  {\scriptsize{源语编码器最后一个}};
-				\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
-				\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
-				\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$C$}};
-				}
-				
-				\begin{pgfonlayer}{background}
-				\visible<2->{
-				\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line1) (line2) (line3) (line4)] (box1) {};
-				\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=red] [fit = (softmax1) (softmax2) (softmax3)] (box4) {};
-				\draw [->,dotted,very thick,red] ([yshift=1em,xshift=2.5em]box1.east) -- ([yshift=1em,xshift=0.1em]box1.east);
-				}
-				
-				\visible<3->{
-				\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line11) (line12) (line13) (line14)] (box2) {};
-				\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (eemb1) (eemb2) (eemb3)] (box5) {};
-				\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (demb1) (demb2) (demb3)] (box6) {};
-				\draw [->,dotted,very thick,ugreen] ([yshift=-1.3em,xshift=2.5em]box2.east) -- ([yshift=-1.3em,xshift=0.1em]box2.east);
-				\draw [->,dotted,very thick,ugreen] ([xshift=0.1em]box6.west) .. controls +(west:1) and +(east:1) .. ([yshift=1.0em]box2.east) ;
-				}
-				
-				\visible<4->{
-				\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!10,drop shadow,draw=purple] [fit = (line21) (line22) (line23) (line24)] (box3) {};
-				\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=purple] [fit = (enc3)] (box7) {};
-				\draw [->,dotted,very thick,purple] ([xshift=0.1em]box7.east) -- ([xshift=0.8em]box7.east) ;
-				}
-							
-				\end{pgfonlayer}
-	
-			\end{scope}
-		\end{tikzpicture}
-	\end{center}
-
-\visible<5->{
-\vspace{-1.5em}
-\begin{itemize}
-\item 可以重新定义\\
-\vspace{-0.8em}
-\begin{displaymath}
-\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|s_{j-1}, C)
-\end{displaymath}
-
-对于上图中的模型，进一步化简为：\\
+\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化：\\
 \vspace{-0.3em}
-
 \begin{displaymath}
-\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{ 
-    \begin{matrix}
-        \textrm{P}(y_j|C)\ \ \ \  & j = 1 \\
-        \textrm{P}(y_j|s_{j-1}) & j > 1
-    \end{matrix} \right.
+\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
 \end{displaymath}

+\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$和$h$之间的\alert{相似性}，这也可以被看作是目标语表示和源语言表示的一种``统一化''，即把源语言和目标语表示在同一个语义空间，进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式：}
+    
+    \visible<3->{% candidate definitions of a(s,h); \visible<3-> restricts them to overlay 3 onwards
+    \begin{displaymath}
+    a(s,h) =  \left\{ \begin{array}{ll}
+    s h^T & \textrm{向量乘} \\ % label reads "vector product"
+    \textrm{cos}(s, h) & \textrm{向量夹角} \\ % label reads "angle between the vectors"
+    s \textbf{W} h^T & \textrm{线性模型} \\ % label reads "linear model"
+    \textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接} % label reads "concatenation"
+    \end{array}
+    \right.
+    \end{displaymath}
+    $\textbf{W}$和$\textbf{v}$是可学习参数
+    }
 \end{itemize}
-}
-
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex
@@ -82,6 +82,10 @@
 \newcounter{mycount3}
 \newcounter{mycount4}
 \newlength{\mystep}
+\newlength{\wseg}
+\newlength{\hseg}
+\newlength{\wnode}
+\newlength{\hnode}


 \usefonttheme[onlylarge]{structurebold}
@@ -866,7 +870,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
    \item 源语言句子被一个RNN编码为一个表示$C$，如前面的例子中是一个实数向量
    \item 目标端解码用另一个RNN，因此生成$y_j$时只考虑前一个状态$s_{j-1}$（这里，$s_{j-1}$表示RNN第$j-1$步骤的隐层状态）
    \end{enumerate}
-    
+
 \end{itemize}
 \end{frame}

@@ -1009,7 +1013,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \vspace{-0.3em}

 \begin{displaymath}
-\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{ 
+\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
    \begin{matrix}
        \textrm{P}(y_j|C)\ \ \ \  & j = 1 \\
        \textrm{P}(y_j|s_{j-1}) & j > 1
@@ -1619,7 +1623,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$C_i = \sum_{j} \alpha_{i,j} h_j \ \ $};
 }
 \visible<3->{
-\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_j \exp(\beta_{i,j})}$};
+\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
 \node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(s_{i-1}, h_j)$};
 }

@@ -1647,6 +1651,187 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
+%%% Visualization of the attention weights
+\begin{frame}{注意力权重$\alpha_{ij}$}
+
+\begin{itemize}
+\item 注意力权重$\alpha_{ij}$的可视化
+\end{itemize}
+
+\vspace{-1.5em}
+
+\begin{center}
+\begin{tikzpicture}
+
+\setlength{\wseg}{1.5cm} % NOTE(review): \wseg, \hseg and \wnode are set here but not used in the visible part of this frame
+\setlength{\hseg}{1.0cm}
+\setlength{\wnode}{3.75cm}
+\setlength{\hnode}{1.0cm} % base unit for most node sizes and offsets in this picture
+
+\tikzstyle{elementnode} = [rectangle,text=white,anchor=center] % filled squares of the matrix
+\tikzstyle{srcnode} = [rotate=45,font=\small,anchor=south west] % rotated labels placed above the matrix
+\tikzstyle{tgtnode} = [left,font=\small,anchor=north east] % labels placed to the left of the matrix
+\tikzstyle{alignmentnode} = [rectangle,draw,minimum height=3.6\hnode,minimum width=0.36\hnode] % tall, narrow outline
+\tikzstyle{probnode} = [fill=blue!30,minimum width=0.4\hnode] % bars; every use in this frame overrides the fill colour
+\tikzstyle{labelnode} = [above] % NOTE(review): not used in the visible part of this frame
+
+% attention matrix: 6 columns (\i = 0..5) by 8 rows (\j = 0..7), one filled square per cell; \c scales the side length of the square
+\begin{scope}[scale=0.9,yshift=0.12in]
+\foreach \i / \j / \c in
+    {0/7/0.2, 1/7/0.45, 2/7/0.15, 3/7/0.15, 4/7/0.15, 5/7/0.15,
+    0/6/0.35, 1/6/0.45, 2/6/0.15, 3/6/0.15, 4/6/0.15, 5/6/0.15,
+    0/5/0.25, 1/5/0.15, 2/5/0.15, 3/5/0.35, 4/5/0.15, 5/5/0.15,
+    0/4/0.15, 1/4/0.25, 2/4/0.2, 3/4/0.30, 4/4/0.15, 5/4/0.15,
+    0/3/0.15, 1/3/0.15, 2/3/0.8, 3/3/0.25, 4/3/0.15, 5/3/0.25,
+    0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.25, 5/2/0.3,
+    0/1/0.15, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.8, 5/1/0.15,
+    0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.60}
+    \node[elementnode,minimum size=0.6*\hnode*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*\hnode*\i-5.4*0.5*\hnode,0.5*\hnode*\j-1.05*\hnode) {};
+
+% weights printed on the larger cells; each label node is named l<i><j> after the cell a<i><j> it sits on, so no name is defined twice
+\node[align=center] (l17) at (a17) {\scriptsize{{\color{white} .4}}};
+\node[align=center] (l06) at (a06) {\scriptsize{{\color{white} .3}}};
+\node[align=center] (l16) at (a16) {\scriptsize{{\color{white} .4}}};
+\node[align=center] (l35) at (a35) {\scriptsize{{\color{white} .3}}};
+\node[align=center] (l34) at (a34) {\tiny{{\color{white} .3}}};
+\node[align=center] (l23) at (a23) {\small{{\color{white} .8}}};
+\node[align=center] (l41) at (a41) {\small{{\color{white} .8}}};
+\node[align=center] (l50) at (a50) {\small{{\color{white} .7}}};
+
+% column headers (srcnode, rotated 45 degrees): the English words. NOTE(review): the comparison table of this frame treats these words as the predicted output, so the src/tgt prefixes of the node names look swapped -- confirm
+\node[srcnode] (src1) at (-5.4*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{Have}};
+\node[srcnode] (src2) at ([xshift=0.5\hnode]src1.south west) {\scriptsize{you}};
+\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
+\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
+\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
+\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
+
+% row labels (tgtnode): the Chinese words, each placed 0.5\hnode below the previous one
+\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{你}};
+\node[tgtnode] (tgt2) at ([yshift=-0.5\hnode]tgt1.north east) {\scriptsize{什么}};
+\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{都}};
+\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{没}};
+\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{学}};
+\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{到}};
+\node[tgtnode] (tgt7) at ([yshift=-0.5\hnode]tgt6.north east) {\scriptsize{?}};
+\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{EOS}};
+
+\end{scope}
+
+\visible<2->{
+% outline alignment1 (drawn in ugreen), anchored just outside the top-left corner of cell a07
+\node[alignmentnode, ugreen, anchor=north west] (alignment1) at ([xshift=-0.3em,yshift=0.4em]a07.north west) {};
+}
+
+\visible<3->{
+% outline alignment2 (drawn in red), anchored just outside the top-left corner of cell a17
+\node[alignmentnode, red, anchor=north west] (alignment2) at ([xshift=-0.1em,yshift=0.2em]a17.north west) {};
+}
+
+\visible<3->{
+% red bar chart (attn21..attn26): bars are chained left to right with a 1pt gap, each labelled below; the last label is an ellipsis
+\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn21) at ([xshift=2.3\hnode,yshift=-0.0\hnode]alignment2.east) {};
+\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn22) at ([xshift=1pt]attn21.south east) {};
+\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn23) at ([xshift=1pt]attn22.south east) {};
+\node[probnode,anchor=south west,minimum height=0.1\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.1$}] (attn24) at ([xshift=1pt]attn23.south east) {};
+\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn25) at ([xshift=1pt]attn24.south east) {};
+\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$...$}] (attn26) at ([xshift=1pt]attn25.south east) {};
+}
+
+\visible<2->{
+% ugreen bar chart (attn11..attn16), same chaining; its first bar is positioned relative to alignment2 and uses anchor=south instead of south west
+\node[probnode,anchor=south,minimum height=0.2\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.2$}] (attn11) at ([xshift=2.5\hnode,yshift=-1em]alignment2.north east) {};
+\node[probnode,anchor=south west,minimum height=0.3\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.3$}] (attn12) at ([xshift=1pt]attn11.south east) {};
+\node[probnode,anchor=south west,minimum height=0.2\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.2$}] (attn13) at ([xshift=1pt]attn12.south east) {};
+\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0$}] (attn14) at ([xshift=1pt]attn13.south east) {};
+\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0$}] (attn15) at ([xshift=1pt]attn14.south east) {};
+\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$...$}] (attn16) at ([xshift=1pt]attn15.south east) {};
+}
+
+\visible<3->{
+% text remark placed 2.5\hnode below attn11; it says that different C_i put different weights on the source-language words
+\node[anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-2.5\hnode]attn11.south) {\small{不同$C_i$所对应的源语言词的权重是不同的}};
+}
+
+\visible<3->{
+% curved red arrow from the right side of alignment2 to the first bar attn21
+\draw[->,red] ([xshift=0.1em,yshift=2.3em]alignment2.east).. controls +(east:1.9cm) and +(west:1.0cm) ..([xshift=-0.15\hnode,yshift=-0.0\hnode]attn21.north west);
+}
+
+\visible<2->{% straight ugreen arrow starting at alignment1 and ending 2.2\hnode to the right of alignment2
+\draw[->,ugreen] ([xshift=0.1em,yshift=-1.2em]alignment1.north east)--([xshift=2.2\hnode,yshift=-1.2em]alignment2.north east);
+}
+
+\visible<3->{
+% short arrow to the right of the last red bar attn26
+\draw[->] ([xshift=0.2\hnode,yshift=0.0\hnode]attn26.east)--([xshift=0.7\hnode,yshift=0.0\hnode]attn26.east) node[pos=0.5,above] (sum2) {\small{$\sum$}}; % sum2 sits above the midpoint of the arrow
+}
+
+\visible<2->{
+% short arrow to the right of the last ugreen bar attn16, with the \sum label sum1 above its midpoint
+\draw[->] ([xshift=0.2\hnode]attn16.east)--([xshift=0.7\hnode]attn16.east) node[pos=0.5,above] (sum1) {\small{$\sum$}};
+}
+
+% context vectors printed to the right of the bar charts; subscripts follow C_i = \sum_j \alpha_{ij} h_j, with j running over the 8 rows of the matrix
+\visible<2->{
+\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$C_1 = \sum_{j=1}^{8} \alpha_{1j} h_{j}$};
+}
+
+\visible<3->{
+\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$C_2 = \sum_{j=1}^{8} \alpha_{2j} h_{j}$};
+}
+
+\end{tikzpicture}
+\end{center}
+
+\visible<4->{% comparison table: left column is headed "before introducing attention" (one shared C), right column "after introducing attention" (C_1, C_2)
+\begin{itemize}
+\item 对比
+\end{itemize}
+
+\begin{center}% NOTE(review): \argmax is not defined in the visible source; presumably declared in the preamble -- confirm
+{\small
+\begin{tabular}{l | l}
+引入注意力机制以前 & 引入注意力机制以后 \\ \hline
+$\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C_1})$ \\
+$\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C_2})$
+\end{tabular}
+}
+\end{center}
+}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% How to define the attention function
+\begin{frame}{计算注意力权重 - 注意力函数}
+\begin{itemize}
+\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化：\\
+\vspace{-0.3em}
+\begin{displaymath}
+\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
+\end{displaymath}
+
+\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$和$h$之间的\alert{相似性}，这也可以被看作是目标语表示和源语言表示的一种``统一化''，即把源语言和目标语表示在同一个语义空间，进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式：}
+    \vspace{-1em}
+
+    \visible<3->{% candidate definitions of a(s,h); \visible<3-> restricts them to overlay 3 onwards
+    \begin{displaymath}
+    a(s,h) =  \left\{ \begin{array}{ll}
+    s h^T & \textrm{向量乘} \\ % label reads "vector product"
+    \textrm{cos}(s, h) & \textrm{向量夹角} \\ % label reads "angle between the vectors"
+    s \textbf{W} h^T & \textrm{线性模型} \\ % label reads "linear model"
+    \textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}[s,h]+\textrm{单层网络} % label reads "concatenate [s,h] + single-layer network"
+    \end{array}
+    \right.
+    \end{displaymath}
+    \vspace{-0.3em}
+    $\textbf{W}$和$\textbf{v}$是可学习参数
+    }
+\end{itemize}
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
 \section{Transformer}

 %%%------------------------------------------------------------------------------------------------------------