Commit fbcf3b9f by xiaotong

new pages

parent 7edb503e
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
\usetikzlibrary{calc,intersections} \usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix} \usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows,decorations.pathreplacing} \usetikzlibrary{arrows,decorations.pathreplacing}
\usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z \usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{shadows.blur} \usetikzlibrary{shadows.blur}
...@@ -144,34 +145,21 @@ ...@@ -144,34 +145,21 @@
\subsection{注意力机制} \subsection{注意力机制}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 如何定义注意力函数 %%% 解码
\begin{frame}{计算注意力权重 - 注意力函数} \begin{frame}{推断}
\begin{itemize} \begin{itemize}
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\ \item 使用NMT时,对于源语言句子$\textbf{x}$,需要得到最优译文$\hat{\textbf{y}}$
\vspace{-0.3em}
\vspace{-1.5em}
\begin{displaymath} \begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))} \hat{\textbf{y}} = \argmax_{\textbf{y}} \log\textrm{P}(\textbf{y}|\textbf{x}) = \argmax_{\textbf{y}} \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
\end{displaymath} \end{displaymath}
\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$$h$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式:} \item 由于生成$y_i$需要依赖$y_{i-1}$,因此无法同时生成${y_1,...,y_n}$。常用的方法是
\visible<3->{
\begin{displaymath}
a(s,h) = \left\{ \begin{array}{ll}
s h^T & \textrm{向量乘} \\
\textrm{cos}(s, h) & \textrm{向量夹角} \\
s \textbf{W} h^T & \textrm{线性模型} \\
\textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}
\end{array}
\right.
\end{displaymath}
$\textbf{W}$$\textbf{v}$是可学习参数
}
\end{itemize} \end{itemize}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
\section{Transformer} \section{Transformer}
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
\usetikzlibrary{calc,intersections} \usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix} \usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows,decorations.pathreplacing} \usetikzlibrary{arrows,decorations.pathreplacing}
\usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z \usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{shadows.blur} \usetikzlibrary{shadows.blur}
...@@ -958,8 +959,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -958,8 +959,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\item<2-> \textbf{核心}:如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中,有三个步骤 \item<2-> \textbf{核心}:如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中,有三个步骤
\begin{enumerate} \begin{enumerate}
\item 输入的单词用分布式表示,如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$,同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$ \item 输入的单词用分布式表示,如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$,同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$
\item 源语言句子被一个RNN编码为一个表示$C$,如前面的例子中是一个实数向量 \item 源语言句子被一个RNN编码为一个表示$\textbf{C}$,如前面的例子中是一个实数向量
\item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$s_{j-1}$(这里,$s_{j-1}$表示RNN第$j-1$步骤的隐层状态) \item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$\textbf{s}_{j-1}$(这里,$\textbf{s}_{j-1}$表示RNN第$j-1$步骤的隐层状态)
\end{enumerate} \end{enumerate}
\end{itemize} \end{itemize}
...@@ -986,9 +987,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -986,9 +987,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}}; \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
\foreach \x in {1,2,...,3} \foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {}; \node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$h_{m-2}$}}; \node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$h_{m-1}$}}; \node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$h_{m}$}}; \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$}; \node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$}; \node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
...@@ -1000,7 +1001,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1000,7 +1001,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\foreach \x in {1,2,...,3} \foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}}; \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
\foreach \x in {1,2,...,3} \foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$s_\x$}}}; \node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
\foreach \x in {1,2,...,3} \foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}}; \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$}; \node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
...@@ -1044,7 +1045,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1044,7 +1045,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west); \draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
\visible<2->{ \visible<2->{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$s_i$}}; \node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_i$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}}; \node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}}; \node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}}; \node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
...@@ -1061,7 +1062,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1061,7 +1062,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}}; \node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}}; \node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}}; \node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$C$}}; \node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\textbf{C}$}};
} }
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
...@@ -1097,7 +1098,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1097,7 +1098,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\item 可以重新定义\\ \item 可以重新定义\\
\vspace{-0.8em} \vspace{-0.8em}
\begin{displaymath} \begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|s_{j-1}, C) \textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|\textbf{s}_{j-1}, \textbf{C})
\end{displaymath} \end{displaymath}
对于上图中的模型,进一步化简为:\\ 对于上图中的模型,进一步化简为:\\
...@@ -1106,8 +1107,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1106,8 +1107,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{displaymath} \begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{ \textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
\begin{matrix} \begin{matrix}
\textrm{P}(y_j|C)\ \ \ \ & j = 1 \\ \textrm{P}(y_j|\textbf{C})\ \ \ \ & j = 1 \\
\textrm{P}(y_j|s_{j-1}) & j > 1 \textrm{P}(y_j|\textbf{s}_{j-1}) & j > 1
\end{matrix} \right. \end{matrix} \right.
\end{displaymath} \end{displaymath}
...@@ -1455,12 +1456,6 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1455,12 +1456,6 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 一些变种
\begin{frame}{改进 - fine-tuning}
%%% 图
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\subsection{注意力机制} \subsection{注意力机制}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
...@@ -1564,7 +1559,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1564,7 +1559,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{itemize} \begin{itemize}
\item 在注意力机制中,每个目标语单词的生成会使用一个动态的源语表示,而非一个统一的固定表示 \item 在注意力机制中,每个目标语单词的生成会使用一个动态的源语表示,而非一个统一的固定表示
\begin{itemize} \begin{itemize}
\item 这里$C_i$表示第$i$个目标语单词所使用的源语表示 \item 这里$\textbf{C}_i$表示第$i$个目标语单词所使用的源语表示
\end{itemize} \end{itemize}
\end{itemize} \end{itemize}
...@@ -1646,9 +1641,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1646,9 +1641,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north); \draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$C_1$}} -- ([yshift=3em]t1.north) ; \draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$\textbf{C}_1$}} -- ([yshift=3em]t1.north) ;
\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$C_2$}} -- ([yshift=3em]t2.north) ; \draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$\textbf{C}_2$}} -- ([yshift=3em]t2.north) ;
\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$C_i$}} -- ([yshift=3.5em]t4.north) ; \draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$\textbf{C}_i$}} -- ([yshift=3.5em]t4.north) ;
\node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...}; \node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...};
\node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...}; \node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...};
...@@ -1663,15 +1658,15 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1663,15 +1658,15 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% C_i的定义 %%% C_i的定义
\begin{frame}{上下文向量$C_i$} \begin{frame}{上下文向量$\textbf{C}_i$}
\begin{itemize} \begin{itemize}
\item 对于目标语位置$i$$C_i$是目标语$i$使用的上下文向量 \item 对于目标语位置$i$$\textbf{C}_i$是目标语$i$使用的上下文向量
\begin{itemize} \begin{itemize}
\item $h_j$表示编码器第$j$个位置的隐层状态 \item $\textbf{h}_j$表示编码器第$j$个位置的隐层状态
\item $s_i$表示解码器第$i$个位置的隐层状态 \item $\textbf{s}_i$表示解码器第$i$个位置的隐层状态
\item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小 \item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小
\item<2-> $a(\cdot,\cdot)$表示注意力函数,计算$s_{i-1}$$h_j$之间的相关性 \item<2-> $a(\cdot,\cdot)$表示注意力函数,计算$\textbf{s}_{i-1}$$\textbf{h}_j$之间的相关性
\item<3-> $C_i$是所有源语编码表示$\{h_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$ \item<3-> $\textbf{C}_i$是所有源语编码表示$\{\textbf{h}_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$
\end{itemize} \end{itemize}
\end{itemize} \end{itemize}
...@@ -1680,17 +1675,17 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1680,17 +1675,17 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{scope} \begin{scope}
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$h_1$}}; \node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$h_2$}}; \node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}}; \node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$h_n$}}; \node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_n$}};
\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {}; \node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south); \draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east); \draw [thick,-,ublue] (sum.west) -- (sum.east);
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$s_{i-1}$}}; \node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{i-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$s_{i}$}}; \node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{i}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}}; \draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}}; \draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}};
...@@ -1699,7 +1694,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1699,7 +1694,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west); \draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west); \draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east); \draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$C_{i}$}}; \draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{i}$}};
\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}}; \node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}};
\draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south); \draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
...@@ -1711,11 +1706,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1711,11 +1706,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$4$)}}; \node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$4$)}};
\visible<2->{ \visible<2->{
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$C_i = \sum_{j} \alpha_{i,j} h_j \ \ $}; \node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_i = \sum_{j} \alpha_{i,j} \textbf{h}_j \ \ $};
} }
\visible<3->{ \visible<3->{
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$}; \node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(s_{i-1}, h_j)$}; \node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{i-1}, \textbf{h}_j)$};
} }
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
...@@ -1841,9 +1836,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1841,9 +1836,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<3->{ \visible<3->{
% coverage score formula node % coverage score formula node
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$C_i$所对应的源语言词的权重是不同的}}; \node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$C_2=0.4 \times h(\textrm{``你''}) + 0.4 \times h(\textrm{``什么''}) +$}}; \node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times h(\textrm{``都''}) + 0.1 \times h(\textrm{``没''}) + ..$}}; \node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + ..$}};
} }
\visible<3->{ \visible<3->{
...@@ -1867,11 +1862,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1867,11 +1862,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
% coverage score for each source word % coverage score for each source word
\visible<2->{ \visible<2->{
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$C_1 = \sum_{i=1}^{8} \alpha_{i1} h_{i}$}; \node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$\textbf{C}_1 = \sum_{i=1}^{8} \alpha_{i1} \textbf{h}_{i}$};
} }
\visible<3->{ \visible<3->{
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$C_2 = \sum_{i=1}^{8} \alpha_{i2} h_{i}$}; \node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$\textbf{C}_2 = \sum_{i=1}^{8} \alpha_{i2} \textbf{h}_{i}$};
} }
\end{tikzpicture} \end{tikzpicture}
...@@ -1886,8 +1881,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1886,8 +1881,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
{\small {\small
\begin{tabular}{l | l} \begin{tabular}{l | l}
引入注意力机制以前 & 引入注意力机制以后 \\ \hline 引入注意力机制以前 & 引入注意力机制以后 \\ \hline
$\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C_1})$ \\ $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{\textbf{C}})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{\textbf{C}_1})$ \\
$\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C_2})$ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}_2})$
\end{tabular} \end{tabular}
} }
\end{center} \end{center}
...@@ -1902,19 +1897,19 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you'' ...@@ -1902,19 +1897,19 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\ \item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em} \vspace{-0.3em}
\begin{displaymath} \begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))} \alpha_{i,j} = \frac{\exp(a(\textbf{s}_{i-1}, \textbf{h}_j))}{\sum_{j'} \exp(a(\textbf{s}_{i-1}, \textbf{h}_{j'}))}
\end{displaymath} \end{displaymath}
\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$$h$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式:} \item<2-> 注意力函数$a(\textbf{s},\textbf{h})$的目的是捕捉$\textbf{s}$$\textbf{h}$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(\textbf{s},\textbf{h})$的方式:}
\vspace{-1em} \vspace{-1em}
\visible<3->{ \visible<3->{
\begin{displaymath} \begin{displaymath}
a(s,h) = \left\{ \begin{array}{ll} a(\textbf{s},\textbf{h}) = \left\{ \begin{array}{ll}
s h^T & \textrm{向量乘} \\ \textbf{s} \textbf{h}^{\textrm{T}} & \textrm{向量乘} \\
\textrm{cos}(s, h) & \textrm{向量夹角} \\ \textrm{cos}(\textbf{s}, \textbf{h}) & \textrm{向量夹角} \\
s \textbf{W} h^T & \textrm{线性模型} \\ \textbf{s} \textbf{W} \textbf{h}^{\textrm{T}} & \textrm{线性模型} \\
\textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}[s,h]+\textrm{单层网络} \textrm{TanH}(\textbf{W}[\textbf{s},\textbf{h}])\textbf{v}^{\textrm{T}} & \textrm{拼接}[\textbf{s},\textbf{h}]+\textrm{单层网络}
\end{array} \end{array}
\right. \right.
\end{displaymath} \end{displaymath}
...@@ -1933,13 +1928,149 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you'' ...@@ -1933,13 +1928,149 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 实验结果 %%% 如何进一步理解注意力机制
\begin{frame}{效果} \begin{frame}{重新解释注意力机制}
%% 实用注意力机制带来的提升 \begin{itemize}
%% 个大评测比赛没有不使用注意力机制的系统,已经成为标配 \item 换一个问题,假设有若干key-value单元,其中key是这个单元的索引表示,value是这个单元的值。对于任意一个query,可以找到匹配的key,并输出其对应的value
\end{itemize}
\vspace{-0.8em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=blue!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key1label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
\node [rnode,anchor=east,pattern=horizontal lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north) node [pos=0.5,below,yshift=0.2em] {\scriptsize{匹配}};
\node [anchor=north] (result) at (value3.south) {\scriptsize{\alert{返回结果}}};
\end{scope}
\end{tikzpicture}
\end{center}
\vspace{-0.7em}
\begin{itemize}
\item<2-> 注意力机制也可以被看做对key-value单元的查询,但是所有key和query之间都有一种匹配程度,返回结果是对所有value的加权
\end{itemize}
\visible<2->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key1label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
\node [rnode,anchor=east,pattern=vertical lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
\node [anchor=south east] (alpha1) at (key1.north east) {\scriptsize{$\alpha_1$}};
\node [anchor=south east] (alpha2) at (key2.north east) {\scriptsize{$\alpha_2$}};
\node [anchor=south east] (alpha3) at (key3.north east) {\scriptsize{$\alpha_3$}};
\node [anchor=south east] (alpha4) at (key4.north east) {\scriptsize{$\alpha_4$}};
\node [anchor=north] (result) at ([xshift=-1.5em]value2.south east) {\scriptsize{\alert{返回结果}=$\alpha_1 \cdot \textrm{value}_1 + \alpha_2 \cdot \textrm{value}_2 + \alpha_3 \cdot \textrm{value}_3 + \alpha_4 \cdot \textrm{value}_4$}};
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 如何进一步理解注意力机制 - 回到机器翻译任务
\begin{frame}{重新解释注意力机制(续)}
\begin{itemize}
\item 回到机器翻译,如果把目标语状态$\textbf{s}_{i-1}$看做query,而把源语言所有位置的最上层RNN表示$\textbf{h}_{j}$看做{\color{ugreen} \textbf{key}}{\color{red} \textbf{value}}
\end{itemize}
\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Source-side hidden states act as both keys (green, top row) and values (red, bottom row).
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]

\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};

\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};

% The decoder state is the query; each arrow carries an attention weight.
\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\textbf{s}(\textrm{``you''})$}};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};

\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);

\node [anchor=south east] (alpha1) at ([xshift=1em]key1.north east) {\scriptsize{$\alpha_1=.4$}};
\node [anchor=south east] (alpha2) at ([xshift=1em]key2.north east) {\scriptsize{$\alpha_2=.4$}};
\node [anchor=south east] (alpha3) at ([xshift=1em]key3.north east) {\scriptsize{$\alpha_3=0$}};
\node [anchor=south east] (alpha4) at ([xshift=1em]key4.north east) {\scriptsize{$\alpha_4=.1$}};
\end{scope}
\end{tikzpicture}
\end{center}
\vspace{-2.5em}
% The weights below mirror the figure: alpha_1..alpha_4 belong to 你/什么/也/没.
% (align* replaces the obsolete eqnarray environment; amsmath is loaded by beamer.)
\begin{align*}
\textbf{C}_3 = {} & 0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) + {} \\
                  & 0 \times \textbf{h}(\textrm{``也''}) + 0.1 \times \textbf{h}(\textrm{``没''})
\end{align*}
\vspace{-0.5em}
\begin{itemize}
\item<2-> 注意力机制也可以被看做是一个重新生成value的过程:对于一组value值,注意力模型对他们加权求和,并得到一个新的value。而这个新的value实际上就是query所对应查询结果,在机器翻译中被看做是目标语所对应的源语言上下文表示。
\end{itemize}
\end{frame}
\subsection{训练及推断}
%%%------------------------------------------------------------------------------------------------------------
%%% 训练 %%% 训练
\begin{frame}{训练} \begin{frame}{训练}
\end{frame} \end{frame}
...@@ -1949,6 +2080,14 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you'' ...@@ -1949,6 +2080,14 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
\begin{frame}{推断} \begin{frame}{推断}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 实验结果
\begin{frame}{效果}
%% 实用注意力机制带来的提升
%% 个大评测比赛没有不使用注意力机制的系统,已经成为标配
\end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% GNMT %%% GNMT
\begin{frame}{成功案例 - GNMT} \begin{frame}{成功案例 - GNMT}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论