new pages

1764d5c6 · xiaotong · 034b3c57 · 1764d5c6 · 1764d5c6
Commit 1764d5c6 authored Nov 18, 2019 by xiaotong
--- a/Section06-Neural-Machine-Translation/section06-test.tex
+++ b/Section06-Neural-Machine-Translation/section06-test.tex
@@ -28,7 +28,6 @@

 \usetikzlibrary{calc,intersections}
 \usetikzlibrary{matrix}
-\usetikzlibrary{patterns}
 \usetikzlibrary{arrows,decorations.pathreplacing}
 \usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
 \usetikzlibrary{shadows.blur}
@@ -145,170 +144,30 @@
 \subsection{注意力机制}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 解码 - beam search
-\begin{frame}{推断 - Beam Search}
+%%% 如何定义注意力函数
+\begin{frame}{计算注意力权重 - 注意力函数}
 \begin{itemize}
-\item \textbf{Greedy Search}: 目标语每一个位置，输出层的Softmax可以得到所有单词的概率，然后选择一个概率最大单词输出，下一个位置的预测就基于这一步输出的单词
-\item \textbf{Beach Search}: 为了避免贪婪方法造成的错误累加，可以每次对$b$个单词进行扩展，而不是只使用一个单词，其中$b$称做束的宽度 - 这样可以搜索更多可能的译文
-\end{itemize}
-
+\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化：\\
 \vspace{-0.3em}
-\visible<2->{
-\begin{center}
-\begin{tikzpicture}
-\begin{scope}
-\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
-\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];
-
-
-\visible<3->{
-\node [rnnnode,anchor=west,fill=green!20] (t1) at (0,0) {\tiny{$e_y()$}};
-}
-\visible<7->{
-\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=2.2em]t1.east) {\tiny{$e_y()$ ($\times 3$)}};
-}
-\visible<8->{
-\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=2.2em]t2.east) {\tiny{$e_y()$ ($\times 3$)}};
-\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\tiny{...}};
-}
-\visible<3->{
-\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
-}
-\visible<7->{
-\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$ ($\times 3$)}};
-}
-\visible<8->{
-\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$ ($\times 3$)}};
-\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\tiny{...}};
-}
-\visible<3->{
-\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
-}
-\visible<7->{
-\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax ($\times 3$)}};
-}
-\visible<8->{
-\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax ($\times 3$)}};
-\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
-}
-
-\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
-
-\visible<6->{
-\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
-\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\tiny{Have}};
-\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\tiny{Have}};
-}
-
-\visible<8->{
-\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
-\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\tiny{you}};
-\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\tiny{you}};
-}
-
-\visible<5->{
-\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\tiny{Have}};
-\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\tiny{Have}};
-\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\tiny{Have}};
-}
-
-\visible<8->{
-\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\tiny{you}};
-\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\tiny{you}};
-\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\tiny{you}};
-}
-
-\visible<8->{
-\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\tiny{learned}};
-\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\tiny{learned}};
-\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\tiny{learned}};
-}
-
-\visible<3->{
-\foreach \x in {1}{
-    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
-    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
-    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
-   
-}
-}
-
-\visible<5->{
- \draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top-3}};
- }
-
-\visible<7->{
-\foreach \x in {2}{
-    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
-    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
-    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
-    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
-}
-}
-
-\visible<8->{
-\foreach \x in {3}{
-    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
-    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
-    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
-    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
-}
-}
-
-\visible<3->{
-\draw [->] ([xshift=-0.5em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left,inner sep=1pt] {\tiny{0}};
-}
-\visible<7->{
-\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
-}
-\visible<8->{
-\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
-}
-
-\visible<6->{
-\draw [->,very thick,dotted] (wo1.east) .. controls +(east:0.6) and +(west:0.8) ..(wt2copy2.west);
-}
-\visible<8->{
-\draw [->,very thick,dotted] (wo2.east) .. controls +(east:0.6) and +(west:0.8) ..(wt3copy2.west);
-}
-
-\visible<7->{
-\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\tiny{$\textbf{C}_2$}};
-\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\tiny{$\textbf{C}_2$}};
-\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\tiny{$\textbf{C}_2$}};
-\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
-\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
-}
-
-\visible<8->{
-\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\tiny{$\textbf{C}_3$}};
-\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\tiny{$\textbf{C}_3$}};
-\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\tiny{$\textbf{C}_3$}};
-\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
-\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
-}
-
-\visible<3->{
-\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
-\node [anchor=south] (vocablabel) at (vocab.north) {\tiny{单词的概率分布}};
-\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east); 
-}
-
-\visible<4->{
-\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\tiny{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
-\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) ..  ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\tiny{top-3}};
-
-\visible<4->{
-\node [anchor=north] (cap) at (vocab.south east) {\scriptsize{\textbf{束搜索($b=3$)}}};
-}
-}
-
-
-\end{scope}
-\end{tikzpicture}
-\end{center}
-}
-
+\begin{displaymath}
+\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
+\end{displaymath}
+
+\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$和$h$之间的\alert{相似性}，这也可以被看作是目标语表示和源语言表示的一种``统一化''，即把源语言和目标语表示在同一个语义空间，进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式：}
+    
+    \visible<3->{
+    \begin{displaymath}
+    a(s,h) =  \left\{ \begin{array}{ll}
+    s h^T & \textrm{向量乘} \\
+    \cos(s, h) & \textrm{向量夹角} \\
+    s \textbf{W} h^T & \textrm{线性模型} \\
+    \tanh(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}
+    \end{array}
+    \right.
+    \end{displaymath}
+    $\textbf{W}$和$\textbf{v}$是可学习参数
+    }
+\end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex