Commit 65c39211 by xiaotong

re-submit

parent 1764d5c6
......@@ -28,6 +28,7 @@
\usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows,decorations.pathreplacing}
\usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{shadows.blur}
......@@ -144,30 +145,170 @@
\subsection{注意力机制}
%%%------------------------------------------------------------------------------------------------------------
%%% 如何定义注意力函数
\begin{frame}{计算注意力权重 - 注意力函数}
%%% 解码 - beam search
\begin{frame}{推断 - Beam Search}
\begin{itemize}
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em}
\begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
\end{displaymath}
\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$$h$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式:}
\visible<3->{
\begin{displaymath}
a(s,h) = \left\{ \begin{array}{ll}
s h^T & \textrm{向量乘} \\
\textrm{cos}(s, h) & \textrm{向量夹角} \\
s \textbf{W} h^T & \textrm{线性模型} \\
\textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}
\end{array}
\right.
\end{displaymath}
$\textbf{W}$$\textbf{v}$是可学习参数
}
\item \textbf{Greedy Search}: 目标语每一个位置,输出层的Softmax可以得到所有单词的概率,然后选择一个概率最大单词输出,下一个位置的预测就基于这一步输出的单词
\item \textbf{Beam Search}: 为了避免贪婪方法造成的错误累加,可以每次对$b$个单词进行扩展,而不是只使用一个单词,其中$b$称做束的宽度 - 这样可以搜索更多可能的译文
\end{itemize}
\vspace{-0.3em}
% Beam-search illustration: three time-step columns (t1..t3, s1..s3, o1..o3)
% laid out left to right and revealed step by step with beamer overlays <2-> .. <8->.
\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
% rnnnode: rounded, filled box used for the embedding / state / softmax layers.
% wnode: same geometry with a white fill, used for the word labels.
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];
% Bottom row: embedding nodes e_y(); t1 anchors the whole picture at (0,0).
\visible<3->{
\node [rnnnode,anchor=west,fill=green!20] (t1) at (0,0) {\tiny{$e_y()$}};
}
\visible<7->{
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=2.2em]t1.east) {\tiny{$e_y()$ ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=2.2em]t2.east) {\tiny{$e_y()$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\tiny{...}};
}
% Middle row: state nodes s_1..s_3, each placed above its embedding node.
\visible<3->{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
}
\visible<7->{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$ ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\tiny{...}};
}
% Top row: softmax nodes, each placed above its state node.
\visible<3->{
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
}
\visible<7->{
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
}
% Input word labels below the embedding nodes. The *copy1/*copy2 nodes are the
% same label shifted by -0.2em/-0.4em, so each word is drawn as a stack of three boxes.
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
\visible<6->{
\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\tiny{Have}};
\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\tiny{Have}};
}
\visible<8->{
\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\tiny{you}};
\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\tiny{you}};
}
% Output word labels above the softmax nodes, stacked the same way as the inputs.
\visible<5->{
\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\tiny{Have}};
\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\tiny{Have}};
\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\tiny{Have}};
}
\visible<8->{
\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\tiny{you}};
\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\tiny{you}};
\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\tiny{you}};
}
\visible<8->{
\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\tiny{learned}};
\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\tiny{learned}};
\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\tiny{learned}};
}
% Vertical arrows inside column 1: word -> embedding -> state -> softmax.
\visible<3->{
\foreach \x in {1}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
}
}
% Column 1 "top-3" arrow above the softmax; it appears later (<5->) than the rest of the column.
\visible<5->{
\draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top-3}};
}
% Columns 2 and 3: the same vertical arrows plus the "top-3" arrow, revealed together.
\visible<7->{
\foreach \x in {2}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
}
}
\visible<8->{
\foreach \x in {3}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
}
}
% Horizontal arrows along the state row; the arrow entering s1 is labelled "0".
\visible<3->{
\draw [->] ([xshift=-0.5em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left,inner sep=1pt] {\tiny{0}};
}
\visible<7->{
\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
}
\visible<8->{
\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
}
% Dotted feedback arrows: output words of one column to the input-word stack of the next column.
\visible<6->{
\draw [->,very thick,dotted] (wo1.east) .. controls +(east:0.6) and +(west:0.8) ..(wt2copy2.west);
}
\visible<8->{
\draw [->,very thick,dotted] (wo2.east) .. controls +(east:0.6) and +(west:0.8) ..(wt3copy2.west);
}
% Context nodes C_2 / C_3 (again stacked three deep), placed below the previous
% column's embedding node and connected by a curved arrow to s2 / s3.
\visible<7->{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\tiny{$\textbf{C}_2$}};
\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
}
\visible<8->{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\tiny{$\textbf{C}_3$}};
\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}
% Word/probability table to the left of s1, with its label above it, and a red dotted
% arrow from the first softmax node to the table.
\visible<3->{
\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
\node [anchor=south] (vocablabel) at (vocab.north) {\tiny{单词的概率分布}};
\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east);
}
% Three-word "top-3" list next to the first output word, with an arrow from the table,
% plus the figure caption. The inner \visible<4-> repeats the enclosing overlay spec.
\visible<4->{
\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\tiny{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) .. ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\tiny{top-3}};
\visible<4->{
\node [anchor=north] (cap) at (vocab.south east) {\scriptsize{\textbf{束搜索($b=3$)}}};
}
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
......
......@@ -28,6 +28,7 @@
\usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows,decorations.pathreplacing}
\usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{shadows.blur}
......@@ -469,9 +470,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{itemize}
\item 一个简单的例子:基于循环神经网络的翻译过程
\begin{itemize}
\item 顺序处理源语言单词
\item 源语言句子信息被表示在最后一个循环单元的输出中
\item 逐词生成目标语译文
\item<1-> \textbf{编码器}顺序处理源语言单词
\item<5-> 源语言句子信息被表示在最后一个循环单元的输出中
\item<6-> \textbf{解码器}利用源语言句子信息逐词生成目标语译文
\end{itemize}
\end{itemize}
%%% 运行实例的图
......@@ -480,7 +481,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\setlength{\base}{0.6cm}
\tikzstyle{rnnnode} = [minimum size=\base,inner sep=0pt,rounded corners=1pt,draw]
\tikzstyle{wordnode} = [font=\normalsize]
\tikzstyle{wordnode} = [font=\normalsize,align=center]
\begin{scope}
\visible<1->{
......@@ -504,63 +505,76 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,below=0pt of emb3] (word3) {};
\draw[-latex'] (emb3.north) to (rnn3.south);
\draw[-latex'] (rnn2.east) to (rnn3.west);
\node[rnnnode,fill=blue!30!white,right=\base of rnn3] (rnn4) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn4] (emb4) {};
\node[wordnode,below=0pt of emb4] (word4) {$\langle$eos$\rangle$};
\node[wordnode,below=0pt of emb4] (word4) {EOS};
\draw[-latex'] (emb4.north) to (rnn4.south);
\draw[-latex'] (rnn3.east) to (rnn4.west);
}
\visible<4>{
\node[rnnnode,fill=purple] (repr) at (rnn4) {};
\node[wordnode,above=\base of repr] (label) {句子表示};
\draw[->,dashed] (label.south) to (rnn4.north);
\visible<4->{
\draw[decoration={mirror,brace},decorate] (word1.south west) to node [auto,anchor=north,align=center] {编码器} ([yshift=-0.2em]word4.south east);
}
\visible<5->{
\node[rnnnode,fill=red!30!white,above=\base of rnn4] (softmax1) {};
\node[wordnode,above=0pt of softmax1] (out1) {I};
\draw[-latex'] (rnn4.north) to (softmax1.south);
\node[rnnnode,fill=purple] (repr) at (rnn4) {};
\node[wordnode,above=\base of rnn2] (label) {源语言句子信息};
\draw[->,dashed,thick] (label.east) .. controls +(east:\base) and +(north:\base) .. (rnn4.north);
}
\visible<6->{
\node[rnnnode,fill=blue!30!white,right=\base of rnn4] (rnn5) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn5] (emb5) {};
\node[rnnnode,fill=red!30!white,above=\base of rnn5] (softmax2) {};
\ExtractX{$(emb5)$}
\ExtractY{$(word4.base)$}
\node[wordnode,anchor=base] (word5) at (\XCoord,\YCoord) {I};
\ExtractX{$(emb5)$}
\ExtractY{$(out1.base)$}
\node[wordnode,anchor=base] (out2) at (\XCoord,\YCoord) {am};
\node[wordnode,below=0pt of emb5] (word5) {SOS};
\draw[-latex'] (emb5.north) to (rnn5.south);
\draw[-latex'] (rnn4.east) to (rnn5.west);
\draw[-latex'] (rnn5.north) to (softmax2.south);
\node[rnnnode,fill=red!30!white,above=\base of rnn5] (softmax1) {};
\node[wordnode,above=0pt of softmax1] (out1) {I};
\draw[-latex'] (rnn5.north) to (softmax1.south);
}
\visible<7->{
\node[rnnnode,fill=blue!30!white,right=\base of rnn5] (rnn6) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn6] (emb6) {};
\node[rnnnode,fill=red!30!white,above=\base of rnn6] (softmax3) {};
\node[rnnnode,fill=red!30!white,above=\base of rnn6] (softmax2) {};
\ExtractX{$(emb6)$}
\ExtractY{$(word4.base)$}
\node[wordnode,anchor=base] (word6) at (\XCoord,\YCoord) {am};
\node[wordnode,anchor=base] (word6) at (\XCoord,\YCoord) {I};
\ExtractX{$(emb6)$}
\ExtractY{$(out1.base)$}
\node[wordnode,anchor=base] (out3) at (\XCoord,\YCoord) {fine};
\node[wordnode,anchor=base] (out2) at (\XCoord,\YCoord) {am};
\draw[-latex'] (emb6.north) to (rnn6.south);
\draw[-latex'] (rnn5.east) to (rnn6.west);
\draw[-latex'] (rnn6.north) to (softmax3.south);
\draw[-latex'] (rnn6.north) to (softmax2.south);
}
\visible<8->{
\node[rnnnode,fill=blue!30!white,right=\base of rnn6] (rnn7) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn7] (emb7) {};
\node[rnnnode,fill=red!30!white,above=\base of rnn7] (softmax4) {};
\node[rnnnode,fill=red!30!white,above=\base of rnn7] (softmax3) {};
\ExtractX{$(emb7)$}
\ExtractY{$(word4.base)$}
\node[wordnode,anchor=base] (word7) at (\XCoord,\YCoord) {fine};
\node[wordnode,anchor=base] (word7) at (\XCoord,\YCoord) {am};
\ExtractX{$(emb7)$}
\ExtractY{$(out1.base)$}
\node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
\node[wordnode,anchor=base] (out3) at (\XCoord,\YCoord) {fine};
\draw[-latex'] (emb7.north) to (rnn7.south);
\draw[-latex'] (rnn6.east) to (rnn7.west);
\draw[-latex'] (rnn7.north) to (softmax4.south);
\draw[-latex'] (rnn7.north) to (softmax3.south);
\node[rnnnode,fill=blue!30!white,right=\base of rnn7] (rnn8) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn8] (emb8) {};
\node[rnnnode,fill=red!30!white,above=\base of rnn8] (softmax4) {};
\ExtractX{$(emb8)$}
\ExtractY{$(word4.base)$}
\node[wordnode,anchor=base] (word8) at (\XCoord,\YCoord) {fine};
\ExtractX{$(emb8)$}
\ExtractY{$(out1.base)$}
\node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {EOS};
\draw[-latex'] (emb8.north) to (rnn8.south);
\draw[-latex'] (rnn7.east) to (rnn8.west);
\draw[-latex'] (rnn8.north) to (softmax4.south);
}
\visible<9->{
\ExtractX{$(word8.east)$}
\ExtractY{$(word5.south)$}
\draw[decoration={mirror,brace},decorate] ([yshift=-0.2em]word5.south west) to node [auto,anchor=north,align=center] {解码器} (\XCoord,\YCoord-0.2em);
}
\end{scope}
\end{tikzpicture}
......@@ -639,22 +653,23 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\textbf{入门:循环网络翻译模型及注意力机制} \\
\small{1. 起源} \\
\small{2. 模型结构} \\
\small{3. 注意力机制}
\small{3. 注意力机制} \\
\small{4. 训练和推断}
}
\end{tcolorbox}
\vspace{0.5em}
\vspace{0.2em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\large
\textbf{热门:Transformer} \\
\small{1. 多头自注意力模型} \\
\small{2. 训练及推断} \\
\small{3. 深层网络翻译模型}
\small{1. 自注意力模型} \\
\small{2. 多头注意力和层正则化} \\
\small{3. 更深、更宽的模型}
}
\end{tcolorbox}
\vspace{0.5em}
\vspace{0.2em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\large
......@@ -957,8 +972,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\item<2-> \textbf{核心}:如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中,有三个步骤
\begin{enumerate}
\item 输入的单词用分布式表示,如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$,同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$
\item 源语言句子被一个RNN编码为一个表示$C$,如前面的例子中是一个实数向量
\item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$s_{j-1}$(这里,$s_{j-1}$表示RNN第$j-1$步骤的隐层状态)
\item 源语言句子被一个RNN编码为一个表示$\textbf{C}$,如前面的例子中是一个实数向量
\item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$\textbf{s}_{j-1}$(这里,$\textbf{s}_{j-1}$表示RNN第$j-1$步骤的隐层状态)
\end{enumerate}
\end{itemize}
......@@ -985,9 +1000,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$h_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$h_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$h_{m}$}};
\node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
......@@ -999,7 +1014,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$s_\x$}}};
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
......@@ -1043,7 +1058,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
\visible<2->{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$s_i$}};
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_i$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
......@@ -1060,7 +1075,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$C$}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\textbf{C}$}};
}
\begin{pgfonlayer}{background}
......@@ -1096,7 +1111,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\item 可以重新定义\\
\vspace{-0.8em}
\begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|s_{j-1}, C)
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|\textbf{s}_{j-1}, \textbf{C})
\end{displaymath}
对于上图中的模型,进一步化简为:\\
......@@ -1105,8 +1120,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
\begin{matrix}
\textrm{P}(y_j|C)\ \ \ \ & j = 1 \\
\textrm{P}(y_j|s_{j-1}) & j > 1
\textrm{P}(y_j|\textbf{C})\ \ \ \ & j = 1 \\
\textrm{P}(y_j|\textbf{s}_{j-1}) & j > 1
\end{matrix} \right.
\end{displaymath}
......@@ -1203,7 +1218,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.5\base]dec\x.north) {};
% Decoder input words
\node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
\node[wordnode,below=0pt of demb1] (decwordin) {EOS};
\ExtractX{$(demb2.south)$}
\ExtractY{$(decwordin.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
......@@ -1260,7 +1275,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Station};
\ExtractX{$(softmax10.north)$}
\ExtractY{$(decwordout.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {EOS};
% Connections
\draw[-latex'] (init.east) to (enc1.west);
......@@ -1341,7 +1356,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,below=0pt of eemb7] () {怎么};
\node[wordnode,below=0pt of eemb8] () {};
\node[wordnode,below=0pt of eemb9] () {};
\node[wordnode,below=0pt of eemb10] () {$\langle$eos$\rangle$};
\node[wordnode,below=0pt of eemb10] () {EOS};
% RNN Decoder
\foreach \x in {1,2,...,10}
......@@ -1411,7 +1426,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Station};
\ExtractX{$(softmax10.north)$}
\ExtractY{$(decwordout.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {EOS};
% Connections
\draw[-latex'] (init1.east) to (enc11.west);
......@@ -1454,12 +1469,6 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 一些变种
\begin{frame}{改进 - fine-tuning}
%%% 图
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\subsection{注意力机制}
%%%------------------------------------------------------------------------------------------------------------
......@@ -1542,6 +1551,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{itemize}
\item 关注的顺序:大狗的帽子 $\to$ 大狗 $\to$ 小狗的帽子 $\to$ 小狗
\end{itemize}
\item 人往往不是``均匀地''看图像中的所有区域,翻译是一个道理,生成一个目标语单词时参考的源语单词不会太多
\end{itemize}
\begin{center}
......@@ -1549,7 +1559,6 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\end{center}
\begin{itemize}
\item<2-> 人往往不是``均匀地''看图像中的所有区域,翻译是一个道理,生成一个目标语单词时参考的源语单词不会太多
\item<2-> \alert{注意力机制}在机器翻译中已经成功应用,经典的论文\\
\textbf{Neural Machine Translation by Jointly Learning to Align and Translate}\\
\textbf{Bahdanau et al., 2015, In Proc of ICLR}
......@@ -1563,7 +1572,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{itemize}
\item 在注意力机制中,每个目标语单词的生成会使用一个动态的源语表示,而非一个统一的固定表示
\begin{itemize}
\item 这里$C_i$表示第$i$个目标语单词所使用的源语表示
\item 这里$\textbf{C}_i$表示第$i$个目标语单词所使用的源语表示
\end{itemize}
\end{itemize}
......@@ -1645,9 +1654,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$C_1$}} -- ([yshift=3em]t1.north) ;
\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$C_2$}} -- ([yshift=3em]t2.north) ;
\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$C_i$}} -- ([yshift=3.5em]t4.north) ;
\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$\textbf{C}_1$}} -- ([yshift=3em]t1.north) ;
\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$\textbf{C}_2$}} -- ([yshift=3em]t2.north) ;
\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$\textbf{C}_i$}} -- ([yshift=3.5em]t4.north) ;
\node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...};
\node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...};
......@@ -1662,15 +1671,15 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
%%%------------------------------------------------------------------------------------------------------------
%%% C_i的定义
\begin{frame}{上下文向量$C_i$}
\begin{frame}{上下文向量$\textbf{C}_i$}
\begin{itemize}
\item 对于目标语位置$i$$C_i$是目标语$i$使用的上下文向量
\item 对于目标语位置$i$,$\textbf{C}_i$是目标语$i$使用的上下文向量
\begin{itemize}
\item $h_j$表示编码器第$j$个位置的隐层状态
\item $s_i$表示解码器第$i$个位置的隐层状态
\item $\textbf{h}_j$表示编码器第$j$个位置的隐层状态
\item $\textbf{s}_i$表示解码器第$i$个位置的隐层状态
\item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小
\item<2-> $a(\cdot)$表示注意力函数,计算$s_{i-1}$$h_j$之间的相关性
\item<3-> $C_i$是所有源语编码表示$\{h_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$
\item<2-> $a(\cdot)$表示注意力函数,计算$\textbf{s}_{i-1}$和$\textbf{h}_j$之间的相关性
\item<3-> $\textbf{C}_i$是所有源语编码表示$\{\textbf{h}_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$
\end{itemize}
\end{itemize}
......@@ -1679,17 +1688,17 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{scope}
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$h_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$h_2$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$h_n$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_n$}};
\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east);
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$s_{i-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$s_{i}$}};
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{i-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{i}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}};
......@@ -1698,7 +1707,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$C_{i}$}};
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{i}$}};
\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}};
\draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
......@@ -1710,11 +1719,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$4$)}};
\visible<2->{
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$C_i = \sum_{j} \alpha_{i,j} h_j \ \ $};
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_i = \sum_{j} \alpha_{i,j} \textbf{h}_j \ \ $};
}
\visible<3->{
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(s_{i-1}, h_j)$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{i-1}, \textbf{h}_j)$};
}
\begin{pgfonlayer}{background}
......@@ -1820,7 +1829,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<3->{
% alignment bars 2
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn21) at ([xshift=2.3\hnode,yshift=-0.0\hnode]alignment2.east) {};
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn21) at ([xshift=2.3\hnode,yshift=0.5\hnode]alignment2.east) {};
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn22) at ([xshift=1pt]attn21.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn23) at ([xshift=1pt]attn22.south east) {};
\node[probnode,anchor=south west,minimum height=0.1\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.1$}] (attn24) at ([xshift=1pt]attn23.south east) {};
......@@ -1840,12 +1849,14 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<3->{
% coverage score formula node
\node[anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-2.5\hnode]attn11.south) {\small{不同$C_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + ..$}};
}
\visible<3->{
% matrix -> attn2
\draw[->,red] ([xshift=0.1em,yshift=2.3em]alignment2.east).. controls +(east:1.9cm) and +(west:1.0cm) ..([xshift=-0.15\hnode,yshift=-0.0\hnode]attn21.north west);
\draw[->,red] ([xshift=0.1em,yshift=2.3em]alignment2.east).. controls +(east:1.9cm) and +(west:1.0cm) ..([xshift=-0.15\hnode,yshift=-1em]attn21.north west);
}
\visible<2->{
......@@ -1854,7 +1865,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<3->{
% attn2 -> cov2
\draw[->] ([xshift=0.2\hnode,yshift=0.0\hnode]attn26.east)--([xshift=0.7\hnode,yshift=0.0\hnode]attn26.east) node[pos=0.5,above] (sum2) {\small{$\sum$}}; % 0.3 - 0.5 height of the
\draw[->] ([xshift=0.2\hnode,yshift=0.0\hnode]attn26.east)--([xshift=0.7\hnode,yshift=0]attn26.east) node[pos=0.5,above] (sum2) {\small{$\sum$}}; % 0.3 - 0.5 height of the
}
\visible<2->{
......@@ -1864,11 +1875,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
% coverage score for each source word
\visible<2->{
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$C_1 = \sum_{i=1}^{8} \alpha_{i1} h_{i}$};
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$\textbf{C}_1 = \sum_{j=1}^{8} \alpha_{1,j} \textbf{h}_{j}$};
}
\visible<3->{
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$C_2 = \sum_{i=1}^{8} \alpha_{i2} h_{i}$};
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$\textbf{C}_2 = \sum_{j=1}^{8} \alpha_{2,j} \textbf{h}_{j}$};
}
\end{tikzpicture}
......@@ -1883,8 +1894,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
{\small
\begin{tabular}{l | l}
引入注意力机制以前 & 引入注意力机制以后 \\ \hline
$\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C_1})$ \\
$\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C_2})$
$\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{\textbf{C}})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{\textbf{C}_1})$ \\
$\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}_2})$
\end{tabular}
}
\end{center}
......@@ -1899,19 +1910,19 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em}
\begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
\alpha_{i,j} = \frac{\exp(a(\textbf{s}_{i-1}, \textbf{h}_j))}{\sum_{j'} \exp(a(\textbf{s}_{i-1}, \textbf{h}_{j'}))}
\end{displaymath}
\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$$h$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式:}
\item<2-> 注意力函数$a(\textbf{s},\textbf{h})$的目的是捕捉$\textbf{s}$和$\textbf{h}$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(\textbf{s},\textbf{h})$的方式:}
\vspace{-1em}
\visible<3->{
\begin{displaymath}
a(s,h) = \left\{ \begin{array}{ll}
s h^T & \textrm{向量乘} \\
\textrm{cos}(s, h) & \textrm{向量夹角} \\
s \textbf{W} h^T & \textrm{线性模型} \\
\textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}[s,h]+\textrm{单层网络}
a(\textbf{s},\textbf{h}) = \left\{ \begin{array}{ll}
\textbf{s} \textbf{h}^{\textrm{T}} & \textrm{向量乘} \\
\textrm{cos}(\textbf{s}, \textbf{h}) & \textrm{向量夹角} \\
\textbf{s} \textbf{W} \textbf{h}^{\textrm{T}} & \textrm{线性模型} \\
\textrm{TanH}(\textbf{W}[\textbf{s},\textbf{h}])\textbf{v}^{\textrm{T}} & \textrm{拼接}[\textbf{s},\textbf{h}]+\textrm{单层网络}
\end{array}
\right.
\end{displaymath}
......@@ -1922,6 +1933,524 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Effect of the attention model - heat map
%%% NOTE(review): the frame currently contains a single bullet and no figure; the heat map named above appears to be still missing - confirm.
\begin{frame}{真实的实例}
\begin{itemize}
\item 注意力的权重符合双语对应的规律
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 如何进一步理解注意力机制
\begin{frame}{重新解释注意力机制}
\begin{itemize}
\item 换一个问题,假设有若干key-value单元,其中key是这个单元的索引表示,value是这个单元的值。对于任意一个query,可以找到匹配的key,并输出其对应的value
\end{itemize}
\vspace{-0.8em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Hard key-value lookup: one query is matched against four keys and the value
% stored under the matching key is returned.
% rnode: plain bordered box shared by the value, key and query cells.
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
% Value row, laid out left to right; value3 is filled red, the other three blue.
\node [rnode,anchor=south west,fill=blue!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
% Key row, each key placed just above its value and drawn with its own fill pattern.
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
% White-backed text labels centred on the key cells (the patterns would otherwise
% obscure the text). Every label carries its own node name so that it can be
% referenced unambiguously.
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key2label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key3label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key4label) at (key4) {\scriptsize{key$_4$}};
% The query cell uses the same pattern as key3, the key it is matched with.
\node [rnode,anchor=east,pattern=horizontal lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
% Single curved arrow from the query to key3, annotated with the match label.
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north) node [pos=0.5,below,yshift=0.2em] {\scriptsize{匹配}};
% Result caption placed under value3.
\node [anchor=north] (result) at (value3.south) {\scriptsize{\alert{返回结果}}};
\end{scope}
\end{tikzpicture}
\end{center}
\vspace{-0.7em}
\begin{itemize}
\item<2-> 注意力机制也可以被看做对key-value单元的查询,但是所有key和query之间都有一种匹配程度,返回结果是对所有value的加权
\end{itemize}
\visible<2->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Soft key-value lookup: the query is connected to every key, each key carries a
% weight alpha_k, and the caption shows the result as the weighted sum of all values.
% rnode: plain bordered box shared by the value, key and query cells.
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
% Value row; all four values are filled red here.
\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
% Key row, each key placed just above its value and drawn with its own fill pattern.
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
% White-backed text labels centred on the key cells. Every label carries its own
% node name so that it can be referenced unambiguously.
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key2label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key3label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key4label) at (key4) {\scriptsize{key$_4$}};
% The query uses a pattern (vertical lines) that none of the keys uses.
\node [rnode,anchor=east,pattern=vertical lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
% One arrow per key. The start points are spread horizontally along the top of the
% query and the control-point distance grows with the key index, so the four
% curves fan out instead of overlapping.
\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
% Weight labels at the top-right corner of each key.
\node [anchor=south east] (alpha1) at (key1.north east) {\scriptsize{$\alpha_1$}};
\node [anchor=south east] (alpha2) at (key2.north east) {\scriptsize{$\alpha_2$}};
\node [anchor=south east] (alpha3) at (key3.north east) {\scriptsize{$\alpha_3$}};
\node [anchor=south east] (alpha4) at (key4.north east) {\scriptsize{$\alpha_4$}};
% Result caption: weighted sum over all four values.
\node [anchor=north] (result) at ([xshift=-1.5em]value2.south east) {\scriptsize{\alert{返回结果}=$\alpha_1 \cdot \textrm{value}_1 + \alpha_2 \cdot \textrm{value}_2 + \alpha_3 \cdot \textrm{value}_3 + \alpha_4 \cdot \textrm{value}_4$}};
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 如何进一步理解注意力机制 - 回到机器翻译任务
\begin{frame}{重新解释注意力机制(续)}
\begin{itemize}
\item 回到机器翻译,如果把目标语状态$\textbf{s}_{i-1}$看做query,而把源语言所有位置的最上层RNN表示$\textbf{h}_{j}$看做{\color{ugreen} \textbf{key}}和{\color{red} \textbf{value}}
\end{itemize}
\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Key-value view of attention for the translation example: the same source
% representations h(.) appear once as values (red row) and once as keys (green
% row); the query s(``you'') is connected to every key.
\tikzset{
rnode/.style={draw,minimum width=3.5em,minimum height=1.2em},
valuebox/.style={rnode,anchor=south west,fill=red!20!white},
keybox/.style={rnode,anchor=south west,fill=green!20!white}
}
% Value row, left to right.
\node [valuebox] (value1) at (0,0) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [valuebox] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [valuebox] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [valuebox] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};
% Key row, each key directly above the value with the same index.
\node [keybox] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [keybox] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [keybox] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [keybox] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};
% Query cell and its caption, to the left of the first key.
\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\textbf{s}(\textrm{``you''})$}};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
% One arrow per key: \shift moves the start point along the top of the query,
% \reach is the control-point distance (larger for keys further to the right).
\foreach \shift/\reach/\idx in {6pt/1em/1,3pt/1.5em/2,0pt/2em/3,-3pt/2.5em/4}{
\draw [->] ([yshift=1pt,xshift=\shift]query.north) .. controls +(90:\reach) and +(90:\reach) .. ([yshift=1pt]key\idx.north);
}
% Attention weight shown at the top-right corner of each key.
\foreach \idx/\wgt in {1/.4,2/.4,3/0,4/.1}{
\node [anchor=south east] (alpha\idx) at ([xshift=1em]key\idx.north east) {\scriptsize{$\alpha_{\idx}=\wgt$}};
}
\end{scope}
\end{tikzpicture}
\end{center}
\vspace{-2.5em}
\begin{eqnarray}
\textbf{C}_3 & = & 0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) + \nonumber \\
& & 0 \times \textbf{h}(\textrm{``也''}) + 0.1 \times \textbf{h}(\textrm{``没''}) \nonumber
\end{eqnarray}
\vspace{-0.5em}
\begin{itemize}
\item<2-> 注意力机制也可以被看做是一个重新生成value的过程:对于一组value值,注意力模型对他们加权求和,并得到一个新的value。而这个新的value实际上就是query所对应查询结果,在机器翻译中被看做是目标语所对应的源语言上下文表示。
\end{itemize}
\end{frame}
\subsection{训练及推断}
%%%------------------------------------------------------------------------------------------------------------
%%% Training
%%% TODO: the frame body is empty - content still to be written.
\begin{frame}{训练}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 解码
\begin{frame}{推断}
\begin{itemize}
\item 使用NMT时,对于源语言句子$\textbf{x}$,需要得到最优译文$\hat{\textbf{y}}$
\vspace{-1.5em}
\begin{displaymath}
\hat{\textbf{y}} = \argmax_{\textbf{y}} \log\textrm{P}(\textbf{y}|\textbf{x}) = \argmax_{\textbf{y}} \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
\end{displaymath}
\item<2-> 由于$y_j$的生成需要依赖$y_{j-1}$,因此无法同时生成$\{y_1,\ldots,y_n\}$。常用的方法是自左向右逐个单词生成
\end{itemize}
\vspace{-0.8em}
\visible<3->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Step-by-step decoding picture: an encoder (left) and a decoder (right).
% The \visible overlay specifications below (slides 4 to 9) reveal the decoder
% one step at a time; each step takes the "top1" output of the softmax.
% Node definition order matters: later nodes are positioned relative to earlier ones.
% rnnnode: small rounded, filled box used for every network cell in this picture.
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
% --- Encoder: hidden states h_1 ... h_m ---
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\textbf{h}_1$}};
\node [anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{...}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\textbf{h}_m$}};
% Encoder embedding cells e_x() below the hidden states.
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [anchor=west] (e2) at ([xshift=1em]e1.east) {\tiny{...}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
% Source-side input words below the embeddings.
% NOTE(review): the text of w1 is empty - confirm whether a source word is missing here.
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.8em]e2.south) {\tiny{...}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
% Encoder arrows: word -> embedding -> hidden state, and hidden state -> next hidden state.
\draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
\draw [->] ([xshift=0.1em]h1.east) -- ([xshift=-0.1em]h2.west);
\draw [->] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]h3.west);
% Incoming arrow into h1, labelled 0 at its tail.
\draw [->] ([xshift=-0.8em]h1.west) -- ([xshift=-0.1em]h1.west) node [pos=0,left,inner sep=2pt] {\tiny{0}};
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};
% --- Decoder embedding cells e_y(): t1 from slide 5, t2 from slide 8, t3/t4 from slide 9 ---
\visible<5->{
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]h3.east) {\tiny{$e_y()$}};
}
\visible<8->{
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.5em]t1.east) {\tiny{$e_y()$}};
}
\visible<9->{
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.5em]t2.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.5em]t3.east) {\tiny{$e_y()$}};
\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
}
% --- Decoder states s_1 ... s_4, each above its embedding cell ---
\visible<5->{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
}
\visible<8->{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$}};
}
\visible<9->{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\textbf{s}_4$}};
\node [anchor=west,inner sep=2pt] (s5) at ([xshift=0.3em]s4.east) {\tiny{...}};
}
% --- Softmax output cells above the decoder states, plus the decoder caption ---
\visible<5->{
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
\node [anchor=east] (decoder) at ([xshift=-0.5em]o1.north west) {\scriptsize{\textbf{解码器}}};
}
\visible<8->{
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax}};
}
\visible<9->{
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o4) at ([yshift=1em]s4.north) {\tiny{softmax}};
\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
}
% --- Decoder input words; each appears one slide before the cell it feeds (t1: slide 5, t2: slide 8) ---
\visible<4->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
}
\visible<7->{
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{Have}};
}
\visible<9->{
\node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
\node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.6em]t4.south) {\tiny{learned}};
}
% --- Output words above the softmax cells, with "[step k]" captions stacked on top ---
% The step captions are anchored to the output-word nodes, so the output word
% must be defined before its caption.
\visible<5->{
\node [anchor=center,inner sep=2pt] (wo1) at ([yshift=1.2em]o1.north) {\tiny{Have}};
}
\visible<4->{
\node [anchor=south,inner sep=2pt] (wos1) at (wo1.north) {\tiny{\textbf{[step 1]}}};
}
\visible<8->{
\node [anchor=center,inner sep=2pt] (wo2) at ([yshift=1.2em]o2.north) {\tiny{you}};
}
\visible<7->{
\node [anchor=south,inner sep=2pt] (wos2) at (wo2.north) {\tiny{\textbf{[step 2]}}};
}
\visible<9->{
\node [anchor=center,inner sep=2pt] (wo3) at ([yshift=1.2em]o3.north) {\tiny{learned}};
\node [anchor=south,inner sep=2pt] (wos3) at (wo3.north) {\tiny{\textbf{[step 3]}}};
\node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{nothing}};
\node [anchor=south,inner sep=2pt] (wos4) at (wo4.north) {\tiny{\textbf{[step 4]}}};
}
% --- Vertical arrows per decoder column: input -> e_y -> s -> softmax -> output ("top1") ---
% The same four arrows are repeated for column 1 (slide 5), column 2 (slide 8)
% and columns 3 and 4 (slide 9).
\visible<5->{
\foreach \x in {1}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
}
}
\visible<8->{
\foreach \x in {2}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
}
}
\visible<9->{
\foreach \x in {3,4}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
}
}
% --- Horizontal arrows between decoder states; the arrow into s1 is labelled 0 at its tail ---
\visible<5->{
\draw [->] ([xshift=-0.8em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left] {\tiny{0}};
}
\visible<8->{
\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
}
\visible<9->{
\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
\draw [->] ([xshift=0.1em]s3.east) -- ([xshift=-0.1em]s4.west);
}
% --- Dotted arrows: the output word of one step becomes the input word of the next ---
\visible<7->{
\draw [->,thick,dotted] (wo1.east) .. controls +(east:1.0) and +(west:1.0) ..(wt2.west);
}
\visible<9->{
\draw [->,thick,dotted] (wo2.east) .. controls +(east:1.3) and +(west:1.1) ..(wt3.west);
\draw [->,thick,dotted] (wo3.east) .. controls +(east:0.9) and +(west:0.9) ..(wt4.west);
}
% --- Attention context C_2 (slide 6): drawn above the encoder, fed by h1, h3 and s1 ---
\visible<6->{
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c2) at ([yshift=2em]h2.north) {\tiny{$\textbf{C}_2$}};
\node [anchor=south] (c2label) at (c2.north) {\tiny{\textbf{注意力机制:上下文}}};
\node [anchor=south] (c2more) at ([yshift=-1.5em]c2.south) {\tiny{...}};
\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c2.250);
\draw [->] (h3.north) .. controls +(north:0.6) and +(290:0.9) .. (c2.290);
\draw [->] ([yshift=-0.3em]s1.west) .. controls +(west:2) and +(-50:0.3) .. (c2.-40);
}
% C_2 is routed into s2 from slide 8 on.
% NOTE(review): the 1.358in horizontal run looks hand-tuned to the current node spacing - re-check if widths change.
\visible<8->{
\draw [->] (c2.0) -- ([xshift=1.358in]c2.0) -- ([yshift=0.3em,xshift=-1.2em]s2.west) -- ([yshift=0.3em,xshift=-0.1em]s2.west);
}
% --- Contexts C_3 and C_4 (slide 9): drawn below t2 and t3, each feeding the state to its right ---
\visible<9->{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\textbf{C}_3$}};
\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}
\visible<9->{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\textbf{C}_4$}};
\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s4.west);
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 解码 - beam search
\begin{frame}{推断 - Beam Search}
\begin{itemize}
\item \textbf{Greedy Search}: 目标语每一个位置,输出层的Softmax可以得到所有单词的概率,然后选择一个概率最大单词输出,下一个位置的预测就基于这一步输出的单词
\item \textbf{Beam Search}: 为了避免贪婪方法造成的错误累加,可以每次对$b$个单词进行扩展,而不是只使用一个单词,其中$b$称做束的宽度 - 这样可以搜索更多可能的译文
\end{itemize}
\vspace{-0.3em}
\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Beam-search picture (caption below: b=3): three decoder columns whose cells
% from the second column on are marked "x3", stacked word boxes for the kept
% candidates, and a vocabulary distribution on the left from which the top-3
% words are taken. The \visible overlay specifications reveal it over slides 3 to 8.
% rnnnode: rounded, filled box for network cells; wnode: white box for word candidates.
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];
% --- Decoder embedding cells e_y(): t1 (slide 3), t2 (slide 7), t3 (slide 8) ---
\visible<3->{
\node [rnnnode,anchor=west,fill=green!20] (t1) at (0,0) {\tiny{$e_y()$}};
}
\visible<7->{
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=2.2em]t1.east) {\tiny{$e_y()$ ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=2.2em]t2.east) {\tiny{$e_y()$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\tiny{...}};
}
% --- Decoder states above the embedding cells ---
\visible<3->{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
}
\visible<7->{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$ ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\tiny{...}};
}
% --- Softmax cells above the decoder states ---
\visible<3->{
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
}
\visible<7->{
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
}
% --- Input word boxes below the embedding cells ---
% wt1 has no overlay specification of its own. From the second column on, each
% word box is followed by two copies shifted down-left by 0.2em and 0.4em,
% giving a stack of three boxes.
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
\visible<6->{
\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\tiny{Have}};
\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\tiny{Have}};
}
\visible<8->{
\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\tiny{you}};
\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\tiny{you}};
}
% --- Output word boxes above the softmax cells, stacked three deep in the same way ---
\visible<5->{
\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\tiny{Have}};
\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\tiny{Have}};
\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\tiny{Have}};
}
\visible<8->{
\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\tiny{you}};
\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\tiny{you}};
\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\tiny{you}};
}
\visible<8->{
\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\tiny{learned}};
\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\tiny{learned}};
\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\tiny{learned}};
}
% --- Vertical arrows per column: input -> e_y -> s -> softmax ---
% Column 1 draws its "top-3" output arrow separately (slide 5); columns 2 and 3
% include it in the loop body.
\visible<3->{
\foreach \x in {1}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
}
}
\visible<5->{
\draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top-3}};
}
\visible<7->{
\foreach \x in {2}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
}
}
\visible<8->{
\foreach \x in {3}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
}
}
% --- Horizontal arrows between decoder states; the arrow into s1 is labelled 0 at its tail ---
\visible<3->{
\draw [->] ([xshift=-0.5em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left,inner sep=1pt] {\tiny{0}};
}
\visible<7->{
\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
}
\visible<8->{
\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
}
% --- Dotted arrows: output words of one step become the input words of the next ---
% The arrow ends at the front-most copy (copy2) of the input stack.
\visible<6->{
\draw [->,very thick,dotted] (wo1.east) .. controls +(east:0.6) and +(west:0.8) ..(wt2copy2.west);
}
\visible<8->{
\draw [->,very thick,dotted] (wo2.east) .. controls +(east:0.6) and +(west:0.8) ..(wt3copy2.west);
}
% --- Context vectors C_2 and C_3, each drawn as a stack of three circles ---
% C_2 sits below t1 and feeds s2; C_3 sits below t2 and feeds s3.
\visible<7->{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\tiny{$\textbf{C}_2$}};
\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
}
\visible<8->{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\tiny{$\textbf{C}_3$}};
\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}
% --- Word probability table to the left of s1, linked to the first softmax by a red dotted arrow ---
\visible<3->{
\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
\node [anchor=south] (vocablabel) at (vocab.north) {\tiny{单词的概率分布}};
\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east);
}
% --- Top-3 list taken from the table (slide 4), placed to the left of the first output stack ---
\visible<4->{
\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\tiny{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) .. ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\tiny{top-3}};
% Caption under the table; the inner overlay specification repeats the enclosing one.
\visible<4->{
\node [anchor=north] (cap) at (vocab.south east) {\scriptsize{\textbf{束搜索($b=3$)}}};
}
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Experimental results
\begin{frame}{效果}
%% Improvements brought by using the attention mechanism
%% No system in the major evaluation campaigns goes without attention; it has become standard equipment
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% GNMT
\begin{frame}{成功案例 - GNMT}
%% A figure of GNMT plus a few sentences on how impressive it is
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\section{Transformer}
%%%------------------------------------------------------------------------------------------------------------
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论