Commit fbcf3b9f by xiaotong

new pages

parent 7edb503e
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
\usetikzlibrary{calc,intersections} \usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix} \usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows,decorations.pathreplacing} \usetikzlibrary{arrows,decorations.pathreplacing}
\usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z \usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{shadows.blur} \usetikzlibrary{shadows.blur}
...@@ -144,34 +145,21 @@ ...@@ -144,34 +145,21 @@
\subsection{注意力机制} \subsection{注意力机制}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 如何定义注意力函数 %%% 解码
\begin{frame}{计算注意力权重 - 注意力函数} \begin{frame}{推断}
\begin{itemize} \begin{itemize}
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\ \item 使用NMT时,对于源语言句子$\textbf{x}$,需要得到最优译文$\hat{\textbf{y}}$
\vspace{-0.3em}
\vspace{-1.5em}
\begin{displaymath} \begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))} \hat{\textbf{y}} = \argmax_{\textbf{y}} \log\textrm{P}(\textbf{y}|\textbf{x}) = \argmax_{\textbf{y}} \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
\end{displaymath} \end{displaymath}
\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$$h$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式:} \item 由于生成$y_i$需要依赖$y_{i-1}$,因此无法同时生成${y_1,...,y_n}$。常用的方法是
\visible<3->{
\begin{displaymath}
a(s,h) = \left\{ \begin{array}{ll}
s h^T & \textrm{向量乘} \\
\textrm{cos}(s, h) & \textrm{向量夹角} \\
s \textbf{W} h^T & \textrm{线性模型} \\
\textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}
\end{array}
\right.
\end{displaymath}
$\textbf{W}$$\textbf{v}$是可学习参数
}
\end{itemize} \end{itemize}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
\section{Transformer} \section{Transformer}
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
\usetikzlibrary{calc,intersections} \usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix} \usetikzlibrary{matrix}
\usetikzlibrary{patterns}
\usetikzlibrary{arrows,decorations.pathreplacing} \usetikzlibrary{arrows,decorations.pathreplacing}
\usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z \usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{shadows.blur} \usetikzlibrary{shadows.blur}
...@@ -958,8 +959,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -958,8 +959,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\item<2-> \textbf{核心}:如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中,有三个步骤 \item<2-> \textbf{核心}:如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中,有三个步骤
\begin{enumerate} \begin{enumerate}
\item 输入的单词用分布式表示,如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$,同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$ \item 输入的单词用分布式表示,如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$,同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$
\item 源语言句子被一个RNN编码为一个表示$C$,如前面的例子中是一个实数向量 \item 源语言句子被一个RNN编码为一个表示$\textbf{C}$,如前面的例子中是一个实数向量
\item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$s_{j-1}$(这里,$s_{j-1}$表示RNN第$j-1$步骤的隐层状态) \item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$\textbf{s}_{j-1}$(这里,$\textbf{s}_{j-1}$表示RNN第$j-1$步骤的隐层状态)
\end{enumerate} \end{enumerate}
\end{itemize} \end{itemize}
...@@ -986,9 +987,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -986,9 +987,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}}; \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
\foreach \x in {1,2,...,3} \foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {}; \node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$h_{m-2}$}}; \node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$h_{m-1}$}}; \node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$h_{m}$}}; \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$}; \node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$}; \node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
...@@ -1000,7 +1001,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1000,7 +1001,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\foreach \x in {1,2,...,3} \foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}}; \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
\foreach \x in {1,2,...,3} \foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$s_\x$}}}; \node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
\foreach \x in {1,2,...,3} \foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}}; \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$}; \node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
...@@ -1044,7 +1045,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1044,7 +1045,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west); \draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
\visible<2->{ \visible<2->{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$s_i$}}; \node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_i$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}}; \node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}}; \node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}}; \node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
...@@ -1061,7 +1062,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1061,7 +1062,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}}; \node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}}; \node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}}; \node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$C$}}; \node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\textbf{C}$}};
} }
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
...@@ -1097,7 +1098,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1097,7 +1098,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\item 可以重新定义\\ \item 可以重新定义\\
\vspace{-0.8em} \vspace{-0.8em}
\begin{displaymath} \begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|s_{j-1}, C) \textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|\textbf{s}_{j-1}, \textbf{C})
\end{displaymath} \end{displaymath}
对于上图中的模型,进一步化简为:\\ 对于上图中的模型,进一步化简为:\\
...@@ -1106,8 +1107,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1106,8 +1107,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{displaymath} \begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{ \textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
\begin{matrix} \begin{matrix}
\textrm{P}(y_j|C)\ \ \ \ & j = 1 \\ \textrm{P}(y_j|\textbf{C})\ \ \ \ & j = 1 \\
\textrm{P}(y_j|s_{j-1}) & j > 1 \textrm{P}(y_j|\textbf{s}_{j-1}) & j > 1
\end{matrix} \right. \end{matrix} \right.
\end{displaymath} \end{displaymath}
...@@ -1455,12 +1456,6 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1455,12 +1456,6 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 一些变种
\begin{frame}{改进 - fine-tuning}
%%% 图
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\subsection{注意力机制} \subsection{注意力机制}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
...@@ -1564,7 +1559,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1564,7 +1559,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{itemize} \begin{itemize}
\item 在注意力机制中,每个目标语单词的生成会使用一个动态的源语表示,而非一个统一的固定表示 \item 在注意力机制中,每个目标语单词的生成会使用一个动态的源语表示,而非一个统一的固定表示
\begin{itemize} \begin{itemize}
\item 这里$C_i$表示第$i$个目标语单词所使用的源语表示 \item 这里$\textbf{C}_i$表示第$i$个目标语单词所使用的源语表示
\end{itemize} \end{itemize}
\end{itemize} \end{itemize}
...@@ -1646,9 +1641,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1646,9 +1641,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north); \draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$C_1$}} -- ([yshift=3em]t1.north) ; \draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$\textbf{C}_1$}} -- ([yshift=3em]t1.north) ;
\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$C_2$}} -- ([yshift=3em]t2.north) ; \draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$\textbf{C}_2$}} -- ([yshift=3em]t2.north) ;
\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$C_i$}} -- ([yshift=3.5em]t4.north) ; \draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$\textbf{C}_i$}} -- ([yshift=3.5em]t4.north) ;
\node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...}; \node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...};
\node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...}; \node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...};
...@@ -1663,15 +1658,15 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1663,15 +1658,15 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% C_i的定义 %%% C_i的定义
\begin{frame}{上下文向量$C_i$} \begin{frame}{上下文向量$\textbf{C}_i$}
\begin{itemize} \begin{itemize}
\item 对于目标语位置$i$$C_i$是目标语$i$使用的上下文向量 \item 对于目标语位置$i$$\textbf{C}_i$是目标语$i$使用的上下文向量
\begin{itemize} \begin{itemize}
\item $h_j$表示编码器第$j$个位置的隐层状态 \item $\textbf{h}_j$表示编码器第$j$个位置的隐层状态
\item $s_i$表示解码器第$i$个位置的隐层状态 \item $\textbf{s}_i$表示解码器第$i$个位置的隐层状态
\item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小 \item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小
\item<2-> $a(\cdot,\cdot)$表示注意力函数,计算$s_{i-1}$$h_j$之间的相关性 \item<2-> $a(\cdot,\cdot)$表示注意力函数,计算$\textbf{s}_{i-1}$$\textbf{h}_j$之间的相关性
\item<3-> $C_i$是所有源语编码表示$\{h_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$ \item<3-> $\textbf{C}_i$是所有源语编码表示$\{\textbf{h}_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$
\end{itemize} \end{itemize}
\end{itemize} \end{itemize}
...@@ -1680,17 +1675,17 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1680,17 +1675,17 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{scope} \begin{scope}
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$h_1$}}; \node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$h_2$}}; \node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}}; \node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$h_n$}}; \node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_n$}};
\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {}; \node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south); \draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east); \draw [thick,-,ublue] (sum.west) -- (sum.east);
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$s_{i-1}$}}; \node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{i-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$s_{i}$}}; \node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{i}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}}; \draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}}; \draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}};
...@@ -1699,7 +1694,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1699,7 +1694,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west); \draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west); \draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east); \draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$C_{i}$}}; \draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{i}$}};
\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}}; \node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}};
\draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south); \draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
...@@ -1711,11 +1706,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1711,11 +1706,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$4$)}}; \node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$4$)}};
\visible<2->{ \visible<2->{
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$C_i = \sum_{j} \alpha_{i,j} h_j \ \ $}; \node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_i = \sum_{j} \alpha_{i,j} \textbf{h}_j \ \ $};
} }
\visible<3->{ \visible<3->{
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$}; \node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(s_{i-1}, h_j)$}; \node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{i-1}, \textbf{h}_j)$};
} }
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
...@@ -1841,9 +1836,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1841,9 +1836,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<3->{ \visible<3->{
% coverage score formula node % coverage score formula node
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$C_i$所对应的源语言词的权重是不同的}}; \node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$C_2=0.4 \times h(\textrm{``你''}) + 0.4 \times h(\textrm{``什么''}) +$}}; \node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times h(\textrm{``都''}) + 0.1 \times h(\textrm{``没''}) + ..$}}; \node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + ..$}};
} }
\visible<3->{ \visible<3->{
...@@ -1867,11 +1862,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1867,11 +1862,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
% coverage score for each source word % coverage score for each source word
\visible<2->{ \visible<2->{
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$C_1 = \sum_{i=1}^{8} \alpha_{i1} h_{i}$}; \node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$\textbf{C}_1 = \sum_{i=1}^{8} \alpha_{i1} \textbf{h}_{i}$};
} }
\visible<3->{ \visible<3->{
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$C_2 = \sum_{i=1}^{8} \alpha_{i2} h_{i}$}; \node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$\textbf{C}_2 = \sum_{i=1}^{8} \alpha_{i2} \textbf{h}_{i}$};
} }
\end{tikzpicture} \end{tikzpicture}
...@@ -1886,8 +1881,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\ ...@@ -1886,8 +1881,8 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
{\small {\small
\begin{tabular}{l | l} \begin{tabular}{l | l}
引入注意力机制以前 & 引入注意力机制以后 \\ \hline 引入注意力机制以前 & 引入注意力机制以后 \\ \hline
$\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C_1})$ \\ $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{\textbf{C}})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{\textbf{C}_1})$ \\
$\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C_2})$ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}_2})$
\end{tabular} \end{tabular}
} }
\end{center} \end{center}
...@@ -1902,19 +1897,19 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you'' ...@@ -1902,19 +1897,19 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\ \item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em} \vspace{-0.3em}
\begin{displaymath} \begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))} \alpha_{i,j} = \frac{\exp(a(\textbf{s}_{i-1}, \textbf{h}_j))}{\sum_{j'} \exp(a(\textbf{s}_{i-1}, \textbf{h}_{j'}))}
\end{displaymath} \end{displaymath}
\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$$h$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式:} \item<2-> 注意力函数$a(\textbf{s},\textbf{h})$的目的是捕捉$\textbf{s}$$\textbf{h}$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(\textbf{s},\textbf{h})$的方式:}
\vspace{-1em} \vspace{-1em}
\visible<3->{ \visible<3->{
\begin{displaymath} \begin{displaymath}
a(s,h) = \left\{ \begin{array}{ll} a(\textbf{s},\textbf{h}) = \left\{ \begin{array}{ll}
s h^T & \textrm{向量乘} \\ \textbf{s} \textbf{h}^{\textrm{T}} & \textrm{向量乘} \\
\textrm{cos}(s, h) & \textrm{向量夹角} \\ \textrm{cos}(\textbf{s}, \textbf{h}) & \textrm{向量夹角} \\
s \textbf{W} h^T & \textrm{线性模型} \\ \textbf{s} \textbf{W} \textbf{h}^{\textrm{T}} & \textrm{线性模型} \\
\textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}[s,h]+\textrm{单层网络} \textrm{TanH}(\textbf{W}[\textbf{s},\textbf{h}])\textbf{v}^{\textrm{T}} & \textrm{拼接}[\textbf{s},\textbf{h}]+\textrm{单层网络}
\end{array} \end{array}
\right. \right.
\end{displaymath} \end{displaymath}
...@@ -1933,13 +1928,149 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you'' ...@@ -1933,13 +1928,149 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 实验结果 %%% 如何进一步理解注意力机制
\begin{frame}{效果} \begin{frame}{重新解释注意力机制}
%% 实用注意力机制带来的提升 \begin{itemize}
%% 个大评测比赛没有不使用注意力机制的系统,已经成为标配 \item 换一个问题,假设有若干key-value单元,其中key是这个单元的索引表示,value是这个单元的值。对于任意一个query,可以找到匹配的key,并输出其对应的value
\end{itemize}
\vspace{-0.8em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=blue!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key1label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
\node [rnode,anchor=east,pattern=horizontal lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north) node [pos=0.5,below,yshift=0.2em] {\scriptsize{匹配}};
\node [anchor=north] (result) at (value3.south) {\scriptsize{\alert{返回结果}}};
\end{scope}
\end{tikzpicture}
\end{center}
\vspace{-0.7em}
\begin{itemize}
\item<2-> 注意力机制也可以被看做对key-value单元的查询,但是所有key和query之间都有一种匹配程度,返回结果是对所有value的加权
\end{itemize}
\visible<2->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key1label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
\node [rnode,anchor=east,pattern=vertical lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
\node [anchor=south east] (alpha1) at (key1.north east) {\scriptsize{$\alpha_1$}};
\node [anchor=south east] (alpha2) at (key2.north east) {\scriptsize{$\alpha_2$}};
\node [anchor=south east] (alpha3) at (key3.north east) {\scriptsize{$\alpha_3$}};
\node [anchor=south east] (alpha4) at (key4.north east) {\scriptsize{$\alpha_4$}};
\node [anchor=north] (result) at ([xshift=-1.5em]value2.south east) {\scriptsize{\alert{返回结果}=$\alpha_1 \cdot \textrm{value}_1 + \alpha_2 \cdot \textrm{value}_2 + \alpha_3 \cdot \textrm{value}_3 + \alpha_4 \cdot \textrm{value}_4$}};
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 如何进一步理解注意力机制 - 回到机器翻译任务
\begin{frame}{重新解释注意力机制(续)}
\begin{itemize}
\item 回到机器翻译,如果把目标语状态$\textbf{s}_{i-1}$看做query,而把源语言所有位置的最上层RNN表示$\textbf{h}_{j}$看做{\color{ugreen} \textbf{key}}{\color{red} \textbf{value}}
\end{itemize}
\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Source-side hidden states act as both keys (green, top row) and values (red, bottom row).
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]

\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};

\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};

% The decoder state is the query; each arrow carries an attention weight.
\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\textbf{s}(\textrm{``you''})$}};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};

\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);

\node [anchor=south east] (alpha1) at ([xshift=1em]key1.north east) {\scriptsize{$\alpha_1=.4$}};
\node [anchor=south east] (alpha2) at ([xshift=1em]key2.north east) {\scriptsize{$\alpha_2=.4$}};
\node [anchor=south east] (alpha3) at ([xshift=1em]key3.north east) {\scriptsize{$\alpha_3=0$}};
\node [anchor=south east] (alpha4) at ([xshift=1em]key4.north east) {\scriptsize{$\alpha_4=.1$}};
\end{scope}
\end{tikzpicture}
\end{center}
\vspace{-2.5em}
% The weights below mirror the figure: alpha_1..alpha_4 belong to 你/什么/也/没.
% (align* replaces the obsolete eqnarray environment; amsmath is loaded by beamer.)
\begin{align*}
\textbf{C}_3 = {} & 0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) + {} \\
                  & 0 \times \textbf{h}(\textrm{``也''}) + 0.1 \times \textbf{h}(\textrm{``没''})
\end{align*}
\vspace{-0.5em}
\begin{itemize}
\item<2-> 注意力机制也可以被看做是一个重新生成value的过程:对于一组value值,注意力模型对他们加权求和,并得到一个新的value。而这个新的value实际上就是query所对应查询结果,在机器翻译中被看做是目标语所对应的源语言上下文表示。
\end{itemize}
\end{frame}
\subsection{训练及推断}
%%%------------------------------------------------------------------------------------------------------------
%%% 训练 %%% 训练
\begin{frame}{训练} \begin{frame}{训练}
\end{frame} \end{frame}
...@@ -1949,6 +2080,14 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you'' ...@@ -1949,6 +2080,14 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''
\begin{frame}{推断} \begin{frame}{推断}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 实验结果
\begin{frame}{效果}
%% 实用注意力机制带来的提升
%% 个大评测比赛没有不使用注意力机制的系统,已经成为标配
\end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% GNMT %%% GNMT
\begin{frame}{成功案例 - GNMT} \begin{frame}{成功案例 - GNMT}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论