Commit a147039a by xiaotong

new pages of attention models

parent 9974fde7
......@@ -85,6 +85,11 @@
\newlength{\mystep}
\newlength{\base}
\newlength{\wseg}
\newlength{\hseg}
\newlength{\wnode}
\newlength{\hnode}
\usefonttheme[onlylarge]{structurebold}
\IfFileExists{C:/WINDOWS/win.ini}
......@@ -139,176 +144,30 @@
\subsection{注意力机制}
%%%------------------------------------------------------------------------------------------------------------
%%% NMT的数学描述
\begin{frame}{数学建模}
%%% 如何定义注意力函数
\begin{frame}{计算注意力权重 - 注意力函数}
\begin{itemize}
\item 对于源语言序列$\textbf{x} = \{x_1,x_2,...,x_m\}$,生成目标语序列$\textbf{y} = \{y_1,y_2,...,y_n\}$的概率可以被描述为
\begin{displaymath}
\log\textrm{P}(\textbf{y}|\textbf{x}) = \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
\end{displaymath}
根据源语句子$\textbf{x}$和已生成的译文$\textbf{y}_{<j} = \{y_1,y_2,...,y_{j-1}\}$生成第$j$个译文$y_j$
\item<2-> \textbf{核心}:如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中,有三个步骤
\begin{enumerate}
\item 输入的单词用分布式表示,如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$,同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$
\item 源语言句子被一个RNN编码为一个表示$C$,如前面的例子中是一个实数向量
\item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$s_{j-1}$(这里,$s_{j-1}$表示RNN第$j-1$步骤的隐层状态)
\end{enumerate}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 各部分的解释
\begin{frame}{数学建模(续)}
\vspace{-1.5em}
\begin{center}
% \hspace*{-1.5cm}
\begin{tikzpicture}
\setlength{\base}{0.9cm}
\tikzstyle{rnnnode} = [rounded corners=1pt,minimum height=0.5\base,minimum width=1\base,draw,inner sep=0pt,outer sep=0pt]
\tikzstyle{wordnode} = [font=\tiny]
% RNN translation model
\begin{scope}[local bounding box=RNNMT]
% RNN Encoder
\coordinate (eemb0) at (0,0);
\foreach \x [count=\y from 0] in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$h_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$h_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$h_{m}$}};
\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
\node[wordnode,below=0pt of eemb1] () {};
\node[wordnode,below=0pt of eemb2] () {};
\node[wordnode,below=0pt of eemb3] () {$\langle$eos$\rangle$};
% RNN Decoder
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$s_\x$}}};
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
\node[wordnode,right=0.4\base of dec3] (end2) {$\cdots$};
\node[wordnode,right=0.4\base of softmax3] (end3) {$\cdots$};
% Decoder input words
\node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
\ExtractX{$(demb2.south)$}
\ExtractY{$(decwordin.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
\ExtractX{$(demb3.south)$}
\ExtractY{$(decwordin.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
% Decoder output words
\node[wordnode,above=0pt of softmax1] (decwordout) {Do};
\ExtractX{$(softmax2.north)$}
\ExtractY{$(decwordout.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
\ExtractX{$(softmax3.north)$}
\ExtractY{$(decwordout.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {know};
% Connections
\draw[-latex'] (init1.east) to (enc1.west);
\draw[-latex'] (dec3.east) to (end2.west);
\foreach \x in {1,2,...,3}
\draw[-latex'] (eemb\x) to (enc\x);
\foreach \x in {1,2,...,3}
\draw[-latex'] (demb\x) to (dec\x);
\foreach \x in {1,2,...,3}
\draw[-latex'] (dec\x.north) to (softmax\x.south);
\foreach \x [count=\y from 2] in {1,2}
{
\draw[-latex'] (enc\x.east) to (enc\y.west);
\draw[-latex'] (dec\x.east) to (dec\y.west);
}
\coordinate (bridge) at ([yshift=0.4\base]enc2.north west);
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
\visible<2->{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$s_i$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
}
\visible<3->{
\node [anchor=north west] (line11) at ([yshift=-1.8em]line4.west) {\scriptsize{每个词的one-hot}};
\node [anchor=north west] (line12) at ([yshift=0.3em]line11.south west) {\scriptsize{离散化表示都被转化为}};
\node [anchor=north west] (line13) at ([yshift=0.3em]line12.south west) {\scriptsize{实数向量,即词嵌入}};
\node [anchor=north west] (line14) at ([yshift=0.3em]line13.south west) {\scriptsize{($e_x()$和$e_y()$函数)}};
}
\visible<4->{
\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$C$}};
}
\begin{pgfonlayer}{background}
\visible<2->{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=red] [fit = (softmax1) (softmax2) (softmax3)] (box4) {};
\draw [->,dotted,very thick,red] ([yshift=1em,xshift=2.5em]box1.east) -- ([yshift=1em,xshift=0.1em]box1.east);
}
\visible<3->{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line11) (line12) (line13) (line14)] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (eemb1) (eemb2) (eemb3)] (box5) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (demb1) (demb2) (demb3)] (box6) {};
\draw [->,dotted,very thick,ugreen] ([yshift=-1.3em,xshift=2.5em]box2.east) -- ([yshift=-1.3em,xshift=0.1em]box2.east);
\draw [->,dotted,very thick,ugreen] ([xshift=0.1em]box6.west) .. controls +(west:1) and +(east:1) .. ([yshift=1.0em]box2.east) ;
}
\visible<4->{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!10,drop shadow,draw=purple] [fit = (line21) (line22) (line23) (line24)] (box3) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=purple] [fit = (enc3)] (box7) {};
\draw [->,dotted,very thick,purple] ([xshift=0.1em]box7.east) -- ([xshift=0.8em]box7.east) ;
}
\end{pgfonlayer}
\end{scope}
\end{tikzpicture}
\end{center}
\visible<5->{
\vspace{-1.5em}
\begin{itemize}
\item 可以重新定义\\
\vspace{-0.8em}
\begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|s_{j-1}, C)
\end{displaymath}
对于上图中的模型,进一步化简为:\\
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em}
\begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
\begin{matrix}
\textrm{P}(y_j|C)\ \ \ \ & j = 1 \\
\textrm{P}(y_j|s_{j-1}) & j > 1
\end{matrix} \right.
\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
\end{displaymath}
\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$和$h$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式:}
\visible<3->{
\begin{displaymath}
a(s,h) = \left\{ \begin{array}{ll}
s h^T & \textrm{向量乘} \\
\textrm{cos}(s, h) & \textrm{向量夹角} \\
s \textbf{W} h^T & \textrm{线性模型} \\
\textrm{TanH}(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}
\end{array}
\right.
\end{displaymath}
$\textbf{W}$和$\textbf{v}$是可学习参数
}
\end{itemize}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
......
......@@ -82,6 +82,10 @@
\newcounter{mycount3}
\newcounter{mycount4}
\newlength{\mystep}
\newlength{\wseg}
\newlength{\hseg}
\newlength{\wnode}
\newlength{\hnode}
\usefonttheme[onlylarge]{structurebold}
......@@ -866,7 +870,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\item 源语言句子被一个RNN编码为一个表示$C$,如前面的例子中是一个实数向量
\item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$s_{j-1}$(这里,$s_{j-1}$表示RNN第$j-1$步骤的隐层状态)
\end{enumerate}
\end{itemize}
\end{frame}
......@@ -1009,7 +1013,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\vspace{-0.3em}
\begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
\begin{matrix}
\textrm{P}(y_j|C)\ \ \ \ & j = 1 \\
\textrm{P}(y_j|s_{j-1}) & j > 1
......@@ -1619,7 +1623,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$C_i = \sum_{j} \alpha_{i,j} h_j \ \ $};
}
\visible<3->{
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_j \exp(\beta_{i,j})}$};
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(s_{i-1}, h_j)$};
}
......@@ -1647,6 +1651,187 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Visualisation of the attention weights \alpha_{i,j}
%%% Overlay plan: <1> weight matrix, <2> first column + its bar chart and C_1,
%%%               <3> second column + its bar chart and C_2, <4> comparison table.
\begin{frame}{注意力权重$\alpha_{i,j}$}
\begin{itemize}
\item 注意力权重$\alpha_{i,j}$的可视化
\end{itemize}
\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
% Only \hnode is used for positioning below; the other lengths are set for parity with sibling figures.
\setlength{\wseg}{1.5cm}
\setlength{\hseg}{1.0cm}
\setlength{\wnode}{3.75cm}
\setlength{\hnode}{1.0cm}
\tikzstyle{elementnode} = [rectangle,text=white,anchor=center]
\tikzstyle{srcnode} = [rotate=45,font=\small,anchor=south west]
\tikzstyle{tgtnode} = [left,font=\small,anchor=north east]
\tikzstyle{alignmentnode} = [rectangle,draw,minimum height=3.6\hnode,minimum width=0.36\hnode]
\tikzstyle{probnode} = [fill=blue!30,minimum width=0.4\hnode]
\tikzstyle{labelnode} = [above]
% weight matrix: one square per (column \i, row \j); the square size is proportional to \c
\begin{scope}[scale=0.9,yshift=0.12in]
\foreach \i / \j / \c in
{0/7/0.2, 1/7/0.45, 2/7/0.15, 3/7/0.15, 4/7/0.15, 5/7/0.15,
0/6/0.35, 1/6/0.45, 2/6/0.15, 3/6/0.15, 4/6/0.15, 5/6/0.15,
0/5/0.25, 1/5/0.15, 2/5/0.15, 3/5/0.35, 4/5/0.15, 5/5/0.15,
0/4/0.15, 1/4/0.25, 2/4/0.2, 3/4/0.30, 4/4/0.15, 5/4/0.15,
0/3/0.15, 1/3/0.15, 2/3/0.8, 3/3/0.25, 4/3/0.15, 5/3/0.25,
0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.25, 5/2/0.3,
0/1/0.15, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.8, 5/1/0.15,
0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.60}
\node[elementnode,minimum size=0.6*\hnode*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*\hnode*\i-5.4*0.5*\hnode,0.5*\hnode*\j-1.05*\hnode) {};
% numeric labels printed on the larger squares; each label node is named after the cell it sits on
\node[align=center] (l17) at (a17) {\scriptsize{{\color{white} .4}}};
\node[align=center] (l06) at (a06) {\scriptsize{{\color{white} .3}}};
\node[align=center] (l16) at (a16) {\scriptsize{{\color{white} .4}}};
\node[align=center] (l35) at (a35) {\scriptsize{{\color{white} .3}}};
\node[align=center] (l34) at (a34) {\tiny{{\color{white} .3}}};
\node[align=center] (l23) at (a23) {\small{{\color{white} .8}}};
\node[align=center] (l41) at (a41) {\small{{\color{white} .8}}};
\node[align=center] (l50) at (a50) {\small{{\color{white} .7}}};
% column labels (English words), rotated above the matrix
\node[srcnode] (src1) at (-5.4*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{Have}};
\node[srcnode] (src2) at ([xshift=0.5\hnode]src1.south west) {\scriptsize{you}};
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
% row labels (Chinese words), to the left of the matrix
% NOTE(review): the single-character labels (rows 1, 3-6) were empty in the reviewed copy;
% they are restored here on the assumption that the sentence is the usual example -- please confirm.
\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{你}};
\node[tgtnode] (tgt2) at ([yshift=-0.5\hnode]tgt1.north east) {\scriptsize{什么}};
\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{都}};
\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{没}};
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{学}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{到}};
\node[tgtnode] (tgt7) at ([yshift=-0.5\hnode]tgt6.north east) {\scriptsize{?}};
\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{EOS}};
\end{scope}
\visible<2->{
% alignment rectangle 1: green frame around the first column (anchored at cell a07)
\node[alignmentnode, ugreen, anchor=north west] (alignment1) at ([xshift=-0.3em,yshift=0.4em]a07.north west) {};
}
\visible<3->{
% alignment rectangle 2: red frame around the second column (anchored at cell a17)
\node[alignmentnode, red, anchor=north west] (alignment2) at ([xshift=-0.1em,yshift=0.2em]a17.north west) {};
}
\visible<3->{
% bar chart 2 (red): weights read off the second column
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn21) at ([xshift=2.3\hnode,yshift=-0.0\hnode]alignment2.east) {};
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn22) at ([xshift=1pt]attn21.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn23) at ([xshift=1pt]attn22.south east) {};
\node[probnode,anchor=south west,minimum height=0.1\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.1$}] (attn24) at ([xshift=1pt]attn23.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn25) at ([xshift=1pt]attn24.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$\cdots$}] (attn26) at ([xshift=1pt]attn25.south east) {};
}
\visible<2->{
% bar chart 1 (green): weights read off the first column.
% Positioned relative to alignment2, which exists on every overlay because \visible only hides it.
\node[probnode,anchor=south,minimum height=0.2\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.2$}] (attn11) at ([xshift=2.5\hnode,yshift=-1em]alignment2.north east) {};
\node[probnode,anchor=south west,minimum height=0.3\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.3$}] (attn12) at ([xshift=1pt]attn11.south east) {};
\node[probnode,anchor=south west,minimum height=0.2\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.2$}] (attn13) at ([xshift=1pt]attn12.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0$}] (attn14) at ([xshift=1pt]attn13.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0$}] (attn15) at ([xshift=1pt]attn14.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$\cdots$}] (attn16) at ([xshift=1pt]attn15.south east) {};
}
\visible<3->{
% take-away sentence below the bar charts
\node[anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-2.5\hnode]attn11.south) {\small{不同$C_i$所对应的源语言词的权重是不同的}};
}
\visible<3->{
% matrix column 2 -> bar chart 2
\draw[->,red] ([xshift=0.1em,yshift=2.3em]alignment2.east).. controls +(east:1.9cm) and +(west:1.0cm) ..([xshift=-0.15\hnode,yshift=-0.0\hnode]attn21.north west);
}
\visible<2->{
% matrix column 1 -> bar chart 1
\draw[->,ugreen] ([xshift=0.1em,yshift=-1.2em]alignment1.north east)--([xshift=2.2\hnode,yshift=-1.2em]alignment2.north east);
}
\visible<3->{
% bar chart 2 -> C_2 (weighted sum)
\draw[->] ([xshift=0.2\hnode,yshift=0.0\hnode]attn26.east)--([xshift=0.7\hnode,yshift=0.0\hnode]attn26.east) node[pos=0.5,above] (sum2) {\small{$\sum$}};
}
\visible<2->{
% bar chart 1 -> C_1 (weighted sum)
\draw[->] ([xshift=0.2\hnode]attn16.east)--([xshift=0.7\hnode]attn16.east) node[pos=0.5,above] (sum1) {\small{$\sum$}};
}
% context vectors; indices follow the convention C_i = \sum_j \alpha_{i,j} h_j used on the preceding slides
\visible<2->{
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$C_1 = \sum_{j=1}^{8} \alpha_{1,j} h_{j}$};
}
\visible<3->{
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$C_2 = \sum_{j=1}^{8} \alpha_{2,j} h_{j}$};
}
\end{tikzpicture}
\end{center}
\visible<4->{
\begin{itemize}
\item 对比
\end{itemize}
\begin{center}
{\small
\begin{tabular}{l | l}
引入注意力机制以前 & 引入注意力机制以后 \\ \hline
$\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C})$ & $\textrm{``Have''} = \argmax_{y} \textrm{P}(y|0, \alert{C_1})$ \\
$\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C})$ & $\textrm{``you''} = \argmax_{y} \textrm{P}(y|s_1, \alert{C_2})$
\end{tabular}
}
\end{center}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% How to define the attention function a(s,h)
%%% Overlay plan: <1> softmax definition of \alpha_{i,j}, <2> purpose of a(s,h), <3> common choices for a(s,h).
\begin{frame}{计算注意力权重 - 注意力函数}
\begin{itemize}
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em}
\begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(s_{i-1}, h_j))}{\sum_{j'} \exp(a(s_{i-1}, h_{j'}))}
\end{displaymath}
\item<2-> 注意力函数$a(s,h)$的目的是捕捉$s$和$h$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(s,h)$的方式:}
\vspace{-1em}
\visible<3->{
\begin{displaymath}
% \cos and \tanh are typeset as math operators (upright, with operator spacing) instead of \textrm text
a(s,h) = \left\{ \begin{array}{ll}
s h^T & \textrm{向量乘} \\
\cos(s, h) & \textrm{向量夹角} \\
s \textbf{W} h^T & \textrm{线性模型} \\
\tanh(\textbf{W}[s,h])\textbf{v}^T & \textrm{拼接}[s,h]+\textrm{单层网络}
\end{array}
\right.
\end{displaymath}
\vspace{-0.3em}
$\textbf{W}$和$\textbf{v}$是可学习参数
}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\section{Transformer}
%%%------------------------------------------------------------------------------------------------------------
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论