Commit 1e144093 by xiaotong

new pages and new sections

parent 65c39211
......@@ -7,3 +7,4 @@
*.gz
*.toc
*.blg
*.sav
......@@ -145,170 +145,27 @@
\subsection{注意力机制}
%%%------------------------------------------------------------------------------------------------------------
%%% 解码 - beam search
\begin{frame}{推断 - Beam Search}
%%% 解码 - 长度惩罚和覆盖度
\begin{frame}{推断 - 其它特征}
\begin{itemize}
\item \textbf{Greedy Search}: 目标语每一个位置,输出层的Softmax可以得到所有单词的概率,然后选择一个概率最大单词输出,下一个位置的预测就基于这一步输出的单词
\item \textbf{Beach Search}: 为了避免贪婪方法造成的错误累加,可以每次对$b$个单词进行扩展,而不是只使用一个单词,其中$b$称做束的宽度 - 这样可以搜索更多可能的译文
\item 直接用$\textrm{P}(\textbf{y}|\textbf{x})$进行解码,面临两方面问题
\begin{itemize}
\item$\textrm{P}(y_j|\textbf{y}_{<j},\textbf{x})$进行乘积会导致长句的概率很低
\item 模型本身并没有考虑每个源语言单词被使用的程度,比如一个单词可能会被翻译了很多``次''
\end{itemize}
\item<2-> 因此,解码时会使用其它特征与$\textrm{P}(\textbf{y}|\textbf{x})$一起组成模型得分$score(\textbf{y},\textbf{x})$$score(\textbf{y},\textbf{x})$也作为beam search的排序依据
\begin{eqnarray}
score(\textbf{y},\textbf{x}) & = & \textrm{P}(\textbf{y}|\textbf{x})/\textrm{lp}(\textbf{y}) + \textrm{cp}(\textbf{y},\textbf{x}) \nonumber \\
\textrm{lp}(\textbf{y}) & = & \frac{(5 + |\textbf{y}|)^\alpha}{(5 + 1)^\alpha} \nonumber \\
\textrm{cp}(\textbf{y},\textbf{x}) & = & \beta \cdot \sum\nolimits_{i=1}^{|\textbf{x}|} \log (\min(\sum\nolimits_{j}^{|\textbf{y}|} a_{ij}, 1))) \nonumber
\end{eqnarray}
\vspace{-0.5em}
\begin{itemize}
\item lp会惩罚译文过短的结果(长度惩罚);cp会惩罚把某些源语单词对应到很多目标语单词的情况(覆盖度),被覆盖的程度用$\sum\nolimits_{j}^{|\textbf{y}|} a_{ij}$度量;$\alpha$$\beta$是超参,需要经验性设置
\end{itemize}
\end{itemize}
\vspace{-0.3em}
\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];
\visible<3->{
\node [rnnnode,anchor=west,fill=green!20] (t1) at (0,0) {\tiny{$e_y()$}};
}
\visible<7->{
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=2.2em]t1.east) {\tiny{$e_y()$ ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=2.2em]t2.east) {\tiny{$e_y()$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\tiny{...}};
}
\visible<3->{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
}
\visible<7->{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$ ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\tiny{...}};
}
\visible<3->{
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
}
\visible<7->{
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
}
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
\visible<6->{
\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\tiny{Have}};
\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\tiny{Have}};
}
\visible<8->{
\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\tiny{you}};
\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\tiny{you}};
}
\visible<5->{
\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\tiny{Have}};
\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\tiny{Have}};
\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\tiny{Have}};
}
\visible<8->{
\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\tiny{you}};
\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\tiny{you}};
\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\tiny{you}};
}
\visible<8->{
\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\tiny{learned}};
\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\tiny{learned}};
\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\tiny{learned}};
}
\visible<3->{
\foreach \x in {1}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
}
}
\visible<5->{
\draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top-3}};
}
\visible<7->{
\foreach \x in {2}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
}
}
\visible<8->{
\foreach \x in {3}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
}
}
\visible<3->{
\draw [->] ([xshift=-0.5em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left,inner sep=1pt] {\tiny{0}};
}
\visible<7->{
\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
}
\visible<8->{
\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
}
\visible<6->{
\draw [->,very thick,dotted] (wo1.east) .. controls +(east:0.6) and +(west:0.8) ..(wt2copy2.west);
}
\visible<8->{
\draw [->,very thick,dotted] (wo2.east) .. controls +(east:0.6) and +(west:0.8) ..(wt3copy2.west);
}
\visible<7->{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\tiny{$\textbf{C}_2$}};
\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
}
\visible<8->{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\tiny{$\textbf{C}_3$}};
\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}
\visible<3->{
\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
\node [anchor=south] (vocablabel) at (vocab.north) {\tiny{单词的概率分布}};
\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east);
}
\visible<4->{
\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\tiny{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) .. ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\tiny{top-3}};
\visible<4->{
\node [anchor=north] (cap) at (vocab.south east) {\scriptsize{\textbf{束搜索($b=3$)}}};
}
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
......
......@@ -2438,6 +2438,30 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 解码 - 长度惩罚和覆盖度
\begin{frame}{推断 - 其它特征}
\begin{itemize}
\item 直接用$\textrm{P}(\textbf{y}|\textbf{x})$进行解码,面临两方面问题
\begin{itemize}
\item$\textrm{P}(y_j|\textbf{y}_{<j},\textbf{x})$进行乘积会导致长句的概率很低
\item 模型本身并没有考虑每个源语言单词被使用的程度,比如一个单词可能会被翻译了很多``次''
\end{itemize}
\item<2-> 因此,解码时会使用其它特征与$\textrm{P}(\textbf{y}|\textbf{x})$一起组成模型得分$score(\textbf{y},\textbf{x})$$score(\textbf{y},\textbf{x})$也作为beam search的排序依据
\begin{eqnarray}
score(\textbf{y},\textbf{x}) & = & \textrm{P}(\textbf{y}|\textbf{x})/\textrm{lp}(\textbf{y}) + \textrm{cp}(\textbf{y},\textbf{x}) \nonumber \\
\textrm{lp}(\textbf{y}) & = & \frac{(5 + |\textbf{y}|)^\alpha}{(5 + 1)^\alpha} \nonumber \\
\textrm{cp}(\textbf{y},\textbf{x}) & = & \beta \cdot \sum\nolimits_{i=1}^{|\textbf{x}|} \log (\min(\sum\nolimits_{j}^{|\textbf{y}|} a_{ij}, 1))) \nonumber
\end{eqnarray}
\vspace{-0.5em}
\begin{itemize}
\item lp会惩罚译文过短的结果(长度惩罚);cp会惩罚把某些源语单词对应到很多目标语单词的情况(覆盖度),被覆盖的程度用$\sum\nolimits_{j}^{|\textbf{y}|} a_{ij}$度量;$\alpha$$\beta$是超参,需要经验性设置
\end{itemize}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 实验结果
\begin{frame}{效果}
%% 实用注意力机制带来的提升
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论