new pages and new sections

1e144093 · xiaotong · 65c39211 · 1e144093 · 65c39211 · 1e144093
Commit 1e144093 authored Nov 18, 2019 by xiaotong
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@
 *.gz
 *.toc
 *.blg
+*.sav
--- a/Section04-Phrasal-and-Syntactic-Models/section04.test.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04.test.tex
--- a/Section04-Phrasal-and-Syntactic-Models/section04.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04.tex
--- a/Section06-Neural-Machine-Translation/section06-test.tex
+++ b/Section06-Neural-Machine-Translation/section06-test.tex
@@ -145,170 +145,27 @@
 \subsection{注意力机制}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 解码 - beam search
-\begin{frame}{推断 - Beam Search}
+%%% 解码 - 长度惩罚和覆盖度
+\begin{frame}{推断 - 其它特征}
 \begin{itemize}
-\item \textbf{Greedy Search}: 目标语每一个位置，输出层的Softmax可以得到所有单词的概率，然后选择一个概率最大单词输出，下一个位置的预测就基于这一步输出的单词
-\item \textbf{Beach Search}: 为了避免贪婪方法造成的错误累加，可以每次对$b$个单词进行扩展，而不是只使用一个单词，其中$b$称做束的宽度 - 这样可以搜索更多可能的译文
+\item 直接用$\textrm{P}(\textbf{y}|\textbf{x})$进行解码，面临两方面问题
+    \begin{itemize}
+    \item 对$\textrm{P}(y_j|\textbf{y}_{<j},\textbf{x})$进行乘积会导致长句的概率很低
+    \item 模型本身并没有考虑每个源语言单词被使用的程度，比如一个单词可能会被翻译了很多``次''
+    \end{itemize}
+\item<2-> 因此，解码时会使用其它特征与$\textrm{P}(\textbf{y}|\textbf{x})$一起组成模型得分$score(\textbf{y},\textbf{x})$，$score(\textbf{y},\textbf{x})$也作为beam search的排序依据
+    \begin{eqnarray}
+    score(\textbf{y},\textbf{x}) & = & \log \textrm{P}(\textbf{y}|\textbf{x})/\textrm{lp}(\textbf{y}) + \textrm{cp}(\textbf{y},\textbf{x}) \nonumber \\
+    \textrm{lp}(\textbf{y})      & = & \frac{(5 + |\textbf{y}|)^\alpha}{(5 + 1)^\alpha} \nonumber \\ 
+    \textrm{cp}(\textbf{y},\textbf{x}) & = & \beta \cdot \sum\nolimits_{i=1}^{|\textbf{x}|} \log (\min(\sum\nolimits_{j}^{|\textbf{y}|} a_{ij}, 1)) \nonumber
+    \end{eqnarray}
+    
+    \vspace{-0.5em}
+    \begin{itemize}
+    \item lp会惩罚译文过短的结果(长度惩罚)；cp会惩罚某些源语单词没有被充分翻译的情况(覆盖度)，第$i$个源语单词被覆盖的程度用$\sum\nolimits_{j}^{|\textbf{y}|} a_{ij}$度量；$\alpha$和$\beta$是超参，需要经验性设置
+    \end{itemize}
 \end{itemize}

-\vspace{-0.3em}
-\visible<2->{
-\begin{center}
-\begin{tikzpicture}
-\begin{scope}
-\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
-\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];
-
-
-\visible<3->{
-\node [rnnnode,anchor=west,fill=green!20] (t1) at (0,0) {\tiny{$e_y()$}};
-}
-\visible<7->{
-\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=2.2em]t1.east) {\tiny{$e_y()$ ($\times 3$)}};
-}
-\visible<8->{
-\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=2.2em]t2.east) {\tiny{$e_y()$ ($\times 3$)}};
-\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\tiny{...}};
-}
-\visible<3->{
-\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
-}
-\visible<7->{
-\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$ ($\times 3$)}};
-}
-\visible<8->{
-\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$ ($\times 3$)}};
-\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\tiny{...}};
-}
-\visible<3->{
-\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
-}
-\visible<7->{
-\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax ($\times 3$)}};
-}
-\visible<8->{
-\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax ($\times 3$)}};
-\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
-}
-
-\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
-
-\visible<6->{
-\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
-\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\tiny{Have}};
-\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\tiny{Have}};
-}
-
-\visible<8->{
-\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
-\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\tiny{you}};
-\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\tiny{you}};
-}
-
-\visible<5->{
-\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\tiny{Have}};
-\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\tiny{Have}};
-\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\tiny{Have}};
-}
-
-\visible<8->{
-\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\tiny{you}};
-\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\tiny{you}};
-\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\tiny{you}};
-}
-
-\visible<8->{
-\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\tiny{learned}};
-\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\tiny{learned}};
-\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\tiny{learned}};
-}
-
-\visible<3->{
-\foreach \x in {1}{
-    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
-    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
-    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
-   
-}
-}
-
-\visible<5->{
- \draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top-3}};
- }
-
-\visible<7->{
-\foreach \x in {2}{
-    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
-    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
-    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
-    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
-}
-}
-
-\visible<8->{
-\foreach \x in {3}{
-    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
-    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
-    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
-    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
-}
-}
-
-\visible<3->{
-\draw [->] ([xshift=-0.5em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left,inner sep=1pt] {\tiny{0}};
-}
-\visible<7->{
-\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
-}
-\visible<8->{
-\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
-}
-
-\visible<6->{
-\draw [->,very thick,dotted] (wo1.east) .. controls +(east:0.6) and +(west:0.8) ..(wt2copy2.west);
-}
-\visible<8->{
-\draw [->,very thick,dotted] (wo2.east) .. controls +(east:0.6) and +(west:0.8) ..(wt3copy2.west);
-}
-
-\visible<7->{
-\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\tiny{$\textbf{C}_2$}};
-\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\tiny{$\textbf{C}_2$}};
-\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\tiny{$\textbf{C}_2$}};
-\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
-\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
-}
-
-\visible<8->{
-\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\tiny{$\textbf{C}_3$}};
-\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\tiny{$\textbf{C}_3$}};
-\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\tiny{$\textbf{C}_3$}};
-\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
-\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
-}
-
-\visible<3->{
-\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
-\node [anchor=south] (vocablabel) at (vocab.north) {\tiny{单词的概率分布}};
-\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east); 
-}
-
-\visible<4->{
-\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\tiny{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
-\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) ..  ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\tiny{top-3}};
-
-\visible<4->{
-\node [anchor=north] (cap) at (vocab.south east) {\scriptsize{\textbf{束搜索($b=3$)}}};
-}
-}
-
-
-\end{scope}
-\end{tikzpicture}
-\end{center}
-}
-
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex
@@ -2438,6 +2438,30 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
+%%% 解码 - 长度惩罚和覆盖度
+\begin{frame}{推断 - 其它特征}
+\begin{itemize}
+\item 直接用$\textrm{P}(\textbf{y}|\textbf{x})$进行解码，面临两方面问题
+    \begin{itemize}
+    \item 对$\textrm{P}(y_j|\textbf{y}_{<j},\textbf{x})$进行乘积会导致长句的概率很低
+    \item 模型本身并没有考虑每个源语言单词被使用的程度，比如一个单词可能会被翻译了很多``次''
+    \end{itemize}
+\item<2-> 因此，解码时会使用其它特征与$\textrm{P}(\textbf{y}|\textbf{x})$一起组成模型得分$score(\textbf{y},\textbf{x})$，$score(\textbf{y},\textbf{x})$也作为beam search的排序依据
+    \begin{eqnarray}
+    score(\textbf{y},\textbf{x}) & = & \log \textrm{P}(\textbf{y}|\textbf{x})/\textrm{lp}(\textbf{y}) + \textrm{cp}(\textbf{y},\textbf{x}) \nonumber \\
+    \textrm{lp}(\textbf{y})      & = & \frac{(5 + |\textbf{y}|)^\alpha}{(5 + 1)^\alpha} \nonumber \\
+    \textrm{cp}(\textbf{y},\textbf{x}) & = & \beta \cdot \sum\nolimits_{i=1}^{|\textbf{x}|} \log (\min(\sum\nolimits_{j}^{|\textbf{y}|} a_{ij}, 1)) \nonumber
+    \end{eqnarray}
+
+    \vspace{-0.5em}
+    \begin{itemize}
+    \item lp会惩罚译文过短的结果(长度惩罚)；cp会惩罚某些源语单词没有被充分翻译的情况(覆盖度)，第$i$个源语单词被覆盖的程度用$\sum\nolimits_{j}^{|\textbf{y}|} a_{ij}$度量；$\alpha$和$\beta$是超参，需要经验性设置
+    \end{itemize}
+\end{itemize}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
 %%% 实验结果
 \begin{frame}{效果}
 %% 实用注意力机制带来的提升