Merge branch 'master' into liyanyang

bdcfc769 · Lee · b041e362 · abfdaf38 · bdcfc769 · bdcfc769
Commit bdcfc769 authored Dec 16, 2019 by Lee
--- a/Section04-Phrasal-and-Syntactic-Models/section04-test.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04-test.tex
@@ -103,220 +103,76 @@
 \section{使用更大的翻译单元}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 数学模型
-\begin{frame}{数学模型}
-\begin{itemize}
-\item \textbf{机器翻译}：对于输入的源语言句子$\textbf{s}$，找到最佳译文$\hat{\textbf{t}}$
-
-\begin{displaymath}
-\hat{\textbf{t}} = \argmax_{\textbf{t}} \textrm{P}(\textbf{t}|\textbf{s})
-\end{displaymath}
-
-其中$\textrm{P}(\textbf{t}|\textbf{s})$表示$\textbf{s}$到$\textbf{t}$的翻译概率
-
-\item 三个基本问题(回忆一下第三章)
-    \begin{enumerate}
-    \item 如何定义$\textrm{P}(\textbf{t}|\textbf{s})$ - 建模问题
-    \item 如何学习$\textrm{P}(\textbf{t}|\textbf{s})$的统计模型 - 训练问题
-    \item 如何找到最优译文 - 解码问题
-    \end{enumerate}
-\vspace{0.3em}
-\item<2-> 先看建模问题。可以把$\textrm{P}(\textbf{t}|\textbf{s})$表示成所有翻译推导的概率
-
-\begin{displaymath}
-\textrm{P}(\textbf{t}|\textbf{s}) = \sum_{d} \textrm{P}(d,\textbf{t}|\textbf{s})
-\end{displaymath}
-
-$d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导，$\textrm{P}(d,\textbf{t}|\textbf{s})$表示翻译推导$d$的概率
-
-\end{itemize}
-\end{frame}
+\section{基于短语的模型}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 翻译推导的建模
-\begin{frame}{对翻译推导进行建模}
-\begin{itemize}
-\item $\textrm{P}(\textbf{t}|\textbf{s}) = \sum_{d} \textrm{P}(d,\textbf{t}|\textbf{s})$带来新的问题：如何描述$\textrm{P}(d,\textbf{t}|\textbf{s})$ \\
-
-\vspace{0.5em}
-\begin{center}
-\begin{tikzpicture}
-
-\begin{scope}[minimum height = 18pt]
-
-\node[anchor=east] (s0) at (-0.5em, 0) {$\textbf{s}$:};
-\node[anchor=west,fill=ugreen!50] (s1) at (0, 0) {在};
-\node[anchor=west,fill=red!50] (s2) at ([xshift=1em]s1.east) {桌子 上 的};
-\node[anchor=west,fill=blue!50] (s3) at ([xshift=1em]s2.east) {苹果};
-
-\node[anchor=east] (t0) at (-0.5em, -1.5) {$\textbf{t}$:};
-\node[anchor=west,fill=blue!50] (t1) at (0, -1.5) {the apple};
-\node[anchor=west,fill=ugreen!50] (t2) at ([xshift=1em]t1.east) {on};
-\node[anchor=west,fill=red!50] (t3) at ([xshift=1em]t2.east) {the table};
-
-\path[<->, thick] (s1.south) edge (t3.north);
-\path[<->, thick] (s2.south) edge (t2.north);
-\path[<->, thick] (s3.south) edge (t1.north);
-
-\end{scope}
-\end{tikzpicture}
-\end{center}
+\subsection{建模}

-上图体现了三方面问题
+%%%------------------------------------------------------------------------------------------------------------
+\subsection{短语抽取}

-    \begin{enumerate}
-    \item 确定哪些是``可用''的短语
-    \item 描述短语翻译的好坏
-    \item 描述翻译中的调序现象
-    \end{enumerate}
+%%%------------------------------------------------------------------------------------------------------------
+\subsection{判别式模型及特征}

-\item<2-> 希望有这样一种模型可以对任意的因素进行方便的建模。经典的判别式模型成为了不二的选择
-\end{itemize}
+%%%------------------------------------------------------------------------------------------------------------
+\subsection{最小错误率训练}

-\visible<2->{
-\textbf{Discriminative Training and Maximum Entropy Models for Statistical Machine Translation}\\
-\textbf{Franz Och and Hermann Ney, 2002, In Proc of ACL}
-}
+%%%------------------------------------------------------------------------------------------------------------
+\subsection{栈解码}

-\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+\section{基于层次短语的模型}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 判别式模型
-\begin{frame}{判别式模型}
+%%%  短语系统的问题 - 数据稀疏和无法处理长距离依赖
+\begin{frame}{基于短语的方法的不足}
 \begin{itemize}
-\item 判别式模型的形式：
-\begin{displaymath}
-\textrm{P}(d,\textbf{t}|\textbf{s}) = \frac{\exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d,\textbf{s},\textbf{t}))}{\sum_{d',t'}\exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d',\textbf{s},\textbf{t}'))}
-\end{displaymath}
+\item 短语可以很好地捕捉词语之间的局部搭配和调序，但是长距离依赖需要更长的短语
    \begin{itemize}
-    \item $\{h_i(\cdot)\}$是$M$个特征，每个$h_i(d,\textbf{s},\textbf{t})$把$d$映射为一个实数值
-    \item $\{\lambda_i\}$是这些特征对应权重，权重越大表示特征越重要
-    \item $\sum_{i=1}^{M} \lambda_i \cdot h_i(d,\textbf{s},\textbf{t}))$描述了$d$的整体质量，值约大$d$越``好''
+    \item 实践中发现使用长度超过3的短语作用不大
+    \item 短语非常稀疏，包含多个词的短语大多非常低频
    \end{itemize}
-\item 判别式模型的优点在于，它可以很方便的引入各种特征。我们只需要设计不同的特征函数$h_i(\cdot)$即可。
-    \begin{itemize}
-    \item 比如，可以定义短语翻译概率作为特征，也可以定义调序的程度作为一个特征
-    \end{itemize}
-\item \textbf{两个问题}：
+
+    \vspace{0.5em}
+    \begin{center}
+    \begin{tabular}{l | r}
+    短语(中文) & 训练数据中出现频次 \\ \hline
+    包含 & 3341 \\
+    包含 多个 & 213 \\
+    包含 多个 词 & 12 \\
+    包含 多个 词 的 & 8 \\
+    包含 多个 词 的 短语 & 0 \\
+    包含 多个 词 的 短语 大多 & 0
+    \end{tabular}
+    \end{center}
+    
+\vspace{0.5em}
+\item 简单使用短语和$n$-gram语言模型无法处理长距离的调序
    \begin{itemize}
-    \item 特征定义：定义短语翻译特征和调序特征(马上)
-    \item 权重调优：得到最好的特征权重（后面）
+    \item 引入独立的调序模型，比如简单的基于距离的调序
+    \item 当然，也可以设计更加复杂的调序模型
    \end{itemize}
-
 \end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 翻译推导的建模
-\begin{frame}{对翻译推导进行建模}
+%%%  短语系统的问题 - 一个实例
+\begin{frame}{基于短语的方法的不足 - 一个实例}
 \begin{itemize}
-\item 回到最开始的问题: 给定$\textbf{s}$和$\textbf{t}$，\alert{如何获得双语短语}
-    \begin{itemize}
-    \item 如果没有限制，$\textbf{s}$和$\textbf{t}$之间任何子串映射都可以看做双语短语
-    \end{itemize}
+\item 一个短语翻译不成功的例子
 \end{itemize}

-\vspace{-0.7em}
-
 \begin{center}
 \begin{tikzpicture}
-
-\setlength{\wseg}{1.5cm}
-\setlength{\hseg}{1.0cm}
-\setlength{\wnode}{3.75cm}
-\setlength{\hnode}{1.0cm}
-
-\tikzstyle{elementnode} = [rectangle,text=white,anchor=center]
-\tikzstyle{srcnode} = [rotate=45,font=\small,anchor=south west]
-\tikzstyle{tgtnode} = [left,font=\small,anchor=north east]
-\tikzstyle{alignmentnode} = [rectangle,draw,minimum height=3.6\hnode,minimum width=0.36\hnode]
-\tikzstyle{probnode} = [fill=blue!30,minimum width=0.4\hnode]
-\tikzstyle{labelnode} = [above]
-
-% alignment matrix
-\begin{scope}[scale=0.85,yshift=0.12in]
-\foreach \i / \j / \c in
-    {0/7/0.15, 1/7/0.15, 2/7/0.15, 3/7/0.15, 4/7/0.15, 5/7/0.15,
-    0/6/0.15, 1/6/0.15, 2/6/0.15, 3/6/0.15, 4/6/0.15, 5/6/0.15,
-    0/5/0.15, 1/5/0.15, 2/5/0.15, 3/5/0.15, 4/5/0.15, 5/5/0.15,
-    0/4/0.15, 1/4/0.15, 2/4/0.15, 3/4/0.15, 4/4/0.15, 5/4/0.15,
-    0/3/0.15, 1/3/0.15, 2/3/0.15, 3/3/0.15, 4/3/0.15, 5/3/0.15,
-    0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.15, 5/2/0.15,
-    0/1/0.15, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.15, 5/1/0.15,
-    0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.15, 5/0/0.15}
-    \node[elementnode,minimum size=0.6*\hnode*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*\hnode*\i-5.4*0.5*\hnode,0.5*\hnode*\j-1.05*\hnode) {};
-
-% source
-\node[srcnode] (src1) at (-5.4*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{Have}};
-\node[srcnode] (src2) at ([xshift=0.5\hnode]src1.south west) {\scriptsize{you}};
-\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
-\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
-\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
-\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
-
-% target
-\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{你}};
-\node[tgtnode] (tgt2) at ([yshift=-0.5\hnode]tgt1.north east) {\scriptsize{什么}};
-\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{都}};
-\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{没}};
-\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{学}};
-\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{到}};
-\node[tgtnode] (tgt7) at ([yshift=-0.5\hnode]tgt6.north east) {\scriptsize{?}};
-\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{EOS}};
-
-\node [anchor=west] (p1line1) at ([xshift=4em,yshift=1em]a57.east) {\footnotesize{$\tilde{s}_i$: 什么\ \ \ 都\ \ \ 没}};
-\node [anchor=north west] (p1line2) at ([xshift=0]p1line1.south west) {\footnotesize{$\tilde{t}_i$: learned\ \ \ nothing\ \ \ ? \ \ \ \ \ \ \ \ \ \ \ \ }};
-
-\node [anchor=west] (p2line1) at ([xshift=4em]a53.east) {\footnotesize{$\tilde{s}_j$: 到\ \ \ ?}};
-\node [anchor=north west] (p2line2) at ([xshift=0]p2line1.south west) {\footnotesize{$\tilde{t}_j$: Have\ \ \ you\ \ \ learned\ \ \ nothing}};
-
-\begin{pgfonlayer}{background}
-\node [rectangle,draw=red,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (a26) (a44)] (phrase1) {};
-\node [rectangle,draw=ugreen,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (a01) (a32)] (phrase2) {};
-\node [rectangle,inner sep=0.2em,fill=red!10] [fit = (p1line1) (p1line2)] (box1) {};
-\node [rectangle,inner sep=0.2em,fill=green!10] [fit = (p2line1) (p2line2)] (box2) {};
-\end{pgfonlayer}
-
-\draw [->,thick,dotted] ([yshift=-0.8em]phrase1.east) .. controls +(east:1.5) and +(west:1) ..  (box1.west);
-\draw [->,thick,dotted] ([yshift=-0.0em]phrase2.east) .. controls +(east:2.0) and +(west:1) ..  ([yshift=1em]box2.west);
-
+\begin{scope}
+\node [anchor=east] (shead) at (0,0) {源语:};
 \end{scope}
-
 \end{tikzpicture}
 \end{center}

-\begin{itemize}
-\item<2-> \textbf{显然}，不加限制的定义短语会带来很多问题
-    \begin{itemize}
-    \item 短语数量随句子长度增加急剧膨胀
-    \item 大量噪声，如``到 ? $\leftrightarrow$ Have you learned nothing''
-    \end{itemize}
-\end{itemize}
-
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\section{基于短语的模型}
-
-%%%------------------------------------------------------------------------------------------------------------
-\subsection{建模}
-
-%%%------------------------------------------------------------------------------------------------------------
-\subsection{短语抽取}
-
-%%%------------------------------------------------------------------------------------------------------------
-\subsection{判别式模型及特征}
-
-%%%------------------------------------------------------------------------------------------------------------
-\subsection{最小错误率训练}
-
-%%%------------------------------------------------------------------------------------------------------------
-\subsection{栈解码}
-
-%%%------------------------------------------------------------------------------------------------------------
-\section{基于层次短语的模型}
-
-%%%------------------------------------------------------------------------------------------------------------
 \subsection{同步上下文无关文法}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section04-Phrasal-and-Syntactic-Models/section04.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04.tex
@@ -1067,10 +1067,11 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导，$\textrm{P
 %%%------------------------------------------------------------------------------------------------------------
 %%% 翻译推导的建模
 \begin{frame}{对翻译推导进行建模}
+\vspace{-0.4em}
 \begin{itemize}
 \item $\textrm{P}(\textbf{t}|\textbf{s}) = \sum_{d} \textrm{P}(d,\textbf{t}|\textbf{s})$带来新的问题：如何描述$\textrm{P}(d,\textbf{t}|\textbf{s})$ \\

-\vspace{0.5em}
+\vspace{0.0em}
 \begin{center}
 \begin{tikzpicture}

@@ -1105,6 +1106,7 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导，$\textrm{P
 \item<2-> 希望有这样一种模型可以对任意的因素进行方便的建模。经典的判别式模型成为了不二的选择
 \end{itemize}

+\vspace{-0.2em}
 \visible<2->{
 \textbf{Discriminative Training and Maximum Entropy Models for Statistical Machine Translation}\\
 \textbf{Franz Och and Hermann Ney, 2002, In Proc of ACL}
@@ -1364,7 +1366,19 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导，$\textrm{P
 %%%------------------------------------------------------------------------------------------------------------
 %%%  短语系统的问题
 \begin{frame}{基于短语的方法的不足}
-% David Chiang ACL2005的论文
+\begin{itemize}
+\item 短语可以很好地捕捉词语之间的局部搭配和调序，但是长距离依赖需要更长的短语
+    \begin{itemize}
+    \item 实践中发现使用长度超过3的短语作用不大
+    \item 短语非常稀疏，包含多个词的短语大多非常低频
+    \end{itemize}
+    
+    \begin{tabular}{l | r}
+    短语(中文) & 训练数据中频次 \\ \hline
+    包含 & 3341 \\
+    包含 多个 & 213 \\
+    \end{tabular}
+\end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex