update chapter 7

248efc67 · 曹润柘 · 6735eb1a · 248efc67 · 248efc67 · 248efc67
Commit 248efc67 authored Sep 07, 2020 by 曹润柘
--- a/Chapter7/Figures/figure-example-of-vocabulary-translation-probability.tex
+++ b/Chapter7/Figures/figure-example-of-vocabulary-translation-probability.tex
@@ -39,10 +39,10 @@
 \node[align=center,elementnode,minimum size=0.3cm,inner sep=0.1pt,fill=blue!50] (la4) at (a41) {};
 \node[align=center,elementnode,minimum size=0.3cm,inner sep=0.1pt,fill=blue!50] (la5) at (a30) {};
-\node[anchor=west] (f1) at ([xshift=3em,yshift=0.8em]a43.east) {\small{$\funp{P}_{\textrm{lex}}(\bar{t}|\bar{s})=\sigma (t_1|s_1)\times$}};
+\node[anchor=west] (f1) at ([xshift=3em,yshift=0.8em]a43.east) {\small{$\funp{P}_{\textrm{lex}}(\bar{t}|\bar{s})=w(t_1|s_1)\times$}};
-\node[anchor=north] (f2) at ([xshift=5.2em]f1.south) {\small{$\frac{1}{2}(\sigma (t_2|s_2)+\sigma (t_4|s_2))\times$}};
+\node[anchor=north] (f2) at ([xshift=5.2em]f1.south) {\small{$\frac{1}{2}(w(t_2|s_2)+w(t_4|s_2))\times$}};
-\node[anchor=north west] (f3) at (f2.south west) {\small{$\sigma (N|s_3)\times$}};
+\node[anchor=north west] (f3) at (f2.south west) {\small{$w(N|s_3)\times$}};
-\node[anchor=north west] (f4) at (f3.south west) {\small{$\sigma (t_4|s_4)\times$}};
+\node[anchor=north west] (f4) at (f3.south west) {\small{$w(t_4|s_4)\times$}};
 \end{scope}

--- a/Chapter7/Figures/figure-function-image-about-weight-and-Bleu.tex
+++ b/Chapter7/Figures/figure-function-image-about-weight-and-Bleu.tex
@@ -20,8 +20,8 @@
 \draw[thick,dotted] ([xshift=4em]x0.center) -- ([xshift=4em,yshift=3.6em]x0.center);
 }
 \node[anchor=north] (zero) at ([yshift=0.1em]x0.south) {\small{0}};
-\node[anchor=north] (wx) at ([xshift=4em,yshift=0.1em]x0.south) {\small{$\lambda_x$}};
+\node[anchor=north] (wx) at ([xshift=4em,yshift=0.1em]x0.south) {\small{$w_x$}};
-\node[anchor=north] (wi) at ([xshift=8em,yshift=0.1em]x0.south) {\small{$\lambda_i$}};
+\node[anchor=north] (wi) at ([xshift=8em,yshift=0.1em]x0.south) {\small{$w_i$}};
 \end{scope}
@@ -33,8 +33,8 @@
 \draw[->,thick] (x0.center) -- ([yshift=5.6em]x0.center);
 \node[anchor=north] (zero) at ([yshift=0.1em]x0.south) {\small{0}};
-\node[anchor=north] (wx) at ([xshift=4em,yshift=0.1em]x0.south) {\small{$\lambda_x$}};
+\node[anchor=north] (wx) at ([xshift=4em,yshift=0.1em]x0.south) {\small{$w_x$}};
-\node[anchor=north] (wi) at ([xshift=8em,yshift=0.1em]x0.south) {\small{$\lambda_i$}};
+\node[anchor=north] (wi) at ([xshift=8em,yshift=0.1em]x0.south) {\small{$w_i$}};
 {
 \draw[thick] ([yshift=2em]x0.center) -- ([xshift=4em,yshift=2em]x0.center);
@@ -42,12 +42,12 @@
 \draw[thick,dotted] ([xshift=4em]x0.center) -- ([xshift=4em,yshift=5.5em]x0.center);
-\node[anchor=north] (e1) at ([xshift=2em,yshift=3em]x0.north) {\small{$\hat{d}=d_1$}};
+\node[anchor=north] (e1) at ([xshift=2em,yshift=3em]x0.north) {\small{$d^*=d_1$}};
-\node[anchor=north] (e2) at ([xshift=6.2em,yshift=5em]x0.north) {\small{$\hat{d}=d_2$}};
+\node[anchor=north] (e2) at ([xshift=6.2em,yshift=5em]x0.north) {\small{$d^*=d_2$}};
 \node[anchor=north,rotate=90] (e2) at ([xshift=-1.3em,yshift=4em]x0.south) {\small{BLEU}};
 \draw[decorate,decoration={brace,amplitude=0.4em},red,thick] ([xshift=4em,yshift=0.5em]x0.south) -- ([xshift=8.2em,yshift=0.5em]x0.south);
-\node[anchor=north] (wi) at ([xshift=6.1em,yshift=2.1em]x0.south) {\footnotesize{\red{挑选$\hat{\lambda}_i$}}};
+\node[anchor=north] (wi) at ([xshift=6.1em,yshift=2.1em]x0.south) {\footnotesize{\red{挑选$w_i$}}};
 }
 \end{scope}
 \end{tikzpicture}
--- a/Chapter7/Figures/figure-three-types-of-reorder-method-in-msd.tex
+++ b/Chapter7/Figures/figure-three-types-of-reorder-method-in-msd.tex
@@ -54,7 +54,7 @@
 {
 \node [anchor=west] (p1line1) at ([xshift=3.5em,yshift=0.5em]a75.east) {\footnotesize{M(monotone)：单调调序}};
 \node [anchor=north west] (p1line2) at ([xshift=0,yshift=-1em]p1line1.south west) {\footnotesize{S(swap)： 与前面一个短语}};
-\node [anchor=north west] (p1line3) at ([xshift=3.8em]p1line2.south west) {\footnotesize{位置进行交换}};
+\node [anchor=north west] (p1line3) at ([xshift=3.5em]p1line2.south west) {\footnotesize{位置进行交换}};
 \node [anchor=north west] (p1line4) at ([xshift=-3.5em,yshift=-1em]p1line3.south west) {\footnotesize{D(discontinuous)：非连续调序}};
 \node [anchor=east] (p1line5) at ([xshift=0em,yshift=3em]p1line4.east) {};

--- a/Chapter7/chapter7.tex
+++ b/Chapter7/chapter7.tex
@@ -338,7 +338,7 @@ d = {(\bar{s}_{\bar{a}_1},\bar{t}_1)} \circ {(\bar{s}_{\bar{a}_2},\bar{t}_2)} \c
 %    NEW SUB-SECTION
 %----------------------------------------------------------------------------------------
-\subsection{判别式模型中的特征}
+\subsection{翻译特征}
 \parinterval 判别式模型最大的好处在于它可以更灵活地引入特征。某种意义上，每个特征都是在描述翻译的某方面属性。在各种统计分类模型中，也大量使用了“特征”这个概念（见{\chapterthree}）。比如，要判别一篇新闻是体育方面的还是文化方面的，可以设计一个分类器，用词作为特征。这个分类器就会根据有能力区分“体育”和“文化”两个类别的特征，最终决定这篇文章属于哪个类别。统计机器翻译也在做类似的事情。系统研发者可以通过设计翻译相关的特征，来区分不同翻译结果的好坏。翻译模型会综合这些特征对所有可能的译文进行打分和排序，并选择得分最高的译文输出。
@@ -482,11 +482,11 @@ d = {(\bar{s}_{\bar{a}_1},\bar{t}_1)} \circ {(\bar{s}_{\bar{a}_2},\bar{t}_2)} \c
 \parinterval 当遇到低频短语时，短语翻译概率的估计可能会不准确。例如，短语$\bar{s}$和$\bar{t}$在语料中只出现了一次，且在一个句子中共现，那么$\bar{s}$到$\bar{t}$的翻译概率为$\funp{P}(\bar{t}|\bar{s})=1$，这显然是不合理的，因为$\bar{s}$和$\bar{t}$的出现完全可能是偶然事件。既然直接度量双语短语的好坏会面临数据稀疏问题，一个自然的想法就是把短语拆解成单词，利用双语短语中单词翻译的好坏间接度量双语短语的好坏。为了达到这个目的，可以使用{\small\bfnew{词汇化翻译概率}}\index{词汇化翻译概率}（Lexical Translation Probability）\index{Lexical Translation Probability}。前面借助词对齐信息完成了双语短语的抽取，因此，词对齐信息本身就包含了短语内部单词之间的对应关系。因此同样可以借助词对齐来计算词汇翻译概率，公式如下：
 \begin{eqnarray}
-\funp{P}_{\textrm{lex}}(\bar{t}|\bar{s}) = \prod_{j=1}^{|\bar{s}|} \frac{1}{|\{j|a(j,i) = 1\}|} \sum_{\forall(j,i):a(j,i) = 1} \sigma (t_i|s_j)
+\funp{P}_{\textrm{lex}}(\bar{t}|\bar{s}) = \prod_{j=1}^{|\bar{s}|} \frac{1}{|\{i|a(j,i) = 1\}|} \sum_{\forall(j,i):a(j,i) = 1} w(t_i|s_j)
 \label{eq:7-14}
 \end{eqnarray}
-\parinterval 它表达的意思是短语$\bar{s}$和$\bar{t}$存在词汇级的对应关系，其中$\sigma $表示词汇翻译概率用来度量两个单词之间翻译的可能性大小（见{\chapterfive}），作为两个词之间对应的强度。
+\parinterval 它表达的意思是短语$\bar{s}$和$\bar{t}$存在词汇级的对应关系，其中$w$表示词汇翻译概率，用来度量两个单词之间翻译的可能性大小（见{\red{{\chapterfive}}}），作为两个词之间对应的强度。
 \parinterval 下面来看一个具体的例子，如图\ref{fig:7-17}所示。对于一个双语短语，将它们的词对齐关系代入到上面的公式就会得到短语的词汇翻译概率。对于词汇翻译概率，可以使用IBM 模型中的单词翻译表，也可以通过统计获得\upcite{koehn2002learning}。如果一个单词的词对齐为空，则用$N$表示它翻译为空的概率。和短语翻译概率一样，可以使用双向的词汇化翻译概率来评价双语短语的好坏。
@@ -518,7 +518,7 @@ d = {(\bar{s}_{\bar{a}_1},\bar{t}_1)} \circ {(\bar{s}_{\bar{a}_2},\bar{t}_2)} \c
 \parinterval 尽管已经知道了如何将一个源语言短语翻译成目标语言短语，但是想要获得一个高质量的译文，仅有互译的双语短语是远远不够的。
-\parinterval 如图\ref{fig:7-19}所示，按照从左到右的顺序对一个句子“在/桌子/上/的/苹果”进行翻译，得到的译文“on the table the apple”的语序是不对的。虽然可以使用$n$-gram语言模型对语序进行建模，但是此处仍然需要用更加准确的方式描述目标语短语间的次序。一般，把这个问题称为短语调序，或者简称{\small\bfnew{调序}}\index{调序}（Reordering）\index{Reordering}。通常，基于短语的调序模型会作为判别式模型的特征参与到翻译过程中来。接下来，会介绍3 种不同的调序方法，分别是基于距离的调序、基于方向的调序（MSD模型）以及基于分类的调序。
+\parinterval 如图\ref{fig:7-19}所示，按照从左到右的顺序对一个句子“在\ \ 桌子\ \ 上\ \ 的\ \ 苹果”进行翻译，得到的译文“on the table the apple”的语序是不对的。虽然可以使用$n$-gram语言模型对语序进行建模，但是此处仍然需要用更加准确的方式描述目标语短语间的次序。一般，把这个问题称为短语调序，或者简称{\small\bfnew{调序}}\index{调序}（Reordering）\index{Reordering}。通常，基于短语的调序模型会作为判别式模型的特征参与到翻译过程中来。接下来，会介绍3 种不同的调序方法，分别是基于距离的调序、基于方向的调序（MSD模型）以及基于分类的调序。
 %----------------------------------------------
 \begin{figure}[htp]
@@ -535,9 +535,9 @@ d = {(\bar{s}_{\bar{a}_1},\bar{t}_1)} \circ {(\bar{s}_{\bar{a}_2},\bar{t}_2)} \c
 \subsection{基于距离的调序}
-\parinterval 基于距离的调序是最简单的一种调序模型。{\chaptersix}中所讨论的“扭曲度”本质上就是一种调序模型。只不过{\chaptersix}所涉及的扭曲度描述的单词的调序问题，而这里需要把类似的概念推广到短语。
+\parinterval 基于距离的调序是最简单的一种调序模型。{\color{red} {\chaptersix}中所讨论的“扭曲度”本质上就是一种调序模型。只不过{\chaptersix}所涉及的扭曲度描述的单词的调序问题，而这里需要把类似的概念推广到短语。}
-\parinterval 基于距离的调序的一个基本假设是：语言的翻译基本上都是顺序的，也就是，译文单词出现的顺序和源语言单词的顺序基本上是一致的。反过来说，如果译文和源语言单词（或短语）的顺序差别很大，就认为出现了调序。
+\parinterval {\color{red}基于距离的调序的一个基本假设是：}语言的翻译基本上都是顺序的，也就是，译文单词出现的顺序和源语言单词的顺序基本上是一致的。反过来说，如果译文和源语言单词（或短语）的顺序差别很大，就认为出现了调序。
 \parinterval 基于距离的调序方法的核心思想就是度量当前翻译结果与顺序翻译之间的差距。对于译文中的第$i$个短语，令$start_i$表示它所对应的源语言短语中第一个词所在的位置，$end_i$表示它所对应的源语言短语中最后一个词所在的位置。于是，这个短语（相对于前一个短语）的调序距离为：
 \begin{eqnarray}
@@ -545,7 +545,7 @@ dr = start_i-end_{i-1}-1
 \label{eq:7-15}
 \end{eqnarray}
-\parinterval 在图\ref{fig:7-20}的例子中，“the apple”所对应的调序距离为4，“on the table”所对应的调序距离为$-5$。显然，如果两个源语短语按顺序翻译，则$start_i = end_{i-1} + 1$，这时调序距离为0。
+\parinterval 在图\ref{fig:7-20}的例子中，“the apple”所对应的调序距离为4，“on the table”所对应的调序距离为$-5$。显然，如果两个源语短语按顺序翻译，则$start_i = end_{i-1} + 1$，这时调序距离为0。
 %----------------------------------------------
 \begin{figure}[htp]
@@ -556,7 +556,7 @@ dr = start_i-end_{i-1}-1
 \end{figure}
 %-------------------------------------------
-\parinterval 如果把调序距离作为特征，一般会使用指数函数$f(dr) = a^{|dr|}$作为特征函数（或者调序代价的函数），其中$a$是一个参数，控制调序距离对整个特征值的影响。调序距离$dr$的绝对值越大，调序代价越高。基于距离的调序模型比较适用于像法语到英语翻译这样的任务，因为两种语言的语序基本上是一致的。但是，对于汉语到日语翻译，由于句子结构存在很大差异（日语是谓词后置，而汉语中谓词放在宾语前），使用基于距离的调序会带来一些问题。因此，具体应用时应该根据语言之间的差异性有选择的使用该模型。
+\parinterval 如果把调序距离作为特征，一般会使用指数函数$f(dr) = a^{|dr|}$作为特征函数（或者调序代价的函数），其中$a$是一个参数，控制调序距离对整个特征值的影响。调序距离$dr$的绝对值越大，调序代价越高。基于距离的调序模型比较适用于像法–英翻译这样的任务，因为两种语言的语序基本上是一致的。但是，对于汉–日翻译，由于句子结构存在很大差异（日语是谓词后置，而汉语中谓词放在宾语前），使用基于距离的调序会带来一些问题。因此，具体应用时应该根据语言之间的差异性有选择地使用该模型。
 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
@@ -564,7 +564,7 @@ dr = start_i-end_{i-1}-1
 \subsection{基于方向的调序}
-\parinterval 基于方向的调序模型是另一种常用的调序模型。该模型是一种典型的词汇化调序模型，因此调序的结果会根据不同短语有所不同。简单来说，在两个短语目标语言端连续的情况下，该模型会判断两个双语短语在源语言端的调序情况，包含三种调序类型：顺序的单调翻译（M）、与前一个短语交换位置（S）、非连续翻译（D）。因此，这个模型也被称作MSD调序模型，也是Moses等经典的机器翻译系统所采用的调序模型\upcite{Koehn2007Moses}。
+\parinterval 基于方向的调序模型是另一种常用的调序模型。该模型是一种典型的词汇化调序模型，因此调序的结果会根据不同短语有所不同。简单来说，在目标语言端连续的情况下，该模型会判断两个双语短语在源语言端的调序情况，包含三种调序类型：顺序的单调翻译（M）、与前一个短语交换位置（S）、非连续翻译（D）。因此，这个模型也被称作MSD调序模型\upcite{Gros2008MSD}。
 \parinterval 图\ref{fig:7-21}展示了这三种调序类型，当两个短语对在源语言和目标语言中都是按顺序排列时，它们就是单调的（如：从左边数前两个短语）；如果对应的短语顺序在目标语中是反过来的，属于交换调序（如：从左边数第三和第四个短语）；如果两个短语之间还有其他的短语，就是非连续翻译（如：从右边数的前两个短语）。
@@ -585,7 +585,7 @@ dr = start_i-end_{i-1}-1
 \noindent 其中，$o_i$表示（目标语言）第$i$个短语的调序方向，$\mathbf{o}=\{o_i\}$表示短语序列的调序方向，$K$表示短语的数量。短语之间的调序概率是由双语短语以及短语对齐决定的，$o$表示调序的种类，可以取M、S、D 中的任意一种。而整个句子调序的好坏就是把相邻的短语之间的调序概率相乘（对应取log后的加法）。这样，公式\ref{eq:7-16}把调序的好坏定义为新的特征，对于M、S、D总共就有三个特征。除了当前短语和前一个短语的调序特征，还可以定义当前短语和后一个短语的调序特征，即将上述公式中的$a_{i-1}$换成$a_{i+1}$。 于是，又可以得到三个特征。因此在MSD调序中总共可以有6个特征。
-\parinterval 具体实现时，通常使用词对齐对两个短语间的调序关系进行判断。图\ref{fig:7-22}展示了这个过程。先判断短语的左上角和右上角是否存在词对齐，再根据其位置对调序类型进行划分。每个短语对应的调序概率都可以用相对频次估计进行计算。而MSD调序模型也相当于在短语表中的每个双语短语后添加6个特征。不过，调序模型一般并不会和短语表一起存储，因此在系统中通常会看到两个独立的模型文件，分别保存短语表和调序模型。
+\parinterval 具体实现时，通常使用词对齐对两个短语间的调序关系进行判断。图\ref{fig:7-22}展示了这个过程。先判断短语的左上角和右上角是否存在词对齐，再根据其位置对调序类型进行划分。每个短语对应的调序概率都可以用相对频率估计进行计算。而MSD调序模型也相当于在短语表中的每个双语短语后添加6个特征。不过，调序模型一般并不会和短语表一起存储，因此在系统中通常会看到两个独立的模型文件，分别保存短语表和调序模型。
 %----------------------------------------------
 \begin{figure}[htp]
@@ -620,7 +620,7 @@ dr = start_i-end_{i-1}-1
 \sectionnewpage
 \section{翻译特征}
-\parinterval 基于短语的模型使用判别式模型对翻译推导进行建模，给定双语句对$(\seq{s},\seq{t})$，每个翻译推导$d$都有一个模型得分，由$M$个特征线性加权得到，记为$\textrm{score}(d,\seq{t},\seq{s}) = \sum_{i=1}^{M} \lambda_i \cdot h_i (d,\seq{t},\seq{s})$，其中$\lambda_i$表示特征权重，$h_i (d,\seq{t},\seq{s})$表示特征函数（简记为$h_i (d)$）。这些特征包含刚刚介绍过的短语翻译概率、调序模型得分等，除此之外，还包含语言模型等其他特征，它们共同组成了特征集合。这里列出了基于短语的模型中的一些基础特征：
+\parinterval 基于短语的模型使用判别式模型对翻译推导进行建模，给定双语句对$(\seq{s},\seq{t})$，每个翻译推导$d$都有一个模型得分，由$M$个特征线性加权得到，记为$\textrm{score}(d,\seq{t},\seq{s}) = \sum_{i=1}^{M} \lambda_i \cdot h_i (d,\seq{t},\seq{s})$，其中$\lambda_i$表示特征权重，$h_i (d,\seq{t},\seq{s})$表示特征函数（简记为$h_i (d)$）。这些特征包含刚刚介绍过的短语翻译概率、调序模型得分等，除此之外，还包含语言模型等其他特征，它们共同组成了特征集合。这里列出了基于短语的模型中常用的特征：
 \begin{itemize}
 \vspace{0.5em}
@@ -634,7 +634,7 @@ dr = start_i-end_{i-1}-1
 \vspace{0.5em}
 \item 翻译规则数量，为了避免模型仅使用少量特征构成翻译推导（规则数量少，短语翻译概率相乘的因子也会少，得分一般会大一些），同时让系统自动学习对规则数量的偏好。
 \vspace{0.5em}
-\item 被翻译为空的源语言单词数量。注意，空翻译特征有时也被称作{\small\bfnew{有害特征}}\index{有害特征}（Evil Feature\index{Evil Feature}），这类特征在一些数据上对BLEU有很好的提升作用，但会造成人工评价结果的下降，需要谨慎使用。
+\item 被翻译为空的源语言单词数量。注意，空翻译规则有时也被称作evil feature，这类特征在一些数据上对BLEU有很好的提升作用，但会造成人工评价结果的下降，需要谨慎使用。
 \vspace{0.5em}
 \item 基于MSD的调序模型，包括与前一个短语的调序模型$f_{\textrm{M-pre}}(d)$\ 、$f_{\textrm{S-pre}}(d)$\ 、$f_{\textrm{D-pre}}(d)$和与后一个短语的调序模型$f_{\textrm{M-fol}}(d)$\ 、$f_{\textrm{S-fol}}(d)$\ 、$f_{\textrm{D-fol}}(d)$，共6个特征。
 \vspace{0.5em}
@@ -651,25 +651,24 @@ dr = start_i-end_{i-1}-1
 \parinterval 想要得到最优的特征权重，最简单的方法是枚举所有的特征权重可能的取值，然后评价每组权重所对应的翻译性能，最后选择最优的特征权重作为调优的结果。但是特征权重是一个实数值，因此可以考虑把实数权重进行量化，即把权重看作是在固定间隔上的取值，比如，每隔0.01取值。即使是这样，同时枚举多个特征的权重也是非常耗时的工作，当特征数量增多时这种方法的效率仍然很低。
-\parinterval 这里介绍一种更加高效的特征权重调优方法$\ \dash \ ${\small\bfnew{最小错误率训练}}\index{最小错误率训练}（Minimum Error Rate Training\index{Minimum Error Rate Training}，MERT）。最小错误率训练是统计机器翻译发展中代表性工作，也是机器翻译领域原创的重要技术方法之一\upcite{och2003minimum}。最小错误率训练假设：翻译结果相对于标准答案的错误是可度量的，进而可以通过降低错误数量的方式来找到最优的特征权重。假设有样本集合$S = \{(s_1,\seq{r}_1),...,(s_N,\seq{r}_N)\}$，$s_i$为样本中第$i$个源语言句子，$\seq{r}_i$为相应的参考译文。注意，$\seq{r}_i$ 可以包含多个参考译文。$S$通常被称为{\small\bfnew{调优集合}}\index{调优集合}（Tuning Set）\index{Tuning Set}。对于$S$中的每个源语句子$s_i$，机器翻译模型会解码出$n$-best推导$\hat{\seq{d}}_{i} = \{\hat{d}_{ij}\}$，其中$\hat{d}_{ij}$表示对于源语言句子$s_i$得到的第$j$个最好的推导。$\{\hat{d}_{ij}\}$可以被定义如下：
+\parinterval 这里介绍一种更加高效的特征权重调优方法$\ \dash \ ${\small\bfnew{最小错误率训练}}\index{最小错误率训练}（Minimum Error Rate Training\index{Minimum Error Rate Training}，MERT）。最小错误率训练是统计机器翻译发展中的代表性工作，也是机器翻译领域原创的重要技术方法之一\upcite{och2003minimum}。最小错误率训练假设：翻译结果相对于标准答案的错误是可度量的，进而可以通过降低错误数量的方式来找到最优的特征权重。假设有样本集合$S = \{(s_1,\seq{r}_1),...,(s_N,\seq{r}_N)\}$，$s_i$为样本中第$i$个源语言句子，$\seq{r}_i$为相应的参考译文。注意，$\seq{r}_i$可以包含多个参考译文。$S$通常被称为{\small\bfnew{调优集合}}\index{调优集合}（Tuning Set）\index{Tuning Set}。对于$S$中的每个源语句子$s_i$，机器翻译模型会解码出$n$-best推导$\seq{d}_{i}^{\ast} = \{d_{ij}^{\ast}\}$，其中$d_{ij}^{\ast}$表示翻译源语言句子$s_i$时得到的第$j$个最好的推导。$\{d_{ij}^{\ast}\}$可以被定义如下：
 \begin{eqnarray}
-\{\hat{d}_{ij}\} = \arg\max_{\{d_{ij}\}} \sum_{i=1}^{M} \lambda_i \cdot h_i (d,\seq{t},\seq{s})
+\{d_{ij}^{\ast}\} = \arg\max_{\{d_{ij}\}} \sum_{i=1}^{M} \lambda_i \cdot h_i (d,\seq{t},\seq{s})
 \label{eq:7-17}
 \end{eqnarray}
-\parinterval 对于每个样本都可以得到$n$-best推导集合，整个数据集上的推导集合被记为$\hat{\seq{D}} = \{\hat{\seq{d}}_{1},...,\hat{\seq{d}}_{s}\}$。进一步，令所有样本的参考译文集合为$\seq{R} = \{\seq{r}_1,...,\seq{r}_N\}$。最小错误率训练的目标就是降低$\hat{\seq{D}}$相对于$\seq{R}$的错误。也就是，通过调整不同特征的权重$\lambda = \{ \lambda_i \}$，让错误率最小，形式化描述为：
+\parinterval 对于每个样本都可以得到$n$-best推导集合，整个数据集上的推导集合被记为$\seq{D}^{\ast} = \{\seq{d}_{1}^{\ast},...,\seq{d}_{N}^{\ast}\}$。进一步，令所有样本的参考译文集合为$\seq{R} = \{\seq{r}_1,...,\seq{r}_N\}$。最小错误率训练的目标就是降低$\seq{D}^{\ast}$相对于$\seq{R}$的错误。也就是，通过调整不同特征的权重$\lambda = \{ \lambda_i \}$，让错误率最小，形式化描述为：
 \begin{eqnarray}
-\hat{\lambda} = \arg\min_{\lambda} \textrm{Error}(\hat{\seq{D}},\seq{R})
+\lambda^{\ast} = \arg\min_{\lambda} \textrm{Error}(\seq{D}^{\ast},\seq{R})
 \label{eq:7-18}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\noindent 其中，\textrm{Error}$(\cdot)$是错误率函数。\textrm{Error}$(\cdot)$的定义方式有很多，一般来说\textrm{Error}$(\cdot)$会与机器翻译的评价指标相关，例如，词错误率(WER)、位置错误率(PER)、BLEU 值、NIST值等都可以用于\textrm{Error}$(\cdot)$的定义。这里使用$1-$BLEU作为错误率函数，即$\textrm{Error}(\hat{\seq{D}},\seq{R}) = 1 - \textrm{BLEU}(\hat{\seq{D}},\seq{R})$。则公式\ref{eq:7-18}可改写为：
+\noindent 其中，\textrm{Error}$(\cdot)$是错误率函数。\textrm{Error}$(\cdot)$的定义方式有很多，一般来说\textrm{Error}$(\cdot)$会与机器翻译的评价指标相关，例如，词错误率(WER)、位置错误率(PER)、BLEU 值、NIST值等都可以用于\textrm{Error}$(\cdot)$的定义。这里使用$1-$BLEU作为错误率函数，即$\textrm{Error}(\seq{D}^{\ast},\seq{R}) = 1 - \textrm{BLEU}(\seq{D}^{\ast},\seq{R})$。则公式\ref{eq:7-18}可改写为：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
-\hat{\lambda} &=& \arg\min_{\lambda}\ (1 - \textrm{BLEU}(\hat{\seq{D}},\seq{R}))   \nonumber \\
+\lambda^{\ast} &=& \arg\min_{\lambda}\ (1 - \textrm{BLEU}(\seq{D}^{\ast},\seq{R}))   \nonumber \\
-&=& \arg\max_{\lambda} \textrm{BLEU}(\hat{\seq{D}},\seq{R})
+&=& \arg\max_{\lambda} \textrm{BLEU}(\seq{D}^{\ast},\seq{R})
 \label{eq:7-19}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
@@ -687,7 +686,7 @@ dr = start_i-end_{i-1}-1
 \end{figure}
 %-------------------------------------------
-\parinterval 其中横坐标为所有的$M$个特征函数，纵坐标为权重可能的取值。假设每个特征都有$V$种取值，那么遍历所有特征权重取值的组合有$M^V$种。每组$\lambda = \{\lambda_i\}$的取值实际上就是一个贯穿所有特征权重的折线，如图\ref{fig:7-23}中间红线所展示的路径。当然，可以通过枚举得到很多这样的折线（图\ref{fig:7-23}右）。假设计算BLEU的时间开销为$B$，那么遍历所有的路径的时间复杂度为$O(M^V \cdot B)$，由于$V$可能很大，而且$B$往往也无法忽略，因此这种计算方式的时间成本是极高的。如果考虑对每一组特征权重都需要重新解码得到$n$-best译文，那么基于这种简单枚举的方法是无法使用的。
+\parinterval 其中横坐标为所有的$M$个特征函数，纵坐标为权重可能的取值。假设每个特征都有$V$种取值，那么遍历所有特征权重取值的组合有$V^M$种。每组$\lambda = \{\lambda_i\}$的取值实际上就是一个贯穿所有特征权重的折线，如图\ref{fig:7-23}中间红线所展示的路径。当然，可以通过枚举得到很多这样的折线（图\ref{fig:7-23}右）。假设计算BLEU的时间开销为$B$，那么遍历所有的路径的时间复杂度为$O(V^M \cdot B)$，由于$V$可能很大，而且$B$往往也无法忽略，因此这种计算方式的时间成本是极高的。
 \parinterval 对全搜索的一种改进是使用局部搜索。循环处理每个特征，每一次只调整一个特征权重的值，找到使BLEU达到最大的权重。反复执行该过程，直到模型达到稳定状态（例如BLEU不再提高）。
@@ -704,7 +703,7 @@ dr = start_i-end_{i-1}-1
 \parinterval 格搜索的问题在于，每个特征都要访问$V$个点，且不说$V$个点无法对连续的特征权重进行表示，里面也会存在大量的无用访问。也就是说，这$V$个点中绝大多数点根本“不可能”成为最优的权重。可以把这样的点称为无效取值点。
-\parinterval 能否避开这些无效的权重取值点呢？再重新看一下优化的目标BLEU。实际上，当一个特征权重发生变化时，BLEU的变化只会出现在系统1-best译文发生变化的时候。那么，可以只关注使1-best译文发生变化的取值点，而其他的取值点都不会对优化的目标函数产生变化。这也就构成了线搜索的思想。
+\parinterval 能否避开这些无效的权重取值点呢？再重新看一下优化的目标BLEU。实际上，当一个特征权重发生变化时，BLEU的变化只会产生在系统1-best译文发生变化的时候。那么，可以只关注使1-best译文发生变化的取值点，而其他的取值点都不会对优化的目标函数产生变化。这也就构成了线搜索的思想。
 \parinterval 假设对于每个输入的句子，翻译模型生成了两个推导$\seq{d} = \{d_1,d_2\}$，每个推导$d$的得分score($d$)可以表示成关于第$i$个特征的权重$\lambda_i$的线性函数：
 \begin{eqnarray}
@@ -714,7 +713,7 @@ dr = start_i-end_{i-1}-1
 \label{eq:7-20}
 \end{eqnarray}
-\parinterval 这里，$a = h_i(d)$是直线的斜率，$b = \sum_{k \neq i}^{M} \lambda_k \cdot h_k (d)$是截距。有了关于权重$\lambda_i$的直线表示，可以将$d_1$和$d_2$分别画成两条直线，如图\ref{fig:7-25}所示。在两条直线交叉点的左侧，$d_2$是最优的翻译结果；在交叉点右侧，$d_1$是最优的翻译结果。也就是说，只需知道交叉点左侧和右侧谁的BLEU 值高，$\lambda_i$的最优值就应该落在相应的范围，比如，这个例子中交叉点右侧（即$d_2$）所对应的BLEU值更高，因此最优特征权重$\hat{\lambda}_i$应该在交叉点右侧（$\lambda_x \sim \lambda_i$任意取值都可以）。
+\parinterval 这里，$a = h_i(d)$是直线的斜率，$b = \sum_{k \neq i}^{M} \lambda_k \cdot h_k (d)$是截距。有了关于权重$\lambda_i$的直线表示，可以将$d_1$和$d_2$分别画成两条直线，如图\ref{fig:7-25}所示。在两条直线交叉点的左侧，$d_1$是最优的翻译结果；在交叉点右侧，$d_2$是最优的翻译结果。也就是说，只需知道交叉点左侧和右侧谁的BLEU 值高，$\lambda_i$的最优值就应该落在相应的范围，比如，这个例子中交叉点右侧（即$d_2$）所对应的BLEU值更高，因此最优特征权重应该在交叉点右侧（$\lambda_x \sim \lambda_i$任意取值都可以）。
 %----------------------------------------------
 \begin{figure}[htp]
@@ -743,7 +742,7 @@ dr = start_i-end_{i-1}-1
 \vspace{0.5em}
 \end{itemize}
-\parinterval 最小错误率训练最大的优点在于可以用于目标函数不可微、甚至不连续的情况。对于优化线性模型， 最小错误率训练是一种很好的选择。但是，也有研究发现，简单使用最小错误率训练无法处理特征数量过多的情况。比如，用最小错误率训练优化10000个稀疏特征的权重时，优化效果可能会不理想，而且收敛速度慢。这时也可以考虑使用在线学习等技术对大量特征的权重进行调优，比较有代表性的方法包括MIRA\upcite{DBLP:conf/emnlp/ChiangMR08}和PRO\upcite{Hopkins2011Tuning}。由于篇幅所限，这里不对这些方法做深入讨论，感兴趣的读者可以参考\ref{section-7.8}节的内容，对相关文献进行查阅。
+\parinterval MERT最大的优点在于可以用于目标函数不可微、甚至不连续的情况。对于优化线性模型， MERT是一种很好的选择。但是，也有研究发现，简单使用MERT无法处理特征数量过多的情况。比如，用MERT优化10000个稀疏特征的权重时，优化效果可能会不理想，而且收敛速度慢。这时也可以考虑使用在线学习等技术对大量特征的权重进行调优，比较有代表性的方法包括MIRA\upcite{DBLP:conf/emnlp/ChiangMR08}和PRO\upcite{Hopkins2011Tuning}。由于篇幅所限，这里不对这些方法做深入讨论，感兴趣的读者可以参考\ref{section-7.8}节的内容，对相关文献进行查阅。
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
@@ -754,11 +753,11 @@ dr = start_i-end_{i-1}-1
 \parinterval 解码的目的是根据模型以及输入，找到模型得分最高的推导，即：
 \begin{eqnarray}
-\hat{d} = \arg\max_{d}\ \ \textrm{score}(d,\seq{t},\seq{s})
+\hat{d} = \arg\max_{d} \textrm{score}(d,\seq{t},\seq{s})
 \label{eq:7-21}
 \end{eqnarray}
-\parinterval 然而想要找到得分最高的翻译推导并不是一件简单的事情。对于每一句源语言句子，可能的翻译结果是指数级的。由于机器翻译解码是一个NP完全问题\upcite{knight1999decoding}，简单的暴力搜索显然不现实。因此，在机器翻译中会使用特殊的解码策略来确保搜索的效率。本节将介绍基于栈的自左向右解码方法。它是基于短语的模型中的经典解码方法，非常适于处理语言生成的各种任务。
+\parinterval 然而想要找到得分最高的翻译推导并不是一件简单的事情。对于每一句源语言句子，可能的翻译结果是指数级的。由于机器翻译解码是一个NP难问题\upcite{knight1999decoding}，简单的暴力搜索显然不现实。因此，在机器翻译中会使用特殊的解码策略来确保搜索的效率。本节将介绍基于栈的自左向右解码方法。它是基于短语的模型中的经典解码方法，非常适于处理语言生成的各种任务。
 \parinterval 首先，看一下翻译一个句子的基本流程。如图\ref{fig:7-26}所示，首先需要得到译文句子的第一个单词。在基于短语的模型中，可以从源语言端找出生成句首译文的短语，之后把译文放到目标语言端，例如，源语言的“有”对应的译文是“There is”。这个过程可以重复执行，直到生成完整句子的译文。但是，有两点需要注意：
@@ -774,7 +773,7 @@ dr = start_i-end_{i-1}-1
 \begin{figure}[htp]
 \centering
 \input{./Chapter7/Figures/figure-basic-process-of-translation}
-\caption{按目标语言短语自左向右生成的翻译实例}
+\caption{翻译的基本流程}
 \label{fig:7-26}
 \end{figure}
 %-------------------------------------------
@@ -787,10 +786,10 @@ dr = start_i-end_{i-1}-1
 \subsection{翻译候选匹配}
-\parinterval 在解码时，首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语，每个可能的译文也被称作{\small\bfnew{翻译候选}}\index{翻译候选}（Translation Candidate）\index{Translation Candidate}。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语，之后在短语表中找到相应的翻译即可。比如，图\ref{fig:7-27}展示了句子“桌子/上/有/一个/苹果”的翻译候选匹配结果。可以看到，不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的范围（被称为跨度）中。这里，跨度$[a,b]$表示从第$a+1$个词开始到第$b$个词为止所表示的词串。比如，“upon the table” 是短语“桌子/上/有”的翻译候选，即对应源语言跨度[0,3]。
+\parinterval 在解码时，首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语，每个可能的译文也被称作{\small\bfnew{翻译候选}}\index{翻译候选}（Translation Candidate）\index{Translation Candidate}。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语，之后在短语表中找到相应的翻译即可。比如，图\ref{fig:7-27}展示了句子“桌子\ 上\ 有\ 一个\ 苹果”的翻译候选匹配结果。可以看到，不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的跨度中。比如，“upon the table”是短语“桌子\ 上\ 有”的翻译候选，即对应源语言跨度[0,3]。
 %----------------------------------------------
-\begin{figure}[htp]
+\begin{figure}[t]
 \centering
 \input{./Chapter7/Figures/figure-translation-option}
 \caption{一个句子匹配的短语翻译候选}
@@ -829,7 +828,7 @@ dr = start_i-end_{i-1}-1
 \begin{itemize}
 \vspace{0.5em}
-\item 对相同译文的翻译假设进行重新组合；
+\item 对相同译文的翻译假设进行重新组合。
 \vspace{0.5em}
 \item 对低质量的翻译假设进行裁剪。
 \vspace{0.5em}
@@ -858,7 +857,7 @@ dr = start_i-end_{i-1}-1
 \parinterval 然而在实际处理中，并不需要“删掉”分数低的翻译假设，而是将它们与分数高的翻译假设连在了一起。对于搜索最优翻译，这些连接可能并没有什么作用，但是如果需要分数最高的前两个或前三个翻译，就可能需要用到这些连接。
-\parinterval 翻译假设的重组有效地减少了解码过程中相同或者相似翻译假设带来的冗余。因此这些方法在机器翻译中被广泛使用。包括{\chaptereight}将要介绍的基于句法的翻译模型解码中，也可以使用假设重组进行系统加速。
+\parinterval 翻译假设的重组有效地减少了解码过程中相同或者相似翻译假设带来的冗余。因此这些方法在机器翻译中被广泛使用。包括本章后面将要介绍的基于句法的翻译模型解码中，也可以使用假设重组进行系统加速。
 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
@@ -870,7 +869,7 @@ dr = start_i-end_{i-1}-1
 \begin{itemize}
 \vspace{0.5em}
-\item 删除的翻译假设可能会在后续的扩展过程中被重新搜索出来；
+\item 删除的翻译假设可能会在后续的扩展过程中被重新搜索出来。
 \vspace{0.5em}
 \item 过早地删除某些翻译假设可能会导致无法搜索到最优的翻译假设。
 \vspace{0.5em}
@@ -900,11 +899,23 @@ dr = start_i-end_{i-1}-1
 \sectionnewpage
 \section{小结及深入阅读}\label{section-7.8}
+\parinterval 统计机器翻译模型是近三十年内自然语言处理的重要里程碑之一。其统计建模的思想长期影响着自然语言处理的研究。无论是前面介绍的基于单词的模型，还是本章介绍的基于短语的模型，甚至后面即将介绍的基于句法的模型，大家都在尝试回答：究竟应该用什么样的知识对机器翻译进行统计建模？不过，这个问题至今还没有确定的答案。但是，显而易见，统计机器翻译为机器翻译的研究提供了一种范式，即让计算机用概率化的“知识”描述翻译问题。这些“知识”体现在统计模型的结构和参数中，并且可以从大量的双语和单语数据中自动学习。这种建模思想在今天的机器翻译研究中仍然随处可见。
+\parinterval 本章对统计机器翻译中的基于短语的模型进行了介绍。可以说，基于短语的模型是最成功的机器翻译模型之一。其结构简单，而且翻译速度快，因此也被大量应用于机器翻译产品及服务中。此外，包括判别式模型、最小错误率训练、短语抽取等经典问题都是源自基于短语的模型。可是，基于短语的模型所涉及的内容非常丰富，很难通过一章的内容进行面面俱到的介绍。还有很多方向值得读者进一步了解：
+\begin{itemize}
+\vspace{0.5em}
+\item 基于短语的机器翻译的想法很早就出现了，比如直接把机器翻译看作基于短语的生成问题\upcite{DBLP:conf/acl/OchW98,DBLP:phd/dnb/Och02,och2004alignment}，或者单独对短语翻译进行建模，之后集成到基于单词的模型中\upcite{DBLP:conf/acl/WangW98,DBLP:conf/acl/WatanabeSO03,DBLP:conf/acl/Marcu01}。现在，最通用的框架是Koehn等人提出的模型\upcite{koehn2003statistical}，与其类似的还有Zens等人的工作\upcite{DBLP:conf/ki/ZensON02,DBLP:conf/naacl/ZensN04}。这类模型把短语翻译分解为短语学习问题和解码问题。因此，在随后相当长一段时间里，如何获取双语短语也是机器翻译领域的热点。比如，一些团队研究如何直接从双语句对中学习短语翻译，而不是通过简单的启发性规则进行短语抽取\upcite{DBLP:conf/emnlp/MarcuW02,DBLP:conf/wmt/DeNeroGZK06}。也有研究者对短语边界的建模进行研究，以获得更高质量的短语，同时减小模型大小\upcite{german2011bilingual,DBLP:conf/coling/BlackwoodGB08,DBLP:conf/naacl/XiongZL10}。
+\vspace{0.5em}
+\item 调序是基于短语的模型中经典的问题之一。早期的模型都是词汇化的调序模型，这类模型把调序定义为短语之间的相对位置建模问题\upcite{DBLP:conf/naacl/Tillman04,DBLP:conf/naacl/KumarB05,DBLP:conf/acl/NagataSYO06}。后来，也有一些工作使用判别式模型来集成更多的调序特征\upcite{xiong2006maximum,DBLP:conf/wmt/ZensN06,DBLP:conf/naacl/GreenGM10,DBLP:conf/naacl/Cherry13}。实际上，除了基于短语的模型，调序也在基于句法的模型中被广泛讨论。因此，一些工作尝试将基于短语的调序模型集成到基于句法的机器翻译系统中\upcite{DBLP:conf/wmt/HuckWRN13,matthias2012discriminative,vinh2009improving,xiong2006maximum}。此外，也有研究者对不同的调序模型进行了系统化的对比和分析，可以作为相关研究的参考\upcite{DBLP:journals/coling/BisazzaF16}。与在机器翻译系统中集成调序模型不同，预调序（Pre-ordering）也是一种解决调序问题的思路\upcite{DBLP:conf/coling/XiaM04,DBLP:conf/acl/CollinsKK05,DBLP:conf/emnlp/WangCK07,DBLP:conf/ijcnlp/WuSDTN11}。机器翻译中的预调序是指将输入的源语言句子按目标语言的顺序进行排列，这样在翻译中就尽可能减少调序操作。这种方法大多依赖源语言的句法树进行调序的建模，不过它与机器翻译系统的耦合很小，因此很容易进行系统集成。
+\vspace{0.5em}
+\item 统计机器翻译中使用的栈解码方法源自Tillmann等人的工作\upcite{tillmann1997a}。这种方法在Pharaoh\upcite{DBLP:conf/amta/Koehn04}、Moses\upcite{Koehn2007Moses}等开源系统中被成功地应用，在机器翻译领域产生了很大的影响力。特别是，这种解码方法效率很高，因此在许多工业系统里也大量使用。对于栈解码也有很多改进工作，比如，早期的工作考虑剪枝或者限制调序范围以加快解码速度\upcite{DBLP:conf/acl/WangW97,DBLP:conf/coling/TillmannN00,DBLP:conf/iwslt/ShenDA06a,robert2007faster}。随后，也有研究工作从解码算法和语言模型集成方式的角度对这类方法进行改进\upcite{DBLP:conf/acl/HeafieldKM14,DBLP:conf/acl/WuebkerNZ12,DBLP:conf/iwslt/ZensN08}。
+\vspace{0.5em}
+\item 统计机器翻译的成功很大程度上来自判别式模型引入任意特征的能力。因此，在统计机器翻译时代，很多工作都集中在新特征的设计上。比如，可以基于不同的统计特征和先验知识设计翻译特征\upcite{och2004smorgasbord,Chiang200911,gildea2003loosely}，也可以模仿分类任务设计大规模的稀疏特征\upcite{chiang2008online}。另一方面，模型训练和特征权重调优也是统计机器翻译中的重要问题，除了最小错误率训练，还有很多方法，比如，最大似然估计\upcite{koehn2003statistical,DBLP:journals/coling/BrownPPM94}、判别式方法\upcite{Blunsom2008A}、贝叶斯方法\upcite{Blunsom2009A,Cohn2009A}、最小风险训练\upcite{smith2006minimum,li2009first}、基于Margin的方法\upcite{watanabe2007online,Chiang200911}以及基于排序模型的方法（PRO）\upcite{Hopkins2011Tuning,dreyer2015apro}。实际上，统计机器翻译的训练和解码也存在不一致的问题，比如，特征值由双语数据上的极大似然估计得到（没有剪枝），而解码时却使用束剪枝，而且模型的目标是最大化机器翻译评价指标。对于这个问题也可以通过调整训练的目标函数进行缓解\upcite{XiaoA,marcu2006practical}。
+\vspace{0.5em}
+\item 短语表是基于短语的系统中的重要模块。但是，简单地利用基于频次的方法估计得到的翻译概率无法很好地处理低频短语。这时就需要对短语表进行平滑\upcite{DBLP:conf/iwslt/ZensN08,DBLP:conf/emnlp/SchwenkCF07,boxing2011unpacking,DBLP:conf/coling/DuanSZ10}。另一方面，随着数据量的增长和抽取短语长度的增大，短语表的体积会急剧膨胀，这也大大增加了系统的存储消耗，同时过大的短语表也会带来短语查询效率的下降。针对这个问题，很多工作尝试对短语表进行压缩。一种思路是限制短语的长度\upcite{DBLP:conf/naacl/QuirkM06,DBLP:journals/coling/MarinoBCGLFC06}；另一种广泛使用的思路是使用一些指标或者分类器来对短语进行剪枝，其核心思想是判断每个短语的质量\upcite{DBLP:conf/emnlp/ZensSX12}，并过滤掉低质量的短语。代表性的方法有：基于假设检验的剪枝\upcite{DBLP:conf/emnlp/JohnsonMFK07}、基于熵的剪枝\upcite{DBLP:conf/emnlp/LingGTB12}、两阶段短语抽取方法\upcite{DBLP:conf/naacl/ZettlemoyerM07}、基于解码中短语使用频率的方法\upcite{DBLP:conf/naacl/EckVW07}等。此外，短语表的存储方式也是在实际使用中需要考虑的问题。因此，也有研究者尝试使用更加紧凑、高效的结构保存短语表。其中最具代表性的结构是后缀数组（Suffix Arrays），这种结构可以充分利用短语之间有重叠的性质，大幅减少了重复存储\upcite{DBLP:conf/acl/Callison-BurchBS05,DBLP:conf/naacl/ZensN07,2014Dynamic}。
+\vspace{0.5em}
+\end{itemize}

--- a/bibliography.bib
+++ b/bibliography.bib
@@ -1797,18 +1797,7 @@
  pages     = {79--85},
  year      = {1990}
 }
-@article{Peter1993The,
-  author    = {Peter F. Brown and
-               Stephen Della Pietra and
-               Vincent J. Della Pietra and
-               Robert L. Mercer},
-  title     = {The Mathematics of Statistical Machine Translation: Parameter Estimation},
-  journal   = {Computational Linguistics},
-  volume    = {19},
-  number    = {2},
-  pages     = {263--311},
-  year      = {1993}
-}
 @article{knight1999decoding,
  author    = {Kevin Knight},
  title     = {Decoding Complexity in Word-Replacement Translation Models},
@@ -1998,18 +1987,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%% chapter 6------------------------------------------------------
-@article{Peter1993The,
-  author    = {Peter F. Brown and
-               Stephen Della Pietra and
-               Vincent J. Della Pietra and
-               Robert L. Mercer},
-  title     = {The Mathematics of Statistical Machine Translation: Parameter Estimation},
-  journal   = {Computational Linguistics},
-  volume    = {19},
-  number    = {2},
-  pages     = {263--311},
-  year      = {1993}
-}
 @inproceedings{ittycheriah2005maximum,
  author    = {Abraham Ittycheriah and
               Salim Roukos},
@@ -2250,6 +2228,550 @@
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2003}
 }
+@inproceedings{DBLP:conf/acl/OchW98,
+  author    = {Franz Josef Och and
+               Hans Weber},
+  title     = {Improving Statistical Natural Language Translation with Categories
+               and Rules},
+  pages     = {985--989},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {1998}
+}
+@phdthesis{DBLP:phd/dnb/Och02,
+  author    = {Franz Josef Och},
+  title     = {Statistical machine translation: from single word models to alignment
+               templates},
+  school    = {{RWTH} Aachen University, Germany},
+  year      = {2002}
+}
+@inproceedings{DBLP:conf/acl/WangW98,
+  author    = {Ye{-}Yi Wang and
+               Alex Waibel},
+  title     = {Modeling with Structures in Statistical Machine Translation},
+  pages     = {1357--1363},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {1998}
+}
+@inproceedings{DBLP:conf/acl/WatanabeSO03,
+  author    = {Taro Watanabe and
+               Eiichiro Sumita and
+               Hiroshi G. Okuno},
+  title     = {Chunk-Based Statistical Translation},
+  pages     = {303--310},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2003}
+}
+@inproceedings{DBLP:conf/acl/Marcu01,
+  author    = {Daniel Marcu},
+  title     = {Towards a Unified Approach to Memory- and Statistical-Based Machine
+               Translation},
+  pages     = {378--385},
+  publisher = {Morgan Kaufmann Publishers},
+  year      = {2001}
+}
+@inproceedings{DBLP:conf/ki/ZensON02,
+  author    = {Richard Zens and
+               Franz Josef Och and
+               Hermann Ney},
+  title     = {Phrase-Based Statistical Machine Translation},
+  volume    = {2479},
+  pages     = {18--32},
+  publisher = {Springer},
+  year      = {2002}
+}
+@inproceedings{DBLP:conf/naacl/ZensN04,
+  author    = {Richard Zens and
+               Hermann Ney},
+  title     = {Improvements in Phrase-Based Statistical Machine Translation},
+  pages     = {257--264},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2004}
+}
+@inproceedings{DBLP:conf/emnlp/MarcuW02,
+  author    = {Daniel Marcu and
+               Daniel Wong},
+  title     = {A Phrase-Based, Joint Probability Model for Statistical Machine Translation},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
+  pages     = {133--139},
+  year      = {2002}
+}
+@inproceedings{DBLP:conf/wmt/DeNeroGZK06,
+  author    = {John DeNero and
+               Dan Gillick and
+               James Zhang and
+               Dan Klein},
+  title     = {Why Generative Phrase Models Underperform Surface Heuristics},
+  pages     = {31--38},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2006}
+}
+@inproceedings{german2011bilingual,
+  author    = {German Sanchis-Trilles and
+               Daniel Ortiz-Martinez and
+               Jesus Gonzalez-Rubio and
+               Jorge Gonzalez and
+			   Francisco Casacuberta},
+  title     = {Bilingual segmentation for phrasetable pruning in Statistical Machine Translation},
+  pages     = {257--264},
+  publisher = {Conference of the European Association for Machine Translation},
+  year      = {2011}
+}
+@inproceedings{DBLP:conf/coling/BlackwoodGB08,
+  author    = {Graeme W. Blackwood and
+               Adri{\`{a}} de Gispert and
+               William Byrne},
+  title     = {Phrasal Segmentation Models for Statistical Machine Translation},
+  publisher = {International Conference on Computational Linguistics},
+  pages     = {19--22},
+  year      = {2008}
+}
+@inproceedings{DBLP:conf/naacl/XiongZL10,
+  author    = {Deyi Xiong and
+               Min Zhang and
+               Haizhou Li},
+  title     = {Learning Translation Boundaries for Phrase-Based Decoding},
+  pages     = {136--144},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2010}
+}
+@inproceedings{DBLP:conf/naacl/Tillman04,
+  author    = {Christoph Tillman},
+  title     = {A Unigram Orientation Model for Statistical Machine Translation},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2004}
+}
+@inproceedings{DBLP:conf/acl/NagataSYO06,
+  author    = {Masaaki Nagata and
+               Kuniko Saito and
+               Kazuhide Yamamoto and
+               Kazuteru Ohashi},
+  title     = {A Clustered Global Phrase Reordering Model for Statistical Machine
+               Translation},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2006}
+}
+@inproceedings{DBLP:conf/wmt/ZensN06,
+  author    = {Richard Zens and
+               Hermann Ney},
+  title     = {Discriminative Reordering Models for Statistical Machine Translation},
+  pages     = {55--63},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2006}
+}
+@inproceedings{DBLP:conf/naacl/GreenGM10,
+  author    = {Spence Green and
+               Michel Galley and
+               Christopher D. Manning},
+  title     = {Improved Models of Distortion Cost for Statistical Machine Translation},
+  pages     = {867--875},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2010}
+}
+@inproceedings{DBLP:conf/naacl/Cherry13,
+  author    = {Colin Cherry},
+  title     = {Improved Reordering for Phrase-Based Translation using Sparse Features},
+  pages     = {22--31},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2013}
+}
+@inproceedings{DBLP:conf/wmt/HuckWRN13,
+  author    = {Matthias Huck and
+               Joern Wuebker and
+               Felix Rietig and
+               Hermann Ney},
+  title     = {A Phrase Orientation Model for Hierarchical Machine Translation},
+  pages     = {452--463},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2013}
+}
+@inproceedings{matthias2012discriminative,
+  author    = {Matthias Huck and 
+			   Stephan Peitz and 
+               Markus Freitag and 
+               Hermann Ney},
+  title     = {Discriminative Reordering Extensions for Hierarchical Phrase-Based Machine Translation},
+  publisher = {Conference of the European Association for Machine Translation},
+  year      = {2012}
+}
+@inproceedings{vinh2009improving,
+  author    = {Vinh Van Nguyen and 
+			   Akira Shimazu and 
+               Minh Le Nguyen and 
+               Thai Phuong Nguyen},
+  title     = {Improving a Lexicalized Hierarchical Reordering Model Using Maximum Entropy},
+  publisher = {MT Summit XII},
+  year      = {2009}
+}
+@article{DBLP:journals/coling/BisazzaF16,
+  author    = {Arianna Bisazza and
+               Marcello Federico},
+  title     = {A Survey of Word Reordering in Statistical Machine Translation: Computational
+               Models and Language Phenomena},
+  journal   = {Computational Linguistics},
+  volume    = {42},
+  number    = {2},
+  pages     = {163--205},
+  year      = {2016}
+}
+@inproceedings{DBLP:conf/coling/XiaM04,
+  author    = {Fei Xia and
+               Michael C. McCord},
+  title     = {Improving a Statistical {MT} System with Automatically Learned Rewrite
+               Patterns},
+  publisher = {International Conference on Computational Linguistics},
+  year      = {2004}
+}
+@inproceedings{DBLP:conf/acl/CollinsKK05,
+  author    = {Michael Collins and
+               Philipp Koehn and
+               Ivona Kucerova},
+  title     = {Clause Restructuring for Statistical Machine Translation},
+  pages     = {531--540},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2005}
+}
+@inproceedings{DBLP:conf/emnlp/WangCK07,
+  author    = {Chao Wang and
+               Michael Collins and
+               Philipp Koehn},
+  title     = {Chinese Syntactic Reordering for Statistical Machine Translation},
+  pages     = {737--745},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2007}
+}
+@inproceedings{DBLP:conf/ijcnlp/WuSDTN11,
+  author    = {Xianchao Wu and
+               Katsuhito Sudoh and
+               Kevin Duh and
+               Hajime Tsukada and
+               Masaaki Nagata},
+  title     = {Extracting Pre-ordering Rules from Predicate-Argument Structures},
+  pages     = {29--37},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2011}
+}
+@inproceedings{DBLP:conf/coling/TillmannN00,
+  author    = {Christoph Tillmann and
+               Hermann Ney},
+  title     = {Word Re-ordering and DP-based Search in Statistical Machine Translation},
+  pages     = {850--856},
+  publisher = {Morgan Kaufmann},
+  year      = {2000}
+}
+@inproceedings{DBLP:conf/iwslt/ShenDA06a,
+  author    = {Wade Shen and
+               Brian Delaney and
+               Timothy R. Anderson},
+  title     = {An efficient graph search decoder for phrase-based statistical machine
+               translation},
+  pages     = {197--204},
+  publisher = {International Workshop on Spoken Language Translation},
+  year      = {2006}
+}
+@inproceedings{robert2007faster,
+  author    = {Robert C Moore and
+               Chris Quirk},
+  title     = {Faster Beam-Search Decoding for Phrasal Statistical Machine Translation},
+  publisher = {MT Summit XI},
+  year      = {2007}
+}
+@inproceedings{DBLP:conf/acl/HeafieldKM14,
+  author    = {Kenneth Heafield and
+               Michael Kayser and
+               Christopher D. Manning},
+  title     = {Faster Phrase-Based Decoding by Refining Feature State},
+  pages     = {130--135},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2014}
+}
+@inproceedings{DBLP:conf/acl/WuebkerNZ12,
+  author    = {Joern Wuebker and
+               Hermann Ney and
+               Richard Zens},
+  title     = {Fast and Scalable Decoding with Language Model Look-Ahead for Phrase-based
+               Statistical Machine Translation},
+  pages     = {28--32},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2012}
+}
+@inproceedings{DBLP:conf/iwslt/ZensN08,
+  author    = {Richard Zens and
+               Hermann Ney},
+  title     = {Improvements in dynamic programming beam search for phrase-based statistical
+               machine translation},
+  pages     = {198--205},
+  publisher = {International Workshop on Spoken Language Translation},
+  year      = {2008}
+}
+@inproceedings{och2004smorgasbord,
+  author    = {Franz Josef Och and
+               Daniel Gildea and
+               Sanjeev Khudanpur and
+               Anoop Sarkar and
+               Kenji Yamada and
+               Alexander M. Fraser and
+               Shankar Kumar and
+               Libin Shen and
+               David Smith and
+               Katherine Eng and
+               Viren Jain and
+               Zhen Jin and
+               Dragomir R. Radev},
+  title     = {A Smorgasbord of Features for Statistical Machine Translation},
+  pages     = {161--168},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2004}
+}
+@inproceedings{Chiang200911,
+  author    = {David Chiang and
+               Kevin Knight and
+               Wei Wang},
+  title     = {11,001 New Features for Statistical Machine Translation},
+  pages     = {218--226},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2009}
+}
+@inproceedings{gildea2003loosely,
+  author    = {Daniel Gildea},
+  title     = {Loosely Tree-Based Alignment for Machine Translation},
+  pages     = {80--87},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2003}
+}
+@inproceedings{chiang2008online,
+  author    = {David Chiang and
+               Yuval Marton and
+               Philip Resnik},
+  title     = {Online Large-Margin Training of Syntactic and Structural Translation
+               Features},
+  pages     = {224--233},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2008}
+}
+@inproceedings{Blunsom2008A,
+  author    = {Phil Blunsom and
+               Trevor Cohn and
+               Miles Osborne},
+  title     = {A Discriminative Latent Variable Model for Statistical Machine Translation},
+  pages     = {200--208},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2008}
+}
+@inproceedings{Blunsom2009A,
+  author    = {Phil Blunsom and
+               Trevor Cohn and
+               Chris Dyer and
+               Miles Osborne},
+  title     = {A Gibbs Sampler for Phrasal Synchronous Grammar Induction},
+  pages     = {782--790},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2009}
+}
+@inproceedings{Cohn2009A,
+  author    = {Trevor Cohn and
+               Phil Blunsom},
+  title     = {A Bayesian Model of Syntax-Directed Tree to String Grammar Induction},
+  pages     = {352--361},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2009}
+}
+@inproceedings{smith2006minimum,
+  author    = {David A. Smith and
+               Jason Eisner},
+  title     = {Minimum Risk Annealing for Training Log-Linear Models},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2006}
+}
+@inproceedings{li2009first,
+  author    = {Zhifei Li and
+               Jason Eisner},
+  title     = {First- and Second-Order Expectation Semirings with Applications to
+               Minimum-Risk Training on Translation Forests},
+  pages     = {40--51},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2009}
+}
+@inproceedings{watanabe2007online,
+  author    = {Taro Watanabe and
+               Jun Suzuki and
+               Hajime Tsukada and
+               Hideki Isozaki},
+  title     = {Online Large-Margin Training for Statistical Machine Translation},
+  pages     = {764--773},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2007},
+}
+@inproceedings{dreyer2015apro,
+  author    = {Markus Dreyer and
+               Yuanzhe Dong},
+  title     = {{APRO:} All-Pairs Ranking Optimization for {MT} Tuning},
+  pages     = {1018--1023},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2015}
+}
+@article{XiaoA,
+  author    = {Tong Xiao and
+               Derek F. Wong and
+               Jingbo Zhu},
+  title     = {A Loss-Augmented Approach to Training Syntactic Machine Translation
+               Systems},
+  journal   = {IEEE Transactions on Audio, Speech, and Language Processing},
+  volume    = {24},
+  number    = {11},
+  pages     = {2069--2083},
+  year      = {2016}
+}
+@phdthesis{marcu2006practical,
+	title={Practical structured learning techniques for natural language processing},
+	author={Harold Charles {Daum{\'{e}} III}},
+	school={University of Southern California, Los Angeles, CA},
+	year={2006}
+}
+% NOTE: a second, byte-identical copy of the entry DBLP:conf/iwslt/ZensN08 stood here and was dropped; the entry is defined once earlier in this file.
+@inproceedings{DBLP:conf/emnlp/SchwenkCF07,
+  author    = {Holger Schwenk and
+               Marta R. Costa{-}juss{\`{a}} and
+               Jos{\'{e}} A. R. Fonollosa},
+  title     = {Smooth Bilingual N-Gram Translation},
+  pages     = {430--438},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2007}
+}
+@inproceedings{boxing2011unpacking,
+  author    = {Boxing Chen and
+               Roland Kuhn and
+               George Foster and
+			   Howard Johnson},
+  title     = {Unpacking and Transforming Feature Functions: New Ways to Smooth Phrase Tables},
+  publisher = {MT Summit},
+  year      = {2011}
+}
+@inproceedings{DBLP:conf/coling/DuanSZ10,
+  author    = {Nan Duan and
+               Hong Sun and
+               Ming Zhou},
+  title     = {Translation Model Generalization using Probability Averaging for Machine
+               Translation},
+  publisher = {Tsinghua University Press},
+  year      = {2010}
+}
+@inproceedings{DBLP:conf/naacl/QuirkM06,
+  author    = {Christopher Quirk and
+               Arul Menezes},
+  title     = {Do we need phrases? Challenging the conventional wisdom in Statistical
+               Machine Translation},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2006}
+}
+@article{DBLP:journals/coling/MarinoBCGLFC06,
+  author    = {Jos{\'{e}} B. Mari{\~{n}}o and
+               Rafael E. Banchs and
+               Josep Maria Crego and
+               Adri{\`{a}} de Gispert and
+               Patrik Lambert and
+               Jos{\'{e}} A. R. Fonollosa and
+               Marta R. Costa{-}juss{\`{a}}},
+  title     = {\emph{N}-gram-based Machine Translation},
+  journal   = {Computational Linguistics},
+  volume    = {32},
+  number    = {4},
+  pages     = {527--549},
+  year      = {2006}
+}
+@inproceedings{DBLP:conf/emnlp/ZensSX12,
+  author    = {Richard Zens and
+               Daisy Stanton and
+               Peng Xu},
+  title     = {A Systematic Comparison of Phrase Table Pruning Techniques},
+  pages     = {972--983},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2012}
+}
+@inproceedings{DBLP:conf/emnlp/JohnsonMFK07,
+  author    = {Howard Johnson and
+               Joel D. Martin and
+               George F. Foster and
+               Roland Kuhn},
+  title     = {Improving Translation Quality by Discarding Most of the Phrasetable},
+  pages     = {967--975},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2007}
+}
+@inproceedings{DBLP:conf/emnlp/LingGTB12,
+  author    = {Wang Ling and
+               Jo{\~{a}}o Gra{\c{c}}a and
+               Isabel Trancoso and
+               Alan W. Black},
+  title     = {Entropy-based Pruning for Phrase-based Machine Translation},
+  pages     = {962--971},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2012}
+}
+@inproceedings{DBLP:conf/naacl/ZettlemoyerM07,
+  author    = {Luke S. Zettlemoyer and
+               Robert Moore},
+  title     = {Selective Phrase Pair Extraction for Improved Statistical Machine
+               Translation},
+  pages     = {209--212},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2007}
+}
+@inproceedings{DBLP:conf/naacl/EckVW07,
+  author    = {Matthias Eck and
+               Stephan Vogel and
+               Alex Waibel},
+  title     = {Translation Model Pruning via Usage Statistics for Statistical Machine
+               Translation},
+  pages     = {21--24},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2007},
+}
+@inproceedings{DBLP:conf/acl/Callison-BurchBS05,
+  author    = {Chris Callison{-}Burch and
+               Colin J. Bannard and
+               Josh Schroeder},
+  title     = {Scaling Phrase-Based Statistical Machine Translation to Larger Corpora
+               and Longer Phrases},
+  pages     = {255--262},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2005}
+}
+@inproceedings{mcnamee2006translation,
+  author    = {Paul McNamee and James Mayfield},
+  title     = {Translation of Multiword Expressions Using Parallel Suffix Arrays},
+  publisher = {Association for Machine Translation in the Americas},
+  year      = {2006}
+}
+@inproceedings{DBLP:conf/naacl/ZensN07,
+  author    = {Richard Zens and
+               Hermann Ney},
+  title     = {Efficient Phrase-Table Representation for Machine Translation with
+               Applications to Online {MT} and Speech Translation},
+  pages     = {492--499},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2007}
+}
+@inproceedings{2014Dynamic,
+  title={Dynamic Phrase Tables for Machine Translation in an Interactive Post-editing Scenario},
+  author={Germann, Ulrich},
+  publisher = {Association for Machine Translation in the Americas},
+  year={2014},
+}
 %%%%% chapter 7------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%