update and add index

1867cd53 · 曹润柘 · fb496768 · 1867cd53 · 1867cd53 · 1867cd53
Commit 1867cd53 authored Mar 27, 2020 by 曹润柘
--- a/Book/Chapter1/Figures/figure-Example-RBMT.tex
+++ b/Book/Chapter1/Figures/figure-Example-RBMT.tex
@@ -77,7 +77,7 @@
 \draw[->,thick,ublue] ([xshift=-0em]be1.south)..controls +(south:0.3) and +(north:0.4)..([xshift=1.6em,yshift=-0.2em]t15final.north west);
 }
 {
-\draw[->,dotted,thick,red] ([xshift=-0.2em]rule5part2.east)..controls +(east:1.5) and +( west:1.5)..(t15final.west) node[pos=0.9,below,xshift=0.3em,yshift=0.0em] (applyr6) {\color{red}{\tiny{\textbf{规则5}}}};
+\draw[->,dotted,thick,red] ([xshift=-0.2em]rule5part2.east)..controls +(east:1.5) and +( west:1.5)..(t15final.west) node[pos=0.9,below,xshift=0.3em,yshift=0.0em] (applyr6) {\color{red}{\tiny\sffamily\bfseries{规则5}}};
 }
 \end{tikzpicture}

--- a/Book/Chapter1/Figures/figure-Example-SMT.tex
+++ b/Book/Chapter1/Figures/figure-Example-SMT.tex
@@ -96,11 +96,11 @@
 }
 {
-\draw[->,thick,ublue] (bidata.east)--([xshift=2.7em]bidata.east) node[pos=0.5,above] (simexample) {\color{red}{\scriptsize{\textbf{学习}}}};
+\draw[->,thick,ublue] (bidata.east)--([xshift=2.2em]bidata.east) node[pos=0.5,above] (simexample) {\color{red}{\scriptsize{\scriptsize\sffamily\bfseries{学习}}}};
 }
 {
-\draw[->,thick,ublue] (monodata.east)--([xshift=2.7em]monodata.east) node[pos=0.5,above] (simexample) {\color{red}{\scriptsize{\textbf{学习}}}};
+\draw[->,thick,ublue] (monodata.east)--([xshift=2.7em]monodata.east) node[pos=0.5,above] (simexample) {\color{red}{\scriptsize{\scriptsize\sffamily\bfseries{学习}}}};
 }
 \begin{scope}[xshift=3.6in]
@@ -135,16 +135,16 @@ You and me & {0.02}\\
 }
 {
-\draw[->,thick,double,ublue] (decoder.north) -- ([yshift=2.2em]decoder.north) node[pos=0.5,right] (decodinglabel) {\color{red}{\tiny{\textbf{枚举所有可能}}}};
+\draw[->,thick,double,ublue] (decoder.north) -- ([yshift=2.2em]decoder.north) node[pos=0.5,right] (decodinglabel) {\color{red}{\tiny\sffamily\bfseries{枚举所有可能}}};
 }
 {
-\draw[->,thick,double,ublue] (decoder.east) .. controls +(east:3.5em) .. ([xshift=3.5em,yshift=3.0em]decoder.east) node[xshift=0.5em,pos=0.3,below] (decodinglabel) {\color{red}{\tiny{\textbf{计算翻译可能性}}}};
+\draw[->,thick,double,ublue] (decoder.east) .. controls +(east:3.5em) .. ([xshift=3.5em,yshift=3.0em]decoder.east) node[xshift=0.5em,pos=0.3,below] (decodinglabel) {\color{red}{\tiny\sffamily\bfseries{计算翻译可能性}}};
 }
 {
 \node[anchor=west,draw,thick,red,minimum width=11.5em,minimum height=1em] (outputlabel) at ([xshift=-0.3em,yshift=-6.1em]srcsentence.south west){};
-\node[anchor=west] (outputlabel2) at ([xshift=-0.3em]outputlabel.east) {\color{red}{\tiny{\textbf{输出}}}};
+\node[anchor=west] (outputlabel2) at ([xshift=-0.3em]outputlabel.east) {\color{red}{\tiny\sffamily\bfseries{输出}}};
 }
 }

--- a/Book/Chapter1/Figures/figure-results-zh-to-en-news-field-translation.tex
+++ b/Book/Chapter1/Figures/figure-results-zh-to-en-news-field-translation.tex
@@ -8,15 +8,15 @@
 {
 \begin{tikzpicture}
-\node [anchor=south west, fill=blue!50, minimum width=1.1cm, minimum height=2.3cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
+\node [anchor=south west, fill=blue!50, minimum width=1.1cm, minimum height=2.3cm] (mt) at (1,0) {{\color{white} {\small\sffamily\bfseries{机器}}}};
-\node [anchor=south west, fill=red!50, minimum width=1.1cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{人}}};
+\node [anchor=south west, fill=red!50, minimum width=1.1cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} {\small\sffamily\bfseries{人}}}};
 \node [anchor=south] (mtscore) at (mt.north) {3.9};
 \node [anchor=south] (humanscore) at (human.north) {4.7};
 \draw [->,thick] ([xshift=-0.5cm]mt.south west) -- ([xshift=0.5cm]human.south east);
 \draw [->,thick] ([xshift=-0.5cm]mt.south west) -- ([xshift=-0.5cm,yshift=3.2cm]mt.south west);
-\node [anchor=south west, fill=blue!50, minimum width=1.1cm, minimum height=1.5cm] (mt1) at ([xshift=13.0em,yshift=-3.0em]mt.east) {{\color{white} \textbf{机器}}};
+\node [anchor=south west, fill=blue!50, minimum width=1.1cm, minimum height=1.5cm] (mt1) at ([xshift=13.0em,yshift=-3.0em]mt.east) {{\color{white} {\small\sffamily\bfseries{机器}}}};
-\node [anchor=south west, fill=red!50, minimum width=1.1cm, minimum height=2.7cm] (human1) at ([xshift=0.5cm]mt1.south east) {{\color{white} \textbf{人}}};
+\node [anchor=south west, fill=red!50, minimum width=1.1cm, minimum height=2.7cm] (human1) at ([xshift=0.5cm]mt1.south east) {{\color{white} {\small\sffamily\bfseries{人}}}};
 \node [anchor=south] (mtscore1) at (mt1.north) {47\%};
 \node [anchor=south] (humanscore1) at (human1.north) {100\%};
 \draw [->,thick] ([xshift=-0.5cm]mt1.south west) -- ([xshift=0.5cm]human1.south east);

--- a/Book/Chapter1/Figures/figure-zh-sentences-into-en-sentences.tex
+++ b/Book/Chapter1/Figures/figure-zh-sentences-into-en-sentences.tex
@@ -77,22 +77,22 @@
 \end{scope}
 {
-\draw[<->,dotted,thick,red] (example2.east)..controls +(east:0.6) and +(west:0.6)..(c1.west) node[pos=0.9,left,xshift=0.6em,yshift=0.4em] (simexample) {\color{red}{\tiny{\textbf{相似实例}}}};
+\draw[<->,dotted,thick,red] (example2.east)..controls +(east:0.6) and +(west:0.6)..(c1.west) node[pos=0.9,left,xshift=0.6em,yshift=0.4em] (simexample) {\color{red}{\tiny\sffamily\bfseries{相似实例}}};
 \draw[<->,dotted,thick,red] ([xshift=-0.2em]example2part2.east)..controls +(east:0.5) and +(west:0.5)..(e1.west);
 }
 {
-\draw[<->,dotted,thick,ublue] (w3.south)--(c3.north) node[pos=0.5,right] (mismatch1) {\color{red}{\tiny{\textbf{不匹配}}}};
+\draw[<->,dotted,thick,ublue] (w3.south)--(c3.north) node[pos=0.5,right] (mismatch1) {\color{red}{\tiny\sffamily\bfseries{不匹配}}};
 \draw[dotted,thick,ublue] ([xshift=-0.6em,yshift=0.0em]c3.south east)--([xshift=0.8em,yshift=0.0em]e5.north west);
 }
 {
-\draw[<->,dotted,thick,ublue] (w5.south)--(c5.north) node[pos=0.5,right] (mismatch1) {\color{red}{\tiny{\textbf{不匹配}}}};
+\draw[<->,dotted,thick,ublue] (w5.south)--(c5.north) node[pos=0.5,right] (mismatch1) {\color{red}{\tiny\sffamily\bfseries{不匹配}}};
 \draw[dotted,thick,ublue] ([xshift=0.8em,yshift=0.0em]c5.south west)--([xshift=0.0em,yshift=-0.2em]e3.north);
 }
 {
-\draw[double,->,thick,ublue] (e3.south)--([yshift=-1.2em]e3.south) node[pos=0.5,right,xshift=0.2em,yshift=0.2em] (step1) {\color{red}{\tiny{\textbf{用`你'替换`他'}}}};
+\draw[double,->,thick,ublue] (e3.south)--([yshift=-1.2em]e3.south) node[pos=0.5,right,xshift=0.2em,yshift=0.2em] (step1) {\color{red}{\tiny\sffamily\bfseries{用`你'替换`他'}}};
 \draw[->,dotted,thick,red] ([xshift=-0.1em]entry2.east)..controls +(east:4) and +(west:4)..([yshift=-0.6em,xshift=-0.5em]e3.south) ;
 }
@@ -101,7 +101,7 @@
 {\footnotesize
 \node [anchor=north west,inner sep=1mm] (c1) at (0,0) {我};
 \node [anchor=north west,inner sep=1mm] (c2) at ([xshift=0.3em]c1.north east) {对};
-\node [anchor=north west,inner sep=1mm] (c3) at ([xshift=0.3em]c2.north east) {\textbf{{\color{ublue} 你}}};
+\node [anchor=north west,inner sep=1mm] (c3) at ([xshift=0.3em]c2.north east) {\footnotesize\sffamily\bfseries{{\color{ublue} 你}}};
 \node [anchor=north west,inner sep=1mm] (c4) at ([xshift=0.3em]c3.north east) {感到};
 \node [anchor=north west,inner sep=1mm] (c5) at ([xshift=0.3em]c4.north east) {高兴};
 }
@@ -122,7 +122,7 @@
 }
 {
-\draw[double,->,thick,ublue] (e3.south)--([yshift=-1.2em]e3.south) node[pos=0.5,right,xshift=0.2em,yshift=0.2em] (step1) {\color{red}{\tiny{\textbf{用`满意'替换`高兴'}}}};
+\draw[double,->,thick,ublue] (e3.south)--([yshift=-1.2em]e3.south) node[pos=0.5,right,xshift=0.2em,yshift=0.2em] (step1) {\color{red}{\tiny\sffamily\bfseries{用`满意'替换`高兴'}}};
 \draw[->,dotted,thick,red] ([xshift=-1.2em,yshift=-0.6em]entry3.north east)..controls +(east:2) and +(west:3)..([yshift=-0.6em,xshift=-0.5em]e3.south) ;
 }
@@ -133,7 +133,7 @@
 \node [anchor=north west,inner sep=1mm] (c2) at ([xshift=0.3em]c1.north east) {对};
 \node [anchor=north west,inner sep=1mm] (c3) at ([xshift=0.3em]c2.north east) {你};
 \node [anchor=north west,inner sep=1mm] (c4) at ([xshift=0.3em]c3.north east) {感到};
-\node [anchor=north west,inner sep=1mm] (c5) at ([xshift=0.3em]c4.north east) {\textbf{{\color{ublue} 满意}}};
+\node [anchor=north west,inner sep=1mm] (c5) at ([xshift=0.3em]c4.north east) {\footnotesize\sffamily\bfseries{{\color{ublue} 满意}}};
 }
 \end{scope}
@@ -154,7 +154,7 @@
 \begin{pgfonlayer}{background}
 {\footnotesize
 \node[rectangle,draw=red,thick,inner sep=0mm] [fit = (e1) (e2) (e3) (e4) (e5)] {};
-\node[anchor=north] (outputlabel) at (e3.south) {\small{\textbf{\color{red}{输出翻译结果}}}};
+\node[anchor=north] (outputlabel) at (e3.south) {\footnotesize\sffamily\bfseries{\color{red}{输出翻译结果}}};
 }
 \end{pgfonlayer}

--- a/Book/Chapter1/chapter1.tex
+++ b/Book/Chapter1/chapter1.tex
 % !Mode:: "TeX:UTF-8"
 % !TEX encoding = UTF-8 Unicode
+\part{统计建模基础}
 %----------------------------------------------------------------------------------------
 %	CHAPTER 1
 %----------------------------------------------------------------------------------------
@@ -270,8 +270,8 @@
 \caption{不同机器翻译的对比}
 \label{tab:comparison-of-different-MT}
 \begin{tabular}{l | l l l l}
-& \parbox{3.8em}{\textbf{规则}} & \parbox{3.8em}{\textbf{实例}} & \parbox{3.8em}{\textbf{统计}} & \parbox{3.8em}
+& \parbox{3.8em}{规则} & \parbox{3.8em}{实例} & \parbox{3.8em}{统计} & \parbox{3.8em}
-{\textbf{神经}} \\
+{神经} \\
 \hline
 人工写规则 & 是 & 否 & 否 & 否\\
@@ -337,7 +337,7 @@
 \begin{example}
 Candidate：the the the the
-\qquad \qquad \quad Reference：The cat is standing on the ground
+\qquad \qquad \ \  Reference：The cat is standing on the ground
 \end{example}
 \parinterval 在引入截断方式之前，该译文的1-gram准确率为4/4=1，这显然是不合理的。在引入截断的方式之后，the在译文中出现4次，在参考译文中出现2次，截断操作则是取二者的最小值，即$\textrm{Count}_{\textrm{hit}}= 2$，$\textrm{Count}_{\textrm{output}}= 4$，该译文的1-gram准确率为2/4。
@@ -379,7 +379,7 @@ e^{(1-\frac{r}{c})}& c<r
 \begin{example}
 Candidate：cat is standing in the ground
-\qquad \qquad \quad Reference：The cat is standing on the ground
+\qquad \qquad \ \  Reference：The cat is standing on the ground
 \end{example}
 \parinterval 将Candidate转换为Reference，需要进行一次增加操作，在句首增加The，一次替换操作，将in替换为on，所以$\textrm{edit}(c,r) = 2$，归一化因子$l$为Reference的长度7，所以该机器译文的TER错误率为2/7。
@@ -395,29 +395,29 @@ Candidate：cat is standing in the ground
 \begin{example}
 They got up at six this morning.
-\qquad \qquad \quad 他们今天早晨六点钟起床。
+\qquad \qquad \ \  他们今天早晨六点钟起床。
-\qquad \qquad \quad 检测点：时间词的顺序。
+\qquad \qquad \ \  检测点：时间词的顺序。
 \end{example}
 \begin{example}
 There are nine cows on the farm.
-\qquad \qquad \quad 农场里有九头牛。
+\qquad \qquad \ \  农场里有九头牛。
-\qquad \qquad \quad 检测点：量词``头''
+\qquad \qquad \ \  检测点：量词``头''
 \end{example}
 \begin{example}
 His house is on the south bank of the river.
-\qquad \qquad \quad 他的房子在河的南岸。
+\qquad \qquad \ \  他的房子在河的南岸。
-\qquad \qquad \quad We keep our money in a bank.
+\qquad \qquad \ \  We keep our money in a bank.
-\qquad \qquad \quad 我们在一家银行存钱。
+\qquad \qquad \ \  我们在一家银行存钱。
-\qquad \qquad \quad 检测点：bank的多义翻译
+\qquad \qquad \ \  检测点：bank的多义翻译
 \end{example}
 \parinterval 基于检测点的评价方法的意义在于，它并不是简单给出一个分数，而是帮助系统研发人员定位问题。因此这类方法更多的使用在对机器翻译的结果进行分析上，是对BLEU等整体评价指标的一种很好的补充。
@@ -499,15 +499,15 @@ His house is on the south bank of the river.
 \vspace{0.5em}
 \item SAMT：SAMT\cite{zollmann2007the}是由卡内基梅隆大学机器翻译团队开发的语法增强的统计机器翻译系统。SAMT在解码的时候使用目标树来生成翻译规则，而不严格遵守目标语言的语法。SAMT 的一个亮点是它提供了简单但高效的方式来在机器翻译中利用句法信息。由于SAMT在hadoop中实现，它可受益于跨计算机群的大数据集的分布式处理。网址：\url{http://www.cs.cmu.edu/zollmann/samt/}
 \vspace{0.5em}
-\item cdec：cdec\cite{Dyer2010cdec}是一个强大的解码器，是由Chris Dyer 和他的合作者们一起开发。cdec的主要的功能是它使用了翻译模型的一个统一的内部表示，并为实验结构预测问题的各种模型和算法提供了框架。所以，cdec也可以在被用来做一个对齐系统或者一个更通用的学习框架。此外，cdec由于使用高效的C++语言编写，运行速度较快。网址：\url{http://cdec-decoder.org/index.php?title=MainPage}
+\item cdec：cdec\cite{dyer2010cdec}是一个强大的解码器，是由Chris Dyer 和他的合作者们一起开发。cdec的主要的功能是它使用了翻译模型的一个统一的内部表示，并为实验结构预测问题的各种模型和算法提供了框架。所以，cdec也可以被用来做一个对齐系统或者一个更通用的学习框架。此外，cdec由于使用高效的C++语言编写，运行速度较快。网址：\url{http://cdec-decoder.org/index.php?title=MainPage}
 \vspace{0.5em}
 \item Phrasal：Phrasal\cite{Cer2010Phrasal}是由斯坦福自然语言处理小组开发的系统。除了传统的基于短语的模型，Phrasal还支持了基于非层次短语的模型，这种模型将基于短语的翻译延伸到非连续的短语翻译(phrasal discontinues translation)，增加了模型的泛化能力。网址：\url{http://nlp.stanford.edu/phrasal/}
 \vspace{0.5em}
-\item Jane：Jane\cite{VilarJane}是一个基于短语和基于层次短语的机器翻译系统，由亚琛工业大学的人类语言技术与模式识别小组开发。Jane提供了系统融合模块，因此可以非常方便的对多个系统进行融合。网址：\url{http://www-i6.informatik.rwth-aachen.de/jane/}
+\item Jane：Jane\cite{vilar2012jane}是一个基于短语和基于层次短语的机器翻译系统，由亚琛工业大学的人类语言技术与模式识别小组开发。Jane提供了系统融合模块，因此可以非常方便的对多个系统进行融合。网址：\url{https://www-i6.informatik.rwth-aachen.de/jane/}
 \vspace{0.5em}
-\item GIZA++：GIZA++\cite{Junczysdowmunt2012SyMGiza}是Franz Och研发的用于训练IBM模型1-5和HMM单词对齐模型的工具包。在早期，GIZA++是所有统计机器翻译系统中词对齐的标配工具。网址：\url{https://github.com/moses-smt/giza-pp}
+\item GIZA++：GIZA++\cite{och2003systematic}是Franz Och研发的用于训练IBM模型1-5和HMM单词对齐模型的工具包。在早期，GIZA++是所有统计机器翻译系统中词对齐的标配工具。网址：\url{https://github.com/moses-smt/giza-pp}
 \vspace{0.5em}
-\item HiFST：HiFST\cite{pino2010the}是剑桥大学开发的统计机器翻译系统。该系统完全基于有限状态自动机实现，因此非常适合对搜索空间进行有效的表示。网址：\\ \url{http://ucam-smt.github.io/}
+\item HiFST：HiFST\cite{iglesias2009hierarchical}是剑桥大学开发的统计机器翻译系统。该系统完全基于有限状态自动机实现，因此非常适合对搜索空间进行有效的表示。网址：\url{http://ucam-smt.github.io/}
 \vspace{0.5em}
 \item FastAlign：FastAlign\cite{dyer2013a}是一个快速，无监督的词对齐工具，由卡内基梅隆大学开发。网址：\url{https://github.com/clab/fast\_align}
 \end{itemize}
@@ -517,7 +517,7 @@ His house is on the south bank of the river.
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \vspace{0.5em}
 \begin{itemize}
-\item GroundHog：GroundHog\cite{bahdanau2015neural}基于Theano框架，由蒙特利尔大学LISA 实验室使用Python语言编写的一个框架，旨在提供灵活而高效的方式来实现复杂的循环神经网络模型。它提供了包括LSTM在内的多种模型。Bahdanau等人在此框架上又编写了GroundHog神经机器翻译系统。该系统被当作很多论文的基线系统。网址：\url{https://github.com/lisa-groundhog/GroundHog}
+\item GroundHog：GroundHog\cite{bahdanau2015neural}基于Theano\cite{al2016theano}框架，由蒙特利尔大学LISA 实验室使用Python语言编写的一个框架，旨在提供灵活而高效的方式来实现复杂的循环神经网络模型。它提供了包括LSTM在内的多种模型。Bahdanau等人在此框架上又编写了GroundHog神经机器翻译系统。该系统被当作很多论文的基线系统。网址：\url{https://github.com/lisa-groundhog/GroundHog}
 \vspace{0.5em}
 \item Nematus：Nematus\cite{SennrichNematus}是英国爱丁堡大学开发的，基于Theano框架的神经机器翻译系统。该系统使用GRU作为隐层单元，支持多层网络。Nematus 编码端有正向和反向的编码方式，可以同时提取源语句子中的上下文信息。该系统的一个优点是，它可以支持输入端有多个特征的输入（例如词的词性等）。网址：\url{https://github.com/EdinburghNLP/nematus}
 \vspace{0.5em}
@@ -525,7 +525,7 @@ His house is on the south bank of the river.
 \vspace{0.5em}
 \item Fairseq：Fairseq\cite{Ottfairseq}是由Facebook开发的，基于PyTorch框架的用以解决序列到序列问题的工具包，其中包括基于卷积神经网络、基于循环神经网络、基于Transformer的模型等。Fairseq是当今使用最广泛的神经机器翻译开源系统之一。网址：\url{https://github.com/facebookresearch/fairseq}
 \vspace{0.5em}
-\item Tensor2Tensor：Tensor2Tensor\cite{VaswaniTensor2Tensor}是由谷歌推出的，基于TensorFlow框架的开源系统。该系统基于Transformer模型，因此可以支持大多数序列到序列任务。得益于Transformer 的网络结构，系统的训练速度较快。现在，Tensor2Te-\\nsor也是机器翻译翻译领域广泛使用的开源系统之一。网址：\url{https://github.com/tensorflow/tensor2tensor}
+\item Tensor2Tensor：Tensor2Tensor\cite{VaswaniTensor2Tensor}是由谷歌推出的，基于TensorFlow框架的开源系统。该系统基于Transformer模型，因此可以支持大多数序列到序列任务。得益于Transformer 的网络结构，系统的训练速度较快。现在，Tensor2Tensor也是机器翻译领域广泛使用的开源系统之一。网址：\url{https://github.com/tensorflow/tensor2tensor}
 \vspace{0.5em}
 \item OpenNMT：OpenNMT\cite{KleinOpenNMT}系统是由哈佛大学自然语言处理研究组开源的，基于Torch框架的神经机器翻译系统。OpenNMT系统的早期版本使用Lua 语言编写，现在也扩展到了TensorFlow和PyTorch，设计简单易用，易于扩展，同时保持效率和翻译精度。网址：\url{https://github.com/OpenNMT/OpenNMT}
 \vspace{0.5em}
@@ -570,27 +570,27 @@ His house is on the south bank of the river.
 \parinterval 以上机器翻译评测各自有不同的特点，NIST最近几年更加关注稀缺资源翻译相关问题；NTCIR 在评估方式上纳入了时间、多语种评估等手段；WAT倾向于评测亚洲相关语言的翻译；CCMT以汉语为核心，并支持国内许多少数民族语言；WMT面向欧洲语系，现在也延伸到汉语，语种范围广，评测类型丰富；IWSLT针对语音对话的翻译相关问题进行评测。
-\parinterval 以上评测数据大多可以从评测网站上下载，此外部分数据也可以从LDC（Lingu \\ -istic Data Consortium）上申请，网址为\url{https://www.ldc.upenn.edu/}。ELRA（Euro \\ -pean Language Resources Association）上也有一些免费的语料库供研究使用，其官网为\url{http://www.elra.info/}。更多机器翻译的语料信息可参看附录\ref{appendix-A}。
+\parinterval 以上评测数据大多可以从评测网站上下载，此外部分数据也可以从LDC（Linguistic Data Consortium）上申请，网址为\url{https://www.ldc.upenn.edu/}。ELRA（European Language Resources Association）上也有一些免费的语料库供研究使用，其官网为\url{http://www.elra.info/}。更多机器翻译的语料信息可参看附录\ref{appendix-A}。
 \parinterval 从机器翻译发展的角度看，这些评测任务给相关研究提供了基准数据集，使得不同的系统都可以在同一个环境下进行比较和分析，进而建立了机器翻译研究所需的实验基础。此外，这些评测任务也使得研究者可以第一时间了解机器翻译研究的最新成果，比如，有多篇ACL最佳论文的灵感就来自当年参加机器翻译评测任务的系统。
 \section{推荐学习资源}\index{Chapter1.8}
-\parinterval 首先，推荐一本书《Statistical Machine Translation》\cite{SPhilipp}，其作者是机器翻译领域著名学者Philipp Koehn教授。该书是机器翻译领域内的经典之作，介绍了统计机器翻译技术的进展，并解释了如何为任意两种语言搭建机器翻译系统。该书从语言学和概率学两个方面介绍了统计机器翻译的构成要素，然后介绍了统计机器翻译的主要模型：基于词、基于短语和基于树的模型，以及机器翻译评价、语言建模、判别式训练等方法。这本书还报道了当时的最新研究，提出了主要的挑战，并使新手以及经验丰富的研究员能够为这一令人兴奋的领域做出新的贡献。该书内容全面，被很多高校和机器翻译的研究者使用。
+\parinterval 首先，推荐一本书《Statistical Machine Translation》\cite{koehn2009statistical}，其作者是机器翻译领域著名学者Philipp Koehn教授。该书是机器翻译领域内的经典之作，介绍了统计机器翻译技术的进展，并解释了如何为任意两种语言搭建机器翻译系统。该书从语言学和概率学两个方面介绍了统计机器翻译的构成要素，然后介绍了统计机器翻译的主要模型：基于词、基于短语和基于树的模型，以及机器翻译评价、语言建模、判别式训练等方法。这本书还报道了当时的最新研究，提出了主要的挑战，并使新手以及经验丰富的研究员能够为这一令人兴奋的领域做出新的贡献。该书内容全面，被很多高校和机器翻译的研究者使用。
-\parinterval 《Foundations of Statistical Natural Language Processing》\cite{SIDDHARTHANChristopher}中文译名《自然语言处理基础》\cite{曼宁2005《统计自然语言处理基础》}，作者是自然语言处理领域的权威Chris Manning教授和Hinrich Sch$\ddot{\textrm{u}}$tze教授。该书对统计自然语言处理方法进行了全面介绍。书中讲解了必要的语言学和概率论基础知识，介绍了机器翻译评价、语言建模、判别式训练以及整合语言学信息等基础方法。其中包含了构建NLP工具所需的基本理论和算法，提供了对数学和语言学基础内容广泛而严格的覆盖，以及统计方法的详细讨论。
+\parinterval 《Foundations of Statistical Natural Language Processing》\cite{manning1999foundations}中文译名《统计自然语言处理基础》\cite{manning2005统计自然语言处理基础}，作者是自然语言处理领域的权威Chris Manning教授和Hinrich Sch\"{u}tze教授。该书对统计自然语言处理方法进行了全面介绍。书中讲解了必要的语言学和概率论基础知识，介绍了机器翻译评价、语言建模、判别式训练以及整合语言学信息等基础方法。其中包含了构建NLP工具所需的基本理论和算法，提供了对数学和语言学基础内容广泛而严格的覆盖，以及统计方法的详细讨论。
 \parinterval 《统计自然语言处理》\cite{宗成庆2013统计自然语言处理}由中国科学院自动化所宗成庆教授所著，其中全面介绍了统计自然语言处理的基本概念、理论方法和最新研究进展，既有对基础知识和理论模型的介绍，也有对相关问题的研究背景、实现方法和技术现状的详细阐述。可供从事自然语言处理、机器翻译等研究的相关人员参考。
-\parinterval  Ian Goodfellow、Yoshua Bengio，Aaron Courville三位机器学习领域的学者所写的《Deep Learning》\cite{HeatonIan}也是值得一读的参考书。其讲解了有关深度学习常用的方法，其中很多都会在深度学习模型设计和使用中用到。同时在《Deep Learning》应用一章中也简单讲解了神经机器翻译的任务定义和发展过程。
+\parinterval  Ian Goodfellow、Yoshua Bengio、Aaron Courville三位机器学习领域的学者所写的《Deep Learning》\cite{Goodfellow-et-al-2016}也是值得一读的参考书。其讲解了有关深度学习常用的方法，其中很多都会在深度学习模型设计和使用中用到。同时在《Deep Learning》应用一章中也简单讲解了神经机器翻译的任务定义和发展过程。
-\parinterval 《Neural Network Methods in Natural Language Processing》\cite{Goldberg2017Neural}是Yoav Goldberg编写的面向自然语言处理的深度学习参考书。相比《深度学习》，该书聚焦在自然语言处理中的深度学习方法，内容更加易读。
+\parinterval 《Neural Network Methods for Natural Language Processing》\cite{goldberg2017neural}是Yoav Goldberg编写的面向自然语言处理的深度学习参考书。相比《深度学习》，该书聚焦在自然语言处理中的深度学习方法，内容更加易读。
-\parinterval 《机器学习》\cite{周志华2018《机器学习》}由南京大学教授周志华教授所著，作为机器学习领域入门教材，该书尽可能地涵盖了机器学习基础知识的各个方面，试图尽可能少地使用数学知识介绍机器学习方法与思想。在机器翻译中使用的很多机器学习概念和方法可以从该书中进行学习。
+\parinterval 《机器学习》\cite{周志华2016机器学习}由南京大学周志华教授所著，作为机器学习领域入门教材，该书尽可能地涵盖了机器学习基础知识的各个方面，试图尽可能少地使用数学知识介绍机器学习方法与思想。在机器翻译中使用的很多机器学习概念和方法可以从该书中进行学习。
 \parinterval  TensorFlow官网提供了一个有关神经机器翻译的教程，介绍了从数据处理开始如何利用TensorFlow工具从零搭建一个神经机器翻译系统以及如何解码，其地址为\url{https://www.tensorflow.org/tutorials/text/nmt\_with\_attention}。此外谷歌和Facebook也分别提供了基于序列到序列机器翻译模型的高级教程。谷歌的版本是基于TensorFlow实现，网址为：\url{https://github.com/tensorflow/nmt}，Facebook的教程主要是基于PyTorch实现，网址为：\url{https://pytorch.org/tutorials/intermediate/seq2seq\_translation\_tutorial.html}。网站上也包含一些综述论文，其中详细的介绍了神经机器翻译的发展历程，问题定义以及目前遇到的问题。
-\parinterval  \url{http://www.statmt.org}是一个介绍机器翻译研究的网站，该网站包含了对统计机器翻译研究的一些介绍资料，一些自然语言处理的会议和workshop，常用工具以及语料库。\url{http://www.mt-archive.info}与\url{https://www.aclweb.org/anthology}网站上有许多介绍机器翻译和自然语言处理的论文。通过这个网站可以了解到自然语言处理领域的一些重要的会议，比如与机器翻译相关的国际会议有：
+\parinterval  \url{http://www.statmt.org}是一个介绍机器翻译研究的网站，该网站包含了对统计机器翻译研究的一些介绍资料，一些自然语言处理的会议和workshop，常用工具以及语料库。\url{http://www.mt-archive.info}与\url{https://www.aclweb.org/anthology}\\网站上有许多介绍机器翻译和自然语言处理的论文。通过这个网站可以了解到自然语言处理领域的一些重要的会议，比如与机器翻译相关的国际会议有：
 \vspace{0.5em}
 \begin{itemize}

--- a/Book/Chapter2/chapter2.tex
+++ b/Book/Chapter2/chapter2.tex
@@ -22,7 +22,7 @@
 %--问题概述-----------------------------------------
 \section{问题概述 }\index{Chapter2.1}
-\parinterval 很多时候机器翻译系统被看作是孤立的``黑盒''系统（图 \ref {fig:MT-construction-comparison} (a)）。我们将一段文本作为输入送入机器翻译系统，之后得到翻译好的译文输出。但是真实的机器翻译系统要复杂的多。因为系统看到的输入和输出的实际上只是一些符号串，这些符号并没有任何其它意义，因此需要进一步对这些符号串进行处理才能使机器翻译系统更好的使用它们，比如，需要定义翻译中最基本的单元是什么？符号串是否还有结构信息？如何用数学工具刻画这些基本单元和结构？
+\parinterval 很多时候机器翻译系统被看作是孤立的``黑盒''系统（图 \ref {fig:2.1-1} (a)）。我们将一段文本作为输入送入机器翻译系统，之后得到翻译好的译文输出。但是真实的机器翻译系统要复杂得多。因为系统看到的输入和输出实际上只是一些符号串，这些符号并没有任何其它意义，因此需要进一步对这些符号串进行处理才能使机器翻译系统更好的使用它们，比如，需要定义翻译中最基本的单元是什么？符号串是否还有结构信息？如何用数学工具刻画这些基本单元和结构？
 %----------------------------------------------
 % 图2.1
@@ -31,22 +31,22 @@
 	\subfigure[机器翻译系统被看作一个黑盒] {\input{./Chapter2/Figures/figure-MT-system-as-a-black-box}  }
 	\subfigure[机器翻译系统 = 语言分析 + 翻译引擎] {\input{./Chapter2/Figures/figure-MT=language-analysis+translation-engine}}
 	\caption{机器翻译系统的结构对比}
-    \label{fig:MT-construction-comparison}
+    \label{fig:2.1-1}
 \end{figure}
 %-------------------------------------------
-\parinterval 图\ref{fig:MT-construction-comparison} (b)展示了一个机器翻译系统处理输入输出的例子。可以看到，输入的中文字串``猫喜欢吃鱼''被加工成一个新的结构。
+\parinterval 图\ref{fig:2.1-1} (b)展示了一个机器翻译系统处理输入输出的例子。可以看到，输入的中文字串``猫喜欢吃鱼''被加工成一个新的结构。
 %----------------------------------------------
 % 图2.2
 \begin{figure}[htp]
 \centering
 \input{./Chapter2/Figures/figure-analysis-of-sentence-participle&syntactic}
 \caption{中文句子``猫喜欢吃鱼''的分析结果（分词和句法分析）}
-\label{fig:analysis-of-sentence-participle&syntactic}
+\label{fig:2.1-2}
 \end{figure}
 %-------------------------------------------
-\parinterval 直觉上，这个结构有些奇怪，因为上面多了很多新的符号，而且还有一些线将不同符号进行连接。实际上这就是语言分析中对句子常用的结构表示 – 短语结构树。从原始的词串转化为图\ref {fig:analysis-of-sentence-participle&syntactic} 的样子，有两个步骤：
+\parinterval 直觉上，这个结构有些奇怪，因为上面多了很多新的符号，而且还有一些线将不同符号进行连接。实际上这就是语言分析中对句子常用的结构表示 – 短语结构树。从原始的词串转化为图\ref {fig:2.1-2} 的样子，有两个步骤：
 \vspace{0.5em}
 \begin{itemize}
@@ -58,22 +58,22 @@
 \parinterval 类似的，机器翻译输出的结果也可以包含同样的信息。甚至系统输出英文译文之后，还有一个额外的步骤来把部分英文单词的大小写恢复出来，比如，上例中句首单词Cats的首字母要大写。
-\parinterval 一般来说，在送入机器翻译系统前需要对文字序列进行处理和加工的过程被称为\textbf{预处理}。同理，在机器翻译模型输出译文后的处理作被称作\textbf{后处理}。这两个过程对机器翻译性能影响很大，比如，在神经机器翻译里，预处理使用不同策略的切分可能会造成翻译性能的天差地别。
+\parinterval 一般来说，在送入机器翻译系统前需要对文字序列进行处理和加工的过程被称为{\small\sffamily\bfseries{预处理}}。同理，在机器翻译模型输出译文后的处理过程被称作{\small\sffamily\bfseries{后处理}}。这两个过程对机器翻译性能影响很大，比如，在神经机器翻译里，预处理使用不同策略的切分可能会造成翻译性能的天差地别。
 \parinterval 值得注意的是，不论是分词还是句法分析，对于机器翻译来说并不是必须要求符合人的认知和语言学约束。换句话说，机器翻译所使用的``单词''和``结构''本身并不是为了符合人类的解释，它直接目的是更好的进行翻译。从系统的角度，有时候即使一些处理和我们的语言习惯有差别的``单词''和``结构''，仍然会带来性能的提升，比如在神经机器翻译中，在传统分词的基础上进一步使用双字节编码（Byte Pair Encoding，BPE）子词切分会使得机器翻译性能大幅提高。当然，自然语言处理中语言学信息的使用一直是学界关注的焦点。甚至关于语言学结构对机器翻译是否有作用这个问题也有争论。但是不能否认的是，无论是语言学的知识，还是计算机自己学习到的知识，对机器翻译都是有价值的。在后续章节会看到，这两种类型的知识对机器翻译帮助很大 \footnote[1]{笔者并不认同语言学结构对机器翻译的帮助有限，相反机器翻译需要更多的人类先验知识的指导。当然，这个问题不是这里讨论的重点。} 。
 \parinterval 剩下的问题是如何进行句子的切分和结构的分析。思路有很多，一种常用的方法是对问题进行概率化，用统计模型来描述问题并求解。比如，一个句子切分的好坏，并不是非零即一的判断，而是要估计出这种切分的可能性大小，最终选择可能性最大的结果进行输出。这也是一种典型的用统计建模的方式来描述自然语言处理问题。
 \parinterval 本章将会对上述问题及方法进行介绍。首先，会用一个例子给出统计建模的基本思路，之后会应用这种方法进行中文分词、语言建模和句法分析。
+\vspace{-1em}
 %--概率论基础-----------------------------------------
-\vspace{-1.5em}
 \section{概率论基础}\index{Chapter2.2}
 \parinterval 为了便于后续内容的介绍，首先对本书中使用的概率和统计学概念和符号与定理进行说明。
 %--随机变量和概率---------------------
 \subsection{随机变量和概率}\index{Chapter2.2.1}
-\parinterval 在自然界中，有这样一类具有偶然性的事件，它在一定条件下是否会发生是不确定的。例如，明天会下雨、掷一枚硬币是正面朝上、扔一个骰子的点数是5$\cdots\cdots$这类可能会发生也可能不会发生，通过大量的重复试验，能发现其发生具有某种规律性的事件叫做\textbf{随机事件}。
+\parinterval 在自然界中，有这样一类具有偶然性的事件，它在一定条件下是否会发生是不确定的。例如，明天会下雨、掷一枚硬币是正面朝上、扔一个骰子的点数是5$\cdots\cdots$这类可能会发生也可能不会发生，通过大量的重复试验，能发现其发生具有某种规律性的事件叫做{\small\sffamily\bfseries{随机事件}}。
-\parinterval \textbf{随机变量}（random variable）是对随机事件发生可能状态的描述，是随机事件的数量表征。设$\Omega = \{ \omega \}$为一个随机试验的样本空间，$X=X(\omega)$就是定义在样本空间$\omega$上的、取值为实数的单值函数，即$X=X(\omega)$为随机变量，记为$X$。随机变量是一种能随机选取数值的变量，常用大写的英文字母或希腊字母表示，其取值通常用小写字母来表示。例如，用$A$表示一个随机变量，用$a$表示变量$A$的一个取值。根据随机变量可以选取的值，可以将其划分为离散变量和连续变量。
+\parinterval {\small\sffamily\bfseries{随机变量}}（random variable）是对随机事件发生可能状态的描述，是随机事件的数量表征。设$\Omega = \{ \omega \}$为一个随机试验的样本空间，$X=X(\omega)$就是定义在样本空间$\Omega$上的、取值为实数的单值函数，即$X=X(\omega)$为随机变量，记为$X$。随机变量是一种能随机选取数值的变量，常用大写的英文字母或希腊字母表示，其取值通常用小写字母来表示。例如，用$A$表示一个随机变量，用$a$表示变量$A$的一个取值。根据随机变量可以选取的值，可以将其划分为离散变量和连续变量。
 \parinterval 离散变量是指在其取值区间内可以被一一列举，总数有限并且可计算的数值变量。例如，用随机变量$X$代表某次投骰子出现的点数，点数只可能取1$\sim$6这6个整数，$X$是一个离散变量。
@@ -81,7 +81,7 @@
 \parinterval 概率是度量随机事件呈现其每个可能状态的可能性的数值，本质上它是一个测度函数\cite{mao-prob-book-2011}\cite{kolmogorov2018foundations}。概率的大小表征了随机事件在一次试验中发生的可能性大小。用$\textrm{P}(\cdot )$表示一个随机事件的可能性，即事件发生的概率。比如$\textrm{P}(\textrm{太阳从东方升起})$表示``太阳从东方升起的可能性''，同理，$\textrm{P}(A=B)$表示的就是``$A=B$'' 这件事的可能性。
-\parinterval 在实际问题中，我们往往需要得到某些概率值。但是，真实的概率值往往是无法准确知道的，这时就需要对概率进行\textbf{估计}，得到的结果是概率的\textbf{估计值}（estimate）。在概率论中，一个很简单的获取概率的方式是利用相对频度作为概率的估计值。如果$\{x_1,x_2,\dots,x_n \}$是一个试验的样本空间，在相同情况下重复试验N次，观察到样本$x_i (1\leq{i}\leq{n})$的次数为$n_N (x_i )$，那么$x_i$在这N次试验中的相对频率是$\frac{n_N (x_i )}{N}$。当N越来越大时，相对概率也就越来越接近真实概率$\textrm{P}(x_i)$，即$\lim_{N \to \infty}\frac{n_N (x_i )}{N}=\textrm{P}(x_i)$。 实际上，很多概率模型都等同于相对频度估计，比如，对于一个多项式分布变量的概率的极大似然估计就可以用相对频度估计实现。
+\parinterval 在实际问题中，我们往往需要得到某些概率值。但是，真实的概率值往往是无法准确知道的，这时就需要对概率进行{\small\sffamily\bfseries{估计}}，得到的结果是概率的{\small\sffamily\bfseries{估计值}}（estimate）。在概率论中，一个很简单的获取概率的方式是利用相对频度作为概率的估计值。如果$\{x_1,x_2,\dots,x_n \}$是一个试验的样本空间，在相同情况下重复试验$N$次，观察到样本$x_i (1\leq{i}\leq{n})$的次数为$n_N (x_i )$，那么$x_i$在这$N$次试验中的相对频率是$\frac{n_N (x_i )}{N}$。当$N$越来越大时，相对频率也就越来越接近真实概率$\textrm{P}(x_i)$，即$\lim_{N \to \infty}\frac{n_N (x_i )}{N}=\textrm{P}(x_i)$。 实际上，很多概率模型都等同于相对频度估计，比如，对于一个多项式分布变量的概率的极大似然估计就可以用相对频度估计实现。
 \parinterval 概率函数是用函数形式给出离散变量每个取值发生的概率，其实就是将变量的概率分布转化为数学表达形式。如果我们把$A$看做一个离散变量，$a$看做变量$A$的一个取值，那么$\textrm{P}(A)$被称作变量$A$的概率函数，$\textrm{P}(A=a)$被称作$A = a$的概率值，简记为$\textrm{P}(a)$。例如，在相同条件下掷一个骰子50次，用$A$表示投骰子出现的点数这个离散变量，$a_i$表示点数的取值，$\textrm{P}_i$表示$A=a_i$的概率值。下表为$A$的概率分布，给出了$A$的所有取值及其概率。
 %表1--------------------------------------------------------------------
@@ -93,13 +93,13 @@
               \hline
 \rule{0pt}{15pt}     $\textrm{P}_i$ & $\textrm{P}_1=\frac{4}{25}$  &  $\textrm{P}_2=\frac{3}{25}$ &  $\textrm{P}_3=\frac{4}{25}$ & $\textrm{P}_4=\frac{6}{25}$ & $\textrm{P}_5=\frac{3}{25}$ & $\textrm{P}_6=\frac{1}{25}$  \\
             \end{tabular}
-             \label{tab1}
+             \label{tab:2.2-1}
 \end{table}
 %表1--------------------------------------------------------------------
 \parinterval 除此之外，概率函数$\textrm{P}(\cdot)$还具有非负性、归一性等特点，非负性是指，所有的概率函数$\textrm{P}(\cdot)$都必须是大于等于0的数值，概率函数中不可能出现负数：$\forall{x},\textrm{P}{(x)}\geq{0}$。归一性，又称规范性，简单的说就是所有可能发生的事件的概率总和为一,$\sum_{x}\textrm{P}{(x)}={1}$。
-\parinterval 对于离散变量$A$，$\textrm{P}(A=a)$是个确定的值，可以表示事件$A=a$的可能性大小；而对于连续变量，求在某个定点处的概率是无意义的，只能求其落在某个取值区间内的概率。因此，用\textbf{概率分布函数$F(x)$}和\textbf{概率密度函数}$f(x)$来统一描述随机变量的取值分布情况。概率分布函数$F(x)$取值小于某个值的概率，是概率的累加形式。假设$A$是一个随机变量，$a$是任意实数，将函数$F(a)=\textrm{P}\{A\leq a\}$，$-\infty<a<\infty $定义为$A$的分布函数。通过分布函数，我们可以清晰地表示任何随机变量的概率。
+\parinterval 对于离散变量$A$，$\textrm{P}(A=a)$是个确定的值，可以表示事件$A=a$的可能性大小；而对于连续变量，求在某个定点处的概率是无意义的，只能求其落在某个取值区间内的概率。因此，用{\small\sffamily\bfseries{概率分布函数$F(x)$}}和{\small\sffamily\bfseries{概率密度函数}}$f(x)$来统一描述随机变量的取值分布情况。概率分布函数$F(x)$表示取值小于或等于某个值的概率，是概率的累加形式。假设$A$是一个随机变量，$a$是任意实数，将函数$F(a)=\textrm{P}\{A\leq a\}$，$-\infty<a<\infty $定义为$A$的分布函数。通过分布函数，我们可以清晰地表示任何随机变量的概率。
 \parinterval 对于连续变量，我们不能像离散变量一样列出所有的概率取值，而是用概率密度函数来描述分布情况。概率密度函数反映了变量在某个区间内的概率变化快慢，概率密度函数的值是概率的变化率，该连续变量的概率也就是对概率密度函数求积分得到的结果。设$f(x) \geq 0$是连续变量$X$的概率密度函数，$X$的分布函数就可以用$F(x)=\int_{-\infty}^x f(t)dt \ (x\in \mathbb{R})$来表示。
@@ -109,41 +109,41 @@
 \centering
 \input{./Chapter2/Figures/figure-Probability-density-function&Distribution-function}
 \caption{一个概率密度函数(左)与其对应的分布函数(右)}
-\label{fig:Probability-density-function&Distribution-function}
+\label{fig:2.2-1}
 \end{figure}
 %-------------------------------------------
 \subsection{联合概率、条件概率和边缘概率}\index{Chapter2.2.2}
-\parinterval \textbf{联合概率}（joint probability）是指多个事件同时发生，每个随机变量满足各自条件的概率，表示为$\textrm{P}(AB)$。
+\parinterval {\small\sffamily\bfseries{联合概率}}（joint probability）是指多个事件同时发生，每个随机变量满足各自条件的概率，表示为$\textrm{P}(AB)$。
-\parinterval \textbf{条件概率}（conditional probability）是指$A$、$B$为任意的两个事件，在事件$A$已出现的前提下，事件$B$出现的概率，使用$\textrm{P}(B \mid A)$表示。通常来说，$\textrm{P}(B \mid A) \neq \textrm{P}(B)$。
+\parinterval {\small\sffamily\bfseries{条件概率}}（conditional probability）是指$A$、$B$为任意的两个事件，在事件$A$已出现的前提下，事件$B$出现的概率，使用$\textrm{P}(B \mid A)$表示。通常来说，$\textrm{P}(B \mid A) \neq \textrm{P}(B)$。
 \parinterval 贝叶斯法则是条件概率计算时的重要依据，条件概率可以表示为
 %----------------------------------------------
 \begin{eqnarray}
 \textrm{P}{(B|A)} = \frac{\textrm{P}(A\cap{B})}{\textrm{P}(A)} = \frac{\textrm{P}(A)\textrm{P}(B|A)}{\textrm{P}(A)} = \frac{\textrm{P}(B)\textrm{P}(A|B)}{\textrm{P}(A)}
-\label{eqC2.1-new}
+\label{eq:2.1-1}
 \end{eqnarray}
 %----------------------------------------------
-\parinterval \textbf{边缘概率}（marginal probability）是和联合概率对应的，它指的是$\textrm{P}(X=a)$或$\textrm{P}(Y=b)$，即仅与单个随机变量有关的概率称为边缘概率。
+\parinterval {\small\sffamily\bfseries{边缘概率}}（marginal probability）是和联合概率对应的，它指的是$\textrm{P}(X=a)$或$\textrm{P}(Y=b)$，即仅与单个随机变量有关的概率称为边缘概率。
 \parinterval 对于离散随机变量$X$和$Y$，我们知道$\textrm{P}(X,Y)$，则边缘概率$\textrm{P}(X)$可以通过求和的方式得到，如下式所示
 \begin{eqnarray}
 \forall x \in X ,\textrm{P}(X=x)=\sum_{y}  \textrm{P}(X=x,Y=y)
-\label{eqC2.2-new}
+\label{eq:2.2-2}
 \end{eqnarray}
 %----------------------------------------------
 \parinterval 对于连续变量，边缘概率$\textrm{P}(X)$需要通过积分得到，如下式所示
 \begin{eqnarray}
 \textrm{P}(X)=\int \textrm{P}(x,y)dy
-\label{eqC2.3-new}
+\label{eq:2.3-3}
 \end{eqnarray}
 %----------------------------------------------
-\parinterval 为了更好的区分条件概率、边缘概率和联合概率，我们将通过图\ref{fig:schematic-edge-probability&joint-probability}所示的面积来举例说明。
+\parinterval 为了更好的区分条件概率、边缘概率和联合概率，我们将通过图\ref{fig:2.2-2}所示的面积来举例说明。
 %----------------------------------------------
 % 图2.4
@@ -151,11 +151,11 @@
 \centering
 \input{./Chapter2/Figures/figure-schematic-edge-probability&joint-probability}
 \caption{边缘概率、联合概率和条件概率的面积示意图}
-\label{fig:schematic-edge-probability&joint-probability}
+\label{fig:2.2-2}
 \end{figure}
 %-------------------------------------------
-\parinterval 如图\ref{fig:schematic-edge-probability&joint-probability}所示，矩形A代表事件X发生所对应的所有可能状态，矩形B代表事件Y发生所对应的所有可能状态，矩形C代表A和B的交集，则
+\parinterval 如图\ref{fig:2.2-2}所示，矩形A代表事件X发生所对应的所有可能状态，矩形B代表事件Y发生所对应的所有可能状态，矩形C代表A和B的交集，则
 \parinterval 边缘概率：矩形A或者矩形B的面积；
@@ -170,14 +170,14 @@
 \textrm{P}(a,b,c) & = & \textrm{P}(a \mid b ,c)\textrm{P}(b,c) \nonumber \\
 \textrm{P}(b,c) & = & \textrm{P}(b \mid c)\textrm{P}(c)\nonumber \\
 \textrm{P}(a,b,c) & = & \textrm{P}(a \mid b,c)\textrm{P}(b \mid c)\textrm{P}(c)
-\label{eqC2.4-new}
+\label{eq:2.2-4}
 \end{eqnarray}
 %----------------------------------------------
 \parinterval 推广到$n$个事件，我们得到了链式法则的公式
 \begin{eqnarray}
 \textrm{P}(x_1,x_2,...,x_n)=\textrm{P}(x_1) \prod_{i=2}^n \textrm{P}(x_i \mid x_1,x_2,...,x_{(i-1)})
-\label{eqC2.5-new}
+\label{eq:2.2-5}
 \end{eqnarray}
 %----------------------------------------------
@@ -190,7 +190,7 @@
 \input{./Chapter2/Figures/figure-schematic-chain-rule}
 \setlength{\belowcaptionskip}{-1cm}
 \caption{A,B,C,D,E关系图}
-\label{fig:schematic-chain-rule}
+\label{fig:2.2-3}
 \end{figure}
 %-------------------------------------------
 \begin{eqnarray}
@@ -198,14 +198,14 @@
 &=&\textrm{P}(E \mid A,B,C,D) \cdot \textrm{P}(D \mid A,B,C) \cdot \textrm{P}(A,B,C) \nonumber \\
 &=&\textrm{P}(E \mid A,B,C,D) \cdot \textrm{P}(D \mid A,B,C) \cdot \textrm{P}(C \mid A,B) \cdot \textrm{P}(A,B) \nonumber \\
 &=&\textrm{P}(E \mid A,B,C,D) \cdot \textrm{P}(D \mid A,B,C) \cdot \textrm{P}(C \mid A,B) \cdot \textrm{P}(B \mid A) \cdot \textrm{P}(A)\nonumber \\
-\label{eqC2.6-new}
+\label{eq:2.2-6}
 \end{eqnarray}
-\parinterval 根据图\ref {fig:schematic-chain-rule} 易知$E$只和$C$有关，所以$\textrm{P}(E \mid A,B,C,D)=\textrm{P}(E \mid C)$；$D$不依赖于其他事件，所以$\textrm{P}(D \mid A,B,C)=\textrm{P}(D)$；$C$只和$BD$有关，所以$\textrm{P}(C \mid A,B)=\textrm{P}(C \mid B)$；$B$不依赖于其他事件，所以$\textrm{P}(B \mid  A)=\textrm{P}(B)$。最终化简可得：
+\parinterval 根据图\ref {fig:2.2-3} 易知$E$只和$C$有关，所以$\textrm{P}(E \mid A,B,C,D)=\textrm{P}(E \mid C)$；$D$不依赖于其他事件，所以$\textrm{P}(D \mid A,B,C)=\textrm{P}(D)$；$C$只和$BD$有关，所以$\textrm{P}(C \mid A,B)=\textrm{P}(C \mid B)$；$B$不依赖于其他事件，所以$\textrm{P}(B \mid  A)=\textrm{P}(B)$。最终化简可得：
 %---------------------------------------------
 \begin{eqnarray}
 \textrm{P}(A,B,C,D,E)=\textrm{P}(E \mid C) \cdot \textrm{P}(D) \cdot \textrm{P}(C \mid B) \cdot \textrm{P}(B)
-\label{eqC2.7-new}
+\label{eq:2.2-7}
 \end{eqnarray}
 %---------------------------------------------
@@ -220,12 +220,12 @@
 %---------------------------------------------
 \begin{eqnarray}
 \bigcup_{i=1}^n B_i=S \textrm{且}B_iB_j=\varnothing , i,j=1,...,n,i\neq j
-\label{eqC2.8-new}
+\label{eq:2.2-8}
 \end{eqnarray}
 \parinterval 设$B_1,…,B_n$是S的一个划分，A为事件，则
 \begin{eqnarray}
 \textrm{P}(A)=\sum_{k=1}^n \textrm{P}(A \mid B_k)\textrm{P}(B_k)
-\label{eqC2.9-new}
+\label{eq:2.2-9}
 \end{eqnarray}
 %---------------------------------------------
 \parinterval 这就是全概率公式。
@@ -245,47 +245,47 @@
 \end{eqnarray}
 %--------------------------------------------
-\parinterval \textbf{贝叶斯法则}（Bayes’ rule）是概率论中的一个定理，通常用于知$\textrm{P}(A \mid B)$求$\textrm{P}(B \mid A)$。其内容如下：
+\parinterval {\small\sffamily\bfseries{贝叶斯法则}}（Bayes’ rule）是概率论中的一个定理，通常用于已知$\textrm{P}(A \mid B)$求$\textrm{P}(B \mid A)$。其内容如下：
 %--------------------------------------------
 \parinterval 设$B_1,…,B_n$是S的一个划分，A为事件，则对于$i=1,…,n$，有如下公式
 \begin{eqnarray}
 \textrm{P}(B_i \mid A)=\frac {\textrm{P}(A \mid B_i)\textrm{P}(B_i) } { \sum_{k=1}^n\textrm{P}(A \mid B_k)\textrm{P}(B_k) }
-\label{eqC2.10-new}
+\label{eq:2.2-10}
 \end{eqnarray}
 \parinterval 来看一下贝叶斯公式的推导。由前面的知识，我们知道条件概率的公式为
 \begin{eqnarray}
 \textrm{P}(B \mid A)= \frac {\textrm{P}(AB)} {\textrm{P}(A)}
-\label{eqC2.11-new}
+\label{eq:2.2-11}
 \end{eqnarray}
 \parinterval 由乘法定理我们可以得到
 \begin{eqnarray}
 \textrm{P}(AB)=\textrm{P}(B)\textrm{P}(A \mid B)
-\label{eqC2.12-new}
+\label{eq:2.2-12}
 \end{eqnarray}
 \parinterval 设$B_1,…,B_n$是S的一个划分，A为事件，由全概率公式我们可以得到
 \begin{eqnarray}
 \textrm{P}(A)=\textrm{P}(A \mid B_1)\textrm{P}(B_1)+\textrm{P}(A \mid B_2)\textrm{P}(B_2)+\ldots +\textrm{P}(A \mid B_n)\textrm{P}(B_n)
-\label{eqC2.13-new}
+\label{eq:2.2-13}
 \end{eqnarray}
 \parinterval 将乘法定理代入条件概率的分子，将全概率公式代入条件概率的分母，我们就可以得到贝叶斯定理
 \begin{eqnarray}
 \textrm{P}(B_i \mid A)=\frac {\textrm{P}(A \mid B_i)\textrm{P}(B_i) } {\sum_{k=1}^n \textrm{P}(A\mid B_k)\textrm{P}(B_k)}
-\label{eqC2.14-new}
+\label{eq:2.2-14}
 \end{eqnarray}
 \parinterval 由上式，我们也可以得到贝叶斯公式的另外两种写法:
 \begin{eqnarray}
 \textrm{P}(B \mid A)=\frac { \textrm{P}(A \mid B)\textrm{P}(B) }  {\textrm{P}(A)}
-\label{eqC2.15-new}
+\label{eq:2.2-15}
 \end{eqnarray}
 \begin{eqnarray}
 \textrm{P}(B \mid A)=\frac { \textrm{P}(A \mid B)\textrm{P}(B) }  {\textrm{P}(A \mid B)\textrm{P}(B)+\textrm{P}(A \mid \bar{B}) \textrm{P}(\bar{B})}
-\label{eqC2.16-new}
+\label{eq:2.2-16}
 \end{eqnarray}
 %--------------------------------------------
 \parinterval 贝叶斯公式常用于根据已知的结果来推断使之发生的各因素的可能性。
@@ -295,7 +295,7 @@
 \subsubsection{（一）信息熵}\index{Chapter2.2.5.1}
-\parinterval \textbf{熵}（entropy）是热力学中的一个概念，同时也是对系统无序性的一种度量标准，在自然语言处理领域也会使用到信息熵这一概念，比如描述文字的信息量大小。一条信息的信息量大小与它的不确定性有着直接的关系，如果我们需要确认一件非常不确定甚至于一无所知的事情，那么需要理解大量的相关信息才能确认清楚；同样的，如果我们对某件事已经非常确定，那么就不需要太多的信息就可以把它搞清楚。
+\parinterval {\small\sffamily\bfseries{熵}}（entropy）是热力学中的一个概念，同时也是对系统无序性的一种度量标准，在自然语言处理领域也会使用到信息熵这一概念，比如描述文字的信息量大小。一条信息的信息量大小与它的不确定性有着直接的关系，如果我们需要确认一件非常不确定甚至于一无所知的事情，那么需要理解大量的相关信息才能确认清楚；同样的，如果我们对某件事已经非常确定，那么就不需要太多的信息就可以把它搞清楚。
 \begin{example}
 确定性和不确定性的事件
@@ -303,7 +303,7 @@
 \qquad\qquad\quad``太阳从东方升起''
 \qquad\qquad\quad``明天天气多云''
-\label{example2-1}
+\label{e.g:2.2-1}
 \end{example}
 \parinterval 在这两句话中，``太阳从东方升起''是一件确定性事件，几乎不需要查阅更多信息就可以确认，因此这件事的信息熵相对较低；而``明天天气多云''这件事，我们需要询问气象局的相关研究人员，或者关注天气预报，才能大概率确定，它的不确定性很高，因而它的信息熵也就相对较高。因此，信息熵也是对事件不确定性的度量。
@@ -311,24 +311,24 @@
 \parinterval 一个事件X的自信息（self-information）的表达式为：
 \begin{eqnarray}
 \textrm{I}(x)=-\log\textrm{P}(x)
-\label{eqC2.17-new}
+\label{eq:2.2-17}
 \end{eqnarray}
-\parinterval 其中，$\textrm{P}(x)$表示概率，自信息用来衡量单一事件发生时所包含的信息多少，当底数为e时，单位为$nats$，其中1$nats$是通过观察概率为$\frac{1}{e}$的事件而获得的信息量；当底数为2时，单位为$bits$或$shannons$，我们通常使用前者。$\textrm{I}(x)$和$\textrm{P}(x)$的函数关系如图\ref{fig:Self-information-function} 所示：
+\parinterval 其中，$\textrm{P}(x)$表示概率，自信息用来衡量单一事件发生时所包含的信息多少，当底数为e时，单位为$nats$，其中1$nats$是通过观察概率为$\frac{1}{e}$的事件而获得的信息量；当底数为2时，单位为$bits$或$shannons$，我们通常使用前者。$\textrm{I}(x)$和$\textrm{P}(x)$的函数关系如图\ref{fig:2.2-4} 所示：
 %----------------------------------------------
 % 图2.6
 \begin{figure}[htp]
 \centering
 \input{./Chapter2/Figures/figure-Self-information-function}
 \caption{自信息函数图像}
-\label{fig:Self-information-function}
+\label{fig:2.2-4}
 \end{figure}
 %-------------------------------------------
 \parinterval 自信息只处理单一的结果。若量化整个概率分布中的不确定性或者说信息量，我们可以用信息熵，其公式如下：
 \begin{eqnarray}
 \textrm{H}(x)=\sum_{x \in \textrm{X}}[ \textrm{P}(x) \textrm{I}(x)] =- \sum_{x \in \textrm{X} } [\textrm{P}(x)\log(\textrm{P}(x)) ]
-\label{eqC2.18-new}
+\label{eq:2.2-18}
 \end{eqnarray}
 \parinterval 一个分布的信息熵也就是从该分布中得到的一个事件的期望信息量。比如，$a$、$b$、$c$、$d$四支球队，四支队伍夺冠的概率分别是$P_1$、$P_2$、$P_3$、$P_4$，某个人对比赛不感兴趣但是又想知道哪支球队夺冠，通过使用二分法2次就确定哪支球队夺冠了。但其实，我们知道这四支球队中$c$的实力比较强劲，那么猜1次就可以确定。所以对于前者，哪支球队夺冠的信息量较高，信息熵也相对较高，对于后者信息量和信息熵也就相对较低。因此我们可以得知：较为尖锐的分布具有较低的熵；分布越接近均匀熵越大。
@@ -339,7 +339,7 @@
 \begin{eqnarray}
 \textrm{D}_{\textrm{KL}}(\textrm{P}\parallel \textrm{Q}) & = & \sum_{x \in \textrm{X}} [ \textrm{P}(x)\log \frac{\textrm{P}(x) }{ \textrm{Q}(x) } ]  \nonumber \\
                                                                                       & = & \sum_{x \in \textrm{X} }[ \textrm{P}(x)(\log\textrm{P}(x)-\log \textrm{Q}(x))]
-\label{eqC2.19-new}
+\label{eq:2.2-19}
 \end{eqnarray}
 \parinterval 这一概念的意义在于：在相同事件空间里，概率分布$\textrm{P}(x)$对应的每个事件，若用概率分布Q$(x)$编码时，平均每个基本事件的信息量增加了多少。它衡量的是相同事件空间里的两个概率分布的差异情况。KL距离有两条重要的性质：
@@ -357,14 +357,15 @@
 \parinterval 交叉熵是一个与KL距离密切相关的概念，它的公式是：
 \begin{eqnarray}
 \textrm{H}(\textrm{P},\textrm{Q})=-\sum_{x \in \textrm{X}} [\textrm{P}(x) \log \textrm{Q}(x) ]
-\label{eqC2.20-new}
+\label{eq:2.2-20}
 \end{eqnarray}
 \parinterval 结合相对熵公式可知，交叉熵是KL距离公式中的右半部分。因此，求关于Q的交叉熵的最小值等价于求KL距离的最小值。交叉熵与KL距离的意义相同：都是用来描述两个分布的差异，由于交叉熵计算上更加直观方便，因此在机器翻译中被广泛应用。
-\vspace{-1.0em}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{中文分词}\index{Chapter2.3}
-\parinterval 对于机器翻译系统而言，输入的是已经切分好的单词序列，而不是原始的字符串。比如，对于一个中文句子，单词之间是没有间隔的，因此我们需要把一个个的单词切分出来，这样机器翻译系统可以区分不同单元。甚至，我们可以对语言学上的单词进行进一步切分，得到词片段序列（比如：中国人$\Rightarrow$中国 人）。我们可以把上述过程看作是一种\textbf{分词}（segmentation）过程，即：将一个输入的自然语言字符串切割成单元序列（token序列），每个单元都对应可以处理的最小单位。
+\parinterval 对于机器翻译系统而言，输入的是已经切分好的单词序列，而不是原始的字符串。比如，对于一个中文句子，单词之间是没有间隔的，因此我们需要把一个个的单词切分出来，这样机器翻译系统可以区分不同单元。甚至，我们可以对语言学上的单词进行进一步切分，得到词片段序列（比如：中国人$\Rightarrow$中国 人）。我们可以把上述过程看作是一种{\small\sffamily\bfseries{分词}}（segmentation）过程，即：将一个输入的自然语言字符串切割成单元序列（token序列），每个单元都对应可以处理的最小单位。
 %----------------------------------------------
 % 图2.7
@@ -372,7 +373,7 @@
 \centering
 \input{./Chapter2/Figures/figure-a-simple-pre-processing-process}
 \caption{一个简单的预处理流程}
-\label{fig:a-simple-pre-processing-process}
+\label{fig:2.3-1}
 \end{figure}
 %-------------------------------------------
 %\vspace{-0.5em}
@@ -401,12 +402,13 @@
 \parinterval 从语言学的角度，普遍认为词是可以单独运用的、包含意义的基本单位。我们使用有限的词可以组合出无限的句子，这也正体现出自然语言的奇妙之处。
 \parinterval 不过，机器翻译并不仅仅局限在语言学定义的单词，因此机器翻译系统所使用的分词也不仅仅把句子按照词切开，比如，神经机器翻译中广泛使用的BPE子词切分方法，可以被理解为将词的一部分也进行切开，也就是得到词片段送给机器翻译系统使用。比如，对如下英文字符串，可以得到如下切分结果
+\vspace{0.5em}
-\parinterval Interesting \; -> \; Interest/ing  selection \;->\;se/lect/ion  procession \hspace{0.23em} -> \; pro/cess/ion
+\parinterval Interesting \; -> \; Interest/ing  selection \hspace{0.08em} -> \;se/lect/ion  procession \hspace{0.43em} -> \; pro/cess/ion
-\parinterval Interested \hspace{0.62em} -> \; Interest/ed   selecting \hspace{0.34em} -> \; se/lect/ing  processing -> \; pro/cess/ing
+\parinterval Interested \hspace{0.62em} -> \; Interest/ed   selecting \hspace{0.34em} -> \; se/lect/ing  processing \hspace{0.22em} -> \; pro/cess/ing
-\parinterval Interests \hspace{1.17em} -> \; Interest/s   selected \hspace{1.24em} -> \; se/lect/ed   processed \hspace{0.42em} -> \; pro/cess/ed \\
+\parinterval Interests \hspace{1.17em} -> \; Interest/s   selected \hspace{1.24em} -> \; se/lect/ed   processed \hspace{0.82em} -> \; pro/cess/ed \\
 \parinterval 词法分析的重要性在自然语言处理领域已经有共识。如果切分的颗粒度很大，获得的单词的歧义也很小，比如``中华人民共和国''整体作为一个单词不存在歧义，而如果单独的一个单词``国''，可能会代表``中国''、``美国''等不同的国家，存在歧义。但是随着切分颗粒度的增大，特定单词出现的频度也随之降低，低频词容易和噪音混淆，系统很难进行学习。因此，处理这些问题并开发适合翻译任务的分词系统是机器翻译的第一步。
@@ -416,18 +418,18 @@
 \parinterval 然而，计算机并不能像人类一样在概念上理解``词''，因此需要使用其他的方式让计算机可以进行分词。一个最简单的方法就是给定一个词典，在这个词典中出现的汉字组合就是我们定义的``词''。也就是，我们通过一个词典定义一个标准，符合这个标准定义的字符串都是合法的``词''。
-\parinterval 在使用基于词典的分词方法时，只需预先加载词典到计算机中，扫描输入句子，查询每个词串是否出现在词典中。如图\ref{fig:Example-of-word-segmentation-based-on-dictionary} 所示，比如，我们有一个包含六个词的词典，给定输入句子``确实现在物价很高''后，我们自左至右遍历输入句子的每个字，发现词串``确实''在词典中出现，说明``确实''是一个``词''，进行分词操作并在切分该``词''之后重复这个过程。
+\parinterval 在使用基于词典的分词方法时，只需预先加载词典到计算机中，扫描输入句子，查询每个词串是否出现在词典中。如图\ref{fig:2.3-2} 所示，比如，我们有一个包含六个词的词典，给定输入句子``确实现在物价很高''后，我们自左至右遍历输入句子的每个字，发现词串``确实''在词典中出现，说明``确实''是一个``词''，进行分词操作并在切分该``词''之后重复这个过程。
 %----------------------------------------------
 % 图2.8
 \begin{figure}[htp]
 \centering
 \input{./Chapter2/Figures/figure-Example-of-word-segmentation-based-on-dictionary}
 \caption{基于词典进行分词实例}
-\label{fig:Example-of-word-segmentation-based-on-dictionary}
+\label{fig:2.3-2}
 \end{figure}
 %-------------------------------------------
-\parinterval 但是，基于词典的分词方法很``硬''。这是因为自然语言非常灵活，经常出现歧义，用词典定义的合法单词之间有重叠的交叉型歧义就很难解决。图\ref{fig:cross-type-word-segmentation-ambiguity} 就给出了上面例子中的交叉型歧义，从词典中查看，``实现''和``现在''都是合法的单词，但是在句子中二者有重叠，因此词典无法告诉我们哪个结果是正确的。
+\parinterval 但是，基于词典的分词方法很``硬''。这是因为自然语言非常灵活，经常出现歧义，用词典定义的合法单词之间有重叠的交叉型歧义就很难解决。图\ref{fig:2.3-3} 就给出了上面例子中的交叉型歧义，从词典中查看，``实现''和``现在''都是合法的单词，但是在句子中二者有重叠，因此词典无法告诉我们哪个结果是正确的。
 %----------------------------------------------
 % 图2.9
@@ -435,7 +437,7 @@
 \centering
 \input{./Chapter2/Figures/figure-cross-type-word-segmentation-ambiguity}
 \caption{交叉型分词歧义}
-\label{fig:cross-type-word-segmentation-ambiguity}
+\label{fig:2.3-3}
 \end{figure}
 %-------------------------------------------
@@ -465,13 +467,13 @@
 \begin{figure}[htp]
 \centering
 \input{./Chapter2/Figures/figure-word-segmentation-based-on-statistics}
-\setlength{\belowcaptionskip}{-0.5cm}
+%\setlength{\belowcaptionskip}{-0.5cm}
 \caption{基于统计的分词流程}
-\label{fig:word-segmentation-based-on-statistics}
+\label{fig:2.3-4}
 \end{figure}
 %-------------------------------------------
-\parinterval 图\ref{fig:word-segmentation-based-on-statistics} 给出了一个基于统计建模的汉语分词实例。左侧是标注数据，其中的每个句子已经经过人工标注分词结果（单词用斜杠分开）。之后，建立一个统计模型，记为$\textrm{P}(\cdot)$。模型通过在标注数据上的学习达到能够很好描述问题的状态。最后，对于新的未分词的句子，使用模型$\textrm{P}(\cdot)$对每个可能的分切进行概率估计，之后选择概率最高的切分结果输出。
+\parinterval 图\ref{fig:2.3-4} 给出了一个基于统计建模的汉语分词实例。左侧是标注数据，其中的每个句子已经经过人工标注分词结果（单词用斜杠分开）。之后，建立一个统计模型，记为$\textrm{P}(\cdot)$。模型通过在标注数据上的学习达到能够很好描述问题的状态。最后，对于新的未分词的句子，使用模型$\textrm{P}(\cdot)$对每个可能的切分进行概率估计，之后选择概率最高的切分结果输出。
 \vspace{-0.5em}
 \subsubsection{掷骰子游戏}\index{Chapter2.3.2.2}
@@ -482,9 +484,9 @@
 \begin{figure}[htp]
 \centering
 \input{./Chapter2/Figures/figure-the-dice-game}
-\setlength{\belowcaptionskip}{-0.5cm}
+%\setlength{\belowcaptionskip}{-0.5cm}
 \caption{骰子结果}
-\label{fig:the-dice-game}
+\label{fig:2.3-5}
 \end{figure}
 %-------------------------------------------
@@ -492,7 +494,7 @@
 \parinterval 似乎玩家的胜利只能来源于运气。不过，请注意，这里的假设``随便选一个数字''这本身就是一个概率模型，它对骰子的六个面的出现做了均匀分布假设。
 \begin{eqnarray}
 \textrm{P(``1'')}=\textrm{P(``2'')}=...=\textrm{P(``5'')}=\textrm{P(``6'')}=1/6
-\label{eqC2.21-new}
+\label{eq:2.3-1}
 \end{eqnarray}
 \vspace{-0.5em}
@@ -505,16 +507,16 @@
 \textrm{P(``4'')} &=&\theta_4 \nonumber \\
 \textrm{P(``5'')} &=&\theta_5 \nonumber \\
 \textrm{P(``6'')} &=&1-\sum_{1 \leq i \leq 5}\theta_i \qquad \lhd \textrm {归一性}
-\label{eqC2.22-new}
+\label{eq:2.3-2}
 \end{eqnarray}
 \parinterval 这里$\theta_1 \sim \theta_5$可以被看作是模型的参数。对于这样的模型，参数确定了，模型也就确定了。但是，新的问题来了，在定义骰子每个面的概率后，如何求出具体的值呢？一种常用的方法是，从大量实例中学习模型参数，这个方法也是常说的参数估计。我们可以将这个不均匀的骰子先实验性的掷很多次，这可以被看作是独立同分布的若干次采样，比如$X$次，发现``1''出现$X_1$次，``2''出现$X_2$次，以此类推，得到了各个面出现的次数。假设掷骰子中每个面出现的概率符合多项式分布，通过简单的概率论知识可以知道每个面出现概率的极大似然估计为：
 \begin{eqnarray}
 \textrm{P(``i'')}=\frac {X_i}{X}
-\label{eqC2.23-new}
+\label{eq:2.3-3}
 \end{eqnarray}
-\parinterval 当$X$足够大的话，$\frac{X_i}{X}$可以无限逼近P(``$i$'')的真实值，因此可以通过大量的实验推算出掷骰子各个面的概率的准确估计值。回归到我们的问题中，如果我们在正式开始游戏前，预先掷骰子30次，得到如图\ref{fig:the-dice-game2}的结果。
+\parinterval 当$X$足够大的话，$\frac{X_i}{X}$可以无限逼近P(``$i$'')的真实值，因此可以通过大量的实验推算出掷骰子各个面的概率的准确估计值。回归到我们的问题中，如果我们在正式开始游戏前，预先掷骰子30次，得到如图\ref{fig:2.3-6}的结果。
 %----------------------------------------------
 % 图2.12
@@ -522,11 +524,11 @@
 \centering
 \input{./Chapter2/Figures/figure-the-dice-game2}
 \caption{预投骰子结果}
-\label{fig:the-dice-game2}
+\label{fig:2.3-6}
 \end{figure}
 %-------------------------------------------
-\parinterval 于是，我们看到了一个有倾向性的模型（图 \ref{fig:the-dice-game-model}）：
+\parinterval 于是，我们看到了一个有倾向性的模型（图 \ref{fig:2.3-7}）：
 %----------------------------------------------
 % 图2.13
@@ -534,13 +536,13 @@
 \centering
 \input{./Chapter2/Figures/figure-the-dice-game-model}
 \caption{预设的骰子模型}
-\label{fig:the-dice-game-model}
+\label{fig:2.3-7}
 \end{figure}
 %-------------------------------------------
 在这样的预先实验基础上，我们知道如果再次玩掷骰子游戏的话，选择数字``4''获胜的可能性是最大的。
-\parinterval 通过上面这个掷骰子的游戏，可以得到一个道理：\textbf{上帝是不公平的}。因为在``公平''的世界中，没有任何一个模型可以学到有价值的事情。从机器学习的角度来看，所谓的``不公平''实际上这是客观事物中蕴含的一种\textbf{偏置}（bias），也就是很多事情天然就有对某些情况有倾向。而图像处理、自然语言处理等问题绝大多数都存在着偏置。比如，我们翻译一个英文单词的时候，它最可能的翻译结果往往就是那几个词。我们设计统计模型的目的正是要学习这种偏置，之后利用这种偏置对新的问题做出足够好的决策。
+\parinterval 通过上面这个掷骰子的游戏，可以得到一个道理：{\small\sffamily\bfseries{上帝是不公平的}}。因为在``公平''的世界中，没有任何一个模型可以学到有价值的事情。从机器学习的角度来看，所谓的``不公平''实际上是客观事物中蕴含的一种{\small\sffamily\bfseries{偏置}}（bias），也就是很多事情天然就对某些情况有倾向。而图像处理、自然语言处理等问题绝大多数都存在着偏置。比如，我们翻译一个英文单词的时候，它最可能的翻译结果往往就是那几个词。我们设计统计模型的目的正是要学习这种偏置，之后利用这种偏置对新的问题做出足够好的决策。
 \subsubsection{全概率分词方法}\index{Chapter2.3.2.3}
@@ -554,7 +556,7 @@
 \end{itemize}
 \vspace{0.5em}
-\parinterval 如果把投掷这个新的骰子，可能会得到图\ref{fig:full-probability-word-segmentation-1}这样的结果，
+\parinterval 如果投掷这个新的骰子，可能会得到图\ref{fig:2.3-8}这样的结果，
 %----------------------------------------------
 % 图2.14
@@ -562,7 +564,7 @@
 \centering
 \input{./Chapter2/Figures/figure-full-probability-word-segmentation-1}
 \caption{新投骰子结果}
-\label{fig:full-probability-word-segmentation-1}
+\label{fig:2.3-8}
 \end{figure}
 %-------------------------------------------
@@ -585,7 +587,7 @@
 \input{./Chapter2/Figures/figure-full-probability-word-segmentation-2}
 \setlength{\belowcaptionskip}{-0.2cm}
 \caption{换成汉字后结果}
-\label{fig:full-probability-word-segmentation-2}
+\label{fig:2.3-9}
 \end{figure}
 %-------------------------------------------
@@ -596,21 +598,20 @@
 \centering
 \input{./Chapter2/Figures/figure-full-probability-word-segmentation-3}
 \caption{每个单词概率估计值}
-\label{fig:full-probability-word-segmentation-3}
+\label{fig:2.3-10}
 \end{figure}
 %-------------------------------------------
 \parinterval 通过这个学习过程，我们得到了每个词出现的概率，即模型的参数。而我们原始的问题是如何计算这个整句分词结果的概率，比如
 \begin{equation}
 \textrm{P}\textrm{(``确实/现在/数据/很/多'')}=?
-\label{eqC2.24-new}
+\label{eq:2.3-4}
 \end{equation}
 \parinterval 这里可以使用``大题小做''的技巧：原始的问题很复杂，我们将其切分为小问题。这样，将复杂的分词问题简单化，基于独立性假设解决分词问题：假定所有词出现都是相互独立的。设$w_1 w_2 w_3…w_m$表示一个由单词$w_1,w_2,w_3,…,w_m$组成的切分结果，于是有：
 {\setlength{\belowdisplayskip}{-9pt}
 \begin{eqnarray}
 \textrm{P}(w_1 w_2 w_3…w_m)=\textrm{P}(w_1) \cdot \textrm{P}(w_2) \cdot ... \cdot \textrm{P}(w_m)
-\label{eqC2.25-new}
+\label{eq:2.3-5}
 \end{eqnarray}
 }
 \begin{eqnarray}
@@ -618,7 +619,7 @@
 & = &\textrm{P}\textrm{(``确实'')} \cdot \textrm{P}\textrm{(``现在'')} \cdot \textrm{P}\textrm{(``数据'')} \cdot \textrm{P}\textrm{(``很'')} \cdot \textrm{P}\textrm{(``多'')} \nonumber \\
 & = &0.000001 \times 0.000022 \times 0.000009 \times 0.000010 \times 0.000078 \nonumber \\
 & = &1.5444 \times 10^{-25}
-\label{eqC2.26-new}
+\label{eq:2.3-6}
 \end{eqnarray}
 \parinterval 以``确实现在数据很多''这个实例来说，如果把这句话按照``确实/现在/数据/很/多''这样的方式进行切分，这句切分的概率P(``确实/现在/数据/很/多'')可以通过每个词出现概率相乘的方式进行计算。这个假设也是自然语言处理中1-gram语言模型假设，即当前词的生成与任何历史都无关。当然，独立性假设并不能完美描述客观世界的问题，但是它大大化简了问题的复杂度。
@@ -629,11 +630,11 @@
 \centering
 \input{./Chapter2/Figures/figure-examples-of-Chinese-word-segmentation-based-on-1-gram-model}
 \caption{基于1-gram语言模型的中文分词实例}
-\label{fig:examples-of-Chinese-word-segmentation-based-on-1-gram-model}
+\label{fig:2.3-11}
 \end{figure}
 %-------------------------------------------
-\parinterval 最后让我们再整体看一下分词系统的学习和使用过程。如图\ref {fig:examples-of-Chinese-word-segmentation-based-on-1-gram-model}所示，我们利用大量人工标注好的分词数据，通过统计学习方法获得一个统计模型$\textrm{P}(\cdot)$，给定任意分词结果$W=w_1 w_2…w_m$，都能通过$\textrm{P}(W)=\textrm{P}(w_1) \times \textrm{P}(w_2 ) \times …\textrm{P}(w_m)$计算这种切分的概率值。
+\parinterval 最后让我们再整体看一下分词系统的学习和使用过程。如图\ref{fig:2.3-11}所示，我们利用大量人工标注好的分词数据，通过统计学习方法获得一个统计模型$\textrm{P}(\cdot)$，给定任意分词结果$W=w_1 w_2…w_m$，都能通过$\textrm{P}(W)=\textrm{P}(w_1) \times \textrm{P}(w_2 ) \times … \times \textrm{P}(w_m)$计算这种切分的概率值。
 \parinterval 经过充分训练的统计模型$\textrm{P}(\cdot)$就是我们得到的分词模型。对于任意输入的新句子S，通过这个模型找到最佳的分词结果$W^*$输出。假设输入句子S是``确实现在数据很多''，可以通过列举获得不同切分方式的概率，其中概率最高的切分方式，就是我们的目标输出。
@@ -649,7 +650,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{建模}\index{Chapter2.4.1}
-\parinterval \textbf{语言模型}（language model）的目的是描述文字序列出现的规律。这个对问题建模的过程被称作\textbf{语言建模}（language modeling）。如果使用统计建模的方式，语言模型可以被定义为计算$\textrm{P}(w_1 w_2...w_m)$，也就是计算整个词序列$w_1 w_2...w_m$出现的可能性大小。具体定义如下，
+\parinterval {\small\sffamily\bfseries{语言模型}}（language model）的目的是描述文字序列出现的规律。这个对问题建模的过程被称作{\small\sffamily\bfseries{语言建模}}（language modeling）。如果使用统计建模的方式，语言模型可以被定义为计算$\textrm{P}(w_1 w_2...w_m)$，也就是计算整个词序列$w_1 w_2...w_m$出现的可能性大小。具体定义如下，
 %----------------------------------------------
 % 定义3.1
@@ -662,7 +663,7 @@
 \parinterval 直接求$\textrm{P}(w_1 w_2...w_m)$并不简单，因为如果把$w_1 w_2...w_m$整个作为一个变量，模型的参数量会非常大。$w_1 w_2...w_m$有$|V|^m$种可能性，这里$|V|$表示词汇表大小。显然，当$m$增大的时候，模型复杂度会急剧增加，甚至都无法进行存储和计算。既然把$w_1 w_2...w_m$整个作为一个变量不好处理，就可以考虑对这个序列的生成进行分解。使用链式法则，很容易得到
 \begin{eqnarray}
 \textrm{P}(w_1 w_2...w_m)=\textrm{P}(w_1)\textrm{P}(w_2|w_1)\textrm{P}(w_3|w_1 w_2)...\textrm{P}(w_m|w_1 w_2...w_{m-1})
-\label{eqC2.27-new}
+\label{eq:2.4-1}
 \end{eqnarray}
 这样，$w_1 w_2...w_m$的生成可以被看作是逐个生成每个单词的过程，即首先生成$w_1$，然后根据$w_1$再生成$w_2$，然后根据$w_1 w_2$再生成$w_3$，以此类推，直到根据所有前$m$-1个词生成序列的最后一个单词$w_m$。这个模型把联合概率$\textrm{P}(w_1 w_2...w_m)$分解为多个条件概率的乘积，虽然可以对生成序列的过程进行分解，但是模型的复杂度和以前是一样的，比如，$\textrm{P}(w_m|w_1 w_2...w_{m-1})$仍然不好计算。
@@ -670,7 +671,7 @@
 \parinterval 换一个角度看，$\textrm{P}(w_m|w_1 w_2...w_{m-1})$体现了一种基于``历史''的单词生成模型，也就是把前面生成的所有单词作为``历史''，并参考这个``历史''生成当前单词。但是这个``历史''的长度和整个序列长度是相关的，也是一种长度变化的历史序列。为了化简问题，一种自然的想法是使用定长历史，比如，每次只考虑前面$n$-1个历史单词来生成当前单词，这就是$n$-gram语言模型。这个模型的数学描述如下：
 \begin{eqnarray}
 \textrm{P}(w_m|w_{m-n+1}...w_{m-1})=\textrm{P}(w_m|w_1 w_2...w_{m-1})
-\label{eqC2.28-new}
+\label{eq:2.4-2}
 \end{eqnarray}
 \parinterval 这样，整个序列$w_1 w_2...w_m$的生成概率可以被重新定义为：
@@ -679,7 +680,7 @@
 \begin{table}[htp]
 \centering
 \caption{$n$-gram语言模型取不同$n$值的模型描述}
-\label{tab:n-gram-model-of-different-n}
+\label{tab::2.4-1}
 {\scriptsize
 \begin{tabular}{l|l|l l|l}
 链式法则 & 1-gram & 2-gram & $...$ & $n$-gram\\
@@ -703,7 +704,7 @@
 \item 极大似然估计。直接利用不同词序列在训练数据中出现的频度计算出$\textrm{P}(w_m$\\$|w_{m-n+1} ... w_{m-1})$
 \begin{eqnarray}
 \textrm{P}(w_m|w_{m-n+1}...w_{m-1})=\frac{\textrm{count}(w_{m-n+1}...w_m)}{\textrm{count}(w_{m-n+1}...w_{m-1})}
-\label{eqC2.29-new}
+\label{eq:2.4-3}
 \end{eqnarray}
 \item 人工神经网络方法。构建一个人工神经网络估计$\textrm{P}(w_m|w_{m-n+1} ... w_{m-1})$的值，比如，可以构建一个前馈神经网络来对$n$-gram进行建模。
@@ -716,7 +717,7 @@
 & &\textrm{P}_{2-gram}{(\textrm{``确实}/\textrm{现在}/\textrm{数据}/\textrm{很}/\textrm{多''})} \nonumber \\
 &= & \textrm{P}(\textrm{``确实''}) \times\textrm{P}(\textrm{``现在''}|\textrm{``确实''})\times\textrm{P}(\textrm{``数据''}|\textrm{``现在''}) \nonumber \\
 & \times & \textrm{P}(\textrm{``很''}|\textrm{``数据''})\times\textrm{P}(\textrm{``多''}|\textrm{``很''})
-\label{eqC2.30-new}
+\label{eq:2.4-4}
 \end{eqnarray}
 \parinterval 以$n$-gram语言模型为代表的统计语言模型的应用非常广泛。除了分词，在文本生成、信息检索、摘要等等自然语言处理任务中，语言模型都有举足轻重的地位。包括近些年非常受关注的预训练模型，本质上也是统计语言模型。这些技术都会在后续章节进行介绍。值得注意的是，统计语言模型给我们解决自然语言处理问题提供了一个非常好的建模思路，即：把整个序列生成的问题转化为逐个生成单词的问题。很快我们就会看到，这种建模方式会被广泛的用于机器翻译建模中，在统计机器翻译和神经机器翻译中都会有明显的体现。
@@ -724,15 +725,15 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{未登录词和平滑算法}\index{Chapter2.4.2}
-\parinterval 在式\ref{eqC2.30-new}的例子中，如果语料中从没有``确实''和``现在''两个词连续出现的情况，那么使用2-gram计算``确实/现在/数据/很/多''的切分方式的概率时，会出现如下情况
+\parinterval 在式\ref{eq:2.4-4}的例子中，如果语料中从没有``确实''和``现在''两个词连续出现的情况，那么使用2-gram计算``确实/现在/数据/很/多''的切分方式的概率时，会出现如下情况
 \begin{eqnarray}
 \textrm{P}(\textrm{``现在''}|\textrm{``确实''}) & =  & \frac{\textrm{count}(\textrm{``确实}\,\textrm{现在''})}{\textrm{count}(\textrm{``确实''})} \nonumber \\
                                                                     & =  & \frac{0}{\textrm{count}(\textrm{``确实''})} \nonumber \\
                                                                     & =  & 0
-\label{eqC2.31-new}
+\label{eq:2.4-5}
 \end{eqnarray}
-\parinterval 显然，这个结果是不能接受的。因为即使语料中没有 ``确实''和``现在''两个词连续出现，但是这种搭配也是客观存在的。这时简单的用极大似然估计得到概率却是0，导致整个切分结果的概率为0。更常见的问题是那些根本没有出现在词表中的词，称为\textbf{未登录词}（Out-of-Vocabulary, OOV），比如一些生僻词，可能模型训练阶段从来没有看到过，这时模型仍然会给出0概率。图\ref{fig:word-frequency-distribution}展示了词语出现频度的分布，可以看到绝大多数词都是低频词。
+\parinterval 显然，这个结果是不能接受的。因为即使语料中没有 ``确实''和``现在''两个词连续出现，但是这种搭配也是客观存在的。这时简单的用极大似然估计得到概率却是0，导致整个切分结果的概率为0。更常见的问题是那些根本没有出现在词表中的词，称为{\small\sffamily\bfseries{未登录词}}（Out-of-Vocabulary, OOV），比如一些生僻词，可能模型训练阶段从来没有看到过，这时模型仍然会给出0概率。图\ref{fig:2.4-1}展示了词语出现频度的分布，可以看到绝大多数词都是低频词。
 %----------------------------------------------
 % 图2.18
@@ -740,7 +741,7 @@
    \centering
 \input{./Chapter2/Figures/figure-word-frequency-distribution}
 	 \caption{词语频度分布}
-    \label{fig:word-frequency-distribution}
+    \label{fig:2.4-1}
 \end{figure}
 %---------------------------
@@ -758,19 +759,19 @@
 \textrm{P}(\textrm{现在}|\textrm{确实}) & = & \frac{\textrm{count}(\textrm{确实}\,\textrm{现在})}{\textrm{count}(\textrm{确实})} \nonumber \\
                                                             & = & \frac{0}{\textrm{count}(\textrm{确实})} \nonumber \\
                                                             & = & 0
-\label{eqC2.32-new}
+\label{eq:2.4-6}
 \end{eqnarray}
 \parinterval 加法平滑方法（additive smoothing）假设每个$n$-gram出现的次数比实际统计次数多$\theta$次，$0 \leqslant\theta\leqslant 1$，使得分子部分不为0，那么计算前文例子``确实 现在''的概率时，可以使用如下方法计算。
 \begin{eqnarray}
 \textrm{P}(\textrm{现在}|\textrm{确实}) & =  & \frac{\theta + \textrm{count}(\textrm{确实}\,\textrm{现在})}{\sum_{w}^{|V|}(\theta + \textrm{count}(\textrm{确实}w))} \nonumber \\
                                                             & =  & \frac{\theta + \textrm{count}(\textrm{确实}\,\textrm{现在})}{\theta{|V|} + \textrm{count}(\textrm{确实})}
-\label{eqC2.33-new}
+\label{eq:2.4-7}
 \end{eqnarray}
 \parinterval 这里面$V$表示所有词汇的词表，$|V|$为词表中单词的个数，$w$为词典中的词。常见的加法平滑方法会将$\theta$取1，这时我们又称为加一平滑或是拉普拉斯平滑。这种方法比较容易理解，也比较简单，但是一些人认为这种方法的表现较差，因此，其实际的使用效果还要视具体情况而定。
-\parinterval 举一个例子来形象的描述加法平滑方法。假设在一个英文文档中随机抽取词汇，已经抽到的词包括12个，词典大小$|V|$=20，已抽到的词汇统计结果为：4 look，3 people，2 am，1 what，1 want，1 do。为了更形象的描述在平滑之前和平滑之后的概率分布的区别，可以参考下图\ref{fig:no-smoothing&smoothed-probability-distributions}的示例：
+\parinterval 举一个例子来形象的描述加法平滑方法。假设在一个英文文档中随机抽取词汇，已经抽到的词包括12个，词典大小$|V|$=20，已抽到的词汇统计结果为：4 look，3 people，2 am，1 what，1 want，1 do。为了更形象的描述在平滑之前和平滑之后的概率分布的区别，可以参考下图\ref{fig:2.4-2}的示例：
 %----------------------------------------------
 % 图2.19
@@ -778,7 +779,7 @@
    \centering
 	\input{./Chapter2/Figures/figure-no-smoothing&smoothed-probability-distributions}
 	\caption{无平滑和有平滑后的概率分布}
-    \label{fig:no-smoothing&smoothed-probability-distributions}
+    \label{fig:2.4-2}
 \end{figure}
 %-------------------------------------------
@@ -790,13 +791,13 @@
 \parinterval 假定在语料库中出现r次的$n$元语法有$n_r$个，特别的，出现0次的$n$元语法(即未登录词)出现的次数为$n_0$个。语料库中全部词语的个数为$N$，显然
 \begin{eqnarray}
 N = \sum_{r=1}^{\infty}{r\,n_r}
-\label{eqC2.34-new}
+\label{eq:2.4-8}
 \end{eqnarray}
 \parinterval 这时，出现$r$次的$n$元语法在词典中的相对频率为$r/N$，这也是不做平滑处理时这些词的概率估计。为了解决零概率问题，Good-Turing方法对于任何一个出现$r$次的$n$元语法，利用出现$r$+1次的$n$元语法统计量重新假设它出现$r^*$次，这里
 \begin{eqnarray}
 r^* = (r + 1)\frac{n_{r + 1}}{n_r}
-\label{eqC2.35-new}
+\label{eq:2.4-9}
 \end{eqnarray}
 \parinterval 基于这个公式，就可以估计所有0次$n$元语法的频次$n_0 r_0^*=(r_0+1)n_1=n_1$。要把这个重新估计的统计数转化为概率，只需要进行归一化处理：对于每个统计数为$r$的事件，其概率为$\textrm{P}_r=r^*/N$，其中
@@ -804,14 +805,14 @@ r^* = (r + 1)\frac{n_{r + 1}}{n_r}
 N & = & \sum_{r=0}^{\infty}{r^{*}n_r} \nonumber \\
    & = & \sum_{r=0}^{\infty}{(r + 1)n_{r + 1}} \nonumber \\
    & = & \sum_{r=1}^{\infty}{r\,n_r}
-\label{eqC2.36-new}
+\label{eq:2.4-10}
 \end{eqnarray}
 也就是说，$N$仍然为整个样本分布最初的计数。这样，样本中所有事件的概率之和为：
 \begin{eqnarray}
 \textrm{P}(r>0) & = & \sum_{r>0}{p_r n_r} \nonumber \\
   & =  & 1 - \frac{n_1}{N} < 1
-\label{eqC2.37-new}
+\label{eq:2.4-11}
 \end{eqnarray}
 其中$n_1/N$的概率余量就是分配给所有统计为0的事件。
@@ -824,7 +825,7 @@ N & = & \sum_{r>0}{p_r n_r} \nonumber \\
 \begin{table}[htp]{
 \begin{center}
 \caption{英文词汇抽取统计结果}
-\label{tab:results-of-en-vocabulary-extraction}
+\label{tab::2.4-2}
 {
 \begin{tabular}{l|lll}
 \rule{0pt}{10pt} $r$ & $n_r$ & $n^*$ & $p_r$\\ \hline
@@ -850,7 +851,7 @@ N & = & \sum_{r>0}{p_r n_r} \nonumber \\
 \parinterval 首先介绍一下absolute discounting平滑算法，公式如下所示：
 \begin{eqnarray}
 \textrm{P}_{\textrm{AbsDiscount}}(w_i | w_{i-1}) = \frac{c(w_{i-1},w_i )-d}{c(w_{i-1})} + \lambda(w_{i-1})\textrm{P}(w_i)
-\label{eqC2.38-new}
+\label{eq:2.4-12}
 \end{eqnarray}
 其中$d$是固定的被裁剪的值，$\lambda$是一个正则化常数。可以看到第一项是经过减值调整过的2-gram的概率值，第二项则相当于一个带权重$\lambda$的1-gram的插值项。然而这种插值模型极易受到原始1-gram模型的干扰。
@@ -862,31 +863,31 @@ N & = & \sum_{r>0}{p_r n_r} \nonumber \\
 \parinterval 为了评估$\textrm{P}_{\textrm{cont}}$，统计使用当前词作为第二个词所出现二元语法的种类，二元语法种类越多，这个词作为第二个词出现的可能性越高，呈正比：
 \begin{eqnarray}
 \textrm{P}_{\textrm{cont}}(w_i) \varpropto |w_{i-1}: c(w_{i-1} w_i )>0|
-\label{eqC2.39-new}
+\label{eq:2.4-13}
 \end{eqnarray}
 通过全部的二元语法的种类做归一化可得到评估的公式
 \begin{eqnarray}
 \textrm{P}_{\textrm{cont}}(w_i) = \frac{|\{ w_{i-1}:c(w_{i-1} w_i )>0 \}|}{|\{ (w_{j-1}, w_j):c(w_{j-1},w_j )>0 \}|}
-\label{eqC2.40-new}
+\label{eq:2.4-14}
 \end{eqnarray}
 \parinterval 基于分母的变化还有另一种形式
 \begin{eqnarray}
 \textrm{P}_{\textrm{cont}}(w_i) = \frac{|\{ w_{i-1}:c(w_{i-1} w_i )>0 \}|}{\sum_{w^{\prime}}|\{ w_{i-1}^{\prime}:c(w_{i-1}^{\prime},w_i^{\prime} )>0 \}|}
-\label{eqC2.41-new}
+\label{eq:2.4-15}
 \end{eqnarray}
 结合基础的absolute discounting计算公式，从而得到了Kneser-Ney平滑方法的公式
 \begin{eqnarray}
 \textrm{P}_{\textrm{KN}}(w_i|w_{i-1}) = \frac{\max(c(w_{i-1},w_i )-d,0)}{c(w_{i-1})}+ \lambda(w_{i-1})\textrm{P}_{\textrm{cont}}(w_i)
-\label{eqC2.42-new}
+\label{eq:2.4-16}
 \end{eqnarray}
 \noindent 其中
 \begin{eqnarray}
 \lambda(w_{i-1}) = \frac{d}{c(w_{i-1})}|\{w:c(w_{i-1},w)>0\}|
-\label{eqC2.43-new}
+\label{eq:2.4-17}
 \end{eqnarray}
 \noindent 这里$\max(\cdot)$保证了分子部分为不小于0的数，原始1-gram更新成$\textrm{P}_{\textrm{cont}}$概率分布，$\lambda$是正则化项。
@@ -897,8 +898,8 @@ N & = & \sum_{r>0}{p_r n_r} \nonumber \\
                                                   &   &  \lambda(w_{i-n+1}...w_{i-1})\textrm{P}_{\textrm{KN}}(w_i|w_{i-n+2}...w_{i-1})
 \end{eqnarray}
 \begin{eqnarray}
-\lambda(w_{i-1}) & = &  \frac{d}{c_{\textrm{KN}}(w_{i-n+1}^{i-1})}|\{w:c_{\textrm{KN}}(w_{i-n+1}...w_{i-1}w)>0\}| \label{eqC2.44-new} \\
+\lambda(w_{i-n+1}...w_{i-1}) & = &  \frac{d}{c_{\textrm{KN}}(w_{i-n+1}^{i-1})}|\{w:c_{\textrm{KN}}(w_{i-n+1}...w_{i-1}w)>0\}| \label{eq:2.4-18} \\
-c_{\textrm{KN}}(\cdot) & = & \begin{cases} \textrm{count}(\cdot)\quad \textrm{for\ highest\ order}  \\ \textrm{catcount}(\cdot)\quad \textrm{for\ lower\ order} \end{cases} \label{eqC2.45-new}
+c_{\textrm{KN}}(\cdot) & = & \begin{cases} \textrm{count}(\cdot)\quad \textrm{for\ highest\ order}  \\ \textrm{catcount}(\cdot)\quad \textrm{for\ lower\ order} \end{cases} \label{eq:2.4-19}
 \end{eqnarray}
 \noindent 其中catcount$(\cdot)$表示的是基于某个单个词作为第$n$个词的$n$-gram的种类数目。
@@ -912,9 +913,9 @@ c_{\textrm{KN}}(\cdot) & = & \begin{cases} \textrm{count}(\cdot)\quad \textrm{fo
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{句子的句法树表示}\index{Chapter2.5.1}
-\parinterval \textbf{句法}（syntax）是研究句子的每个组成部分和它们之间的组合方式。一般来说，句法和语言是相关的，比如，英文是主谓宾结构，而日语是主宾谓结构。因此不同的语言也会有不同的句法描述方式。这里我们将介绍自然语言处理领域最常用的两种句法分析形式 – \textbf{短语结构分析}（phrase structure parsing）和\textbf{依存分析}（dependency parsing）。它们在汉语、英语等多种语言的处理中都有广泛应用。
+\parinterval {\small\sffamily\bfseries{句法}}（syntax）是研究句子的每个组成部分和它们之间的组合方式。一般来说，句法和语言是相关的，比如，英文是主谓宾结构，而日语是主宾谓结构。因此不同的语言也会有不同的句法描述方式。这里我们将介绍自然语言处理领域最常用的两种句法分析形式 – {\small\sffamily\bfseries{短语结构分析}}（phrase structure parsing）和{\small\sffamily\bfseries{依存分析}}（dependency parsing）。它们在汉语、英语等多种语言的处理中都有广泛应用。
-\parinterval 图\ref{fig:phrase-structure-tree-and-dependency-tree}展示了这两种的句法表示形式的实例。其中，左侧是短语结构树。它描述的是短语的结构功能，比如``吃''是动词（记为VV），``鱼''是名词（记为NN），``吃鱼''组成动词短语，这个短语再与``喜欢''这一动词组成新的动词短语。每个子树都是一个句法功能单元，比如，VP(VV(吃) NN(鱼))这个子树就表示了``吃鱼''这个动词短语的结构，其中子树根节点VP是句法功能标记。短语结构树利用嵌套和递归的方式描述了语言学的功能。短语结构树中，每个词都有词性(或词类)，不同的词或者短语可以组成名动结构、动宾结构等语言学短语结构。短语结构分析一般也被称为成分分析(constituency parsing)，也被称作完全分析（full parsing）。
+\parinterval 图\ref{fig:2.5-1}展示了这两种句法表示形式的实例。其中，左侧是短语结构树。它描述的是短语的结构功能，比如``吃''是动词（记为VV），``鱼''是名词（记为NN），``吃鱼''组成动词短语，这个短语再与``喜欢''这一动词组成新的动词短语。每个子树都是一个句法功能单元，比如，VP(VV(吃) NN(鱼))这个子树就表示了``吃鱼''这个动词短语的结构，其中子树根节点VP是句法功能标记。短语结构树利用嵌套和递归的方式描述了语言学的功能。短语结构树中，每个词都有词性(或词类)，不同的词或者短语可以组成名动结构、动宾结构等语言学短语结构。短语结构分析一般也被称为成分分析(constituency parsing)，也被称作完全分析（full parsing）。
 %----------------------------------------------
 % 图2.5.1.1
@@ -922,13 +923,13 @@ c_{\textrm{KN}}(\cdot) & = & \begin{cases} \textrm{count}(\cdot)\quad \textrm{fo
    \centering
 \input{./Chapter2/Figures/figure-phrase-structure-tree-and-dependency-tree}
    \caption{短语结构树(左)和依存树(右)}
-    \label{fig:phrase-structure-tree-and-dependency-tree}
+    \label{fig:2.5-1}
 \end{figure}
 %---------------------------
-\parinterval 图\ref{fig:phrase-structure-tree-and-dependency-tree}右侧展示的是另一种句法结构，被称作依存句法树。依存句法树表示了句子中单词和单词之间的依存关系。比如，从这个例子可以了解，``猫''依赖``喜欢''，``吃''依赖``喜欢''，``鱼''依赖``吃''。
+\parinterval 图\ref{fig:2.5-1}右侧展示的是另一种句法结构，被称作依存句法树。依存句法树表示了句子中单词和单词之间的依存关系。比如，从这个例子可以了解，``猫''依赖``喜欢''，``吃''依赖``喜欢''，``鱼''依赖``吃''。
-\parinterval 短语结构树和依存句法树的结构和功能都有所不同。短语结构树的叶子节点是单词，中间节点是词性或者短语句法标记。在短语结构分析中，通常把单词称作终结符（terminal），把词性称为预终结符（pre-terminal），而把其它句法标记称为非终结符（non-terminal）。依存句法树没有预终结符和非终结符，所有的节点都是句子里的单词，通过不同节点间的连线表示句子中各个单词之间的依存关系。每个依存关系实际上都是有方向的，头和尾分别指向``接受''和``发出''依存关系的词。每个依存关系也可以进行分类，如图\ref{fig:phrase-structure-tree-and-dependency-tree}所示，每个依存关系都的类型都进行了标记，这也被称作有标记的依存分析。如果不生成这些标记，这样的句法分析被称作无标记的依存分析。
+\parinterval 短语结构树和依存句法树的结构和功能都有所不同。短语结构树的叶子节点是单词，中间节点是词性或者短语句法标记。在短语结构分析中，通常把单词称作终结符（terminal），把词性称为预终结符（pre-terminal），而把其它句法标记称为非终结符（non-terminal）。依存句法树没有预终结符和非终结符，所有的节点都是句子里的单词，通过不同节点间的连线表示句子中各个单词之间的依存关系。每个依存关系实际上都是有方向的，头和尾分别指向``接受''和``发出''依存关系的词。每个依存关系也可以进行分类，如图\ref{fig:2.5-1}所示，每个依存关系的类型都进行了标记，这也被称作有标记的依存分析。如果不生成这些标记，这样的句法分析被称作无标记的依存分析。
 \parinterval 虽然短语结构树和依存树是两种不同的句法表现形式，但是它们在某种条件下能相互转化。比如，可以使用启发性规则将短语结构树自动转化为依存树。从应用的角度，依存分析由于形式更加简单，而且直接建模词语之间的依赖，因此在最近自然语言处理领域中关注较多。不过，在机器翻译中，无论是哪种句法树结构，都已经被证明会对机器翻译系统产生正面效果。特别是短语结构树，在机器翻译中的应用历史更长，研究更加深入，因此本节将会以短语结构分析为例介绍相关概念。
@@ -980,19 +981,19 @@ c_{\textrm{KN}}(\cdot) & = & \begin{cases} \textrm{count}(\cdot)\quad \textrm{fo
 \parinterval 举例说明，假设有上下文无关文法$G=<N,\Sigma,R,S>$，其中把非终结符集合定义为不同的句法标记
 \begin{eqnarray}
 \textrm{N}=\{\textrm{NN},\textrm{VV},\textrm{NP},\textrm{VP},\textrm{IP}\}
-\label{eqC2.46-new}
+\label{eq:2.5-1}
 \end{eqnarray}
 这里，\textrm{NN}代表名词，\textrm{VV}代表动词，\textrm{NP}代表名词短语，\textrm{VP}代表动词短语，\textrm{IP}代表单句。进一步，把终结符集合定义为
 \begin{eqnarray}
 \Sigma = \{\text{猫,喜欢,吃,鱼}\}
-\label{eqC2.47-new}
+\label{eq:2.5-2}
 \end{eqnarray}
 再定义起始符集合为
 \begin{eqnarray}
 \textrm{S}=\{\textrm{IP}\}
-\label{eqC2.48-new}
+\label{eq:2.5-3}
 \end{eqnarray}
 最后，文法的规则集定义如下（其中$r_i$为规则的编号）
@@ -1001,19 +1002,19 @@ c_{\textrm{KN}}(\cdot) & = & \begin{cases} \textrm{count}(\cdot)\quad \textrm{fo
 \begin{figure}[htp]
    \centering
 \input{./Chapter2/Figures/figure-rules-of-grammar}
-\caption{还没有写}
+\caption{文法的规则集定义}
-    \label{fig:rules-of-grammar}
+    \label{fig:2.5-2}
 \end{figure}
 %---------------------------
 \parinterval 上面这个文法蕴含了不同``层次''的句法信息。比如，规则$r_1$、$r_2$、$r_3$和$r_4$表达了词性对单词的抽象；规则$r_6$、$r_7$和$r_8$表达了短语结构的抽象，其中，规则$r_8$用$\textrm{NP}+\textrm{VP}$描述了汉语中名词短语(主语)+动词短语(谓语)的结构。在实际应用中，像$r_8$这样的规则可以覆盖很大的片段（试想一下一个包含50个词的主谓结构的句子，都可以使用$r_8$进行描述）。
-\parinterval 下文无关文法的规则是一种\textbf{产生式规则}（production rule），形如$\alpha \to \beta $，它表示把规则左端的非终结符$\alpha$替换为规则右端的符号序列$\beta$。通常，$\alpha$被称作规则的左部（left-hand side），$\beta$被称作规则的右部（right-hand side）。使用右边$\beta$替换左部$\alpha$的过程也被称作规则的使用，而这个过程的逆过程称为规约。规则的使用可以如下定义：
+\parinterval 上下文无关文法的规则是一种{\small\sffamily\bfseries{产生式规则}}（production rule），形如$\alpha \to \beta $，它表示把规则左端的非终结符$\alpha$替换为规则右端的符号序列$\beta$。通常，$\alpha$被称作规则的左部（left-hand side），$\beta$被称作规则的右部（right-hand side）。使用右部$\beta$替换左部$\alpha$的过程也被称作规则的使用，而这个过程的逆过程称为规约。规则的使用可以如下定义：
 \vspace{0.5em}
 %-------------------------------------------
 \begin{definition}
-一个符号序列$u$可以通过使用规则$r$替换其中的某个非终结符，并得到符号序列$v$,我们说$v$是在$u$上使用$r$的结果，如图\ref{fig:usage-of-regulation}所示：
+一个符号序列$u$可以通过使用规则$r$替换其中的某个非终结符，并得到符号序列$v$,我们说$v$是在$u$上使用$r$的结果，如图\ref{fig:2.5-3}所示：
 \end{definition}
 %-------------------------------------------
 \vspace{-0.5em}
@@ -1021,10 +1022,9 @@ c_{\textrm{KN}}(\cdot) & = & \begin{cases} \textrm{count}(\cdot)\quad \textrm{fo
 \centering
 \input{./Chapter2/Figures/figure-usage-of-regulation}
 \caption{规则的使用示意图}
-\label{fig:usage-of-regulation}
+\label{fig:2.5-3}
 \end{figure}
-\vspace{-0.5em}
 \parinterval 给定起始非终结符，我们可以不断地使用规则，最终生成一个终结符串，这个过程也被称为推导（derivation）。
 \vspace{0.5em}
@@ -1051,7 +1051,7 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
 \end{definition}
 %-------------------------------------------
-\parinterval 比如，使用前面的示例文法，可以对``猫 喜欢 吃 鱼''进行分析，并形成句法分析树（图\ref{fig:example-of-derivation}）。我们从起始非终结符IP开始，利用唯一拥有IP在左部的规则$r_8$推导出NP和VP，之后依次使用规则$r_5$、$r_1$、$r_7$、$r_2$、$r_6$、$r_3$、$r_4$，得到了完整的句法树。
+\parinterval 比如，使用前面的示例文法，可以对``猫 喜欢 吃 鱼''进行分析，并形成句法分析树（图\ref{fig:2.5-4}）。我们从起始非终结符IP开始，利用唯一拥有IP在左部的规则$r_8$推导出NP和VP，之后依次使用规则$r_5$、$r_1$、$r_7$、$r_2$、$r_6$、$r_3$、$r_4$，得到了完整的句法树。
 %-------------------------------------------
 % 图2.5.2.3
 \begin{figure}[htp]
@@ -1059,7 +1059,7 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
 \input{./Chapter2/Figures/figure-example-of-derivation}
 \setlength{\abovecaptionskip}{-1.0em}
 	\caption{上下文无关文法推导实例}
-    \label{fig:example-of-derivation}
+    \label{fig:2.5-4}
 \end{figure}
 %-------------------------------------------
@@ -1067,7 +1067,7 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
 \parinterval 但是，句子和规则的推导并不是一一对应的。同一个句子，往往有很多推导与之对应，我们称为歧义。甚至同一棵句法树，也可以对应不同的推导。
-\parinterval 图\ref{fig:two-different-derivation-of-regulation}给出一个同一棵句法树所对应的两种不同的规则推导。
+\parinterval 图\ref{fig:2.5-5}给出一个同一棵句法树所对应的两种不同的规则推导。
 %-------------------------------------------
 %图2.5.2.4
@@ -1076,7 +1076,7 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
 \input{./Chapter2/Figures/figure-two-different-derivation-of-regulation}
 \setlength{\abovecaptionskip}{-1.0em}
 	\caption{同一棵句法树对应的不同规则推导}
-    \label{fig:two-different-derivation-of-regulation}
+    \label{fig:2.5-5}
 \end{figure}
 %-------------------------------------------
@@ -1092,18 +1092,18 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
    \centering
 \input{./Chapter2/Figures/figure-perspectives-of-expert-ordinary-and-syntactic-parser}
 	\caption{如何选择最佳的句法分析结果 - 专家、普通人和句法分析器的视角}
-    \label{fig:perspectives-of-expert-ordinary-and-syntactic-parser}
+    \label{fig:2.5-6}
 \end{figure}
 %-------------------------------------------
-\parinterval 在统计句法分析中，我们需要对每个推导进行统计建模，于是我们得到一个模型$\textrm{P}( \cdot )$，对于任意的推导$d$，都可以用$\textrm{P}(d)$计算推导$d$的概率。这样，给定一个输入句子，我们可以对所有可能的推导用$\textrm{P}(d)$计算其概率值，并选择概率最大的结果作为句法分析的结果输出（图\ref{fig:probability-values-corresponding-to-different-derivations}）。
+\parinterval 在统计句法分析中，我们需要对每个推导进行统计建模，于是我们得到一个模型$\textrm{P}( \cdot )$，对于任意的推导$d$，都可以用$\textrm{P}(d)$计算推导$d$的概率。这样，给定一个输入句子，我们可以对所有可能的推导用$\textrm{P}(d)$计算其概率值，并选择概率最大的结果作为句法分析的结果输出（图\ref{fig:2.5-7}）。
 %-------------------------------------------
 %图2.5.2.6
 \begin{figure}[htp]
    \centering
 \input{./Chapter2/Figures/figure-probability-values-corresponding-to-different-derivations}
 	\caption{不同推导（句法树）对应的概率值}
-    \label{fig:probability-values-corresponding-to-different-derivations}
+    \label{fig:2.5-7}
 \end{figure}
 %-------------------------------------------
@@ -1129,14 +1129,14 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
 \parinterval 概率上下文无关文法与传统上下文无关文法的区别在于，每条规则都会有一个概率，描述规则生成的可能性。具体来说，规则$\alpha \to \beta$的概率可以被定义为：
 \begin{eqnarray}
 \textrm{P}(\alpha \to \beta)=\textrm{P}(\beta | \alpha)
-\label{eqC2.49-new}
+\label{eq:2.5-4}
 \end{eqnarray}
 即，在给定规则左部的情况下生成规则右部的可能性。进一步，在上下文无关文法中，因为上下文无关的属性，每条规则之间的使用都是相互独立的 \footnote[3]{如果是上下文有关文法，规则会形如 $\textrm{a}\alpha \textrm{b}\to \textrm{a}\beta \textrm{b}$，这时$\alpha \to \beta $的过程会依赖前后上下文\textrm{a}和\textrm{b}。}。因此可以把$\textrm{P}(d)$分解为规则概率的乘积进行计算：
 \begin{eqnarray}
 \textrm{P}(d) & = & \textrm{P}(r_1 \cdot r_2 \cdot … \cdot r_n) \nonumber \\
 & = & \textrm{P}(r_1) \cdot \textrm{P}(r_2) \cdots \textrm{P}(r_n)
-\label{eqC2.50-new}
+\label{eq:2.5-5}
 \end{eqnarray}
 \parinterval 这样我们就可以得到每个推导$d$的概率值。这个模型，可以很好地解释词串的生成过程。比如，对于规则集
@@ -1150,7 +1150,7 @@ r_6: & \textrm{VP} \to \textrm{VV} \textrm{NN} \nonumber
 \begin{eqnarray}
 \textrm{P}(d_1) & = &\textrm{P}(r_3) \cdot \textrm{P}(r_4) \cdot \textrm{P}(r_6)\nonumber  \\
 & = & \textrm{P}(\textrm{VV} \to \text{吃}) \cdot \textrm{P}(\textrm{NN} \to \text{鱼}) \cdot \textrm{P}(\textrm{VP} \to \textrm{VV NN})
-\label{eqC2.51-new}
+\label{eq:2.5-6}
 \end{eqnarray}
 \parinterval 这也对应了词串``吃 鱼''的生成过程。首先，从起始VP使用规则$r_6$生成两个非终结符VV和NN；进一步，分别使用规则$r_3$和$r_4$从VV和NN进一步生成单词``吃''和``鱼''。整个过程的概率等于三条规则概率的乘积。
@@ -1158,10 +1158,10 @@ r_6: & \textrm{VP} \to \textrm{VV} \textrm{NN} \nonumber
 \parinterval 新的问题又来了，如何得到规则的概率呢？这里仍然可以使用数据驱动的想法，从数据中学习文法规则的概率。假设我们有人工标注的数据，其中包括很多句子的人工标注的句法树，称之为树库。然后，对于规则$r:\alpha \to \beta$可以使用极大似然估计：
 \begin{eqnarray}
 \textrm{P}(r)  = \frac{\text{规则$r$在树库中出现的次数}}{\alpha \text{在树库中出现的次数}}
-\label{eqC2.52-new}
+\label{eq:2.5-7}
 \end{eqnarray}
-\parinterval 这里通过一个例子来解释规则概率的计算过程（图\ref{fig:evaluation-of-probability-for-grammar}）。
+\parinterval 这里通过一个例子来解释规则概率的计算过程（图\ref{fig:2.5-8}）。
 %-------------------------------------------
 % 图2.5.3.1
@@ -1169,13 +1169,13 @@ r_6: & \textrm{VP} \to \textrm{VV} \textrm{NN} \nonumber
    \centering
 \input{./Chapter2/Figures/figure-evaluation-of-probability-for-grammar}
 	\caption{上下文无关文法规则概率估计}
-    \label{fig:evaluation-of-probability-for-grammar}
+    \label{fig:2.5-8}
 \end{figure}
 %-------------------------------------------
 \parinterval 与词法分析类似，我们统计树库中规则左部和右部同时出现的次数，除以规则左部出现的全部次数，所得的结果就是所求规则的概率。这种方法也是典型的相对频度估计。但是如果规则左部和右部同时出现的次数为0时是否代表这个规则概率是0呢？遇到这种情况，可以使用平滑方法对概率进行平滑处理，具体思路和2.4.2节基本上是一样的。
-\parinterval 图\ref{fig:process-of-statistical-syntax-analysis}展示了基于统计句法分析的流程。首先，通过对人工标注数据的统计，获得各个规则的概率，这样我们得到了一个上下文无关句法分析模型$\textrm{P}( \cdot )$，对于任意句法分析结果$\textrm{d}=r_1 \cdot r_2 \cdot … \cdot r_n$，都能通过$\textrm{P}(\textrm{d})= \prod_{i=1}^{n}\textrm{p}(r_i)$计算其概率值。
+\parinterval 图\ref{fig:2.5-9}展示了基于统计句法分析的流程。首先，通过对人工标注数据的统计，获得各个规则的概率，这样我们得到了一个上下文无关句法分析模型$\textrm{P}( \cdot )$，对于任意句法分析结果$d=r_1 \cdot r_2 \cdot … \cdot r_n$，都能通过$\textrm{P}(d)= \prod_{i=1}^{n}\textrm{P}(r_i)$计算其概率值。
 %-------------------------------------------
 % 图2.5.3.2
@@ -1183,20 +1183,20 @@ r_6: & \textrm{VP} \to \textrm{VV} \textrm{NN} \nonumber
    \centering
 \input{./Chapter2/Figures/figure-process-of-statistical-syntax-analysis}
 	\caption{统计句法分析的流程}
-    \label{fig:process-of-statistical-syntax-analysis}
+    \label{fig:2.5-9}
 \end{figure}
 %-------------------------------------------
-\parinterval 在获取统计分析模型后，就可以使用模型对任意句子进行分析，计算每个句法分析树的概率，并输出概率最高的树作为句法分析的结果。图\ref{fig:example-of-zh-syntactic-analysis}给出了几个真实句法分析器的输入输出实例。
+\parinterval 在获取统计分析模型后，就可以使用模型对任意句子进行分析，计算每个句法分析树的概率，并输出概率最高的树作为句法分析的结果。图\ref{fig:2.5-10}给出了几个真实句法分析器的输入输出实例。
 %-------------------------------------------
 % 图2.5.3.3
 \begin{figure}[htp]
    \centering
 \input{./Chapter2/Figures/figure-example-of-zh-syntactic-analysis}
-\setlength{\belowcaptionskip}{-1.0em}
+%\setlength{\belowcaptionskip}{-1.0em}
 	\caption{中文句法分析实例}
-   \label{fig:example-of-zh-syntactic-analysis}
+   \label{fig:2.5-10}
 \end{figure}
 %-------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -1208,7 +1208,7 @@ r_6: & \textrm{VP} \to \textrm{VV} \textrm{NN} \nonumber
 \begin{adjustwidth}{1em}{}
 \begin{itemize}
-\item 在建模方面，本章介绍的三个任务均采用的是基于人工先验知识进行模型设计的思路。也就是，问题所表达的现象被``一步一步''生成出来。这是一种典型的生成式建模思想，它把要解决的问题看作一些观测结果的隐含变量（比如，句子是观测结果，分词是隐含变量），之后通过对观测变量生成观测结果的过程建模，以达到对问题进行数学描述的目的。这类模型一般需要进行一些独立性假设，假设的好坏对最终的性能有较大影响。相对\textbf{生成模型}（generative models），另一类方法\textbf{判别模型}（discriminative models），它直接对问题进行求解，可以更加灵活的定义不同的特征，而且不依赖过多的独立性假设。判别式模型在自然语言处理中也有广泛应用\cite{shannon1948mathematical}\cite{ng2002discriminative}。在本书的第四章也会使用到判别式模型。
+\item 在建模方面，本章介绍的三个任务均采用的是基于人工先验知识进行模型设计的思路。也就是，问题所表达的现象被``一步一步''生成出来。这是一种典型的生成式建模思想，它把要解决的问题看作一些观测结果的隐含变量（比如，句子是观测结果，分词是隐含变量），之后通过对观测变量生成观测结果的过程建模，以达到对问题进行数学描述的目的。这类模型一般需要进行一些独立性假设，假设的好坏对最终的性能有较大影响。相对{\small\sffamily\bfseries{生成模型}}（generative models），另一类方法是{\small\sffamily\bfseries{判别模型}}（discriminative models），它直接对问题进行求解，可以更加灵活地定义不同的特征，而且不依赖过多的独立性假设。判别式模型在自然语言处理中也有广泛应用\cite{shannon1948mathematical}\cite{ng2002discriminative}。在本书的第四章也会使用到判别式模型。
 \item 从现在自然语言处理的前沿看，基于端到端学习的深度学习方法在很多任务中都取得了领先的性能。但是，深度学习及相关方法并没有在本章被涉及，这是由于笔者认为对问题的建模是处理自然语言处理问题的基础，对问题的描述并不会因为方法的改变而改变。因此，本章的内容没有太多的陷入到更加复杂的模型和算法设计中，相反，我们希望关注对基本问题的理解和描述。不过，对于本章中涉及的自然语言问题，一些前沿方法可以参考，包括：基于条件随机场和双向长短时记忆模型的序列标注模型\cite{lafferty2001conditional}\cite{huang2015bidirectional}\cite{ma2016end}、神经语言模型\cite{bengio2003neural}\cite{mikolov2010recurrent}、神经句法分析模型\cite{chen2014fast}\cite{zhu2015long}。

--- a/Book/Chapter3/Chapter3.tex
+++ b/Book/Chapter3/Chapter3.tex
 % !Mode:: "TeX:UTF-8"
 % !TEX encoding = UTF-8 Unicode
+\part{统计机器翻译}
 %----------------------------------------------------------------------------------------
 %	CHAPTER 3
 %----------------------------------------------------------------------------------------
@@ -34,14 +34,14 @@
 \end{figure}
 %-------------------------------------------
-\parinterval 上面的例子反映了人在做翻译时所使用的一些知识：首先，两种语言单词的顺序可能不一致，译文需要符合目标语的习惯，这也是我们常说翻译的\textbf{流畅度}问题（fluency）；其次，源语言单词需要准确的被翻译出来\footnote{当然，对于一些意译的情况或者虚词并不需要翻译。}，也是我们常说的翻译的\textbf{准确性}和\textbf{充分性}问题（adequacy）。为了达到以上目的，传统观点认为翻译需要过程包含三个步骤（图 \ref{fig:3-2}）
+\parinterval 上面的例子反映了人在做翻译时所使用的一些知识：首先，两种语言单词的顺序可能不一致，译文需要符合目标语的习惯，这也是我们常说翻译的{\small\sffamily\bfseries{流畅度}}问题（fluency）；其次，源语言单词需要准确地被翻译出来\footnote{当然，对于一些意译的情况或者虚词并不需要翻译。}，也是我们常说的翻译的{\small\sffamily\bfseries{准确性}}和{\small\sffamily\bfseries{充分性}}问题（adequacy）。为了达到以上目的，传统观点认为翻译过程需要包含三个步骤（图 \ref{fig:3-2}）
 \begin{itemize}
-\item \textbf{分析：}将源语言句子切分或者表示为能够处理的最小单元。基于词的翻译模型中，最小处理单元就是单词，因此在这里也可以简单地将分析理解为分词\footnote{在后续章节中会看到，分析也包括对语言结构的深入分析，但是这里为了突出基于单词的概念，因此把问题简化为最简单的情况。}。
+\item {\small\sffamily\bfseries{分析：}}将源语言句子切分或者表示为能够处理的最小单元。基于词的翻译模型中，最小处理单元就是单词，因此在这里也可以简单地将分析理解为分词\footnote{在后续章节中会看到，分析也包括对语言结构的深入分析，但是这里为了突出基于单词的概念，因此把问题简化为最简单的情况。}。
-\item \textbf{转换：}把源语句中的每个单词翻译成目标语单词。
+\item {\small\sffamily\bfseries{转换：}}把源语句中的每个单词翻译成目标语单词。
-\item \textbf{生成：}基于转换的结果，将目标语译文变成通顺且合乎语法的句子。
+\item {\small\sffamily\bfseries{生成：}}基于转换的结果，将目标语译文变成通顺且合乎语法的句子。
 \end{itemize}
 %----------------------------------------------
@@ -62,7 +62,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{如何进行翻译？}\index{Chapter3.2.1}
-\subsubsection{（一）人工翻译流程}\index{Chapter3.2.1.1}
+\subsubsection*{（一）人工翻译流程}\index{Chapter3.2.1.1}
 \parinterval 当我们翻译一个句子时，首先会快速地分析出句子的（单词）构成，然后根据以往的知识，得到每个词可能的翻译，最后利用对目标语的理解拼出来一个译文。尽管这个过程并不是严格来自心理学或者脑科学的相关结论，但至少可以帮助我们理解人在翻译时的思考方式。
 %----------------------------------------------
@@ -79,7 +79,7 @@
 \begin{itemize}
 \vspace{0.5em}
-\item 翻译知识的学习：对于输入的源语言句子，我们首先需要知道每个单词可能的翻译有什么，这些翻译被称为\textbf{翻译候选}。比如，汉语单词``对''可能的译文有``to''、``with''和``for''等。对于人来说，可以通过阅读、背诵、做题或者老师教等途径获得翻译知识，这些知识就包含了源语言与目标语言单词之间的对应关系。我们也把这个过程称之为学习过程。
+\item 翻译知识的学习：对于输入的源语言句子，我们首先需要知道每个单词可能的翻译有什么，这些翻译被称为{\small\sffamily\bfseries{翻译候选}}。比如，汉语单词``对''可能的译文有``to''、``with''和``for''等。对于人来说，可以通过阅读、背诵、做题或者老师教等途径获得翻译知识，这些知识就包含了源语言与目标语言单词之间的对应关系。我们也把这个过程称之为学习过程。
 \vspace{0.5em}
 \item 运用知识生成译文：当翻译一个从未见过的句子时，我们可以运用学习到的翻译知识，得到新的句子中每个单词的译文，并处理常见的单词搭配、主谓一致等问题，比如，我们知道``satisfied''后面常常使用介词``with''构成搭配，基于这些知识可以快速生成译文。
@@ -115,7 +115,7 @@
 \parinterval 对于第二个问题，尽管机器能够找到很多这样的译文选择路径，但它并不知道哪些路径是好的。说得再直白一些，简单的枚举路径实际上就是一个体力活，没有什么智能。因此计算机还需要再聪明一些，运用它能够``掌握''的知识判断翻译结果的好与坏。这一步是最具挑战的，当然也有很多思路。在统计机器翻译中，这个问题被定义为：设计一种统计模型，它可以给每个译文一个可能性，而这个可能性越高表明译文越接近人工翻译。如图\ref{fig:3-4}所示，每个单词翻译候选的右侧黑色框里的数字就是单词的翻译概率，使用这些单词的翻译概率，我们可以得到整句译文的概率（用符号P表示）。这样，我们用概率化的模型描述了每个翻译候选的可能性。基于每个翻译候选的可能性，机器翻译系统可以对所有的翻译``路径''进行打分，比如，图\ref{fig:3-4}中第一条路径的分数为0.042，第二条是0.006，以此类推。最后，系统可以选择分数最高的路径作为源语言句子的最终译文。
 \vspace{-0.5em}
 \subsubsection{（三）人工 vs. 机器}\index{Chapter3.2.1.3}
-\parinterval 人在翻译时的决策是非常确定并且快速的，但计算机处理这个问题时却充满了概率化的思想。当然它们也有类似的地方。首先，计算机使用统计模型的目的是把翻译知识变得可计算，并把这些``知识''储存在相关的模型参数中，这个模型和我们大脑的作用是类似的\footnote{这里，并不是要把统计模型等同于生物学或者认知科学上的人脑，我们指的是他们处理翻译问题时发挥的作用类似。}；其次，计算机对统计模型进行训练的过程相当于人类学习知识的过程，或者二者都可以称为学习；再有，计算机使用学习到的模型对新句子进行翻译的过程相当于人运用知识的过程。在统计机器翻译中，模型学习的过程被称为\textbf{训练}，目的是从双语平行数据中自动学习翻译``知识''；而使用模型处理新句子的过程被称为\textbf{解码}或\textbf{推断}，目的是使用学习到的知识对新的句子进行翻译。这也反映了机器翻译的两个核心步骤：训练和解码。图\ref{fig:3-4}的右侧标注在翻译过程中训练和解码的位置。最终，统计机器翻译的核心由三部分构成\ \dash \ 建模、训练和解码。本章后续内容会围绕这三个问题展开讨论。
+\parinterval 人在翻译时的决策是非常确定并且快速的，但计算机处理这个问题时却充满了概率化的思想。当然它们也有类似的地方。首先，计算机使用统计模型的目的是把翻译知识变得可计算，并把这些``知识''储存在相关的模型参数中，这个模型和我们大脑的作用是类似的\footnote{这里，并不是要把统计模型等同于生物学或者认知科学上的人脑，我们指的是它们处理翻译问题时发挥的作用类似。}；其次，计算机对统计模型进行训练的过程相当于人类学习知识的过程，或者二者都可以称为学习；再有，计算机使用学习到的模型对新句子进行翻译的过程相当于人运用知识的过程。在统计机器翻译中，模型学习的过程被称为{\small\sffamily\bfseries{训练}}，目的是从双语平行数据中自动学习翻译``知识''；而使用模型处理新句子的过程被称为{\small\sffamily\bfseries{解码}}或{\small\sffamily\bfseries{推断}}，目的是使用学习到的知识对新的句子进行翻译。这也反映了机器翻译的两个核心步骤：训练和解码。图\ref{fig:3-4}的右侧标注了翻译过程中训练和解码的位置。最终，统计机器翻译的核心由三部分构成\ \dash \ 建模、训练和解码。本章后续内容会围绕这三个问题展开讨论。
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{基本框架}\index{Chapter3.2.2}
@@ -124,9 +124,9 @@
 \vspace{0.5em}
 \begin{itemize}
-\item \textbf{训练}：从双语平行数据中学习翻译模型，记为$\textrm{P}(\mathbf{t}|\mathbf{s})$，其中$\mathbf{s}$表示源语言句子，$\mathbf{t}$表示目标语句子。$\textrm{P}(\mathbf{t}|\mathbf{s})$表示把$\mathbf{s}$翻译为$\mathbf{t}$的概率。简言之，这一步需要从大量的双语平行数据中学习到$\textrm{P}(\mathbf{t}|\mathbf{s})$的计算方法。
+\item {\small\sffamily\bfseries{训练}}：从双语平行数据中学习翻译模型，记为$\textrm{P}(\mathbf{t}|\mathbf{s})$，其中$\mathbf{s}$表示源语言句子，$\mathbf{t}$表示目标语句子。$\textrm{P}(\mathbf{t}|\mathbf{s})$表示把$\mathbf{s}$翻译为$\mathbf{t}$的概率。简言之，这一步需要从大量的双语平行数据中学习到$\textrm{P}(\mathbf{t}|\mathbf{s})$的计算方法。
 \vspace{0.5em}
-\item \textbf{解码}：当面对一个新的待翻译句子时，我们需要使用学习到的模型进行推断。推断可以被视为一个搜索和计算的过程，也就是，我们尽可能搜索更多的翻译结果，然后对每个翻译结果进行打分，最后选择得分最高的翻译结果作为输出。
+\item {\small\sffamily\bfseries{解码}}：当面对一个新的待翻译句子时，我们需要使用学习到的模型进行推断。推断可以被视为一个搜索和计算的过程，也就是，我们尽可能搜索更多的翻译结果，然后对每个翻译结果进行打分，最后选择得分最高的翻译结果作为输出。
 \end{itemize}
 %----------------------------------------------
 % 图3.6
@@ -196,7 +196,7 @@
 \label{eqC3.2-new}
 \end{eqnarray}
-\noindent 这里运算$|\cdot|$表示句子长度。类似的，可以得到``机器''和``translation''、``机器''和``look''\\的单词翻译概率：
+\noindent 这里运算$|\cdot|$表示句子长度。类似的，可以得到``机器''和``translation''、``机器''和``look''的单词翻译概率：
 \begin{eqnarray}
 \textrm{P}(\text{``机器''},\text{``translation''}; \mathbf{s},\mathbf{t})  & = & \frac{2}{63} \\
 \textrm{P}(\text{``机器''},\text{``look''}; \mathbf{s},\mathbf{t})  & =  & \frac{0}{63}
@@ -218,13 +218,13 @@
 \begin{example}
 两个汉英互译的句对
-\qquad\qquad\quad $\mathbf{s}^1$ = 机器\quad {\color{red}翻译}\; 就\; 是\; 用\; 计算机\; 来\; 进行\; {\color{red}翻译}
+\qquad\qquad \; $\mathbf{s}^1$ = 机器\quad {\color{red}翻译}\; 就\; 是\; 用\; 计算机\; 来\; 进行\; {\color{red}翻译}
-\qquad\qquad\quad $\mathbf{s}^1$ = machine\; {\color{red}translation}\; is\; just\; {\color{red}translation}\; by\; computer
+\qquad\qquad\; $\mathbf{t}^1$ = machine\; {\color{red}translation}\; is\; just\; {\color{red}translation}\; by\; computer
-\qquad\qquad\quad $\mathbf{s}^2$ = 那\quad 人工\quad {\color{red}翻译}\quad 呢\quad ?
+\qquad\qquad\; $\mathbf{s}^2$ = 那\quad 人工\quad {\color{red}翻译}\quad 呢\quad ?
-\qquad\qquad\quad $\mathbf{t}^2$ = so\; what\; is\; human\; {\color{red}translation}\; ?
+\qquad\qquad\; $\mathbf{t}^2$ = so\; what\; is\; human\; {\color{red}translation}\; ?
 \label{example3-2}
 \end{example}
@@ -238,7 +238,7 @@
 \label{eqC3.6-new}
 \end{eqnarray}
 }
-\parinterval 公式\ref{eqC3.6-new}所展示的计算过程很简单。分子是两个句对中``翻译''和``translation''共现次数的累计，分母是两个句对的源语言单词和目标语言单词的组合数的累加。显然，这个方法也很容易推广到处理更多句子的情况中，我们仅需要对每个句子的计数进行累加即可。
+\parinterval 公式\ref{eqC3.6-new}所展示的计算过程很简单，分子是两个句对中``翻译''和``translation''共现次数的累计，分母是两个句对的源语言单词和目标语言单词的组合数的累加。显然，这个方法也很容易推广到处理更多句子的情况中，我们仅需要对每个句子的计数进行累加即可。
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{句子级翻译模型}\index{Chapter3.2.4}
@@ -279,7 +279,7 @@
 \parinterval 回到设计$g(\mathbf{s},\mathbf{t})$的问题上。这里，我们采用``大题小作''的方法，这个技巧在第二章已经进行了充分的介绍。具体来说，直接建模句子之间的对应比较困难，但可以利用单词之间的对应来描述句子之间的对应关系。这就用到了上一小节所介绍的单词翻译概率。
-\parinterval 我们首先引入一个非常重要的概念\ \dash \ \textbf{词对齐}，它是统计机器翻译中最核心的概念之一。词对齐描述了平行句对中单词之间的对应关系，它体现了一种观点：本质上句子之间的对应是由词之间的对应表示的。当然，这个观点在神经机器翻译或者其它模型中可能会有不同的理解，但是翻译句子的过程中我们考虑词级的对应关系是符合我们对语言的认知的。图\ref{fig:3-7}展示了一个句对$\mathbf{s}$和$\mathbf{t}$，单词的右下标数字表示了该词在句中的位置，而虚线表示的是句子$\mathbf{s}$和$\mathbf{t}$中的词对齐关系。比如，``满意''的右下标数字5表示在句子$\mathbf{s}$中处于第5个位置，``satisfied''的右下标数字3表示在句子$\mathbf{t}$中处于第3个位置，``满意''和``satisfied''之间的虚线表示两个单词之间是对齐的。为方便描述，我们用二元组$(j,i)$来描述词对齐，它表示源语言句子的第$j$个单词对应目标语言句子的第$i$个单词，即单词$s_j$和$t_i$对应。通常，也会把$(j,i)$称作一条\textbf{词对齐连接}。图\ref{fig:3-7}中共有5条虚线，表示有5组单词之间的词对齐连接。我们把这些词对齐连接构成的集合作为词对齐的一种表示，记为$\mathbf{a}$，即$A={\{(1,1),(2,4),(3,5),(4,2)(5,3)}\}$。
+\parinterval 我们首先引入一个非常重要的概念\ \dash \ {\small\sffamily\bfseries{词对齐}}，它是统计机器翻译中最核心的概念之一。词对齐描述了平行句对中单词之间的对应关系，它体现了一种观点：本质上句子之间的对应是由词之间的对应表示的。当然，这个观点在神经机器翻译或者其它模型中可能会有不同的理解，但是翻译句子的过程中我们考虑词级的对应关系是符合我们对语言的认知的。图\ref{fig:3-7}展示了一个句对$\mathbf{s}$和$\mathbf{t}$，单词的右下标数字表示了该词在句中的位置，而虚线表示的是句子$\mathbf{s}$和$\mathbf{t}$中的词对齐关系。比如，``满意''的右下标数字5表示在句子$\mathbf{s}$中处于第5个位置，``satisfied''的右下标数字3表示在句子$\mathbf{t}$中处于第3个位置，``满意''和``satisfied''之间的虚线表示两个单词之间是对齐的。为方便描述，我们用二元组$(j,i)$来描述词对齐，它表示源语言句子的第$j$个单词对应目标语言句子的第$i$个单词，即单词$s_j$和$t_i$对应。通常，也会把$(j,i)$称作一条{\small\sffamily\bfseries{词对齐连接}}。图\ref{fig:3-7}中共有5条虚线，表示有5组单词之间的词对齐连接。我们把这些词对齐连接构成的集合作为词对齐的一种表示，记为$A$，即$A=\{(1,1),(2,4),(3,5),(4,2),(5,3)\}$。
 %----------------------------------------------
 % 图3.11
 \begin{figure}[htp]
@@ -335,7 +335,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 \label{eqC3.11-new}
 \end{eqnarray}
-\parinterval 如图\ref{fig:3-9}所示，语言模型$\textrm{P}_{\textrm{lm}}(\mathbf{t})$分别给$\mathbf{t}^{'}$和$\mathbf{t}^{''}$赋予0.0107和0.0009的概率，这表明句子$\mathbf{t}^{'}$更符合英文的表达，这与我们的期望是相吻合的。它们再分别乘以$\prod_{j,i \in \widehat{A}}{\textrm{P}(s_j}$ \\ $,t_i)$的值，就得到公式\ref{eqC3.13-new}定义的函数$g(\cdot)$的值。显然句子$\mathbf{t}^{'}$的分数更高，同时它也是我们希望得到的翻译结果。至此，我们完成了对函数$g(\mathbf{s},\mathbf{t})$的一个简单定义，把它带入公式\ref{eqC3.7-new}就得到了同时考虑准确性和流畅性的句子级统计翻译模型。
+\parinterval 如图\ref{fig:3-9}所示，语言模型$\textrm{P}_{\textrm{lm}}(\mathbf{t})$分别给$\mathbf{t}^{'}$和$\mathbf{t}^{''}$赋予0.0107和0.0009的概率，这表明句子$\mathbf{t}^{'}$更符合英文的表达，这与我们的期望是相吻合的。它们再分别乘以$\prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}$的值，就得到公式\ref{eqC3.11-new}定义的函数$g(\cdot)$的值。显然句子$\mathbf{t}^{'}$的分数更高，同时它也是我们希望得到的翻译结果。至此，我们完成了对函数$g(\mathbf{s},\mathbf{t})$的一个简单定义，把它代入公式\ref{eqC3.7-new}就得到了同时考虑准确性和流畅性的句子级统计翻译模型。
 %----------------------------------------------
 % 图3.14
 \begin{figure}[htp]
@@ -351,13 +351,13 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 \subsection{解码}\index{Chapter3.2.5}
 \label{sec:simple-decoding}
-\parinterval \textbf{解码}是指在得到翻译模型后，对于新输入的句子生成最佳译文的过程。具体来说，当给定任意的源语言句子$\mathbf{s}$，解码系统要找到翻译概率最大的目标语译文$\hat{\mathbf{t}}$。这个过程可以被形式化描述为：
+\parinterval {\small\sffamily\bfseries{解码}}是指在得到翻译模型后，对于新输入的句子生成最佳译文的过程。具体来说，当给定任意的源语言句子$\mathbf{s}$，解码系统要找到翻译概率最大的目标语译文$\hat{\mathbf{t}}$。这个过程可以被形式化描述为：
 \begin{eqnarray}
 \widehat{\mathbf{t}}=\argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})
 \label{eqC3.12-new}
 \end{eqnarray}
-\noindent  其中$\argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})$表示找到使$\textrm{P}(\mathbf{t}|\mathbf{s})$达到最大时的译文$\mathbf{t}$。结合上一小节中关于$\textrm{P}(\mathbf{t}|\mathbf{s})$\\的定义，把公式\ref{eqC3.7-new}带入公式\ref{eqC3.12-new}得到：
+\noindent  其中$\argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})$表示找到使$\textrm{P}(\mathbf{t}|\mathbf{s})$达到最大时的译文$\mathbf{t}$。结合上一小节中关于$\textrm{P}(\mathbf{t}|\mathbf{s})$的定义，把公式\ref{eqC3.7-new}带入公式\ref{eqC3.12-new}得到：
 \begin{eqnarray}
 \widehat{\mathbf{t}}=\argmax_{\mathbf{t}}\frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}^{'}}g(\mathbf{s},\mathbf{t}^{'})}
 \label{eqC3.13-new}
@@ -420,7 +420,6 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 \section{基于词的翻译建模}\index{Chapter3.3}
 \parinterval 在\ref{sec:simple-mt-example}节中，我们实现了一个简单的基于词的统计机器翻译模型，内容涉及建模、训练和解码。但是，还有很多问题还没有进行深入讨论，比如，如何处理空翻译？如何对调序问题进行建模？如何用更严密的数学模型描述翻译过程？如何对更加复杂的统计模型进行训练？等等。针对以上问题，本节将系统地介绍IBM统计机器翻译模型。作为经典的机器翻译模型，对IBM模型的学习将帮助我们建立对自然语言处理问题的系统化建模思想，特别是对问题的数学描述方法将会成为理解本书后续内容的基础工具。
 \subsection{噪声信道模型}\index{Chapter3.3.1}
 \parinterval 首先，重新思考一下人类进行翻译的过程。对于给定的源语句$\mathbf{s}$，人不会像计算机一样尝试很多的可能，而是快速准确的翻译出一个或者少数几个正确的译文。因此在人看来除了正确的译文外，其它的翻译都是不正确的，或者说除了少数的译文人甚至都不会考虑太多其它的可能性。但是，在统计机器翻译的世界里，没有译文是不可能的。换句话说，对于源语言句子$\mathbf{s}$，所有可能的目标语词串$\mathbf{t}$都是可能的译文，只是可能性大小不同。即每对$(\mathbf{s},\mathbf{t})$都有一个概率值$\textrm{P}(\mathbf{t}|\mathbf{s})$来描述$\mathbf{s}$翻译为$\mathbf{t}$的好与坏（图\ref{fig:3-12}）。
@@ -434,7 +433,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 \end{figure}
 %---------------------------
-\parinterval IBM模型也是建立在如上统计模型之上。具体来说，IBM模型的基础是\textbf{噪声信道模型(Noise Channel Model)}，它是由香农在上世纪40年代末提出来的\cite{shannon1949communication}，并于上世纪80年代应用在语言识别领域，后来又被Brown等人用于统计机器翻译中\cite{brown1990statistical}。
+\parinterval IBM模型也是建立在如上统计模型之上。具体来说，IBM模型的基础是{\small\sffamily\bfseries{噪声信道模型}}(Noisy Channel Model)，它是由香农在上世纪40年代末提出来的\cite{shannon1949communication}，并于上世纪80年代应用在语音识别领域，后来又被Brown等人用于统计机器翻译中\cite{brown1990statistical}。
 \parinterval 在噪声信道模型中，源语言句子$\mathbf{s}$（信宿）被看作是由目标语言句子$\mathbf{t}$（信源）经过一个有噪声的信道得到的。如果知道了$\mathbf{s}$和信道的性质，我们可以通过$\textrm{P}(\mathbf{t}|\mathbf{s})$得到信源的信息，这个过程如图\ref{fig:3-13}所示。
@@ -448,7 +447,8 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 \end{figure}
 %---------------------------
-\parinterval 举个例子。对于汉译英的翻译任务，汉语句子$\mathbf{s}$可以被看作是英语句子$\mathbf{t}$加入噪声通过信道后得到的结果。换句话说，英语句子经过噪声-信道传输时发生了变化，在信道的输出端呈现为汉语句子。于是我们需要根据观察到的汉语特征，通过概率$\textrm{P}(\mathbf{t}|\mathbf{s})$猜测最为可能的英语句子。这个找到最可能的目标语句（信源）的过程也被称为\textbf{解码（decoding）}。直到今天，解码这个概念也被广泛的使用在机器翻译及相关任务中。这个过程也可以表述为：给定输入$\mathbf{s}$，找到最可能的输出$\mathbf{t}$，使得$\textrm{P}(\mathbf{t}|\mathbf{s})$达到最大：
+\parinterval 举个例子。对于汉译英的翻译任务，汉语句子$\mathbf{s}$可以被看作是英语句子$\mathbf{t}$加入噪声通过信道后得到的结果。换句话说，英语句子经过噪声-信道传输时发生了变化，在信道的输出端呈现为汉语句子。于是我们需要根据观察到的汉语特征，通过概率$\textrm{P}(\mathbf{t}|\mathbf{s})$猜测最为可能的英语句子。这个找到最可能的目标语句（信源）的过程也被称为
+{\small\sffamily\bfseries{解码}}（decoding）。直到今天，解码这个概念也被广泛的使用在机器翻译及相关任务中。这个过程也可以表述为：给定输入$\mathbf{s}$，找到最可能的输出$\mathbf{t}$，使得$\textrm{P}(\mathbf{t}|\mathbf{s})$达到最大：
 \begin{eqnarray}
 \widehat{\mathbf{t}}=\argmax_{\mathbf{t}}\textrm{P}(\mathbf{t}|\mathbf{s})
 \label{eqC3.15-new}
@@ -479,11 +479,11 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 \begin{itemize}
 \vspace{0.5em}
-\item \textbf{建模（modeling）}：如何建立$\textrm{P}(\mathbf{s}|\mathbf{t})$和$\textrm{P}(\mathbf{t})$的数学模型。换句话说，需要用可计算的方式对翻译问题和语言建模问题进行描述，这也是最核心的问题。
+\item {\small\sffamily\bfseries{建模}}（modeling）：如何建立$\textrm{P}(\mathbf{s}|\mathbf{t})$和$\textrm{P}(\mathbf{t})$的数学模型。换句话说，需要用可计算的方式对翻译问题和语言建模问题进行描述，这也是最核心的问题。
 \vspace{0.5em}
-\item \textbf{训练（training）}：如何获得$\textrm{P}(\mathbf{s}|\mathbf{t})$和$\textrm{P}(\mathbf{t})$所需的参数。即从数据中得到模型的最优参数。
+\item {\small\sffamily\bfseries{训练}}（training）：如何获得$\textrm{P}(\mathbf{s}|\mathbf{t})$和$\textrm{P}(\mathbf{t})$所需的参数。即从数据中得到模型的最优参数。
 \vspace{0.5em}
-\item \textbf{解码（decoding）}：如何完成搜索最优解的过程。即完成$\argmax$。
+\item {\small\sffamily\bfseries{解码}}（decoding）：如何完成搜索最优解的过程。即完成$\argmax$。
 \vspace{0.5em}
 \end{itemize}
@@ -502,11 +502,11 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 \subsubsection{词对齐}\index{Chapter3.3.2.1}
-\parinterval IBM模型中有一个非常基础的假设是词对齐假设（或单词对齐假设）。\textbf{词对齐（word alignment）}描述了源语言句子和目标语句子之间单词级别的对应。具体地说，给定源语句子$\mathbf{s}$和目标语译文$\mathbf{t}$，其中$\mathbf{s}$由$s_1$到$s_m$共$m$个单词组成，$\mathbf{t}$由$t_1$到$t_l$共$l$个单词组成。IBM模型假设词对齐满足下述两个条件。
+\parinterval IBM模型中有一个非常基础的假设是词对齐假设（或单词对齐假设）。{\small\sffamily\bfseries{词对齐}}（word alignment）描述了源语言句子和目标语句子之间单词级别的对应。具体地说，给定源语句子$\mathbf{s}$和目标语译文$\mathbf{t}$，其中$\mathbf{s}$由$s_1$到$s_m$共$m$个单词组成，$\mathbf{t}$由$t_1$到$t_l$共$l$个单词组成。IBM模型假设词对齐满足下述两个条件。
 \begin{itemize}
 \vspace{0.5em}
-\item 一个源语言单词只能对应一个目标语单词。在图\ref{fig:3-15}表示的例子中，(a)和\\(c)都满足该条件，尽管(c)中的``谢谢''和``你''都对应``thanks''，但并不违背条件。而(b)不满足条件，因为``谢谢''同时对应到了两个目标语单词上。这个约束条件也导致这里的词对齐变成一种\textbf{非对称的词对齐}，因为它只对源语言做了约束，但是目标语言没有。使用这样的约束的目的是为了减少建模的复杂度。在后来的方法中也提出了双向词对齐，用于建模一个源语言单词对应到多个目标语单词的情况。
+\item 一个源语言单词只能对应一个目标语单词。在图\ref{fig:3-15}表示的例子中，(a)和(c)都满足该条件，尽管(c)中的``谢谢''和``你''都对应``thanks''，但并不违背条件。而(b)不满足条件，因为``谢谢''同时对应到了两个目标语单词上。这个约束条件也导致这里的词对齐变成一种{\small\sffamily\bfseries{非对称的词对齐}}，因为它只对源语言做了约束，但是目标语言没有。使用这样的约束的目的是为了减少建模的复杂度。在后来的方法中也提出了双向词对齐，用于建模一个源语言单词对应到多个目标语单词的情况。
 %----------------------------------------------
 % 图3.21
 \begin{figure}[htp]
@@ -517,7 +517,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 \end{figure}
 %---------------------------
 \vspace{0.5em}
-\item 源语言单词可以翻译为空，这时它对应到一个虚拟或伪造的目标语单词$t_0$。在图\ref{fig:3-16}所示的例子中，``在''没有对应到``on the table''中的任意一个词，而是把它对应到$t_0$上。这个条件保证了所有的源语言单词都能找到一个目标语单词对应。这个条件也很好的引入了\textbf{空对齐}的思想，即源语言单词不对应任何真实存在的单词的情况。而这种空对齐的情况在翻译中是频繁出现的，比如虚词的翻译。
+\item 源语言单词可以翻译为空，这时它对应到一个虚拟或伪造的目标语单词$t_0$。在图\ref{fig:3-16}所示的例子中，``在''没有对应到``on the table''中的任意一个词，而是把它对应到$t_0$上。这个条件保证了所有的源语言单词都能找到一个目标语单词对应。这个条件也很好的引入了{\small\sffamily\bfseries{空对齐}}的思想，即源语言单词不对应任何真实存在的单词的情况。而这种空对齐的情况在翻译中是频繁出现的，比如虚词的翻译。
 %----------------------------------------------
 % 图3.21
 \begin{figure}[htp]
@@ -529,7 +529,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 %---------------------------
 \end{itemize}
-\parinterval 通常，我们把词对齐记为$\mathbf{a}$，它由$a_1$到$a_m$共$m$个词对齐连接组成，即$\mathbf{a}=a_1...a_m$，其中$m$表示源语言句子长度。$a_j$表示第$j$个源语单词$s_j$对应的目标语单词的位置。在图\ref{fig:3-16}的例子中，词对齐关系可以记为$a_1=0, a_2=3, a_3=1$。它表示第1个源语单词``在''对应到目标语译文的第0个位置，第2个源语单词``桌子''对应到目标语译文的第3个位置是，第3个源语单词``上''对应到目标语译文的第1个位置。 \\ \\
+\parinterval 通常，我们把词对齐记为$\mathbf{a}$，它由$a_1$到$a_m$共$m$个词对齐连接组成，即$\mathbf{a}=a_1...a_m$，其中$m$表示源语言句子长度。$a_j$表示第$j$个源语单词$s_j$对应的目标语单词的位置。在图\ref{fig:3-16}的例子中，词对齐关系可以记为$a_1=0, a_2=3, a_3=1$。它表示第1个源语单词``在''对应到目标语译文的第0个位置，第2个源语单词``桌子''对应到目标语译文的第3个位置，第3个源语单词``上''对应到目标语译文的第1个位置。
 \subsubsection{基于词对齐的翻译模型}\index{Chapter3.3.2.2}
@@ -768,7 +768,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
 %%%%%%%%%%%%%%%%%%%%%%
 \subsubsection {（二）优化}\index{Chapter3.4.4.2}
-\parinterval 我们已经把IBM模型的参数训练问题定义为带约束的目标函数优化问题。由于目标函数是可微分函数，解决这类问题的一种常用手法是把带约束的优化问题转化为不带约束的优化问题。这里用到了\textbf{拉格朗日乘数法（The Lagrange Multiplier Method）}，它的基本思想是把含有$n$个变量和$m$个约束条件的优化问题转化为含有$n+m$个变量的无约束优化问题。
+\parinterval 我们已经把IBM模型的参数训练问题定义为带约束的目标函数优化问题。由于目标函数是可微分函数，解决这类问题的一种常用手法是把带约束的优化问题转化为不带约束的优化问题。这里用到了{\small\sffamily\bfseries{拉格朗日乘数法}}（The Lagrange Multiplier Method），它的基本思想是把含有$n$个变量和$m$个约束条件的优化问题转化为含有$n+m$个变量的无约束优化问题。
 \parinterval 这里，我们的目标是$\max(\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t}))$，约束条件是对于任意的目标语单词$t_y$有\\$\sum_{s_x}{\textrm{P}(s_x|t_y)}=1$。根据拉格朗日乘数法，可以把上述优化问题重新定义为最大化如下拉格朗日函数：
 \begin{eqnarray}
@@ -838,7 +838,7 @@ f(s_u|t_v) = \lambda_{t_v}^{-1} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m
 \label{eqC3.40-new}
 \end{eqnarray}
-\noindent \hspace{2em}  通过采用一个非常经典的\textbf{期望最大化（Expectation Maximization）}方法，简称EM方法（或算法），我们仍可以利用上式迭代地计算$f(s_u|t_v)$，使其最终收敛到最优值。该方法的思想是：用当前的参数，求一个似然函数的期望，之后最大化这个期望同时得到新的一组参数的值。对于IBM模型来说，其迭代过程就是反复使用公式1.39，具体如下图。
+\noindent \hspace{2em}  通过采用一个非常经典的{\small\sffamily\bfseries{期望最大化}}（Expectation Maximization）方法，简称EM方法（或算法），我们仍可以利用上式迭代地计算$f(s_u|t_v)$，使其最终收敛到最优值。该方法的思想是：用当前的参数，求一个似然函数的期望，之后最大化这个期望同时得到新的一组参数的值。对于IBM模型来说，其迭代过程就是反复使用公式\ref{eqC3.40-new}，具体如下图。
 %----------------------------------------------
 % 图3.28
 \begin{figure}[htp]
@@ -849,7 +849,7 @@ f(s_u|t_v) = \lambda_{t_v}^{-1} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m
 \end{figure}
 %---------------------------
-\noindent \hspace{2em} 为了化简$f(s_u|t_v)$的计算，在此对公式\ref{eqC3.40-new}进行了重新组织，见下图。红色部分表示翻译概率P$(\mathbf{s}|\mathbf{t})$；蓝色部分表示$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$中配对的总次数，即``$t_v$翻译为$s_u$''在所有对齐中出现的次数；绿色部分表示$f(s_u|t_v)$对于所有的$t_i$的相对值，即``$t_v$翻译为$s_u$''在所有对齐中出现的相对概率；蓝色与绿色部分相乘表示``$t_v$翻译为$s_u$''这个事件出现次数的期望的估计，称之为\textbf{期望频次(expected count)}。
+\noindent \hspace{2em} 为了化简$f(s_u|t_v)$的计算，在此对公式\ref{eqC3.40-new}进行了重新组织，见下图。红色部分表示翻译概率P$(\mathbf{s}|\mathbf{t})$；蓝色部分表示$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$中配对的总次数，即``$t_v$翻译为$s_u$''在所有对齐中出现的次数；绿色部分表示$f(s_u|t_v)$对于所有的$t_i$的相对值，即``$t_v$翻译为$s_u$''在所有对齐中出现的相对概率；蓝色与绿色部分相乘表示``$t_v$翻译为$s_u$''这个事件出现次数的期望的估计，称之为{\small\sffamily\bfseries{期望频次}}(expected count)。
 %----------------------------------------------
 % 图3.29
 \begin{figure}[htp]
@@ -961,7 +961,7 @@ a(i|j,m,l) &=\frac{\sum_{k=0}^{K}c_{\mathbb{E}}(i|j;\mathbf{s}^{[k]},\mathbf{t}^
 \parinterval 从前面的介绍可知，IBM模型1和模型2都把不同的源文单词看作相互独立的单元来进行词对齐和翻译。换句话说，即使源语中的某个短语中的两个单词都对齐到同一个目标语单词，它们之间也是相互独立的。这样模型1和模型2对于多个源语单词对齐到同一个目标语单词的情况并不能很好地描述。
-\parinterval 这里将会给出另一个翻译模型，能在一定程度上解决上面提到的问题。我们把目标语言译文生成源文的过程分解为如下几个步骤：首先，确定每个目标语言单词生成源语言单词的个数，这里把它称为\textbf{产出率}或\textbf{繁衍率（fertility）}；其次，决定译文中每个单词生成的源语单词都是什么，即决定生成的第一个源语单词是什么，生成的第二个源语单词是什么，以此类推。这样每个目标语单词就对应了一个源语单词列表；最后把各组源语单词列表中的每个单词都放置到源语句子的某个位置上，完成目标语言译文到源语言句子的生成。
+\parinterval 这里将会给出另一个翻译模型，能在一定程度上解决上面提到的问题。我们把目标语言译文生成源文的过程分解为如下几个步骤：首先，确定每个目标语言单词生成源语言单词的个数，这里把它称为{\small\sffamily\bfseries{产出率}}或{\small\sffamily\bfseries{繁衍率}}（fertility）；其次，决定译文中每个单词生成的源语单词都是什么，即决定生成的第一个源语单词是什么，生成的第二个源语单词是什么，以此类推。这样每个目标语单词就对应了一个源语单词列表；最后把各组源语单词列表中的每个单词都放置到源语句子的某个位置上，完成目标语言译文到源语言句子的生成。
 %----------------------------------------------
 % 图3.5.1
 \begin{figure}[htp]
@@ -1070,7 +1070,7 @@ p_0+p_1                            & = & 1 \label{eqC3.62-new}
 \parinterval IBM模型3仍然存在问题，比如不能很好的处理一个目标语言单词生成多个源语言单词的情况。这个问题在模型1和模型2中也存在。如果一个目标语言单词对应多个源语言单词，往往这些源语言单词构成短语或搭配。但是模型1-3都把这些源语单词看成独立的单元，而实际上它们应该被看成一个翻译的整体。这就造成了在模型1-3中，这些源语言单词可能``分散''开。为了解决这个问题，模型4对模型3进行了进一步修改。
-\parinterval 为了更清楚的阐述，这里引入新的术语\dash 概念单元。词对齐又可以被看作概念（concept，简记为cept.）之间的对应。这里的概念是指具有独立语法或语义功能的一组单词。依照Brown等人的表示方法，我们把概念记为cept.。每个句子都可以被表示成一系列的cept.。这里要注意的是，源语言句子中的cept.数量不一定等于目标句子中的cept.数量。因为有些cept.可以为空，我们把那些空对的单词看作空cept.。比如，在图\ref{fig:3-32}的实例中，``了''就是（对应）一个空cept.。
+\parinterval 为了更清楚的阐述，这里引入新的术语\ \dash \ 概念单元。词对齐又可以被看作概念（concept，简记为cept.）之间的对应。这里的概念是指具有独立语法或语义功能的一组单词。依照Brown等人的表示方法，我们把概念记为cept.。每个句子都可以被表示成一系列的cept.。这里要注意的是，源语言句子中的cept.数量不一定等于目标句子中的cept.数量。因为有些cept.可以为空，我们把那些空对的单词看作空cept.。比如，在图\ref{fig:3-32}的实例中，``了''就是（对应）一个空cept.。
 %----------------------------------------------
 % 图3.6.1
 \begin{figure}[htp]
@@ -1146,7 +1146,7 @@ p_0+p_1                            & = & 1 \label{eqC3.62-new}
 \subsubsection{隐马尔可夫模型}
-\parinterval \textbf{隐马尔可夫模型（Hidden Markov Model，HMM）}是一个经典的机器学习模型，它在语音识别、自然语言处理等领域得到了非常广泛的应用。其本质是一个概率模型，用来描述一个含有隐含参数的马尔可夫过程，简单来说，是用来描述一个系统隐含状态的转移和可见状态的概率\footnote{https://zh.wikipedia.org/zh-hans/隐马尔可夫模型}。
+\parinterval {\small\sffamily\bfseries{隐马尔可夫模型}}（Hidden Markov Model，HMM）是一个经典的机器学习模型，它在语音识别、自然语言处理等领域得到了非常广泛的应用。其本质是一个概率模型，用来描述一个含有隐含参数的马尔可夫过程，简单来说，是用来描述一个系统隐含状态的转移和可见状态的概率\footnote{https://zh.wikipedia.org/zh-hans/隐马尔可夫模型}。
 \parinterval 我们用一个简单的例子来对这些概念进行说明。假设你有三枚质地不同的硬币A、B、C，这三个硬币抛出正面的概率分别为0.3、0.5、0.7。之后我们开始抛硬币，随机从三个硬币里挑一个，挑到每一个硬币的概率都是  1/3 。不停的重复上述过程，我们会得到一串硬币的正反序列，如：抛硬币6次，得到：正 正 反 反 正 反。
@@ -1210,7 +1210,7 @@ p_0+p_1                            & = & 1 \label{eqC3.62-new}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{解码和训练}\index{Chapter3.5.5}
-\parinterval 和IBM模型1-2一样，IBM模型3-5和隐马尔可夫模型的解码可以直接使用\ref{sec:sentence-level-translation}节所描述的方法。基本思路是对译文自左向右生成，每次扩展一个源语言单词的翻译，即把源语言单词的译文放到已经生成的译文的右侧。每次扩展可以选择不同的源语言单词或者同一个源语言单词的不同翻译候选，这样就可以得到多个不同的扩展译文。在这个过程中，同时计算翻译模型和语言模型的得分，对每个得到译文候选打分。最终，保留一个或者多个译文。这个过程重复执行直至所有源语言单词被翻译完。
+\parinterval 和IBM模型1-2一样，IBM模型3-5和隐马尔可夫模型的解码可以直接使用\ref{sec:sentence-level-translation}\\节所描述的方法。基本思路是对译文自左向右生成，每次扩展一个源语言单词的翻译，即把源语言单词的译文放到已经生成的译文的右侧。每次扩展可以选择不同的源语言单词或者同一个源语言单词的不同翻译候选，这样就可以得到多个不同的扩展译文。在这个过程中，同时计算翻译模型和语言模型的得分，对每个得到译文候选打分。最终，保留一个或者多个译文。这个过程重复执行直至所有源语言单词被翻译完。
 \parinterval 类似的，IBM模型3-5和隐马尔可夫模型也都可以使用期望最大化（EM）方法进行模型训练。相关数学推导可参考附录\ref{appendix-B}的内容。通常，我们会使用这些模型获得双语句对间的词对齐结果，比如著名的GIZA++工具。这时，往往会使用多个模型，把简单的模型训练后的参数作为初始值送给后面更加复杂的模型。比如，先用IBM模型1训练，之后把参数送给IBM模型2，再训练，之后把参数送给隐马尔可夫模型等。值得注意的是，并不是所有的模型使用EM算法都能找到全局最优解。特别是IBM模型3-5的训练中使用一些剪枝和近似的方法，优化的真实目标函数会更加复杂。不过，IBM模型1是一个凸函数（convex function），因此理论上使用EM类的方法是能找到全局最优解的。更实际的好处是，IBM模型1训练的最终结果与参数的初始化过程无关。这也是为什么在使用IBM系列模型时，往往会使用IBM模型1作为起始模型的原因。
@@ -1221,17 +1221,17 @@ p_0+p_1                            & = & 1 \label{eqC3.62-new}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{词对齐及对称化}\index{Chapter3.6.1}
-\parinterval IBM的五个模型都是基于一个词对齐的假设\ \dash \ 一个源语言单词最多只能对齐到一个目标语言单词。这个约束大大化简了IBM模型的建模。最初，Brown等人提出这个假设可能是因为在法英翻译中一对多的对齐情况并不多见，这个假设带来的问题也不是那么严重。但是，在像汉英翻译这样的任务中，一个汉语单词对应多个英语单词的翻译很常见，这时IBM模型的词对齐假设就表现出了明显的问题。比如在翻译``我 会 试一试 。''$\to$``I will have a try .''时，IBM模型根本不能把单词``试一试''对齐到三个单词``have a try''，因而可能无法得到正确的翻译结果。
+\parinterval IBM的五个模型都是基于一个词对齐的假设\ \dash \ 一个源语言单词最多只能对齐到一个目标语言单词。这个约束大大化简了IBM模型的建模。最初，Brown等人提出这个假设可能是因为在法英翻译中一对多的对齐情况并不多见，这个假设带来的问题也不是那么严重。但是，在像汉英翻译这样的任务中，一个汉语单词对应多个英语单词的翻译很常见，这时IBM模型的词对齐假设就表现出了明显的问题。比如在翻译``我 会 试一试 。''\ $\to$ \ ``I will have a try .''时，IBM模型根本不能把单词``试一试''对齐到三个单词``have a try''，因而可能无法得到正确的翻译结果。
 \parinterval 本质上说，IBM模型的词对齐的``不完整''问题是IBM模型本身的缺陷。解决这个问题有很多思路，第一种方法就是，反向训练后，合并源语言单词，然后再正向训练。这里用汉英翻译为例来解释这个方法。首先反向训练，就是把英语当作待翻译语言，而把汉语当作目标语言进行训练（参数估计）。这样可以得到一个词对齐结果（参数估计的中间结果）。在这个词对齐结果里面，一个汉语单词可对应多个英语单词。之后，扫描每个英语句子，如果有多个英语单词对应同一个汉语单词，就把这些英语单词合并成一个英语单词。处理完之后，再把汉语当作源语言，把英语当作目标语言进行训练。这样就可以把一个汉语词对应到合并的英语单词上。虽然从模型上看，还是一个汉语单词对应一个英语``单词''，但实质上已经把这个汉语单词对应到多个英语单词上了。训练完之后，再利用这些参数进行翻译（解码）时，就能把一个中文单词翻译成多个英文单词了。但是反向训练后再训练也存在一些问题。首先，合并英语单词会使数据变得更稀疏，使训练不充分。其次，由于IBM模型的词对齐结果并不是高精度的，利用它的词对齐结果来合并一些英文单词可能造成严重的错误，比如：把本来很独立的几个单词合在了一起。因此，此方法也并不完美。具体使用时还要考虑实际需要和问题的严重程度来决定是否使用这个方法。
-\parinterval 另一种方法是双向对齐之后进行词对齐\textbf{对称化（Symmetrization）}。这个方法可以帮助我们在IBM词对齐的基础上获得对称的词对齐结果。思路很简单，用正向（汉语为源语言，英语为目标语言）和反向（汉语为目标语言，英语为源语言）同时训练。这样可以得到两个词对齐结果。然后利用一些启发性方法用这两个词对齐生成对称的结果（比如，取``并集''、``交集''等），这样就可以得到包含1对多和多对多的词对齐结果。比如，在基于短语的统计机器翻译中已经很成功地使用了这种词对齐信息进行短语的获取。直到今天，对称化仍然是很多自然语言处理系统中的一个关键步骤。
+\parinterval 另一种方法是双向对齐之后进行词对齐{\small\sffamily\bfseries{对称化}}（Symmetrization）。这个方法可以帮助我们在IBM词对齐的基础上获得对称的词对齐结果。思路很简单，用正向（汉语为源语言，英语为目标语言）和反向（汉语为目标语言，英语为源语言）同时训练。这样可以得到两个词对齐结果。然后利用一些启发性方法用这两个词对齐生成对称的结果（比如，取``并集''、``交集''等），这样就可以得到包含一对多和多对多的词对齐结果。比如，在基于短语的统计机器翻译中已经很成功地使用了这种词对齐信息进行短语的获取。直到今天，对称化仍然是很多自然语言处理系统中的一个关键步骤。
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{Deficiency}\index{Chapter3.6.2}
 \parinterval Deficiency问题是指翻译模型会把一部分概率分配给一些根本不存在的源语言字符串。如果用$\textrm{P}(\textrm{well}|\mathbf{t})$表示$\textrm{P}(\mathbf{s}| \mathbf{t})$在所有的正确的（可以理解为语法上正确的）$\mathbf{s}$上的和，即
 \begin{eqnarray}
-\textrm{P}(\textrm{well}|\mathbf{t})=\sum_{s\textrm{\;is\;well\;formed}}{\textrm{P}(\mathbf{s}| \mathbf{t})}
+\textrm{P}(\textrm{well}|\mathbf{t})=\sum_{\mathbf{s}\textrm{\;is\;well\;formed}}{\textrm{P}(\mathbf{s}| \mathbf{t})}
 \label{eqC3.70-new}
 \end{eqnarray}
@@ -1257,7 +1257,7 @@ p_0+p_1                            & = & 1 \label{eqC3.62-new}
 \subsection{其它问题}\index{Chapter3.6.5}
-\parinterval 模型5的意义？模型5的提出是为了消除了模型3和模型4的Deficiency问题。Defic\\ -iency问题的本质是，$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$在所有合理的对齐上概率和不为1。但是，在统计机器翻译中我们更关心是哪个对齐$\mathbf{a}$使$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$达到最大，即使$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$不符合概率分布的定义，也并不影响我们寻找理想的对齐$\mathbf{a}$。从这个工程的角度说，$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$不归一并不是一个十分严重的问题。遗憾的是，实际上至今也太多对IBM模型3和模型4中的Deficiency问题进行过系统的实验和分析，这个问题到底有多严重并没有定论。当然用模型5是可以解决这个问题。但是如果用一个非常复杂的模型去解决了一个并不产生严重后果的问题，那这个模型也就没有太大意义了（从实践的角度）。
+\parinterval 模型5的意义？模型5的提出是为了消除模型3和模型4的Deficiency问题。Deficiency问题的本质是，$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$在所有合理的对齐上概率和不为1。但是，在统计机器翻译中我们更关心的是哪个对齐$\mathbf{a}$使$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$达到最大，即使$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$不符合概率分布的定义，也并不影响我们寻找理想的对齐$\mathbf{a}$。从这个工程的角度说，$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$不归一并不是一个十分严重的问题。遗憾的是，实际上至今也没有太多工作对IBM模型3和模型4中的Deficiency问题进行过系统的实验和分析，这个问题到底有多严重并没有定论。当然用模型5是可以解决这个问题。但是如果用一个非常复杂的模型去解决了一个并不产生严重后果的问题，那这个模型也就没有太大意义了（从实践的角度）。
 \parinterval 概念（Cept.）的意义？经过前面的分析可知，IBM模型的词对齐模型是使用了cept.这个概念。但是实质上，在IBM模型中使用的cept.最多只能对应一个目标语言单词（模型并没有用到源语言cept.的概念）。因此可以直接用单词代替cept.。这样，即使不引入cept.的概念，也并不影响IBM模型的建模。实际上，cept.的引入确实可以帮助我们从语法和语义的角度解释词对齐过程。不过，这个方法在IBM模型中的效果究竟如何也没有定论。

--- a/Book/Chapter3/Figures/figure-EM-algorithm-flow-chart.tex
+++ b/Book/Chapter3/Figures/figure-EM-algorithm-flow-chart.tex
@@ -6,7 +6,7 @@
 %%% outline
 %-------------------------------------------------------------------------
 \begin{tikzpicture}
-\node [anchor=north west] (line1) at (0,0) {\textbf{IBM模型1的训练（EM算法）}};
+\node [anchor=north west] (line1) at (0,0) {\small\sffamily\bfseries{IBM模型1的训练（EM算法）}};
 \node [anchor=north west] (line2) at ([yshift=-0.3em]line1.south west) {输入: 平行语料${(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})}$};
 \node [anchor=north west] (line3) at ([yshift=-0.1em]line2.south west) {输出: 参数$f(\cdot|\cdot)$的最优值};
 \node [anchor=north west] (line4) at ([yshift=-0.1em]line3.south west) {1: \textbf{Function} \textsc{TrainItWithEM}($\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$) };

--- a/Book/Chapter3/Figures/figure-IBM-model-iteration-process-diagram.tex
+++ b/Book/Chapter3/Figures/figure-IBM-model-iteration-process-diagram.tex
@@ -14,8 +14,8 @@
 \node [anchor=west,inner sep=2pt] (eq6) at ([xshift=-2pt]eq5.east) {\footnotesize{$\sum\limits_{j=1}^{m} \delta(s_j,s_u) \sum\limits_{i=0}^{l} \delta(t_i,t_v)$}};
 \node [anchor=west,inner sep=2pt,fill=red!20,minimum height=3em] (eq7) at ([xshift=-2pt,yshift=-0pt]eq6.east) {$\frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}$};
-\node [anchor=south west,inner sep=2pt] (label1) at ([yshift=1em]eq1.north west) {\footnotesize{\textbf{新的参数值}}};
+\node [anchor=south west,inner sep=2pt] (label1) at ([yshift=1em]eq1.north west) {\footnotesize{\sffamily\bfseries{新的参数值}}};
-\node [anchor=south east,inner sep=2pt] (label2) at ([yshift=1em,xshift=-5em]eq7.north east) {\footnotesize{\textbf{旧的参数值}}};
+\node [anchor=south east,inner sep=2pt] (label2) at ([yshift=1em,xshift=-5em]eq7.north east) {\footnotesize{\sffamily\bfseries{旧的参数值}}};
 \draw [<-,thick] (label1.south) .. controls +(south:1em) and +(north:1em) .. ([xshift=-1em]eq1.north);

--- a/Book/Chapter3/Figures/figure-a-more-detailed-explanation-of-formula-3.40.tex
+++ b/Book/Chapter3/Figures/figure-a-more-detailed-explanation-of-formula-3.40.tex
@@ -27,15 +27,15 @@
 }
 {
-\node [anchor=south west,inner sep=2pt] (label1) at (eq4.north west) {\textbf{\scriptsize{翻译概率$\textrm{P}(\mathbf{s}|\mathbf{t})$}}};
+\node [anchor=south west,inner sep=2pt] (label1) at (eq4.north west) {{\scriptsize{翻译概率$\textrm{P}(\mathbf{s}|\mathbf{t})$}}};
 }
 {
-\node [anchor=south west,inner sep=2pt] (label2) at (eq5.north west) {\textbf{\scriptsize{配对的总次数}}};
+\node [anchor=south west,inner sep=2pt] (label2) at (eq5.north west) {{\scriptsize{配对的总次数}}};
-\node [anchor=south west,inner sep=2pt] (label2part2) at ([yshift=-3pt]label2.north west) {\textbf{\scriptsize{$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$中}}};
+\node [anchor=south west,inner sep=2pt] (label2part2) at ([yshift=-3pt]label2.north west) {{\scriptsize{$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$中}}};
 }
 {
-\node [anchor=south west,inner sep=2pt] (label3) at (eq6.north west) {\textbf{\scriptsize{有的$t_i$的相对值}}};
+\node [anchor=south west,inner sep=2pt] (label3) at (eq6.north west) {{\scriptsize{有的$t_i$的相对值}}};
-\node [anchor=south west,inner sep=2pt] (label4) at ([yshift=-3pt]label3.north west) {\textbf{\scriptsize{$f(s_u|t_v)$对于所}}};
+\node [anchor=south west,inner sep=2pt] (label4) at ([yshift=-3pt]label3.north west) {{\scriptsize{$f(s_u|t_v)$对于所}}};
 }
 {
@@ -44,9 +44,9 @@
 }
 {
-\draw[decorate,thick,decoration={brace,amplitude=5pt,mirror}] ([yshift=-0.2em]eq5.south west) -- ([yshift=-0.2em]eq6.south east) node [pos=0.4,below,xshift=-0.0em,yshift=-0.3em] (expcount1) {\footnotesize{\textbf{``$t_v$翻译为$s_u$''这个事件}}};
+\draw[decorate,thick,decoration={brace,amplitude=5pt,mirror}] ([yshift=-0.2em]eq5.south west) -- ([yshift=-0.2em]eq6.south east) node [pos=0.4,below,xshift=-0.0em,yshift=-0.3em] (expcount1) {\footnotesize{{``$t_v$翻译为$s_u$''这个事件}}};
-\node [anchor=north west] (expcount2) at ([yshift=0.5em]expcount1.south west) {\footnotesize{\textbf{出现次数的期望的估计}}};
+\node [anchor=north west] (expcount2) at ([yshift=0.5em]expcount1.south west) {\footnotesize{{出现次数的期望的估计}}};
-\node [anchor=north west] (expcount3) at ([yshift=0.5em]expcount2.south west) {\footnotesize{\textbf{称之为期望频次expected count}}};
+\node [anchor=north west] (expcount3) at ([yshift=0.5em]expcount2.south west) {\footnotesize{{称之为期望频次}}（expected count）};
 }
 \end{tikzpicture}

--- a/Book/Chapter3/Figures/figure-calculation-formula&iterative-process-of-function.tex
+++ b/Book/Chapter3/Figures/figure-calculation-formula&iterative-process-of-function.tex
@@ -47,7 +47,7 @@
    {
    \draw [->,thick] ([yshift=1em]flabel.east) -- ([yshift=1em]clabel.west);
-    \draw [<-,thick] ([yshift=-1em]flabel.east) -- ([yshift=-1em]clabel.west) node [pos=0.5,above,yshift=0.3em] {\footnotesize{\textbf{反复执行}}};
+    \draw [<-,thick] ([yshift=-1em]flabel.east) -- ([yshift=-1em]clabel.west) node [pos=0.5,above,yshift=0.3em] {\footnotesize{{反复执行}}};
    }
    \end{tikzpicture}

--- a/Book/Chapter3/Figures/figure-correspondence-between-IBM-model&formula-1.13.tex
+++ b/Book/Chapter3/Figures/figure-correspondence-between-IBM-model&formula-1.13.tex
@@ -10,11 +10,10 @@
    \node [anchor=west,inner sep=1pt] (e4) at (e3.east) {$\times$};
    \node [anchor=west,inner sep=3pt,fill=blue!20] (e5) at (e4.east) {$\textrm{P}_{lm}(\mathbf{t})$};
    \node [anchor=north west,inner sep=1pt] (n1) at ([xshift=7.0em,yshift=-0.5em]e1.south west) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
-    \node [anchor=north] (n1part2) at ([yshift=0.3em]n1.south) {\scriptsize{\textbf{翻译模型}}};
+    \node [anchor=north] (n1part2) at ([yshift=0.3em]n1.south) {\scriptsize{{翻译模型}}};
    \node [anchor=west,inner sep=1pt] (n2) at ([xshift=4.0em]n1.east) {$\textrm{P}(\mathbf{t})$};
-    \node [anchor=north] (n2part2) at ([yshift=0.3em]n2.south) {\scriptsize{\textbf{语言模型}}};
+    \node [anchor=north] (n2part2) at ([yshift=0.3em]n2.south) {\scriptsize{{语言模型}}};
-    %\draw [->,thick] (e3.south) .. controls +(south:1em) and +(north:1em) .. (n1.north);
-    %\draw [->,thick] (e5.south) .. controls +(south:1em) and +(70:1em) .. (n2.north);
    \end{tikzpicture}

--- a/Book/Chapter3/Figures/figure-different-translation-result-in-different-score-IBM1.tex
+++ b/Book/Chapter3/Figures/figure-different-translation-result-in-different-score-IBM1.tex
@@ -25,13 +25,13 @@
 \node [anchor=north] (score11) at ([yshift=-2.0em]s1.south) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
 \node [anchor=north] (score12) at ([yshift=-2.0em]s2.south) {$\textrm{P}(\mathbf{s}|\mathbf{t}')$};
 \node [anchor=west] (comp1) at ([xshift=2.3em]score11.east) {\large{$\mathbf{=}$}};
-\node [anchor=east] (label1) at ([xshift=-1em,yshift=0.1em]score11.west) {\textbf{IBM模型1:}};
+\node [anchor=east] (label1) at ([xshift=-1em,yshift=0.1em]score11.west) {{IBM模型1:}};
 {
 \node [anchor=north] (score21) at ([yshift=0.2em]score11.south) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
 \node [anchor=north] (score22) at ([yshift=0.2em]score12.south) {$\textrm{P}(\mathbf{s}|\mathbf{t}')$};
 \node [anchor=west] (comp2) at ([xshift=2.3em]score21.east) {\large{$\mathbf{>}$}};
-\node [anchor=east] (label2) at ([xshift=-1em,yshift=0.1em]score21.west) {\textbf{理想:}};
+\node [anchor=east] (label2) at ([xshift=-1em,yshift=0.1em]score21.west) {{理想:}};
 }
 \end{tikzpicture}

--- a/Book/Chapter3/Figures/figure-greedy-MT-decoding-pseudo-code.tex
+++ b/Book/Chapter3/Figures/figure-greedy-MT-decoding-pseudo-code.tex
@@ -98,7 +98,7 @@
 %% remark 4
 \begin{scope}
 {
-\node [anchor=north west,inner sep=2pt,align=left] (remark4) at ([xshift=0.43em,yshift=-0.8em]remark3.south west) {\textsc{Join}($a,b$) 返回\\$a$ 和$b$ 的所有组合 };
+\node [anchor=north west,inner sep=2pt,align=left] (remark4) at ([xshift=0.31em,yshift=-0.8em]remark3.south west) {\textsc{Join}($a,b$) 返回\\$a$ 和$b$ 的所有组合 };
 {\scriptsize
 \node [anchor=north west,inner sep=1pt,align=center,draw] (a1) at ([yshift=-0.5em]remark4.north east) {a1\\a2};

--- a/Book/Chapter3/Figures/figure-human-translation.tex
+++ b/Book/Chapter3/Figures/figure-human-translation.tex
@@ -5,12 +5,12 @@
 \begin{tikzpicture}
 \begin{scope}
-\node [anchor=west] (s1) at (0,0) {\textbf{我}};
+\node [anchor=west] (s1) at (0,0) {{我}};
-\node [anchor=west] (s2) at ([xshift=2em]s1.east) {\textbf{对}};
+\node [anchor=west] (s2) at ([xshift=2em]s1.east) {{对}};
-\node [anchor=west] (s3) at ([xshift=2em]s2.east) {\textbf{你}};
+\node [anchor=west] (s3) at ([xshift=2em]s2.east) {{你}};
-\node [anchor=west] (s4) at ([xshift=2em]s3.east) {\textbf{表示ʾ}};
+\node [anchor=west] (s4) at ([xshift=2em]s3.east) {{表示}};
-\node [anchor=west] (s5) at ([xshift=2em]s4.east) {\textbf{满意}};
+\node [anchor=west] (s5) at ([xshift=2em]s4.east) {{满意}};
-\node [anchor=south west] (sentlabel) at ([yshift=-0.5em]s1.north west) {\scriptsize{\textbf{\color{red}{待翻译句子(已经分词):}}}};
+\node [anchor=south west] (sentlabel) at ([yshift=-0.5em]s1.north west) {\scriptsize{\sffamily\bfseries{\color{red}{待翻译句子(已经分词):}}}};
 {
 \draw [->,very thick,ublue] (s1.south) -- ([yshift=-0.7em]s1.south);
@@ -75,7 +75,7 @@
 \draw [->,thick] ([yshift=-0.1em,xshift=0.2em]t31.south west) ..controls +(south:3em) and +(north:3em).. ([yshift=0.1em,xshift=0.2em]ft14.north west);
 \draw [->,thick] ([yshift=0.1em]t52.south west) ..controls +(250:4em) and +(north:4em).. ([yshift=0.1em]ft12.north);
-\node [anchor=east,inner sep=1pt] (nulltranslabel) at (t42.south west) {\scriptsize{\textbf{翻空}}};
+\node [anchor=east,inner sep=1pt] (nulltranslabel) at (t42.south west) {\scriptsize{{翻空}}};
 \draw [->,thick] ([yshift=0.1em]t41.south west) ..controls +(250:1em) and +(north:1em).. (nulltranslabel.north);
 }
 }
@@ -86,12 +86,12 @@
 \node [anchor=north west] (label1) at (ft11.south west) {\small{选择最佳单词翻译，调整词序，得到完美的结果}};
 }
 {
-\draw[decorate,thick,decoration={brace,amplitude=5pt,mirror}] ([yshift=8em,xshift=-0.7em]t13.south west) -- ([xshift=-0.7em]t13.south west) node [pos=0.5,left,xshift=-0.5em,yshift=0.5em] (label2) {\footnotesize{\textbf{学习到的}}};
+\draw[decorate,thick,decoration={brace,amplitude=5pt,mirror}] ([yshift=8em,xshift=-0.7em]t13.south west) -- ([xshift=-0.7em]t13.south west) node [pos=0.5,left,xshift=-0.5em,yshift=0.5em] (label2) {\footnotesize{{学习到的}}};
-\node [anchor=north west] (label2part2) at ([yshift=0.3em]label2.south west) {\footnotesize{\textbf{单词翻译}}};
+\node [anchor=north west] (label2part2) at ([yshift=0.3em]label2.south west) {\footnotesize{{单词翻译}}};
 }
 {
-\draw[decorate,thick,decoration={brace,amplitude=5pt,mirror}] ([yshift=-0.2em,xshift=-0.7em]t13.south west) -- ([yshift=-5em,xshift=-0.7em]t13.south west) node [pos=0.5,left,xshift=-0.5em,yshift=0.5em] (label3) {\footnotesize{\textbf{运用知识}}};
+\draw[decorate,thick,decoration={brace,amplitude=5pt,mirror}] ([yshift=-0.2em,xshift=-0.7em]t13.south west) -- ([yshift=-5em,xshift=-0.7em]t13.south west) node [pos=0.5,left,xshift=-0.5em,yshift=0.5em] (label3) {\footnotesize{{运用知识}}};
-\node [anchor=north west] (label3part2) at ([yshift=0.3em]label3.south west) {\footnotesize{\textbf{生成译文}}};
+\node [anchor=north west] (label3part2) at ([yshift=0.3em]label3.south west) {\footnotesize{{生成译文}}};
 }
 \end{scope}

--- a/Book/Chapter3/Figures/figure-probability_translation_process.tex
+++ b/Book/Chapter3/Figures/figure-probability_translation_process.tex
@@ -4,10 +4,10 @@
 {
 {
-\node [anchor=north west] (st) at (0,0) {\textbf{s}};
+\node [anchor=north west] (st) at (0,0) {$\mathbf{s}$};
-\node [anchor=north] (taut) at ([yshift=-3em]st.south) {\textbf{$\tau$}};
+\node [anchor=north] (taut) at ([yshift=-3em]st.south) {\sffamily\bfseries{$\tau$}};
-\node [anchor=north] (phit) at ([yshift=-3em]taut.south) {\textbf{$\phi$}};
+\node [anchor=north] (phit) at ([yshift=-3em]taut.south) {\sffamily\bfseries{$\phi$}};
-\node [anchor=north] (tt) at ([yshift=-3em]phit.south) {\textbf{t}};
+\node [anchor=north] (tt) at ([yshift=-3em]phit.south) {$\mathbf{t}$};
 }
 {\scriptsize
 \node [anchor=west,minimum height=2.5em,minimum width=5.5em] (sf1) at ([xshift=1em]st.east) {};

--- a/Book/Chapter3/Figures/figure-process-of-machine-translation.tex
+++ b/Book/Chapter3/Figures/figure-process-of-machine-translation.tex
@@ -2,13 +2,13 @@
 \begin{scope}
-\node [anchor=west] (s1) at (0,0) {\textbf{我}};
+\node [anchor=west] (s1) at (0,0) {{我}};
-\node [anchor=west] (s2) at ([xshift=2em]s1.east) {\textbf{对}};
+\node [anchor=west] (s2) at ([xshift=2em]s1.east) {{对}};
-\node [anchor=west] (s3) at ([xshift=2em]s2.east) {\textbf{你}};
+\node [anchor=west] (s3) at ([xshift=2em]s2.east) {{你}};
-\node [anchor=west] (s4) at ([xshift=2em]s3.east) {\textbf{表示}};
+\node [anchor=west] (s4) at ([xshift=2em]s3.east) {{表示}};
-\node [anchor=west] (s5) at ([xshift=2em]s4.east) {\textbf{满意}};
+\node [anchor=west] (s5) at ([xshift=2em]s4.east) {{满意}};
-\node [anchor=south west] (sentlabel) at ([yshift=-0.5em]s1.north west) {\scriptsize{\textbf{\color{red}{待翻译句子(已经分词):}}}};
+\node [anchor=south west] (sentlabel) at ([yshift=-0.5em]s1.north west) {\scriptsize{{\color{red}{待翻译句子(已经分词):}}}};
 \draw [->,very thick,ublue] (s1.south) -- ([yshift=-0.7em]s1.south);
 \draw [->,very thick,ublue] (s2.south) -- ([yshift=-0.7em]s2.south);
@@ -150,8 +150,8 @@
 }
 {
-\node [anchor=west,inner sep=2pt,minimum height=1.5em,minimum width=2.5em] (ft42) at ([yshift=-2em]ft32.west) {\scriptsize{\textbf{所有翻译单元都是概率化的}}};
+\node [anchor=west,inner sep=2pt,minimum height=1.5em,minimum width=2.5em] (ft42) at ([yshift=-2em]ft32.west) {\scriptsize{{所有翻译单元都是概率化的}}};
-\node [anchor=west,inner sep=1pt,fill=black] (ft43) at (ft42.east) {{\color{white} \tiny{\textbf{P=概率}}}};
+\node [anchor=west,inner sep=1pt,fill=black] (ft43) at (ft42.east) {{\color{white} \tiny{{P=概率}}}};
 }
 }
 \end{scope}
@@ -181,17 +181,17 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{scope}
 {
-\draw[decorate,thick,decoration={brace,amplitude=5pt}] ([yshift=8em,xshift=2.0em]t53.south east) -- ([xshift=2.0em]t53.south east) node [pos=0.5,right,xshift=0.5em,yshift=2.0em] (label2) {\footnotesize{\textbf{从双语数}}};
+\draw[decorate,thick,decoration={brace,amplitude=5pt}] ([yshift=8em,xshift=2.0em]t53.south east) -- ([xshift=2.0em]t53.south east) node [pos=0.5,right,xshift=0.5em,yshift=2.0em] (label2) {\footnotesize{{从双语数}}};
-\node [anchor=north west] (label2part2) at ([yshift=0.3em]label2.south west) {\footnotesize{\textbf{据中自动}}};
+\node [anchor=north west] (label2part2) at ([yshift=0.3em]label2.south west) {\footnotesize{{据中自动}}};
-\node [anchor=north west] (label2part3) at ([yshift=0.3em]label2part2.south west) {\footnotesize{\textbf{学习词典}}};
+\node [anchor=north west] (label2part3) at ([yshift=0.3em]label2part2.south west) {\footnotesize{{学习词典}}};
-\node [anchor=north west] (label2part4) at ([yshift=0.3em]label2part3.south west) {\footnotesize{\textbf{(训练)}}};
+\node [anchor=north west] (label2part4) at ([yshift=0.3em]label2part3.south west) {\footnotesize{{(训练)}}};
 }
 {
-\draw[decorate,thick,decoration={brace,amplitude=5pt}] ([yshift=-1.0em,xshift=6.2em]t53.south west) -- ([yshift=-10.5em,xshift=6.2em]t53.south west) node [pos=0.5,right,xshift=0.5em,yshift=2.0em] (label3) {\footnotesize{\textbf{利用概率}}};
+\draw[decorate,thick,decoration={brace,amplitude=5pt}] ([yshift=-1.0em,xshift=6.2em]t53.south west) -- ([yshift=-10.5em,xshift=6.2em]t53.south west) node [pos=0.5,right,xshift=0.5em,yshift=2.0em] (label3) {\footnotesize{{利用概率}}};
-\node [anchor=north west] (label3part2) at ([yshift=0.3em]label3.south west) {\footnotesize{\textbf{化的词典}}};
+\node [anchor=north west] (label3part2) at ([yshift=0.3em]label3.south west) {\footnotesize{{化的词典}}};
-\node [anchor=north west] (label3part3) at ([yshift=0.3em]label3part2.south west) {\footnotesize{\textbf{进行翻译}}};
+\node [anchor=north west] (label3part3) at ([yshift=0.3em]label3part2.south west) {\footnotesize{{进行翻译}}};
-\node [anchor=north west] (label3part4) at ([yshift=0.3em]label3part3.south west) {\footnotesize{\textbf{(解码)}}};
+\node [anchor=north west] (label3part4) at ([yshift=0.3em]label3part3.south west) {\footnotesize{{(解码)}}};
 }
 \end{scope}
@@ -202,13 +202,13 @@
 \node [anchor=west] (score1) at ([xshift=1.5em]ft14.east) {\footnotesize{P=0.042}};
 \node [anchor=west] (score2) at ([xshift=1.5em]ft24.east) {\footnotesize{P=0.006}};
 \node [anchor=west] (score3) at ([xshift=1.5em]ft34.east) {\footnotesize{P=0.003}};
-\node [anchor=south] (scorelabel) at (score1.north) {\scriptsize{\textbf{\color{red}{都赋予一个模型得分}}}};
+\node [anchor=south] (scorelabel) at (score1.north) {\scriptsize{{\color{red}{都赋予一个模型得分}}}};
-\node [anchor=south] (scorelabel2) at ([yshift=-0.5em]scorelabel.north) {\scriptsize{\textbf{\color{red}{系统给每个译文}}}};
+\node [anchor=south] (scorelabel2) at ([yshift=-0.5em]scorelabel.north) {\scriptsize{{\color{red}{系统给每个译文}}}};
 }
 {
-\node [anchor=north] (scorelabel2) at (score3.south) {\scriptsize{\textbf{选择得分}}};
+\node [anchor=north] (scorelabel2) at (score3.south) {\scriptsize{{选择得分}}};
-\node [anchor=north west] (scorelabel2part2) at ([xshift=-0.5em,yshift=0.5em]scorelabel2.south west) {\scriptsize{\textbf{最高的译文}}};
+\node [anchor=north west] (scorelabel2part2) at ([xshift=-0.5em,yshift=0.5em]scorelabel2.south west) {\scriptsize{{最高的译文}}};
-\node [anchor=center,draw=ublue,circle,thick,fill=white,inner sep=1pt,circular drop shadow={shadow xshift=0.05em,shadow yshift=-0.05em}] (head1) at ([xshift=0.3em]score1.east) {\scriptsize{{\color{ugreen} \textbf{ok}}}};
+\node [anchor=center,draw=ublue,circle,thick,fill=white,inner sep=1pt,circular drop shadow={shadow xshift=0.05em,shadow yshift=-0.05em}] (head1) at ([xshift=0.3em]score1.east) {\scriptsize{{\color{ugreen} {ok}}}};
 }
 \end{scope}

--- a/Book/Chapter3/Figures/figure-processes-SMT.tex
+++ b/Book/Chapter3/Figures/figure-processes-SMT.tex
@@ -9,7 +9,7 @@
 \node [anchor=north west,inner sep=1pt] (entry2) at ([yshift=0.1em]entry1.south west) {\tiny{\textbf{2:} 小心 ！$\leftrightarrow$ Look out !}};
 \node [anchor=north west,inner sep=1pt] (entry3) at ([yshift=0.1em]entry2.south west) {\tiny{\textbf{3:} 你 是 谁 $\leftrightarrow$ Who are you}};
 \node [anchor=north west,inner sep=2pt] (entry4) at ([yshift=0.1em]entry3.south west) {...};
-\node [anchor=south west] (corpuslabel) at (entry1.north west) {{\color{ublue} \textbf{双语平行数据}}};
+\node [anchor=south west] (corpuslabel) at (entry1.north west) {{\color{ublue} \sffamily\bfseries{双语平行数据}}};
 \begin{pgfonlayer}{background}
 \node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow,minimum height=1.6cm] [fit = (entry1) (entry2) (entry3) (entry4) (corpuslabel)] (corpus) {};
@@ -17,7 +17,7 @@
 }
 \node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
-\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
+\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \sffamily\bfseries{翻译模型}}}};
 \begin{pgfonlayer}{background}
 \node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow,minimum height=1.6cm] [fit = (P) (modellabel)] (model) {};
@@ -29,8 +29,8 @@
 \draw [->,very thick,ublue] ([xshift=0.4em]model.east) -- ([xshift=3.4em]model.east)  node [inner sep=0pt,pos=0.5, above,yshift=0.3em] (decodingarrow) {\color{red}{\scriptsize{穷举\&计算}}};
 {\scriptsize
-\node [anchor=north west,inner sep=2pt] (sentlabel) at ([xshift=5.5em,yshift=-0.9em]model.north east) {{\color{ublue} \textbf{机器翻译引擎}}};
+\node [anchor=north west,inner sep=2pt] (sentlabel) at ([xshift=5.5em,yshift=-0.9em]model.north east) {{\color{ublue} \sffamily\bfseries{机器翻译引擎}}};
-\node [anchor=north west] (sent) at ([yshift=-0.5em]sentlabel.south west) {\textbf{对任意句子进行翻译}};
+\node [anchor=north west] (sent) at ([yshift=-0.5em]sentlabel.south west) {{对任意句子进行翻译}};
 \node [anchor=north west] (sentpart2) at ([yshift=0.3em]sent.south west) {\textbf{\quad}};
 }
 }

--- a/Book/Chapter3/Figures/figure-translation-pipeline.tex
+++ b/Book/Chapter3/Figures/figure-translation-pipeline.tex
@@ -46,9 +46,9 @@
 \draw [->,thick] ([yshift=-0.1em]t5.south) -- ([yshift=0.1em]ft3.north);
 }
 {
-\node [anchor=north west] (label1) at ([xshift=0.6em,yshift=0.0em]sent-1.south east) {{\scriptsize \textbf{分析}}};
+\node [anchor=north west] (label1) at ([xshift=0.6em,yshift=0.0em]sent-1.south east) {{分析}};
-\node [anchor=north west] (label2) at ([yshift=-1.8em]label1.south west) {{\scriptsize \textbf{转换}}};
+\node [anchor=north west] (label2) at ([yshift=-1.8em]label1.south west) {{转换}};
-\node [anchor=north west] (label3) at ([yshift=-1.3em]label2.south west) {{\scriptsize \textbf{生成}}};
+\node [anchor=north west] (label3) at ([yshift=-1.3em]label2.south west) {{生成}};
 }
 {\scriptsize
 		\begin{scope}

--- a/Book/Chapter3/Figures/figure-zh-en-sentence-alignment.tex
+++ b/Book/Chapter3/Figures/figure-zh-en-sentence-alignment.tex
@@ -15,10 +15,10 @@
 \begin{scope}[yshift=-3.0em]
 \node [anchor=west] (t1) at (0.35em,0) {\footnotesize{$t_1$}:I};
-\node [anchor=west] (t2) at ([xshift=0.3em,yshift=-0.1em]t1.east) {\footnotesize{$t_2$}:am};
+\node [anchor=west] (t2) at ([xshift=1.0em,yshift=0.0em]t1.east) {\footnotesize{$t_2$}:am};
-\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {\footnotesize{$t_3$}:satisfied};
+\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.0em]t2.east) {\footnotesize{$t_3$}:satisfied};
 \node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {\footnotesize{$t_4$}:with};
-\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {\footnotesize{$t_5$}:you};
+\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.0em]t4.east) {\footnotesize{$t_5$}:you};
 \end{scope}

--- a/Book/Chapter3/Figures/figurerole-of-P-in-sentence-level-translation.tex
+++ b/Book/Chapter3/Figures/figurerole-of-P-in-sentence-level-translation.tex
@@ -9,7 +9,7 @@
 \node [anchor=north west,inner sep=1pt] (entry2) at ([yshift=0.1em]entry1.south west) {\tiny{\textbf{2:} 小心 ！$\leftrightarrow$ Look out !}};
 \node [anchor=north west,inner sep=1pt] (entry3) at ([yshift=0.1em]entry2.south west) {\tiny{\textbf{3:} 你 是 谁 $\leftrightarrow$ Who are you}};
 \node [anchor=north west,inner sep=2pt] (entry4) at ([yshift=0.1em]entry3.south west) {...};
-\node [anchor=south west] (corpuslabel) at (entry1.north west) {{\color{ublue} \textbf{双语平行数据}}};
+\node [anchor=south west] (corpuslabel) at (entry1.north west) {{\color{ublue} \sffamily\bfseries{双语平行数据}}};
 \begin{pgfonlayer}{background}
 \node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow,minimum height=1.6cm] [fit = (entry1) (entry2) (entry3) (entry4) (corpuslabel)] (corpus) {};
@@ -17,7 +17,7 @@
 }
 \node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
-\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
+\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \sffamily\bfseries{翻译模型}}}};
 \begin{pgfonlayer}{background}
 \node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow,minimum height=1.6cm] [fit = (P) (modellabel)] (model) {};
@@ -28,8 +28,8 @@
 \draw [->,very thick,ublue] ([xshift=0.4em]model.east) -- ([xshift=3.4em]model.east)  node [inner sep=0pt,pos=0.5,above,yshift=0.3em] (decodingarrow) {\color{red}{\scriptsize{穷举\&计算}}};
 {\scriptsize
-\node [anchor=north west,inner sep=2pt] (sentlabel) at ([xshift=5.5em,yshift=-0.9em]model.north east) {{\color{ublue} \textbf{机器翻译引擎}}};
+\node [anchor=north west,inner sep=2pt] (sentlabel) at ([xshift=5.5em,yshift=-0.9em]model.north east) {{\color{ublue} \sffamily\bfseries{机器翻译引擎}}};
-\node [anchor=north west] (sent) at ([yshift=-0.5em]sentlabel.south west) {\textbf{对任意句子进行翻译}};
+\node [anchor=north west] (sent) at ([yshift=-0.5em]sentlabel.south west) {{对任意句子进行翻译}};
 \node [anchor=north west] (sentpart2) at ([yshift=0.3em]sent.south west) {\textbf{\quad}};
 }

--- a/Book/Chapter3/Figures/greedy-MT-decoding-process-1.tex
+++ b/Book/Chapter3/Figures/greedy-MT-decoding-process-1.tex
@@ -14,13 +14,13 @@
 {\scriptsize
 %% input words
-\node [anchor=west] (s1) at (0,0) {\textbf{我}};
+\node [anchor=west] (s1) at (0,0) {{我}};
-\node [anchor=west] (s2) at ([xshift=3em]s1.east) {\textbf{对}};
+\node [anchor=west] (s2) at ([xshift=3em]s1.east) {{对}};
-\node [anchor=west] (s3) at ([xshift=3em]s2.east) {\textbf{你}};
+\node [anchor=west] (s3) at ([xshift=3em]s2.east) {{你}};
-\node [anchor=west] (s4) at ([xshift=2.5em]s3.east) {\textbf{表示}};
+\node [anchor=west] (s4) at ([xshift=2.5em]s3.east) {{表示}};
-\node [anchor=west] (s5) at ([xshift=2.5em]s4.east) {\textbf{满意}};
+\node [anchor=west] (s5) at ([xshift=2.5em]s4.east) {{满意}};
-\node [anchor=south west,inner sep=1pt] (sentlabel) at ([yshift=0.3em]s1.north west) {\scriptsize{\textbf{输入: 待翻译句子(已经分词)}}};
+\node [anchor=south west,inner sep=1pt] (sentlabel) at ([yshift=0.3em]s1.north west) {\scriptsize{{输入: 待翻译句子(已经分词)}}};
 {
 \draw [->,very thick,ublue] ([yshift=0.2em]s1.south) -- ([yshift=-0.8em]s1.south) node [pos=0.5,right] (pi1) {\tiny{$\pi$(1)}};
 \draw [->,very thick,ublue] ([yshift=0.2em]s2.south) -- ([yshift=-0.8em]s2.south) node [pos=0.5,right] (pi2) {\tiny{$\pi$(2)}};
@@ -119,13 +119,13 @@
 {\scriptsize
 %% input words
-\node [anchor=west] (s1) at (0,0) {\textbf{我}};
+\node [anchor=west] (s1) at (0,0) {{我}};
-\node [anchor=west] (s2) at ([xshift=3em]s1.east) {\textbf{对}};
+\node [anchor=west] (s2) at ([xshift=3em]s1.east) {{对}};
-\node [anchor=west] (s3) at ([xshift=3em]s2.east) {\textbf{你}};
+\node [anchor=west] (s3) at ([xshift=3em]s2.east) {{你}};
-\node [anchor=west] (s4) at ([xshift=2.5em]s3.east) {\textbf{表示}};
+\node [anchor=west] (s4) at ([xshift=2.5em]s3.east) {{表示}};
-\node [anchor=west] (s5) at ([xshift=2.5em]s4.east) {\textbf{满意}};
+\node [anchor=west] (s5) at ([xshift=2.5em]s4.east) {{满意}};
-\node [anchor=south west,inner sep=1pt] (sentlabel) at ([yshift=0.3em]s1.north west) {\scriptsize{\textbf{输入: 待翻译句子(已经分词)}}};
+\node [anchor=south west,inner sep=1pt] (sentlabel) at ([yshift=0.3em]s1.north west) {\scriptsize{{输入: 待翻译句子(已经分词)}}};
 {
 \draw [->,very thick,ublue] ([yshift=0.2em]s1.south) -- ([yshift=-0.8em]s1.south) node [pos=0.5,right] (pi1) {\tiny{$\pi$(1)}};

--- a/Book/Chapter3/Figures/greedy-MT-decoding-process-3.tex
+++ b/Book/Chapter3/Figures/greedy-MT-decoding-process-3.tex
@@ -9,13 +9,13 @@
 {\scriptsize
 %% input words
-\node [anchor=west] (s1) at (0,0) {\textbf{我}};
+\node [anchor=west] (s1) at (0,0) {{我}};
-\node [anchor=west] (s2) at ([xshift=3em]s1.east) {\textbf{对}};
+\node [anchor=west] (s2) at ([xshift=3em]s1.east) {{对}};
-\node [anchor=west] (s3) at ([xshift=3em]s2.east) {\textbf{你}};
+\node [anchor=west] (s3) at ([xshift=3em]s2.east) {{你}};
-\node [anchor=west] (s4) at ([xshift=2.5em]s3.east) {\textbf{表示}};
+\node [anchor=west] (s4) at ([xshift=2.5em]s3.east) {{表示}};
-\node [anchor=west] (s5) at ([xshift=2.5em]s4.east) {\textbf{满意}};
+\node [anchor=west] (s5) at ([xshift=2.5em]s4.east) {{满意}};
-\node [anchor=south west,inner sep=1pt] (sentlabel) at ([yshift=0.3em]s1.north west) {\scriptsize{\textbf{输入: 待翻译句子(已经分词)}}};
+\node [anchor=south west,inner sep=1pt] (sentlabel) at ([yshift=0.3em]s1.north west) {\scriptsize{{输入: 待翻译句子(已经分词)}}};
 {
 \draw [->,very thick,ublue] ([yshift=0.2em]s1.south) -- ([yshift=-0.8em]s1.south) node [pos=0.5,right] (pi1) {\tiny{$\pi$(1)}};
@@ -174,13 +174,13 @@
 {\scriptsize
 %% input words
-\node [anchor=west] (s1) at (0,0) {\textbf{我}};
+\node [anchor=west] (s1) at (0,0) {{我}};
-\node [anchor=west] (s2) at ([xshift=3em]s1.east) {\textbf{对}};
+\node [anchor=west] (s2) at ([xshift=3em]s1.east) {{对}};
-\node [anchor=west] (s3) at ([xshift=3em]s2.east) {\textbf{你}};
+\node [anchor=west] (s3) at ([xshift=3em]s2.east) {{你}};
-\node [anchor=west] (s4) at ([xshift=2.5em]s3.east) {\textbf{表示}};
+\node [anchor=west] (s4) at ([xshift=2.5em]s3.east) {{表示}};
-\node [anchor=west] (s5) at ([xshift=2.5em]s4.east) {\textbf{满意}};
+\node [anchor=west] (s5) at ([xshift=2.5em]s4.east) {{满意}};
-\node [anchor=south west,inner sep=1pt] (sentlabel) at ([yshift=0.3em]s1.north west) {\scriptsize{\textbf{输入: 待翻译句子(已经分词)}}};
+\node [anchor=south west,inner sep=1pt] (sentlabel) at ([yshift=0.3em]s1.north west) {\scriptsize{{输入: 待翻译句子(已经分词)}}};
 {
 \draw [->,very thick,ublue] ([yshift=0.2em]s1.south) -- ([yshift=-0.8em]s1.south) node [pos=0.5,right] (pi1) {\tiny{$\pi$(1)}};

--- a/Book/Chapter5/Figures/fig-back-propagation-hid.tex
+++ b/Book/Chapter5/Figures/fig-back-propagation-hid.tex
@@ -8,7 +8,7 @@
 \node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
 \draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
 \draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
-\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f(\textbf{s}^{k})$}};
+\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
 \draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);
 {

--- a/Book/Chapter5/Figures/fig-back-propagation.tex
+++ b/Book/Chapter5/Figures/fig-back-propagation.tex
@@ -12,18 +12,18 @@
 \node [anchor=east] (input) at ([xshift=-1em]layer01.west){\scriptsize{输入}};
 \node [anchor=west] (output) at ([xshift=1em]layer05.east){\scriptsize{输出}};
 {
-\draw [<-,very thick,red] ([xshift=-1em,yshift=-0.3em]layer01.west) -- ([xshift=-0.1em,yshift=-0.3em]layer01.west)node [pos=0.5,above] {\tiny{⑦}};
+\draw [<-,very thick,red] ([xshift=-1em,yshift=-0.3em]layer01.west) -- ([xshift=-0.1em,yshift=-0.3em]layer01.west)node [pos=0.5,above] {\small{\ding{178}}};
-\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.8em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.8em]layer02.north west)node [pos=0.5,above] {\tiny{⑥}};
+\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.8em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.8em]layer02.north west)node [pos=0.5,above] {\small{\ding{177}}};
-\draw [<-,very thick,red] ([xshift=0.1em,yshift=0.2em]layer01.south east) -- ([xshift=-0.1em,yshift=0.2em]layer03.south west)node [pos=0.5,below] {\tiny{⑤}};
+\draw [<-,very thick,red] ([xshift=0.1em,yshift=0.2em]layer01.south east) -- ([xshift=-0.1em,yshift=0.2em]layer03.south west)node [pos=0.5,below] {\small{\ding{176}}};
-\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.8em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.8em]layer04.north west)node [pos=0.5,above] {\tiny{④}};
+\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.8em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.8em]layer04.north west)node [pos=0.5,above] {\small{\ding{175}}};
-\draw [<-,very thick,red] ([xshift=0.1em,yshift=0.2em]layer03.south east) -- ([xshift=-0.1em,yshift=0.2em]layer04.south west)node [pos=0.5,below] {\tiny{③}};
+\draw [<-,very thick,red] ([xshift=0.1em,yshift=0.2em]layer03.south east) -- ([xshift=-0.1em,yshift=0.2em]layer04.south west)node [pos=0.5,below] {\small{\ding{174}}};
-\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.3em]layer04.east) -- ([xshift=-0.1em,yshift=-0.3em]layer05.west)node [pos=0.5,above] {\tiny{②}};
+\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.3em]layer04.east) -- ([xshift=-0.1em,yshift=-0.3em]layer05.west)node [pos=0.5,above] {\small{\ding{173}}};
-\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.3em]layer05.east) -- ([xshift=1.0em,yshift=-0.3em]layer05.east)node [pos=0.5,above] {\tiny{①}};
+\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.3em]layer05.east) -- ([xshift=1.0em,yshift=-0.3em]layer05.east)node [pos=0.5,above] {\small{\ding{172}}};
 }
 {
-\draw [<-,thin] ([xshift=0.3em,yshift=-0.7em]layer04.east) .. controls +(-35:1) and +(145:1) .. ([xshift=-2em,yshift=-0.3em]layer05.south west) node [pos=1,below] {\scriptsize{反向：$h_{i}$ 处的梯度$\frac{\partial L}{\partial h_i}$}};
+\draw [<-,thin] ([xshift=0.3em,yshift=-0.7em]layer04.east) .. controls +(-35:1) and +(145:1) .. ([xshift=-2em,yshift=-0.9em]layer05.south west) node [pos=1,below] {\scriptsize{反向：$h_{i}$ 处的梯度$\frac{\partial L}{\partial h_i}$}};
 }
 \end{scope}

--- a/Book/Chapter5/Figures/fig-forward-propagation-hid.tex
+++ b/Book/Chapter5/Figures/fig-forward-propagation-hid.tex
@@ -8,7 +8,7 @@
 \node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
 \draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
 \draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
-\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f(\textbf{s}^{k})$}};
+\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
 \draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);

--- a/Book/Chapter5/Figures/fig-forward-propagation.tex
+++ b/Book/Chapter5/Figures/fig-forward-propagation.tex
@@ -14,19 +14,19 @@
 {
-\draw [->,very thick,ublue] ([xshift=-1em]layer01.west) -- ([xshift=-0.1em]layer01.west)node [pos=0.5,above] {\tiny{①}};;
+\draw [->,very thick,ublue] ([xshift=-1em]layer01.west) -- ([xshift=-0.1em]layer01.west)node [pos=0.5,above] {\small{\ding{172}}};
 }
 {
-\draw [->,very thick,ublue] ([xshift=0.1em,yshift=-0.5em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer02.north west)node [pos=0.5,above] {\tiny{②}};
+\draw [->,very thick,ublue] ([xshift=0.1em,yshift=-0.5em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer02.north west)node [pos=0.5,above] {\small{\ding{173}}};
 }
 {
-\draw [->,very thick,ublue] ([xshift=0.1em,yshift=0.5em]layer01.south east) -- ([xshift=-0.1em,yshift=0.5em]layer03.south west)node [pos=0.5,below] {\tiny{③}};
+\draw [->,very thick,ublue] ([xshift=0.1em,yshift=0.5em]layer01.south east) -- ([xshift=-0.1em,yshift=0.5em]layer03.south west)node [pos=0.5,below] {\small{\ding{174}}};
 }
 {
-\draw [->,very thick,ublue] ([xshift=0.1em,yshift=-0.5em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer04.north west)node [pos=0.5,above] {\tiny{④}};
+\draw [->,very thick,ublue] ([xshift=0.1em,yshift=-0.5em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer04.north west)node [pos=0.5,above] {\small{\ding{175}}};
-\draw [->,very thick,ublue] ([xshift=0.1em,yshift=0.5em]layer03.south east) -- ([xshift=-0.1em,yshift=0.5em]layer04.south west)node [pos=0.5,below] {\tiny{⑤}};
+\draw [->,very thick,ublue] ([xshift=0.1em,yshift=0.5em]layer03.south east) -- ([xshift=-0.1em,yshift=0.5em]layer04.south west)node [pos=0.5,below] {\small{\ding{176}}};
-\draw [->,very thick,ublue] ([xshift=0.1em]layer04.east) -- ([xshift=-0.1em]layer05.west)node [pos=0.5,above] {\tiny{⑥}};
+\draw [->,very thick,ublue] ([xshift=0.1em]layer04.east) -- ([xshift=-0.1em]layer05.west)node [pos=0.5,above] {\small{\ding{177}}};
-\draw [->,very thick,ublue] ([xshift=0.1em]layer05.east) -- ([xshift=1.0em]layer05.east)node [pos=0.5,above] {\tiny{⑦}};
+\draw [->,very thick,ublue] ([xshift=0.1em]layer05.east) -- ([xshift=1.0em]layer05.east)node [pos=0.5,above] {\small{\ding{178}}};
 }

--- a/Book/Chapter5/Figures/fig-tensor-mul.tex
+++ b/Book/Chapter5/Figures/fig-tensor-mul.tex
@@ -11,7 +11,7 @@
    \addtocounter{mycount1}{1};
  }
 }
-\node [anchor=south west] (label11) at (-1.3,0.9) {\footnotesize{①}};
+\node [anchor=south west] (label11) at (-1.3,0.9) {\footnotesize{\ding{172}}};
 \end{scope}
 \begin{scope}[yshift=6em,xshift=0.5em]
@@ -24,7 +24,7 @@
    \addtocounter{mycount2}{1};
  }
 }
-\node [anchor=south west] (label12) at (-1.3,0.9) {\footnotesize{②}};
+\node [anchor=south west] (label12) at (-1.3,0.9) {\footnotesize{\ding{173}}};
 \end{scope}
 \begin{scope}[yshift=5.5em,xshift=0em]
@@ -37,7 +37,7 @@
    \addtocounter{mycount3}{1};
  }
 }
-\node [anchor=south west] (label13) at (-1.3,0.9) {\footnotesize{③}};
+\node [anchor=south west] (label13) at (-1.3,0.9) {\footnotesize{\ding{174}}};
 \end{scope}
 \begin{scope}[yshift=5em,xshift=-0.5em]
@@ -51,7 +51,7 @@
  }
 \node [anchor=north] (xlabel) at (0,-1.2) {$\textbf{x}$};
 }
-\node [anchor=south west] (label14) at (-1.3,0.9) {\footnotesize{④}};
+\node [anchor=south west] (label14) at (-1.3,0.9) {\footnotesize{\ding{175}}};
 \end{scope}
 \begin{scope}[yshift=5em,xshift=1.5in]
@@ -85,7 +85,7 @@
  }
 }
 }
-\node [anchor=south west] (label21) at (-0.8,0.9) {\footnotesize{①}};
+\node [anchor=south west] (label21) at (-0.8,0.9) {\footnotesize{\ding{172}}};
 \end{scope}
 \begin{scope}[yshift=6em,xshift=0.5em+3in]
@@ -99,7 +99,7 @@
  }
 }
 }
-\node [anchor=south west] (label22) at (-0.8,0.9) {\footnotesize{②}};
+\node [anchor=south west] (label22) at (-0.8,0.9) {\footnotesize{\ding{173}}};
 \end{scope}
 \begin{scope}[yshift=5.5em,xshift=0em+3in]
@@ -113,7 +113,7 @@
  }
 }
 }
-\node [anchor=south west] (label23) at (-0.8,0.9) {\footnotesize{③}};
+\node [anchor=south west] (label23) at (-0.8,0.9) {\footnotesize{\ding{174}}};
 \end{scope}
 \begin{scope}[yshift=5.0em,xshift=-0.5em+3in]
@@ -127,7 +127,7 @@
  }
 }
 }
-\node [anchor=south west] (label24) at (-0.8,0.9) {\footnotesize{④}};
+\node [anchor=south west] (label24) at (-0.8,0.9) {\footnotesize{\ding{175}}};
 {
 \node [anchor=north] (xlabel) at (0,-1.2) {$\textbf{x} \cdot \textbf{w}$};
 \node [anchor=center] (elabel) at (-0.7in,0) {\Huge{$\textbf{=}$}};

--- a/Book/Chapter5/Figures/fig-weather-forward.tex
+++ b/Book/Chapter5/Figures/fig-weather-forward.tex
@@ -3,7 +3,7 @@
 \node [anchor=west,minimum width=1.5em,minimum height=1.5em] (part1) at (0,0) {\footnotesize{$y$}};
 \node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part1-2) at ([xshift=-1.6em,yshift=-0.3em]part1.south) {\scriptsize {$\rm {shape(1)}$}};
-\node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em] (part2) at ([yshift=-1.5em]part1.south) {\footnotesize {$\rm{Sigmoid}$}};
+\node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em] (part2) at ([yshift=-1.5em]part1.south) {\footnotesize {$\rm{sigmoid}$}};
 \draw [-,thick](part1.south)--(part2.north);
 \node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part2-2) at ([xshift=-1.6em,yshift=-0.3em]part2.south) {\scriptsize {$\rm{shape(1)}$}};
@@ -26,7 +26,7 @@
 \node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part5-4-1) at ([xshift=1.3em,yshift=-0.45em]part5-4.north) {\scriptsize {$\rm{shape(1)}$}};
 %%%%%%%%%%%%%%%%%%%%%%%%%%
 \node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part5-2) at ([xshift=-1.6em,yshift=-0.2em]part5.south) {\scriptsize {$\rm{shape(2)}$}};
-\node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em] (part6) at ([yshift=-1.4em]part5.south) {\footnotesize {$\rm{Tanh}$}};
+\node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em] (part6) at ([yshift=-1.4em]part5.south) {\footnotesize {$\rm{tanh}$}};
 \draw [-,thick]([yshift=0.1em]part5.south)--(part6.north);
 \node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part6-2) at ([xshift=-1.6em,yshift=-0.3em]part6.south) {\scriptsize {$\rm{shape(2)}$}};

--- a/Book/Chapter5/chapter5.tex
+++ b/Book/Chapter5/chapter5.tex
--- a/Book/Chapter6/Chapter6.tex
+++ b/Book/Chapter6/Chapter6.tex
@@ -23,7 +23,7 @@
 \parinterval 不过，有人也意识到了神经机器翻译在表示学习等方面的优势。特别是，以Yoshua Bengio团队为代表的研究力量对包括机器翻译在内的序列到序列问题进行了广泛而深入的研究，注意力机制等新的模型不断被推出。这使得神经机器翻译的翻译品质逐渐体现出优势，甚至超越了当时的统计机器翻译。正当大家在讨论神经机器翻译是否能取代统计机器翻译成为下一代机器翻译的范式的时候，谷歌、百度等企业推出以神经机器翻译技术为内核的在线机器翻译服务。在很多场景下的翻译品质显著超越了当时最好的统计机器翻译系统。这也引发了学术界和产业界对神经机器翻译的讨论。随着关注度的不断升高，神经机器翻译的研究吸引了更多的科研机构和企业的投入，系统翻译品质得到进一步提升。
-\parinterval 在短短5-6年间，神经机器翻译从一个新生的概念已经成长为机器翻译领域的最前沿技术之一，在各种机器翻译评测和应用中呈全面替代统计机器翻译之势。比如，从近几年WMT、CCMT等评测结果来看，神经机器翻译已经处于绝对的统治地位，在各个语种和领域的翻译任务中，成为各参赛系统的标配。此外，从ACL等自然语言处理顶级会议的发表论文看，神经机器翻译是毫无疑问的焦点，在论文数量上成明显的增长趋势，这也体现了学术界对该方法的热情。至今，无论是国外的著名企业，如谷歌、微软、脸书，还是国内的团队，如百度、腾讯、阿里巴巴、有道、搜狗、小牛翻译，都推出了自己研发的神经机器翻译系统，整个研究和产业生态欣欣向荣。图\ref{fig:6-1}展示了包含神经机器翻译在内的机器翻译发展简史。
+\parinterval 在短短5-6年间，神经机器翻译从一个新生的概念已经成长为机器翻译领域的最前沿技术之一，在各种机器翻译评测和应用中呈全面替代统计机器翻译之势。比如，从近几年WMT、CCMT等评测结果来看，神经机器翻译已经处于绝对的统治地位，在各个语种和领域的翻译任务中，成为各参赛系统的标配。此外，从ACL等自然语言处理顶级会议的发表论文看，神经机器翻译是毫无疑问的焦点，在论文数量上呈明显的增长趋势，这也体现了学术界对该方法的热情。至今，无论是国外的著名企业，如谷歌、微软、脸书，还是国内的团队，如百度、腾讯、阿里巴巴、有道、搜狗、小牛翻译，都推出了自己研发的神经机器翻译系统，整个研究和产业生态欣欣向荣。图\ref{fig:6-1}展示了包含神经机器翻译在内的机器翻译发展简史。
 %空一行用来段落换行，noindent取消首行缩进，hspace{}指定缩进距离，1em等于两个英文字符|一个汉字
 %----------------------------------------------
 % 图6.1
@@ -54,7 +54,7 @@
 \parinterval 从广义上讲，神经机器翻译是一种基于人工神经网络的方法，它把翻译过程描述为可以用人工神经网络表示的函数。所有的训练和推断都在这些函数上进行。由于神经机器翻译中的神经网络可以用连续可微函数表示，因此这类方法也可以用基于梯度的方法进行优化，相关技术非常成熟。更为重要的是，在神经网络的设计中，研究者引入了分布式表示（distributed representation）的概念，这也是近些年自然语言处理领域的重要成果之一。传统统计机器翻译仍然把词序列看作离散空间里的由多个特征函数描述的点，类似于$n$-gram语言模型，这类模型对数据稀疏问题非常敏感。此外，人工设计特征也在一定程度上限制了模型对问题的表示能力。神经机器翻译把文字序列表示为实数向量，一方面避免了特征工程繁重的工作，另一方面使得系统可以对文字序列的``表示''进行学习。可以说，神经机器翻译的成功很大程度上源自``表示学习''这种自然语言处理的新范式的出现。在表示学习的基础上，注意力机制、深度神经网络等技术都被应用于神经机器翻译，使其得以进一步发展。
-\parinterval 虽然神经机器翻译中大量的使用了人工神经网络方法，但是它并不是最早在机器翻译中使用人工神经网络的框架。实际上，人工神经网络在机器翻译中应用的历史要远早于现在的神经机器翻译。 在统计机器翻译时代，也有很多利用人工神经网络进行进行机器翻译系统模块的构建\cite{devlin-etal-2014-fast}\cite{liumodel}，比如，Jacob Devlin等人就成功的在统计机器翻译系统中使用了基于神经网络的联合表示模型，取得了令人振奋的结果，这项工作也获得了ACL2014的最佳论文奖（best paper award）。
+\parinterval 虽然神经机器翻译中大量的使用了人工神经网络方法，但是它并不是最早在机器翻译中使用人工神经网络的框架。实际上，人工神经网络在机器翻译中应用的历史要远早于现在的神经机器翻译。 在统计机器翻译时代，也有很多利用人工神经网络进行机器翻译系统模块的构建\cite{devlin-etal-2014-fast}\cite{Schwenk_continuousspace}，比如，Jacob Devlin等人就成功的在统计机器翻译系统中使用了基于神经网络的联合表示模型，取得了令人振奋的结果，这项工作也获得了ACL2014的最佳论文奖（best paper award）。
 \parinterval 不过，以上这些工作大多都是在局部使用人工神经网络和深度学习方法。与之不同的是，神经机器翻译是用人工神经网络完成整个翻译过程的建模，这样做的一个好处是，整个系统可以进行端到端学习，无需引入对任何翻译的隐含结构假设。这种利用端到端学习对机器翻译进行神经网络建模的方式也就成为了现在大家所熟知的神经机器翻译。这里简单列出部分代表性的工作：
@@ -123,7 +123,7 @@
 \label{tab:HTER}
 \begin{tabular}{r|llc}
 %  						& \multicolumn{2}{c}{HTERnoShift}         &      \\
- \textbf{system}                      & \textbf{word} & \textbf{lemma} & \textbf{\%Δ} \\ \hline
+ system                    & word & lemma & \%Δ \\ \hline
 PBSY                    &27.1          & 22.5           & -16.9       \\
 HPB                     & 28.7          & 23.5           & -18.4       \\
 SPB                     & 28.3          & 23.2           & -18.0       \\
@@ -140,7 +140,7 @@ NMT                     & $ 21.7^{\ast}$          & $18.7^{\ast}$           & -1
 \caption{不同机器翻译系统人类评价结果\cite{Hassan2018AchievingHP}}
 \label{tab:Human assessment}
 \begin{tabular}{l | l l}
-	\# 		&\textbf{\begin{tabular}[c]{@{}l@{}}Ave\%\\ （平均原始分数）\end{tabular}}		&\textbf{System} \\ \hline
+	\# 		&\begin{tabular}[c]{@{}l@{}}Ave\%\\ （平均原始分数）\end{tabular}		&System \\ \hline
 	1 		&69.0			&Combo-6 \\
 			&68.5			&Reference-HT \\
 			&68.9			&Combo-5 \\
@@ -161,7 +161,7 @@ NMT                     & $ 21.7^{\ast}$          & $18.7^{\ast}$           & -1
 \caption{WMT14英德数据集上不同神经机器翻译系统的表现\cite{WangLearning}}
 \label{tab:result-of-wmt14}
 \begin{tabular}{ l | l l l}
-   \textbf{模型}         		 &\textbf{作者}	& \textbf{年份}	& \textbf{BLEU} \\ \hline
+   模型         		 &作者	& 年份	& BLEU \\ \hline
   ConvS2S                			&Gehring等 		&2017 			&25.2 \\
   Transformer-Base 			&Vaswani等 		&2017 			&27.3 \\
   Transformer-Big   			&Vaswani等 		&2017 			&28.4 \\
@@ -182,7 +182,7 @@ NMT                     & $ 21.7^{\ast}$          & $18.7^{\ast}$           & -1
 \caption{统计机器翻译 vs 神经机器翻译}
 \label{tab:nmt vs smt}
 \begin{tabular}{ l | l }
-  \rule{0pt}{15pt}  \parbox{12em}{\textbf{统计机器翻译}		}	&\textbf{神经机器翻译}\\ \hline
+  \rule{0pt}{15pt}  \parbox{12em}{统计机器翻译		}	&神经机器翻译\\ \hline
 	  \rule{0pt}{13pt}  基于离散空间的表示模型			&基于连续空间的表示模型 \\
 	  \rule{0pt}{13pt} NLP问题的隐含结构假设			&无隐含结构假设，端到端学习 \\
 	  \rule{0pt}{13pt} 特征工程为主					&无显性特征，但需要设计网络 \\
@@ -315,7 +315,7 @@ NMT                     & $ 21.7^{\ast}$          & $18.7^{\ast}$           & -1
 \caption{不同机器翻译范式中人类的作用}
 \label{tab:what people do in different methods of mt}
 \begin{tabular}{ l | l }
-	\textbf{机器翻译方法}			&\textbf{人类参与方式} \\ \hline
+	机器翻译方法			&人类参与方式 \\ \hline
 	\rule{0pt}{13pt} 基于规则的方法					&设计翻译规则 \\
 	\rule{0pt}{13pt} 传统统计方法					&设计翻译特征 \\
 	\rule{0pt}{13pt} 神经网络方法					&设计网络架构 \\
@@ -337,7 +337,7 @@ NMT                     & $ 21.7^{\ast}$          & $18.7^{\ast}$           & -1
 \caption{2013-2015期间神经机器翻译方面的部分论文}
 \label{tab:papers in 2013-2015}
 \begin{tabular}{l| l p{8cm}}
-\rule{0pt}{16pt} \textbf{时间}   & \textbf{作者}                                                              & \textbf{论文}                                                                      \\ \hline
+\rule{0pt}{16pt} 时间   & 作者                                                              & 论文                                                                      \\ \hline
 \rule{0pt}{0pt} 2013 & \begin{tabular}[c]{@{}l@{}l@{}}\\Kalchbrenner\\ 和Blunsom\end{tabular} & Recurrent Continuous Translation Models                                 \\
 \rule{0pt}{16pt} 2014 & Sutskever等                                                       & Sequence to Sequence Learning with neural networks                      \\
 \rule{0pt}{16pt} 2014 & Bahdanau等                                                       & Neural Machine Translation by Jointly Learning to Align and Translate \\
@@ -869,7 +869,7 @@ $\textrm{a}(\cdot)$可以被看作是目标语表示和源语言表示的一种`
 \caption{引入注意力机制前后译文单词生成公式}
 \label{tab:word-translation-examples}
 \begin{tabular}{ l | l }
-\rule{0pt}{13pt}	\textbf{引入注意力之前}			&\textbf{引入注意力之后} \\ \hline
+\rule{0pt}{13pt}	引入注意力之前			&引入注意力之后 \\ \hline
 \rule{0pt}{16pt}	$\textrm{``have''} = \argmax_{y_1}\mathrm{P}(y_1 | \mathbf{C} , y_0)$		&$\textrm{``have''} = \argmax_{y_1}\mathrm{P}(y_1 | \mathbf{C}_1 , y_0)$	\\
 \rule{0pt}{16pt}	$\textrm{``you''} = \argmax_{y_2}\mathrm{P}(y_2 | \mathbf{s}_1 , y_1)$			&$\textrm{``you''} = \argmax_{y_2}\mathrm{P}(y_2 | \mathbf{s}_1, \mathbf{C}_2 , y_1)$	\\
 \end{tabular}
@@ -979,7 +979,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\hat{
 \caption{ Adam / SGD对比}
 \label{tab:Adam vs SGD}
 \begin{tabular}{l | l  l }
-	 		&\textbf{使用}		&\textbf{性能} \\ \hline
+	 		&使用		&性能 \\ \hline
 \rule{0pt}{13pt}	Adam	&一套配置包打天下	&不算差，但没到极限 \\
 \rule{0pt}{13pt}	SGD	&换一个任务就得调	&效果杠杠的 \\
 \end{tabular}
@@ -1043,7 +1043,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\hat{
 \caption{ 数据并行与模型并行优缺点对比}
 \label{tab:adv and disadv between Data parallel and model parallel }
 \begin{tabular}{l | p{12em}  p{10em} }
-	 		 &\textbf{优点}		&\textbf{缺点} \\ \hline
+	 		 &优点		&缺点 \\ \hline
 \rule{0pt}{13pt}	数据并行 &并行度高，理论上有多大的batch就可以有多少个设备并行计算	&模型不能大于单个设备的极限 \\
 \rule{0pt}{13pt}	模型并行	&可以对很大的模型进行运算	&只能有限并行，比如多少层就多少个设备 \\
 \end{tabular}
@@ -1223,8 +1223,8 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\hat{
 \caption{GNMT与其它翻译模型对比\cite{Wu2016GooglesNM}}
 \label{tab:gnmt vs state-of-the-art models}
 \begin{tabular}{l l l l}
-\multicolumn{1}{l|}{\multirow{2}{*}{\#}} & \multicolumn{2}{c}{\textbf{BLEU}} & \multirow{2}{*}{\textbf{CPU decoding time}} \\
+\multicolumn{1}{l|}{\multirow{2}{*}{\#}} & \multicolumn{2}{c}{BLEU} & \multirow{2}{*}{CPU decoding time} \\
-\multicolumn{1}{l|}{}                    & \textbf{EN-DE}  & \textbf{EN-FR}  &                                             \\ \hline
+\multicolumn{1}{l|}{}                    & EN-DE  & EN-FR  &                                             \\ \hline
 \multicolumn{1}{l|}{PBMT}                & 20.7            & 37.0            & -                                           \\
 \multicolumn{1}{l|}{RNNSearch}           & 16.5            & -               & -                                           \\
 \multicolumn{1}{l|}{LSTM(6 layers)}      & -               & 31.5            & -                                           \\

--- a/Book/Chapter6/Figures/figure-Structure-of-the-network-during-Transformer-training.tex
+++ b/Book/Chapter6/Figures/figure-Structure-of-the-network-during-Transformer-training.tex
@@ -39,8 +39,10 @@
 \node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$}};
 \node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$}};
 \node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\textbf{s}_4$}};
-\node [anchor=south] (dot3) at ([xshift=-0.4em,yshift=-0.7em]s3.south) {\tiny{...}};
+%\node [anchor=south] (dot3) at ([xshift=-0.4em,yshift=-0.7em]s3.south) {\tiny{...}};
 \node [anchor=south] (dot4) at ([xshift=-0.4em,yshift=-0.7em]s4.south) {\tiny{...}};
+\draw [->] ([xshift=-0.6em,yshift=-0.5em]s3.south) .. controls +(north:0) and +(south:0.2) .. ([xshift=-0.3em,yshift=-0.1em]s3.south);
+    \draw [->] ([xshift=-1.5em,yshift=-0.5em]s3.south) .. controls +(north:0) and +(south:0.15) .. ([xshift=-0.6em,yshift=-0.1em]s3.south);
 }
 {

--- a/Book/bibliography.bib
+++ b/Book/bibliography.bib
 % !Mode:: "TeX:UTF-8"
 % !TEX encoding = UTF-8 Unicode
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%% chapter 1------------------------------------------------------
 @article{nagao1984framework,
  title={A framework of a mechanical translation between Japanese and English by analogy principle},
  author={Nagao, Makoto},
@@ -28,13 +31,6 @@
  organization={Association for Computational Linguistics}
 }
-@article{bahdanau2014neural,
-  title={Neural machine translation by jointly learning to align and translate},
-  author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
-  journal={arXiv preprint arXiv:1409.0473},
-  year={2014}
-}
 @inproceedings{sutskever2014sequence,
  title={Sequence to sequence learning with neural networks},
  author={Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V},
@@ -43,6 +39,13 @@
  year={2014}
 }
+@article{bahdanau2014neural,
+  title={Neural machine translation by jointly learning to align and translate},
+  author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
+  journal={arXiv preprint arXiv:1409.0473},
+  year={2014}
+}
 @inproceedings{papineni2002bleu,
  title={BLEU: a method for automatic evaluation of machine translation},
  author={Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
@@ -71,54 +74,236 @@
  year={1993},
  publisher={Springer}
 }
-@article{rabiner1989tutorial,
-  title={A tutorial on hidden Markov models and selected applications in speech recognition},
+@inproceedings{Tong2012NiuTrans,
-  author={Rabiner, Lawrence R},
+  title={NiuTrans: an open source toolkit for phrase-based and syntax-based machine translation},
-  journal={Proceedings of the IEEE},
+  author={Xiao, Tong and Zhu, Jingbo and Zhang, Hao and Li, Qiang},
-  volume={77},
+  booktitle={Proceedings of the ACL 2012 System Demonstrations},
-  number={2},
+  year={2012},
-  pages={257--286},
-  year={1989},
-  publisher={Ieee}
 }
-@article{rabiner1986introduction,
+@article{Koehn2007Moses,
-  title={An introduction to hidden Markov models},
+  title={Moses: Open Source Toolkit for Statistical Machine Translation},
-  author={Rabiner, Lawrence and Juang, B},
+  author={Koehn, Philipp and Hoang, Hieu and Birch, Alexandra and Callison-Burch, Chris and Federico, Marcello and Bertoldi, Nicola and Cowan, Brooke and Shen, Wade and Moran, Christine and Zens, Richard},
-  journal={ieee assp magazine},
+  volume={9},
-  volume={3},
  number={1},
-  pages={4--16},
+  pages={177--180},
-  year={1986},
+  year={2007},
-  publisher={IEEE}
 }
-@article{parsing2009speech,
+@article{Li2010Joshua,
-  title={Speech and language processing},
+  title={Joshua: An Open Source Toolkit for Parsing-based Machine Translation},
-  author={Parsing, Constituency},
+  author={Li, Zhifei and Callison-Burch, Chris and Dyer, Chris and Ganitkevitch, Juri and Khudanpur, Sanjeev and Schwartz, Lane and Thornton, Wren N. G. and Weese, Jonathan and Zaidan, Omar F.},
-  year={2009}
+  pages={135--139},
+  year={2010},
 }
-@article{ney1994structuring,
+@article{zollmann2007the,
-  title={On structuring probabilistic dependences in stochastic language modelling},
+title={The Syntax Augmented MT (SAMT) System at the Shared Task for the 2007 ACL Workshop on Statistical Machine Translation},
-  author={Ney, Hermann and Essen, Ute and Kneser, Reinhard},
+author={Zollmann, Andreas and Venugopal, Ashish and Paulik, Matthias and Vogel, Stephan},
-  journal={Computer Speech \& Language},
+pages={216--219},
-  volume={8},
+year={2007}}
+@inproceedings{dyer2010cdec,
+  title={cdec: A decoder, alignment, and learning framework for finite-state and context-free translation models},
+  author={Dyer, Chris and Weese, Jonathan and Setiawan, Hendra and Lopez, Adam and Ture, Ferhan and Eidelman, Vladimir and Ganitkevitch, Juri and Blunsom, Phil and Resnik, Philip},
+  booktitle={Proceedings of the ACL 2010 System Demonstrations},
+  pages={7--12},
+  year={2010},
+  organization={Association for Computational Linguistics}
+}
+@article{Cer2010Phrasal,
+  title={Phrasal: A Statistical Machine Translation Toolkit for Exploring New Model Features.},
+  author={Cer, Daniel M and Galley, Michel and Jurafsky, Daniel and Manning, Christopher D},
+  year={2010},
+}
+@article{vilar2012jane,
+  title={Jane: an advanced freely available hierarchical machine translation toolkit},
+  author={Vilar, David and Stein, Daniel and Huck, Matthias and Ney, Hermann},
+  journal={Machine Translation},
+  volume={26},
+  number={3},
+  pages={197--216},
+  year={2012},
+  publisher={Springer}
+}
+@article{och2003systematic,
+  title={A systematic comparison of various statistical alignment models},
+  author={Och, Franz Josef and Ney, Hermann},
+  journal={Computational linguistics},
+  volume={29},
  number={1},
-  pages={1--38},
+  pages={19--51},
-  year={1994}
+  year={2003},
+  publisher={MIT Press}
 }
-@article{chen1999empirical,
+@inproceedings{iglesias2009hierarchical,
-  title={An empirical study of smoothing techniques for language modeling},
+  title={Hierarchical phrase-based translation with weighted finite state transducers},
-  author={Chen, Stanley F and Goodman, Joshua},
+  author={Iglesias, Gonzalo and de Gispert, Adri{\`a} and Banga, Eduardo R and Byrne, William},
-  journal={Computer Speech \& Language},
+  booktitle={Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
-  volume={13},
+  pages={433--441},
-  number={4},
+  year={2009}
-  pages={359--394},
+}
+@article{dyer2013a,
+title={A Simple, Fast, and Effective Reparameterization of IBM Model 2},
+author={Dyer, Chris and Chahuneau, Victor and Smith, Noah A},
+pages={644--648},
+year={2013}}
+@article{bahdanau2015neural,
+title={Neural Machine Translation by Jointly Learning to Align and Translate},
+author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
+year={2015}}
+@article{al2016theano,
+  title={Theano: A Python framework for fast computation of mathematical expressions},
+  author={Al-Rfou, Rami and Alain, Guillaume and Almahairi, Amjad and Angermueller, Christof and Bahdanau, Dzmitry and Ballas, Nicolas and Bastien, Fr{\'e}d{\'e}ric and Bayer, Justin and Belikov, Anatoly and Belopolsky, Alexander and others},
+  journal={arXiv preprint arXiv:1605.02688},
+  year={2016}
+}
+@article{SennrichNematus,
+  title={Nematus: a Toolkit for Neural Machine Translation},
+  author={Sennrich, Rico and Firat, Orhan and Cho, Kyunghyun and Birch, Alexandra and Haddow, Barry and Hitschler, Julian and Junczys-Dowmunt, Marcin and Läubli, Samuel and Barone, Antonio Valerio Miceli and Mokry, Jozef},
+}
+@article{zoph2016simple,
+title={Simple, Fast Noise-Contrastive Estimation for Large RNN Vocabularies.},
+author={Zoph, Barret and Vaswani, Ashish and May, Jonathan and Knight, Kevin},
+pages={1217--1222},
+year={2016}}
+@article{Ottfairseq,
+  title={fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
+  author={Ott, Myle and Edunov, Sergey and Baevski, Alexei and Fan, Angela and Gross, Sam and Ng, Nathan and Grangier, David and Auli, Michael},
+}
+@article{VaswaniTensor2Tensor,
+  title={Tensor2Tensor for Neural Machine Translation},
+  author={Vaswani, Ashish and Bengio, Samy and Brevdo, Eugene and Chollet, Francois and Gomez, Aidan N. and Gouws, Stephan and Jones, Llion and Kaiser, Łukasz and Kalchbrenner, Nal and Parmar, Niki},
+}
+@article{KleinOpenNMT,
+  title={OpenNMT: Open-Source Toolkit for Neural Machine Translation},
+  author={Klein, Guillaume and Kim, Yoon and Deng, Yuntian and Senellart, Jean and Rush, Alexander M.},
+}
+@inproceedings{luong2016acl_hybrid,
+ author = {Luong, Minh-Thang  and  Manning, Christopher D.},
+ title = {Achieving Open Vocabulary Neural Machine Translation with Hybrid Word-Character Models},
+ booktitle = {Association for Computational Linguistics (ACL)},
+ address = {Berlin, Germany},
+ month = {August},
+ year = {2016}
+}
+@article{ZhangTHUMT,
+  title={THUMT: An Open Source Toolkit for Neural Machine Translation},
+  author={Zhang, Jiacheng and Ding, Yanzhuo and Shen, Shiqi and Cheng, Yong and Sun, Maosong and Luan, Huanbo and Liu, Yang},
+}
+@article{JunczysMarian,
+  title={Marian: Fast Neural Machine Translation in C++},
+  author={Junczys-Dowmunt, Marcin and Grundkiewicz, Roman and Dwojak, Tomasz and Hoang, Hieu and Heafield, Kenneth and Neckermann, Tom and Seide, Frank and Germann, Ulrich and Aji, Alham Fikri and Bogoychev, Nikolay},
+}
+@article{hieber2017sockeye,
+title={Sockeye: A Toolkit for Neural Machine Translation.},
+author={Hieber, Felix and Domhan, Tobias and Denkowski, Michael and Vilar, David and Sokolov, Artem and Clifton, Ann and Post, Matt},
+journal={arXiv: Computation and Language},
+year={2017}}
+@article{WangCytonMT,
+  title={CytonMT: an Efficient Neural Machine Translation Open-source Toolkit Implemented in C++},
+  author={Wang, Xiaolin and Utiyama, Masao and Sumita, Eiichiro},
+}
+@article{KuchaievMixed,
+  title={Mixed-Precision Training for NLP and Speech Recognition with OpenSeq2Seq},
+  author={Kuchaiev, Oleksii and Ginsburg, Boris and Gitman, Igor and Lavrukhin, Vitaly and Li, Jason and Nguyen, Huyen and Case, Carl and Micikevicius, Paulius},
+}
+@article{nmtpy2017,
+  author    = {Ozan Caglayan and
+               Mercedes Garc\'{i}a-Mart\'{i}nez and
+               Adrien Bardet and
+               Walid Aransa and
+               Fethi Bougares and
+               Lo\"{i}c Barrault},
+  title     = {NMTPY: A Flexible Toolkit for Advanced Neural Machine Translation Systems},
+  journal   = {Prague Bull. Math. Linguistics},
+  volume    = {109},
+  pages     = {15--28},
+  year      = {2017},
+  url       = {https://ufal.mff.cuni.cz/pbml/109/art-caglayan-et-al.pdf},
+  doi       = {10.1515/pralin-2017-0035},
+  timestamp = {Tue, 12 Sep 2017 10:01:08 +0100}
+}
+@book{koehn2009statistical,
+  title={Statistical machine translation},
+  author={Koehn, Philipp},
+  year={2009},
+  publisher={Cambridge University Press}
+}
+@book{manning1999foundations,
+  title={Foundations of statistical natural language processing},
+  author={Manning, Christopher D and Sch{\"u}tze, Hinrich},
  year={1999},
-  publisher={Elsevier}
+  publisher={MIT Press}
+}
+@misc{manning2005统计自然语言处理基础,
+  title={统计自然语言处理基础},
+  author={Manning, Christopher D and Sch{\"u}tze, Hinrich},
+  year={2005},
+  publisher={北京: 电子工业出版社}
+}
+@book{宗成庆2013统计自然语言处理,
+  title={统计自然语言处理},
+  author={宗成庆},
+  year={2013},
+}
+@book{Goodfellow-et-al-2016,
+    title={Deep Learning},
+    author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
+    publisher={MIT Press},
+    note={\url{http://www.deeplearningbook.org}},
+    year={2016}
+}
+@article{goldberg2017neural,
+  title={Neural network methods for natural language processing},
+  author={Goldberg, Yoav},
+  journal={Synthesis Lectures on Human Language Technologies},
+  volume={10},
+  number={1},
+  pages={1--309},
+  year={2017},
+  publisher={Morgan \& Claypool Publishers}
+}
+@book{周志华2016机器学习,
+  title={机器学习},
+  author={周志华},
+  year={2016},
+  publisher={清华大学出版社}
 }
+%%%%% chapter 1------------------------------------------------------
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%% chapter 2------------------------------------------------------
 @book{mao-prob-book-2011,
  title={概率论与数理统计教程: 第二版},
@@ -134,28 +319,6 @@
  publisher={Courier Dover Publications}
 }
-@article{brown1993mathematics,
-  title={The mathematics of statistical machine translation: Parameter estimation},
-  author={Brown, Peter F and Pietra, Vincent J Della and Pietra, Stephen A Della and Mercer, Robert L},
-  journal={Computational linguistics},
-  volume={19},
-  number={2},
-  pages={263--311},
-  year={1993},
-  publisher={MIT Press}
-}
-@article{shannon1949communication,
-  title={Communication theory of secrecy systems},
-  author={Shannon, Claude E},
-  journal={Bell system technical journal},
-  volume={28},
-  number={4},
-  pages={656--715},
-  year={1949},
-  publisher={Wiley Online Library}
-}
 @book{liuke-markov-2004,
  title={实用马尔可夫决策过程},
  author={刘克},
@@ -213,16 +376,33 @@
  year={1999},
  publisher={Elsevier}
 }
-@book{chomsky2002syntactic,
-  title={Syntactic structures},
+@article{parsing2009speech,
-  author={Chomsky, Noam},
+  title={Speech and language processing},
-  year={2002},
+  author={Jurafsky, Daniel and Martin, James H},
-  publisher={Walter de Gruyter}
+  year={2009}
 }
-@article{shannon1948mathematical,
+@article{ney1994structuring,
-  title={A mathematical theory of communication},
+  title={On structuring probabilistic dependences in stochastic language modelling},
-  author={Shannon, Claude E},
+  author={Ney, Hermann and Essen, Ute and Kneser, Reinhard},
+  journal={Computer Speech \& Language},
+  volume={8},
+  number={1},
+  pages={1--38},
+  year={1994}
+}
+@book{chomsky2002syntactic,
+  title={Syntactic structures},
+  author={Chomsky, Noam},
+  year={2002},
+  publisher={Walter de Gruyter}
+}
+@article{shannon1948mathematical,
+  title={A mathematical theory of communication},
+  author={Shannon, Claude E},
  journal={Bell system technical journal},
  volume={27},
  number={3},
@@ -291,6 +471,12 @@ journal={arXiv: Computation and Language},
 year={2015}
 }
+@inproceedings{huang2008advanced,
+  title={Advanced Dynamic Programming in CL},
+  author={Huang, Liang},
+  year={2008}
+}
 @book{aho1972theory,
  title={The theory of parsing, translation, and compiling},
  author={Aho, Alfred V and Ullman, Jeffrey D},
@@ -299,23 +485,42 @@ year={2015}
  publisher={Prentice-Hall Englewood Cliffs, NJ}
 }
-@inproceedings{huang2008advanced,
+%%%%% chapter 2------------------------------------------------------
-  title={Advanced Dynamic Programming in CL},
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-  author={Huang, Liang},
-  year={2008}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%% chapter 3------------------------------------------------------
+@article{brown1990statistical,
+  title={A statistical approach to machine translation},
+  author={Brown, Peter F and Cocke, John and Della Pietra, Stephen A and Della Pietra, Vincent J and Jelinek, Frederick and Lafferty, John and Mercer, Robert L and Roossin, Paul S},
+  journal={Computational linguistics},
+  volume={16},
+  number={2},
+  pages={79--85},
+  year={1990}
 }
-@article{och2004alignment,
+@article{brown1993mathematics,
-  title={The alignment template approach to statistical machine translation},
+  title={The mathematics of statistical machine translation: Parameter estimation},
-  author={Och, Franz Josef and Ney, Hermann},
+  author={Brown, Peter F and Pietra, Vincent J Della and Pietra, Stephen A Della and Mercer, Robert L},
  journal={Computational linguistics},
-  volume={30},
+  volume={19},
-  number={4},
+  number={2},
-  pages={417--449},
+  pages={263--311},
-  year={2004},
+  year={1993},
  publisher={MIT Press}
 }
+@inproceedings{ittycheriah2005maximum,
+  title={A maximum entropy word aligner for Arabic-English machine translation},
+  author={Ittycheriah, Abraham and Roukos, Salim},
+  booktitle={Proceedings of the conference on Human Language Technology and Empirical Methods in Natural Language Processing},
+  pages={89--96},
+  year={2005},
+  organization={Association for Computational Linguistics}
+}
 @inproceedings{koehn2003statistical,
  title={Statistical phrase-based translation},
  author={Koehn, Philipp and Och, Franz Josef and Marcu, Daniel},
@@ -324,7 +529,12 @@ year={2015}
  year={2003},
  organization={Association for Computational Linguistics}
 }
+@book{manning1999foundations,
+  title={Foundations of statistical natural language processing},
+  author={Manning, Christopher D and Sch{\"u}tze, Hinrich},
+  year={1999},
+  publisher={MIT press}
+}
 @article{och2003systematic,
  title={A systematic comparison of various statistical alignment models},
  author={Och, Franz Josef and Ney, Hermann},
@@ -335,23 +545,25 @@ year={2015}
  year={2003},
  publisher={MIT Press}
 }
+@article{och2004alignment,
-@inproceedings{ittycheriah2005maximum,
+  title={The alignment template approach to statistical machine translation},
-  title={A maximum entropy word aligner for Arabic-English machine translation},
+  author={Och, Franz Josef and Ney, Hermann},
-  author={Ittycheriah, Abraham and Roukos, Salim},
+  journal={Computational linguistics},
-  booktitle={Proceedings of the conference on Human Language Technology and Empirical Methods in Natural Language Processing},
+  volume={30},
-  pages={89--96},
+  number={4},
-  year={2005},
+  pages={417--449},
-  organization={Association for Computational Linguistics}
+  year={2004},
+  publisher={MIT Press}
 }
+@article{shannon1949communication,
-@article{xiao2013unsupervised,
+  title={Communication theory of secrecy systems},
-  title={Unsupervised sub-tree alignment for tree-to-tree translation},
+  author={Shannon, Claude E},
-  author={Xiao, Tong and Zhu, Jingbo},
+  journal={Bell system technical journal},
-  journal={Journal of Artificial Intelligence Research},
+  volume={28},
-  volume={48},
+  number={4},
-  pages={733--782},
+  pages={656--715},
-  year={2013}
+  year={1949},
+  publisher={Wiley Online Library}
 }
 @inproceedings{vogel1996hmm,
  title={HMM-based word alignment in statistical translation},
@@ -361,229 +573,293 @@ year={2015}
  year={1996},
  organization={Association for Computational Linguistics}
 }
+@article{xiao2013unsupervised,
-@book{manning1999foundations,
+  title={Unsupervised sub-tree alignment for tree-to-tree translation},
-  title={Foundations of statistical natural language processing},
+  author={Xiao, Tong and Zhu, Jingbo},
-  author={Manning, Christopher D and Manning, Christopher D and Sch{\"u}tze, Hinrich},
+  journal={Journal of Artificial Intelligence Research},
-  year={1999},
+  volume={48},
-  publisher={MIT press}
+  pages={733--782},
-}
+  year={2013}
-@article{SPhilipp,
-  title={Philipp Koehn, Statistical machine translation},
-  author={Sánchez-Martínez, Felipe and Juan Antonio Pérez-Ortiz},
-  volume={24},
-  number={3-4},
-  pages={273-278},
-}
-@article{SIDDHARTHANChristopher,
-  title={Christopher D. Manning and Hinrich Schutze. Foundations of Statistical Natural Language Processing. MIT Press, 2000. ISBN 0-262-13360-1. 620 pp.},
-  author={SIDDHARTHAN and ADVAITH},
-  journal={Natural Language Engineering},
-  volume={8},
-  number={01},
-}
-@book{宗成庆2013统计自然语言处理,
-  title={统计自然语言处理},
-  author={宗成庆},
-  year={2013},
 }
+%%%%% chapter 3------------------------------------------------------
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-@article{HeatonIan,
-  title={Ian Goodfellow, Yoshua Bengio, and Aaron Courville: Deep learning},
-  author={Heaton and Jeff},
-  journal={Genetic Programming \& Evolvable Machines},
-  pages={s10710-017-9314-z},
-}
-@article{周志华2018《机器学习》,
-  title={《机器学习》},
-  author={周志华},
-  journal={航空港},
-  number={2},
-  pages={94-94},
-  year={2018},
-}
-@inproceedings{Tong2012NiuTrans,
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-  title={NiuTrans: an open source toolkit for phrase-based and syntax-based machine translation},
+%%%%%chapter5------------------------------------------------------------
-  author={Tong, Xiao and Zhu, Jingbo and Hao, Zhang and Qiang, Li},
+@article{baydin2017automatic,
-  booktitle={Proceedings of the ACL 2012 System Demonstrations},
+  title={Automatic differentiation in machine learning: a survey},
-  year={2012},
+  author={Baydin, At{\i}l{\i}m G{\"u}nes and Pearlmutter, Barak A and Radul, Alexey Andreyevich and Siskind, Jeffrey Mark},
+  journal={The Journal of Machine Learning Research},
+  volume={18},
+  number={1},
+  pages={5595--5637},
+  year={2017},
+  publisher={JMLR.org}
 }
+@article{qian1999momentum,
-@article{Koehn2007Moses,
+  title={On the momentum term in gradient descent learning algorithms},
-  title={Moses: Open Source Toolkit for Statistical Machine Translation},
+  author={Qian, Ning},
-  author={Koehn, Philipp and Hoang, Hieu and Birch, Alexandra and Callisonburch, Chris and Federico, Marcello and Bertoldi, Nicola and Cowan, Brooke and Shen, Wade and Moran, Christine and Zens, Richard},
+  journal={Neural networks},
-  volume={9},
+  volume={12},
  number={1},
-  pages={177--180},
+  pages={145--151},
-  year={2007},
+  year={1999},
+  publisher={Elsevier}
 }
+@article{duchi2011adaptive,
-@inproceedings{Dyer2010cdec,
+  title={Adaptive subgradient methods for online learning and stochastic optimization},
-  title={cdec: A Decoder, Alignment, and Learning Framework for Finite-State and Context-Free Translation Models},
+  author={Duchi, John and Hazan, Elad and Singer, Yoram},
-  author={Dyer, Chris and Lopez, Adam and Ganitkevitch, Juri and Weese, Jonathan and Resnik, Philip},
+  journal={Journal of machine learning research},
-  booktitle={ACL 2010, Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics, July 11-16, 2010, Uppsala, Sweden, System Demonstrations},
+  volume={12},
-  year={2010},
+  number={Jul},
+  pages={2121--2159},
+  year={2011}
 }
+@article{tieleman2012rmsprop,
-@article{SennrichNematus,
+  title={Rmsprop: Divide the gradient by a running average of its recent magnitude. coursera: Neural networks for machine learning},
-  title={Nematus: a Toolkit for Neural Machine Translation},
+  author={Tieleman, Tijmen and Hinton, Geoffrey},
-  author={Sennrich, Rico and Firat, Orhan and Cho, Kyunghyun and Birch, Alexandra and Haddow, Barry and Hitschler, Julian and Junczys-Dowmunt, Marcin and Läubli, Samuel and Barone, Antonio Valerio Miceli and Mokry, Jozef},
+  journal={COURSERA Neural Networks Mach. Learn},
+  year={2012}
 }
+@article{kingma2014adam,
-@article{Ottfairseq,
+  title={Adam: A method for stochastic optimization},
-  title={fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
+  author={Kingma, Diederik P and Ba, Jimmy},
-  author={Ott, Myle and Edunov, Sergey and Baevski, Alexei and Fan, Angela and Gross, Sam and Ng, Nathan and Grangier, David and Auli, Michael},
+  journal={arXiv preprint arXiv:1412.6980},
+  year={2014}
 }
+@inproceedings{xiao2017fast,
-@article{VaswaniTensor2Tensor,
+  title={Fast Parallel Training of Neural Language Models},
-  title={Tensor2Tensor for Neural Machine Translation},
+  author={Xiao, Tong and Zhu, Jingbo and Liu, Tongran and Zhang, Chunliang},
-  author={Vaswani, Ashish and Bengio, Samy and Brevdo, Eugene and Chollet, Francois and Gomez, Aidan N. and Gouws, Stephan and Jones, Llion and Kaiser, Łukasz and Kalchbrenner, Nal and Parmar, Niki},
+  booktitle={IJCAI},
+  pages={4193--4199},
+  year={2017}
 }
+@article{ioffe2015batch,
-@article{KleinOpenNMT,
+  title={Batch normalization: Accelerating deep network training by reducing internal covariate shift},
-  title={OpenNMT: Open-Source Toolkit for Neural Machine Translation},
+  author={Ioffe, Sergey and Szegedy, Christian},
-  author={Klein, Guillaume and Kim, Yoon and Deng, Yuntian and Senellart, Jean and Rush, Alexander M.},
+  journal={arXiv preprint arXiv:1502.03167},
+  year={2015}
 }
+@article{ba2016layer,
-@article{ZhangTHUMT,
+  title={Layer normalization},
-  title={THUMT: An Open Source Toolkit for Neural Machine Translation},
+  author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
-  author={Zhang, Jiacheng and Ding, Yanzhuo and Shen, Shiqi and Cheng, Yong and Sun, Maosong and Luan, Huanbo and Liu, Yang},
+  journal={arXiv preprint arXiv:1607.06450},
+  year={2016}
 }
+@inproceedings{he2016deep,
-@article{WangCytonMT,
+  title={Deep residual learning for image recognition},
-  title={CytonMT: an Efficient Neural Machine Translation Open-source Toolkit Implemented in C++},
+  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
-  author={Wang, Xiaolin and Utiyama, Masao and Sumita, Eiichiro},
+  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+  pages={770--778},
+  year={2016}
 }
+@article{bengio2003neural,
-@article{Germann2016Modern,
+  title={A neural probabilistic language model},
-  title={Modern MT: A New Open-Source Machine Translation Platform for the Translation Industry},
+  author={Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
-  author={Germann, Ulrich and Barbu, E and Bentivoglio, M and Bogoychev, Nikolay and Buck, C and Caroselli, D and Carvalho, L and Cattelan, A and Cattoni, R and Cettolo, M},
+  journal={Journal of machine learning research},
-  year={2016},
+  volume={3},
- abstract={Modern MT (www.modernmt.eu) is a three-year Horizon 2020 innovation action(2015–2017) to develop new open-source machine translation technology for use in translation production environments, both fully automatic and as a back-end in interactive post-editing scenarios. Led by Translated srl, the project consortium also includes the Fondazione Bruno Kessler (FBK), the University of Edinburgh, and TAUS B.V. Modern MT has received funding from the European Union’s Horizon 2020 research and innovation programme under Grant Agreement No. 645487 (call ICT-17-2014).},
+  number={Feb},
+  pages={1137--1155},
+  year={2003}
 }
+@inproceedings{mikolov2010recurrent,
-@article{JunczysMarian,
+  title={Recurrent neural network based language model},
-  title={Marian: Fast Neural Machine Translation in C++},
+  author={Mikolov, Tom{\'a}{\v{s}} and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\`y}, Jan and Khudanpur, Sanjeev},
-  author={Junczys-Dowmunt, Marcin and Grundkiewicz, Roman and Dwojak, Tomasz and Hoang, Hieu and Heafield, Kenneth and Neckermann, Tom and Seide, Frank and Germann, Ulrich and Aji, Alham Fikri and Bogoychev, Nikolay},
+  booktitle={Eleventh annual conference of the international speech communication association},
+  year={2010}
 }
+@inproceedings{vaswani2017attention,
-@article{hieber2017sockeye,
+  title={Attention is all you need},
-title={Sockeye: A Toolkit for Neural Machine Translation.},
+  author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
-author={Hieber, Felix and Domhan, Tobias and Denkowski, Michael and Vilar, David and Sokolov, Artem and Clifton, Ann and Post, Matt},
+  booktitle={Advances in neural information processing systems},
-journal={arXiv: Computation and Language},
+  pages={5998--6008},
-year={2017}}
+  year={2017}
-@article{KuchaievMixed,
-  title={Mixed-Precision Training for NLP and Speech Recognition with OpenSeq2Seq},
-  author={Kuchaiev, Oleksii and Ginsburg, Boris and Gitman, Igor and Lavrukhin, Vitaly and Li, Jason and Nguyen, Huyen and Case, Carl and Micikevicius, Paulius},
 }
+@inproceedings{mikolov2013distributed,
-@article{bahdanau2015neural,
+  title={Distributed representations of words and phrases and their compositionality},
-title={Neural Machine Translation by Jointly Learning to Align and Translate},
+  author={Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff},
-author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
+  booktitle={Advances in neural information processing systems},
-year={2015}}
+  pages={3111--3119},
+  year={2013}
-@article{Li2010Joshua,
-  title={Joshua: An Open Source Toolkit for Parsing-based Machine Translation},
-  author={Li, Zhifei and Callisonburch, Chris and Dyer, Chris and Ganitkevitch, Juri and Khudanpur, Sanjeev and Schwartz, Lane and Thornton, Wren N. G. and Weese, Jonathan and Zaidan, Omar F.},
-  pages={135--139},
-  year={2010},
 }
+@inproceedings{pennington2014glove,
-@inproceedings{Goldberg2017Neural,
+  title={Glove: Global vectors for word representation},
-  title={Neural Network Methods in Natural Language Processing},
+  author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
-  author={Goldberg, Yoav and Hirst, Graeme},
+  booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
-  booktitle={Neural Network Methods in Natural Language Processing},
+  pages={1532--1543},
+  year={2014}
+}
+@article{peters2018deep,
+  title={Deep contextualized word representations},
+  author={Peters, Matthew E and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
+  journal={arXiv preprint arXiv:1802.05365},
+  year={2018}
+}
+@article{radford2018improving,
+  title={Improving language understanding by generative pre-training},
+  author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
+  journal={\url{https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf}},
+  year={2018}
+}
+@article{devlin2018bert,
+  title={Bert: Pre-training of deep bidirectional transformers for language understanding},
+  author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
+  journal={arXiv preprint arXiv:1810.04805},
+  year={2018}
+}
+@article{goldberg2017neural,
+  title={Neural network methods for natural language processing},
+  author={Goldberg, Yoav},
+  journal={Synthesis Lectures on Human Language Technologies},
+  volume={10},
+  number={1},
+  pages={1--309},
  year={2017},
+  publisher={Morgan \& Claypool Publishers}
 }
+@article{lecun2015deep,
-@article{pino2010the,
+  title={Deep learning},
-title={The CUED HiFST System for the WMT10 Translation Shared Task},
+  author={LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
-author={Pino, Juan and Iglesias, Gonzalo and De Gispert, Adria and Blackwood, Graeme and Brunning, Jamie and Byrne, William},
+  journal={Nature},
-pages={155--160},
+  volume={521},
-year={2010}}
+  number={7553},
+  pages={436--444},
-@book{Junczysdowmunt2012SyMGiza,
+  year={2015},
-  title={SyMGiza++: Symmetrized Word Alignment Models for Statistical Machine Translation},
+  publisher={Nature Publishing Group}
-  author={Junczysdowmunt, Marcin and Szał, Arkadiusz},
-  year={2012},
 }
+@article{guidotti2018survey,
-@article{VilarJane,
+  title={A survey of methods for explaining black box models},
-  title={Jane: an advanced freely available hierarchical machine translation toolkit},
+  author={Guidotti, Riccardo and Monreale, Anna and Ruggieri, Salvatore and Turini, Franco and Giannotti, Fosca and Pedreschi, Dino},
-  author={Vilar, David and Stein, Daniel and Huck, Matthias and Ney, Hermann},
+  journal={ACM computing surveys (CSUR)},
-  journal={Machine Translation},
+  volume={51},
-  volume={26},
+  number={5},
-  number={3},
+  pages={1--42},
-  pages={197-216},
+  year={2018},
+  publisher={ACM New York, NY, USA}
 }
+@inproceedings{koh2017understanding,
-@article{Cer2010Phrasal,
+  title={Understanding black-box predictions via influence functions},
-  title={Phrasal: A Statistical Machine Translation Toolkit for Exploring New Model Features.},
+  author={Koh, Pang Wei and Liang, Percy},
-  author={Cer, Daniel M and Galley, Michel and Jurafsky, Daniel and Manning, Christopher D},
+  booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
-  year={2010},
+  pages={1885--1894},
+  year={2017},
+  organization={JMLR.org}
 }
+@article{arthur2016incorporating,
-@article{zollmann2007the,
+  title={Incorporating discrete translation lexicons into neural machine translation},
-title={The Syntax Augmented MT (SAMT) System at the Shared Task for the 2007 ACL Workshop on Statistical Machine Translation},
+  author={Arthur, Philip and Neubig, Graham and Nakamura, Satoshi},
-author={Zollmann, Andreas and Venugopal, Ashish and Paulik, Matthias and Vogel, Stephan},
+  journal={arXiv preprint arXiv:1606.02006},
-pages={216--219},
+  year={2016}
-year={2007}}
+}
+@article{zhang2018prior,
-@article{曼宁2005《统计自然语言处理基础》,
+  title={Prior knowledge integration for neural machine translation using posterior regularization},
-  title={《统计自然语言处理基础》},
+  author={Zhang, Jiacheng and Liu, Yang and Luan, Huanbo and Xu, Jingfang and Sun, Maosong},
-  author={曼宁},
+  journal={arXiv preprint arXiv:1811.01100},
-  journal={中文信息学报},
+  year={2018}
-  volume={19},
+}
-  number={3},
+@inproceedings{zollmann2006syntax,
-  pages={54-54},
+  title={Syntax augmented machine translation via chart parsing},
-  year={2005},
+  author={Zollmann, Andreas and Venugopal, Ashish},
+  booktitle={Proceedings of the Workshop on Statistical Machine Translation},
+  pages={138--141},
+  year={2006},
+  organization={Association for Computational Linguistics}
+}
+@inproceedings{charniak2003syntax,
+  title={Syntax-based language models for statistical machine translation},
+  author={Charniak, Eugene and Knight, Kevin and Yamada, Kenji},
+  booktitle={Proceedings of MT Summit IX},
+  pages={40--46},
+  year={2003}
+}
+@article{stahlberg2016syntactically,
+  title={Syntactically guided neural machine translation},
+  author={Stahlberg, Felix and Hasler, Eva and Waite, Aurelien and Byrne, Bill},
+  journal={arXiv preprint arXiv:1605.04569},
+  year={2016}
+}
+@inproceedings{plank2013embedding,
+  title={Embedding semantic similarity in tree kernels for domain adaptation of relation extraction},
+  author={Plank, Barbara and Moschitti, Alessandro},
+  booktitle={Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+  pages={1498--1507},
+  year={2013}
+}
+@inproceedings{perozzi2014deepwalk,
+  title={Deepwalk: Online learning of social representations},
+  author={Perozzi, Bryan and Al-Rfou, Rami and Skiena, Steven},
+  booktitle={Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining},
+  pages={701--710},
+  year={2014}
+}
+@article{collobert2011natural,
+  title={Natural language processing (almost) from scratch},
+  author={Collobert, Ronan and Weston, Jason and Bottou, L{\'e}on and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel},
+  journal={Journal of machine learning research},
+  volume={12},
+  number={Aug},
+  pages={2493--2537},
+  year={2011}
+}
+@article{mikolov2013efficient,
+  title={Efficient estimation of word representations in vector space},
+  author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
+  journal={arXiv preprint arXiv:1301.3781},
+  year={2013}
+}
+@inproceedings{mccann2017learned,
+  title={Learned in translation: Contextualized word vectors},
+  author={McCann, Bryan and Bradbury, James and Xiong, Caiming and Socher, Richard},
+  booktitle={Advances in Neural Information Processing Systems},
+  pages={6294--6305},
+  year={2017}
+}
+@article{radford2019language,
+  title={Language models are unsupervised multitask learners},
+  author={Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
+  journal={OpenAI Blog},
+  volume={1},
+  number={8},
+  pages={9},
+  year={2019}
+}
+@article{lample2019cross,
+  title={Cross-lingual language model pretraining},
+  author={Lample, Guillaume and Conneau, Alexis},
+  journal={arXiv preprint arXiv:1901.07291},
+  year={2019}
+}
+@article{song2019mass,
+  title={Mass: Masked sequence to sequence pre-training for language generation},
+  author={Song, Kaitao and Tan, Xu and Qin, Tao and Lu, Jianfeng and Liu, Tie-Yan},
+  journal={arXiv preprint arXiv:1905.02450},
+  year={2019}
+}
+@inproceedings{yang2019xlnet,
+  title={Xlnet: Generalized autoregressive pretraining for language understanding},
+  author={Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Russ R and Le, Quoc V},
+  booktitle={Advances in neural information processing systems},
+  pages={5754--5764},
+  year={2019}
+}
+@article{lan2019albert,
+  title={Albert: A lite bert for self-supervised learning of language representations},
+  author={Lan, Zhenzhong and Chen, Mingda and Goodman, Sebastian and Gimpel, Kevin and Sharma, Piyush and Soricut, Radu},
+  journal={arXiv preprint arXiv:1909.11942},
+  year={2019}
 }
+%%%%%chapter5------------------------------------------------------------
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-@article{zoph2016simple,
-title={Simple, Fast Noise-Contrastive Estimation for Large RNN Vocabularies.},
-author={Zoph, Barret and Vaswani, Ashish and May, Jonathan and Knight, Kevin},
-pages={1217--1222},
-year={2016}}
-@article{dyer2013a,
-title={A Simple, Fast, and Effective Reparameterization of IBM Model 2},
-author={Dyer, Chris and Chahuneau, Victor and Smith, Noah A},
-pages={644--648},
-year={2013}}
-@article{nmtpy2017,
-  author    = {Ozan Caglayan and
-               Mercedes Garc\'{i}a-Mart\'{i}nez and
-               Adrien Bardet and
-               Walid Aransa and
-               Fethi Bougares and
-               Lo\"{i}c Barrault},
-  title     = {NMTPY: A Flexible Toolkit for Advanced Neural Machine Translation Systems},
-  journal   = {Prague Bull. Math. Linguistics},
-  volume    = {109},
-  pages     = {15--28},
-  year      = {2017},
-  url       = {https://ufal.mff.cuni.cz/pbml/109/art-caglayan-et-al.pdf},
-  doi       = {10.1515/pralin-2017-0035},
-  timestamp = {Tue, 12 Sep 2017 10:01:08 +0100}
-}
-@inproceedings{luong2016acl_hybrid,
- author = {Luong, Minh-Thang  and  Manning, Christopher D.},
- title = {Achieving Open Vocabulary Neural Machine Translation with Hybrid Word-Character Models},
- booktitle = {Association for Computational Linguistics (ACL)},
- address = {Berlin, Germany},
- month = {August},
- year = {2016}
-}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%section6
+%chapter6----------------------------------------------------------------
 @article{DBLP:journals/corr/abs-1905-13324,
  author    = {Biao Zhang and
@@ -729,13 +1005,10 @@ year={2013}}
  bibsource = {dblp computer science bibliography, https://dblp.org}
 }
-%//找不到，自己写的  层次短语翻译的神经网络调序模型
+@MISC{Schwenk_continuousspace,
-@incollection{liumodel,
+    author = {Holger Schwenk},
-title = {层次短语翻译的神经网络调序模型},
+    title = {Continuous Space Translation Models for Phrase-Based Statistical Machine Translation},
-author = {李鹏,刘洋,孙茂松},
+    year = {2012}
-booktitle = {清华大学学报(自然科学版)},
-pages = {1529-1533},
-year = {2014}
 }
 @incollection{NIPS2017_7181,
@@ -1330,541 +1603,7 @@ url = {http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neura
  author={Zhang, Chunliang and Tong, Xiao and Zhu, Jingbo and Liu, Tongran},
  year={2017},
 }
+%chapter6----------------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%chapter5------------------------------------------------------------
-@article{baydin2017automatic,
-  title={Automatic differentiation in machine learning: a survey},
-  author={Baydin, At{\i}l{\i}m G{\"u}nes and Pearlmutter, Barak A and Radul, Alexey Andreyevich and Siskind, Jeffrey Mark},
-  journal={The Journal of Machine Learning Research},
-  volume={18},
-  number={1},
-  pages={5595--5637},
-  year={2017},
-  publisher={JMLR. org}
-}
-@article{qian1999momentum,
-  title={On the momentum term in gradient descent learning algorithms},
-  author={Qian, Ning},
-  journal={Neural networks},
-  volume={12},
-  number={1},
-  pages={145--151},
-  year={1999},
-  publisher={Elsevier}
-}
-@article{duchi2011adaptive,
-  title={Adaptive subgradient methods for online learning and stochastic optimization},
-  author={Duchi, John and Hazan, Elad and Singer, Yoram},
-  journal={Journal of machine learning research},
-  volume={12},
-  number={Jul},
-  pages={2121--2159},
-  year={2011}
-}
-@article{tieleman2012rmsprop,
-  title={Rmsprop: Divide the gradient by a running average of its recent magnitude. coursera: Neural networks for machine learning},
-  author={Tieleman, Tijmen and Hinton, Geoffrey},
-  journal={COURSERA Neural Networks Mach. Learn},
-  year={2012}
-}
-@article{kingma2014adam,
-  title={Adam: A method for stochastic optimization},
-  author={Kingma, Diederik P and Ba, Jimmy},
-  journal={arXiv preprint arXiv:1412.6980},
-  year={2014}
-}
-@inproceedings{xiao2017fast,
-  title={Fast Parallel Training of Neural Language Models.},
-  author={Xiao, Tong and Zhu, Jingbo and Liu, Tongran and Zhang, Chunliang},
-  booktitle={IJCAI},
-  pages={4193--4199},
-  year={2017}
-}
-@article{ioffe2015batch,
-  title={Batch normalization: Accelerating deep network training by reducing internal covariate shift},
-  author={Ioffe, Sergey and Szegedy, Christian},
-  journal={arXiv preprint arXiv:1502.03167},
-  year={2015}
-}
-@article{ba2016layer,
-  title={Layer normalization},
-  author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
-  journal={arXiv preprint arXiv:1607.06450},
-  year={2016}
-}
-@inproceedings{he2016deep,
-  title={Deep residual learning for image recognition},
-  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
-  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
-  pages={770--778},
-  year={2016}
-}
-@article{bengio2003neural,
-  title={A neural probabilistic language model},
-  author={Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
-  journal={Journal of machine learning research},
-  volume={3},
-  number={Feb},
-  pages={1137--1155},
-  year={2003}
-}
-@inproceedings{mikolov2010recurrent,
-  title={Recurrent neural network based language model},
-  author={Mikolov, Tom{\'a}{\v{s}} and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\`y}, Jan and Khudanpur, Sanjeev},
-  booktitle={Eleventh annual conference of the international speech communication association},
-  year={2010}
-}
-@inproceedings{vaswani2017attention,
-  title={Attention is all you need},
-  author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
-  booktitle={Advances in neural information processing systems},
-  pages={5998--6008},
-  year={2017}
-}
-@inproceedings{mikolov2013distributed,
-  title={Distributed representations of words and phrases and their compositionality},
-  author={Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff},
-  booktitle={Advances in neural information processing systems},
-  pages={3111--3119},
-  year={2013}
-}
-@inproceedings{pennington2014glove,
-  title={Glove: Global vectors for word representation},
-  author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
-  booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
-  pages={1532--1543},
-  year={2014}
-}
-@article{peters2018deep,
-  title={Deep contextualized word representations},
-  author={Peters, Matthew E and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
-  journal={arXiv preprint arXiv:1802.05365},
-  year={2018}
-}
-@article{radford2018improving,
-  title={Improving language understanding by generative pre-training},
-  author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
-  journal={URL https://s3-us-west-2. amazonaws. com/openai-assets/researchcovers/languageunsupervised/language understanding paper. pdf},
-  year={2018}
-}
-@article{devlin2018bert,
-  title={Bert: Pre-training of deep bidirectional transformers for language understanding},
-  author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
-  journal={arXiv preprint arXiv:1810.04805},
-  year={2018}
-}
-@article{goldberg2017neural,
-  title={Neural network methods for natural language processing},
-  author={Goldberg, Yoav},
-  journal={Synthesis Lectures on Human Language Technologies},
-  volume={10},
-  number={1},
-  pages={1--309},
-  year={2017},
-  publisher={Morgan \& Claypool Publishers}
-}
-@article{lecun2015deep,
-  title={Deep learning},
-  author={LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
-  journal={nature},
-  volume={521},
-  number={7553},
-  pages={436--444},
-  year={2015},
-  publisher={Nature Publishing Group}
-}
-@article{guidotti2018survey,
-  title={A survey of methods for explaining black box models},
-  author={Guidotti, Riccardo and Monreale, Anna and Ruggieri, Salvatore and Turini, Franco and Giannotti, Fosca and Pedreschi, Dino},
-  journal={ACM computing surveys (CSUR)},
-  volume={51},
-  number={5},
-  pages={1--42},
-  year={2018},
-  publisher={ACM New York, NY, USA}
-}
-@inproceedings{koh2017understanding,
-  title={Understanding black-box predictions via influence functions},
-  author={Koh, Pang Wei and Liang, Percy},
-  booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
-  pages={1885--1894},
-  year={2017},
-  organization={JMLR. org}
-}
-@article{arthur2016incorporating,
-  title={Incorporating discrete translation lexicons into neural machine translation},
-  author={Arthur, Philip and Neubig, Graham and Nakamura, Satoshi},
-  journal={arXiv preprint arXiv:1606.02006},
-  year={2016}
-}
-@article{zhang2018prior,
-  title={Prior knowledge integration for neural machine translation using posterior regularization},
-  author={Zhang, Jiacheng and Liu, Yang and Luan, Huanbo and Xu, Jingfang and Sun, Maosong},
-  journal={arXiv preprint arXiv:1811.01100},
-  year={2018}
-}
-@inproceedings{zollmann2006syntax,
-  title={Syntax augmented machine translation via chart parsing},
-  author={Zollmann, Andreas and Venugopal, Ashish},
-  booktitle={Proceedings of the Workshop on Statistical Machine Translation},
-  pages={138--141},
-  year={2006},
-  organization={Association for Computational Linguistics}
-}
-@inproceedings{charniak2003syntax,
-  title={Syntax-based language models for statistical machine translation},
-  author={Charniak, Eugene and Knight, Kevin and Yamada, Kenji},
-  booktitle={Proceedings of MT Summit IX},
-  pages={40--46},
-  year={2003}
-}
-@article{stahlberg2016syntactically,
-  title={Syntactically guided neural machine translation},
-  author={Stahlberg, Felix and Hasler, Eva and Waite, Aurelien and Byrne, Bill},
-  journal={arXiv preprint arXiv:1605.04569},
-  year={2016}
-}
-@inproceedings{plank2013embedding,
-  title={Embedding semantic similarity in tree kernels for domain adaptation of relation extraction},
-  author={Plank, Barbara and Moschitti, Alessandro},
-  booktitle={Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
-  pages={1498--1507},
-  year={2013}
-}
-@inproceedings{perozzi2014deepwalk,
-  title={Deepwalk: Online learning of social representations},
-  author={Perozzi, Bryan and Al-Rfou, Rami and Skiena, Steven},
-  booktitle={Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining},
-  pages={701--710},
-  year={2014}
-}
-@article{collobert2011natural,
-  title={Natural language processing (almost) from scratch},
-  author={Collobert, Ronan and Weston, Jason and Bottou, L{\'e}on and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel},
-  journal={Journal of machine learning research},
-  volume={12},
-  number={Aug},
-  pages={2493--2537},
-  year={2011}
-}
-@article{mikolov2013efficient,
-  title={Efficient estimation of word representations in vector space},
-  author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
-  journal={arXiv preprint arXiv:1301.3781},
-  year={2013}
-}
-@inproceedings{mccann2017learned,
-  title={Learned in translation: Contextualized word vectors},
-  author={McCann, Bryan and Bradbury, James and Xiong, Caiming and Socher, Richard},
-  booktitle={Advances in Neural Information Processing Systems},
-  pages={6294--6305},
-  year={2017}
-}
-@article{radford2019language,
-  title={Language models are unsupervised multitask learners},
-  author={Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
-  journal={OpenAI Blog},
-  volume={1},
-  number={8},
-  pages={9},
-  year={2019}
-}
-@article{lample2019cross,
-  title={Cross-lingual language model pretraining},
-  author={Lample, Guillaume and Conneau, Alexis},
-  journal={arXiv preprint arXiv:1901.07291},
-  year={2019}
-}
-@article{song2019mass,
-  title={Mass: Masked sequence to sequence pre-training for language generation},
-  author={Song, Kaitao and Tan, Xu and Qin, Tao and Lu, Jianfeng and Liu, Tie-Yan},
-  journal={arXiv preprint arXiv:1905.02450},
-  year={2019}
-}
-@inproceedings{yang2019xlnet,
-  title={Xlnet: Generalized autoregressive pretraining for language understanding},
-  author={Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Russ R and Le, Quoc V},
-  booktitle={Advances in neural information processing systems},
-  pages={5754--5764},
-  year={2019}
-}
-@article{lan2019albert,
-  title={Albert: A lite bert for self-supervised learning of language representations},
-  author={Lan, Zhenzhong and Chen, Mingda and Goodman, Sebastian and Gimpel, Kevin and Sharma, Piyush and Soricut, Radu},
-  journal={arXiv preprint arXiv:1909.11942},
-  year={2019}
-}
-%%-----------------------------------------------------------------------------------------
-%%%%%chapter5------------------------------------------------------------
-@article{baydin2017automatic,
-  title={Automatic differentiation in machine learning: a survey},
-  author={Baydin, At{\i}l{\i}m G{\"u}nes and Pearlmutter, Barak A and Radul, Alexey Andreyevich and Siskind, Jeffrey Mark},
-  journal={The Journal of Machine Learning Research},
-  volume={18},
-  number={1},
-  pages={5595--5637},
-  year={2017},
-  publisher={JMLR. org}
-}
-@article{qian1999momentum,
-  title={On the momentum term in gradient descent learning algorithms},
-  author={Qian, Ning},
-  journal={Neural networks},
-  volume={12},
-  number={1},
-  pages={145--151},
-  year={1999},
-  publisher={Elsevier}
-}
-@article{duchi2011adaptive,
-  title={Adaptive subgradient methods for online learning and stochastic optimization},
-  author={Duchi, John and Hazan, Elad and Singer, Yoram},
-  journal={Journal of machine learning research},
-  volume={12},
-  number={Jul},
-  pages={2121--2159},
-  year={2011}
-}
-@article{tieleman2012rmsprop,
-  title={Rmsprop: Divide the gradient by a running average of its recent magnitude. coursera: Neural networks for machine learning},
-  author={Tieleman, Tijmen and Hinton, Geoffrey},
-  journal={COURSERA Neural Networks Mach. Learn},
-  year={2012}
-}
-@article{kingma2014adam,
-  title={Adam: A method for stochastic optimization},
-  author={Kingma, Diederik P and Ba, Jimmy},
-  journal={arXiv preprint arXiv:1412.6980},
-  year={2014}
-}
-@inproceedings{xiao2017fast,
-  title={Fast Parallel Training of Neural Language Models.},
-  author={Xiao, Tong and Zhu, Jingbo and Liu, Tongran and Zhang, Chunliang},
-  booktitle={IJCAI},
-  pages={4193--4199},
-  year={2017}
-}
-@article{ioffe2015batch,
-  title={Batch normalization: Accelerating deep network training by reducing internal covariate shift},
-  author={Ioffe, Sergey and Szegedy, Christian},
-  journal={arXiv preprint arXiv:1502.03167},
-  year={2015}
-}
-@article{ba2016layer,
-  title={Layer normalization},
-  author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
-  journal={arXiv preprint arXiv:1607.06450},
-  year={2016}
-}
-@inproceedings{he2016deep,
-  title={Deep residual learning for image recognition},
-  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
-  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
-  pages={770--778},
-  year={2016}
-}
-@article{bengio2003neural,
-  title={A neural probabilistic language model},
-  author={Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
-  journal={Journal of machine learning research},
-  volume={3},
-  number={Feb},
-  pages={1137--1155},
-  year={2003}
-}
-@inproceedings{mikolov2010recurrent,
-  title={Recurrent neural network based language model},
-  author={Mikolov, Tom{\'a}{\v{s}} and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\`y}, Jan and Khudanpur, Sanjeev},
-  booktitle={Eleventh annual conference of the international speech communication association},
-  year={2010}
-}
-@inproceedings{vaswani2017attention,
-  title={Attention is all you need},
-  author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
-  booktitle={Advances in neural information processing systems},
-  pages={5998--6008},
-  year={2017}
-}
-@inproceedings{mikolov2013distributed,
-  title={Distributed representations of words and phrases and their compositionality},
-  author={Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff},
-  booktitle={Advances in neural information processing systems},
-  pages={3111--3119},
-  year={2013}
-}
-@inproceedings{pennington2014glove,
-  title={Glove: Global vectors for word representation},
-  author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
-  booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
-  pages={1532--1543},
-  year={2014}
-}
-@article{peters2018deep,
-  title={Deep contextualized word representations},
-  author={Peters, Matthew E and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
-  journal={arXiv preprint arXiv:1802.05365},
-  year={2018}
-}
-@article{radford2018improving,
-  title={Improving language understanding by generative pre-training},
-  author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
-  journal={URL https://s3-us-west-2. amazonaws. com/openai-assets/researchcovers/languageunsupervised/language understanding paper. pdf},
-  year={2018}
-}
-@article{devlin2018bert,
-  title={Bert: Pre-training of deep bidirectional transformers for language understanding},
-  author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
-  journal={arXiv preprint arXiv:1810.04805},
-  year={2018}
-}
-@article{goldberg2017neural,
-  title={Neural network methods for natural language processing},
-  author={Goldberg, Yoav},
-  journal={Synthesis Lectures on Human Language Technologies},
-  volume={10},
-  number={1},
-  pages={1--309},
-  year={2017},
-  publisher={Morgan \& Claypool Publishers}
-}
-@article{lecun2015deep,
-  title={Deep learning},
-  author={LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
-  journal={nature},
-  volume={521},
-  number={7553},
-  pages={436--444},
-  year={2015},
-  publisher={Nature Publishing Group}
-}
-@article{guidotti2018survey,
-  title={A survey of methods for explaining black box models},
-  author={Guidotti, Riccardo and Monreale, Anna and Ruggieri, Salvatore and Turini, Franco and Giannotti, Fosca and Pedreschi, Dino},
-  journal={ACM computing surveys (CSUR)},
-  volume={51},
-  number={5},
-  pages={1--42},
-  year={2018},
-  publisher={ACM New York, NY, USA}
-}
-@inproceedings{koh2017understanding,
-  title={Understanding black-box predictions via influence functions},
-  author={Koh, Pang Wei and Liang, Percy},
-  booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
-  pages={1885--1894},
-  year={2017},
-  organization={JMLR. org}
-}
-@article{arthur2016incorporating,
-  title={Incorporating discrete translation lexicons into neural machine translation},
-  author={Arthur, Philip and Neubig, Graham and Nakamura, Satoshi},
-  journal={arXiv preprint arXiv:1606.02006},
-  year={2016}
-}
-@article{zhang2018prior,
-  title={Prior knowledge integration for neural machine translation using posterior regularization},
-  author={Zhang, Jiacheng and Liu, Yang and Luan, Huanbo and Xu, Jingfang and Sun, Maosong},
-  journal={arXiv preprint arXiv:1811.01100},
-  year={2018}
-}
-@inproceedings{zollmann2006syntax,
-  title={Syntax augmented machine translation via chart parsing},
-  author={Zollmann, Andreas and Venugopal, Ashish},
-  booktitle={Proceedings of the Workshop on Statistical Machine Translation},
-  pages={138--141},
-  year={2006},
-  organization={Association for Computational Linguistics}
-}
-@inproceedings{charniak2003syntax,
-  title={Syntax-based language models for statistical machine translation},
-  author={Charniak, Eugene and Knight, Kevin and Yamada, Kenji},
-  booktitle={Proceedings of MT Summit IX},
-  pages={40--46},
-  year={2003}
-}
-@article{stahlberg2016syntactically,
-  title={Syntactically guided neural machine translation},
-  author={Stahlberg, Felix and Hasler, Eva and Waite, Aurelien and Byrne, Bill},
-  journal={arXiv preprint arXiv:1605.04569},
-  year={2016}
-}
-@inproceedings{plank2013embedding,
-  title={Embedding semantic similarity in tree kernels for domain adaptation of relation extraction},
-  author={Plank, Barbara and Moschitti, Alessandro},
-  booktitle={Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
-  pages={1498--1507},
-  year={2013}
-}
-@inproceedings{perozzi2014deepwalk,
-  title={Deepwalk: Online learning of social representations},
-  author={Perozzi, Bryan and Al-Rfou, Rami and Skiena, Steven},
-  booktitle={Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining},
-  pages={701--710},
-  year={2014}
-}
-@article{collobert2011natural,
-  title={Natural language processing (almost) from scratch},
-  author={Collobert, Ronan and Weston, Jason and Bottou, L{\'e}on and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel},
-  journal={Journal of machine learning research},
-  volume={12},
-  number={Aug},
-  pages={2493--2537},
-  year={2011}
-}
-@article{mikolov2013efficient,
-  title={Efficient estimation of word representations in vector space},
-  author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
-  journal={arXiv preprint arXiv:1301.3781},
-  year={2013}
-}
-@inproceedings{mccann2017learned,
-  title={Learned in translation: Contextualized word vectors},
-  author={McCann, Bryan and Bradbury, James and Xiong, Caiming and Socher, Richard},
-  booktitle={Advances in Neural Information Processing Systems},
-  pages={6294--6305},
-  year={2017}
-}
-@article{radford2019language,
-  title={Language models are unsupervised multitask learners},
-  author={Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
-  journal={OpenAI Blog},
-  volume={1},
-  number={8},
-  pages={9},
-  year={2019}
-}
-@article{lample2019cross,
-  title={Cross-lingual language model pretraining},
-  author={Lample, Guillaume and Conneau, Alexis},
-  journal={arXiv preprint arXiv:1901.07291},
-  year={2019}
-}
-@article{song2019mass,
-  title={Mass: Masked sequence to sequence pre-training for language generation},
-  author={Song, Kaitao and Tan, Xu and Qin, Tao and Lu, Jianfeng and Liu, Tie-Yan},
-  journal={arXiv preprint arXiv:1905.02450},
-  year={2019}
-}
-@inproceedings{yang2019xlnet,
-  title={Xlnet: Generalized autoregressive pretraining for language understanding},
-  author={Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Russ R and Le, Quoc V},
-  booktitle={Advances in neural information processing systems},
-  pages={5754--5764},
-  year={2019}
-}
-@article{lan2019albert,
-  title={Albert: A lite bert for self-supervised learning of language representations},
-  author={Lan, Zhenzhong and Chen, Mingda and Goodman, Sebastian and Gimpel, Kevin and Sharma, Piyush and Soricut, Radu},
-  journal={arXiv preprint arXiv:1909.11942},
-  year={2019}
-}
-%%-----------------------------------------------------------------------------------------
--- a/Book/mt-book-xelatex.bbl
+++ b/Book/mt-book-xelatex.bbl
--- a/Book/mt-book-xelatex.idx
+++ b/Book/mt-book-xelatex.idx
+\indexentry{Chapter1.1|hyperpage}{11}
+\indexentry{Chapter1.2|hyperpage}{14}
+\indexentry{Chapter1.3|hyperpage}{19}
+\indexentry{Chapter1.4|hyperpage}{20}
+\indexentry{Chapter1.4.1|hyperpage}{20}
+\indexentry{Chapter1.4.2|hyperpage}{22}
+\indexentry{Chapter1.4.3|hyperpage}{23}
+\indexentry{Chapter1.4.4|hyperpage}{24}
+\indexentry{Chapter1.4.5|hyperpage}{25}
+\indexentry{Chapter1.5|hyperpage}{25}
+\indexentry{Chapter1.5.1|hyperpage}{26}
+\indexentry{Chapter1.5.2|hyperpage}{27}
+\indexentry{Chapter1.5.2.1|hyperpage}{27}
+\indexentry{Chapter1.5.2.2|hyperpage}{28}
+\indexentry{Chapter1.5.2.3|hyperpage}{29}
+\indexentry{Chapter1.6|hyperpage}{30}
+\indexentry{Chapter1.7|hyperpage}{32}
+\indexentry{Chapter1.7.1|hyperpage}{33}
+\indexentry{Chapter1.7.1.1|hyperpage}{33}
+\indexentry{Chapter1.7.1.2|hyperpage}{34}
+\indexentry{Chapter1.7.2|hyperpage}{36}
+\indexentry{Chapter1.8|hyperpage}{39}
+\indexentry{Chapter2.1|hyperpage}{44}
+\indexentry{Chapter2.2|hyperpage}{45}
+\indexentry{Chapter2.2.1|hyperpage}{46}
+\indexentry{Chapter2.2.2|hyperpage}{47}
+\indexentry{Chapter2.2.3|hyperpage}{48}
+\indexentry{Chapter2.2.4|hyperpage}{49}
+\indexentry{Chapter2.2.5|hyperpage}{51}
+\indexentry{Chapter2.2.5.1|hyperpage}{51}
+\indexentry{Chapter2.2.5.2|hyperpage}{52}
+\indexentry{Chapter2.2.5.3|hyperpage}{52}
+\indexentry{Chapter2.3|hyperpage}{53}
+\indexentry{Chapter2.3.1|hyperpage}{54}
+\indexentry{Chapter2.3.2|hyperpage}{55}
+\indexentry{Chapter2.3.2.1|hyperpage}{55}
+\indexentry{Chapter2.3.2.2|hyperpage}{56}
+\indexentry{Chapter2.3.2.3|hyperpage}{58}
+\indexentry{Chapter2.4|hyperpage}{60}
+\indexentry{Chapter2.4.1|hyperpage}{61}
+\indexentry{Chapter2.4.2|hyperpage}{63}
+\indexentry{Chapter2.4.2.1|hyperpage}{64}
+\indexentry{Chapter2.4.2.2|hyperpage}{65}
+\indexentry{Chapter2.4.2.3|hyperpage}{66}
+\indexentry{Chapter2.5|hyperpage}{68}
+\indexentry{Chapter2.5.1|hyperpage}{68}
+\indexentry{Chapter2.5.2|hyperpage}{70}
+\indexentry{Chapter2.5.3|hyperpage}{74}
+\indexentry{Chapter2.6|hyperpage}{76}
+\indexentry{Chapter3.1|hyperpage}{81}
+\indexentry{Chapter3.2|hyperpage}{83}
+\indexentry{Chapter3.2.1|hyperpage}{83}
+\indexentry{Chapter3.2.1.1|hyperpage}{83}
+\indexentry{Chapter3.2.1.2|hyperpage}{84}
+\indexentry{Chapter3.2.1.3|hyperpage}{85}
+\indexentry{Chapter3.2.2|hyperpage}{85}
+\indexentry{Chapter3.2.3|hyperpage}{86}
+\indexentry{Chapter3.2.3.1|hyperpage}{86}
+\indexentry{Chapter3.2.3.2|hyperpage}{87}
+\indexentry{Chapter3.2.3.3|hyperpage}{88}
+\indexentry{Chapter3.2.4|hyperpage}{89}
+\indexentry{Chapter3.2.4.1|hyperpage}{89}
+\indexentry{Chapter3.2.4.2|hyperpage}{91}
+\indexentry{Chapter3.2.5|hyperpage}{92}
+\indexentry{Chapter3.3|hyperpage}{95}
+\indexentry{Chapter3.3.1|hyperpage}{95}
+\indexentry{Chapter3.3.2|hyperpage}{98}
+\indexentry{Chapter3.3.2.1|hyperpage}{99}
+\indexentry{Chapter3.3.2.2|hyperpage}{100}
+\indexentry{Chapter3.3.2.3|hyperpage}{101}
+\indexentry{Chapter3.4|hyperpage}{102}
+\indexentry{Chapter3.4.1|hyperpage}{102}
+\indexentry{Chapter3.4.2|hyperpage}{104}
+\indexentry{Chapter3.4.3|hyperpage}{105}
+\indexentry{Chapter3.4.4|hyperpage}{106}
+\indexentry{Chapter3.4.4.1|hyperpage}{106}
+\indexentry{Chapter3.4.4.2|hyperpage}{107}
+\indexentry{Chapter3.5|hyperpage}{112}
+\indexentry{Chapter3.5.1|hyperpage}{113}
+\indexentry{Chapter3.5.2|hyperpage}{115}
+\indexentry{Chapter3.5.3|hyperpage}{117}
+\indexentry{Chapter3.5.4|hyperpage}{118}
+\indexentry{Chapter3.5.5|hyperpage}{120}
+\indexentry{Chapter3.5.5|hyperpage}{122}
+\indexentry{Chapter3.6|hyperpage}{123}
+\indexentry{Chapter3.6.1|hyperpage}{123}
+\indexentry{Chapter3.6.2|hyperpage}{124}
+\indexentry{Chapter3.6.4|hyperpage}{125}
+\indexentry{Chapter3.6.5|hyperpage}{125}
+\indexentry{Chapter3.7|hyperpage}{125}
+\indexentry{Chapter5.1|hyperpage}{130}
+\indexentry{Chapter5.1.1|hyperpage}{130}
+\indexentry{Chapter5.1.1.1|hyperpage}{130}
+\indexentry{Chapter5.1.1.2|hyperpage}{131}
+\indexentry{Chapter5.1.1.3|hyperpage}{132}
+\indexentry{Chapter5.1.2|hyperpage}{133}
+\indexentry{Chapter5.1.2.1|hyperpage}{133}
+\indexentry{Chapter5.1.2.2|hyperpage}{134}
+\indexentry{Chapter5.2|hyperpage}{134}
+\indexentry{Chapter5.2.1|hyperpage}{134}
+\indexentry{Chapter5.2.1.1|hyperpage}{135}
+\indexentry{Chapter5.2.1.2|hyperpage}{136}
+\indexentry{Chapter5.2.1.3|hyperpage}{136}
+\indexentry{Chapter5.2.1.4|hyperpage}{137}
+\indexentry{Chapter5.2.1.5|hyperpage}{138}
+\indexentry{Chapter5.2.1.6|hyperpage}{139}
+\indexentry{Chapter5.2.2|hyperpage}{140}
+\indexentry{Chapter5.2.2.1|hyperpage}{141}
+\indexentry{Chapter5.2.2.2|hyperpage}{141}
+\indexentry{Chapter5.2.2.3|hyperpage}{142}
+\indexentry{Chapter5.2.2.4|hyperpage}{143}
+\indexentry{Chapter5.2.3|hyperpage}{144}
+\indexentry{Chapter5.2.3.1|hyperpage}{144}
+\indexentry{Chapter5.2.3.2|hyperpage}{146}
+\indexentry{Chapter5.2.4|hyperpage}{148}
+\indexentry{Chapter5.3|hyperpage}{151}
+\indexentry{Chapter5.3.1|hyperpage}{151}
+\indexentry{Chapter5.3.1.1|hyperpage}{151}
+\indexentry{Chapter5.3.1.2|hyperpage}{153}
+\indexentry{Chapter5.3.1.3|hyperpage}{154}
+\indexentry{Chapter5.3.2|hyperpage}{155}
+\indexentry{Chapter5.3.3|hyperpage}{156}
+\indexentry{Chapter5.3.4|hyperpage}{160}
+\indexentry{Chapter5.3.5|hyperpage}{161}
+\indexentry{Chapter5.4|hyperpage}{162}
+\indexentry{Chapter5.4.1|hyperpage}{163}
+\indexentry{Chapter5.4.2|hyperpage}{164}
+\indexentry{Chapter5.4.2.1|hyperpage}{165}
+\indexentry{Chapter5.4.2.2|hyperpage}{167}
+\indexentry{Chapter5.4.2.3|hyperpage}{169}
+\indexentry{Chapter5.4.3|hyperpage}{172}
+\indexentry{Chapter5.4.4|hyperpage}{174}
+\indexentry{Chapter5.4.4.1|hyperpage}{174}
+\indexentry{Chapter5.4.4.2|hyperpage}{176}
+\indexentry{Chapter5.4.4.3|hyperpage}{176}
+\indexentry{Chapter5.4.5|hyperpage}{177}
+\indexentry{Chapter5.4.6|hyperpage}{178}
+\indexentry{Chapter5.4.6.1|hyperpage}{179}
+\indexentry{Chapter5.4.6.2|hyperpage}{181}
+\indexentry{Chapter5.4.6.3|hyperpage}{182}
+\indexentry{Chapter5.5|hyperpage}{183}
+\indexentry{Chapter5.5.1|hyperpage}{184}
+\indexentry{Chapter5.5.1.1|hyperpage}{185}
+\indexentry{Chapter5.5.1.2|hyperpage}{187}
+\indexentry{Chapter5.5.1.3|hyperpage}{188}
+\indexentry{Chapter5.5.1.4|hyperpage}{189}
+\indexentry{Chapter5.5.2|hyperpage}{190}
+\indexentry{Chapter5.5.2.1|hyperpage}{190}
+\indexentry{Chapter5.5.2.2|hyperpage}{190}
+\indexentry{Chapter5.5.3|hyperpage}{192}
+\indexentry{Chapter5.5.3.1|hyperpage}{192}
+\indexentry{Chapter5.5.3.2|hyperpage}{194}
+\indexentry{Chapter5.5.3.3|hyperpage}{195}
+\indexentry{Chapter5.5.3.4|hyperpage}{195}
+\indexentry{Chapter5.5.3.5|hyperpage}{196}
+\indexentry{Chapter5.6|hyperpage}{197}
+\indexentry{Chapter6.1|hyperpage}{199}
+\indexentry{Chapter6.1.1|hyperpage}{201}
+\indexentry{Chapter6.1.2|hyperpage}{203}
+\indexentry{Chapter6.1.3|hyperpage}{206}
+\indexentry{Chapter6.2|hyperpage}{207}
+\indexentry{Chapter6.2.1|hyperpage}{208}
+\indexentry{Chapter6.2.2|hyperpage}{209}
+\indexentry{Chapter6.2.3|hyperpage}{210}
+\indexentry{Chapter6.2.4|hyperpage}{211}
+\indexentry{Chapter6.3|hyperpage}{212}
+\indexentry{Chapter6.3.1|hyperpage}{213}
+\indexentry{Chapter6.3.2|hyperpage}{216}
+\indexentry{Chapter6.3.3|hyperpage}{219}
+\indexentry{Chapter6.3.3.1|hyperpage}{219}
+\indexentry{Chapter6.3.3.2|hyperpage}{220}
+\indexentry{Chapter6.3.3.3|hyperpage}{222}
+\indexentry{Chapter6.3.3.4|hyperpage}{223}
+\indexentry{Chapter6.3.3.5|hyperpage}{224}
+\indexentry{Chapter6.3.4|hyperpage}{224}
+\indexentry{Chapter6.3.4.1|hyperpage}{225}
+\indexentry{Chapter6.3.4.2|hyperpage}{227}
+\indexentry{Chapter6.3.4.3|hyperpage}{230}
+\indexentry{Chapter6.3.5|hyperpage}{232}
+\indexentry{Chapter6.3.5.1|hyperpage}{232}
+\indexentry{Chapter6.3.5.2|hyperpage}{233}
+\indexentry{Chapter6.3.5.3|hyperpage}{233}
+\indexentry{Chapter6.3.5.4|hyperpage}{233}
+\indexentry{Chapter6.3.5.5|hyperpage}{234}
+\indexentry{Chapter6.3.5.5|hyperpage}{235}
+\indexentry{Chapter6.3.6|hyperpage}{238}
+\indexentry{Chapter6.3.6.1|hyperpage}{239}
+\indexentry{Chapter6.3.6.2|hyperpage}{240}
+\indexentry{Chapter6.3.6.3|hyperpage}{241}
+\indexentry{Chapter6.3.7|hyperpage}{242}
+\indexentry{Chapter6.4|hyperpage}{243}
+\indexentry{Chapter6.4.1|hyperpage}{244}
+\indexentry{Chapter6.4.2|hyperpage}{246}
+\indexentry{Chapter6.4.3|hyperpage}{248}
+\indexentry{Chapter6.4.4|hyperpage}{250}
+\indexentry{Chapter6.4.5|hyperpage}{252}
+\indexentry{Chapter6.4.6|hyperpage}{253}
+\indexentry{Chapter6.4.7|hyperpage}{254}
+\indexentry{Chapter6.4.8|hyperpage}{256}
+\indexentry{Chapter6.4.9|hyperpage}{257}
+\indexentry{Chapter6.4.10|hyperpage}{259}
+\indexentry{Chapter6.5|hyperpage}{260}
+\indexentry{Chapter6.5.1|hyperpage}{260}
+\indexentry{Chapter6.5.2|hyperpage}{261}
+\indexentry{Chapter6.5.3|hyperpage}{261}
+\indexentry{Chapter6.5.4|hyperpage}{262}
+\indexentry{Chapter6.5.5|hyperpage}{262}
+\indexentry{Chapter6.6|hyperpage}{264}
--- a/Book/mt-book-xelatex.ind
+++ b/Book/mt-book-xelatex.ind
+\begin{theindex}
+  \item Chapter1.1, \hyperpage{11}
+  \item Chapter1.2, \hyperpage{14}
+  \item Chapter1.3, \hyperpage{19}
+  \item Chapter1.4, \hyperpage{20}
+  \item Chapter1.4.1, \hyperpage{20}
+  \item Chapter1.4.2, \hyperpage{22}
+  \item Chapter1.4.3, \hyperpage{23}
+  \item Chapter1.4.4, \hyperpage{24}
+  \item Chapter1.4.5, \hyperpage{25}
+  \item Chapter1.5, \hyperpage{25}
+  \item Chapter1.5.1, \hyperpage{26}
+  \item Chapter1.5.2, \hyperpage{27}
+  \item Chapter1.5.2.1, \hyperpage{27}
+  \item Chapter1.5.2.2, \hyperpage{28}
+  \item Chapter1.5.2.3, \hyperpage{29}
+  \item Chapter1.6, \hyperpage{30}
+  \item Chapter1.7, \hyperpage{32}
+  \item Chapter1.7.1, \hyperpage{33}
+  \item Chapter1.7.1.1, \hyperpage{33}
+  \item Chapter1.7.1.2, \hyperpage{34}
+  \item Chapter1.7.2, \hyperpage{36}
+  \item Chapter1.8, \hyperpage{39}
+  \item Chapter2.1, \hyperpage{44}
+  \item Chapter2.2, \hyperpage{45}
+  \item Chapter2.2.1, \hyperpage{46}
+  \item Chapter2.2.2, \hyperpage{47}
+  \item Chapter2.2.3, \hyperpage{48}
+  \item Chapter2.2.4, \hyperpage{49}
+  \item Chapter2.2.5, \hyperpage{51}
+  \item Chapter2.2.5.1, \hyperpage{51}
+  \item Chapter2.2.5.2, \hyperpage{52}
+  \item Chapter2.2.5.3, \hyperpage{52}
+  \item Chapter2.3, \hyperpage{53}
+  \item Chapter2.3.1, \hyperpage{54}
+  \item Chapter2.3.2, \hyperpage{55}
+  \item Chapter2.3.2.1, \hyperpage{55}
+  \item Chapter2.3.2.2, \hyperpage{56}
+  \item Chapter2.3.2.3, \hyperpage{58}
+  \item Chapter2.4, \hyperpage{60}
+  \item Chapter2.4.1, \hyperpage{61}
+  \item Chapter2.4.2, \hyperpage{63}
+  \item Chapter2.4.2.1, \hyperpage{64}
+  \item Chapter2.4.2.2, \hyperpage{65}
+  \item Chapter2.4.2.3, \hyperpage{66}
+  \item Chapter2.5, \hyperpage{68}
+  \item Chapter2.5.1, \hyperpage{68}
+  \item Chapter2.5.2, \hyperpage{70}
+  \item Chapter2.5.3, \hyperpage{74}
+  \item Chapter2.6, \hyperpage{76}
+  \item Chapter3.1, \hyperpage{81}
+  \item Chapter3.2, \hyperpage{83}
+  \item Chapter3.2.1, \hyperpage{83}
+  \item Chapter3.2.1.1, \hyperpage{83}
+  \item Chapter3.2.1.2, \hyperpage{84}
+  \item Chapter3.2.1.3, \hyperpage{85}
+  \item Chapter3.2.2, \hyperpage{85}
+  \item Chapter3.2.3, \hyperpage{86}
+  \item Chapter3.2.3.1, \hyperpage{86}
+  \item Chapter3.2.3.2, \hyperpage{87}
+  \item Chapter3.2.3.3, \hyperpage{88}
+  \item Chapter3.2.4, \hyperpage{89}
+  \item Chapter3.2.4.1, \hyperpage{89}
+  \item Chapter3.2.4.2, \hyperpage{91}
+  \item Chapter3.2.5, \hyperpage{92}
+  \item Chapter3.3, \hyperpage{95}
+  \item Chapter3.3.1, \hyperpage{95}
+  \item Chapter3.3.2, \hyperpage{98}
+  \item Chapter3.3.2.1, \hyperpage{99}
+  \item Chapter3.3.2.2, \hyperpage{100}
+  \item Chapter3.3.2.3, \hyperpage{101}
+  \item Chapter3.4, \hyperpage{102}
+  \item Chapter3.4.1, \hyperpage{102}
+  \item Chapter3.4.2, \hyperpage{104}
+  \item Chapter3.4.3, \hyperpage{105}
+  \item Chapter3.4.4, \hyperpage{106}
+  \item Chapter3.4.4.1, \hyperpage{106}
+  \item Chapter3.4.4.2, \hyperpage{107}
+  \item Chapter3.5, \hyperpage{112}
+  \item Chapter3.5.1, \hyperpage{113}
+  \item Chapter3.5.2, \hyperpage{115}
+  \item Chapter3.5.3, \hyperpage{117}
+  \item Chapter3.5.4, \hyperpage{118}
+  \item Chapter3.5.5, \hyperpage{120}, \hyperpage{122}
+  \item Chapter3.6, \hyperpage{123}
+  \item Chapter3.6.1, \hyperpage{123}
+  \item Chapter3.6.2, \hyperpage{124}
+  \item Chapter3.6.4, \hyperpage{125}
+  \item Chapter3.6.5, \hyperpage{125}
+  \item Chapter3.7, \hyperpage{125}
+  \item Chapter5.1, \hyperpage{130}
+  \item Chapter5.1.1, \hyperpage{130}
+  \item Chapter5.1.1.1, \hyperpage{130}
+  \item Chapter5.1.1.2, \hyperpage{131}
+  \item Chapter5.1.1.3, \hyperpage{132}
+  \item Chapter5.1.2, \hyperpage{133}
+  \item Chapter5.1.2.1, \hyperpage{133}
+  \item Chapter5.1.2.2, \hyperpage{134}
+  \item Chapter5.2, \hyperpage{134}
+  \item Chapter5.2.1, \hyperpage{134}
+  \item Chapter5.2.1.1, \hyperpage{135}
+  \item Chapter5.2.1.2, \hyperpage{136}
+  \item Chapter5.2.1.3, \hyperpage{136}
+  \item Chapter5.2.1.4, \hyperpage{137}
+  \item Chapter5.2.1.5, \hyperpage{138}
+  \item Chapter5.2.1.6, \hyperpage{139}
+  \item Chapter5.2.2, \hyperpage{140}
+  \item Chapter5.2.2.1, \hyperpage{141}
+  \item Chapter5.2.2.2, \hyperpage{141}
+  \item Chapter5.2.2.3, \hyperpage{142}
+  \item Chapter5.2.2.4, \hyperpage{143}
+  \item Chapter5.2.3, \hyperpage{144}
+  \item Chapter5.2.3.1, \hyperpage{144}
+  \item Chapter5.2.3.2, \hyperpage{146}
+  \item Chapter5.2.4, \hyperpage{148}
+  \item Chapter5.3, \hyperpage{151}
+  \item Chapter5.3.1, \hyperpage{151}
+  \item Chapter5.3.1.1, \hyperpage{151}
+  \item Chapter5.3.1.2, \hyperpage{153}
+  \item Chapter5.3.1.3, \hyperpage{154}
+  \item Chapter5.3.2, \hyperpage{155}
+  \item Chapter5.3.3, \hyperpage{156}
+  \item Chapter5.3.4, \hyperpage{160}
+  \item Chapter5.3.5, \hyperpage{161}
+  \item Chapter5.4, \hyperpage{162}
+  \item Chapter5.4.1, \hyperpage{163}
+  \item Chapter5.4.2, \hyperpage{164}
+  \item Chapter5.4.2.1, \hyperpage{165}
+  \item Chapter5.4.2.2, \hyperpage{167}
+  \item Chapter5.4.2.3, \hyperpage{169}
+  \item Chapter5.4.3, \hyperpage{172}
+  \item Chapter5.4.4, \hyperpage{174}
+  \item Chapter5.4.4.1, \hyperpage{174}
+  \item Chapter5.4.4.2, \hyperpage{176}
+  \item Chapter5.4.4.3, \hyperpage{176}
+  \item Chapter5.4.5, \hyperpage{177}
+  \item Chapter5.4.6, \hyperpage{178}
+  \item Chapter5.4.6.1, \hyperpage{179}
+  \item Chapter5.4.6.2, \hyperpage{181}
+  \item Chapter5.4.6.3, \hyperpage{182}
+  \item Chapter5.5, \hyperpage{183}
+  \item Chapter5.5.1, \hyperpage{184}
+  \item Chapter5.5.1.1, \hyperpage{185}
+  \item Chapter5.5.1.2, \hyperpage{187}
+  \item Chapter5.5.1.3, \hyperpage{188}
+  \item Chapter5.5.1.4, \hyperpage{189}
+  \item Chapter5.5.2, \hyperpage{190}
+  \item Chapter5.5.2.1, \hyperpage{190}
+  \item Chapter5.5.2.2, \hyperpage{190}
+  \item Chapter5.5.3, \hyperpage{192}
+  \item Chapter5.5.3.1, \hyperpage{192}
+  \item Chapter5.5.3.2, \hyperpage{194}
+  \item Chapter5.5.3.3, \hyperpage{195}
+  \item Chapter5.5.3.4, \hyperpage{195}
+  \item Chapter5.5.3.5, \hyperpage{196}
+  \item Chapter5.6, \hyperpage{197}
+  \item Chapter6.1, \hyperpage{199}
+  \item Chapter6.1.1, \hyperpage{201}
+  \item Chapter6.1.2, \hyperpage{203}
+  \item Chapter6.1.3, \hyperpage{206}
+  \item Chapter6.2, \hyperpage{207}
+  \item Chapter6.2.1, \hyperpage{208}
+  \item Chapter6.2.2, \hyperpage{209}
+  \item Chapter6.2.3, \hyperpage{210}
+  \item Chapter6.2.4, \hyperpage{211}
+  \item Chapter6.3, \hyperpage{212}
+  \item Chapter6.3.1, \hyperpage{213}
+  \item Chapter6.3.2, \hyperpage{216}
+  \item Chapter6.3.3, \hyperpage{219}
+  \item Chapter6.3.3.1, \hyperpage{219}
+  \item Chapter6.3.3.2, \hyperpage{220}
+  \item Chapter6.3.3.3, \hyperpage{222}
+  \item Chapter6.3.3.4, \hyperpage{223}
+  \item Chapter6.3.3.5, \hyperpage{224}
+  \item Chapter6.3.4, \hyperpage{224}
+  \item Chapter6.3.4.1, \hyperpage{225}
+  \item Chapter6.3.4.2, \hyperpage{227}
+  \item Chapter6.3.4.3, \hyperpage{230}
+  \item Chapter6.3.5, \hyperpage{232}
+  \item Chapter6.3.5.1, \hyperpage{232}
+  \item Chapter6.3.5.2, \hyperpage{233}
+  \item Chapter6.3.5.3, \hyperpage{233}
+  \item Chapter6.3.5.4, \hyperpage{233}
+  \item Chapter6.3.5.5, \hyperpage{234, 235}
+  \item Chapter6.3.6, \hyperpage{238}
+  \item Chapter6.3.6.1, \hyperpage{239}
+  \item Chapter6.3.6.2, \hyperpage{240}
+  \item Chapter6.3.6.3, \hyperpage{241}
+  \item Chapter6.3.7, \hyperpage{242}
+  \item Chapter6.4, \hyperpage{243}
+  \item Chapter6.4.1, \hyperpage{244}
+  \item Chapter6.4.10, \hyperpage{259}
+  \item Chapter6.4.2, \hyperpage{246}
+  \item Chapter6.4.3, \hyperpage{248}
+  \item Chapter6.4.4, \hyperpage{250}
+  \item Chapter6.4.5, \hyperpage{252}
+  \item Chapter6.4.6, \hyperpage{253}
+  \item Chapter6.4.7, \hyperpage{254}
+  \item Chapter6.4.8, \hyperpage{256}
+  \item Chapter6.4.9, \hyperpage{257}
+  \item Chapter6.5, \hyperpage{260}
+  \item Chapter6.5.1, \hyperpage{260}
+  \item Chapter6.5.2, \hyperpage{261}
+  \item Chapter6.5.3, \hyperpage{261}
+  \item Chapter6.5.4, \hyperpage{262}
+  \item Chapter6.5.5, \hyperpage{262}
+  \item Chapter6.6, \hyperpage{264}
+\end{theindex}
--- a/Book/mt-book-xelatex.tex
+++ b/Book/mt-book-xelatex.tex
@@ -19,6 +19,7 @@
 \setCJKmonofont{SimSun}
 \setmainfont{Times New Roman} 
 %----------------------------------------------------------------------------------------
 {\newcommand{\mycfont}{song}}
 {\newcommand{\mycfont}{gbsn}}
@@ -46,27 +47,57 @@
 \renewcommand{\baselinestretch}{1.2}%设置行间距
 \begin{document}
 %\begin{CJK}{UTF8}{\mycfont}%原来的CJK
+%----------------------------------------------------------------------------------------
+%	TITLE PAGE
+%----------------------------------------------------------------------------------------
+\begingroup
+\thispagestyle{empty} % Suppress headers and footers on the title page
+%\begin{tikzpicture}[remember picture,overlay]
+\begin{tikzpicture}[remember picture,overlay]
+\node[inner sep=0pt] (background) at (current page.center) {\includegraphics[width=\paperwidth]{background.pdf}};
+\draw (current page.center) node [fill=ocre!30!white,fill opacity=0.6,text opacity=1,inner sep=1cm]{\Huge\centering\bfseries\sffamily\parbox[c][][t]{\paperwidth}{\centering 机器翻译：统计建模与深度学习方法\\[15pt] % Book title
+%{\Large 副标题是否需要}\\[20pt] % Subtitle
+{\huge 肖桐}}}; % Author name
+\end{tikzpicture}
+\vfill
+\endgroup
 %----------------------------------------------------------------------------------------
-%	TABLE OF CONTENTS
+%	COPYRIGHT PAGE
 %----------------------------------------------------------------------------------------
-%\usechapterimagefalse % If you don't want to include a chapter image, use this to toggle images off - it can be enabled later with \usechapterimagetrue
+\newpage
+~\vfill
+\thispagestyle{empty}
-\chapterimage{chapter_head_1.pdf} %目录标题的图案
+\noindent Copyright \copyright\ 2020 Xiao Tong\\ % Copyright notice
-\pagestyle{empty} % Disable headers and footers for the following pages
+\noindent \textsc{Published by \red{Publisher}}\\ % Publisher
-\tableofcontents % 打印目录
+\noindent \textsc{\url{http://47.105.50.196/NiuTrans/Toy-MT-Introduction/tree/master/Book}}\\ % URL
+\noindent {\red{Licensed under the Creative Commons Attribution-NonCommercial 3.0 Unported License (the ``License''). You may not use this file except in compliance with the License. You may obtain a copy of the License at \url{http://creativecommons.org/licenses/by-nc/3.0}. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \textsc{``as is'' basis, without warranties or conditions of any kind}, either express or implied. See the License for the specific language governing permissions and limitations under the License.}}\\ % License information, replace this with your own license (if any)
+\noindent \textit{First printing, \red{March 2019}} % Printing/edition date
-\cleardoublepage %保证章节页在奇数页
+%----------------------------------------------------------------------------------------
+%	TABLE OF CONTENTS
+%----------------------------------------------------------------------------------------
+%\usechapterimagefalse % If you don't want to include a chapter image, use this to toggle images off - it can be enabled later with \usechapterimagetrue
+\chapterimage{chapter_head_1.pdf} %目录标题的图案
+\pagestyle{empty} % Disable headers and footers for the following pages
+\tableofcontents % 打印目录
+\cleardoublepage %保证章节页在奇数页
 \pagestyle{fancy} % Enable headers and footers again
 %----------------------------------------------------------------------------------------
 %	CHAPTERS
 %----------------------------------------------------------------------------------------
 \include{Chapter1/chapter1}
 \include{Chapter2/chapter2}
 \include{Chapter3/chapter3}
@@ -78,12 +109,24 @@
 %----------------------------------------------------------------------------------------
 %	BIBLIOGRAPHY
+%----------------------------------------------------------------------------------------
+\cleardoublepage % Make sure the bibliography starts on an odd (right side) page
 \printbibliography
-%------------------------------------------------
-%\include{Chapters/bibliography}
-%\include{Chapters/index}
+%----------------------------------------------------------------------------------------
+%	INDEX
+%----------------------------------------------------------------------------------------
+\cleardoublepage % Make sure the index starts on an odd (right side) page
+%\phantomsection
+%\setlength{\columnsep}{0.75cm} % Space between the 2 columns of the index
+%\addcontentsline{toc}{chapter}{\textcolor{ocre}{Index}} % Add an Index heading to the table of contents
+\printindex % Output the index
 %-------------------------

--- a/Book/mt-book.tex
+++ b/Book/mt-book.tex
@@ -59,12 +59,10 @@
 %----------------------------------------------------------------------------------------
-\IfFileExists{C:/WINDOWS/win.ini}
+%\IfFileExists{C:/WINDOWS/win.ini}
 {\newcommand{\mycfont}{song}}
-{\newcommand{\mycfont}{gbsn}}
+%{\newcommand{\mycfont}{gbsn}}
-\begin{CJK}{UTF8}{\mycfont}
-\end{CJK}
 %公式字体设置为计算机现代罗马
 \AtBeginDocument{
 \SetSymbolFont{operators}   {normal}{OT1}{cmr} {m}{n}
@@ -87,7 +85,41 @@
 }
 \renewcommand{\baselinestretch}{1.2}%设置行间距
 \begin{document}
-\begin{CJK}{UTF8}{\mycfont}%使用xelatex的话需要注释掉
+\begin{CJK}{UTF8}{song}%使用xelatex的话需要注释掉
+%----------------------------------------------------------------------------------------
+%	TITLE PAGE
+%----------------------------------------------------------------------------------------
+\begingroup
+\thispagestyle{empty} % Suppress headers and footers on the title page
+%\begin{tikzpicture}[remember picture,overlay]
+\begin{tikzpicture}[remember picture,overlay]
+\node[inner sep=0pt] (background) at (current page.center) {\includegraphics[width=\paperwidth]{background.pdf}};
+\draw (current page.center) node [fill=ocre!30!white,fill opacity=0.6,text opacity=1,inner sep=1cm]{\Huge\centering\bfseries\sffamily\parbox[c][][t]{\paperwidth}{\centering 机器翻译：统计建模与深度学习方法\\[15pt] % Book title
+%{\Large 副标题是否需要}\\[20pt] % Subtitle
+{\huge 肖桐}}}; % Author name
+\end{tikzpicture}
+\vfill
+\endgroup
+%----------------------------------------------------------------------------------------
+%	COPYRIGHT PAGE
+%----------------------------------------------------------------------------------------
+\newpage
+~\vfill
+\thispagestyle{empty}
+\noindent Copyright \copyright\ 2020 Xiao Tong\\ % Copyright notice
+\noindent \textsc{Published by \red{Publisher}}\\ % Publisher
+\noindent \textsc{\url{http://47.105.50.196/NiuTrans/Toy-MT-Introduction/tree/master/Book}}\\ % URL
+\noindent {\red{Licensed under the Creative Commons Attribution-NonCommercial 3.0 Unported License (the ``License''). You may not use this file except in compliance with the License. You may obtain a copy of the License at \url{http://creativecommons.org/licenses/by-nc/3.0}. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \textsc{``as is'' basis, without warranties or conditions of any kind}, either express or implied. See the License for the specific language governing permissions and limitations under the License.}}\\ % License information, replace this with your own license (if any)
+\noindent \textit{First printing, \red{March 2019}} % Printing/edition date
 %----------------------------------------------------------------------------------------
 %	TABLE OF CONTENTS
@@ -106,19 +138,34 @@
 %----------------------------------------------------------------------------------------
 %	CHAPTERS
 %----------------------------------------------------------------------------------------
-%\include{Chapter1/chapter1}
+\include{Chapter1/chapter1}
-%\include{Chapter2/chapter2}
+\include{Chapter2/chapter2}
-%\include{Chapter3/chapter3}
+\include{Chapter3/chapter3}
+\include{Chapter5/chapter5}
 \include{Chapter6/chapter6}
 %----------------------------------------------------------------------------------------
 %	BIBLIOGRAPHY
 %----------------------------------------------------------------------------------------
+\cleardoublepage % Make sure the bibliography starts on an odd (right side) page
 \printbibliography
+%----------------------------------------------------------------------------------------
+%	INDEX
+%----------------------------------------------------------------------------------------
+\cleardoublepage % Make sure the index starts on an odd (right side) page
+%\phantomsection
+%\setlength{\columnsep}{0.75cm} % Space between the 2 columns of the index
+%\addcontentsline{toc}{chapter}{\textcolor{ocre}{Index}} % Add an Index heading to the table of contents
+\printindex % Output the index
 %--------------------------------------------------------------------------------------
 \end{CJK}
 \end{document}
--- a/Book/structure.tex
+++ b/Book/structure.tex
@@ -80,7 +80,6 @@
 \usepackage{calc} % For simpler calculation - used for spacing the index letter headings correctly
 \usepackage{makeidx} % Required to make an index
-%\bibliographystyle{plainnat}
 \makeindex % Tells LaTeX to create the files required for indexing
 %----------------------------------------------------------------------------------------
@@ -506,7 +505,15 @@ innerbottommargin=5pt]{cBox}
 %----------------------------------------------------------------------------------------
 \usepackage{hyperref}
-\hypersetup{hidelinks,backref=true,pagebackref=true,hyperindex=true,colorlinks=false,breaklinks=true,urlcolor=ocre,bookmarks=true,bookmarksopen=false}
+\hypersetup{hidelinks,backref=true,pagebackref=true,hyperindex=true,colorlinks=false,breaklinks=true,urlcolor=ocre,bookmarks=true,bookmarksopen=true}
+%backref反向引用
+%pagebackref反向引用页码
+%hyperindex索引链接
+%colorlinks彩色链接
+%breaklinks允许链接断行
+%urlcolor网页与电邮链接颜色
+%bookmarks生成书签
+%bookmarksopen书签目录展开
 \usepackage{bookmark}
 \bookmarksetup{
@@ -537,6 +544,8 @@ addtohook={%
 \DeclareMathOperator*{\argmin}{arg\,min}
 \usepackage{setspace}%调整行间距
+%\usepackage{tocbibind}
 %----------------------------------------------------------------------------------------
 %	Chapter 1
 %----------------------------------------------------------------------------------------
@@ -582,6 +591,7 @@ addtohook={%
 %%%%%%%%%%%chapter5图片等---------------------------------------
 \usepackage{tikz-3dplot}
+\usepackage{pifont}
 \tcbuselibrary{skins}
 \definecolor{ublue}{rgb}{0.152,0.250,0.545}
 \definecolor{ugreen}{rgb}{0,0.5,0}