update figure

30726017 · 曹润柘 · 595e3c7e · 30726017 · 30726017 · 30726017
Commit 30726017 authored Apr 30, 2020 by 曹润柘
--- a/Book/Chapter1/chapter1.tex
+++ b/Book/Chapter1/chapter1.tex
@@ -8,7 +8,7 @@
 %----------------------------------------------------------------------------------------
 \renewcommand\figurename{图}%将figure改为图
 \renewcommand\tablename{表}%将table改为表
-\chapterimage{chapter_head_1.pdf} % Chapter heading image
+\chapterimage{fig-NEU-2.jpg} % Chapter heading image

 \chapter{机器翻译简介}
 \section{机器翻译的概念}\index{Chapter1.1}

--- a/Book/Chapter2/chapter2.tex
+++ b/Book/Chapter2/chapter2.tex
@@ -8,7 +8,7 @@
 \renewcommand\figurename{图}%将figure改为图
 \renewcommand\tablename{表}%将table改为表
 %\renewcommand\arraystretch{1.5}%将表格高度调整为1.5倍
-\chapterimage{chapter_head_1.pdf} % Chapter heading image
+\chapterimage{fig-NEU-3.jpg} % Chapter heading image

 \chapter{词法、语法及统计建模基础}


--- a/Book/Chapter3/Chapter3.tex
+++ b/Book/Chapter3/Chapter3.tex
@@ -8,7 +8,7 @@
 \renewcommand\tablename{表}%将table改为表
 \definecolor{ublue}{rgb}{0.152,0.250,0.545}
 \definecolor{ugreen}{rgb}{0,0.5,0}
-\chapterimage{chapter_head_1} % Chapter heading image
+\chapterimage{fig-NEU-4.jpg} % Chapter heading image
 %公式1.7之后往后串一个
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \chapter{基于词的机器翻译模型}

--- a/Book/Chapter4/Figures/combination-of-translation-with-different-rules.tex
+++ b/Book/Chapter4/Figures/combination-of-translation-with-different-rules.tex
@@ -7,7 +7,7 @@
 \node[anchor=north] (q1) at (0,0) {\scriptsize\sffamily\bfseries{输入字符串：}};
 \node[anchor=west] (q2) at ([xshift=0em,yshift=-2em]q1.west) {\footnotesize{进口$\quad$和$\quad$出口$\quad$大幅度$\quad$下降$\quad$了}};

-\node[anchor=north,fill=blue!20,minimum height=1em,minimum width=1em] (f1) at ([xshift=-4.1em,yshift=-0.8em]q2.south) {};
+%\node[anchor=north,fill=blue!20,minimum height=1em,minimum width=1em] (f1) at ([xshift=-4.1em,yshift=-0.8em]q2.south) {};

 \node[anchor=north,fill=blue!20,minimum height=4em,minimum width=1em] (f1) at ([xshift=2.2em,yshift=-0.7em]q2.south) {};


--- a/Book/Chapter4/Figures/examples-of-translation-with-complex-ordering.tex
+++ b/Book/Chapter4/Figures/examples-of-translation-with-complex-ordering.tex
@@ -27,7 +27,7 @@

 \node[anchor=north west] (input) at ([yshift=-6.5em]synhifst.south west) {\sffamily\bfseries{源语句法树:}};

-\begin{scope}[scale = 0.9, grow'=up, sibling distance=5pt, level distance=23pt, xshift=3.49in, yshift=-2.8in]
+\begin{scope}[scale = 0.9, grow'=up, sibling distance=5pt, level distance=30pt, xshift=3.49in, yshift=-3.1in]

 \Tree[.\node(tn1){IP};
        [.\node(tn2){NP}; \edge[roof]; \node[](seg1){中国$_1$ 明星$_2$ 艺术团$_3$}; ]

--- a/Book/Chapter4/Figures/extract-hierarchical-phrase-rules.tex
+++ b/Book/Chapter4/Figures/extract-hierarchical-phrase-rules.tex
@@ -67,11 +67,11 @@
 \draw[-] (rules.south west)--([xshift=1.9in]rules.south west);

 {
-\node[anchor=north west] (p1) at ([yshift=-0.3em]phrase.south west) {天气真好 -- The weather is very good};
+\node[anchor=north west] (p1) at ([yshift=-0.3em]phrase.south west) {天气\ \ 真好 -- The weather is very good};
 }

 {
-\node[anchor=north west] (r1) at ([yshift=-0.3em]rules.south west) {$\mathrm{X_1}$真好 -- $\mathrm{X_1}$ is very good};
+\node[anchor=north west] (r1) at ([yshift=-0.3em]rules.south west) {$\mathrm{X_1}$\ \ 真好 -- $\mathrm{X_1}$ is very good};

 \node[anchor=east] (r2) at ([yshift=-2.65cm]p1.east) {};
 }

--- a/Book/Chapter4/Figures/reorder-base-distance.tex
+++ b/Book/Chapter4/Figures/reorder-base-distance.tex
@@ -45,8 +45,8 @@
 \draw[-,thick] (s2.north west)--([yshift=0.3in]s2.north west);
 \draw[->,densely dotted,thick] ([yshift=0.3in]s2.north west)--([xshift=-0.3in,yshift=0.3in]s2.north west);

-\node[anchor=south] (ld1) at ([xshift=-0.5em,yshift=0.4em]n1.north) {\small{$dr$=-5}};
-\node[anchor=south] (ld2) at ([xshift=6.5em,yshift=0.4em]n1.north) {\small{$dr$=+4}};
+\node[anchor=south] (ld1) at ([xshift=-0.5em,yshift=0.4em]n1.north) {\small{$dr= -5$}};
+\node[anchor=south] (ld2) at ([xshift=6.5em,yshift=0.4em]n1.north) {\small{$dr= +4$}};

 \end{scope}
 \end{tikzpicture}

--- a/Book/Chapter4/Figures/result-of-tree-binarization.tex
+++ b/Book/Chapter4/Figures/result-of-tree-binarization.tex
@@ -3,7 +3,7 @@
 \begin{center}
 \begin{tikzpicture}

-{\scriptsize
+{\footnotesize
 \begin{scope}[sibling distance=4pt, level distance=25pt]

 \Tree[.\node(n1){NP};
@@ -22,7 +22,7 @@
 \draw [-,dashed] (sw3.south) -- (tw3.north);
 \draw [-,dashed] (sw4.south) -- (tw3.north);

-\draw [->,very thick] ([xshift=1em]sw4.east) -- ([xshift=5em]sw4.east) node [pos=0.5,above] {\tiny{二叉化}};
+\draw [->,very thick] ([xshift=1em]sw4.east) -- ([xshift=5em]sw4.east) node [pos=0.5,above] {\scriptsize{二叉化}};

 \end{scope}


--- a/Book/Chapter4/Figures/translation-hypothesis-extension.tex
+++ b/Book/Chapter4/Figures/translation-hypothesis-extension.tex
@@ -64,7 +64,7 @@
 }

 {
-\draw [->,ultra thick,red,line width=2pt,opacity=0.7] ([xshift=-0.2em]h0.west) -- ([xshift=0.7em]h0.east) -- ([xshift=-0.2em]h3.west) -- ([xshift=0.8em]h3.east) -- ([xshift=-0.2em]h5.west) -- ([xshift=0.8em]h5.east) -- ([xshift=-0.2em]h7.west) -- ([xshift=0.8em]h7.east);
+\draw [->,ultra thick,red,line width=2pt,opacity=0.7] ([xshift=-0.5em]h0.west) -- ([xshift=0.7em]h0.east) -- ([xshift=-0.2em]h3.west) -- ([xshift=0.8em]h3.east) -- ([xshift=-0.2em]h5.west) -- ([xshift=0.8em]h5.east) -- ([xshift=-0.2em]h7.west) -- ([xshift=1.5em]h7.east);
 \node [anchor=north west] (wtranslabel) at ([yshift=-3em]h0.south west) {\small{翻译路径:}};
 \draw [->,ultra thick,red,line width=1.5pt,opacity=0.7] (wtranslabel.east) -- ([xshift=1.5em]wtranslabel.east);
 }

--- a/Book/Chapter4/Figures/tree-fragment-to-string-mapping.tex
+++ b/Book/Chapter4/Figures/tree-fragment-to-string-mapping.tex
@@ -14,7 +14,7 @@
 \node [anchor=west] (tw1) at ([xshift=3.5em]sn3.east) {increases};
 \node [anchor=west,fill=red!20] (tw2) at ([xshift=0.3em]tw1.east) {NN};

-\draw[dotted,thick] ([yshift=-0.1em]sn3.south)..controls +(south:1.2) and +(south: 1.2)..([yshift=-0.1em]tw2.south);
+\draw[dotted,thick,<->] ([yshift=-0.1em]sn3.south)..controls +(south:1.2) and +(south: 1.2)..([yshift=-0.1em]tw2.south);

 \begin{pgfonlayer}{background}
 \node [rectangle,inner sep=0em,fill=red!20] [fit = (sn3)] (nn1) {};

--- a/Book/Chapter4/chapter4.tex
+++ b/Book/Chapter4/chapter4.tex
@@ -6,7 +6,7 @@
 %----------------------------------------------------------------------------------------
 \renewcommand\figurename{图}%将figure改为图
 \renewcommand\tablename{表}%将table改为表
-\chapterimage{chapter_head_1.pdf} % Chapter heading image
+\chapterimage{fig-NEU-5.jpg} % Chapter heading image

 \chapter{基于短语和句法的机器翻译模型}

@@ -242,10 +242,15 @@ d = {(\bar{s}_{\bar{a}_1},\bar{t}_1)} \circ {(\bar{s}_{\bar{a}_2},\bar{t}_2)} \c
 \parinterval 到此为止，就得到了一个基于短语的翻译模型。对于每个双语句对($\textbf{s}, \textbf{t}$)，每个翻译推导$d$都对应了一个基于短语的翻译过程。而基于短语的机器翻译的目标就是对$d$进行描述。有四个基本问题：

 \begin{itemize}
+\vspace{0.3em}
 \item 如何用统计模型描述每个翻译推导的好坏\ \dash \ 即翻译的统计建模问题；
+\vspace{0.3em}
 \item 如何获得可使用的双语短语对\ \dash \ 即短语翻译获取问题；
+\vspace{0.3em}
 \item 如何对翻译中的调序问题进行建模\ \dash \ 即调序问题；
+\vspace{0.3em}
 \item 如何找到输入句子\textbf{s}的最佳译文\ \dash \ 即解码问题。
+\vspace{0.3em}
 \end{itemize}

 \parinterval 这四个问题也构成了基于短语的翻译模型的核心，下面对其逐一展开讨论。
@@ -593,8 +598,11 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \parinterval 在MSD调序中，双语短语所对应的调序概率$\textrm{P}(o_i| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i)$是用极大似然估计方法进行计算的。但是，这种方法也会面临数据稀疏问题，同时对调序产生影响的细致特征也没有考虑进来。另一种有效的方法是直接用统计分类模型对调序进行建模，比如，可以使用最大熵、SVM等分类器输出调序概率或者得分\cite{xiong2006maximum}（{\red 参考文献！再引用两篇！}）。对于基于分类的调序模型，有两方面问题需要考虑：

 \begin{itemize}
+\vspace{0.3em}
 \item 训练样本的生成。可以把M、S、D看作是类别标签，把所对应的短语及短语对齐信息看作是输入。这样就得到了大量分类器训练所需的样本；
+\vspace{0.3em}
 \item 分类特征设计。这部分是传统统计机器学习中的重要组成部分，好的特征会对分类结果产生很大影响。在调序模型中，一般直接使用单词作为特征，比如用短语的第一个单词和最后一个单词作为特征就可以达到很好的效果。
+\vspace{0.3em}
 \end{itemize}

 \parinterval 随着神经网络方法的兴起，也可以考虑使用多层神经网络构建调序模型\cite{li-etal-2014-neural}。这时，可以把短语直接送入一个神经网络，之后由神经网络完成对特征的抽取和表示，并输出最终的调序模型得分。
@@ -605,13 +613,21 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \parinterval 基于短语的模型使用判别式模型对翻译推导进行建模，给定双语句对$(\textbf{s},\textbf{t})$，每个翻译推导$d$都有一个模型得分，由$M$个特征线性加权得到，记为$\textrm{score}(d,\textbf{t},\textbf{s}) = \sum_{i=1}^{M} \lambda_i \cdot h_i (d,\textbf{t},\textbf{s})$，其中$\lambda_i$表示特征权重，$h_i (d,\textbf{t},\textbf{s})$表示特征函数（简记为$h_i (d)$）。这些特征包含刚刚介绍过的短语翻译概率、调序模型得分等，除此之外，还包含语言模型等其他特征，它们共同组成了特征集合。这里列出了基于短语的模型中常用的特征：

 \begin{itemize}
+\vspace{0.3em}
 \item 短语翻译概率（取对数），包含正向翻译概率$\textrm{log}(\textrm{P}(\bar{t}|\bar{s}))$和反向翻译概率$\textrm{log}(\textrm{P}(\bar{s}$\\$|\bar{t}))$，它们是基于短语的模型中最主要的特征；
+\vspace{0.3em}
 \item 词汇化翻译概率（取对数），同样包含正向词汇化翻译概率$\textrm{log(P}_{\textrm{lex}}(\bar{t}|\bar{s}\textrm{))}$和反向词汇化翻译概率$\textrm{log(P}_{\textrm{lex}}(\bar{s}|\bar{t}\textrm{))}$，它们用来描述双语短语中单词之间对应的好坏；
+\vspace{0.3em}
 \item $n$-gram语言模型，用来度量译文的流畅程度，可以通过大规模目标端单语数据得到；
+\vspace{0.3em}
 \item 译文长度，避免模型倾向于短译文，同时让系统自动学习对译文长度的偏好；
+\vspace{0.3em}
 \item 翻译规则数量，为了避免模型仅使用少量特征构成翻译推导（规则数量少，短语翻译概率相乘的因子也会少，得分一般会大一些），同时让系统自动学习对规则数量的偏好；
+\vspace{0.3em}
 \item 被翻译为空的源语言单词数量。注意，空翻译规则有时也被称作evil feature，这类特征在一些数据上对BLEU有很好的提升作用，但会造成人工评价结果的下降，需要谨慎使用；
+\vspace{0.3em}
 \item 基于MSD的调序模型，包括与前一个短语的调序模型$f_{\textrm{M-pre}}(d)$\ 、$f_{\textrm{S-pre}}(d)$\ 、$f_{\textrm{D-pre}}(d)$和与后一个短语的调序模型$f_{\textrm{M-fol}}(d)$\ 、$f_{\textrm{S-fol}}(d)$\ 、$f_{\textrm{D-fol}}(d)$，共6个特征。
+\vspace{0.3em}
 \end{itemize}

 %--4.2.6 最小错误率训练---------------------
@@ -636,7 +652,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \label{eqa4.18}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
- 
+
 \noindent 其中\textrm{Error}$(\cdot)$是错误率函数。\textrm{Error}$(\cdot)$的定义方式有很多，一般来说\textrm{Error}$(\cdot)$会与机器翻译的评价指标相关，例如，词错误率(WER)、位置错误率(PER)、BLEU 值、NIST值等都可以用于\textrm{Error}$(\cdot)$的定义。这里使用1-BLEU作为错误率函数，即$\textrm{Error}(\textbf{D}^{\ast},\textbf{R}) = 1 - \textrm{BLEU}(\textbf{D}^{\ast},\textbf{R})$。则公式\ref{eqa4.18}可改写为：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
@@ -707,11 +723,17 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \parinterval 还有一些经验性的技巧用来完善基于线搜索的MERT。例如：

 \begin{itemize}
+\vspace{0.3em}
 \item 随机生成特征权重的起始点；
+\vspace{0.3em}
 \item 搜索中，给权重加入一些微小的扰动，避免陷入局部最优；
+\vspace{0.3em}
 \item 随机选择特征优化的顺序；
+\vspace{0.3em}
 \item 使用先验知识来指导MERT（对权重的取值范围进行约束）；
+\vspace{0.3em}
 \item 使用多轮迭代训练，最终对权重进行平均。
+\vspace{0.3em}
 \end{itemize}

 \parinterval MERT最大的优点在于可以用于目标函数不可微、甚至不连续的情况。对于优化线性模型， MERT是一种很好的选择。但是，也有研究发现，简单使用MERT无法处理特征数量过多的情况。比如，用MERT优化10000个稀疏特征的权重时，优化效果可能会不理想，而且收敛速度慢。这时也可以考虑使用在线学习等技术对大量特征的权重进行调优，比较有代表性的方法包括MIRA\cite{crammer2003ultraconservative}和PRO\cite{Hopkins2011Tuning}。由于篇幅所限，这里不对这些方法做深入讨论，感兴趣的读者可以参考\ref{section-4.5}节的内容，对相关文献进行查阅。
@@ -719,7 +741,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 %--4.2.7 栈解码---------------------
 \subsection{栈解码}\index{Chapter4.2.7}

-\parinterval 对翻译模型解码的目的是根据模型以及输入，找到模型得分最高的推导，即：
+\parinterval 解码的目的是根据模型以及输入，找到模型得分最高的推导，即：

 %公式--------------------------------------------------------------------
 \begin{eqnarray}
@@ -728,16 +750,19 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \end{eqnarray}
 %公式--------------------------------------------------------------------

-\parinterval 然而想要找到得分最高的翻译推导，并不是一件简单的事情。对于每一句源语言句子，可能的翻译结果是指数级的。而机器翻译解码也已经被证明是一个NP难问题\cite{knight1999decoding}。简单的暴力搜索显然不现实。因此，在机器翻译中会使用特殊的解码策略来确保搜索的效率。本节将介绍基于栈的自左向右解码方法。它是基于短语的模型中的经典解码方法，非常适于处理语言生成的各种任务。
+\parinterval 然而想要找到得分最高的翻译推导并不是一件简单的事情。对于每一句源语言句子，可能的翻译结果是指数级的。而机器翻译解码也已经被证明是一个NP难问题\cite{knight1999decoding}。简单的暴力搜索显然不现实。因此，在机器翻译中会使用特殊的解码策略来确保搜索的效率。本节将介绍基于栈的自左向右解码方法。它是基于短语的模型中的经典解码方法，非常适于处理语言生成的各种任务。

 \parinterval 首先，看一下翻译一个句子的基本流程。如图\ref{fig:basic-process-of-translation}所示，首先需要得到译文句子的第一个单词。在基于短语的模型中，可以从源语言端找出生成句首译文的短语，之后把译文放到目标语言端，例如，源语言的``有''对应的译文是``There is''。这个过程可以重复执行，直到生成完整句子的译文。但是，有两点需要注意：

 \begin{itemize}
+\vspace{0.3em}
 \item 源语言的每个单词（短语）只能被翻译一次；
+\vspace{0.3em}
 \item 译文的生成需自左向右连续进行。
+\vspace{0.3em}
 \end{itemize}

-\parinterval 前者对应了一种{\small\bfnew{覆盖度模型}}（Coverage Model），后者定义了解码的方向，这样可以确保$n$-gram语言模型的计算是准确的。这样，就得到了一个简单的基于短语的机器翻译解码框架。每次从源语言句子中找到一个短语，作为译文最右侧的部分，重复执行直到整个译文被生成出来。
+\parinterval 前者对应了一种{\small\bfnew{覆盖度模型}}（Coverage Model）；后者定义了解码的方向，这样可以确保$n$-gram语言模型的计算是准确的。这样，就得到了一个简单的基于短语的机器翻译解码框架。每次从源语言句子中找到一个短语，作为译文最右侧的部分，重复执行直到整个译文被生成出来。

 %----------------------------------------------
 % 图4.26
@@ -752,14 +777,14 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 %%%%%%%%%%%%%%%%%%
 \subsubsection{翻译候选匹配}\index{Chapter4.2.7.1}

-\parinterval 在解码时，首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语，每个可能的译文也被称作{\small\bfnew{翻译候选}}（Translation Candidate）。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语，之后在短语表中找到相应的翻译即可。比如，图\ref{fig:translation-option}展示了句子``桌子 上 有 一个 苹果''的翻译候选匹配结果。可以看到，不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的跨度中。比如，``upon the table''是短语``桌子 上 有''的翻译候选，即对应源语言跨度[0,3]。
+\parinterval 在解码时，首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语，每个可能的译文也被称作{\small\bfnew{翻译候选}}（Translation Candidate）。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语，之后在短语表中找到相应的翻译即可。比如，图\ref{fig:translation-option}展示了句子``桌子\ 上\ 有\ 一个\ 苹果''的翻译候选匹配结果。可以看到，不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的跨度中。比如，``upon the table''是短语``桌子\ 上\ 有''的翻译候选，即对应源语言跨度[0,3]。

 %----------------------------------------------
 % 图4.27
 \begin{figure}[htp]
 \centering
 \input{./Chapter4/Figures/translation-option}
-\caption{翻译选项}
+\caption{一个句子匹配的短语翻译候选}
 \label{fig:translation-option}
 \end{figure}
 %-------------------------------------------
@@ -782,22 +807,28 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 %%%%%%%%%%%%%%%%%%
 \subsubsection{剪枝}\index{Chapter4.2.7.3}

-\parinterval 假设扩展建立了解码算法的基本框架。但是，当句子变长时，这种方法还是面临着搜索空间爆炸的问题。对于这个问题，常用的解决办法是{\small\bfnew{剪枝}}（Pruning），也就是在搜索图中排除掉一些节点。比如，可以使用{\small\bfnew{束剪枝}}（Beam Pruning），确保每次翻译扩展时，最多生成$k$个新的翻译假设。这里$k$可以被看做是束的宽度。通过控制$k$的大小，可以在解码精度和速度之间进行平衡。这种基于束宽度进行剪枝的方法也被称作{\small\bfnew{直方图剪枝}}（Histogram Pruning）。另一种思路是，每次扩展时只保留与最优翻译假设得分相差在$\delta$之内的翻译假设。$\delta$可以被看作是一种与最优翻译假设之间距离的阈值，超过这个阈值就被剪枝。因此这种方法也被称作{\small\bfnew{阈值剪枝}}（Threshold Pruning）。
+\parinterval 假设扩展建立了解码算法的基本框架。但是，当句子变长时，这种方法还是面临着搜索空间爆炸的问题。对于这个问题，常用的解决办法是{\small\bfnew{剪枝}}（Pruning），也就是在搜索图中排除掉一些节点。比如，可以使用{\small\bfnew{束剪枝}}（Beam Pruning），确保每次翻译扩展时，最多生成$k$个新的翻译假设。这里$k$可以被看做是束的宽度。通过控制$k$的大小，可以在解码精度和速度之间进行平衡。这种基于束宽度进行剪枝的方法也被称作{\small\bfnew{直方图剪枝}}（Histogram Pruning）。另一种思路是，每次扩展时只保留与最优翻译假设得分相差在$\delta$之内的翻译假设。$\delta$可以被看作是一种与最优翻译假设之间距离的阈值，超过这个阈值就被剪枝。这种方法也被称作{\small\bfnew{阈值剪枝}}（Threshold Pruning）。

 \parinterval 不过，即使引入束剪枝，解码过程中仍然会有很多冗余的翻译假设。有两种方法可以进一步加速解码：

 \begin{itemize}
+\vspace{0.5em}
 \item 对相同译文的翻译假设进行重新组合；
+\vspace{0.5em}
 \item 对低质量的翻译假设进行裁剪。
+\vspace{0.5em}
 \end{itemize}

-\parinterval 对翻译假设进行重新组合又被称作{\small\bfnew{假设重组}}（Hypothesis Recombination）。其核心思想是，把代表同一个译文的不同翻译假设融合为一个翻译假设。如图29所示，对于给定的输入短语``一个苹果''，系统可能将两个单词``一个''、``苹果''分别翻译成``an''和``apple''，也可能将这两个单词作为一个短语直接翻译成``an apple''。虽然这两个翻译假设得到的译文相同，并且覆盖了相同的源语言短语，但是却是两个不同的翻译假设，模型给它们的打分也是不一样的。这是，可以舍弃两个翻译假设中分数较低的那个，因为分数较低的翻译假设永远不可能成为最优路径的一部分。这也就相当于把两个翻译假设重组为一个假设。
+\parinterval 对翻译假设进行重新组合又被称作{\small\bfnew{假设重组}}（Hypothesis Recombination）。其核心思想是，把代表同一个译文的不同翻译假设融合为一个翻译假设。如图\ref{fig:example-of-hypothesis-recombination}所示，对于给定的输入短语``一个\ \ 苹果''，系统可能将两个单词``一个''、``苹果''分别翻译成``an''和``apple''，也可能将这两个单词作为一个短语直接翻译成``an apple''。虽然这两个翻译假设得到的译文相同，并且覆盖了相同的源语言短语，但是却是两个不同的翻译假设，模型给它们的打分也是不一样的。这时，可以舍弃两个翻译假设中分数较低的那个，因为分数较低的翻译假设永远不可能成为最优路径的一部分。这也就相当于把两个翻译假设重组为一个假设。

-\parinterval 即使翻译假设对应的译文不同也可以进行假设重组。图\ref{fig:example-of-hypothesis-recombination}下半部分的给出了一个这样的实例。在两个翻译假设中，第一个单词分别被翻译成了``it''和``he''，紧接着它们后面的部分都被翻译成了``is not''。这两个翻译假设是非常相似的，因为它们译文的最后两个单词是相同的，而且翻译假设都覆盖了相同的源语言部分。这时，也可以对这两个翻译假设进行假设重组：如果得分较低的翻译假设和得分较高的翻译假设都使用相同的翻译候选进行扩展，且两个翻译假设都覆盖相同的源语言单词，分数低的翻译假设可以被剪枝掉。不过，还有两点需要注意：
+\parinterval 即使翻译假设对应的译文不同也可以进行假设重组。图\ref{fig:example-of-hypothesis-recombination}下半部分给出了一个这样的实例。在两个翻译假设中，第一个单词分别被翻译成了``it''和``he''，紧接着它们后面的部分都被翻译成了``is not''。这两个翻译假设是非常相似的，因为它们译文的最后两个单词是相同的，而且翻译假设都覆盖了相同的源语言部分。这时，也可以对这两个翻译假设进行假设重组：如果得分较低的翻译假设和得分较高的翻译假设都使用相同的翻译候选进行扩展，且两个翻译假设都覆盖相同的源语言单词，分数低的翻译假设可以被剪枝掉。此外，还有两点需要注意：

 \begin{itemize}
-\item $n$元语言模型将前$n-1$单词作为历史信息，所以当两个假设最后$n-1$个单词不同时，不能进行假设重组，因为后续的扩展可能会得到不同的语言模型得分，并影响最终的模型得分；
+\vspace{0.3em}
+\item $n$-gram语言模型将前$n-1$个单词作为历史信息，所以当两个假设最后$n-1$个单词不相同时，不能进行假设重组，因为后续的扩展可能会得到不同的语言模型得分，并影响最终的模型得分；
+\vspace{0.3em}
 \item 调序模型通常是用来判断当前输入的短语与前一个输入短语之间的调序代价。因此当两个翻译假设对应短语在源语言中的顺序不同时，也不能被重新组合。
+\vspace{0.3em}
 \end{itemize}

 \parinterval 然而在实际处理中，并不需要``删掉''分数低的翻译假设，而是将它们与分数高的翻译假设连在了一起。对于搜索最优翻译，这些连接可能并没有什么作用，但是如果需要分数最高的前两个或前三个翻译，就可能需要用到这些连接。
@@ -817,7 +848,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 %%%%%%%%%%%%%%%%%%
 \subsubsection{解码中的栈结构}\index{Chapter4.2.7.4}

-\parinterval 当质量较差的翻译假设在扩展早期出现时，这些翻译假设需要被剪枝掉，这样可以忽略所有从它扩展出来的翻译假设，进而有效地减小搜索空间。但是这样做也存在着一定的问题，首先，删除的翻译假设可能会在后续的扩展过程中被重新搜索出来。其次，过早的删除某些翻译假设可能会导致无法搜索到最优的翻译假设。所以最好的情况是尽早删除质量差的翻译假设，同时又不会对整个搜索结果产生过大影响。但是这个``质量''，从哪个方面来衡量，也是一个需要思考的问题。理想的情况就是从早期的翻译假设中，挑选一些可比的翻译假设进行筛选。
+\parinterval 当质量较差的翻译假设在扩展早期出现时，这些翻译假设需要被剪枝掉，这样可以忽略所有从它扩展出来的翻译假设，进而有效地减小搜索空间。但是这样做也存在着一定的问题，首先，删除的翻译假设可能会在后续的扩展过程中被重新搜索出来。其次，过早的删除某些翻译假设可能会导致无法搜索到最优的翻译假设。所以最好的情况是尽早删除质量差的翻译假设，同时又不会对整个搜索结果产生过大影响。但是这个``质量''从哪个方面来衡量，也是一个需要思考的问题。理想的情况就是从早期的翻译假设中，挑选一些可比的翻译假设进行筛选。

 \parinterval 目前比较通用的做法是将翻译假设进行整理，放进一种栈结构中。这里所说的``栈''是为了描述方便的一种说法。它实际上就是保存多个翻译假设的一种数据结构\footnote[4]{虽然被称作栈，实际上使用一个堆进行实现。这样可以根据模型得分对翻译假设进行排序。}。当放入栈的翻译假设超过一定阈值时（比如200），可以删除掉模型得分低的翻译假设。一般，会使用多个栈来保存翻译假设，每个栈代表覆盖源语言单词数量相同的翻译假设。比如，第一个堆栈包含了覆盖一个源语言单词的翻译假设，第二个堆栈包含了覆盖两个源语言单词的翻译假设，以此类推。利用覆盖源语言单词数进行栈的划分的原因在于：翻译相同数量的单词所对应的翻译假设一般是``可比的''，因此在同一个栈里对它们进行剪枝带来的风险较小。

@@ -841,7 +872,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \qquad \textrm{{\small\bfnew{我}}\ \ 在\ \ 今天\ \ 早上\ \ 没有\ \ 吃\ \ 早\ \ 饭\ \ 的\ \ 情况\ \ 下\ \ 还是\ \ 正常\ \ {\small\bfnew{去}}\ \ {\small\bfnew{上班}}\ \ 了。} \nonumber
 \end{eqnarray}

-\parinterval 这句话的主语``我''和谓语``去 上班''构成了主谓搭配，而二者之间的部分是状语。显然，用短语去捕捉这个搭配需要覆盖很长的词串，也就是整个``我 $...$ 去 上班''的部分。如果把这样的短语考虑到建模中，会面临非常严重的数据稀疏问题，因为无法保证这么长的词串在训练数据中能够出现。实际上，随着短语长度变长，短语在数据中会变得越来越低频，相关的统计特征也会越来越不可靠。表\ref{tab:trainingdata-phrase-frequency}就展示了不同长度的短语在训练数据中出现的频次。可以看到，长度超过3的短语已经非常低频了，更长的短语甚至在训练数据中一次也没有出现过。
+\parinterval 这句话的主语``我''和谓语``去\ 上班''构成了主谓搭配，而二者之间的部分是状语。显然，用短语去捕捉这个搭配需要覆盖很长的词串，也就是整个``我\ $...$\ 去\ 上班''的部分。如果把这样的短语考虑到建模中，会面临非常严重的数据稀疏问题，因为无法保证这么长的词串在训练数据中能够出现。实际上，随着短语长度变长，短语在数据中会变得越来越低频，相关的统计特征也会越来越不可靠。表\ref{tab:trainingdata-phrase-frequency}就展示了不同长度的短语在训练数据中出现的频次。可以看到，长度超过3的短语已经非常低频了，更长的短语甚至在训练数据中一次也没有出现过。

 %----------------------------------------------
 % 表4.1
@@ -866,7 +897,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1

 \parinterval 显然，利用过长的短语来处理长距离的依赖并不是一种十分有效的方法。过于低频的长短语无法提供可靠的信息，而且使用长短语会导致模型体积急剧增加。

-\parinterval 再来看一个翻译实例\cite{Chiang2012Hope}。图\ref{fig:an-example-of-phrase-system}是一个基于短语的机器翻译系统的翻译结果。这个例子中的调序有一些复杂，比如，``少数 国家 之一''和``与 北韩 有 邦交''的英文翻译都需要进行调序，分别是``one of the few countries''和``have diplomatic relations with North Korea''。基于短语的系统可以很好的处理这些调序问题，因为它们仅仅使用了局部的信息。但是，系统却无法在这两个短语（1和2）之间进行正确的调序。
+\parinterval 再来看一个翻译实例\cite{Chiang2012Hope}。图\ref{fig:an-example-of-phrase-system}是一个基于短语的机器翻译系统的翻译结果。这个例子中的调序有一些复杂，比如，``少数\ 国家\ 之一''和``与\ 北韩\ 有\ 邦交''的英文翻译都需要进行调序，分别是``one of the few countries''和``have diplomatic relations with North Korea''。基于短语的系统可以很好的处理这些调序问题，因为它们仅仅使用了局部的信息。但是，系统却无法在这两个短语（1和2）之间进行正确的调序。

 %----------------------------------------------
 % 图4.31
@@ -878,7 +909,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \end{figure}
 %-------------------------------------------

-\parinterval 这个例子也在一定程度上说明了长距离的调序需要额外的机制才能得到更好的被处理。实际上，1和2之间的调序现象本身对应了一种结构，或者说模板。也就是汉语中的：
+\parinterval 这个例子也在一定程度上说明了长距离的调序需要额外的机制才能被更好地处理。实际上，两个短语（1和2）之间的调序现象本身对应了一种结构，或者说模板。也就是汉语中的：
 \begin{eqnarray}
 \text{与}\ \ \text{[什么东西]}\ \ \text{有}\ \ \text{[什么事]} \quad \nonumber
 \end{eqnarray}
@@ -893,7 +924,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \langle \ \text{与}\ \textrm{X}_1\ \text{有}\ \textrm{X}_2,\quad \textrm{have}\ \textrm{X}_2\ \textrm{with}\ \textrm{X}_1\ \rangle \nonumber
 \end{eqnarray}

-\parinterval 其中逗号分隔了源语言和目标语言部分，$\textrm{X}_1$和$\textrm{X}_2$表示模板中需要替换的内容，或者说变量。源语言中的变量和目标语言中的变量是一一对应的，比如，源语言中的$\textrm{X}_1$和目标语言中的$\textrm{X}_1$代表这两个变量可以``同时''被替换。假设给定短语对：
+\noindent 其中，逗号分隔了源语言和目标语言部分，$\textrm{X}_1$和$\textrm{X}_2$表示模板中需要替换的内容，或者说变量。源语言中的变量和目标语言中的变量是一一对应的，比如，源语言中的$\textrm{X}_1$ 和目标语言中的$\textrm{X}_1$代表这两个变量可以``同时''被替换。假设给定短语对：
 \begin{eqnarray}
 \langle \ \text{北韩},\quad \textrm{North Korea} \ \rangle \qquad\ \quad\quad\ \  \nonumber \\
 \langle \ \text{邦交},\quad \textrm{diplomatic relations} \ \rangle\quad\ \ \ \nonumber
@@ -904,7 +935,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \langle \ \text{与}\ \text{[北韩]}\ \text{有}\ \textrm{X}_2,\quad \textrm{have}\ \textrm{X}_2\ \textrm{with}\ \textrm{[North Korea]} \ \rangle \nonumber
 \end{eqnarray}

-\parinterval 其中，$[\cdot]$表示被替换的部分。可以看到，在源语言和目标语言中，$\textrm{X}_1$被同时替换为相应的短语。进一步，可以用第二个短语替换$\textrm{X}_2$，得到：
+\noindent 其中，$[\cdot]$表示被替换的部分。可以看到，在源语言和目标语言中，$\textrm{X}_1$被同时替换为相应的短语。进一步，可以用第二个短语替换$\textrm{X}_2$，得到：
 \begin{eqnarray}
 \quad\langle \ \text{与}\ \text{北韩}\ \text{有}\ \text{[邦交]},\quad \textrm{have}\ \textrm{[diplomatic relations]}\ \textrm{with}\ \textrm{North Korea} \ \rangle \nonumber
 \end{eqnarray}
@@ -933,7 +964,7 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 %--4.3.1 同步上下文无关文法---------------------
 \subsection{同步上下文无关文法}\index{Chapter4.3.1}

-\parinterval {\small\bfnew{基于层次短语的模型}}（Hierarchical Phrase-based Model）是David Chiang于2015提出的统计机器翻译模型\cite{chiang2005a,chiang2007hierarchical}。这个模型可以很好的解决短语系统对翻译中长距离调序建模不足的问题。基于层次短语的系统也在多项机器翻译比赛中取得了很好的成绩。这项工作也获得了自然处理领域顶级会议ACL2015的最佳论文奖。
+\parinterval {\small\bfnew{基于层次短语的模型}}（Hierarchical Phrase-based Model）是David Chiang于2005年提出的统计机器翻译模型\cite{chiang2005a,chiang2007hierarchical}。这个模型可以很好地解决短语系统对翻译中长距离调序建模不足的问题。基于层次短语的系统也在多项机器翻译比赛中取得了很好的成绩。这项工作也获得了自然语言处理领域顶级会议ACL2005的最佳论文奖。

 \parinterval 层次短语模型的核心是把翻译问题归结为两种语言词串的同步生成问题。实际上，词串的生成问题是自然语言处理中的经典问题，早期的研究更多的是关注单语句子的生成，比如，如何使用句法树描述一个句子的生成过程。层次短语模型的创新之处是把传统单语词串的生成推广到双语词串的同步生成上。这使得机器翻译可以使用类似句法分析的方法进行求解。

@@ -943,25 +974,26 @@ dr = \textrm{start}_i-\textrm{end}_{i-1}-1
 \parinterval 层次短语模型中一个重要的概念是{\small\bfnew{同步上下文无关文法}}（Synchronous Context-free Grammar，简称SCFG）。SCFG可以被看作是对源语言和目标语言上下文无关文法的融合，它要求源语言和目标语言的产生式及产生式中的变量具有对应关系。具体定义如下：

 %-------------------------------------------
+\vspace{0.5em}
 \begin{definition} 同步上下文无关文法

 {\small
 一个同步上下文无关文法由五部分构成$(N, T_s, T_t, I, R)$，其中：
 \begin{enumerate}
 \item $N$是非终结符集合。
-\item $T_s$和$T_t$分别是源语言和目标语终结符集合。
+\item $T_s$和$T_t$分别是源语言和目标语言的终结符集合。
 \item $I \subseteq N$是起始非终结符集合。
 \item $R$是规则集合，每条规则$r \in R$有如下形式：
 \end{enumerate}
 \begin{displaymath}
 \textrm{LHS} \to \langle \alpha, \beta, \sim \rangle
 \end{displaymath}
-其中，$\textrm{LHS} \in N$表示规则的左部，它是一个非终结符；规则右部由三部分组成，$\alpha \in (N \bigcup T_s)^{*}$表示由源语言终结符和非终结符组成的串；$\beta \in (N \bigcup T_t)^{*}$ 表示由目标语言终结符和非终结符组成的串；$\sim$表示$\alpha$和$\beta$中终结符的1-1对应关系。
+其中，$\textrm{LHS} \in N$表示规则的左部，它是一个非终结符；规则的右部由三部分组成，$\alpha \in (N \cup T_s)^{*}$表示由源语言终结符和非终结符组成的串；$\beta \in (N \cup T_t)^{*}$ 表示由目标语言终结符和非终结符组成的串；$\sim$表示$\alpha$和$\beta$中非终结符的1-1对应关系。
 }
 \end{definition}
 %-------------------------------------------

-\parinterval 根据这个定义，源语言和目标语言有不同的终结符集合（单词），但是它们会共享同一个非终结符集合（变量）。每个产生式包括源语言和目标语言两个部分，分别表示由规则左部生成的源语言和目标语言符号串。由于产生式会同时生成两种语言的符号串，因此这是一种``同步''生成，可以很好的描述翻译中两个词串的对应。
+\parinterval 根据这个定义，源语言和目标语言有不同的终结符集合（单词），但是它们会共享同一个非终结符集合（变量）。每个产生式包括源语言和目标语言两个部分，分别表示由规则左部生成的源语言和目标语言符号串。由于产生式会同时生成两种语言的符号串，因此这是一种``同步''生成，可以很好的描述翻译中两个词串之间的对应。

 \parinterval 下面是一个简单的SCFG实例：
 \begin{eqnarray}
@@ -990,7 +1022,7 @@ r_3:\quad \textrm{X}\ &\to\ &\langle \ \text{大幅度},\quad \textrm{drasticall
 r_4:\quad \textrm{X}\ &\to\ &\langle \ \text{了},\quad \textrm{have}\ \rangle \nonumber
 \end{eqnarray}

-\parinterval 其中，规则$r_1$和$r_2$是含有变量的规则，这些变量可以被其他规则的右部替换；规则$r_2$是调序规则；规则$r_3$和$r_4$是纯词汇化规则，表示单词或者短语的翻译。
+\noindent 其中，规则$r_1$和$r_2$是含有变量的规则，这些变量可以被其他规则的右部替换；规则$r_2$是调序规则；规则$r_3$和$r_4$是纯词汇化规则，表示单词或者短语的翻译。

 \parinterval 对于一个双语句对：
 \begin{eqnarray}
@@ -1009,7 +1041,7 @@ r_4:\quad \textrm{X}\ &\to\ &\langle \ \text{了},\quad \textrm{have}\ \rangle \
 & & \ \textrm{The imports}\ {\red{\textrm{have}}}\ \textrm{drastically}\ \textrm{fallen}\ \rangle \nonumber
 \end{eqnarray}

-\parinterval 其中，每使用一次规则就会同步替换源语言和目标语言符号串中的一个非终结符。通常，可以把上面这个过程称作翻译推导，记为：
+\noindent 其中，每使用一次规则就会同步替换源语言和目标语言符号串中的一个非终结符。通常，可以把上面这个过程称作翻译{\small\bfnew{推导}}（Derivation），记为：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 d = {r_1} \circ {r_2} \circ {r_3} \circ {r_4}
@@ -1017,12 +1049,12 @@ d = {r_1} \circ {r_2} \circ {r_3} \circ {r_4}
 \end{eqnarray}
 %公式--------------------------------------------------------------------

-\parinterval 在层次短语模型中，每个翻译推导都唯一的对应一个目标语译文。因此，可以用推导的概率$\textrm{P}(d)$描述翻译的好坏。同基于短语的模型是一样的（第7.2.2节），层次短语翻译的目标是：求概率最高的翻译推导$\hat{d}=\arg\max\textrm{P}(d)$。值得注意的是，基于推导的方法在句法分析中也十分常用。层次短语翻译实质上也是通过生成翻译规则的推导来对问题的表示空间进行建模。在\ref{section-4.4}节还将看到，这种方法可以被扩展到语言学上基于句法的翻译模型中。而且这些模型都可以用一种被称作超图的结构来进行建模。从某种意义上讲，基于规则推导的方法将句法分析和机器翻译进行了形式上的统一。因此机器翻译也借用了很多句法分析的思想。
+\parinterval 在层次短语模型中，每个翻译推导都唯一的对应一个目标语译文。因此，可以用推导的概率$\textrm{P}(d)$描述翻译的好坏。同基于短语的模型是一样的（见\ref{subsection-4.2.2}节），层次短语翻译的目标是：求概率最高的翻译推导$\hat{d}=\arg\max\textrm{P}(d)$。值得注意的是，基于推导的方法在句法分析中也十分常用。层次短语翻译实质上也是通过生成翻译规则的推导来对问题的表示空间进行建模。在\ref{section-4.4} 节还将看到，这种方法可以被扩展到语言学上基于句法的翻译模型中。而且这些模型都可以用一种被称作超图的结构来进行建模。从某种意义上讲，基于规则推导的方法将句法分析和机器翻译进行了形式上的统一。因此机器翻译也借用了很多句法分析的思想。

 %%%%%%%%%%%%%%%%%%
 \subsubsection{胶水规则}\index{Chapter4.3.1.3}

-\parinterval 由于翻译现象非常复杂，在实际系统中往往需要把两个局部翻译线性拼接到一起。在层次短语模型中，这个问题通过引入胶水规则（Glue Rule）来处理，形式如下：
+\parinterval 由于翻译现象非常复杂，在实际系统中往往需要把两个局部翻译线性拼接到一起。在层次短语模型中，这个问题通过引入{\small\bfnew{胶水规则}}（Glue Rule）来处理，形式如下：
 \begin{eqnarray}
 \textrm{S} & \to & \langle\ \textrm{S}_1\ \textrm{X}_2,\ \textrm{S}_1\ \textrm{X}_2\ \rangle \nonumber \\
 \textrm{S} & \to & \langle\ \textrm{X}_1,\ \textrm{X}_1\ \rangle \nonumber
@@ -1036,9 +1068,9 @@ d = {r_1} \circ {r_2} \circ {r_3} \circ {r_4}
                & \to & \langle\ \textrm{X}_n\ ...\ \textrm{X}_4\ \textrm{X}_2,\ \textrm{X}_n\ ...\ \textrm{X}_4\ \textrm{X}_2\ \rangle \nonumber
 \end{eqnarray}

-\parinterval 实际上，胶水规则在很大程度上模拟了基于短语的系统中对字符串顺序翻译的操作。而且在实践中发现，这个步骤是十分必要的。特别是对汉-英翻译这样的任务，由于语言的结构基本上是顺序翻译的，因此引入顺序拼接的操作符合翻译的整体规律。同时，这种拼接给翻译增加了灵活性，系统会更加健壮。
+\parinterval 实际上，胶水规则在很大程度上模拟了基于短语的系统中对字符串顺序翻译的操作。而且在实践中发现，这个步骤是十分必要的。特别是对法-英翻译这样的任务，由于语言的结构基本上是顺序翻译的，因此引入顺序拼接的操作符合翻译的整体规律。同时，这种拼接给翻译增加了灵活性，系统会更加健壮。

-\parinterval 需要说明的是，使用同步文法进行翻译时由于单词之间顺序是内嵌在翻译规则内的，因此这种模型并不依赖额外的调序模型。一旦文法确定下来，系统就可以进行翻译。
+\parinterval 需要说明的是，使用同步文法进行翻译时由于单词的顺序是内嵌在翻译规则内的，因此这种模型并不依赖额外的调序模型。一旦文法确定下来，系统就可以进行翻译。

 %%%%%%%%%%%%%%%%%%
 \subsubsection{处理流程}\index{Chapter4.3.1.4}
@@ -1062,12 +1094,13 @@ d = {r_1} \circ {r_2} \circ {r_3} \circ {r_4}
 \parinterval 在\ref{subsection-4.2.3}节已经介绍了短语与词对齐相兼容的概念。这里，所有层次短语规则也是与词对齐相兼容（一致）的。

 %-------------------------------------------
+\vspace{0.5em}
 \begin{definition} 与词对齐相兼容的层次短语规则

 {\small
-对于句对(\textbf{s},\textbf{t})和它们之间的词对齐\textbf{a}，令$N$表示在句对(\textbf{s},\textbf{t})上与\textbf{a}相兼容的双语短语集合。则：
+对于句对$(\mathbf{s},\mathbf{t})$和它们之间的词对齐$\mathbf{a}$，令$N$表示在句对$(\mathbf{s},\mathbf{t})$上与$\mathbf{a}$相兼容的双语短语集合。则：
 \begin{enumerate}
-\item 	如果$(x,y)\in N$，$\textrm{X} \to \langle x,y,\phi \rangle$是与词对齐相兼容的层次短语规则。
+\item 	如果$(x,y)\in N$，则$\textrm{X} \to \langle x,y,\phi \rangle$是与词对齐相兼容的层次短语规则。
 \item 	对于$(x,y)\in N$，存在$m$个双语短语$(x_i,y_j)\in N$，同时存在(1,$...$,$m$)上面的一个排序$\sim = {\pi_1 , ... ,\pi_m}$，且：
 \end{enumerate}
 %公式--------------------------------------------------------------------
@@ -1098,9 +1131,13 @@ y&=&\beta_0 y_{\pi_1} \beta_1 y_{\pi_2} ... \beta_{m-1} y_{\pi_m} \beta_m
 \parinterval 这种方式可以抽取出大量的层次短语规则。但是，不加限制的抽取，会带来规则集合的过度膨胀，对解码系统造成很大负担。比如，如果考虑任意长度的短语会使得层次短语规则过大，一方面这些规则很难在测试数据上被匹配，另一方面抽取这样的``长''规则会使得抽取算法变慢，而且规则数量猛增之后难以存储。还有，如果一个层次短语规则中含有过多的变量，也会导致解码算法变得更加复杂，不利于系统实现和调试。针对这些问题，在标准的层次短语系统中会考虑一些限制，包括：

 \begin{itemize}
+\vspace{0.3em}
 \item 抽取的规则最多可以跨越10个词；
+\vspace{0.3em}
 \item 规则的（源语言端）变量个数不能超过2；
+\vspace{0.3em}
 \item 规则的（源语言端）变量不能连续出现。
+\vspace{0.3em}
 \end{itemize}
 \parinterval 在具体实现时还会考虑其他的限制，比如，限定规则的源语言端终结符数量的上限等。

@@ -1109,7 +1146,7 @@ y&=&\beta_0 y_{\pi_1} \beta_1 y_{\pi_2} ... \beta_{m-1} y_{\pi_m} \beta_m

 \parinterval 在层次短语模型中，每个翻译推导都有一个模型得分$\textrm{score}(d,\textbf{s},\textbf{t})$。$\textrm{score}(d,\textbf{s},\textbf{t})$是若干特征的线性加权之和：$\textrm{score}(d,\textbf{t},\textbf{s})=\sum_{i=1}^M\lambda_i\cdot h_i (d,\textbf{t},\textbf{s})$，其中$\lambda_i$是特征权重，$h_i (d,\textbf{t},\textbf{s})$是特征函数。层次短语模型的特征包括与规则相关的特征和语言模型特征，如下：

-\parinterval 对于每一条翻译规则LHS$\to \langle \alpha, \beta ,\sim \rangle$
+\parinterval 对于每一条翻译规则LHS$\to \langle \alpha, \beta ,\sim \rangle$，有：

 \begin{itemize}
 \item 	(h1-2)短语翻译概率（取对数），即$\textrm{log}(\textrm{P}(\alpha \mid \beta))$和$\textrm{log}(\textrm{P}(\beta \mid \alpha))$，特征的计算与基于短语的模型完全一样；
@@ -1146,8 +1183,11 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \parinterval 其中：

 \begin{itemize}
-\item 	$\textrm{log}⁡(\textrm{P}_{\textrm{lm}}(\textrm{t}))$表示语言模型得分；
-\item 	$\mid \textrm{t} \mid$表示译文的长度。
+\vspace{0.3em}
+\item $\textrm{log}(\textrm{P}_{\textrm{lm}}(\textrm{t}))$表示语言模型得分；
+\vspace{0.3em}
+\item $\mid \textrm{t} \mid$表示译文的长度。
+\vspace{0.3em}
 \end{itemize}

 \parinterval 在定义特征函数之后，特征权重$\{ \lambda_i \}$可以通过最小错误率训练在开发集上进行调优。关于最小错误率训练可以参考\ref{subsection-4.2.6}节的内容。
@@ -1158,22 +1198,22 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \parinterval 层次短语模型解码的目标是找到模型得分最高的推导，即：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
-\hat{d} = \arg\max_{d} score(d,\textbf{s},\textbf{t})
+\hat{d} = \arg\max_{d} \textrm{score}(d,\textbf{s},\textbf{t})
 \label{eqa4.28}
 \end{eqnarray}
 %公式--------------------------------------------------------------------

-\parinterval $\hat{d}$的目标语部分即最佳译文$\hat{\textbf{t}}$。令函数$e(\cdot)$返回翻译推导的目标语词串，于是有：
+\parinterval $\hat{d}$的目标语部分即最佳译文$\hat{\textbf{t}}$。令函数$t(\cdot)$返回翻译推导的目标语词串，于是有：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
-\hat{\textbf{t}}=e(\hat{d})
+\hat{\textbf{t}}=t(\hat{d})
 \label{eqa4.29}
 \end{eqnarray}
 %公式--------------------------------------------------------------------

-\parinterval 由于层次短语规则本质上就是CFG规则，因此公式\ref{eqa4.6}代表了一个典型的句法分析过程。需要做的是，用模型源语言端的CFG对输入句子进行分析，同时用模型目标语言端的CFG生成译文。基于CFG的句法分析是自然语言处理中的经典问题。一种广泛使用的方法是：首先把CFG转化为$\varepsilon$-free的乔姆斯基范式（Chomsky Normal Form）\footnote[5]{能够证明任意的CFG都可以被转换为乔姆斯基范式，即文法只包含形如A$\to$BC或A$\to$a的规则。这里，假设文法中不包含空串产生式A$\to\varepsilon$，其中$\varepsilon$表示空字符串。}，之后采用CYK方法进行分析。
+\parinterval 由于层次短语规则本质上就是CFG规则，因此公式\ref{eqa4.28}代表了一个典型的句法分析过程。需要做的是，用模型源语言端的CFG对输入句子进行分析，同时用模型目标语言端的CFG生成译文。基于CFG的句法分析是自然语言处理中的经典问题。一种广泛使用的方法是：首先把CFG转化为$\varepsilon$-free的{\small\bfnew{乔姆斯基范式}}（Chomsky Normal Form）\footnote[5]{能够证明任意的CFG都可以被转换为乔姆斯基范式，即文法只包含形如A$\to$BC或A$\to$a的规则。这里，假设文法中不包含空串产生式A$\to\varepsilon$，其中$\varepsilon$表示空字符串。}，之后采用CYK方法进行分析。

-\parinterval CYK是形式语言中一种常用的句法分析方法\cite{cocke1969programming,younger1967recognition,kasami1966efficient}。它主要用于分析符合乔姆斯基范式的句子。由于乔姆斯基范式中每个规则最多包含两叉（或者说两个变量），因此CYK方法也可以被看作是基于二叉规则的一种分析方法。对于一个待分析的字符串，CYK方法从小的``范围''开始，不断扩大分析的``范围''，最终完成对整个字符串的分析。在CYK方法中，一个重要的概念是跨度（Span），所谓跨度表示了一个符号串的范围。这里可以把跨度简单的理解为从一个起始位置到一个结束位置中间的部分。比如，如图\ref{fig:word-and-index-of-pos}所示，每个单词左右都有一个数字来表示序号。可以用序号的范围来表示跨度，例如：
+\parinterval CYK是形式语言中一种常用的句法分析方法\cite{cocke1969programming,younger1967recognition,kasami1966efficient}。它主要用于分析符合乔姆斯基范式的句子。由于乔姆斯基范式中每个规则最多包含两叉（或者说两个变量），因此CYK方法也可以被看作是基于二叉规则的一种分析方法。对于一个待分析的字符串，CYK方法从小的``范围''开始，不断扩大分析的``范围''，最终完成对整个字符串的分析。在CYK方法中，一个重要的概念是{\small\bfnew{跨度}}（Span），所谓跨度表示了一个符号串的范围。这里可以把跨度简单的理解为从一个起始位置到一个结束位置中间的部分。

 %----------------------------------------------
 % 图
@@ -1184,20 +1224,26 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \label{fig:word-and-index-of-pos}
 \end{figure}
 %-------------------------------------------
+
+比如，如图\ref{fig:word-and-index-of-pos} 所示，每个单词左右都有一个数字来表示序号。可以用序号的范围来表示跨度，例如：
+
 \begin{eqnarray}
-\textrm{Span[0,1]}&=&\textrm{``猫''} \nonumber \\
-\textrm{Span[2,4]}&=&\textrm{``吃} \quad \textrm{鱼''} \nonumber \\
-\textrm{Span[0,4]}&=&\textrm{``猫} \quad \textrm{喜欢} \quad \textrm{吃} \quad \textrm{鱼''} \nonumber
+span\textrm{[0,1]}&=&\textrm{``猫''} \nonumber \\
+span\textrm{[2,4]}&=&\textrm{``吃} \quad \textrm{鱼''} \nonumber \\
+span\textrm{[0,4]}&=&\textrm{``猫} \quad \textrm{喜欢} \quad \textrm{吃} \quad \textrm{鱼''} \nonumber
 \end{eqnarray}

-\parinterval CYK方法是按跨度由小到大的次序执行的，这也对应了一种自下而上的分析过程。对于每个跨度，检查：
+\parinterval CYK方法是按跨度由小到大的次序执行的，这也对应了一种{\small\bfnew{自下而上的分析}}（Bottom-up Parsing）过程。对于每个跨度，检查：

 \begin{itemize}
+\vspace{0.3em}
 \item 	是否有形如A$\to$a的规则可以匹配；
+\vspace{0.3em}
 \item 	是否有形如A$\to$BC的规则可以匹配。
+\vspace{0.3em}
 \end{itemize}

-\parinterval 对于第一种情况，简单匹配字符串即可；对于第二种情况，需要把当前的跨度进一步分割为两部分，并检查左半部分是否已经被归纳为B，右半部分是否已经被归纳为C。如果可以匹配，会在这个跨度上保存匹配结果。后面，可以访问这个结果（也就是A）来生成更大跨度上的分析结果。CYK算法的伪代码如图\ref{fig:CYK-algorithm}所示。整个算法的执行顺序是按跨度的长度（$l$）组织的。对于每个span[$j,j + l$]，会在位置$k$进行切割。之后，判断span[$j,k$]和span[$k,j +l$]是否可以形成一个规则的右部。也就是判断span[$j,k$]是否生成了B，同时判断span[$k,j + l$]是否生成了C，如果文法中有规则A$\to$BC，则把这个规则放入span[$j,j+l$]。这个过程由Compose函数完成。如果span[$j,j + l$]可以匹配多条规则，所有生成的推导都会被记录在span[$j,j + l$]所对应的一个列表里\footnote[6]{通常，这个列表会用优先队列实现。这样可以对推导按模型得分进行排序，方便后续的剪枝操作。}。
+\parinterval 对于第一种情况，简单匹配字符串即可；对于第二种情况，需要把当前的跨度进一步分割为两部分，并检查左半部分是否已经被归纳为B，右半部分是否已经被归纳为C。如果可以匹配，会在这个跨度上保存匹配结果。后面，可以访问这个结果（也就是A）来生成更大跨度上的分析结果。CYK算法的伪代码如图\ref{fig:CYK-algorithm}所示。整个算法的执行顺序是按跨度的长度（$l$）组织的。对于每个$span[j,j + l]$，会在位置$k$进行切割。之后，判断$span[j,k]$和$span[k,j +l]$是否可以形成一个规则的右部。也就是判断$span[j,k]$是否生成了B，同时判断$span[k,j + l]$是否生成了C，如果文法中有规则A$\to$BC，则把这个规则放入$span[j,j+l]$。这个过程由Compose函数完成。如果$span[j,j + l]$可以匹配多条规则，所有生成的推导都会被记录在$span[j,j + l]$所对应的一个列表里\footnote[6]{通常，这个列表会用优先队列实现。这样可以对推导按模型得分进行排序，方便后续的剪枝操作。}。

 %----------------------------------------------
 % 图
@@ -1217,7 +1263,7 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \centering
 \input{./Chapter4/Figures/example-of-cyk-algorithm-execution-label}
 \input{./Chapter4/Figures/example-of-cyk-algorithm-execution}
-\caption{一个三层循环神经网络的模型并行过程}
+\caption{CYK算法执行实例}
 \label{fig:example-of-cyk-algorithm-execution}
 \end{figure}
 %----------------------------------------------
@@ -1225,23 +1271,29 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \parinterval 不过，CYK方法并不能直接用于层次短语模型。有两个问题：

 \begin{itemize}
+\vspace{0.3em}
 \item 层次短语模型的文法不符合乔姆斯基范式；
+\vspace{0.3em}
 \item 机器翻译中需要语言模型。由于当前词的语言模型得分需要前面的词做条件，因此机器翻译的解码过程并不是上下文无关的。
+\vspace{0.3em}
 \end{itemize}

 \parinterval 解决第一个问题有两个思路：

 \begin{itemize}
+\vspace{0.3em}
 \item 把层次短语文法转化为乔姆斯基范式，这样可以直接使用原始的CYK方法进行分析；
+\vspace{0.3em}
 \item 对CYK方法进行改造。解码的核心任务是要知道每个跨度是否能匹配规则的源语言部分。实际上，层次短语模型的文法是一种特殊的文法。这种文法规则的源语言部分最多包含两个变量，而且变量不能连续。这样的规则会对应一种特定类型的模版，比如，对于包含两个变量的规则，它的源语言部分形如$\alpha_0 \textrm{X}_1 \alpha_1 \textrm{X}_2 \alpha_2$。其中，$\alpha_0$、$\alpha_1$和$\alpha_2$表示终结符串，$\textrm{X}_1$和$\textrm{X}_2$是变量。显然，如果$\alpha_0$、$\alpha_1$和$\alpha_2$确定下来，那么$\textrm{X}_1$和$\textrm{X}_2$的位置也就确定了下来。因此，对于每一个词串，都可以很容易地生成这种模版，进而完成匹配。而$\textrm{X}_1$和$\textrm{X}_2$的匹配和原始CYK中匹配二叉规则本质上是一样的。由于这种方法并不需要对CYK方法进行过多的调整，因此层次短语系统中广泛使用这种改造的CYK方法进行解码。
+\vspace{0.3em}
 \end{itemize}

-\parinterval 对于语言模型在解码中的集成问题，一种简单的办法是：在CYK分析的过程中，用语言模型对每个局部的翻译结果进行评价，并计算局部翻译（推导）的模型得分。注意，局部的语言模型得分可能是不准确的，比如，局部翻译片段最左边单词的概率计算需要依赖前面的单词。但是由于每个跨度下生成的翻译是局部的，当前跨度下看不到前面的译文。这时会用1-gram语言模型的得分代替真实的高阶语言模型得分。等这个局部翻译片段和其他片段组合之后，可以知道前文的内容时，才会得出最终的语言模型得分。另一种解决问题的思路是，先不加入语言模型，这样可以直接使用CYK方法进行分析。在得到最终的结果后，对最好的多个推导用含有语言模型的完整模型进行打分，选出最终的最优推导。不过，在实践中发现，由于语言模型在机器翻译中起到至关重要的作用，因此对最终结果进行重排序会带来一定的性能损失。不过这种方法的优势在于速度快，而且容易实现。
+\parinterval 对于语言模型在解码中的集成问题，一种简单的办法是：在CYK分析的过程中，用语言模型对每个局部的翻译结果进行评价，并计算局部翻译（推导）的模型得分。注意，局部的语言模型得分可能是不准确的，比如，局部翻译片段最左边单词的概率计算需要依赖前面的单词。但是由于每个跨度下生成的翻译是局部的，当前跨度下看不到前面的译文。这时会用1-gram语言模型的得分代替真实的高阶语言模型得分。等这个局部翻译片段和其他片段组合之后，可以知道前文的内容，这时才会得出最终的语言模型得分。另一种解决问题的思路是，先不加入语言模型，这样可以直接使用CYK方法进行分析。在得到最终的结果后，对最好的多个推导用含有语言模型的完整模型进行打分，选出最终的最优推导。不过，在实践中发现，由于语言模型在机器翻译中起到至关重要的作用，因此对最终结果进行重排序会带来一定的性能损失。不过这种方法的优势在于速度快，而且容易实现。

 \parinterval 另外，在实践时，还需要考虑两方面问题：

 \begin{itemize}
-\item 剪枝：在CYK中，每个跨度都可以生成非常多的推导（局部翻译假设）。理论上，这些推导的数量会和跨度大小成指数关系。显然不可能保存如此大量的翻译推导。对于这个问题，常用的办法是只保留top-$k$个推导。也就是每个局部结果只保留最好的$k$个。这种方法也被称作束剪枝（Beam Pruning）。在极端情况下，当$k$=1时，这个方法就变成了贪婪的方法；
+\item 剪枝：在CYK中，每个跨度都可以生成非常多的推导（局部翻译假设）。理论上，这些推导的数量会和跨度大小成指数关系。显然不可能保存如此大量的翻译推导。对于这个问题，常用的办法是只保留top-$k$个推导。也就是每个局部结果只保留最好的$k$个。这种方法也被称作{\small\bfnew{束剪枝}}（Beam Pruning）。在极端情况下，当$k$=1时，这个方法就变成了贪婪的方法；
 \item $n$-best结果的生成：$n$-best推导（译文）的生成是统计机器翻译必要的功能。比如，最小错误率训练中就需要最好的$n$个结果用于特征权重调优。在基于CYK的方法中，整个句子的翻译结果会被保存在最大跨度所对应的结构中。因此一种简单的$n$-best生成方法是从这个结构中取出排名最靠前的$n$个结果。另外，也可以考虑自上而下遍历CYK生成的推导空间，得到更好的$n$-best结果\cite{huang2005better}。
 \end{itemize}

@@ -1279,9 +1331,9 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \end{figure}
 %-------------------------------------------

-\parinterval 如果相同源语言端的规则有$n$个，规则中每个变量可以被替换为$m$个结果，对于只含有一个变量的规则，一共有$nm$种不同的组合。如果规则含有两个变量，组合的数量是$n{m}^2$。由于翻译中会进行大量的规则匹配，如果每个匹配的源语言端都考虑所有$n{m}^2$种译文的组合，解码速度会很慢。
+\parinterval 假设有$n$个规则源语言端相同，规则中每个变量可以被替换为$m$个结果，对于只含有一个变量的规则，一共有$nm$种不同的组合。如果规则含有两个变量，这种组合的数量是$n{m}^2$。由于翻译中会进行大量的规则匹配，如果每个匹配的源语言端都考虑所有$n{m}^2$种译文的组合，解码速度会很慢。

-\parinterval 在层次短语系统中，会进一步对搜索空间剪枝。简言之，此时并不需要对所有$n{m}^2$种组合进行遍历，而是只考虑其中的一部分组合。这种方法也被称作立方剪枝（Cube Pruning）。所谓``立方''是指组合译文时的三个维度：规则的目标语端、第一个变量所对应的翻译候选、第二个变量所对应的翻译候选。立方剪枝假设所有的译文候选都经过排序，比如，按照短语翻译概率排序。这样，每个译文都对应一个坐标，比如，$(i,j,k)$就表示第$i$个规则目标语端、第二个变量的第$j$个翻译候选、第三个变量的第$k$个翻译候选的组合。于是，可以把每种组合看作是一个三维空间中的一个点。在立方剪枝中，开始的时候会看到(0,0,0)这个翻译假设，并把这个翻译假设放入一个优先队列中。之后每次从这个优先队里中弹出最好的结果，之后沿着三个维度分别将坐标加1，比如，如果优先队列弹出$(i,j,k)$，则会生成$(i+1,j,k)$、$(i,j+1,k)$和$(i,j,k+1)$这三个新的翻译假设。之后，计算出它们的模型得分，并压入优先队列。这个过程不断被执行，直到达到终止条件，比如，扩展次数达到一个上限。图\ref{fig:execution-of-cube-pruning}展示了立方剪枝的过程（规则只含有一个变量的情况）。可以看到，每个步骤中，算法只会扩展当前最好结果周围的两个点（对应两个维度，横轴对应变量被替换的内容，纵轴对应规则的目标语端）。
+\parinterval 在层次短语系统中，会进一步对搜索空间剪枝。简言之，此时并不需要对所有$n{m}^2$种组合进行遍历，而是只考虑其中的一部分组合。这种方法也被称作{\small\bfnew{立方剪枝}}（Cube Pruning）。所谓``立方''是指组合译文时的三个维度：规则的目标语端、第一个变量所对应的翻译候选、第二个变量所对应的翻译候选。立方剪枝假设所有的译文候选都经过排序，比如，按照短语翻译概率排序。这样，每个译文都对应一个坐标，比如，$(i,j,k)$就表示第$i$个规则目标语端、第一个变量的第$j$个翻译候选、第二个变量的第$k$个翻译候选的组合。于是，可以把每种组合看作是一个三维空间中的一个点。在立方剪枝中，开始的时候会看到$(0,0,0)$这个翻译假设，并把这个翻译假设放入一个优先队列中。之后每次从这个优先队列中弹出最好的结果，之后沿着三个维度分别将坐标加1，比如，如果优先队列弹出$(i,j,k)$，则会生成$(i+1,j,k)$、$(i,j+1,k)$和$(i,j,k+1)$这三个新的翻译假设。之后，计算出它们的模型得分，并压入优先队列。这个过程不断被执行，直到达到终止条件，比如，扩展次数达到一个上限。图\ref{fig:execution-of-cube-pruning}展示了立方剪枝的过程（规则只含有一个变量的情况）。可以看到，每个步骤中，算法只会扩展当前最好结果周围的两个点（对应两个维度，横轴对应变量被替换的内容，纵轴对应规则的目标语端）。

 %----------------------------------------------
 % 图
@@ -1298,7 +1350,7 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 %---------4.4基于语言学句法的模型
 \section{基于语言学句法的模型}\index{Chapter4.4}\label{section-4.4}

-\parinterval 层次短语模型是一种典型的基于翻译文法的模型。它把翻译问题转化为语言分析问题。在翻译一个句子的时候，模型会生成一个树形结构，这样也就得到了句子结构的某种表示。图\ref{fig:derivation-of-hierarchical-phrase-and-tree-structure model}展示了一个使用层次短语系统进行翻译时所生成的翻译推导$d$，以及这个推导所对应的树形结构（源语言）。这棵树体现了从机器翻译的视角如何看待句子结构，尽管这个结构并不是人类语言学中的句法树。
+\parinterval 层次短语模型是一种典型的基于翻译文法的模型。它把翻译问题转化为语言分析问题。在翻译一个句子的时候，模型会生成一个树形结构，这样也就得到了句子结构的某种表示。图\ref{fig:derivation-of-hierarchical-phrase-and-tree-structure model}展示了一个使用层次短语系统进行翻译时所生成的翻译推导$d$，以及这个推导所对应的树形结构（源语言）。这棵树体现了机器翻译的视角下的句子结构，尽管这个结构并不是人类语言学中的句法树。

 %----------------------------------------------
 % 图
@@ -1313,8 +1365,11 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \parinterval 在翻译中使用树结构的好处在于，模型可以更加有效地对句子的层次结构进行抽象。而且树结构可以作为对序列结构的一种补充，比如，在句子中距离较远的两个单词，在树结构中可以很近。不过，层次短语模型也存在一些不足：

 \begin{itemize}
+\vspace{0.3em}
 \item 层次短语规则没有语言学句法标记，很多规则并不符合语言学认知，因此译文的生成和调序也不遵循语言学规律。比如，层次短语系统经常会把完整的句法结构打散，或者``破坏''句法成分进行组合；
+\vspace{0.3em}
 \item 层次短语系统中有大量的工程化约束条件。比如，规则的源语言部分不允许两个变量连续出现，而且变量个数也不能超过两个。这些约束在一定程度上限制了模型处理翻译问题的能力。
+\vspace{0.3em}
 \end{itemize}

 \parinterval 实际上，基于层次短语的方法可以被看作是介于基于短语的方法和基于语言学句法的方法之间的一种折中。它的优点在于，具备短语模型简单、灵活的优点，同时，由于同步翻译文法可以对句子的层次结构进行表示，因此也能够处理一些较长距离的调序问题。但是，另一方面，层次短语模型并不是一种``精细''的句法模型，当翻译需要复杂的结构信息时，这种模型可能会无能为力。图\ref{fig:examples-of-translation-with-complex-ordering}展示了一个翻译实例，对图中句子进行翻译需要通过复杂的调序才能生成正确译文。为了完成这样的翻译，需要对多个结构（超过两个）进行调序，但是这种情况在标准的层次短语系统中是不允许的。
@@ -1372,7 +1427,7 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 &此翻译可以被看作从句法树到句法树的转换 \\
 \rule{0pt}{15pt}基于句法 & 使用语言学句法 \\
 \rule{0pt}{15pt}基于树 &（源语言）使用树结构（大多指句法树） \\
-\rule{0pt}{15pt}基于串 &（源语言）使用词串，比如串到树的翻译系统的解码器一般\\
+\rule{0pt}{15pt}基于串 &（源语言）使用词串，比如串到树翻译系统的解码器一般\\
 &都是基于串的解码方法 \\
 \rule{0pt}{15pt}基于森林 &（源语言）使用句法森林，这里森林只是对多个句法树的一\\
 &种压缩表示 \\
@@ -1380,7 +1435,7 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \rule{0pt}{15pt}非词汇规则 & 不含有终结符的规则 \\
 \rule{0pt}{15pt}句法软约束 & 不强制规则推导匹配语言学句法树，通常把句法信息作为特\\
 &征使用 \\
-\rule{0pt}{15pt}句法硬约束 & 强制推导必须符合语言学句法树，不符合的推导会被过滤掉 \\
+\rule{0pt}{15pt}句法硬约束 & 要求推导必须符合语言学句法树，不符合的推导会被过滤掉 \\
 \end{tabular}
 }
 \end{center}
@@ -1409,7 +1464,7 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \label{tab:comparison-of-models-based-on-syntax}
 {
 \begin{tabular}{l | l | l | l | l}
- & 形式句法 & \multicolumn{3}{c}{语言学句法} \\
+模型 & 形式句法 & \multicolumn{3}{c}{语言学句法} \\
 \cline{3-5}
 \rule{0pt}{15pt} & & \multicolumn{1}{c|}{树到串} & \multicolumn{1}{c}{串到树} & \multicolumn{1}{|c}{树到树} \\
 \hline
@@ -1427,14 +1482,14 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 %--4.4.2 基于树结构的文法---------------------
 \subsection{基于树结构的文法}\index{Chapter4.4.2}

-\parinterval 基于句法的翻译模型的一个核心问题是要对树结构进行建模，进而完成树之间或者树和串之间的转换。在计算机领域中，所谓树就是由一些节点组成的层次关系的集合。计算机领域的树和自然世界中的树没有任何关系，只是借用了相似的概念，因为这种层次结构很像一个倒过来的树。在使用树时，经常会把树的层次结构转化为序列结构，称为树结构的序列化或者线性化。比如，使用树的先序遍历就可以得到一个树的序列表示。图\ref{fig:different-representations-of-syntax-tree}就对比了同一棵树的不同表示方式。实际上，树的序列表示是非常适合计算机进行读取和处理的。因此，本章也会使用树的序列化结果来表示句法结构。
+\parinterval 基于句法的翻译模型的一个核心问题是要对树结构进行建模，进而完成树之间或者树和串之间的转换。在计算机领域中，所谓树就是由一些节点组成的层次关系的集合。计算机领域的树和自然世界中的树没有任何关系，只是借用了相似的概念，因为这种层次结构很像一个倒过来的树。在使用树时，经常会把树的层次结构转化为序列结构，称为树结构的{\small\bfnew{序列化}}或者{\small\bfnew{线性化}}（Linearization）。比如，使用树的先序遍历就可以得到一个树的序列表示。图\ref{fig:different-representations-of-syntax-tree}就对比了同一棵树的不同表示方式。实际上，树的序列表示是非常适合计算机进行读取和处理的。因此，本章也会使用树的序列化结果来表示句法结构。

 %----------------------------------------------
 % 图
 \begin{figure}[htp]
 \centering
 \input{./Chapter4/Figures/different-representations-of-syntax-tree}
-\caption{基于句法的机器翻译模型的分类}
+\caption{树结构的不同表示形式}
 \label{fig:different-representations-of-syntax-tree}
 \end{figure}
 %-------------------------------------------
@@ -1442,8 +1497,11 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \parinterval 在基于语言学句法的机器翻译中，两个句子间的转化仍然需要使用文法规则进行描述。有两种类型的规则：

 \begin{itemize}
-\item 树到串翻译规则：在树到串、串到树模型中使用；
-\item 树到树翻译规则：在树到树模型中使用。
+\vspace{0.3em}
+\item {\small\bfnew{树到串翻译规则}}（Tree-to-String Translation Rule）：在树到串、串到树模型中使用；
+\vspace{0.3em}
+\item {\small\bfnew{树到树翻译规则}}（Tree-to-Tree Translation Rule）：在树到树模型中使用。
+\vspace{0.3em}
 \end{itemize}

 \parinterval 树到串规则描述了一端是树结构而另一端是串的情况，因此树到串模型和串到树模型都可以使用这种形式的规则。树到树模型需要在两种语言上同时使用句法树结构，需要树到树翻译规则。
@@ -1451,27 +1509,30 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 %%%%%%%%%%%%%%%%%%
 \subsubsection{树到树翻译规则}\index{Chapter4.4.2.1}

-\parinterval 虽然树到串翻译规则和树到树翻译规则蕴含了不同类型的翻译知识，但是它们都在描述一个结构（树/串）到另一个结构（树/串）的映射。这里采用了一种更加通用的文法\ \dash \ 基于树结构的文法\ \dash \ 将树到串翻译规则和树到树翻译规则进行统一，如下：
+\parinterval 虽然树到串翻译规则和树到树翻译规则蕴含了不同类型的翻译知识，但是它们都在描述一个结构（树/串）到另一个结构（树/串）的映射。这里采用了一种更加通用的文法\ \dash \ 基于树结构的文法\ \dash \ 将树到串翻译规则和树到树翻译规则进行统一。定义如下：

 %-------------------------------------------
+\vspace{0.5em}
 \begin{definition} 基于树结构的文法

 {\small
-一个基于树结构的文法由七部分构成$(N_s, N_t, T_s, T_t, I_s, I_t, R)$，其中： \\
-1. $N_s$和$N_t$是源语和目标语非终结符集合。 \\
-2. $T_s$和$T_t$是源语言和目标语终结符集合。 \\
-3. $I_s \subseteq N_s$和$I_t \subseteq N_t$是源语言和目标语起始非终结符集合。 \\
-4. $R$是规则集合，每条规则$r \in R$有如下形式：
+一个基于树结构的文法由七部分构成$(N_s, N_t, T_s, T_t, I_s, I_t, R)$，其中
+\begin{enumerate}
+\item $N_s$和$N_t$是源语言和目标语言非终结符集合；
+\item $T_s$和$T_t$是源语言和目标语言终结符集合；
+\item $I_s \subseteq N_s$和$I_t \subseteq N_t$是源语言和目标语言起始非终结符集合；
+\item $R$是规则集合，每条规则$r \in R$有如下形式：

 \begin{displaymath}
 \langle\  \alpha_h, \beta_h\ \rangle \to \langle\ \alpha_r, \beta_r, \sim\ \rangle
 \end{displaymath}
 其中，规则左部由非终结符$\alpha_h \in N_s$和$\beta_h \in N_t$构成；规则右部由三部分组成，$\alpha_r$表示由源语言终结符和非终结符组成的树结构；$\beta_r$ 表示由目标语言终结符和非终结符组成的树结构；$\sim$表示$\alpha_r$和$\beta_r$中叶子非终结符的1-1对应关系。
+\end{enumerate}
 }
 \end{definition}
 %-------------------------------------------

-\parinterval 基于树结构的规则非常适用于描述树结构到树结构的映射。比如，图\ref{fig:example-of-tree-structure-correspondence}是一个汉语句法树结构到一个英语句法树结构的对应。其中的树结构可以被看作是完整句法树上的一个片段，称为{\small\bfnew{树片段}}（Tree Fragment）。树片段的叶子节点既可以是终结符（单词）也可以是非终结符。当叶子节点为非终结符时，表示这个非终结符会被进一步替换，因此它可以被看作是变量。而源语言树结构和目标语言树结构中的变量是一一对应的，对应关系用虚线表示。
+\parinterval 基于树结构的规则非常适合于描述树结构到树结构的映射。比如，图\ref{fig:example-of-tree-structure-correspondence}是一个汉语句法树结构到一个英语句法树结构的对应。其中的树结构可以被看作是完整句法树上的一个片段，称为{\small\bfnew{树片段}}（Tree Fragment）。树片段的叶子节点既可以是终结符（单词）也可以是非终结符。当叶子节点为非终结符时，表示这个非终结符会被进一步替换，因此它可以被看作是变量。而源语言树结构和目标语言树结构中的变量是一一对应的，对应关系用虚线表示。

 %----------------------------------------------
 % 图
@@ -1491,17 +1552,17 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \sim &=& \{1-2,2-1\} \nonumber
 \end{eqnarray}

-\parinterval 这里，$\alpha_h$和$\beta_h$表示规则的左部，对应树片段的根节点；$\alpha_r$和$\beta_r$是两种语言的树结构（序列化表示），其中标记为$x$的非终结符是变量。$\sim = \{1-2,2-1\}$表示源语言的第一个变量对应目标语言的第二个变量，而源语言的第二个变量对应目标语言的第一个变量，这也反应出两种语言句法结构中的调序现象。有时候为了化简规则的形式，会把规则中变量的对应关系用下标进行表示。比如，上面的规则也可以被写为如下形式。
+\parinterval 这里，$\alpha_h$和$\beta_h$表示规则的左部，对应树片段的根节点；$\alpha_r$和$\beta_r$是两种语言的树结构（序列化表示），其中标记为$x$的非终结符是变量。$\sim = \{1-2,2-1\}$表示源语言的第一个变量对应目标语言的第二个变量，而源语言的第二个变量对应目标语言的第一个变量，这也反映出两种语言句法结构中的调序现象。有时候为了化简规则的形式，会把规则中变量的对应关系用下标进行表示。例如，上面的规则也可以被写为如下形式。
 \begin{eqnarray}
 \langle\ \textrm{VP}, \textrm{VP}\ \rangle\ \to\ \langle\ \textrm{VP(}\textrm{PP}_{1} \ \textrm{VP(VV(表示)}\ \textrm{NN}_{2})),\ \textrm{VP}(\textrm{VBZ(was)}\ \textrm{VP(VBN}_{2} \ \textrm{PP}_{1})) \ \rangle \nonumber
 \end{eqnarray}

-\parinterval 其中，两种语言中变量的对应关系为$\textrm{PP}_1 \leftrightarrow \textrm{PP}_1$，$\textrm{NN}_2 \leftrightarrow \textrm{VBN}_2$。
+\noindent 其中，两种语言中变量的对应关系为$\textrm{PP}_1 \leftrightarrow \textrm{PP}_1$，$\textrm{NN}_2 \leftrightarrow \textrm{VBN}_2$。

 %%%%%%%%%%%%%%%%%%
 \subsubsection{基于树结构的翻译推导}\index{Chapter4.4.2.2}

-\parinterval 规则中的变量预示着一种替换操作，即变量可以被其他树结构替换。实际上，上面的树到树规则就是一种{\small\bfnew{同步树替换文法规则}}（Synchronous Tree Substitution Grammar Rule）。不论是源语言端还是目标语言端，都可以通过这种替换操作不断生成更大的树结构，也就是通过树片段的组合得到更大的树片段。图\ref{fig:operation-of-tree-replace}就展示了树替换操作的一个实例。
+\parinterval 规则中的变量预示着一种替换操作，即变量可以被其他树结构替换。实际上，上面的树到树翻译规则就是一种{\small\bfnew{同步树替换文法规则}}（Synchronous Tree Substitution Grammar Rule）。不论是源语言端还是目标语言端，都可以通过这种替换操作不断生成更大的树结构，也就是通过树片段的组合得到更大的树片段。图\ref{fig:operation-of-tree-replace}就展示了树替换操作的一个实例。

 %----------------------------------------------
 % 图
@@ -1513,7 +1574,7 @@ h_i (d,\textbf{t},\textbf{s})=\sum_{r \in d}h_i (r)
 \end{figure}
 %-------------------------------------------

-\parinterval 这种方法也可以推广到双语的情况。图\ref{fig:based-on-tree-structure-generate-sentence-pairs}给出了一个使用基于树结构的同步文法生成双语句对的实例。其中，每条规则都同时对应源语言和目标语言的一个树片段（用矩形表示）。变量部分可以被替换，这个过程不断执行。最后，四条规则组合在一起形成了源语言和目标语言的句法树。这个过程也被称作规则的推导。
+\parinterval 这种方法也可以被扩展到双语的情况。图\ref{fig:based-on-tree-structure-generate-sentence-pairs}给出了一个使用基于树结构的同步文法生成双语句对的实例。其中，每条规则都同时对应源语言和目标语言的一个树片段（用矩形表示）。变量部分可以被替换，这个过程不断执行。最后，四条规则组合在一起形成了源语言和目标语言的句法树。这个过程也被称作规则的推导。

 %----------------------------------------------
 % 图
@@ -1556,7 +1617,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 \end{eqnarray}
 }

-\parinterval 其中，箭头$\rightarrow$表示推导之意。显然，可以把翻译看作是基于树结构的推导过程（记为$d$）。因此，与层次短语模型一样，基于语言学句法的机器翻译也是要找到最佳的推导$\hat{d} = \arg\max\textrm{P}(d)$。
+\noindent 其中，箭头$\rightarrow$表示推导之意。显然，可以把翻译看作是基于树结构的推导过程（记为$d$）。因此，与层次短语模型一样，基于语言学句法的机器翻译也是要找到最佳的推导$\hat{d} = \arg\max\textrm{P}(d)$。

 %%%%%%%%%%%%%%%%%%
 \subsubsection{树到串翻译规则}\index{Chapter4.4.2.3}
@@ -1604,22 +1665,26 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 \parinterval 本节首先介绍树到串文法归纳的经典方法 —— GHKM方法\cite{galley2004s,galley2006scalable}。所谓GHKM是四位作者名字的首字母。GHKM方法的输入包括：

 \begin{itemize}
+\vspace{0.3em}
 \item 源语言句子及其句法树；
+\vspace{0.3em}
 \item 目标语言句子；
+\vspace{0.3em}
 \item 源语言句子和目标语言句子之间的词对齐。
+\vspace{0.3em}
 \end{itemize}

-\parinterval 它的输出是这个句对上的树到串翻译规则。GHKM不是一套单一的算法，它还包括很多技术手段用于增加规则的覆盖度和准确性。下面就具体看看GHKM是如何工作的。
+\parinterval 它的输出是这个双语句对上的树到串翻译规则。GHKM不是一套单一的算法，它还包括很多技术手段用于增加规则的覆盖度和准确性。下面就具体看看GHKM是如何工作的。

 %%%%%%%%%%%%%%%%%%
 \subsubsection{树的切割与最小规则}\index{Chapter4.4.3.1}

-\parinterval 获取树到串规则就是要找到在源语言树片段与目标语串的对应关系。一棵句法树会有很多个树片段，那么哪些树片段可以和目标语言词串产生对应关系呢？在GHKM方法中，源语言树片段和目标语言词串的对应是由词对齐决定的。GHKM假设：一个合法的树到串翻译规则，不应该违反词对齐。这个假设和双语短语抽取中的词对齐一致性约束是一样的。简单来说，两个互相对应的部分不应包含对齐到外部的词对齐连接。为了说明这个问题，来看一个例子。图\ref{fig:example-of-tree-to-string-rule-and-word-alignment}包含了一棵句法树、一个词串和它们之间的词对齐结果。图中包含如下规则：
+\parinterval 获取树到串规则就是要找到源语言树片段与目标语言词串之间的对应关系。一棵句法树会有很多个树片段，那么哪些树片段可以和目标语言词串产生对应关系呢？在GHKM方法中，源语言树片段和目标语言词串的对应是由词对齐决定的。GHKM假设：一个合法的树到串翻译规则，不应该违反词对齐。这个假设和双语短语抽取中的词对齐一致性约束是一样的（见\ref{subsection-4.2.3}节）。简单来说，规则中两种语言互相对应的部分不应包含对齐到外部的词对齐连接。为了说明这个问题，来看一个例子。图\ref{fig:example-of-tree-to-string-rule-and-word-alignment}包含了一棵句法树、一个词串和它们之间的词对齐结果。图中包含如下规则：
 \begin{eqnarray}
 \textrm{PP(P(对)}\ \textrm{NP(NN(回答)))} \rightarrow \textrm{with}\ \textrm{the}\ \textrm{answer} \nonumber
 \end{eqnarray}

-\parinterval 该规则是一个条满足词对齐约束的规则（对应于图\ref{fig:example-of-tree-to-string-rule-and-word-alignment}中红色部分），因为不存在从规则的源语言或目标语言部分对齐到规则外部的情况。但是，如下的规则却是一条不合法的规则：
+\parinterval 该规则是一条满足词对齐约束的规则（对应于图\ref{fig:example-of-tree-to-string-rule-and-word-alignment}中红色部分），因为不存在从规则的源语言或目标语言部分对齐到规则外部的情况。但是，如下的规则却是一条不合法的规则：
 \begin{eqnarray}
 \textrm{NN(满意)} \rightarrow \textrm{satisfied} \nonumber
 \end{eqnarray}
@@ -1639,37 +1704,40 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 \parinterval 为了能够获得与词对齐相兼容的规则，GHKM引入了几个概念。首先，GHKM定义了Span和Complement Span：

 %-------------------------------------------
+\vspace{0.5em}
 \begin{definition} Span

 {\small
-对于一个源语言句法树节点，它的Span是这个节点所对应到目标语的第一个单词和最后一个单词所构成的索引范围。
+对于一个源语言句法树节点，它的Span是这个节点所对应到的目标语言第一个单词和最后一个单词所构成的索引范围。
 }
 \end{definition}
 %-------------------------------------------

 %-------------------------------------------
+\vspace{0.5em}
 \begin{definition} Complement Span

 {\small
-对于一个源语言句法树节点，它的Complement Span是除了它的祖先和子孙阶段外的其他节点Span的并集。
+对于一个源语言句法树节点，它的Complement Span是除了它的祖先和子孙节点外的其他节点Span的并集。
 }
 \end{definition}
 %-------------------------------------------

-\parinterval Span定义了每个节点覆盖的源语言片段所对应的目标语片段。实际上，它表示了目标语句子上的一个跨度，这个跨度代表了这个源语言句法树节点所能达到的最大范围。因此Span实际上是一个目标语单词索引的范围。Complement Span是与Span相对应的一个概念，它定义了句法树中一个节点之外的部分对应到目标语的范围，但是这个范围并不必须是连续的。
+\parinterval Span定义了每个节点覆盖的源语言片段所对应的目标语言片段。实际上，它表示了目标语言句子上的一个跨度，这个跨度代表了这个源语言句法树节点所能达到的最大范围。因此Span实际上是一个目标语言单词索引的范围。Complement Span是与Span相对应的一个概念，它定义了句法树中一个节点之外的部分对应到目标语言的范围，但是这个范围并不必须是连续的。

 \parinterval 有了Span和Complement Span的定义之后，可以进一步定义：

 %-------------------------------------------
+\vspace{0.5em}
 \begin{definition} 可信节点(Admissible Node)

 {\small
-对于源语言树节点$n$，如果它的Span和Complement Span不相交，节点$n$就是一个可信节点，否则是一个不可信节点。
+对于源语言树节点$node$，如果它的Span和Complement Span不相交，节点$node$就是一个可信节点，否则是一个不可信节点。
 }
 \end{definition}
 %-------------------------------------------

-\parinterval 可信节点表示这个树节点$n$和树中的其他部分（不包括$n$的祖先和孩子）没有任何词对齐上的歧义。也就是说，这个节点可以完整的对应到目标语的一个连续范围，不会出现在这个目标语范围中的词对应到其他节点的情况。如果节点不是可信节点，则表示它会引起词对齐的歧义，因此不能作为树到串规则中源语言树片段的根节点或者变量部分。图\ref{fig:syntax-tree-with-admissible-node}给出了一个可信节点的实例。
+\parinterval 可信节点表示这个树节点$node$和树中的其他部分（不包括$node$的祖先和子孙节点）没有任何词对齐上的歧义。也就是说，这个节点可以完整地对应到目标语言句子的一个连续范围，不会出现在这个范围中的词对应到其他节点的情况。如果节点不是可信节点，则表示它会引起词对齐的歧义，因此不能作为树到串规则中源语言树片段的根节点或者变量部分。图\ref{fig:syntax-tree-with-admissible-node}给出了一个可信节点的实例。

 %----------------------------------------------
 % 图
@@ -1684,6 +1752,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 \parinterval 进一步，可以定义树到串模型中合法的树片段：

 %-------------------------------------------
+\vspace{0.5em}
 \begin{definition} 合法的树片段

 {\small
@@ -1697,14 +1766,14 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 \textrm{VP(PP(P(对)}\ \textrm{NP(NN(回答)))}\ \textrm{VP}_1) \rightarrow \textrm{VP}_1\ \textrm{with}\ \textrm{the}\ \textrm{answer} \nonumber
 \end{eqnarray}

-\parinterval 其中蓝色部分表示可以抽取到的规则，显然它的根节点和叶子非终结符节点都是可信节点。由于源语言树片段中包含一个变量（VP），因此需要对VP节点的Span所表示的目标语范围进行泛化（红色方框部分）。
+\noindent 其中，蓝色部分表示可以抽取到的规则，显然它的根节点和叶子非终结符节点都是可信节点。由于源语言树片段中包含一个变量（VP），因此需要对VP节点的Span所表示的目标语言范围进行泛化（红色方框部分）。

 %----------------------------------------------
 % 图
 \begin{figure}[htp]
 \centering
 \input{./Chapter4/Figures/translation-rule-based-on-admissible-node}
-\caption{根据可信结点得到的翻译规则}
+\caption{根据可信节点得到的树到串翻译规则}
 \label{fig:translation-rule-based-on-admissible-node}
 \end{figure}
 %-------------------------------------------
@@ -1740,7 +1809,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 %%%%%%%%%%%%%%%%%%
 \subsubsection{空对齐处理}\index{Chapter4.4.3.2}

+\parinterval 空对齐是翻译中的常见现象。比如，一些虚词经常找不到在另一种语言中的对应，因此不会被翻译，这种情况也被称作空对齐。例如，在图\ref{fig:minimum-rule-from-tree-cutting}中目标语言句子中的``was''就是一个空对齐单词。空对齐的使用可以大大增加翻译的灵活度。具体到树到串规则抽取任务，需要把空对齐考虑进来，这样能够覆盖更多的语言现象。
+\parinterval 空对齐是翻译中的常见现象。比如，一些虚词经常找不到在另一种语言中的对应，因此不会被翻译，这种情况也被称作空对齐。比如，在图\ref{fig:minimum-rule-from-tree-cutting}中目标语中的``was''就是一个空对齐单词。空对齐的使用可以大大增加翻译的灵活度。具体到树到串规则抽取任务，需要把空对齐考虑进来，这样能够覆盖更多的语言现象。

 \parinterval 处理空对齐单词的手段非常简单。只需要把空对齐单词附着在它周围的规则上即可。也就是，检查每条最小规则，如果空对齐单词能够作为规则的一部分进行扩展，就可以生成一条新的规则。图\ref{fig:tree-to-string-rule-empty-alignment}展示了前面例子中``was''被附着在周围的规则上的结果。其中，含有红色``was''的规则是通过附着空对齐单词得到的新规则。比如，对于规则：
 \begin{eqnarray}
@@ -1797,7 +1866,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 \textrm{对}\ \textrm{形式} \rightarrow \textrm{about}\ \textrm{the}\ \textrm{situation} \nonumber
 \end{eqnarray}

-\parinterval 然后，从这个短语出发向上搜索，找到覆盖这个短语的最小树片段，之后生成规则即可。在这个例子中可以使用SPMT规则：
+\parinterval 然后，从这个短语出发向上搜索，找到覆盖这个短语的最小树片段，之后生成规则即可。在这个例子中可以得到SPMT规则：
 \begin{eqnarray}
 \textrm{VP(P(对)}\ \textrm{NP(NN(局势))}\ \textrm{VP}_1) \rightarrow \textrm{VP}_1\ \textrm{about}\ \textrm{the}\ \textrm{situation} \nonumber
 \end{eqnarray}
@@ -1817,7 +1886,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 %%%%%%%%%%%%%%%%%%
 \subsubsection{句法树二叉化}\index{Chapter4.4.3.5}

-\parinterval 句法树是使用人类语言学知识归纳出来的一种解释句子结构的工具。比如， CTB、PTB等语料就是常用的训练句法分析器的数据。但是，这些数据的标注中会含有大量的偏平结构，如图\ref{fig:syntax-tree-in-ctb}所示，多个分句可能会导致一个根节点下有很多个分支。
+\parinterval 句法树是使用人类语言学知识归纳出来的一种解释句子结构的工具。比如，CTB、PTB等语料就是常用的训练句法分析器的数据（{\red 参考文献！}）。但是，这些数据的标注中会含有大量的扁平结构，如图\ref{fig:syntax-tree-in-ctb}所示，多个分句可能会导致一个根节点下有很多个分支。

 %----------------------------------------------
 % 图
@@ -1850,7 +1919,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 && \textrm{NP-BAR(}\textrm{NN}_1\ \textrm{NP-}\textrm{BAR}_2) \rightarrow \textrm{NN}_1\ \textrm{NP-}\textrm{BAR}_2 \nonumber
 \end{eqnarray}

-\parinterval 由于树二叉化可以帮助规则抽取得到更细颗粒度的规则，提高规则抽取的召回率，因此成为了基于句法的机器翻译中的常用方法。二叉化方法也有很多不同的实现策略，比如：左二叉化、右二叉化、基于中心词的二叉化等\cite{Tong2009Better}。具体实现时可以根据实际情况进行选择。
+\parinterval 由于树二叉化可以帮助规则抽取得到更细颗粒度的规则，提高规则抽取的召回率，因此成为了基于句法的机器翻译中的常用方法。二叉化方法也有很多不同的实现策略，比如：左二叉化、右二叉化、基于中心词的二叉化等（{\red 再引用Hao Zhang的论文，Synchronous Binarization}）\cite{Tong2009Better}。具体实现时可以根据实际情况进行选择。

 %----------------------------------------------
 % 图
@@ -1876,7 +1945,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
 \textrm{VP(}\textrm{PP}_1\ \textrm{VP(VV(表示)}\ \textrm{NN}_2\textrm{))} \rightarrow \textrm{VP(VBZ(was)}\ \textrm{VP(}\textrm{VBN}_2\ \textrm{PP}_1\textrm{))} \nonumber
 \end{eqnarray}

-\parinterval 其中，规则的左部是源语言句法树结构，右部是目标语言句法树结构，变量的下标表示对应关系。为了获取这样的规则，需要进行树到树规则抽取。最直接的办法是把GHKM方法推广到树到树翻译的情况。比如，可以利用双语结构的约束和词对齐，定义树的切割点，之后找到两种语言树结构的映射关系\cite{liu2009improving}。
+\noindent 其中，规则的左部是源语言句法树结构，右部是目标语言句法树结构，变量的下标表示对应关系。为了获取这样的规则，需要进行树到树规则抽取。最直接的办法是把GHKM方法推广到树到树翻译的情况。比如，可以利用双语结构的约束和词对齐，定义树的切割点，之后找到两种语言树结构的映射关系\cite{liu2009improving}。

 %%%%%%%%%%%%%%%%%%
 \subsubsection{基于节点对齐的规则抽取}\index{Chapter4.4.4.1}

--- a/Book/Chapter5/Figures/fig-bert.tex
+++ b/Book/Chapter5/Figures/fig-bert.tex
@@ -9,7 +9,7 @@
 \node [anchor=west,inner sep=4pt] (sep) at ([xshift=1em]Trm3.east) {\scriptsize{...}};
 \node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm4) at ([xshift=1em]sep.east) {\scriptsize{TRM}};

-\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm5) at ([yshift=1em]Trm0.north) {\scriptsize{TRM}};
+\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm5) at ([yshift=1.2em]Trm0.north) {\scriptsize{TRM}};
 \node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm6) at ([xshift=1em]Trm5.east) {\scriptsize{TRM}};
 \node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm7) at ([xshift=1em]Trm6.east) {\scriptsize{TRM}};
 \node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm8) at ([xshift=1em]Trm7.east) {\scriptsize{TRM}};
@@ -18,22 +18,28 @@

 \node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Trm0) (Trm4) (Trm5) (Trm9)] (inputshadow) {};

-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([yshift=-1em]Trm0.south) {\scriptsize{$\textbf{e}_1$}};
-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([yshift=-1em]Trm1.south) {\scriptsize{$\textbf{e}_2$}};
-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([yshift=-1em]Trm2.south) {\scriptsize{$\textbf{e}_3$}};
-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([yshift=-1.2em]Trm0.south) {\scriptsize{$\textbf{e}_1$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([yshift=-1.2em]Trm1.south) {\scriptsize{$\textbf{e}_2$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([yshift=-1.2em]Trm2.south) {\scriptsize{$\textbf{e}_3$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1.2em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
 \node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1.2em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};
+\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
+\node [anchor=south] (word2) at ([yshift=-1.7em]e2.south) {\footnotesize {[MASK]}};
+\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {a}};
+\node [anchor=south] (word4) at ([yshift=-1.5em]e4.south) {\footnotesize {time}};
+\node [anchor=south] (wordseq) at ([yshift=-2.0em]sep5.south) {\footnotesize{...}};
+\node [anchor=south] (word5) at ([yshift=-1.5em]e5.south) {\footnotesize {island}};

-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([yshift=1em]Trm7.north) {\scriptsize{$\textbf{h}_3$}};
-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t4) at ([yshift=1em]Trm8.north) {\scriptsize{$\textbf{h}_4$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1.2em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1.2em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([yshift=1.2em]Trm7.north) {\scriptsize{$\textbf{h}_3$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t4) at ([yshift=1.2em]Trm8.north) {\scriptsize{$\textbf{h}_4$}};
 \node [anchor=south,inner sep=4pt] (sep6) at ([yshift=1em]sep1.north) {\scriptsize{...}};
-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1.2em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};

-\node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.5em]t1.west) {\tiny{TRM}};
-\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\scriptsize{: Transformer Block}};
+\node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.2em]t1.west) {\tiny{TRM}};
+\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\tiny{: Transformer}};

 \draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm0.south);
 \draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.south);
@@ -51,7 +57,6 @@

 \draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm5.south);
 \draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm6.south);
-\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm7.south);
 \draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm8.south);
 \draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm9.south);
 \draw [->] ([yshift=0.1em]Trm1.north) -- ([yshift=-0.1em]Trm6.south);

--- a/Book/Chapter5/Figures/fig-code-back-propagation-1.tex
+++ b/Book/Chapter5/Figures/fig-code-back-propagation-1.tex
@@ -51,11 +51,6 @@
 \node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\scriptsize{h2 = Relu(h1 * w2)}};
 \node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\scriptsize{h3 = h2 + h1}};

-{\draw [->,thick] (h1.north) -- (h2.south);}
-{\draw [->,thick] (h2.north) -- (h3.south);}
-{\draw [->,thick] (h3.north) -- (h4.south);}
-{\draw [->,thick,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
-
 {\draw [<-,very thick,red] (h1.north) -- (h2.south);}
 {\draw [<-,very thick,red] (h2.north) -- (h3.south);}
 {\draw [<-,very thick,red] (h3.north) -- (h4.south);}
@@ -64,11 +59,11 @@
 \node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.5em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
 \node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};

-{\draw [->,thick] (h4.north) -- (slayer.south);}
 {\draw [<-,very thick,red] (h4.north) -- (slayer.south);}

 \end{tikzpicture}
 \end{center}
 \end{tcolorbox}
+
 %%%------------------------------------------------------------------------------------------------------------

--- a/Book/Chapter5/Figures/fig-code-back-propagation-2.tex
+++ b/Book/Chapter5/Figures/fig-code-back-propagation-2.tex
 %%%------------------------------------------------------------------------------------------------------------
- \begin{tcolorbox}
+\begin{tcolorbox}
 [bicolor,sidebyside,width=13cm,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
 \begin{tabbing}
-\texttt{XTensor x, y, gold, h[5], w[5], s[5];} \\
-\texttt{XTensor dh[5], dw[5], ds[5];} \\
-\texttt{...} // 前向过程 \\
-\texttt{h[0] = x;} \\
-\texttt{y = h[4];} \\
-
+\texttt{XTensor x, loss, gold, h[5], w[5], b[5];} \\
+\texttt{...} \\

 \texttt{} \\
-\texttt{CrossEntropyBackward(dh[4], y, gold);} \\
-\texttt{SoftmaxBackward(y, s[4], dh[4], ds[4]);}\\
-\texttt{MMul(h[3], {\scriptsize X\_TRANS}, ds[4], {\scriptsize X\_NOTRANS}, dw[4]);}\\
-\texttt{MMul(ds[4], {\scriptsize X\_NOTRANS}, w[4], {\scriptsize X\_RANS}, dh[3]);}\\
-
-
+\texttt{h[1] = Relu(MMul(x, w[1]) + b[1]);} \\
+\texttt{h[2] = Relu(MMul(h[1], w[2]) + b[2]);} \\
+\texttt{h[3] = HardTanH(h[2]);} \\
+\texttt{h[4] = Softmax(MMul(h[3], w[3]));} \\
+\texttt{loss = CrossEntropy(h[4], gold);} \\

 \texttt{} \\
-\texttt{dh[2] = dh[3];}\\
-\texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);}\\
-\texttt{MMul(h[1], {\scriptsize X\_TRANS}, ds[2], {\scriptsize X\_NOTRANS}, dw[2]);}\\
-\texttt{MMul(ds[2], {\scriptsize X\_NOTRANS}, w[2], {\scriptsize X\_TRANS}, dh[2]);}\\
-
-
+\texttt{XNet net;}\\
+{\texttt{net.Backward(loss);} //一行代码实现自动微分}\\

 \texttt{} \\
-\texttt{dh[1] = dh[1] + dh[3];}\\
-
-
-
-\texttt{...} // 继续反向传播 \\
-\texttt{} \\
 \texttt{for(unsigned i = 0; i < 5; i++)\{} \\
-\texttt{} \ \ \ \ ... // 通过{\texttt{dw[i]}}访问参数的梯度\\
+\texttt{} \ \ \ \ ... // 通过{\texttt{w[i].grad}}访问参数的梯度\\
 \texttt{\}}

-
 \end{tabbing}
 }
 \tcblower
@@ -46,26 +30,19 @@
 \begin{tikzpicture}


-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\scriptsize{x (input)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\scriptsize{h1 = Relu(x * w1)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\scriptsize{h2 = Relu(h1 * w2)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\scriptsize{h3 = h2 + h1}};
-
-{\draw [->,thick] (h1.north) -- (h2.south);}
-{\draw [->,thick] (h2.north) -- (h3.south);}
-{\draw [->,thick] (h3.north) -- (h4.south);}
-{\draw [->,thick,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.0em]h1.north) {\tiny{h1 = Relu(x * w1 + b1)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.0em]h2.north) {\tiny{h2 = Relu(h1 * w2 + b2)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.0em]h3.north) {\tiny{h3 = HardTanh(h2)}};

-{\draw [<-,very thick,red] (h1.north) -- (h2.south);}
-{\draw [<-,very thick,red] (h2.north) -- (h3.south);}
-{\draw [<-,very thick,red] (h3.north) -- (h4.south);}
-{\draw [<-,very thick,red,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
+\draw [->,thick] (h1.north) -- (h2.south);
+\draw [->,thick] (h2.north) -- (h3.south);
+\draw [->,thick] (h3.north) -- (h4.south);

-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.5em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.0em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
 \node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};

-{\draw [->,thick] (h4.north) -- (slayer.south);}
-{\draw [<-,very thick,red] (h4.north) -- (slayer.south);}
+\draw [->,thick] (h4.north) -- (slayer.south);

 \end{tikzpicture}
 \end{center}

--- a/Book/Chapter5/Figures/fig-elmo.tex
+++ b/Book/Chapter5/Figures/fig-elmo.tex
@@ -18,6 +18,10 @@
 \node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([xshift=1em]e1.east) {\scriptsize{$\textbf{e}_2$}};
 \node [anchor=west,inner sep=4pt] (sep5) at ([xshift=1em]e2.east) {\scriptsize{...}};
 \node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([xshift=1em]sep5.east) {\scriptsize{$\textbf{e}_m$}};
+\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
+\node [anchor=south] (word2) at ([yshift=-1.6em]e2.south) {\footnotesize {upon}};
+\node [anchor=south] (wordseq) at ([yshift=-1.5em]sep5.south) {\footnotesize{...}};
+\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {island}};

 \node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([xshift=-2em,yshift=1em]Lstm5.north) {\scriptsize{$\textbf{h}_1$}};
 \node [anchor=west,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([xshift=1em]t1.east) {\scriptsize{$\textbf{h}_2$}};
@@ -48,31 +52,31 @@
 \draw [->] ([yshift=0.1em]Lstm1.north) -- ([yshift=-0.1em]Lstm4.south);
 \draw [->] ([yshift=0.1em]Lstm2.north) -- ([yshift=-0.1em]Lstm5.south);

-\draw [->] ([xshift=0.1em]Lstm6.east) -- ([xshift=-0.1em]Lstm7.west);
-\draw [->] ([xshift=0.1em]Lstm7.east) -- ([xshift=0.1em]sep3.west);
-\draw [->] ([xshift=-0.1em]sep3.east) -- ([xshift=-0.1em]Lstm8.west);
+\draw [->] ([xshift=-0.1em]Lstm7.west) -- ([xshift=0.1em]Lstm6.east);
+\draw [->] ([xshift=0.1em]sep3.west) -- ([xshift=0.1em]Lstm7.east);
+\draw [->] ([xshift=-0.1em]Lstm8.west) -- ([xshift=-0.1em]sep3.east);

-\draw [->] ([xshift=0.1em]Lstm9.east) -- ([xshift=-0.1em]Lstm10.west);
-\draw [->] ([xshift=0.1em]Lstm10.east) -- ([xshift=0.1em]sep4.west);
-\draw [->] ([xshift=-0.1em]sep4.east) -- ([xshift=-0.1em]Lstm11.west);
+\draw [->] ([xshift=-0.1em]Lstm10.west) -- ([xshift=0.1em]Lstm9.east);
+\draw [->] ([xshift=0.1em]sep4.west) -- ([xshift=0.1em]Lstm10.east);
+\draw [->] ([xshift=-0.1em]Lstm11.west) -- ([xshift=-0.1em]sep4.east);

 \draw [->] ([yshift=0.1em]Lstm6.north) -- ([yshift=-0.1em]Lstm9.south);
 \draw [->] ([yshift=0.1em]Lstm7.north) -- ([yshift=-0.1em]Lstm10.south);
 \draw [->] ([yshift=0.1em]Lstm8.north) -- ([yshift=-0.1em]Lstm11.south);

-\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Lstm0.south);
-\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Lstm6.south);
-\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Lstm1.south);
-\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Lstm7.south);
-\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Lstm2.south);
-\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Lstm8.south);
-
-\draw [->] ([yshift=0.1em]Lstm3.north) -- ([xshift=-0.05em,yshift=-0.1em]t1.south);
-\draw [->] ([yshift=0.1em]Lstm9.north) -- ([yshift=-0.1em]t1.south);
-\draw [->] ([yshift=0.1em]Lstm4.north) -- ([xshift=-0.05em,yshift=-0.1em]t2.south);
-\draw [->] ([yshift=0.1em]Lstm10.north) -- ([yshift=-0.1em]t2.south);
-\draw [->] ([yshift=0.1em]Lstm5.north) -- ([xshift=-0.05em,yshift=-0.1em]t3.south);
-\draw [->] ([yshift=0.1em]Lstm11.north) -- ([yshift=-0.1em]t3.south);
+\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.12em]Lstm0.south);
+\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.12em]Lstm6.south);
+\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.12em]Lstm1.south);
+\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.12em]Lstm7.south);
+\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.12em]Lstm2.south);
+\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.12em]Lstm8.south);
+
+\draw [->] ([yshift=0.1em]Lstm3.north) -- ([xshift=-0.05em,yshift=-0.12em]t1.south);
+\draw [->] ([yshift=0.1em]Lstm9.north) -- ([yshift=-0.12em]t1.south);
+\draw [->] ([yshift=0.1em]Lstm4.north) -- ([xshift=-0.05em,yshift=-0.12em]t2.south);
+\draw [->] ([yshift=0.1em]Lstm10.north) -- ([yshift=-0.12em]t2.south);
+\draw [->] ([yshift=0.1em]Lstm5.north) -- ([xshift=-0.05em,yshift=-0.12em]t3.south);
+\draw [->] ([yshift=0.1em]Lstm11.north) -- ([yshift=-0.12em]t3.south);

 \end{scope}
 \end{tikzpicture}

--- a/Book/Chapter5/Figures/fig-fit.tex
+++ b/Book/Chapter5/Figures/fig-fit.tex
@@ -32,12 +32,12 @@
 \draw [->,thick] ([yshift=0.1em]n20.north) -- (y.south);

 %% weight and bias
-{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (b0) at ([yshift=3em,xshift=-0.5em]b.north) {\tiny{$b=-6$}};}
-{\node [anchor=center,rotate=-59,fill=white,inner sep=1pt] (w2) at ([yshift=1.2em,xshift=-1.2em]x1.north) {\tiny{$w=100$}};}
-{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=2em,xshift=0.5em]n10.north) {\tiny{$w'=-0.7$}};}
-{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=2em,xshift=-0.5em]n11.north) {\tiny{$w'=0.7$}};}
-{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=4.9em,xshift=2.2em]b.north) {\tiny{$b=-4$}};}
-{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w=100$}};}
+{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (b0) at ([yshift=3em,xshift=-0.5em]b.north) {\tiny{$b_1=-6$}};}
+{\node [anchor=center,rotate=-59,fill=white,inner sep=1pt] (w2) at ([yshift=1.2em,xshift=-1.2em]x1.north) {\tiny{$w_1=100$}};}
+{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=5.1em,xshift=2.3em]b.north) {\tiny{$b_2=-4$}};}
+{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w_2=100$}};}
+{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=1.8em,xshift=0.2em]n10.north) {\tiny{$w'_1=-0.7$}};}
+{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=1.8em,xshift=-0.2em]n11.north) {\tiny{$w'_2=0.7$}};}

 %% sigmoid box
 \begin{scope}
@@ -120,18 +120,18 @@
 \node [] (y) at ([yshift=3em]n20.north) {$y$};
 \draw [->,thick] ([yshift=0.1em]n20.north) -- (y.south);
 {
-\draw [->,thick] ([yshift=0.1em]n12.north) -- ([yshift=-0.1em]n20.310);
-\draw [->,thick] ([yshift=0.1em]n13.north) -- ([yshift=-0.1em]n20.330);
+\draw [->,thick] ([yshift=0.1em]n12.north) -- ([yshift=-0.1em]n20.330);
+\draw [->,thick] ([yshift=0.1em]n13.north) -- ([yshift=-0.1em]n20.340);
 }


 %% weight and bias
-{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (b0) at ([yshift=3em,xshift=-0.5em]b.north) {\tiny{$b=-6$}};}
-{\node [anchor=center,rotate=-59,fill=white,inner sep=1pt] (w2) at ([yshift=1.2em,xshift=-1.2em]x1.north) {\tiny{$w=100$}};}
-{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=4.9em,xshift=2.2em]b.north) {\tiny{$b=-4$}};}
-{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w=100$}};}
-{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=2em,xshift=0.5em]n10.north) {\tiny{$w'=-0.7$}};}
-{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=2em,xshift=-0.5em]n11.north) {\tiny{$w'=0.7$}};}
+{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (b0) at ([yshift=3em,xshift=-0.5em]b.north) {\tiny{$b_1=-6$}};}
+{\node [anchor=center,rotate=-59,fill=white,inner sep=1pt] (w2) at ([yshift=1.2em,xshift=-1.2em]x1.north) {\tiny{$w_1=100$}};}
+{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=5.1em,xshift=2.3em]b.north) {\tiny{$b_2=-4$}};}
+{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w_2=100$}};}
+{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=1.8em,xshift=0.2em]n10.north) {\tiny{$w'_1=-0.7$}};}
+{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=1.8em,xshift=-0.2em]n11.north) {\tiny{$w'_2=0.7$}};}

 %% sigmoid box
 \begin{scope}
@@ -179,6 +179,8 @@

 \end{scope}
 \end{tikzpicture}
+
+
 %%%------------------------------------------------------------------------------------------------------------



--- a/Book/Chapter5/Figures/fig-four-layers-of-neural-network.tex
+++ b/Book/Chapter5/Figures/fig-four-layers-of-neural-network.tex
@@ -14,7 +14,7 @@

 \foreach \n in {1,...,5}{
    \foreach \m in {1,...,5}{
-        \draw [<-] (neuron0\m.south) -- ([yshift=-1.8em]neuron0\n.south);
+        \draw [<-] ([yshift=-0.1em]neuron0\m.south) -- ([yshift=-1.8em]neuron0\n.south);
    }
    \node [anchor=north] (x\n) at ([yshift=-1.8em]neuron0\n.south) {$x_\n$};
 }
@@ -25,13 +25,13 @@
 \node [rectangle,inner sep=0.2em,fill=red!20] [fit = (neuron01) (neuron05)] (layer01) {};
 \end{pgfonlayer}

-\node [anchor=west] (layer00label) at ([xshift=1.25em]x5.east) {\footnotesize{{输入层}}};
+\node [anchor=west] (layer00label) at ([xshift=1.25em]x5.east) {\footnotesize{\red{{输入层}}}};

 {
 \node [anchor=west] (layer01label) at ([xshift=1em]layer01.east) {\footnotesize{第二层}};
 }
 {
-\node [anchor=west] (layer01label2) at (layer01label.east) {\footnotesize{(隐层)}};
+\node [anchor=west] (layer01label2) at (layer01label.east) {\footnotesize{\red{({隐层})}}};
 }

 %%% layer 2
@@ -43,7 +43,7 @@

 \foreach \n in {2,...,4}{
    \foreach \m in {1,...,5}{
-        \draw [<-] (neuron1\n.south) -- (neuron0\m.north);
+        \draw [<-] ([yshift=-0.1em]neuron1\n.south) -- (neuron0\m.north);
    }
 }

@@ -57,7 +57,7 @@

 \node [anchor=west] (layer02label) at ([xshift=4.5em]layer02.east) {\footnotesize{第三层}};
 {
-\node [anchor=west] (layer02label2) at (layer02label.east) {\footnotesize{({隐层})}};
+\node [anchor=west] (layer02label2) at (layer02label.east) {\footnotesize{\red{({隐层})}}};
 }
 }

@@ -70,7 +70,7 @@

 \foreach \n in {1,...,5}{
    \foreach \m in {2,...,4}{
-        \draw [<-] (neuron2\n.south) -- (neuron1\m.north);
+        \draw [<-] ([yshift=-0.1em]neuron2\n.south) -- (neuron1\m.north);
    }

    \node [anchor=south] (y\n) at ([yshift=1.2em]neuron2\n.north) {$y_\n$};
@@ -87,7 +87,7 @@

 \node [anchor=west] (layer03label) at ([xshift=1em]layer03.east) {\footnotesize{第四层}};
 {
-\node [anchor=west] (layer03label2) at (layer03label.east) {\footnotesize{({输出层})}};
+\node [anchor=west] (layer03label2) at (layer03label.east) {\footnotesize{\red{({输出层})}}};
 }
 }


--- a/Book/Chapter5/Figures/fig-gpt.tex
+++ b/Book/Chapter5/Figures/fig-gpt.tex
@@ -9,7 +9,7 @@
 \node [anchor=west,inner sep=4pt] (sep) at ([xshift=1em]Trm3.east) {\scriptsize{...}};
 \node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm4) at ([xshift=1em]sep.east) {\scriptsize{TRM}};

-\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm5) at ([yshift=1em]Trm0.north) {\scriptsize{TRM}};
+\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm5) at ([yshift=1.2em]Trm0.north) {\scriptsize{TRM}};
 \node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm6) at ([xshift=1em]Trm5.east) {\scriptsize{TRM}};
 \node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm7) at ([xshift=1em]Trm6.east) {\scriptsize{TRM}};
 \node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm8) at ([xshift=1em]Trm7.east) {\scriptsize{TRM}};
@@ -18,44 +18,51 @@

 \node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Trm0) (Trm4) (Trm5) (Trm9)] (inputshadow) {};

-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([yshift=-1em]Trm0.south) {\scriptsize{$\textbf{e}_1$}};
-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([yshift=-1em]Trm1.south) {\scriptsize{$\textbf{e}_2$}};
-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([yshift=-1em]Trm2.south) {\scriptsize{$\textbf{e}_3$}};
-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([yshift=-1.2em]Trm0.south) {\scriptsize{$\textbf{e}_1$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([yshift=-1.2em]Trm1.south) {\scriptsize{$\textbf{e}_2$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([yshift=-1.2em]Trm2.south) {\scriptsize{$\textbf{e}_3$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1.2em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
 \node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
-\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};
+\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1.2em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};
+\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
+\node [anchor=south] (word2) at ([yshift=-1.6em]e2.south) {\footnotesize {upon}};
+\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {a}};
+\node [anchor=south] (word4) at ([yshift=-1.5em]e4.south) {\footnotesize {time}};
+\node [anchor=south] (wordseq) at ([yshift=-2.0em]sep5.south) {\footnotesize{...}};
+\node [anchor=south] (word5) at ([yshift=-1.5em]e5.south) {\footnotesize {island}};

-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([yshift=1em]Trm7.north) {\scriptsize{$\textbf{h}_3$}};
-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t4) at ([yshift=1em]Trm8.north) {\scriptsize{$\textbf{h}_4$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1.2em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1.2em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([yshift=1.2em]Trm7.north) {\scriptsize{$\textbf{h}_3$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t4) at ([yshift=1.2em]Trm8.north) {\scriptsize{$\textbf{h}_4$}};
 \node [anchor=south,inner sep=4pt] (sep6) at ([yshift=1em]sep1.north) {\scriptsize{...}};
-\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};
+\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1.2em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};

-\node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.5em]t1.west) {\tiny{TRM}};
-\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\scriptsize{: Transformer Block}};
+\node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.2em]t1.west) {\tiny{TRM}};
+\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\tiny{: Transformer}};

 \draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm0.south);
-\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.south);
-\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm2.south);
-\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm3.south);
-\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm4.south);
+\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.260);
+\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm2.260);
+\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm3.260);
+\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm4.260);
 \draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Trm1.south);
+
 \draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm2.south);
-\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm3.south);
-\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm4.south);
+\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm3.260);
+\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm4.260);
 \draw [->] ([yshift=0.1em]e4.north) -- ([yshift=-0.1em]Trm3.south);
 \draw [->] ([yshift=0.1em]e5.north) -- ([yshift=-0.1em]Trm4.south);

 \draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm5.south);
-\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm6.south);
-\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm7.south);
-\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm8.south);
-\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm9.south);
+\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm6.260);
+\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm7.260);
+\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm8.260);
+\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm9.260);
 \draw [->] ([yshift=0.1em]Trm1.north) -- ([yshift=-0.1em]Trm6.south);
 \draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm7.south);
-\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm8.south);
-\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm9.south);
+\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm8.260);
+\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm9.260);
 \draw [->] ([yshift=0.1em]Trm3.north) -- ([yshift=-0.1em]Trm8.south);
 \draw [->] ([yshift=0.1em]Trm4.north) -- ([yshift=-0.1em]Trm9.south);


--- a/Book/Chapter5/Figures/fig-more-layers.tex
+++ b/Book/Chapter5/Figures/fig-more-layers.tex
@@ -11,7 +11,7 @@
 }
 \foreach \n in {1,...,5}{
    \foreach \m in {1,...,5}{
-        \draw [<-] (neuron0\m.south) -- ([yshift=-1.8em]neuron0\n.south);
+        \draw [<-] ([yshift=-0.1em]neuron0\m.south) -- ([yshift=-1.8em]neuron0\n.south);
    }
    \node [anchor=north] (x\n) at ([yshift=-1.8em]neuron0\n.south) {$x_\n$};
 {
@@ -45,7 +45,7 @@

 \foreach \n in {1,...,5}{
    \foreach \m in {1,...,5}{
-        \draw [<-] (neuron0\m.south) -- ([yshift=-1.8em]neuron0\n.south);
+        \draw [<-] ([yshift=-0.1em]neuron0\m.south) -- ([yshift=-1.8em]neuron0\n.south);
    }
    \node [anchor=north] (x\n) at ([yshift=-1.8em]neuron0\n.south) {$x_\n$};
 }
@@ -65,7 +65,7 @@

 \foreach \n in {2,...,4}{
    \foreach \m in {1,...,5}{
-        \draw [<-] (neuron1\n.south) -- (neuron0\m.north);
+        \draw [<-] ([yshift=-0.1em]neuron1\n.south) -- (neuron0\m.north);
    }
    \draw [<-,thick] ([yshift=1.1em]neuron1\n.north) -- (neuron1\n.north);
    \node [anchor=south] (y\n) at ([yshift=1.25em]neuron1\n.north) {$y_\n$};
@@ -98,7 +98,7 @@

 \foreach \n in {1,...,5}{
    \foreach \m in {1,...,5}{
-        \draw [<-] (neuron0\m.south) -- ([yshift=-1.8em]neuron0\n.south);
+        \draw [<-] ([yshift=-0.1em]neuron0\m.south) -- ([yshift=-1.8em]neuron0\n.south);
    }
    \node [anchor=north] (x\n) at ([yshift=-1.8em]neuron0\n.south) {$x_\n$};
 }
@@ -118,7 +118,7 @@

 \foreach \n in {2,...,4}{
    \foreach \m in {1,...,5}{
-        \draw [<-] (neuron1\n.south) -- (neuron0\m.north);
+        \draw [<-] ([yshift=-0.1em]neuron1\n.south) -- (neuron0\m.north);
    }
 }

@@ -142,7 +142,7 @@

 \foreach \n in {1,...,5}{
    \foreach \m in {2,...,4}{
-        \draw [<-] (neuron2\n.south) -- (neuron1\m.north);
+        \draw [<-] ([yshift=-0.1em]neuron2\n.south) -- (neuron1\m.north);
    }

    \node [anchor=south] (y\n) at ([yshift=1.25em]neuron2\n.north) {$y_\n$};

--- a/Book/Chapter5/Figures/fig-multilayer-neural-network-example.tex
+++ b/Book/Chapter5/Figures/fig-multilayer-neural-network-example.tex
@@ -24,7 +24,7 @@

 \foreach \n in {1,...,4}{
    \foreach \m in {1,...,4}{
-        \draw [<-] (neuron1\n.south) -- (neuron0\m.north);
+        \draw [<-] ([yshift=-0.1em]neuron1\n.south) -- (neuron0\m.north);
    }
 }

@@ -41,7 +41,7 @@

 \foreach \n in {1,...,4}{
    \foreach \m in {1,...,4}{
-        \draw [<-] (neuron2\n.south) -- (neuron1\m.north);
+        \draw [<-] ([yshift=-0.1em]neuron2\n.south) -- (neuron1\m.north);
    }
 }

@@ -85,5 +85,6 @@

 \end{scope}
 \end{tikzpicture}
+
 %%%------------------------------------------------------------------------------------------------------------

--- a/Book/Chapter5/Figures/fig-residual-structure.tex
+++ b/Book/Chapter5/Figures/fig-residual-structure.tex
@@ -3,26 +3,25 @@
 \begin{scope}


-\node [anchor=center] (node1) at (0,0) {};
-\node [anchor=north,draw,thick](node2)at ([yshift=-1.5em]node1.south){\small{\ \ layer\ \ }};
-\draw[->,thick](node1.south)--(node2.north);
+\node [anchor=center] (node6) at (0,0) {};
+\node[anchor=west](node6-1) at ([xshift=-0.2em,yshift=-0.6em]node6.east) {\footnotesize{$\rm{ReLU}$}};
+\node [anchor=north](node3)at ([yshift=-1.2em]node6.south){$\bigoplus$};
+\draw[->,thick]([yshift=-0.32em]node3.north)--(node6.south);

-\node [anchor=north](node3)at ([yshift=-1.2em]node2.south){$\bigoplus$};
-\draw[->,thick](node2.south)--([yshift=-0.3em]node3.north);
+\node [anchor=north,draw,thick](node2)at ([yshift=-1.2em]node3.south){\small{weight layer}};
+\draw[->,thick](node2.north)--([yshift=0.35em]node3.south);
+\node[anchor=west](node2-1) at ([xshift=2.1em,yshift=1.2em]node2.east) {$\mathbf{x}$};
+\node[anchor=north](node2-2) at ([xshift=0.2em,yshift=-0.3em]node2-1.south) {\footnotesize{$\rm{identity}$}};

-\node [anchor=east](node4) at ([xshift=0.0em]node2.west) {$\textrm{F}(\mathbf{x})$};
-\node [anchor=east](node5) at ([xshift=0.3em]node3.west) {$\textrm{F}(\mathbf{x})+\mathbf{x}$};
+\node [anchor=east](node4) at ([xshift=-0.2em]node2.west) {$\textrm{F}(\mathbf{x})$};
+\node [anchor=east](node5) at ([xshift=-0.3em]node3.west) {$\textrm{F}(\mathbf{x})+\mathbf{x}$};

-\node [anchor=east](node1-1) at ([xshift=0.5em,yshift=-1.0em]node1.west) {$\mathbf{x}$};
-\draw[->,thick]([xshift=-0.1em]node1-1.east)--([xshift=4.0em]node1-1.east)--([xshift=4.0em,yshift=-4.45em]node1-1.east)--([xshift=-0.35em]node3.east);
+\node [anchor=north](node1) at ([yshift=-1.8em]node2.south) {};
+\draw[->,thick]([yshift=0.0em]node1.north)--(node2.south);
+\node [anchor=east](node1-1) at ([xshift=1em,yshift=0.4em]node1.east) {$\mathbf{x}$};
+\draw[->,thick]([xshift=-1.3em,yshift=0.8em]node1-1.east)--([xshift=2.7em,yshift=0.8em]node1-1.east)--([xshift=2.7em,yshift=5.35em]node1-1.east)--([xshift=-0.4em]node3.east);

-\node[anchor=west](node2-1) at ([xshift=2.3em]node2.east) {$\mathbf{x}$};
-\node[anchor=north](node2-2) at ([xshift=0.2em,yshift=-0.5em]node2-1.south) {\footnotesize{$\rm{identity}$}};

-\node [anchor=north](node6) at ([yshift=-1.2em]node3.south) {};
-\draw[->,thick]([yshift=0.3em]node3.south)--([yshift=0.0em]node6.north);
-
-\node[anchor=west](node6-1) at ([xshift=-0.2em,yshift=0.6em]node6.east) {\footnotesize{$\rm{Relu}$}};


 \end{scope}

--- a/Book/Chapter5/Figures/fig-sawtooth.tex
+++ b/Book/Chapter5/Figures/fig-sawtooth.tex
@@ -13,12 +13,12 @@
 \node [anchor=north,color=red] (node9) at ([xshift=0.6em,yshift=-1.7em]node1.south) {\large{$\bullet$}};
 \node [anchor=north,color=red] (node10) at ([xshift=0.0em,yshift=-1.2em]node1.south) {\large{$\bullet$}};

-\draw[-,ublue]([xshift=0.5em,yshift=0.46em]node4.south west)--([xshift=-0.5em,yshift=-0.4em]node5.north east);
-\draw[-,ublue]([xshift=-0.45em,yshift=0.52em]node6.south east)--([xshift=0.47em,yshift=-0.43em]node5.north west);
-\draw[-,ublue]([xshift=0.5em,yshift=0.46em]node6.south west)--([xshift=-0.5em,yshift=-0.4em]node7.north east);
-\draw[-,ublue]([xshift=-0.45em,yshift=0.52em]node8.south east)--([xshift=0.47em,yshift=-0.43em]node7.north west);
+\draw[-,ublue,line width=0.3mm]([xshift=0.5em,yshift=0.46em]node4.south west)--([xshift=-0.5em,yshift=-0.4em]node5.north east);
+\draw[-,ublue,line width=0.3mm]([xshift=-0.45em,yshift=0.52em]node6.south east)--([xshift=0.47em,yshift=-0.43em]node5.north west);
+\draw[-,ublue,line width=0.3mm]([xshift=0.5em,yshift=0.46em]node6.south west)--([xshift=-0.5em,yshift=-0.4em]node7.north east);
+\draw[-,ublue,line width=0.3mm]([xshift=-0.45em,yshift=0.52em]node8.south east)--([xshift=0.47em,yshift=-0.43em]node7.north west);
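-\draw[-,ublue]([xshift=0.5em,yshift=0.46em]node8.south west)--([xshift=-0.5em,yshift=-0.4em]node9.north east);
+\draw[-,ublue,line width=0.3mm]([xshift=0.5em,yshift=0.46em]node8.south west)--([xshift=-0.5em,yshift=-0.4em]node9.north east);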
-\draw[-,ublue]([xshift=-0.78em,yshift=0.77em]node9.south east)--([xshift=0.78em,yshift=-0.68em]node10.north west);
+\draw[-,ublue,line width=0.3mm]([xshift=-0.78em,yshift=0.77em]node9.south east)--([xshift=0.78em,yshift=-0.68em]node10.north west);



@@ -39,6 +39,47 @@

 \draw[-,ublue](0,-0.8)..controls(0.5,-0.8) and (0.6,-0.85)..(0.6,-0.9)..controls(0.6,-0.93)and (0.5,-0.91)..(0.3,-0.88)..controls(0.2,-0.87)and (0.1,-0.86)..(0,-0.86)..controls(-0.1,-0.86)and(-0.2,-0.87)..(-0.3,-0.88)..controls(-0.5,-0.91) and(-0.6,-0.93) ..(-0.6,-0.9)..controls(-0.6,-0.85)and (-0.5,-0.8)..(0,-0.8);

+
+\node [anchor=north] (labela) at (0,-2.7) {\footnotesize{(a)梯度下降算法中的``锯齿''现象}};
+
+\end{scope}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{scope}[yshift=-2in]
+
+
+\node [anchor=center,color=red] (node1) at (0,0) {};
+
+\node [anchor=north,color=red] (node2) at ([xshift=0.0em,yshift=-1.2em]node1.south) {\large{$\bullet$}};
+\node [anchor=north,color=red] (node3) at ([xshift=3.55em,yshift=-0.981em]node1.south) {\large{$\bullet$}};
+\node [anchor=north,color=red] (node4) at ([xshift=7.75em,yshift=-2.91em]node1.south) {\large{$\bullet$}};
+\node [anchor=north,color=red] (node5) at ([xshift=11.38em,yshift=-1.11em]node1.south) {\large{$\bullet$}};
+
+\draw[-,ublue,line width=0.3mm]([xshift=0.79em,yshift=-0.59em]node2.north west)--([xshift=-0.75em,yshift=0.6em]node3.south east);
+\draw[-,ublue,line width=0.3mm]([xshift=0.79em,yshift=0.66em]node3.south west)--([xshift=-0.76em,yshift=-0.5em]node4.north east);
+\draw[-,ublue,line width=0.3mm]([xshift=0.79em,yshift=-0.59em]node4.north west)--([xshift=-0.75em,yshift=0.6em]node5.south east);
+
+
+
+
+\draw [-,ublue] (0,0) .. controls (2,0) and (3,-1.0)..(3,-1.5) .. controls (3,-2.2) and (2,-1.75)..(1.5,-1.65)..controls (1.5,-1.65) and (0.5,-1.45)..(0,-1.45)..controls (-0.5,-1.45) and (-1.5,-1.65)..(-1.5,-1.65)..controls (-2,-1.75)and (-3,-2.2).. (-3,-1.5)..controls (-3,-1.0) and (-2,0)..(0,0);
+
+\draw [-,ublue] (0,0.5)..controls (2,0.5) and (4,-1.0).. (4,-1.7)..controls(4,-2.6)and (3,-2.3)..(2,-2.05)..controls (2,-2.05) and (1,-1.80)..(0,-1.80)..controls (-1,-1.80)and (-2,-2.05)..(-2,-2.05)..controls(-3,-2.3)and(-4,-2.6)..(-4,-1.7)..controls(-4,-1.0)and (-2,0.5)..(0,0.5);
+
+\draw[-,ublue](0,1.0)..controls(3,1.0) and (5,-1.0)..(5,-1.9)..controls (5,-3.2)and (4,-2.7)..(3,-2.5)..controls (3,-2.5) and (2,-2.20)..(0,-2.15)..controls (-2,-2.20)and (-3,-2.5)..(-3,-2.5)..controls (-4,-2.7) and (-5,-3.2) ..(-5,-1.9)..controls (-5,-1.0) and (-3,1.0)..(0,1.0);
+
+\draw[-,ublue] (0,-0.3)..controls (1.5,-0.3)and (2.5,-1.0)..(2.5,-1.4)..controls(2.5,-1.8)and (2,-1.55)..(1.5,-1.45) ..controls (1.5,-1.45) and (0.5,-1.25)..(0,-1.25) .. controls(-0.5,-1.25)and (-1.5,-1.45)..(-1.5,-1.45)..controls(-2,-1.55)and (-2.5,-1.8) ..(-2.5,-1.4)..controls(-2.5,-1.0) and (-1.5,-0.3)..(0,-0.3);
+
+\draw[-,ublue](0,-0.5)..controls (1.0,-0.5) and (1.9,-0.8)..(1.9,-1.3)..controls(1.9,-1.5)and (1.5,-1.3)..(1.0,-1.2) ..controls(1.0,-1.2) and (0.5,-1.1)..(0,-1.1)..controls(-0.5,-1.1) and (-1.0,-1.2)..(-1.0,-1.2)..controls (-1.5,-1.3)and (-1.9,-1.5)..(-1.9,-1.3) ..controls(-1.9,-0.8)and (-1.0,-0.5) ..(0,-0.5);
+
+\draw[-,ublue](0,-0.7)..controls(1.0,-0.7) and (1.4,-0.9)..(1.4,-1.1) .. controls(1.4,-1.25) and (1.2,-1.15)..(1.0,-1.1)..controls(1.0,-1.1) and (0.5,-0.95)..(0,-0.95)..controls(-0.5,-0.95)and (-1.0,-1.1) ..(-1.0,-1.1)..controls(-1.2,-1.15) and (-1.4,-1.25)..(-1.4,-1.1)..controls(-1.4,-0.9) and (-1.0,-0.7)..(0,-0.7);
+
+\draw[-,ublue](0,-0.75)..controls(0.7,-0.75)and (1.0,-0.9)..(1.0,-1.0)..controls(1.0,-1.05) and (0.9,-1.05)..(0.7,-1.0)..controls(0.5,-0.95)and (0.3,-0.9)..(0,-0.9)..controls(-0.3,-0.9)and (-0.5,-0.95)..(-0.7,-1.0)..controls(-0.9,-1.05)and (-1.0,-1.05)..(-1.0,-1.0) ..controls(-1.0,-0.9)and (-0.7,-0.75)..(0,-0.75);
+
+\draw[-,ublue](0,-0.8)..controls(0.5,-0.8) and (0.6,-0.85)..(0.6,-0.9)..controls(0.6,-0.93)and (0.5,-0.91)..(0.3,-0.88)..controls(0.2,-0.87)and (0.1,-0.86)..(0,-0.86)..controls(-0.1,-0.86)and(-0.2,-0.87)..(-0.3,-0.88)..controls(-0.5,-0.91) and(-0.6,-0.93) ..(-0.6,-0.9)..controls(-0.6,-0.85)and (-0.5,-0.8)..(0,-0.8);
+
+\node [anchor=north] (labelb) at (0,-3) {\footnotesize{(b)Momentum梯度下降算法更加``平滑''地更新}};
+
 \end{scope}
 \end{tikzpicture}
 %%%------------------------------------------------------------------------------------------------------------

--- a/Book/Chapter5/Figures/fig-two-layer-neural-network.tex
+++ b/Book/Chapter5/Figures/fig-two-layer-neural-network.tex
@@ -33,6 +33,14 @@
 \node [] (y) at ([yshift=2.5em]n20.north) {$y$};
 \draw [->,thick] ([yshift=0.1em]n20.north) -- (y.south);

+%% weight and bias
+{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (b0) at ([yshift=2em,xshift=-0.5em]b.north) {\tiny{$b_1$}};}
+{\node [anchor=center,rotate=-59,fill=white,inner sep=1pt] (w2) at ([yshift=1em,xshift=-1.0em]x1.north) {\tiny{$w_1$}};}
+{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=1.2em,xshift=0em]n10.north) {\tiny{$w'_1$}};}
+{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=1.2em,xshift=-0em]n11.north) {\tiny{$w'_2$}};}
+{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=3.4em,xshift=1.5em]b.north) {\tiny{$b_2$}};}
+{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=2em,xshift=0.5em]x1.north) {\tiny{$w_2$}};}
+
 %% sigmoid box
 \begin{scope}
 {

--- a/Book/Chapter5/Figures/fig-weather-forward.tex
+++ b/Book/Chapter5/Figures/fig-weather-forward.tex
@@ -2,19 +2,19 @@
 \begin{tikzpicture}

 \node [anchor=west,minimum width=1.5em,minimum height=1.5em] (part1) at (0,0) {\footnotesize{$y$}};
-\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part1-2) at ([xshift=-1.6em,yshift=-0.3em]part1.south) {\scriptsize {$\rm {shape(1)}$}};
-\node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em,fill=orange!20] (part2) at ([yshift=-1.5em]part1.south) {\footnotesize {$\rm{sigmoid}$}};
+\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part1-2) at ([xshift=-1.2em,yshift=-0.3em]part1.south) {\scriptsize {$1\times1$}};
+\node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em,fill=orange!20] (part2) at ([yshift=-1.5em]part1.south) {\footnotesize {$\rm{Sigmoid}$}};
 \draw [-,thick](part1.south)--(part2.north);

-\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part2-2) at ([xshift=-1.6em,yshift=-0.3em]part2.south) {\scriptsize {$\rm{shape(1)}$}};
+\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part2-2) at ([xshift=-1.2em,yshift=-0.3em]part2.south) {\scriptsize {$1\times1$}};
 \node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em,fill=green!20] (part3) at ([yshift=-1.5em]part2.south) {\footnotesize {$\rm{ADD}$}};
 \draw [-,thick](part2.south)--(part3.north);

-\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part3-2) at ([xshift=-1.6em,yshift=-0.3em]part3.south) {\scriptsize {$\rm {shape(1)}$}};
+\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part3-2) at ([xshift=-1.2em,yshift=-0.3em]part3.south) {\scriptsize {$1\times1$}};
 \node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em,fill=blue!20] (part4) at ([yshift=-1.5em]part3.south) {\footnotesize {$\rm{MUL}$}};
 \draw [-,thick](part3.south)--(part4.north);

-\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part4-2) at ([xshift=-1.6em,yshift=-0.2em]part4.south) {\scriptsize {$\rm {shape(2)}$}};
+\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part4-2) at ([xshift=-1.2em,yshift=-0.2em]part4.south) {\scriptsize {$1\times 2$}};
 \node [anchor=north,minimum width=4.0em,minimum height=1.5em] (part5) at ([yshift=-1.4em]part4.south) {\footnotesize {$\mathbf a$}};
 \draw [-,thick](part4.south)--([yshift=-0.1em]part5.north);
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -22,22 +22,22 @@
 \node [anchor=west,minimum width=2.0em,minimum height=1.5em,draw,fill=orange!40] (part5-4) at ([xshift=2.0em,yshift=0.0em]part5-3.east) {\footnotesize {$\mathbf b^2$}};
 \draw[-,thick](part4.south)--(part5-3.north);
 \draw[-,thick](part3.south)--(part5-4.north);
-\node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part5-3-1) at ([xshift=1.3em,yshift=-0.45em]part5-3.north) {\scriptsize {$\rm{shape(2)}$}};
-\node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part5-4-1) at ([xshift=1.3em,yshift=-0.45em]part5-4.north) {\scriptsize {$\rm{shape(1)}$}};
+\node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part5-3-1) at ([xshift=1.1em,yshift=-0.45em]part5-3.north) {\scriptsize {$1\times 2$}};
+\node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part5-4-1) at ([xshift=1.1em,yshift=-0.45em]part5-4.north) {\scriptsize {$1\times1$}};
 %%%%%%%%%%%%%%%%%%%%%%%%%%
-\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part5-2) at ([xshift=-1.6em,yshift=-0.2em]part5.south) {\scriptsize {$\rm{shape(2)}$}};
-\node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em,fill=yellow!20] (part6) at ([yshift=-1.4em]part5.south) {\footnotesize {$\rm{tanh}$}};
+\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part5-2) at ([xshift=-1.2em,yshift=-0.2em]part5.south) {\scriptsize {$1\times 2$}};
+\node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em,fill=yellow!20] (part6) at ([yshift=-1.4em]part5.south) {\footnotesize {$\rm{Tanh}$}};
 \draw [-,thick]([yshift=0.1em]part5.south)--(part6.north);

-\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part6-2) at ([xshift=-1.6em,yshift=-0.3em]part6.south) {\scriptsize {$\rm{shape(2)}$}};
+\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part6-2) at ([xshift=-1.2em,yshift=-0.3em]part6.south) {\scriptsize {$1\times 2$}};
 \node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em,fill=green!20] (part7) at ([yshift=-1.5em]part6.south) {\footnotesize {$\rm{ADD}$}};
 \draw [-,thick](part6.south)--(part7.north);

-\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part7-2) at ([xshift=-1.6em,yshift=-0.3em]part7.south) {\scriptsize {$\rm{shape(2)}$}};
+\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part7-2) at ([xshift=-1.2em,yshift=-0.3em]part7.south) {\scriptsize {$1\times 2$}};
 \node [anchor=north,draw,minimum width=4.0em,minimum height=1.5em,fill=blue!20] (part8) at ([yshift=-1.5em]part7.south) {\footnotesize {$\rm{MUL}$}};
 \draw [-,thick](part7.south)--(part8.north);

-\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part8-2) at ([xshift=-1.6em,yshift=-0.2em]part8.south) {\scriptsize{$\rm{shape(2)}$}};
+\node [anchor=north,minimum width=1.5em,minimum height=1.5em] (part8-2) at ([xshift=-1.2em,yshift=-0.2em]part8.south) {\scriptsize{$1\times 2$}};
 \node [anchor=north,minimum width=4.0em,minimum height=1.5em] (part9) at ([yshift=-1.4em]part8.south) {\footnotesize {$\mathbf x$}};
 \draw [-,thick](part8.south)--([yshift=-0.1em]part9.north);
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -45,8 +45,8 @@
 \node [anchor=west,minimum width=2.0em,minimum height=1.5em,draw,fill=orange!40] (part9-4) at ([xshift=2.0em,yshift=0.0em]part9-3.east) {\footnotesize {$\mathbf b^1$}};
 \draw[-,thick](part8.south)--(part9-3.north);
 \draw[-,thick](part7.south)--(part9-4.north);
-\node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part9-3-1) at ([xshift=1.5em,yshift=-0.45em]part9-3.north) {\scriptsize {$\rm{shape(3,2)}$}};
-\node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part9-4-1) at ([xshift=1.3em,yshift=-0.45em]part9-4.north) {\scriptsize {$\rm{shape(2)}$}};
+\node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part9-3-1) at ([xshift=1.1em,yshift=-0.45em]part9-3.north) {\scriptsize {$3\times 2$}};
+\node [anchor=south,minimum width=1.5em,minimum height=1.5em] (part9-4-1) at ([xshift=1.1em,yshift=-0.45em]part9-4.north) {\scriptsize {$1\times 2$}};
 %%%%%%%%%%%%%%%%%%%%%%%%%%

 \end{tikzpicture}

--- a/Book/Chapter5/Figures/fig-weather.tex
+++ b/Book/Chapter5/Figures/fig-weather.tex
@@ -2,69 +2,98 @@
 \begin{tikzpicture}
 \begin{scope}
 %左
-\node [anchor=west,draw=ublue,minimum width=2.5em,fill=yellow!20] (part1-1) at (0,0) {\scriptsize{天空状况}};
-\node [anchor=north,draw=ublue,minimum width=2.5em,fill=yellow!20] (part1-2) at ([yshift=-1.7em]part1-1.south) {\scriptsize {低空气温}};
-\node [anchor=north,draw=ublue,minimum width=2.5em,fill=yellow!20] (part1-3) at ([yshift=-1.7em]part1-2.south) {\scriptsize {水平气压}};
-\node [anchor=north,minimum width=2.5em] (part1-4) at ([yshift=-0.5em]part1-3.south) {\scriptsize {输入层}};
+\node [anchor=west,draw=ublue,minimum width=3.55em,fill=yellow!20] (part1-1) at (0,0) {\scriptsize{天空状况}};
+\node [anchor=north] (inputlabel) at ([yshift=2em]part1-1.north) {\scriptsize{输入}};
+\node [anchor=north,draw=ublue,minimum width=3.55em,fill=yellow!20] (part1-2) at ([yshift=-2.0em]part1-1.south) {\scriptsize {低空气温}};
+\node [anchor=north,draw=ublue,minimum width=3.55em,fill=yellow!20] (part1-3) at ([yshift=-2.0em]part1-2.south) {\scriptsize {水平气压}};
+\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (part1-1) (part1-2) (part1-3) (inputlabel)] (inputshadow) {};
+\node [anchor=north,draw=ublue,minimum width=3.55em,fill=yellow!20] (part1-4) at ([yshift=-2.0em]part1-3.south) {\scriptsize {偏移1}};
+\node [anchor=north,minimum width=2.5em] (part1-5) at ([yshift=-0.5em]part1-4.south) {\scriptsize {输入层}};


 %中
-\node [circle,anchor=west,draw=ublue,minimum width=2.0em,fill=blue!20] (part2-1) at ([xshift=2.0em,yshift=1.5em]part1-2.east) {\scriptsize {温度}};
-\node [circle,anchor=west,draw=ublue,minimum width=2.0em,fill=blue!20] (part2-2) at ([xshift=2.0em,yshift=-1.5em]part1-2.east) {\scriptsize {风速}};
-\node [anchor=north,minimum width=3.0em] (part2-3) at ([xshift=0.0em,yshift=-1.52em]part2-2.south) {\scriptsize{隐藏层}};
-\node [anchor=north] (labela) at ([xshift=0.0em,yshift=-1em]part2-3.south) {\footnotesize {(a)}};
+\node [circle,anchor=west,draw=ublue,minimum width=2.5em,fill=blue!20] (part2-1) at ([xshift=2.0em,yshift=1.7em]part1-2.east) {\scriptsize {温度}};
+\node [anchor=north] (hidlabel) at ([yshift=3.1em]part2-1.north) {\scriptsize{特征}};
+\node [circle,anchor=west,draw=ublue,minimum width=2.5em,fill=blue!20] (part2-2) at ([xshift=2.0em,yshift=-1.7em]part1-2.east) {\scriptsize {风速}};
+\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (part2-1) (part2-2) (hidlabel) ] (inputshadow) {};
+\node [circle,anchor=west,draw=ublue,minimum width=2.5em,fill=blue!20,inner sep=2pt] (part2-3) at ([xshift=2.0em,yshift=-1.7em]part1-3.east) {\scriptsize {偏移2}};
+\node [anchor=north,minimum width=3.0em] (part2-4) at ([xshift=0.0em,yshift=-1.6em]part2-3.south) {\scriptsize{隐藏层}};
+\node [anchor=north] (labela) at ([xshift=0.0em,yshift=-4em]part2-3.south) {\footnotesize {(a)}};

 %右
-\node [anchor=west,draw=ublue,minimum width=3.0em,fill=purple!20] (part3-1) at ([xshift=5.8em,yshift=0.0em]part1-2.east) {\scriptsize {穿衣指数}};
-\node [anchor=north,minimum width=3.0em] (part3-2) at ([yshift=-3.6em]part3-1.south) {\scriptsize{输出层}};
-\node[anchor=south,minimum height=11em,minimum width=15.0em,draw=ublue,dotted,thick] (part2out) at ([xshift=4.8em,yshift=-7em]part1-2.north) {};
+\node [anchor=west,draw=ublue,minimum width=3.0em,fill=purple!20] (part3-1) at ([xshift=2em,yshift=0.0em]part2-2.east) {\scriptsize {穿衣指数}};
+\node [anchor=north,minimum width=3.0em] (part3-2) at ([yshift=-5.55em]part3-1.south) {\scriptsize{输出层}};
+%\node[anchor=south,minimum height=18em,minimum width=16.0em,draw=ublue,dotted,thick] (part2out) at ([xshift=4.8em,yshift=-11em]part1-2.north) {};


 %连线

-\draw [->,thick,ublue](part1-1.east)--(part2-1.west);
-\draw [->,thick,ublue](part1-1.east)--(part2-2.west);
-\draw [->,thick,ublue](part1-2.east)--(part2-1.west);
-\draw [->,thick,ublue](part1-2.east)--(part2-2.west);
-\draw [->,thick,ublue](part1-3.east)--(part2-1.west);
-\draw [->,thick,ublue](part1-3.east)--(part2-2.west);
-\draw [->,thick,ublue](part2-1.east)--(part3-1.west);
-\draw [->,thick,ublue](part2-2.east)--(part3-1.west);
+\draw [->,line width=0.2mm,ublue](part1-1.east)--([xshift=-0.05em]part2-1.170);
+\draw [->,line width=0.2mm,ublue](part1-1.east)--([xshift=-0.05em]part2-2.165);
+
+\draw [->,line width=0.2mm,ublue](part1-2.east)--([xshift=-0.05em]part2-1.175);
+\draw [->,line width=0.2mm,ublue](part1-2.east)--([xshift=-0.05em]part2-2.175);
+
+\draw [->,line width=0.2mm,ublue](part1-3.east)--([xshift=-0.05em]part2-1.185);
+\draw [->,line width=0.2mm,ublue](part1-3.east)--([xshift=-0.05em]part2-2.185);
+
+\draw [->,line width=0.2mm,ublue](part1-4.east)--([xshift=-0.05em]part2-1.195);
+\draw [->,line width=0.2mm,ublue](part1-4.east)--([xshift=-0.05em]part2-2.195);
+
+\draw [->,line width=0.2mm,ublue](part2-1.east)--([xshift=-0.05em,yshift=0.2em]part3-1.west);
+\draw [->,line width=0.2mm,ublue](part2-2.east)--([xshift=-0.05em]part3-1.west);
+\draw [->,line width=0.2mm,ublue](part2-3.east)--([xshift=-0.05em,yshift=-0.2em]part3-1.west);
 \end{scope}

-\begin{scope}[xshift=2.8in]
+\begin{scope}[xshift=3.0in]
 %左
-\node [anchor=west,draw=ublue,minimum width=1.5em,minimum height=1.5em,fill=yellow!20] (part1-1) at (0,0) {\footnotesize{$x_1$}};
-\node [anchor=north,draw=ublue,minimum width=1.5em,minimum height=1.5em,fill=yellow!20] (part1-2) at ([yshift=-1.6em]part1-1.south) {\footnotesize{$x_2$}};
-\node [anchor=north,draw=ublue,minimum width=1.5em,minimum height=1.5em,fill=yellow!20] (part1-3) at ([yshift=-1.6em]part1-2.south) {\footnotesize{$x_3$}};
-\node [anchor=north,minimum width=3.0em] (part1-4) at ([yshift=-0.5em]part1-3.south) {\scriptsize {输入层}};
+\node [anchor=west,align=center,draw=ublue,minimum width=3.55em,minimum height=1.33em,fill=yellow!20] (part1-1) at (0,0) {\normalsize{$x_1$}};
+\node [anchor=north] (inputlabel) at ([yshift=2em]part1-1.north) {\scriptsize{输入$\mathbf x $}};
+\node [anchor=north,draw=ublue,minimum width=3.55em,minimum height=1.33em,fill=yellow!20] (part1-2) at ([yshift=-2.0em]part1-1.south) {\normalsize{$x_2$}};
+\node [anchor=north,draw=ublue,minimum width=3.55em,minimum height=1.33em,fill=yellow!20] (part1-3) at ([yshift=-2.0em]part1-2.south) {\normalsize{$x_3$}};
+
+\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (part1-1) (part1-2) (part1-3) (inputlabel)] (inputshadow) {};
+
+\node [anchor=north,draw=ublue,minimum width=3.55em,fill=yellow!20] (part1-4) at ([yshift=-2.0em]part1-3.south) {\footnotesize {$\mathbf b^1 $}};
+\node [anchor=north,minimum width=2.5em] (part1-5) at ([yshift=-0.5em]part1-4.south) {\scriptsize {输入层}};


 %中
-\node [circle,anchor=west,draw=ublue,minimum width=2.0em,fill=blue!20] (part2-1) at ([xshift=1.8em,yshift=1.5em]part1-2.east) {\footnotesize{$a_1$}};
-\node [circle,anchor=west,draw=ublue,minimum width=2.0em,fill=blue!20] (part2-2) at ([xshift=1.8em,yshift=-1.5em]part1-2.east) {\footnotesize {$a_2$}};
-\node [anchor=north,minimum width=3.0em] (part2-3) at ([xshift=0.0em,yshift=-1.9em]part2-2.south) {\scriptsize {隐藏层}};
-\node [anchor=north] (labelb) at ([xshift=1em,yshift=-1em]part2-3.south) {\footnotesize {(b)}};
+
+\node [circle,anchor=west,draw=ublue,minimum width=2.5em,fill=blue!20] (part2-1) at ([xshift=2.0em,yshift=1.7em]part1-2.east) {\large{$a_1$}};
+\node [anchor=north] (hidlabel) at ([yshift=3.1em]part2-1.north) {\scriptsize{特征$\mathbf a $}};
+
+\node [circle,anchor=west,draw=ublue,minimum width=2.5em,fill=blue!20] (part2-2) at ([xshift=2.0em,yshift=-1.7em]part1-2.east) {\large{$a_2$}};
+\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (part2-1) (part2-2) (hidlabel) ] (inputshadow) {};
+\node [circle,anchor=west,draw=ublue,minimum width=2.5em,fill=blue!20,inner sep=2pt] (part2-3) at ([xshift=2.0em,yshift=-1.7em]part1-3.east) {\large {$b^2 $}};
+\node [anchor=north,minimum width=3.0em] (part2-4) at ([xshift=0.0em,yshift=-1.6em]part2-3.south) {\scriptsize{隐藏层}};
+\node [anchor=north] (labelb) at ([xshift=0.0em,yshift=-4em]part2-3.south) {\footnotesize {(b)}};


 %右
-\node [circle,anchor=west,draw=ublue,minimum width=2.0em,fill=purple!20] (part3-1) at ([xshift=5.2em,yshift=0.0em]part1-2.east) {\footnotesize{$y$}};
-\node [anchor=west,draw=ublue,minimum width=1.5em,minimum height=1.5em,fill=red!40] (part3-2) at ([xshift=1.0em]part3-1.east) {\footnotesize {$y$}};
-\node [anchor=north,minimum width=3.0em] (part3-3) at ([xshift=1.4em,yshift=-3.45em]part3-1.south) {\scriptsize {输出层}};
-\node[anchor=south,minimum height=11em,minimum width=14.0em,draw=ublue,dotted,thick] (part2out) at ([xshift=4.9em,yshift=-7em]part1-2.north) {};
+\node [anchor=west,draw=ublue,minimum width=3.0em,fill=purple!20] (part3-1) at ([xshift=2em,yshift=0.0em]part2-2.east) {\large{$y$}};
+\node [anchor=north,minimum width=3.0em] (part3-2) at ([yshift=-5.55em]part3-1.south) {\scriptsize{输出层}};
+%\node[anchor=south,minimum height=18em,minimum width=16.0em,draw=ublue,dotted,thick] (part2out) at ([xshift=4.8em,yshift=-11em]part1-2.north) {};


 %连线

-\draw [->,thick,ublue](part1-1.east)--(part2-1.west);
-\draw [->,thick,ublue](part1-1.east)--(part2-2.west);
-\draw [->,thick,ublue](part1-2.east)--(part2-1.west);
-\draw [->,thick,ublue](part1-2.east)--(part2-2.west);
-\draw [->,thick,ublue](part1-3.east)--(part2-1.west);
-\draw [->,thick,ublue](part1-3.east)--(part2-2.west);
-\draw [->,thick,ublue](part2-1.east)--(part3-1.west);
-\draw [->,thick,ublue](part2-2.east)--(part3-1.west);
-\draw [->,thick,ublue](part3-1.east)--(part3-2.west);
+\draw [->,line width=0.2mm,ublue](part1-1.east)--([xshift=-0.05em]part2-1.170);
+\draw [->,line width=0.2mm,ublue](part1-1.east)--([xshift=-0.05em]part2-2.165);
+
+\draw [->,line width=0.2mm,ublue](part1-2.east)--([xshift=-0.05em]part2-1.175);
+\draw [->,line width=0.2mm,ublue](part1-2.east)--([xshift=-0.05em]part2-2.175);
+
+\draw [->,line width=0.2mm,ublue](part1-3.east)--([xshift=-0.05em]part2-1.185);
+\draw [->,line width=0.2mm,ublue](part1-3.east)--([xshift=-0.05em]part2-2.185);
+
+\draw [->,line width=0.2mm,ublue](part1-4.east)--([xshift=-0.05em]part2-1.195);
+\draw [->,line width=0.2mm,ublue](part1-4.east)--([xshift=-0.05em]part2-2.195);
+
+\draw [->,line width=0.2mm,ublue](part2-1.east)--([xshift=-0.05em,yshift=0.2em]part3-1.west);
+\draw [->,line width=0.2mm,ublue](part2-2.east)--([xshift=-0.05em]part3-1.west);
+\draw [->,line width=0.2mm,ublue](part2-3.east)--([xshift=-0.05em,yshift=-0.2em]part3-1.west);
+
 \end{scope}

 \end{tikzpicture}

--- a/Book/Chapter5/chapter5.tex
+++ b/Book/Chapter5/chapter5.tex
@@ -8,13 +8,13 @@
 \renewcommand\figurename{图}%将figure改为图
 \renewcommand\tablename{表}%将figure改为图
 %\renewcommand\arraystretch{1.5}%将表格高度调整为1.5倍
-\chapterimage{chapter_head_1.pdf} % Chapter heading image
+\chapterimage{fig-NEU-6.jpg} % Chapter heading image

 \chapter{人工神经网络和神经语言建模}

 \parinterval {\small\sffamily\bfseries{人工神经网络}}（Artificial Neural Networks）或{\small\sffamily\bfseries{神经网络}}（Neural Networks）是描述客观世界的一种数学模型。这种模型的行为和生物学上的神经系统有一些相似之处，但是人们更多的是把它作为一种计算工具，而非一个生物学模型。近些年，随着机器学习领域的快速发展，人工神经网络被更多的使用在对图像和自然语言处理问题的建模上。特别是，研究人员发现深层神经网络可以被成功训练后，学术界也逐渐形成了一种新的机器学习范式\ \dash \ 深度学习。可以说，深度学习是近几年最受瞩目的研究领域，其应用也十分广泛。比如，图像识别的很多重要进展都来自深度学习模型的使用。包括机器翻译在内的很多自然语言处理任务中，深度学习也已经成为了一种标准模型。基于深度学习的表示学习方法也为自然语言处理开辟了新的思路。

-\parinterval 本章将对深度学习的概念和技术进行介绍，目的是为第六章和第七章神经机器翻译的内容进行铺垫。此外，本章也会对深度学习在语言建模方面的应用进行介绍。这样，读者可以更容易的理解如何使用深度学习方法描述自然语言处理问题。同时，进一步了解一些相关的学术前沿，如预训练模型。\\ \\ \\ \\ \\ \\ \\
+\parinterval 本章将对深度学习的概念和技术进行介绍，目的是为第六章和第七章神经机器翻译的内容进行铺垫。此外，本章也会对深度学习在语言建模方面的应用进行介绍。这样，读者可以更容易地理解如何使用深度学习方法描述自然语言处理问题。同时，进一步了解一些相关的学术前沿，如预训练模型。\\ \\ \\ \\ \\ \\ \\
 %--5.1深度学习与人工神经网络-----------------------------------------
 \section{深度学习与人工神经网络}\index{Chapter5.1}

@@ -32,9 +32,9 @@

 \parinterval 最初，神经网络设计的初衷是用计算模型来模拟生物大脑中神经元的运行机理，这种想法哪怕在现在看来也是十分超前的。例如，目前很多机构关注的概念\ \dash \ ``类脑计算''就是希望研究人脑的运行机制及相关的计算机实现方法。然而模拟大脑这件事并没有想象中的那么简单，众所周知，生物学中对人脑机制的研究是十分困难的，我们对人脑的运行机制尚不明确又何谈模拟呢？因而，神经网络技术一直在摸索着前行，发展到现在，其计算过程与人脑的运行机制已经大相径庭。

-\parinterval 人工神经网络的第一个发展阶段是在二十世纪40年代到70年代，这个时期的人工神经网络还停留在利用线性模型模拟生物神经元的阶段，比如使用线性加权函数来描述输入$ \mathbf x $和输出$ y $ 之间的联系：$y=x_1 \cdot w_1 + \dots + x_n \cdot w_n $。举一个简单例子，输入$ \mathbf x $是这个地区的坐标和时间，输出$ y $是这个地区的温度，尽管真实的问题可能要复杂的多，但是线性模型确实有能力去拟合简单的函数关系。
+\parinterval 人工神经网络的第一个发展阶段是在20世纪40年代到70年代，这个时期的人工神经网络还停留在利用线性模型模拟生物神经元的阶段，比如使用线性加权函数来描述输入$ \mathbf x $和输出$ y $ 之间的联系：$y=x_1 \cdot w_1 + \dots + x_n \cdot w_n $。举一个简单例子，输入$ \mathbf x $是某个地区的坐标和时间，输出$ y $是该地区的温度，尽管真实的问题可能要复杂得多，但是线性模型确实有能力去拟合简单的函数关系。

-\parinterval 这种线性模型在现在看来可能比较``简陋''，但是这类模型对后来的随机下降等经典方法产生了深远影响。不过，显而易见的是，这种结构也存在着非常明显的缺陷，单层结构限制了它的学习能力，使它无法描述非线性问题，如著名的异或函数（XOR）学习问题，然而非线性才是现实世界的普遍特征，第一代人工神经网络对很多事物的规律都无法准确描述。此后，神经网络的研究陷入了很长一段时间的低迷期。
+\parinterval 这种线性模型在现在看来可能比较``简陋''，但是这类模型对后来的随机梯度下降等经典方法产生了深远影响。不过，显而易见的是，这种结构也存在着非常明显的缺陷，单层结构限制了它的学习能力，使它无法描述非线性问题，如著名的异或函数（XOR）学习问题，然而非线性才是现实世界的普遍特征，第一代人工神经网络对很多事物的规律都无法准确描述。此后，神经网络的研究陷入了很长一段时间的低迷期。
 %--5.1.1.2神经网络的第二次高潮和第二次寒冬---------------------
 \subsubsection{神经网络的第二次高潮和第二次寒冬}\index{Chapter5.1.1.2}

@@ -44,7 +44,7 @@
 \parinterval （1）符号主义与连接主义
 \vspace{0.3em}

-\parinterval 人工智能领域始终存在着符号主义和连接主义之争。早期的人工智能研究在认知学中被称为{\small\bfnew{符号主义}}（Symbolicism），符号主义认为人工智能源于数理逻辑，希望将世界万物的所有运转方式归纳成像文法一样符合逻辑规律的推导过程。符号主义的支持者们坚信基于物理符号系统（即符号操作系统）假设和有限合理性原理，就能通过逻辑推理来模拟智能。但被他们忽略的一点是，模拟智能的推理过程需要大量的先验知识作支持，哪怕是在现代，生物学界也很难解释大脑中神经元的工作原理，因此也很难用符号系统刻画人脑逻辑。另一方面，连接主义则侧重于利用人工神经网络中神经元的连接去探索并模拟输入与输出之间存在的某种关系，这个过程不需要任何先验知识，其核心思想是``大量简单的计算单元连接到一起可以实现智能行为''，这种思想也推动了反向传播等多层神经网络方法的应用，并发展了包括长短时记忆模型在内的经典建模方法。2019年3月27日，ACM 正式宣布将图灵奖授予 Yoshua Bengio, Geoffrey Hinton 和 Yann LeCun，以表彰他们提出的概念和工作使得深度学习神经网络有了重大突破，这三位获奖人均是人工智能连接主义学派的主要代表，从这件事中也可以看出连接主义对当代人工智能和深度学习的巨大影响。
+\parinterval 人工智能领域始终存在着符号主义和连接主义之争。早期的人工智能研究在认知学中被称为{\small\bfnew{符号主义}}（Symbolicism），符号主义认为人工智能源于数理逻辑，希望将世界万物的所有运转方式归纳成像文法一样符合逻辑规律的推导过程。符号主义的支持者们坚信基于物理符号系统（即符号操作系统）假设和有限合理性原理，就能通过逻辑推理来模拟智能。但被他们忽略的一点是，模拟智能的推理过程需要大量的先验知识支持，哪怕是在现代，生物学界也很难解释大脑中神经元的工作原理，因此也很难用符号系统刻画人脑逻辑。另一方面，连接主义则侧重于利用人工神经网络中神经元的连接去探索并模拟输入与输出之间存在的某种关系，这个过程不需要任何先验知识，其核心思想是``大量简单的计算单元连接到一起可以实现智能行为''，这种思想也推动了反向传播等多层神经网络方法的应用，并发展了包括长短时记忆模型在内的经典建模方法。2019年3月27日，ACM 正式宣布将图灵奖授予 Yoshua Bengio, Geoffrey Hinton 和 Yann LeCun，以表彰他们提出的概念和工作使得深度学习神经网络有了重大突破，这三位获奖人均是人工智能连接主义学派的主要代表，从这件事中也可以看出连接主义对当代人工智能和深度学习的巨大影响。

 \vspace{0.3em}
 \parinterval （2）分布式表示
@@ -52,12 +52,12 @@

 \parinterval 分布式表示的主要思想是``一个复杂系统的任何部分的输入都应该是多个特征共同表示的结果''，这种思想在自然语言处理领域的影响尤其深刻，它改变了刻画世界的角度，将世界万物从离散空间映射到多维连续空间。例如，在现实世界中，``张三''这个代号就代表着一个人。如果想要知道这个人亲属都有谁，因为有``A和B如果姓氏相同，在一个家谱中，那么A和B是本家''这个先验知识在，在知道代号``张三''的情况下，可以得知``张三''的亲属是谁。但是如果不依靠这个先验知识，就无法得知``张三''的亲属是谁。但在分布式表示中，可以用一个实数向量，如$ (0.1,0.3,0.4) $来表示``张三''这个人，这个人的所有特征信息都包含在这个实数向量中，通过在向量空间中的一些操作（如计算距离等），哪怕没有任何先验知识的存在，也完全可以找到这个人的所有亲属。在自然语言处理中，一个单词也用一个实数向量（词向量或词嵌入）表示，通过这种方式将语义空间重新刻画，将这个离散空间转化成了一个连续空间，这时单词就不再是一个简单的词条，而是由成百上千个特征共同描述出来，而每个特征都描述这个词的某个``方面''。

-\parinterval 随着第二代人工神经网络的``脱胎换骨''，学者们又对神经网络方法燃起了希望之火，这也导致有些时候过分的夸大了神经网络的能力。20世纪90年代后期，由于在包括语音识别、自然语言处理等应用中，人们对神经网络方法期望过高，但是结果并没有达到预期，这也让很多人丧失了对神经网络方法的信任。相反，核方法、图模型等机器学习方法取得了很好的效果，这导致神经网络研究又一次进入低谷。
+\parinterval 随着第二代人工神经网络的``脱胎换骨''，学者们又对神经网络方法燃起了希望之火，这也导致有些时候过分夸大了神经网络的能力。20世纪90年代后期，由于在语音识别、自然语言处理等应用中，人们对神经网络方法期望过高，但是结果并没有达到预期，这也让很多人丧失了对神经网络方法的信任。相反，核方法、图模型等机器学习方法取得了很好的效果，这导致神经网络研究又一次进入低谷。

 %--5.1.1.3深度学习和神经网络的崛起---------------------
 \subsubsection{深度学习和神经网络方法的崛起}\index{Chapter5.1.1.3}

-\parinterval 21世纪初，随着深度学习浪潮席卷世界，人工神经网络又一次出现在人们的视野中。深度学习的流行源于2006年Hinton等人成功训练了一个深度信念网络（Deep Belief Network），在深度神经网络方法完全不受重视的情况下，大家突然发现深度神经网络完全是一个魔鬼般的存在，可以解决很多当时其他方法无法解决的问题。神经网络方法终于在一次又一次的否定后，迎来了它的春天。随之针对神经网络和深度学习的一系列研究前赴后继的展开了，延续至今。
+\parinterval 21世纪初，随着深度学习浪潮席卷世界，人工神经网络又一次出现在人们的视野中。深度学习的流行源于2006年Hinton等人成功训练了一个深度信念网络（Deep Belief Network），在深度神经网络方法完全不受重视的情况下，大家突然发现深度神经网络完全是一个魔鬼般的存在，可以解决很多当时其他方法无法解决的问题。神经网络方法终于在一次又一次的否定后，迎来了它的春天。随之针对神经网络和深度学习的一系列研究前赴后继地展开了，延续至今。

 \parinterval 回过头来看，现代深度学习的成功主要有三方面的原因：

@@ -90,7 +90,7 @@
 %--5.1.2.1端到端学习和表示学习---------------------
 \subsubsection{端到端学习和表示学习}\index{Chapter5.1.2.1}

-\parinterval 端到端学习使机器学习不再像以往传统的特征工程方法一样需要经过繁琐的数据预处理、特征选择、降维等过程，而是直接利用人工神经网络自动从简单特征中提取、组合更复杂的特征，大大提升了模型能力和工程效率。如图\ref{fig:vs}中的图像分类为例，在传统方法中，图像分类需要很多阶段的处理。首先，需要提取一些手工设计的图像特征，在将其降维之后，需要利用SVM等分类算法对其进行分类。与这种多阶段的流水线似的处理流程相比，端到端深度学习只训练一个神经网络，输入就是图片的像素表示，输出直接是分类类别。
+\parinterval 端到端学习使机器学习不再像以往传统的特征工程方法一样需要经过繁琐的数据预处理、特征选择、降维等过程，而是直接利用人工神经网络自动从简单特征中提取、组合更复杂的特征，大大提升了模型能力和工程效率。以图\ref{fig:vs}中的图像分类为例，在传统方法中，图像分类需要很多阶段的处理。首先，需要提取一些手工设计的图像特征，在将其降维之后，需要利用SVM等分类算法对其进行分类。与这种多阶段的流水线似的处理流程相比，端到端深度学习只训练一个神经网络，输入就是图片的像素表示，输出直接是分类类别。
 %----------------------------------------------
 % 图
    \begin{figure}
@@ -125,7 +125,7 @@
 \end{itemize}
 \vspace{0.5em}

-\parinterval 端到端学习将人们从大量的特征提取工作之中解放出来，可以不需要太多人的先验知识。从某种意义上讲，对问题的特征提取全是自动完成的，这也意味着哪怕我们不是该任务的``专家''也可以完成相关系统的开发。此外，端到端学习实际上也隐含了一种新的对问题的表示形式\ $\dash$\ {\small\bfnew{分布式表示}}（Distributed Representation）。在这种框架下，模型的输入可以被描述为分布式的实数向量，这样模型可以有更多的维度描述一个事物，同时避免传统符号系统对客观事物离散化的刻画。比如，在自然语言处理中，表示学习重新定义了什么是词，什么是句子。在本章的后面的内容中也会看到，表示学习可以让计算机对语言文字的描述更加准确和充分。
+\parinterval 端到端学习将人们从大量的特征提取工作之中解放出来，可以不需要太多人的先验知识。从某种意义上讲，对问题的特征提取全是自动完成的，这也意味着哪怕我们不是该任务的``专家''也可以完成相关系统的开发。此外，端到端学习实际上也隐含了一种新的对问题的表示形式\ $\dash$\ {\small\bfnew{分布式表示}}（Distributed Representation）。在这种框架下，模型的输入可以被描述为分布式的实数向量，这样模型可以有更多的维度描述一个事物，同时避免传统符号系统对客观事物离散化的刻画。比如，在自然语言处理中，表示学习重新定义了什么是词，什么是句子。在本章后面的内容中也会看到，表示学习可以让计算机对语言文字的描述更加准确和充分。
 %--5.1.2.2深度学习的效果---------------------
 \subsubsection{深度学习的效果}\index{Chapter5.1.2.2}

@@ -134,22 +134,23 @@
 %表1--------------------------------------------------------------------
 \begin{table}[htp]
 \centering
-\caption{不同方法在PTB语言建模任务上的困惑度（PPL）（{\red 下面，加入参考文献！}）}
+\caption{不同方法在PTB语言建模任务上的困惑度（PPL）}
 \label{tab1}
+\small
 \begin{tabular}{l | l l l}
 \rule{0pt}{15pt}     模型 & 作者 & 年份 & PPL  \\
 \hline
-\rule{0pt}{15pt}     3-gram LM & Brown et al. & 1992 & 178.0  \\
-\rule{0pt}{15pt}     Feed-forward Neural LM & Bengio et al. & 2003 & 162.2  \\
-\rule{0pt}{15pt}     Recurrent NN-based LM & Mikolov et al. & 2010 & 124.7  \\
-\rule{0pt}{15pt}     Recurrent NN-LDA & Mikolov et al. & 2012 & 92.0  \\
-\rule{0pt}{15pt}     LSTM & Zaremba et al. & 2014 & 78.4  \\
-\rule{0pt}{15pt}     RHN & Zilly et al. & 2016 & 65.4  \\
-\rule{0pt}{15pt}     AWD-LSTM & Merity et al. & 2018 & 58.8  \\
-\rule{0pt}{15pt}     GPT-2 (Transformer) & Radford et al. & 2019 & 35.7  \\
+\rule{0pt}{15pt}     3-gram LM\cite{brown1992class} & Brown et al. & 1992 & 178.0  \\
+\rule{0pt}{15pt}     Feed-forward Neural LM\cite{bengio2003a} & Bengio et al. & 2003 & 162.2  \\
+\rule{0pt}{15pt}     Recurrent NN-based LM\cite{mikolov2010recurrent} & Mikolov et al. & 2010 & 124.7  \\
+\rule{0pt}{15pt}     Recurrent NN-LDA\cite{mikolov2012context} & Mikolov et al. & 2012 & 92.0  \\
+\rule{0pt}{15pt}     LSTM\cite{zaremba2014recurrent} & Zaremba et al. & 2014 & 78.4  \\
+\rule{0pt}{15pt}     RHN\cite{zilly2016recurrent} & Zilly et al. & 2016 & 65.4  \\
+\rule{0pt}{15pt}     AWD-LSTM\cite{merity2017regularizing} & Merity et al. & 2018 & 58.8  \\
+\rule{0pt}{15pt}     GPT-2 (Transformer)\cite{radford2019language} & Radford et al. & 2019 & 35.7  \\
 \end{tabular}
 \end{table}
-%表1--------------------------------------------------------------------
+%表1------------------------

 %--5.2神经网络基础-----------------------------------------
 \section{神经网络基础}\index{Chapter5.2}
@@ -264,7 +265,7 @@
 %--5.2.1.4矩阵乘法和矩阵点乘---------------------
 \subsubsection{矩阵乘法和矩阵点乘}\index{Chapter5.2.1.4}

-\parinterval 矩阵乘法是矩阵运算中最重要的操作之一，为了与矩阵点乘区分，通常也把矩阵乘法叫做矩阵的叉乘。假设$ \mathbf a $为$ m\times p $的矩阵，$ \mathbf b $为$ p\times n $的矩阵，对$ \mathbf a $和$ \mathbf b $作矩阵乘积的结果是一个$ m\times n $的矩阵$ \mathbf c $，其中矩阵$ \mathbf c $中第$ i $行、第$ j $列的元素可以表示为：
+\parinterval 矩阵乘法是矩阵运算中最重要的操作之一，为了与矩阵点乘区分，通常也把矩阵乘法叫做矩阵的叉乘。假设$ \mathbf a $为$ m\times p $的矩阵，$ \mathbf b $为$ p\times n $的矩阵，对$ \mathbf a $和$ \mathbf b $作矩阵乘法的结果是一个$ m\times n $的矩阵$ \mathbf c $，其中矩阵$ \mathbf c $中第$ i $行、第$ j $列的元素可以表示为：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 {(\mathbf a\mathbf b)}_{ij} &=& \sum_{k=1}^p a_{ik}b_{kj}
@@ -575,7 +576,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
 \end{itemize}
 \vspace{0.5em}

-\parinterval 上面的例子对这三个问题都简要的做出了回答。下面的内容将继续对它们进行详细阐述。
+\parinterval 上面的例子对这三个问题都简要地做出了回答。下面的内容将继续对它们进行详细阐述。

 %--5.2.3多层神经网络---------------------
 \subsection{多层神经网络}\index{Chapter5.2.3}
@@ -658,7 +659,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
 \end{figure}
 %-------------------------------------------

-\parinterval 那激活函数又是什么？神经元在接收到经过线性变换的结果后，通过激活函数的处理，得到最终的输出$ \mathbf y $。激活函数的目的是解决实际问题中的非线性变换，线性变换只能拟合直线，而激活函数的加入，使神经网络具有了拟合曲线的能力。 特别是在实际问题中，很多现象都无法用简单的线性关系描述，这时激活函数的非线性就为描述更加复杂的问题提供了工具。常见的非线性函数有Sigmoid、Relu、Tanh等。如图\ref{fig:activation}列举了几种激活函数的形式。
+\parinterval 那激活函数又是什么？神经元在接收到经过线性变换的结果后，通过激活函数的处理，得到最终的输出$ \mathbf y $。激活函数的目的是解决实际问题中的非线性变换，线性变换只能拟合直线，而激活函数的加入，使神经网络具有了拟合曲线的能力。特别是在实际问题中，很多现象都无法用简单的线性关系描述，这时激活函数的非线性就为描述更加复杂的问题提供了工具。常见的非线性函数有Sigmoid、ReLU、Tanh等。图\ref{fig:activation}列举了几种激活函数的形式。
 %----------------------------------------------
 % 图
    \begin{figure}\centering
@@ -682,7 +683,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
        \input{./Chapter5/Figures/fig-Tanh}
    \end{minipage}
    }\\    \vspace{-0.5em}
-    \subfigure[Relu]{
+    \subfigure[ReLU]{
    \centering
    \begin{minipage}{.23\textwidth}
        \input{./Chapter5/Figures/fig-Relu}
@@ -710,7 +711,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
 %--5.2.3.2单层神经网络->多层神经网络---------------------
 \subsubsection{单层神经网络$\rightarrow$多层神经网络}\index{Chapter5.2.3.2}

-\parinterval 单层神经网络由线性变换和激活函数两部分构成，但在实际问题中，单层网络并不能很好的拟合复杂函数。因此很自然的想到将单层网络扩展到多层神经网络即深层神经网络。将一层神经网络的最终输出向量作为另一层神经网络的输入向量，通过这种方式可以将多层神经网络连接在一起，如图\ref{fig:more-layers}所示。
+\parinterval 单层神经网络由线性变换和激活函数两部分构成，但在实际问题中，单层网络并不能很好地拟合复杂函数。因此很自然地想到将单层网络扩展到多层神经网络，即深层神经网络。将一层神经网络的最终输出向量作为另一层神经网络的输入向量，通过这种方式可以将多层神经网络连接在一起，如图\ref{fig:more-layers}所示。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -737,9 +738,9 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe

 \parinterval 神经网络方法之所以受到青睐一方面是由于它提供了端到端学习的模式，另一方面是由于它强大的函数拟合能力。理论上说，神经网络可以拟合任何形状的函数。下面就来看一下为什么神经网络会有这样的能力。

-\parinterval 众所周知，单层神经网络无法解决线性不可分问题，比如经典的异或问题。但是具有一个隐藏层的两层神经网络在理论上就可以拟合所有的函数了。接下来我们分析一下为什么仅仅是多了一层，神经网络就能变得如此强大。在此之前，需要明确的一点是，``拟合''是把平面上一系列的点，用一条光滑的曲线连接起来，并用函数来表示这条拟合的曲线。在用神经网络解决问题时，可以通过拟合训练数据中的``数据点''来获得输入与输出之间的函数关系，并利用其对未知数据做出判断。可以假设输入与输出之间存在一种函数关系，而神经网络的``拟合''能力要是尽可能地逼近原函数输出值，与原函数输出值越逼近，则意味着拟合得越优秀。
+\parinterval 众所周知，单层神经网络无法解决线性不可分问题，比如经典的异或问题。但是具有一个隐藏层的两层神经网络在理论上就可以拟合所有的函数了。接下来我们分析一下为什么仅仅是多了一层，神经网络就能变得如此强大。在此之前，需要明确的一点是，``拟合''是把平面上一系列的点，用一条光滑的曲线连接起来，并用函数来表示这条拟合的曲线。在用神经网络解决问题时，可以通过拟合训练数据中的``数据点''来获得输入与输出之间的函数关系，并利用其对未知数据做出判断。可以假设输入与输出之间存在一种函数关系，而神经网络的``拟合''能力是要尽可能地逼近原函数输出值，与原函数输出值越逼近，则意味着拟合得越优秀。

+\parinterval 图\ref{fig:two-layer-neural-network}是一个以Sigmoid作为隐藏层激活函数的两层神经网络。通过调整参数$ \mathbf w=(w_1,w_2) $，$ \mathbf b=(b_1,b_2) $和$ \mathbf w'=(w'_{1},w'_{2}) $ 的值，可以不断地改变目标函数的形状。
+\parinterval 如图\ref{fig:two-layer-neural-network}是一个以Sigmoid作为隐藏层激活函数的两层神经网络。通过调整参数$ \mathbf w=(w_1,w_2) $，$ \mathbf b=(b_1,b_2) $和$ \mathbf w^{'}=(w'_{1},w'_{2}) $ 的值，可以不断地改变目标函数的形状。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -751,7 +752,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
 %-------------------------------------------


-\parinterval 设置$ w’_1=1 $，$ w_1=1 $，$ b_1=0 $，其他参数设置为0。可以得到如图\ref{fig:w_1}(a)所示的目标函数，此时目标函数还是比较平缓的。通过调大$ w_1 $，可以将图\ref{fig:w_1}(a) 中函数的坡度调得更陡：当$ w_1=10 $时，如图\ref{fig:w_1}(b)所示，目标函数的坡度与图\ref{fig:w_1}(a)相比变得更陡了；当$ w_1=100 $时，如图\ref{fig:w_1}(c)所示,目标函数的坡度变得更陡、更尖锐，已经逼近一个阶梯函数。
+\parinterval 设置$ w'_1=1 $，$ w_1=1 $，$ b_1=0 $，其他参数设置为0。可以得到如图\ref{fig:w_1}(a)所示的目标函数，此时目标函数还是比较平缓的。通过调大$ w_1 $，可以将图\ref{fig:w_1}(a)中函数的坡度调得更陡：当$ w_1=10 $时，如图\ref{fig:w_1}(b)所示，目标函数的坡度与图\ref{fig:w_1}(a)相比变得更陡了；当$ w_1=100 $时，如图\ref{fig:w_1}(c)所示，目标函数的坡度变得更陡、更尖锐，已经逼近一个阶梯函数。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -762,7 +763,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
 \end {figure}
 %-------------------------------------------

-\parinterval 设置$ w’_1=1 $，$ w_1=100 $，$ b_1=0 $，其他参数设置为0。可以得到如图\ref{fig:b}(a)所示的目标函数，此时目标函数是一个阶梯函数，其``阶梯''恰好与y轴重合。通过改变$ b_1 $，可以将整个函数沿x轴向左右平移：当$ b_1=-2 $时，如图\ref{fig:b}(b)所示，与图\ref{fig:b}(a)相比目标函数的形状没有发生改变，但其位置沿x轴向右平移；当$ b_1=-4 $时，如图\ref{fig:b}(c)所示，目标函数的位置继续沿x轴向右平移。
+\parinterval 设置$ w'_1=1 $，$ w_1=100 $，$ b_1=0 $，其他参数设置为0。可以得到如图\ref{fig:b}(a)所示的目标函数，此时目标函数是一个阶梯函数，其``阶梯''恰好与y轴重合。通过改变$ b_1 $，可以将整个函数沿x轴向左右平移：当$ b_1=-2 $时，如图\ref{fig:b}(b)所示，与图\ref{fig:b}(a)相比目标函数的形状没有发生改变，但其位置沿x轴向右平移；当$ b_1=-4 $时，如图\ref{fig:b}(c)所示，目标函数的位置继续沿x轴向右平移。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -773,18 +774,18 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
 \end {figure}
 %-------------------------------------------

-\parinterval 设置$ w’_1=1 $，$ w_1=100 $，$ b_1=-4 $，其他参数设置为0。可以得到如图\ref{fig:w'_1}\\(a)所示的目标函数，此时目标函数是一个阶梯函数，该阶梯函数取得最大值的分段处为$ y=1 $。 通过改变$ w’_1 $，可以将目标函数``拉高''或是``压扁''。如图\ref{fig:w'_1}(b)和(c)所示,目标函数变得 ``扁''了。最终，该阶梯函数取得最大值的分段处约为$ y=0.7 $。
+\parinterval 设置$ w'_1=1 $，$ w_1=100 $，$ b_1=-4 $，其他参数设置为0。可以得到如图\ref{fig:w'_1}\\(a)所示的目标函数，此时目标函数是一个阶梯函数，该阶梯函数取得最大值的分段处为$ y=1 $。 通过改变$ w'_1 $，可以将目标函数``拉高''或是``压扁''。如图\ref{fig:w'_1}(b)和(c)所示，目标函数变得``扁''了。最终，该阶梯函数取得最大值的分段处约为$ y=0.7 $。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
 \centering
 \input{./Chapter5/Figures/fig-w1}
-\caption{通过改变偏移量$ w’_1 $将目标函数``拉高''或``压扁''}
+\caption{通过改变权重$ w'_1 $将目标函数``拉高''或``压扁''}
 \label{fig:w'_1}
 \end {figure}
 %-------------------------------------------

-\parinterval 设置$ w’_1=0.7 $，$ w_1=100 $，$ b_1=-4 $，其他参数设置为0。可以得到如图\ref{fig:w2}\\(a)所示的目标函数，此时目标函数是一个阶梯函数。若是将其他参数设置为$ w’_2=0.7 $，$ w_2=100 $，$ b_2=16 $，由图\ref{fig:w2}(b)可以看出，原来目标函数的``阶梯''由一级变成了两级，由此可以推测，由于将第二组参数进行设置，使目标函数分段数增多；若将第二组参数中的$ w’_2 $由原来的$ 0.7 $设置为$ -0.7 $，可得到如图\ref{fig:w2}(c)所示的目标函数，与图\ref{fig:w2}(b)相比，原目标函数的``第二级阶梯''向下翻转，由此可见$ w’ $的符号决定了目标函数的翻转方向。
+\parinterval 设置$ w'_1=0.7 $，$ w_1=100 $，$ b_1=-4 $，其他参数设置为0。可以得到如图\ref{fig:w2}\\(a)所示的目标函数，此时目标函数是一个阶梯函数。若是将其他参数设置为$ w'_2=0.7 $，$ w_2=100 $，$ b_2=16 $，由图\ref{fig:w2}(b)可以看出，原来目标函数的``阶梯''由一级变成了两级，由此可以推测，由于将第二组参数进行设置，使目标函数分段数增多；若将第二组参数中的$ w'_2 $由原来的$ 0.7 $设置为$ -0.7 $，可得到如图\ref{fig:w2}(c)所示的目标函数，与图\ref{fig:w2}(b)相比，原目标函数的``第二级阶梯''向下翻转，由此可见$ w' $的符号决定了目标函数的翻转方向。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -836,7 +837,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe

 \parinterval 在神经网络内部，输入经过若干次变换，最终得到输出的结果。这个过程类似于一种逐层的数据``流动''。不禁会产生这样的疑问：在神经网络中，数据是以哪种形式``流动''的？如何去编程实现这种数据``流动''呢？

-\parinterval 为了解决上面的问题，本节将介绍人工神经网络更加通用的描述形式 \ \dash \ 张量计算。随后也会看到，基于张量用数学工具，可以方便的搭建神经网络。
+\parinterval 为了解决上面的问题，本节将介绍人工神经网络更加通用的描述形式 \ \dash \ 张量计算。随后也会看到，使用基于张量的数学工具，可以方便地搭建神经网络。
 %--5.3.1 张量及其计算---------------------
 \subsection{ 张量及其计算}\index{Chapter5.3.1}

@@ -852,7 +853,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
 \label{}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\parinterval 简单来说，是一种通用的工具，用于描述由多个数据构成的量。比如，输入的量有三个维度在变化，用矩阵不容易描述，但是用张量却很容易。
+\parinterval 简单来说，张量是一种通用的工具，用于描述由多个数据构成的量。比如，输入的量有三个维度在变化，用矩阵不容易描述，但是用张量却很容易。

 \parinterval 从计算机实现的角度来看，现在所有深度学习框架都把张量定义为``多维数组''。张量有一个非常重要的属性\ \dash \ {\small\bfnew{阶}}（Rank）。可以将多维数组中``维''的属性与张量的``阶''的属性作类比，这两个属性都表示多维数组（张量）有多少个独立的方向。例如，3是一个标量（Scalar），相当于一个0维数组或0阶张量；$ {(\begin{array}{cccc} 2 & -3 & 0.8 & 0.2\end{array})}^{\rm T} $ 是一个向量（Vector），相当于一个1维数组或1阶张量；$ \begin{pmatrix} -1 & 3 & 7\\ 0.2 & 2 & 9\end{pmatrix} $是一个矩阵（Matrix），相当于一个2维数组或2阶张量；如图\ref{fig:tensor-sample}，这是一个3维数组或3阶张量，其中，每个$4 \times 4$的方形代表一个2阶张量，这样的方形有4个，最终形成3阶张量。
 %----------------------------------------------
@@ -871,7 +872,7 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe

 \parinterval 不过，更广泛接受的定义是：张量是多重线性函数，是定义在一些向量空间和笛卡尔积上的多重线性映射。张量的多重线性表现在，对于每一个输入函数都是线性的。比如，张量$ \mathbf T(v_0,v_1,\dots,v_r) $，其输入是$r$个向量$ \{v_0,v_1,\dots,v_r\} $，对于张量$ \mathbf T $的任意一个$ v_i $，都有$ \mathbf T(v_0,\dots,v_i+c\cdot u,\dots,v_r)=\mathbf T(v_0,\dots,v_i,\dots,v_r)+c\cdot{\mathbf T(v_0,\dots,u,\dots,v_r)} $，其中，$ c $为任意实数。这个性质非常重要，根据这个性质可以推导出张量的其他定义。

-\parinterval 从我们的物理世界看，如果一个物理量在物体的某个位置上只是一个单值，那么它是一个标量，例如密度；如果一个物理量在同一个位置、从多个方向上看，有不同的值，那么这个物理量就是一个的张量。比如物理学中常用的应力的描述就是一个典型的张量。举一个简单的例子：$ \mathbf T(\mathbf v,\mathbf u) $是一个三维空间$(\textrm{x},\textrm{y},\textrm{z})$上的2阶张量，其中$ \mathbf v $和$ \mathbf u $ 是两个向量，如图\ref{fig:tensor}所示，向量$ \mathbf v $在某个两两垂直的三维坐标系中可以表示为$ {(\begin{array}{ccc} a & b & c\end{array})}^{\rm T} $，同理向量$ \mathbf u $在某个两两垂直的三维坐标系中可以表示为$ {(\begin{array}{ccc} a' & b' & c' \end{array})}^{\rm T} $。但在三维空间$(\textrm{x},\textrm{y},\textrm{z})$中，向量$ \mathbf v $和向量$ \mathbf u $分别被表示为$ {(\begin{array}{ccc} v_x & v_y & v_z\end{array})}^{\rm T} $和$ {(\begin{array}{ccc} u_x & u_y & u_z\end{array})}^{\rm T} $。
+\parinterval 从我们的物理世界看，如果一个物理量在物体的某个位置上只是一个单值，那么它是一个标量，例如密度；如果一个物理量在同一个位置、从多个方向上看，有不同的值，那么这个物理量就是一个张量。比如物理学中常用的应力的描述就是一个典型的张量。举一个简单的例子：$ \mathbf T(\mathbf v,\mathbf u) $是一个三维空间$(\textrm{x},\textrm{y},\textrm{z})$上的2阶张量，其中$ \mathbf v $和$ \mathbf u $ 是两个向量，如图\ref{fig:tensor}所示，向量$ \mathbf v $在某个两两垂直的三维坐标系中可以表示为$ {(\begin{array}{ccc} a & b & c\end{array})}^{\rm T} $，同理向量$ \mathbf u $在某个两两垂直的三维坐标系中可以表示为$ {(\begin{array}{ccc} a' & b' & c' \end{array})}^{\rm T} $。但在三维空间$(\textrm{x},\textrm{y},\textrm{z})$中，向量$ \mathbf v $和向量$ \mathbf u $分别被表示为$ {(\begin{array}{ccc} v_x & v_y & v_z\end{array})}^{\rm T} $和$ {(\begin{array}{ccc} u_x & u_y & u_z\end{array})}^{\rm T} $。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -956,14 +957,14 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
 %-------------------------------------------

 \vspace{0.5em}
-\item 除了单位加之外，张量之间也可以减法、乘法，也可以对张量作激活函数。这里将其称作为函数的{\small\bfnew{向量化}}（Vectorization）。例如，对向量（1阶张量）作Relu激活，其中Relu激活函数的公式为：
+\item 除了单位加之外，张量之间也可以使用减法操作、乘法操作，也可以对张量作激活操作。这里将其称作为函数的{\small\bfnew{向量化}}（Vectorization）。例如，对向量（1阶张量）作ReLU激活，其中ReLU激活函数的公式为：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 f(x)=\begin{cases} 0 & x\le 0 \\x & x>0\end{cases}
 \label{eqa1.26}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-例如$ {\rm{Relu}}\left( \begin{pmatrix} 2\\-.3\end{pmatrix}\right)=\begin{pmatrix} 2\\0\end{pmatrix} $。
+例如$ {\rm{ReLU}}\left( \begin{pmatrix} 2\\-.3\end{pmatrix}\right)=\begin{pmatrix} 2\\0\end{pmatrix} $。
 \end{itemize}
 \vspace{0.5em}
 %--5.3.2 张量的物理存储形式---------------------
@@ -997,7 +998,7 @@ f(x)=\begin{cases} 0 & x\le 0 \\x & x>0\end{cases}

 \parinterval 此外，如今深度学习框架已经非常成熟。比如，TensorFlow和PyTorch就是非常受欢迎的深度学习工具包，除此之外还有很多其他优秀的框架：CNTK、MXNet、\\PaddlePaddle、Keras、Chainer、dl4j、NiuTensor等。开发者可以根据自身的喜好和开发项目的要求选择所采用的框架。

-\parinterval 本节将使用NiuTensor来描述张量计算。NiuTensor是一个面向自然语言处理任务的张量库，他支持丰富的张量计算接口。此外，该NiuTensor内核基于C++语言编写，代码高度优化。该工具包获取网址为\url{https://developer.niutrans.com/ArticleContent/technicaldoc/doc1/1.NiuTensor}。
+\parinterval 本节将使用NiuTensor来描述张量计算。NiuTensor是一个面向自然语言处理任务的张量库，它支持丰富的张量计算接口。此外，该NiuTensor内核基于C++语言编写，代码高度优化。该工具包获取网址为\url{https://developer.niutrans.com/ArticleContent/technicaldoc/doc1/1.NiuTensor}。

 \parinterval NiuTensor的使用非常简单，如图\ref{fig:code-tensor-define}是一个使用NiuTensor声明、定义张量的C++代码：
 %----------------------------------------------
@@ -1029,7 +1030,7 @@ f(x)=\begin{cases} 0 & x\le 0 \\x & x>0\end{cases}
 \vspace{0.5em}
 \item 张量的阶，如6。
 \vspace{0.5em}
-\item 各个方向维度的大小，约定该参数形式与传统的多维数组形式相同，如$ \{2,3,4,2,3,4\} $。
+\item 各个方向维度的大小，约定该参数形式与传统的多维数组形式相同，如$ \{2,3,4,\\2,3,4\} $。
 \vspace{0.5em}
 \item 张量的数据类型，该参数有缺省值。
 \end{itemize}
@@ -1118,7 +1119,7 @@ f(x)=\begin{cases} 0 & x\le 0 \\x & x>0\end{cases}
 \rule{0pt}{15pt}     Sigmoid(a) & 对$ \mathbf a $进行Sigmoid变换  \\
 \rule{0pt}{15pt}     Softmax(a) & 对$ \mathbf a $进行Softmax变换，沿最后一个方向  \\
 \rule{0pt}{15pt}     HardTanh(a) & 对$ \mathbf a $进行hard Tanh变换（双曲正切的近似）  \\
-\rule{0pt}{15pt}     Relu(a) & 对$ \mathbf a $进行Relu变换  \\
+\rule{0pt}{15pt}     Relu(a) & 对$ \mathbf a $进行ReLU变换  \\
 \end{tabular}
 \end{table}
 %表2--------------------------------------------------------------------
@@ -1158,7 +1159,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^1+\mathbf b^1)\cdot \mat
 \end{figure}
 %-------------------------------------------

-\parinterval 前向计算实现如图\ref{fig:weather-forward}所示，图中对各张量和其他参数的形状做了详细说明，类似shape(3)这种形式代表维度为3的1阶张量，shape(3, 2)代表2阶张量，其中第1阶有3个维度，第2阶有2个维度，也可以将其理解为$ 3 \times 2 $的矩阵。输入$ \mathbf x $是一个1阶张量，该阶有3个维度，分别对应天空状况、低空气温、水平气压三个方面。输入数据经过隐藏层的线性变换$ \mathbf x\cdot \mathbf w^1+\mathbf b^1 $和Tanh激活函数后，得到新的张量$ \mathbf a $，张量$ \mathbf a $也是一个1阶张量，该阶有2个维度，分别对应着从输入数据中提取出的温度和风速两方面特征；神经网络在获取到天气情况的特征$ \mathbf a $后，继续对其进行线性变换$ \mathbf a\cdot \mathbf w^2+ b^2 $（$ b^2 $是标量）和Sigmoid激活函数后，得到神经网络的最终输出$ y $，即神经网络此时预测的穿衣指数。
+\parinterval 前向计算实现如图\ref{fig:weather-forward}所示，图中对各张量和其他参数的形状做了详细说明。输入$ \mathbf x=(x_1,x_2,x_3) $是一个$1\times 3$的张量，其三个维度分别对应天空状况、低空气温、水平气压三个方面的数据。输入数据经过隐藏层的线性变换$ \mathbf x\cdot \mathbf w^1+\mathbf b^1 $和Tanh函数的激活，得到新的张量$ \mathbf a=(a_1,a_2) $，其中$a_1$，$a_2$分别对应着从输入数据中提取出的温度和风速两方面特征；神经网络在获取到天气情况的特征$ \mathbf a $后，继续对其进行线性变换$ \mathbf a\cdot \mathbf w^2+ b^2 $（其中$b^2$是标量）和Sigmoid函数的激活操作，得到神经网络的最终输出$ y $，即神经网络此时预测的穿衣指数。

 \parinterval 图\ref{fig:weather-forward}实际上是神经网络的一种{\small\bfnew{计算图}}（Computation Graph）表示。现在很多深度学习框架都是把神经网络转化为计算图，这样可以把复杂的运算分解为简单的运算。通过对计算图中节点的遍历，可以方便地完成神经网络的计算。比如，可以对图中节点进行拓扑排序（由输入到输出），之后依次访问每个节点同时完成相应的计算。这也就实现了一个前向计算的过程。构建计算图的方式有很多，比如，动态图、静态图等。在\ref{sec5:para-training}节会进一步对计算图在模型参数训练中的应用进行介绍。

@@ -1176,7 +1177,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^1+\mathbf b^1)\cdot \mat
 \end{figure}
 %-------------------------------------------

-	\parinterval 如图\ref{fig:code-niutensor-three}是一个构造三层神经网络的程序示例。在第一层中，$ \mathbf x $作为输入，$ \mathbf h1 $作为输出，其中$ \mathbf h1={\rm{Sigmoid}}(\mathbf x\cdot \mathbf w1+\mathbf b1) $。在第二层中，$ \mathbf h1 $作为输入，$ \mathbf h2 $作为输出，其中$ \mathbf h2={\rm{Tanh}}(\mathbf h1\cdot \mathbf w2) $。在第三层中，$ \mathbf h2 $作为输入，$ \mathbf y $作为输出，其中$ \mathbf y={\rm{Relu}}(\mathbf h2\cdot \mathbf w3) $。$ \mathbf y $也会作为整个神经网络的输出。
+	\parinterval 如图\ref{fig:code-niutensor-three}是一个构造三层神经网络的程序示例。在第一层中，$ \mathbf x $作为输入，$ \mathbf h1 $作为输出，其中$ \mathbf h1={\rm{Sigmoid}}(\mathbf x\cdot \mathbf w1+\mathbf b1) $。在第二层中，$ \mathbf h1 $作为输入，$ \mathbf h2 $作为输出，其中$ \mathbf h2={\rm{Tanh}}(\mathbf h1\cdot \mathbf w2) $。在第三层中，$ \mathbf h2 $作为输入，$ \mathbf y $作为输出，其中$ \mathbf y={\rm{ReLU}}(\mathbf h2\cdot \mathbf w3) $。$ \mathbf y $也会作为整个神经网络的输出。

 %----------------------------------------------
 % 图
@@ -1202,7 +1203,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^1+\mathbf b^1)\cdot \mat
 %--5.4神经网络的参数训练-----------------------------------------
 \section{神经网络的参数训练}\index{Chapter5.4}

-\parinterval 简单来说，神经网络可以被看作是由变量和函数组成的表达式，例如：$ \mathbf y=\mathbf x+\mathbf b $、$ \mathbf y={\rm{Relu}}(\mathbf x\cdot \mathbf w+\mathbf b) $、$ \mathbf y={\rm{Sigmoid}}({\rm{Relu}}(\mathbf x\cdot \mathbf w^1+\mathbf b^1)\cdot \mathbf w^2+\mathbf b^2) $等等，其中的$ \mathbf x $和$ \mathbf y $作为输入和输出变量， $ \mathbf w $、$ \mathbf b $等其他变量作为{\small\sffamily\bfseries{模型参数}}（Model Parameters）。确定了函数表达式和模型参数，也就确定了神经网络模型。通常，表达式的形式需要系统开发者设计，而模型参数的数量有时会非常巨大，因此需要自动学习，这个过程也被称为模型学习或{\small\bfnew{训练}}（Training）。为了实现这个目标，通常会准备一定量的带有标准答案的数据，称之为{\small\sffamily\bfseries{有标注数据}}（Annotated Data/Labeled Data）。这些数据会用于对模型参数的学习，这也对应了统计模型中的参数估计过程。在机器学习中，一般把这种使用有标注数据进行统计模型参数训练的过程称为{\small\sffamily\bfseries{有指导的训练}}或{\small\sffamily\bfseries{有监督的训练}}（Supervised Training）。在本章中，如果没有特殊说明，模型训练都是指有监督的训练。那么神经网络内部是怎样利用有标注数据对参数进行训练的呢？
+\parinterval 简单来说，神经网络可以被看作是由变量和函数组成的表达式，例如：$ \mathbf y=\mathbf x+\mathbf b $、$ \mathbf y={\rm{ReLU}}(\mathbf x\cdot \mathbf w+\mathbf b) $、$ \mathbf y={\rm{Sigmoid}}({\rm{ReLU}}(\mathbf x\cdot \mathbf w^1+\mathbf b^1)\cdot \mathbf w^2+\mathbf b^2) $等等，其中的$ \mathbf x $和$ \mathbf y $作为输入和输出变量， $ \mathbf w $、$ \mathbf b $等其他变量作为{\small\sffamily\bfseries{模型参数}}（Model Parameters）。确定了函数表达式和模型参数，也就确定了神经网络模型。通常，表达式的形式需要系统开发者设计，而模型参数的数量有时会非常巨大，因此需要自动学习，这个过程也被称为模型学习或{\small\bfnew{训练}}（Training）。为了实现这个目标，通常会准备一定量的带有标准答案的数据，称之为{\small\sffamily\bfseries{有标注数据}}（Annotated Data/Labeled Data）。这些数据会用于对模型参数的学习，这也对应了统计模型中的参数估计过程。在机器学习中，一般把这种使用有标注数据进行统计模型参数训练的过程称为{\small\sffamily\bfseries{有指导的训练}}或{\small\sffamily\bfseries{有监督的训练}}（Supervised Training）。在本章中，如果没有特殊说明，模型训练都是指有监督的训练。那么神经网络内部是怎样利用有标注数据对参数进行训练的呢？

 \parinterval 为了回答这个问题，可以把模型参数的学习过程看作是一个优化问题，即找到一组参数，使得模型达到某种最优的状态。这个问题又可以被转化为两个新的问题：

@@ -1221,7 +1222,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^1+\mathbf b^1)\cdot \mat

 \parinterval 在神经网络的有监督学习中，训练模型的数据是由输入和正确答案所组成的样本构成的。假设有多个输入样本$ \{\mathbf x_1,\mathbf x_2,\dots,\mathbf x_n\} $，每一个$ \mathbf x_i $都对应一个正确答案$ \mathbf {\widetilde y}_i $，$ \{\mathbf x_i,\mathbf {\widetilde y}_i\} $就构成一个优化神经网络的{\small\sffamily\bfseries{训练数据集合}}（Training Data Set）。对于一个神经网络模型$ \mathbf y=f(\mathbf x) $，每个$ \mathbf x_i $也会有一个输出$ \mathbf y_i $。如果可以度量正确答案$ \mathbf {\widetilde y}_i $和神经网络输出$ \mathbf y_i $之间的偏差，进而通过调整网络参数减小这种偏差，就可以得到更好的模型。

-\parinterval 通常，可以通过设计{\small\sffamily\bfseries{损失函数}}（Loss Function）来度量正确答案$ \mathbf {\widetilde y}_i $和神经网络输出$ \mathbf y_i $之间的偏差。而这个损失函数往往充当训练的{\small\sffamily\bfseries{目标函数}}（Objective Function），神经网络训练就是通过不断调整神经网络内部的参数而使损失函数最小化。图\ref{fig:absolute-loss}展示了一个的绝对值损失函数的实例。
+\parinterval 通常，可以通过设计{\small\sffamily\bfseries{损失函数}}（Loss Function）来度量正确答案$ \mathbf {\widetilde y}_i $和神经网络输出$ \mathbf y_i $之间的偏差。而这个损失函数往往充当训练的{\small\sffamily\bfseries{目标函数}}（Objective Function），神经网络训练就是通过不断调整神经网络内部的参数而使损失函数最小化。图\ref{fig:absolute-loss}展示了一个绝对值损失函数的实例。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -1321,7 +1322,7 @@ J(\mathbf w)&=&L(\mathbf x_i,\mathbf {\widetilde y}_i;\mathbf w)
 \label{eqa1.31}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\noindent 由于每次只随机选取一个样本$(\mathbf x_i,\mathbf {\widetilde y}_i)$进行优化，这样更新的计算代价低，参数更新的速度大大加快，而且也适用于利用少量样本进行在线学习的情况\footnote{比如，训练数据不是一次给定的，而是随着模型的使用不断追加的。这时，需要不断地用新的训练样本更新模型，这种模式也被称作在{\scriptsize\bfnew{线学习}}（Online Learning）}。
+\noindent 由于每次只随机选取一个样本$(\mathbf x_i,\mathbf {\widetilde y}_i)$进行优化，这样更新的计算代价低，参数更新的速度大大加快，而且也适用于利用少量样本进行在线学习的情况\footnote{比如，训练数据不是一次给定的，而是随着模型的使用不断追加的。这时，需要不断地用新的训练样本更新模型，这种模式也被称作{\scriptsize\bfnew{在线学习}}（Online Learning）}。

 \parinterval 因为随机梯度下降算法每次优化的只是某一个样本上的损失，所以它的问题也非常明显：单个样本上的损失无法代表在全部样本上的损失，因此参数更新的效率低，方法收敛速度极慢。即使在目标函数为强凸函数的情况下，SGD仍旧无法做到线性收敛。

@@ -1360,7 +1361,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y

 \parinterval 数值微分根据导数的原始定义完成，根据公式可知，要得到损失函数在某个参数状态$ \mathbf w $下的梯度，可以将$ \mathbf w $增大或减小一点（$ \Delta \mathbf w $），例如，取$ \Delta \mathbf w=0.0001 $，之后观测损失函数的变化与$ \Delta \mathbf w $的比值。$ \Delta \mathbf w $的取值越小计算的结果越接近导数的真实值，但是对计算的精度要求越高。

-\parinterval 这种求梯度的方法很简单，但是计算量很大，求解速度非常慢，而且这种方法造成的{\small\sffamily\bfseries{截断误差}}（Truncation Error）和{\small\sffamily\bfseries{舍入误差}}（Round-off Error)。在网络比较复杂、参数量稍微有点大的模型上一般不会使用这种方法。
+\parinterval 这种求梯度的方法很简单，但是计算量很大，求解速度非常慢，而且这种方法会造成{\small\sffamily\bfseries{截断误差}}（Truncation Error）和{\small\sffamily\bfseries{舍入误差}}（Round-off Error）。在网络比较复杂、参数量稍微有点大的模型上一般不会使用这种方法。

 \parinterval 截断误差和舍入误差是如何造成的呢？数值微分方法求梯度时，需用极限或无穷过程来求得。然而计算机需要将求解过程化为一系列有限的算术运算和逻辑运算。这样就要对某种无穷过程进行``截断''，即仅保留无穷过程的前段有限序列而舍弃它的后段。这就带来截断误差；舍入误差，是指运算得到的近似值和精确值之间的差异。由于数值微分方法计算复杂函数的梯度问题时，经过无数次的近似，每一次近似都产生了舍入误差，在这样的情况下，误差会随着运算次数增加而积累得很大，最终得出没有意义的运算结果。实际上，截断误差和舍入误差在训练复杂神经网络中，特别是使用低精度计算时，也会出现，因此是实际系统研发中需要注意的问题。

@@ -1472,7 +1473,7 @@ w_{t+1}&=&w_t-\alpha v_t
 \begin{figure}[htp]
 \centering
 \input{./Chapter5/Figures/fig-sawtooth}
-\caption{梯度下降算法中的``锯齿''现象}
+\caption{Momentum梯度下降 vs 普通梯度下降}
 \label{fig:sawtooth }
 \end{figure}
 %-------------------------------------------
@@ -1530,7 +1531,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t

 \noindent 可以看到Adam 算法相当于在RMSProp算法中引入了Momentum算法中的动量项，这样做使得Adam算法兼具了Momentum算法和RMSProp算法的优点：既能使梯度更为``平滑''地更新，同时可以为神经网络中的每个参数设置不同的学习率。

-\parinterval  需要注意的是包括Adam在内的很多参数更新算法中的学习率都需要人为设置。而且模型学习的效果与学习率的设置关系极大，甚至在研发实际系统时工程师需要进行的实验，以得到最佳的模型。第六章还会具体介绍在机器翻译中参数更新学习率设置的策略。
+\parinterval  需要注意的是包括Adam在内的很多参数更新算法中的学习率都需要人为设置。而且模型学习的效果与学习率的设置关系极大，甚至在研发实际系统时工程师需要进行大量的实验，以得到最佳的模型。第六章还会具体介绍在机器翻译中参数更新学习率设置的策略。

 %--5.4.3 参数更新的并行化策略---------------------
 \subsection{参数更新的并行化策略}\index{Chapter5.4.3}
@@ -1555,9 +1556,9 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end {figure}
 %-------------------------------------------

-\parinterval  图\ref{fig:parallel}对比了同步更新和异步更新的区别，在这个例子中，使用4台设备对一个两层神经网络中的参数进行更新，其中使用了一个{\small\bfnew{参数服务器}}（Parameter Server，图中的G4）来保存最新的参数，不同设备（Worker，图中的G1、G2、G3）可以通过同步或者异步的方式访问参数服务器。图中的$ \mathbf w_o $和$ \mathbf w_h $分别代表输出层和隐藏层的全部参数，操作表示push(P) 设备向参数服务器传送梯度，操作fetch(F)表示参数服务器向设备传送更新后的参数。
+\parinterval  图\ref{fig:parallel}对比了同步更新和异步更新的区别，在这个例子中，使用4台设备对一个两层神经网络中的参数进行更新，其中使用了一个{\small\bfnew{参数服务器}}（Parameter Server，图中的G4）来保存最新的参数，不同设备（Worker，图中的G1、G2、G3）可以通过同步或者异步的方式访问参数服务器。图中的$ \mathbf w_o $和$ \mathbf w_h $分别代表输出层和隐藏层的全部参数，操作push(P) 表示设备向参数服务器传送梯度，操作fetch(F)表示参数服务器向设备传送更新后的参数。

-\parinterval  此外，在使用多个设备进行并行训练的时候，由于设备间带宽的限制，大量的数据传输会有较高的延时。对于复杂神经网络来说，设备间参数和梯度传递的时间消耗也会成为一个不得不考虑的因素。有时候，设备间数据传输的时间甚至比模型计算的时间都长，大大降低了并行度\cite{xiao2017fast}。对于这种问题，可以考虑对数据进行压缩或者减少传输的次数缓解问题。
+\parinterval  此外，在使用多个设备进行并行训练的时候，由于设备间带宽的限制，大量的数据传输会有较高的延时。对于复杂神经网络来说，设备间参数和梯度传递的时间消耗也会成为一个不得不考虑的因素。有时候，设备间数据传输的时间甚至比模型计算的时间都长，大大降低了并行度\cite{xiao2017fast}。对于这种问题，可以考虑对数据进行压缩或者减少传输的次数来缓解问题。
 %--5.4.4 梯度消失、梯度爆炸和稳定性训练---------------------
 \subsection{梯度消失、梯度爆炸和稳定性训练}\index{Chapter5.4.4}\label{sec:5.4.4}

@@ -1565,7 +1566,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 %--5.4.4.1梯度消失现象及解决方法---------------------
 \subsubsection{易于优化的激活函数}\index{Chapter5.4.4.1}

-\parinterval  网络训练过程中，如果每层网络的梯度都小于1，各层梯度的偏导数会与后面层传递而来的梯度相乘得到本层的梯度，并向前一层传递。该过程循环进行，最后导致梯度指数级地减小，这就产生了梯度消失现象。这种情况会导致神经网络层数较浅的部分梯度接近0。一般来说，产生很小梯度的原因是使用了类似于Sigmoid这样的激活函数，当输入的值过大或者过小的时候这类函数曲线会趋于直线，梯度近似为零。针对这个问题，主要的解决办法是使用更加易于优化的激活函数，比如，使用Relu代替Sigmoid和Tanh作为激活函数。
+\parinterval  网络训练过程中，如果每层网络的梯度都小于1，各层梯度的偏导数会与后面层传递而来的梯度相乘得到本层的梯度，并向前一层传递。该过程循环进行，最后导致梯度指数级地减小，这就产生了梯度消失现象。这种情况会导致神经网络层数较浅的部分梯度接近0。一般来说，产生很小梯度的原因是使用了类似于Sigmoid这样的激活函数，当输入的值过大或者过小的时候这类函数曲线会趋于直线，梯度近似为零。针对这个问题，主要的解决办法是使用更加易于优化的激活函数，比如，使用ReLU代替Sigmoid和Tanh作为激活函数。

 \parinterval  缓解梯度消失问题最直接的想法就是希望各层的偏导数大于或等于1。图\ref{fig:derivative1}展示了Sigmoid激活函数$ y=\frac{1}{1+e^{-x}}$的函数曲线和导函数曲线，如果使用Sigmoid作为激活函数，其梯度不可能超过0.25，这样经过链式求导之后，很容易发生梯度消失。
 %----------------------------------------------
@@ -1589,13 +1590,13 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end {figure}
 %-------------------------------------------

-\parinterval  Relu激活函数的思想也很简单，如果激活函数的导数为1，那么就不存在梯度消失爆炸的问题了。图\ref{fig:derivative3}展示了Relu激活函数$ y={\rm{max}}(0,x)$的函数曲线和导函数曲线。可以很容易看出，Relu函数的导数在正数部分是恒等于1的，因此在深层网络中使用Relu激活函数就不会产生很小的梯度。
+\parinterval  ReLU激活函数的思想也很简单，如果激活函数的导数为1，那么就不存在梯度消失爆炸的问题了。图\ref{fig:derivative3}展示了ReLU激活函数$ y={\rm{max}}(0,x)$的函数曲线和导函数曲线。可以很容易看出，ReLU函数的导数在正数部分是恒等于1的，因此在深层网络中使用ReLU激活函数就不会产生很小的梯度。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
 \centering
 \input{./Chapter5/Figures/fig-derivative3}
-\caption{Relu激活函数的函数曲线和导函数曲线}
+\caption{ReLU激活函数的函数曲线和导函数曲线}
 \label{fig:derivative3}
 \end {figure}
 %-------------------------------------------
@@ -1606,7 +1607,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t

 \parinterval  网络训练过程中，如果参数的初始值过大，而且每层网络的梯度都大于1，反向传播过程中，各层梯度的偏导数都会比较大，会导致梯度指数级地增长直至超出浮点数表示的范围，这就产生了梯度爆炸现象。如果发生这种情况，模型中离输入近的部分比离输入远的部分参数更新得更快，使网络变得非常不稳定。在极端情况下，模型的参数值变得非常大，甚至于溢出。针对梯度爆炸的问题，常用的解决办法为{\small\sffamily\bfseries{梯度裁剪}}（Gradient Clipping）。

-\parinterval    梯度剪切的思想是设置一个梯度剪切阈值。在更新梯度的时候，如果梯度超过这个阈值，就将其强制限制在这个范围之内。假设梯度为$ \mathbf g $，梯度剪切阈值为$ \theta $，梯度裁剪的公式为
+\parinterval    梯度裁剪的思想是设置一个梯度裁剪阈值。在更新梯度的时候，如果梯度超过这个阈值，就将其强制限制在这个范围之内。假设梯度为$ \mathbf g $，梯度裁剪阈值为$ \theta $，梯度裁剪的公式为：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 \mathbf g&=&{\rm{min}}(\frac{\theta}{\Vert \mathbf g\Vert},1)\mathbf g
@@ -1647,7 +1648,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \label{eqa1.44}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\parinterval  相比较于简单的多层堆叠的结构，残差网络提供了跨层连接结构。这种结构在反向传播中有很大的好处，比如，对于$ \mathbf x_l $处的梯度可以如下进行计算：
+\parinterval  相比较于简单的多层堆叠的结构，残差网络提供了跨层连接结构。这种结构在反向传播中有很大的好处，比如，对于$ \mathbf x_l $处的梯度可以进行如下计算：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 \frac{\partial L}{\partial \mathbf x_l}&=&\frac{\partial L}{\partial \mathbf x_{l+1}} \cdot  \frac{\partial \mathbf x_{l+1}}{\partial \mathbf x_l}\nonumber\\
@@ -1664,7 +1665,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t

 \parinterval  {\small\sffamily\bfseries{正则化}}（Regularization）是常见的缓解过拟合问题的手段，通过在损失函数中加上用来刻画模型复杂程度的正则项来惩罚过度复杂的模型，从而避免神经网络过度学习造成过拟合。引入正则化处理之后目标函数变为$ J(\mathbf w)+\lambda R(\mathbf w) $，其中$ J(\mathbf w) $是原来的代价函数，$ R(\mathbf w) $即为正则项，$ \lambda $用来调节正则项对结果影响的程度。

-\parinterval  过拟合的模型通常会表现为部分非零参数过多或者参数的值过大。这种参数产生的原因在于模型需要复杂的参数才能匹配样本中的个别现象甚至噪声。基于此，常见的正则化方法有L1正则化和L2正则化，其命名方式是由$ R(\mathbf w) $的计算形式来决定的。在L1正则化中，$ R(\mathbf w) $即为参数$ w $的$ l_1 $范数，即$ R(\mathbf w) ={\Vert \mathbf w\Vert}_1=\sum_{i=1}^{n}{\vert w_i\vert} $；在L2正则化中，$ R(\mathbf w) $即为参数$ w $的$ l_2 $范数的平方，即$ R(\mathbf w) =({\Vert \mathbf w\Vert}_2)^2=\sum_{i=1}^{n}{w_i^2} $。L1正则化中的正则项衡量了模型权数中的绝对值大小，倾向于生成值为0的参数，从而让参数变得更加稀疏；而L2正则化由于平方的加入，当参数中的某一项小到一定程度，比如0.001的时候，参数的平方结果已经可以忽略不计了，因此L2正则化会倾向生成很小的参数，在这种情况下，即便训练数据中含有少量随机噪音，模型也不太容易通过增加个别参数的值来对噪声进行过渡拟合，即提高了模型的抗扰动能力。
+\parinterval  过拟合的模型通常会表现为部分非零参数过多或者参数的值过大。这种参数产生的原因在于模型需要复杂的参数才能匹配样本中的个别现象甚至噪声。基于此，常见的正则化方法有L1正则化和L2正则化，其命名方式是由$ R(\mathbf w) $的计算形式来决定的。在L1正则化中，$ R(\mathbf w) $即为参数$ \mathbf w $的$ l_1 $范数，即$ R(\mathbf w) ={\Vert \mathbf w\Vert}_1=\sum_{i=1}^{n}{\vert w_i\vert} $；在L2正则化中，$ R(\mathbf w) $即为参数$ \mathbf w $的$ l_2 $范数的平方，即$ R(\mathbf w) =({\Vert \mathbf w\Vert}_2)^2=\sum_{i=1}^{n}{w_i^2} $。L1正则化中的正则项衡量了模型权重的绝对值大小，倾向于生成值为0的参数，从而让参数变得更加稀疏；而L2正则化由于平方的加入，当参数中的某一项小到一定程度，比如0.001的时候，参数的平方结果已经可以忽略不计了，因此L2正则化会倾向生成很小的参数，在这种情况下，即便训练数据中含有少量随机噪声，模型也不太容易通过增加个别参数的值来对噪声进行过度拟合，即提高了模型的抗扰动能力。

 \parinterval  此外，在第六章即将介绍的Dropout和Label Smoothing方法也可以被看作是一种正则化操作。它们都可以提高模型在未见数据上的泛化能力。
 %--5.4.6 反向传播---------------------
@@ -1980,7 +1981,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \label{eqa1.62}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\parinterval  输出$ \mathbf y $是词表上的一个分布，通过$ w_i $可以索引到相应的概率$ {\rm P}(w_i|w_{i-1},w_{i-2},w_{i-3}) $。\\$ \mathbf U $、$ \mathbf H $和$ \mathbf d $是模型的参数。从结构上看，FNNLM主要有三层：1）词的分布式表示层，即把输入的离散的单词变为分布式表示对应的实数向量；2）隐藏层，即将得到的词的分布式表示进行线性和非线性变换；3）输出层，根据隐藏层的输出预测单词的概率分布。这三层堆叠在一起构成了整个网络，而且也可以加入从词的分布式表示直接到输出层的连接（红色虚线箭头）。
+\parinterval  输出$ \mathbf y $是词表上的一个分布，通过单词$ w_i $可以索引到对应概率$ {\rm P}(w_i|w_{i-1},w_{i-2},\\w_{i-3}) $。$ \mathbf U $、$ \mathbf H $和$ \mathbf d $是模型的参数。从结构上看，FNNLM主要有三层：1）词的分布式表示层，即把输入的离散的单词变为分布式表示对应的实数向量；2）隐藏层，即将得到的词的分布式表示进行线性和非线性变换；3）输出层，根据隐藏层的输出预测单词的概率分布。这三层堆叠在一起构成了整个网络，而且也可以加入从词的分布式表示直接到输出层的连接（红色虚线箭头）。

 \parinterval  值得注意的是，在FNNLM中，单词已经不再是一个孤立的符号串，而是被表示为一个实数向量。这样，两个单词之间可以通过向量计算某种相似度或距离。这导致相似的单词会具有相似的分布，进而缓解$n$-gram语言模型的问题\ \dash \ 明明意思很相近的两个词但是概率估计的结果差异性却很大。
 %----------------------------------------------
@@ -2026,7 +2027,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 %--5.5.1.2基于循环神经网络的语言模型---------------------
 \subsubsection{基于循环神经网络的语言模型}\index{Chapter5.5.1.2}

-\parinterval  FNNLM模型固然有效，但是和传统的$n$-gram语言模型一样需要依赖有限上下文假设，也就是$ w_i $的生成概率只依赖于之前的$ n-1 $个单词。很自然的一个想法是引入更大范围的历史信息，这样可以扑捉单词间的长距离依赖。
+\parinterval  FNNLM模型固然有效，但是和传统的$n$-gram语言模型一样需要依赖有限上下文假设，也就是$ w_i $的生成概率只依赖于之前的$ n-1 $个单词。很自然的一个想法是引入更大范围的历史信息，这样可以捕捉单词间的长距离依赖。

 \parinterval  对于这个问题，可以通过{\small\sffamily\bfseries{循环神经网络}}（Recurrent Neural Network，或RNN）进行求解。通过引入循环单元这种特殊的结构，循环神经网络可以对任意长度的历史进行建模，因此在一定程度上解决了传统$n$-gram语言模型有限历史的问题。正是基于这个优点，{\small\sffamily\bfseries{循环神经网络语言模型}}（RNNLM）应运而生\cite{mikolov2010recurrent}。

@@ -2063,7 +2064,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t

 \parinterval  通过引入记忆历史的能力，RNNLM缓解了$n$-gram模型中有限上下文的局限性，但依旧存在一些问题。随着序列变长，不同单词之间信息传递路径变长，信息传递的效率变低。对于长序列，很难通过很多次的循环单元操作保留很长的历史信息。过长的序列还容易引起梯度消失和梯度爆炸问题（详见\ref{sec:5.4.4}节），增加模型训练的难度。

-\parinterval  对于这个问题，研究者又提出了一种新的结构$\ \dash \ ${\small\bfnew{自注意力机制}}（Self-Attention Mechanism）。自注意力是一种特殊的神经网络结构，它可以对序列上任意两个词的相互作用直接进行建模，这样也就避免了循环神经网络中随着距离变长信息传递步骤增多的缺陷。在自然语言处理领域，自注意力机制被成功的应用在机器翻译，形成了著名的Transformer模型\cite{NIPS2017_7181}。第六章会系统的介绍自注意力机制和Transformer模型。
+\parinterval  对于这个问题，研究者又提出了一种新的结构$\ \dash \ ${\small\bfnew{自注意力机制}}（Self-Attention Mechanism）。自注意力是一种特殊的神经网络结构，它可以对序列上任意两个词的相互作用直接进行建模，这样也就避免了循环神经网络中随着距离变长信息传递步骤增多的缺陷。在自然语言处理领域，自注意力机制被成功地应用在机器翻译，形成了著名的Transformer模型\cite{NIPS2017_7181}。第六章会系统地介绍自注意力机制和Transformer模型。

 \parinterval  这里，先简单了解一下基于Transformer的语言模型结构（图\ref{fig:transformer-LM}）。与FNNLM\\和RNNLM一样，Transformer首先对输入单词进行分布式表示，同时加上每个位置的编码构成了整个模型的输入（蓝色方框）。之后，利用自注意力机制对输入的向量进行处理（绿色方框）。自注意力的结果会被送入一个前馈神经网络，之后再送给Softmax输出层（橙色方框）。
 %----------------------------------------------
@@ -2087,7 +2088,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \label{eqa1.65}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\parinterval  本质上，PPL反映了语言模型对序列可能性预测能力的一种评估。因为$ w_1\dots w_m $\\是真实的自然语言，``完美''的模型会得到$ {\rm P} (w_1\dots w_m)=1 $，它对应了最低的困惑度$ {\rm{PPL}}=1$。这说明模型可以完美的对词序列出现的可能性进行预测。当然，真实的语言模型是无法达到$ {\rm{PPL}}=1$的，比如，在著名的Penn Treebank（PTB）数据上最好的语言模型的PPL值也只能到达35左右。可见自然语言处理任务的困难程度。
+\parinterval  本质上，PPL反映了语言模型对序列可能性预测能力的一种评估。因为$ w_1\dots w_m $\\是真实的自然语言，``完美''的模型会得到$ {\rm P} (w_1\dots w_m)=1 $，它对应了最低的困惑度$ {\rm{PPL}}=1$。这说明模型可以完美地对词序列出现的可能性进行预测。当然，真实的语言模型是无法达到$ {\rm{PPL}}=1$的，比如，在著名的Penn Treebank（PTB）数据上最好的语言模型的PPL值也只能到达35左右。可见自然语言处理任务的困难程度。
 %--5.5.2单词表示模型---------------------
 \subsection{单词表示模型}\index{Chapter5.5.2}


--- a/Book/Chapter6/Chapter6.tex
+++ b/Book/Chapter6/Chapter6.tex
@@ -8,7 +8,7 @@
 \renewcommand\tablename{表}%将table改为表
 \definecolor{ublue}{rgb}{0.152,0.250,0.545}
 \definecolor{ugreen}{rgb}{0,0.5,0}
-\chapterimage{chapter_head_1} % Chapter heading image
+\chapterimage{fig-NEU-7.jpg} % Chapter heading image
 %------------------------------------------------

 %公式1.7之后往后串一个

--- a/Book/ChapterAppend/ChapterAppend.tex
+++ b/Book/ChapterAppend/ChapterAppend.tex
@@ -8,7 +8,7 @@
 %----------------------------------------------------------------------------------------
 \renewcommand\figurename{图}%将figure改为图
 \renewcommand\tablename{表}%将table改为表
-\chapterimage{chapter_head_1} % Chapter heading image
+\chapterimage{fig-NEU-9.jpg} % Chapter heading image
 %------------------------------------------------

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%第一章附录

--- a/Book/Figures/010813165162.pdf.pdf
+++ b/Book/Figures/010813165162.pdf.pdf
--- a/Book/Figures/fig-NEU-1.jpg
+++ b/Book/Figures/fig-NEU-1.jpg
--- a/Book/Figures/fig-NEU-10.jpg
+++ b/Book/Figures/fig-NEU-10.jpg
--- a/Book/Figures/fig-NEU-2.jpg
+++ b/Book/Figures/fig-NEU-2.jpg
--- a/Book/Figures/fig-NEU-3.jpg
+++ b/Book/Figures/fig-NEU-3.jpg
--- a/Book/Figures/fig-NEU-4.jpg
+++ b/Book/Figures/fig-NEU-4.jpg
--- a/Book/Figures/fig-NEU-5.jpg
+++ b/Book/Figures/fig-NEU-5.jpg
--- a/Book/Figures/fig-NEU-6.jpg
+++ b/Book/Figures/fig-NEU-6.jpg
--- a/Book/Figures/fig-NEU-7.jpg
+++ b/Book/Figures/fig-NEU-7.jpg
--- a/Book/Figures/fig-NEU-8.jpg
+++ b/Book/Figures/fig-NEU-8.jpg
--- a/Book/Figures/fig-NEU-9.jpg
+++ b/Book/Figures/fig-NEU-9.jpg
--- a/Book/Figures/fig-cover.jpg
+++ b/Book/Figures/fig-cover.jpg
--- a/Book/Figures/placeholder.jpg
+++ b/Book/Figures/placeholder.jpg
--- a/Book/bibliography.bib
+++ b/Book/bibliography.bib
@@ -2706,6 +2706,87 @@ year ={2008},
  //biburl    = {https://dblp.org/rec/conf/wmt/ZollmannV06.bib},
  //bibsource = {dblp computer science bibliography, https://dblp.org}
 }
+@article{brown1992class,
+  title={Class-based n-gram models of natural language},
+  author={Brown,
+              Peter F and 
+              Desouza,
+              Peter V and 
+              Mercer,
+              Robert L and
+              Pietra,
+              Vincent J Della and
+              Lai,
+              Jenifer C},
+  journal={Computational linguistics},
+  volume={18},
+  number={4},
+  pages={467--479},
+  year={1992},
+  publisher={MIT Press}
+}
+@article{bengio2003a,
+  title={A neural probabilistic language model},
+  author={Bengio,
+             Yoshua and 
+             Ducharme,
+             Rejean and 
+             Vincent,
+             Pascal and
+             Janvin,
+             Christian},
+  journal={Journal of Machine Learning Research},
+  volume={3},
+  number={6},
+  pages={1137--1155},
+  year={2003}
+}
+@inproceedings{mikolov2012context,
+  title={Context dependent recurrent neural network language model},
+  author={Mikolov,
+            Tomas and
+            Zweig,
+            Geoffrey},
+  booktitle={2012 IEEE Spoken Language Technology Workshop (SLT)},
+  pages={234--239},
+  year={2012},
+  organization={IEEE}
+}
+@article{zaremba2014recurrent,
+  title={Recurrent Neural Network Regularization},
+  author={Zaremba,
+             Wojciech and 
+             Sutskever,
+             Ilya and 
+             Vinyals,
+             Oriol},
+  journal={arXiv: Neural and Evolutionary Computing},
+  year={2014}
+}
+@article{zilly2016recurrent,
+  title={Recurrent Highway Networks},
+  author={Zilly,
+            Julian and
+            Srivastava,
+            Rupesh Kumar and
+            Koutnik,
+            Jan and 
+            Schmidhuber,
+            Jurgen},
+  journal={arXiv: Learning},
+  year={2016}
+}
+@article{merity2017regularizing,
+  title={Regularizing and optimizing LSTM language models},
+  author={Merity,
+             Stephen and
+             Keskar,
+             Nitish Shirish and
+             Socher,
+             Richard},
+  journal={arXiv: Computation and Language},
+  year={2017}
+}
 %%%%% chapter 5------------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@@ -3955,4 +4036,658 @@ pages ={157-166},
 %%%%% chapter 6----------------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%% chapter 7------------------------------------------------------
+@article{姚树杰2011基于句对质量和覆盖度的统计机器翻译训练语料选取,
+  title={基于句对质量和覆盖度的统计机器翻译训练语料选取},
+  author={姚树杰 and 肖桐 and 朱靖波},
+  journal={中文信息学报},
+  volume={25},
+  number={2},
+  pages={72--78},
+  year={2011},
+}
+%%%%%%%%%%%%%%%
+@misc{provilkov2019bpedropout,
+    title={BPE-Dropout: Simple and Effective Subword Regularization},
+    author={Ivan Provilkov and Dmitrii Emelianenko and Elena Voita},
+    year={2019},
+    eprint={1910.13267},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+%%%%%%%%%%%%%%%%%%%
+@article{DBLP:journals/corr/SennrichHB15,
+  author    = {Rico Sennrich and
+               Barry Haddow and
+               Alexandra Birch},
+  title     = {Neural Machine Translation of Rare Words with Subword Units},
+  journal   = {CoRR},
+  volume    = {abs/1508.07909},
+  year      = {2015},
+  url       = {http://arxiv.org/abs/1508.07909},
+  archivePrefix = {arXiv},
+  eprint    = {1508.07909},
+  timestamp = {Mon, 13 Aug 2018 16:47:17 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/SennrichHB15.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/abs-1207-0580,
+  author    = {Geoffrey E. Hinton and
+               Nitish Srivastava and
+               Alex Krizhevsky and
+               Ilya Sutskever and
+               Ruslan Salakhutdinov},
+  title     = {Improving neural networks by preventing co-adaptation of feature detectors},
+  journal   = {CoRR},
+  volume    = {abs/1207.0580},
+  year      = {2012},
+  url       = {http://arxiv.org/abs/1207.0580},
+  archivePrefix = {arXiv},
+  eprint    = {1207.0580},
+  timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1207-0580.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{Hornic1989Multilayer,
+  title={Multilayer feedforward networks are universal approximators},
+  author={Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
+  journal={Neural Networks},
+  volume={2},
+  number={5},
+  pages={359--366},
+  year={1989},
+}
+
+@article{DBLP:journals/corr/abs-1809-10853,
+  author    = {Alexei Baevski and
+               Michael Auli},
+  title     = {Adaptive Input Representations for Neural Language Modeling},
+  journal   = {CoRR},
+  volume    = {abs/1809.10853},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1809.10853},
+  archivePrefix = {arXiv},
+  eprint    = {1809.10853},
+  timestamp = {Fri, 05 Oct 2018 11:34:52 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1809-10853.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{Stahlberg2019OnNS,
+  title={On NMT Search Errors and Model Errors: Cat Got Your Tongue?},
+  author={Felix Stahlberg and Bill Byrne},
+  booktitle={EMNLP/IJCNLP},
+  year={2019}
+}
+
+@article{DBLP:journals/corr/abs-1810-08398,
+  author    = {Mingbo Ma and
+               Liang Huang and
+               Hao Xiong and
+               Kaibo Liu and
+               Chuanqiang Zhang and
+               Zhongjun He and
+               Hairong Liu and
+               Xing Li and
+               Haifeng Wang},
+  title     = {{STACL:} Simultaneous Translation with Integrated Anticipation and
+               Controllable Latency},
+  journal   = {CoRR},
+  volume    = {abs/1810.08398},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1810.08398},
+  archivePrefix = {arXiv},
+  eprint    = {1810.08398},
+  timestamp = {Thu, 01 Nov 2018 11:22:30 +0100},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1810-08398.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/StahlbergHSB17,
+  author    = {Felix Stahlberg and
+               Eva Hasler and
+               Danielle Saunders and
+               Bill Byrne},
+  title     = {{SGNMT} - {A} Flexible {NMT} Decoding Platform for Quick Prototyping
+               of New Models and Search Strategies},
+  journal   = {CoRR},
+  volume    = {abs/1707.06885},
+  year      = {2017},
+  url       = {http://arxiv.org/abs/1707.06885},
+  archivePrefix = {arXiv},
+  eprint    = {1707.06885},
+  timestamp = {Mon, 13 Aug 2018 16:48:37 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/StahlbergHSB17.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/SennrichHB16,
+  author    = {Rico Sennrich and
+               Barry Haddow and
+               Alexandra Birch},
+  title     = {Edinburgh Neural Machine Translation Systems for {WMT} 16},
+  journal   = {CoRR},
+  volume    = {abs/1606.02891},
+  year      = {2016},
+  url       = {http://arxiv.org/abs/1606.02891},
+  archivePrefix = {arXiv},
+  eprint    = {1606.02891},
+  timestamp = {Mon, 13 Aug 2018 16:46:23 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/SennrichHB16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/wmt/LiLXLLLWZXWFCLL19,
+  author    = {Bei Li and
+               Yinqiao Li and
+               Chen Xu and
+               Ye Lin and
+               Jiqiang Liu and
+               Hui Liu and
+               Ziyang Wang and
+               Yuhao Zhang and
+               Nuo Xu and
+               Zeyang Wang and
+               Kai Feng and
+               Hexuan Chen and
+               Tengbo Liu and
+               Yanyang Li and
+               Qiang Wang and
+               Tong Xiao and
+               Jingbo Zhu},
+  editor    = {Ondrej Bojar and
+               Rajen Chatterjee and
+               Christian Federmann and
+               Mark Fishel and
+               Yvette Graham and
+               Barry Haddow and
+               Matthias Huck and
+               Antonio Jimeno{-}Yepes and
+               Philipp Koehn and
+               Andr{\'{e}} Martins and
+               Christof Monz and
+               Matteo Negri and
+               Aur{\'{e}}lie N{\'{e}}v{\'{e}}ol and
+               Mariana L. Neves and
+               Matt Post and
+               Marco Turchi and
+               Karin Verspoor},
+  title     = {The NiuTrans Machine Translation Systems for {WMT19}},
+  booktitle = {Proceedings of the Fourth Conference on Machine Translation, {WMT}
+               2019, Florence, Italy, August 1-2, 2019 - Volume 2: Shared Task Papers,
+               Day 1},
+  pages     = {257--266},
+  publisher = {Association for Computational Linguistics},
+  year      = {2019},
+  url       = {https://doi.org/10.18653/v1/w19-5325},
+  doi       = {10.18653/v1/w19-5325},
+  timestamp = {Tue, 28 Jan 2020 10:30:56 +0100},
+  biburl    = {https://dblp.org/rec/conf/wmt/LiLXLLLWZXWFCLL19.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/aaai/DabreF19,
+  author    = {Raj Dabre and
+               Atsushi Fujita},
+  title     = {Recurrent Stacking of Layers for Compact Neural Machine Translation
+               Models},
+  booktitle = {The Thirty-Third {AAAI} Conference on Artificial Intelligence, {AAAI}
+               2019, The Thirty-First Innovative Applications of Artificial Intelligence
+               Conference, {IAAI} 2019, The Ninth {AAAI} Symposium on Educational
+               Advances in Artificial Intelligence, {EAAI} 2019, Honolulu, Hawaii,
+               USA, January 27 - February 1, 2019},
+  pages     = {6292--6299},
+  publisher = {{AAAI} Press},
+  year      = {2019},
+  url       = {https://doi.org/10.1609/aaai.v33i01.33016292},
+  doi       = {10.1609/aaai.v33i01.33016292},
+  timestamp = {Wed, 25 Sep 2019 11:05:09 +0200},
+  biburl    = {https://dblp.org/rec/conf/aaai/DabreF19.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/abs-1712-05877,
+  author    = {Benoit Jacob and
+               Skirmantas Kligys and
+               Bo Chen and
+               Menglong Zhu and
+               Matthew Tang and
+               Andrew G. Howard and
+               Hartwig Adam and
+               Dmitry Kalenichenko},
+  title     = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only
+               Inference},
+  journal   = {CoRR},
+  volume    = {abs/1712.05877},
+  year      = {2017},
+  url       = {http://arxiv.org/abs/1712.05877},
+  archivePrefix = {arXiv},
+  eprint    = {1712.05877},
+  timestamp = {Mon, 13 Aug 2018 16:48:27 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1712-05877.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/abs-1910-10485,
+  author    = {Gabriele Prato and
+               Ella Charlaix and
+               Mehdi Rezagholizadeh},
+  title     = {Fully Quantized Transformer for Improved Translation},
+  journal   = {CoRR},
+  volume    = {abs/1910.10485},
+  year      = {2019},
+  url       = {http://arxiv.org/abs/1910.10485},
+  archivePrefix = {arXiv},
+  eprint    = {1910.10485},
+  timestamp = {Fri, 25 Oct 2019 14:59:26 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1910-10485.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/abs-1801-05122,
+  author    = {Xiangwen Zhang and
+               Jinsong Su and
+               Yue Qin and
+               Yang Liu and
+               Rongrong Ji and
+               Hongji Wang},
+  title     = {Asynchronous Bidirectional Decoding for Neural Machine Translation},
+  journal   = {CoRR},
+  volume    = {abs/1801.05122},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1801.05122},
+  archivePrefix = {arXiv},
+  eprint    = {1801.05122},
+  timestamp = {Mon, 15 Jul 2019 14:17:41 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1801-05122.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/abs-1809-00069,
+  author    = {Liang Huang and
+               Kai Zhao and
+               Mingbo Ma},
+  title     = {When to Finish? Optimal Beam Search for Neural Text Generation (modulo
+               beam size)},
+  journal   = {CoRR},
+  volume    = {abs/1809.00069},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1809.00069},
+  archivePrefix = {arXiv},
+  eprint    = {1809.00069},
+  timestamp = {Fri, 05 Oct 2018 11:34:52 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1809-00069.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/jcss/FreundS97,
+  author    = {Yoav Freund and
+               Robert E. Schapire},
+  title     = {A Decision-Theoretic Generalization of On-Line Learning and an Application
+               to Boosting},
+  journal   = {J. Comput. Syst. Sci.},
+  volume    = {55},
+  number    = {1},
+  pages     = {119--139},
+  year      = {1997},
+  url       = {https://doi.org/10.1006/jcss.1997.1504},
+  doi       = {10.1006/jcss.1997.1504},
+  timestamp = {Wed, 14 Nov 2018 10:33:59 +0100},
+  biburl    = {https://dblp.org/rec/journals/jcss/FreundS97.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/acl/XiaoZZW10,
+  author    = {Tong Xiao and
+               Jingbo Zhu and
+               Muhua Zhu and
+               Huizhen Wang},
+  editor    = {Jan Hajic and
+               Sandra Carberry and
+               Stephen Clark},
+  title     = {Boosting-Based System Combination for Machine Translation},
+  booktitle = {{ACL} 2010, Proceedings of the 48th Annual Meeting of the Association
+               for Computational Linguistics, July 11-16, 2010, Uppsala, Sweden},
+  pages     = {739--748},
+  publisher = {The Association for Computer Linguistics},
+  year      = {2010},
+  url       = {https://www.aclweb.org/anthology/P10-1076/},
+  timestamp = {Fri, 13 Sep 2019 13:00:43 +0200},
+  biburl    = {https://dblp.org/rec/conf/acl/XiaoZZW10.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/icassp/SimBGSW07,
+  author    = {Khe Chai Sim and
+               William J. Byrne and
+               Mark J. F. Gales and
+               Hichem Sahbi and
+               Philip C. Woodland},
+  title     = {Consensus Network Decoding for Statistical Machine Translation System
+               Combination},
+  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech,
+               and Signal Processing, {ICASSP} 2007, Honolulu, Hawaii, USA, April
+               15-20, 2007},
+  pages     = {105--108},
+  publisher = {{IEEE}},
+  year      = {2007},
+  url       = {https://doi.org/10.1109/ICASSP.2007.367174},
+  doi       = {10.1109/ICASSP.2007.367174},
+  timestamp = {Wed, 16 Oct 2019 14:14:52 +0200},
+  biburl    = {https://dblp.org/rec/conf/icassp/SimBGSW07.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/acl/RostiMS07,
+  author    = {Antti{-}Veikko I. Rosti and
+               Spyridon Matsoukas and
+               Richard M. Schwartz},
+  editor    = {John A. Carroll and
+               Antal van den Bosch and
+               Annie Zaenen},
+  title     = {Improved Word-Level System Combination for Machine Translation},
+  booktitle = {{ACL} 2007, Proceedings of the 45th Annual Meeting of the Association
+               for Computational Linguistics, June 23-30, 2007, Prague, Czech Republic},
+  publisher = {The Association for Computational Linguistics},
+  year      = {2007},
+  url       = {https://www.aclweb.org/anthology/P07-1040/},
+  timestamp = {Mon, 16 Sep 2019 13:46:41 +0200},
+  biburl    = {https://dblp.org/rec/conf/acl/RostiMS07.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/wmt/RostiZMS08,
+  author    = {Antti{-}Veikko I. Rosti and
+               Bing Zhang and
+               Spyros Matsoukas and
+               Richard M. Schwartz},
+  editor    = {Chris Callison{-}Burch and
+               Philipp Koehn and
+               Christof Monz and
+               Josh Schroeder and
+               Cameron S. Fordyce},
+  title     = {Incremental Hypothesis Alignment for Building Confusion Networks with
+               Application to Machine Translation System Combination},
+  booktitle = {Proceedings of the Third Workshop on Statistical Machine Translation,
+               WMT@ACL 2008, Columbus, Ohio, USA, June 19, 2008},
+  pages     = {183--186},
+  publisher = {Association for Computational Linguistics},
+  year      = {2008},
+  url       = {https://www.aclweb.org/anthology/W08-0329/},
+  timestamp = {Fri, 13 Sep 2019 13:08:46 +0200},
+  biburl    = {https://dblp.org/rec/conf/wmt/RostiZMS08.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}

+@article{DBLP:journals/corr/LiMJ16,
+  author    = {Jiwei Li and
+               Will Monroe and
+               Dan Jurafsky},
+  title     = {A Simple, Fast Diverse Decoding Algorithm for Neural Generation},
+  journal   = {CoRR},
+  volume    = {abs/1611.08562},
+  year      = {2016},
+  url       = {http://arxiv.org/abs/1611.08562},
+  archivePrefix = {arXiv},
+  eprint    = {1611.08562},
+  timestamp = {Mon, 13 Aug 2018 16:48:46 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/LiMJ16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/emnlp/TrombleKOM08,
+  author    = {Roy Tromble and
+               Shankar Kumar and
+               Franz Josef Och and
+               Wolfgang Macherey},
+  title     = {Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation},
+  booktitle = {2008 Conference on Empirical Methods in Natural Language Processing,
+               {EMNLP} 2008, Proceedings of the Conference, 25-27 October 2008, Honolulu,
+               Hawaii, USA, {A} meeting of SIGDAT, a Special Interest Group of the
+               {ACL}},
+  pages     = {620--629},
+  publisher = {{ACL}},
+  year      = {2008},
+  url       = {https://www.aclweb.org/anthology/D08-1065/},
+  timestamp = {Fri, 13 Sep 2019 13:08:45 +0200},
+  biburl    = {https://dblp.org/rec/conf/emnlp/TrombleKOM08.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/aaai/SuTXJSL17,
+  author    = {Jinsong Su and
+               Zhixing Tan and
+               Deyi Xiong and
+               Rongrong Ji and
+               Xiaodong Shi and
+               Yang Liu},
+  editor    = {Satinder P. Singh and
+               Shaul Markovitch},
+  title     = {Lattice-Based Recurrent Neural Network Encoders for Neural Machine
+               Translation},
+  booktitle = {Proceedings of the Thirty-First {AAAI} Conference on Artificial Intelligence,
+               February 4-9, 2017, San Francisco, California, {USA}},
+  pages     = {3302--3308},
+  publisher = {{AAAI} Press},
+  year      = {2017},
+  url       = {http://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14320},
+  timestamp = {Sun, 31 Mar 2019 12:09:37 +0200},
+  biburl    = {https://dblp.org/rec/conf/aaai/SuTXJSL17.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/acl/BirdL04,
+  author    = {Steven Bird and
+               Edward Loper},
+  title     = {{NLTK:} The Natural Language Toolkit},
+  booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational
+               Linguistics, Barcelona, Spain, July 21-26, 2004 - Poster and Demonstration},
+  publisher = {{ACL}},
+  year      = {2004},
+  url       = {https://www.aclweb.org/anthology/P04-3031/},
+  timestamp = {Wed, 18 Sep 2019 12:15:54 +0200},
+  biburl    = {https://dblp.org/rec/conf/acl/BirdL04.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{wang-etal-2018-dynamic,
+    title = "Dynamic Sentence Sampling for Efficient Training of Neural Machine Translation",
+    author = "Wang, Rui  and
+      Utiyama, Masao  and
+      Sumita, Eiichiro",
+    booktitle = "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
+    month = jul,
+    year = "2018",
+    address = "Melbourne, Australia",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/P18-2048",
+    doi = "10.18653/v1/P18-2048",
+    pages = "298--304",
+    abstract = "Traditional Neural machine translation (NMT) involves a fixed training procedure where each sentence is sampled once during each epoch. In reality, some sentences are well-learned during the initial few epochs; however, using this approach, the well-learned sentences would continue to be trained along with those sentences that were not well learned for 10-30 epochs, which results in a wastage of time. Here, we propose an efficient method to dynamically sample the sentences in order to accelerate the NMT training. In this approach, a weight is assigned to each sentence based on the measured difference between the training costs of two iterations. Further, in each epoch, a certain percentage of sentences are dynamically sampled according to their weights. Empirical results based on the NIST Chinese-to-English and the WMT English-to-German tasks show that the proposed method can significantly accelerate the NMT training and improve the NMT performance.",
+}
+
+@inproceedings{garciamartinez:hal-01433161,
+  TITLE = {{Factored Neural Machine Translation Architectures}},
+  AUTHOR = {Garcia-Martinez, Mercedes and Barrault, Lo{\"i}c and Bougares, Fethi},
+  URL = {https://hal.archives-ouvertes.fr/hal-01433161},
+  BOOKTITLE = {{International Workshop on Spoken Language Translation (IWSLT'16)}},
+  ADDRESS = {Seattle, United States},
+  YEAR = {2016},
+  PDF = {https://hal.archives-ouvertes.fr/hal-01433161/file/FNMTiwslt2016.pdf},
+  HAL_ID = {hal-01433161},
+  HAL_VERSION = {v1},
+}
+
+@article{DBLP:journals/corr/JeanCMB14,
+  author    = {S{\'{e}}bastien Jean and
+               Kyunghyun Cho and
+               Roland Memisevic and
+               Yoshua Bengio},
+  title     = {On Using Very Large Target Vocabulary for Neural Machine Translation},
+  journal   = {CoRR},
+  volume    = {abs/1412.2007},
+  year      = {2014},
+  url       = {http://arxiv.org/abs/1412.2007},
+  archivePrefix = {arXiv},
+  eprint    = {1412.2007},
+  timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/JeanCMB14.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/LuongM16,
+  author    = {Minh{-}Thang Luong and
+               Christopher D. Manning},
+  title     = {Achieving Open Vocabulary Neural Machine Translation with Hybrid Word-Character
+               Models},
+  journal   = {CoRR},
+  volume    = {abs/1604.00788},
+  year      = {2016},
+  url       = {http://arxiv.org/abs/1604.00788},
+  archivePrefix = {arXiv},
+  eprint    = {1604.00788},
+  timestamp = {Mon, 13 Aug 2018 16:47:26 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/LuongM16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{philipAlgorithmfordataCompression,
+  title={A New Algorithm for Data Compression},
+  author={Philip Gage},
+  year      = {1994}
+}
+
+@article{DBLP:journals/corr/abs-1804-10959,
+  author    = {Taku Kudo},
+  title     = {Subword Regularization: Improving Neural Network Translation Models
+               with Multiple Subword Candidates},
+  journal   = {CoRR},
+  volume    = {abs/1804.10959},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1804.10959},
+  archivePrefix = {arXiv},
+  eprint    = {1804.10959},
+  timestamp = {Mon, 13 Aug 2018 16:48:57 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1804-10959.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/ZagoruykoK16,
+  author    = {Sergey Zagoruyko and
+               Nikos Komodakis},
+  title     = {Wide Residual Networks},
+  journal   = {CoRR},
+  volume    = {abs/1605.07146},
+  year      = {2016},
+  url       = {http://arxiv.org/abs/1605.07146},
+  archivePrefix = {arXiv},
+  eprint    = {1605.07146},
+  timestamp = {Mon, 13 Aug 2018 16:46:42 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/ZagoruykoK16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/iet-bmt/Sepas-Moghaddam20,
+  author    = {Alireza Sepas{-}Moghaddam and
+               Fernando Pereira and
+               Paulo Lobato Correia},
+  title     = {Face recognition: a novel multi-level taxonomy based survey},
+  journal   = {{IET} Biom.},
+  volume    = {9},
+  number    = {2},
+  pages     = {58--67},
+  year      = {2020},
+  url       = {https://doi.org/10.1049/iet-bmt.2019.0001},
+  doi       = {10.1049/iet-bmt.2019.0001},
+  timestamp = {Wed, 01 Apr 2020 08:42:20 +0200},
+  biburl    = {https://dblp.org/rec/journals/iet-bmt/Sepas-Moghaddam20.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{ethayarajh-2019-contextual,
+    title = "How Contextual are Contextualized Word Representations? Comparing the Geometry of {BERT}, {ELM}o, and {GPT}-2 Embeddings",
+    author = "Ethayarajh, Kawin",
+    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
+    month = nov,
+    year = "2019",
+    address = "Hong Kong, China",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/D19-1006",
+    doi = "10.18653/v1/D19-1006",
+    pages = "55--65",
+    abstract = "Replacing static word embeddings with contextualized word representations has yielded significant improvements on many NLP tasks. However, just how contextual are the contextualized representations produced by models such as ELMo and BERT? Are there infinitely many context-specific representations for each word, or are words essentially assigned one of a finite number of word-sense representations? For one, we find that the contextualized representations of all words are not isotropic in any layer of the contextualizing model. While representations of the same word in different contexts still have a greater cosine similarity than those of two different words, this self-similarity is much lower in upper layers. This suggests that upper layers of contextualizing models produce more context-specific representations, much like how upper layers of LSTMs produce more task-specific representations. In all layers of ELMo, BERT, and GPT-2, on average, less than 5{\%} of the variance in a word{'}s contextualized representations can be explained by a static embedding for that word, providing some justification for the success of contextualized representations.",
+}
+
+@inproceedings{DBLP:conf/acl/JawaharSS19,
+  author    = {Ganesh Jawahar and
+               Beno{\^{\i}}t Sagot and
+               Djam{\'{e}} Seddah},
+  editor    = {Anna Korhonen and
+               David R. Traum and
+               Llu{\'{\i}}s M{\`{a}}rquez},
+  title     = {What Does {BERT} Learn about the Structure of Language?},
+  booktitle = {Proceedings of the 57th Conference of the Association for Computational
+               Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,
+               Volume 1: Long Papers},
+  pages     = {3651--3657},
+  publisher = {Association for Computational Linguistics},
+  year      = {2019},
+  url       = {https://doi.org/10.18653/v1/p19-1356},
+  doi       = {10.18653/v1/p19-1356},
+  timestamp = {Tue, 28 Jan 2020 10:28:06 +0100},
+  biburl    = {https://dblp.org/rec/conf/acl/JawaharSS19.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/corr/abs-1806-00187,
+  author    = {Myle Ott and
+               Sergey Edunov and
+               David Grangier and
+               Michael Auli},
+  title     = {Scaling Neural Machine Translation},
+  journal   = {CoRR},
+  volume    = {abs/1806.00187},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1806.00187},
+  archivePrefix = {arXiv},
+  eprint    = {1806.00187},
+  timestamp = {Mon, 13 Aug 2018 16:47:40 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1806-00187.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{bengioCurriculumlearning,
+  author	= {Yoshua Bengio and
+          Jerome Louradour and
+		  Ronan Collobert and
+		  Jason Weston},
+  title		= {Curriculum learning}
+}
+
+@inproceedings{Hubara2016BinarizedNN,
+  title={Binarized Neural Networks},
+  author={Itay Hubara and Matthieu Courbariaux and Daniel Soudry and Ran El-Yaniv and Yoshua Bengio},
+  booktitle={NIPS},
+  year={2016}
+}
+
+@inproceedings{DBLP:conf/emnlp/DuanLXZ09,
+  author    = {Nan Duan and
+               Mu Li and
+               Tong Xiao and
+               Ming Zhou},
+  title     = {The Feature Subspace Method for {SMT} System Combination},
+  booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural
+               Language Processing, {EMNLP} 2009, 6-7 August 2009, Singapore, {A}
+               meeting of SIGDAT, a Special Interest Group of the {ACL}},
+  pages     = {1096--1104},
+  publisher = {{ACL}},
+  year      = {2009},
+  url       = {https://www.aclweb.org/anthology/D09-1114/},
+  timestamp = {Fri, 13 Sep 2019 13:08:45 +0200},
+  biburl    = {https://dblp.org/rec/conf/emnlp/DuanLXZ09.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
--- a/Book/mt-book-xelatex.tex
+++ b/Book/mt-book-xelatex.tex
@@ -53,12 +53,10 @@

 \begingroup
 \thispagestyle{empty} % Suppress headers and footers on the title page
-%\begin{tikzpicture}[remember picture,overlay]
+
 \begin{tikzpicture}[remember picture,overlay]
-\node[inner sep=0pt] (background) at (current page.center) {\includegraphics[width=\paperwidth]{background.pdf}};
-\draw (current page.center) node [fill=ocre!30!white,fill opacity=0.6,text opacity=1,inner sep=1cm]{\Huge\centering\bfseries\sffamily\parbox[c][][t]{\paperwidth}{\centering 机器翻译：统计建模与深度学习方法\\[15pt] % Book title
-%{\Large 副标题是否需要}\\[20pt] % Subtitle
-{\LARGE 肖桐\ \ 朱靖波}}}; % Author name
+\node[inner sep=0pt] (background) at (current page.center) {\includegraphics[width=\paperwidth,height=\paperheight]{fig-cover.jpg}};
+
 \end{tikzpicture}
 \vfill
 \endgroup
@@ -95,11 +93,12 @@
 \noindent 曹润柘、曾信、孟霞、单韦乔、姜雨帆、王子扬、刘辉、许诺、李北、刘继强、张哲旸、周书涵、周涛、张裕浩、李炎洋、林野、刘晓倩、牛蕊 \\
 }

+
 %----------------------------------------------------------------------------------------
 %	TABLE OF CONTENTS
 %----------------------------------------------------------------------------------------
 %\usechapterimagefalse % If you don't want to include a chapter image, use this to toggle images off - it can be enabled later with \usechapterimagetrue
-\chapterimage{chapter_head_1.pdf} %目录标题的图案
+\chapterimage{fig-NEU-1.jpg} %目录标题的图案
 \pagestyle{empty} % Disable headers and footers for the following pages
 \tableofcontents % 打印目录
 \cleardoublepage %保证章节页在奇数页
@@ -111,19 +110,21 @@
 %	CHAPTERS
 %----------------------------------------------------------------------------------------

-%\include{Chapter1/chapter1}
-%\include{Chapter2/chapter2}
-%\include{Chapter3/chapter3}
+\include{Chapter1/chapter1}
+\include{Chapter2/chapter2}
+\include{Chapter3/Chapter3}
 \include{Chapter4/chapter4}
-%\include{Chapter5/chapter5}
-%\include{Chapter6/chapter6}
-%\include{ChapterAppend/chapterappend}
+\include{Chapter5/chapter5}
+\include{Chapter6/chapter6}
+%\include{Chapter7/chapter7}
+\include{ChapterAppend/chapterappend}



 %----------------------------------------------------------------------------------------
 %	BIBLIOGRAPHY
 %----------------------------------------------------------------------------------------
+\chapterimage{fig-NEU-10.jpg} % Bibliography heading image
 \cleardoublepage % Make sure the index starts on an odd (right side) page
 \printbibliography

@@ -132,7 +133,7 @@
 %----------------------------------------------------------------------------------------
 %	INDEX
 %----------------------------------------------------------------------------------------
-
+\chapterimage{fig-NEU-10.jpg} % Index heading image
 \cleardoublepage % Make sure the index starts on an odd (right side) page
 %\phantomsection
 %\setlength{\columnsep}{0.75cm} % Space between the 2 columns of the index

--- a/Book/structure.tex
+++ b/Book/structure.tex
@@ -547,7 +547,7 @@ addtohook={%
 \usetikzlibrary{mindmap,backgrounds} % mind map
 \usepackage{type1cm}%设置公式字体
 \usepackage{caption}%设置图片标题字体大小
-\captionsetup{font={small}}
+\captionsetup{font={footnotesize}}
 \usepackage{pstricks}
 \DeclareMathOperator*{\argmax}{arg\,max}
 \DeclareMathOperator*{\argmin}{arg\,min}
@@ -620,5 +620,3 @@ addtohook={%



-
-