Commit 677fc9e4 by 孟霞

Merge branch 'master' into 'mengxia'

Master

View merge request !661
parents 79d719c1 7b86b96a
\begin{tikzpicture}
\tikzstyle{node} = [minimum height=1.0*1.2em,draw,fill=green!20]
\tikzstyle{legend} = [minimum height=1.0*1.2em,minimum width=1.0*1.2em,draw]
\tikzstyle{node2} = [minimum width=1.0*1.2em,minimum height=4.1*1.2em,draw,fill=blue!20]
\node[node,minimum width=2.8*1.2em] (node1) at (0,0) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node2) at (node1.south west) {};
\node[node,minimum width=3.2*1.2em,anchor=north west] (node3) at (node2.south west) {};
\node[node,minimum width=3.0*1.2em,anchor=north west] (node4) at (node3.south west) {};
\node[node2,anchor = north west] (grad1) at ([xshift=1.2em]node1.north east) {};
\node[node,minimum width=3.7*1.2em,anchor=north west] (node5) at (grad1.north east) {};
\node[node,minimum width=2.8*1.2em,anchor=north west] (node6) at (node5.south west) {};
\node[node,minimum width=3.2*1.2em,anchor=north west] (node7) at (node6.south west) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node8) at (node7.south west) {};
\node[font=\footnotesize,anchor=east] (line1) at (node1.west) {GPU1};
\node[font=\footnotesize,anchor=east] (line2) at (node2.west) {GPU2};
\node[font=\footnotesize,anchor=east] (line3) at (node3.west) {GPU3};
\node[font=\footnotesize,anchor=east] (line4) at (node4.west) {GPU4};
\node[node2,anchor = north west] (grad2) at ([xshift=0.3em]node5.north east) {};
\draw[->,thick] (-1.4em*1.2,-3.62*1.2em) -- (9em*1.2,-3.62*1.2em);
\node[node,minimum width=2.8*1.2em] (node9) at (16em,0) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node10) at (node9.south west) {};
\node[node,minimum width=3.2*1.2em,anchor=north west] (node11) at (node10.south west) {};
\node[node,minimum width=3.0*1.2em,anchor=north west] (node12) at (node11.south west) {};
\node[node,minimum width=3.7*1.2em,anchor=north west] (node13) at (node9.north east) {};
\node[node,minimum width=2.8*1.2em,anchor=north west] (node14) at (node10.north east) {};
\node[node,minimum width=3.2*1.2em,anchor=north west] (node15) at (node11.north east) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node16) at (node12.north east) {};
\node[node2,anchor = north west] (grad3) at ([xshift=0.5em]node13.north east) {};
\node[font=\footnotesize,anchor=east] (line1) at (node9.west) {GPU1};
\node[font=\footnotesize,anchor=east] (line2) at (node10.west) {GPU2};
\node[font=\footnotesize,anchor=east] (line3) at (node11.west) {GPU3};
\node[font=\footnotesize,anchor=east] (line4) at (node12.west) {GPU4};
\draw[->,thick] (node12.south west) -- ([xshift=3em]node16.south east);
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (node1) (node2) (node3) (node4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (node5) (node6) (node7) (node8)] (box2) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (node9) (node13) (node12) (node16)] (box3) {};
\end{pgfonlayer}
\node[font=\footnotesize,anchor=north] (legend1) at ([xshift=3em]node4.south) {一步一更新};
\node[font=\footnotesize,anchor=north] (legend2) at ([xshift=2.5em]node12.south) {累积两步更新};
\node[font=\footnotesize,anchor=north] (time1) at (grad2.south) {time};
\node[font=\footnotesize,anchor=north] (time2) at (grad3.south) {time};
\node[legend] (legend3) at (2em,2em) {};
\node[font=\footnotesize,anchor=west] (idle) at (legend3.east) {:空闲};
\node[legend,anchor=west,draw,fill=green!30] (legend4) at ([xshift = 2em]idle.east) {};
\node[font=\footnotesize,anchor=west] (FB) at (legend4.east) {:前向/反向};
\node[legend,anchor=west,draw,fill=blue!30] (legend5) at ([xshift = 2em]FB.east) {};
\node[font=\footnotesize,anchor=west] (grad_sync) at (legend5.east) {:梯度更新};
\end{tikzpicture}
@@ -19,10 +19,10 @@
\node[node,anchor = west] (node8) at ([xshift = 2em,yshift = 2em]node7.east) {对于词表外的词lowest};
\node[node,anchor = north west] (node9) at ([yshift = 0.3em]node8.south west) {可以被分割为low est};
\node[node,font=\scriptsize,anchor = north,fill=ugreen!5,drop shadow] (dict) at ([xshift = 5em,yshift = -5em]node6.south){\begin{tabular}{llllll}
\multirow{3}{*}{符号合并表:} & ('e','s') & ('es','t') & ('est','$<$e$>$') & ('l','o') & ('lo','w') \\
& ('n','e') & ('ne','w') & ('new','est$<$e$>$') & ('low','$<$e$>$') & ('w','i') \\
& ('wi','d') & ('wid','est$<$e$>$') & ('low','e') & ('lowe','r') & ('lower','$<$e$>$')
\end{tabular}};
\node[node,anchor=west] (line1) at ([xshift = 8em]node1.south east) {按字符拆分,并添加};
...
\begin{tikzpicture}
\setlength{\base}{1.2em}
\tikzstyle{node} = [rounded corners=1pt,minimum width=1.2em,minimum height=1.2em,draw,fill=green!30!white]
\tikzstyle{node2} = [rounded corners=1pt,minimum width=1.2em,minimum height=1.2em,draw,fill=blue!30!white]
\node[node] (enc1) at (0,0) {};
\node[node] (enc2) at ([xshift = \base]enc1.east) {};
\node[node] (enc3) at ([xshift = \base]enc2.east) {};
\node[node] (enc4) at ([xshift = \base]enc3.east) {};
\node[node] (enc5) at ([xshift = \base]enc4.east) {};
\node[node] (enc6) at ([xshift = \base]enc5.east) {};
\node[] (enc7) at ([xshift = \base]enc6.east) {...};
\node[node] (enc8) at ([xshift = \base]enc7.east) {};
\node[node] (enc9) at ([xshift = \base]enc8.east) {};
\node[node] (enc10) at ([xshift = \base]enc9.east) {};
\node[font=\scriptsize,rotate=270] (src) at ([xshift = -\base]enc1.west) {src};
\draw [->] ([xshift=-0.75em]enc1.west) -- (enc1.west);
\draw [decorate,decoration={brace}] ([yshift=0.3em]enc1.north west) to node [auto,anchor=south,font=\scriptsize] {$N$x} ([yshift=0.3em]enc10.north east);
\draw [->] (enc1.east) -- (enc2.west);
\draw [->] (enc2.east) -- (enc3.west);
\draw [->] (enc3.east) -- (enc4.west);
\draw [->] (enc4.east) -- (enc5.west);
\draw [->] (enc5.east) -- (enc6.west);
\draw [->] (enc8.east) -- (enc9.west);
\draw [->] (enc9.east) -- (enc10.west);
\node[node2,anchor=north] (dec1) at ([yshift=-2em]enc1.south) {};
\node[node2,anchor=north] (dec2) at ([yshift=-2em]enc2.south) {};
\node[node2,anchor=north] (dec3) at ([yshift=-2em]enc3.south) {};
\node[node2,anchor=north] (dec4) at ([yshift=-2em]enc4.south) {};
\node[node2,anchor=north] (dec5) at ([yshift=-2em]enc5.south) {};
\node[node2,anchor=north] (dec6) at ([yshift=-2em]enc6.south) {};
\node[font=\scriptsize,rotate=270] (tgt) at ([xshift = -\base]dec1.west) {tgt};
\node[font=\scriptsize,rotate=270] (out) at ([xshift = \base]dec6.east) {out};
\draw [->] ([xshift=-0.75em]dec1.west) -- (dec1.west);
\draw [->] (dec6.east) -- ([xshift=0.75em]dec6.east);
\draw [decorate,decoration={brace,mirror}] ([yshift=-0.3em]dec1.south west) to node [auto,anchor=north,font=\scriptsize] {6x} ([yshift=-0.3em]dec6.south east);
\draw [->] (dec1.east) -- (dec2.west);
\draw [->] (dec2.east) -- (dec3.west);
\draw [->] (dec3.east) -- (dec4.west);
\draw [->] (dec4.east) -- (dec5.west);
\draw [->] (dec5.east) -- (dec6.west);
\node[node] (enc_legend) at ([xshift = 2\base]enc10.east) {};
\node[node2,anchor=north] (dec_legend) at ([yshift = -\base]enc_legend.south) {};
\node[font=\scriptsize,anchor=west] (line1) at (enc_legend.east) {:编码层};
\node[font=\scriptsize,anchor=west] (line2) at (dec_legend.east) {:解码层};
%\node[node] (dec1) at ([xshift=4em]enc1.east) {Decoder};
%\node[node2] (enc2) at ([xshift=4em]dec1.east) {Encoder};
%\node[node] (dec2) at ([xshift=4em]enc2.east) {Decoder};
\coordinate (c1) at ([xshift=1em]enc10.east);
\coordinate (c2) at ([yshift=-1.6em]c1.south);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec1.north) -- (dec1.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec2.north) -- (dec2.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec3.north) -- (dec3.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec4.north) -- (dec4.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec5.north) -- (dec5.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec6.north) -- (dec6.north);
\end{tikzpicture}
\begin{tikzpicture}
\tikzstyle{node} = [minimum height=1.0*1.2em,draw,fill=green!20]
\node[node,minimum width=2.0*1.2em] (sent1) at (0,0) {};
\node[node,minimum width=5.0*1.2em,anchor=north west] (sent2) at (sent1.south west) {};
\node[node,minimum width=1.0*1.2em,anchor=north west] (sent3) at (sent2.south west) {};
\node[node,minimum width=3.0*1.2em,anchor=north west] (sent4) at (sent3.south west) {};
\node[node,minimum width=4.0*1.2em] (sent5) at (14em,0) {};
\node[node,minimum width=4.5*1.2em,anchor=north west] (sent6) at (sent5.south west) {};
\node[node,minimum width=4.5*1.2em,anchor=north west] (sent7) at (sent6.south west) {};
\node[node,minimum width=5*1.2em,anchor=north west] (sent8) at (sent7.south west) {};
\node[font=\footnotesize,anchor=east] (line1) at (sent1.west) {句子1};
\node[font=\footnotesize,anchor=east] (line2) at (sent2.west) {句子2};
\node[font=\footnotesize,anchor=east] (line3) at (sent3.west) {句子3};
\node[font=\footnotesize,anchor=east] (line4) at (sent4.west) {句子4};
\node[font=\footnotesize,anchor=east] (line5) at (sent5.west) {句子1};
\node[font=\footnotesize,anchor=east] (line6) at (sent6.west) {句子2};
\node[font=\footnotesize,anchor=east] (line7) at (sent7.west) {句子3};
\node[font=\footnotesize,anchor=east] (line8) at (sent8.west) {句子4};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent1) (sent2) (sent3) (sent4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent5) (sent6) (sent7) (sent8)] (box2) {};
\end{pgfonlayer}
\node[font=\footnotesize] (node1) at ([yshift=-3.4em]sent2.south) {随机生成};
\node[font=\footnotesize] (node2) at ([yshift=-1em]sent8.south) {排序生成};
\end{tikzpicture}
@@ -318,7 +318,7 @@ C(\mathbi{x}_j \mathbi{W}_K,\omega) &=& (\mathbi{x}_{j-\omega},\ldots,\mathbi{x}
\parinterval 前面提到Transformer模型完全摒弃了循环单元与卷积单元,仅通过绝对位置编码来区分序列中的不同位置。另一方面,由于循环神经网络也非常适用于处理序列结构,且其结构成熟、易于优化。因此,将其与Transformer模型融合,一方面发挥循环神经网络简单高效的特点,另一方面发挥Transformer模型在特征提取方面的优势,也是一种非常值得探索的思路({\color{red} 引用RNMT?})。
\parinterval 一种方法是,对深层网络的不同层使用循环机制。早在残差网络提出时,研究人员已经开始尝试探讨残差网络成功背后的原因({\color{red} 参考文献})。本质上,在卷积神经网络中引入残差连接后,网络从深度上隐性地利用循环的特性。区别在于不同层的参数独立,而非共享。Transformer网络的编码端与解码端分别由$N$个相同结构但参数独立的块堆叠而成,其中编码块与解码块中分别包含2/3个子层。同时,子层之间引入了残差连接保证了网络信息传递的高效性。因此,一个自然的想法是通过共享不同块之间的参数,引入循环神经网络中的归纳偏置\upcite{DBLP:conf/iclr/DehghaniGVUK19}。其中每层的权重是共享的,并引入了基于时序的编码向量用于显著区分不同深度下的时序信息。之后,在训练大容量预训练模型时同样也采取了共享层间参数的方式\upcite{Lan2020ALBERTAL}
\parinterval 另一种方法是,利用循环神经网络对输入序列进行编码,之后通过门控机制将得到的结果与Transformer进行融合\upcite{DBLP:conf/naacl/HaoWYWZT19}。融合机制可以采用顺序计算或并行计算。
%----------------------------------------------------------------------------------------
@@ -857,9 +857,9 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{基于句法的神经机器翻译模型}
\parinterval 在统计机器翻译时代,使用句法信息是一种非常有效的机器翻译建模手段(见{\chaptereight})。由于句法是人类运用语言的高级抽象结果,使用句法信息(如句法树)可以非常有效地帮助机器翻译系统捕捉句子的结构。将这种知识引入到机器翻译中,能使得翻译系统在译文语法正确性以及流畅度等方面获得进一步的提升,同时也可以消除译文中的歧义,进而得到更准确的译文。在神经机器翻译中,虽然标准的框架大多基于词串,但是引入句法树等结构仍然很有潜力\upcite{DBLP:conf/acl/LiXTZZZ17}。具体来说,由于神经机器翻译模型缺少对句子结构的理解,会导致一些明显的翻译问题:
\begin{itemize}
\vspace{0.5em}
@@ -877,15 +877,15 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\vspace{0.5em}
\end{itemize}
\parinterval 显然,神经机器翻译系统并没有按照合理的句法结构生成译文。也就是说,模型并没有理解句子的结构\upcite{DBLP:conf/acl/LiXTZZZ17}。甚至对于一些语言差异很大的语言对,会出现将介词短语翻译成一个词的情况。虽然可以通过很多手段对上述问题进行求解,但是使用句法树是最直接的一种解决问题的方法\upcite{DBLP:conf/acl/EriguchiHT16}。
\parinterval 那么在神经机器翻译中,如何将这种离散化的树结构融入到基于分布式表示的翻译模型中呢?有以下两种策略:
\begin{itemize}
\vspace{0.5em}
\item 将句法树结构加入到编码器,使得编码器更加充分地表示源语言句子。
\vspace{0.5em}
\item 将句法树结构加入到解码器,使得翻译模型能生成更符合句法的译文。
\vspace{0.5em}
\end{itemize}
@@ -895,7 +895,7 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\subsection{编码端使用句法信息}
\parinterval 编码器中使用句法信息有两种思路,一种是在编码器中显性使用树结构进行建模,另一种是把句法信息作为特征输入到传统的序列编码器中。这两种思路与统计机器翻译中基于句法树结构的模型和基于句法特征的模型十分相似(见{\chaptereight})。
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
@@ -935,9 +935,9 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\textrm{“私は緑茶を飲んでいます”} \nonumber
\end{equation}
\noindent 其中,“a cup of green tea”实际上只对应“緑茶”一个词({\color{red} 怎么看出来只对应一个词???})。使用句法树后,“a cup of green tea”会作为树中一个节点,这样可以更容易把它作为一个整体进行翻译。
\parinterval 但是,这种自底向上的树结构表示方法也存在问题:每个树节点的状态并不能包含树中其它位置的信息。也就是说,从每个节点上看,其表示结果没有很好的利用上下文。因此,可以同时使用自下而上和自上而下的信息传递方式进行句法树的表示\upcite{Yang2017TowardsBH,DBLP:conf/acl/ChenHCC17},这样增加了树中每个节点对其覆盖的子树以及周围上下文的建模能力。如图\ref{fig:15-21}所示,$\mathbi{h}^\textrm{up}$和$\mathbi{h}^\textrm{down}$分别代表向上传输节点和向下传输节点的状态,虚线框代表了$\mathbi{h}^\textrm{up}$和$\mathbi{h}^\textrm{down}$会拼接到一起,并作为这个节点的整体表示参与注意力模型的计算。
%----------------------------------------------
\begin{figure}[htp]
@@ -954,40 +954,39 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\subsubsection{2. 基于句法特征的编码}
\parinterval 不同于直接对树结构进行编码,另一种方法是将单词、句法信息等信息直接转换为特征向量拼接到一起,作为机器翻译系统的输入\upcite{DBLP:conf/wmt/SennrichH16}。这种方法的优点在于,句法信息可以无缝融入到现有神经机器翻译框架,对系统结构的修改很小。以基于循环神经网络的翻译模型为例,对于输入序列的第$i$个单词,可以用如下方式计算$i$时刻的表示结果:
\begin{eqnarray}
\mathbi{h}_i &=& \textrm{tanh}(\mathbi{W}(\|_{k=1}^{|F|}\mathbi{E}_h x_{ik}) + \mathbi{U}\mathbi{h}_{i-1})
\label{eq:15-52}
\end{eqnarray}
\noindent 其中,$\mathbi{W}$是转换矩阵,$\mathbi{U}$是权重矩阵,$F$代表了特征的数量({\color{red} 特征的数量为啥还要加绝对值});而$\mathbi{E}$是一个特征矩阵,它包含了不同种类特征的数量,$x_{ik}$ 代表了第$i$ 个词在第$k$ 种特征中所表达的值,$\|$操作为拼接操作。公式\eqref{eq:15-52}将共$F$个特征向量连接成固定大小的向量,作为编码端的输入。这种方法可以很容易地融合如词根、子词、形态、词性以及依存标签等特征。
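\parinterval 为了更直观地展示上述计算过程,下面给出一个示意性的代码草图(使用NumPy实现,其中特征种类、向量维度等均为随意假设的数值,仅用于说明公式\eqref{eq:15-52}中拼接特征向量并进行循环单元状态更新的思路,并非任何开源系统的实际实现):
\begin{verbatim}
import numpy as np

np.random.seed(0)
d_feat, d_hidden, num_feat = 4, 8, 3   # 每种特征的嵌入维度、隐藏层维度、特征种类数(假设值)

# 假设三种特征:词本身、词性、依存标签,各自有一个嵌入矩阵 E_k
vocab_sizes = [10, 5, 6]
E = [np.random.rand(v, d_feat) for v in vocab_sizes]

W = np.random.rand(d_hidden, num_feat * d_feat)   # 转换矩阵
U = np.random.rand(d_hidden, d_hidden)            # 权重矩阵

def encode_step(feat_ids, h_prev):
    # feat_ids[k] 是当前词在第 k 种特征下的取值(下标)
    concat = np.concatenate([E[k][feat_ids[k]] for k in range(num_feat)])
    return np.tanh(W @ concat + U @ h_prev)

h = np.zeros(d_hidden)
sentence_feats = [[1, 2, 3], [4, 0, 5]]   # 两个词,每个词三种特征的下标
for feats in sentence_feats:
    h = encode_step(feats, h)
print(h.shape)   # (8,)
\end{verbatim}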
\parinterval 另一种方式是将句法信息的表示转化为基于序列的编码,之后与原始的词串融合。这样做的好处在于,并不需要使用基于树结构的编码器,句法信息仍然可以使用基于序列的编码器。而句法信息可以在对句法树的序列化表示中学习得到。如图\ref{fig:15-22}(a) 所示,对于英语句子“I love dogs”,可以得到如图\ref{fig:15-22}(a) 所示的句法树。这里,使用$w_i$ 表示第$i$ 个单词,如图\ref{fig:15-22}(b) 所示。通过对句法树进行遍历,可以得到句法树节点的序列$\{l_1,l_2,...,l_T\}$,其中$T$表示句法树中节点的个数,$l_j$ 表示第$j$ 个节点({\color{red}$l$ 是不是在前面也用到过?而且怎么定义$j$?怎么遍历的树?}),如图\ref{fig:15-22}(c) 所示。通过观察句法树的结构可以看出({\color{red} 怎么看出来的?}),对一个单词来说,句法树中该单词的父节点({\color{red} 祖先节点及路径?})代表了描述该单词最准确的句法信息。因此可以单词祖先节点及路径信息与原始的词信息构造出新的融合表示${\mathbi{h}'}_i$,并使用这种新的表示计算上下文向量$\mathbi{C}$,即:
\begin{eqnarray}
\mathbi{C}_j &=& \sum_{i=1}^m \alpha_{i,j} {\mathbi{h}'}_i
\label{eq:15-53}
\end{eqnarray}
%----------------------------------------------
\begin{figure}[htp]
\centering
\includegraphics[scale=0.5]{./Chapter15/Figures/figure-parsing-tree-of-a-sentence.png}
\caption{一个句子的句法树、词序列、词性标记序列}
\label{fig:15-22}
\end{figure}
%-------------------------------------------
\noindent 其中,$m$是源语言句子的长度。计算${\mathbi{h}'}_i$有如下几种方式({\color{red} 实现啥?})\upcite{DBLP:conf/acl/LiXTZZZ17}:
\begin{itemize}
\vspace{0.5em}
\item 平行结构。利用两个编码端构成平行结构,分别对源语言单词和线性化的句法树进行建模,之后在句法树节点序列中寻找每个单词的父节点(或者祖先),将这个单词和它的父节点的状态相融合,得到新的表示。如图\ref{fig:15-23}(a)所示,图中$\mathbi{h}_{w_i}$为词$w_i$在单词序列中的状态,$\mathbi{h}_{l_j}$为树节点$l_j$在句法表示(序列)中的隐藏状态。如果单词$w_i$是节点$l_j$在句法树中的子节点,则将$\mathbi{h}_{w_i}$和$\mathbi{h}_{l_j}$向量拼接到一起作为这个词的新表示${\mathbi{h}'}_i$。
\vspace{0.5em}
\item 分层结构。将句法表示结果与源语言单词的词嵌入向量进行融合,如图\ref{fig:15-23}(b)所示,其中$\mathbi{e}_{w_i}$为第$i$个词的词嵌入。类似地,如果单词$w_i$是节点$l_j$在句法树中的子节点,则将$\mathbi{e}_{w_i}$和$\mathbi{h}_{l_j}$向量拼接到一起作为原始模型的输入,这样${\mathbi{h}'}_i$直接参与注意力计算({\color{red} 这段话看不懂,不是和“平行结构”一样了吗?})。注意,分层结构和平行结构的区别在于,分层结构最终还是使用了一个编码器,句法信息只是与词嵌入进行融合,因此最终的结构和原始的模型是一致的;平行结构相当于使用了两个编码器,因此单词和句法信息的融合是在两个编码器的输出上进行的。
\vspace{0.5em}
\item 混合结构。首先对图\ref{fig:15-22}(a)中句法树进行先序遍历,将句法标记和源语言单词融合到同一个序列中,得到如图\ref{fig:15-23}(c)所示序列。之后使用传统的序列编码器对这个序列进行编码,然后使用序列中源语言单词所对应的状态参与注意力模型的计算。有趣的是,相比于前两种方法,这种方法参数量少而且也十分有效\upcite{DBLP:conf/acl/LiXTZZZ17}。
\vspace{0.5em}
\end{itemize}
@@ -1002,7 +1001,7 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\end{figure}
%-------------------------------------------
\parinterval 需要注意的是,句法分析的错误会在很大程度上影响源语言句子的表示结果。如果获得的句法信息不够准确,可能会对翻译系统带来负面的作用。此外,也有研究发现基于词串的神经机器翻译模型本身就能学习到一些源语言的句法信息\upcite{DBLP:conf/emnlp/ShiPK16},这表明了神经机器翻译模型也有一定的归纳句子结构的能力。除了循环神经网络结构,也有研究人员探索了如何在Transformer中引入树结构信息。比如,可以在计算自注意力分布的时候,将词与词之间的依存变换成距离表示作为额外的语法信息融入到注意力模型\upcite{DBLP:conf/acl/BugliarelloO20}。
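\parinterval 以在自注意力中融入依存距离信息为例,下面给出一个示意性的NumPy代码草图(其中“距离越远、惩罚越大”的偏置形式只是为演示而假设的简化方案,并不对应文献\upcite{DBLP:conf/acl/BugliarelloO20}的具体实现):
\begin{verbatim}
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

np.random.seed(0)
n, d = 4, 8                        # 句子长度、表示维度(假设值)
Q, K, V = (np.random.rand(n, d) for _ in range(3))

# dep_dist[i][j]:词 i 与词 j 在依存树上的距离(这里用假设的矩阵代替句法分析结果)
dep_dist = np.array([[0, 1, 2, 3],
                     [1, 0, 1, 2],
                     [2, 1, 0, 1],
                     [3, 2, 1, 0]])

scores = Q @ K.T / np.sqrt(d)      # 原始注意力打分
scores = scores - 0.5 * dep_dist   # 依存距离越远,打分惩罚越大(演示用的简单偏置)
attn = softmax(scores)
output = attn @ V
print(output.shape)                # (4, 8)
\end{verbatim}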
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
@@ -1010,7 +1009,7 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\subsection{解码端使用句法信息}\label{subsec-15.3.2}
\parinterval 不同于在编码器中使用树结构,在解码器中直接生成树结构是较为复杂的工作。因此,想要在解码器中使用句法信息,一种最直接的方式是将目标语言句法树结构进行线性化,这样目标语言句子就变成了一个含有句法标记和单词的混合序列(见{\chaptereight})。这样,神经机器翻译系统不需要进行修改,直接使用句法树序列化的结果进行训练和推断\upcite{Aharoni2017TowardsSN}。图\ref{fig:15-24}展示了一个目标语言句法树经过线性化后的结果。
%----------------------------------------------
\begin{figure}[htp]
@@ -1021,13 +1020,11 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\end{figure}
%-------------------------------------------
\parinterval 不过,直接使用序列化的句法树也会带来新的问题。比如,在推断时,生成的译文序列可能根本不对应合法的句法树。此时,需要额外的模块对结果进行修正或者调整,以得到合理的译文。
\parinterval 另一种方法是直接在目标语言端使用句法树进行建模。与源语言句法树的建模不同,目标语言句法树的生成伴随着译文的生成,因此无法像源语言端一样将整个句法树一次输入。这样译文生成问题本质上就变成了目标语言树结构的生成,从这个角度说,这个过程与统计机器翻译中串到树的模型是类似的(见{\chaptereight})。树结构的生成有很多种策略,基本的思想均是根据已经生成的局部结构预测新的局部结构,并将这些局部结构拼装成更大的结构,直到得到完整的句法树结构\upcite{DBLP:conf/iclr/Alvarez-MelisJ17}。
\parinterval 实现目标语言句法树生成的一种手段是将形式文法扩展到神经网络模型。这样,可以使用形式文法描述句法树的生成过程(见{\chapterthree}),同时利用分布式表示来进行建模和学习。比如,可以使用基于循环神经网络的文法描述方法,把句法分析过程看作是一个循环神经网络的执行过程\upcite{DBLP:conf/naacl/DyerKBS16}。此外,也可以从多任务学习出发,用多个解码端共同完成目标语言句子的生成\upcite{DBLP:journals/corr/LuongLSVK15}。图\ref{fig:15-25}展示了由一个编码器(汉语)和多个解码器组成的序列生成模型。其中不同解码器分别负责不同的任务:第一个用于预测翻译结果,即翻译任务;第二个用于预测句法结构;第三个用于重新生成源语言序列,进行自编码。其设计思想是各个任务之间能够相互辅助,使得编码器的表示能包含更多的信息,进而让多个任务都获得性能提升。这种方法也可以使用在多个编码器上,其思想是类似的。{\red{图中改成编码器}}
%----------------------------------------------
\begin{figure}[htp]
@@ -1038,20 +1035,20 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\end{figure}
%-------------------------------------------
\parinterval 针对上述方法,也有研究人员提出过以下问题:紧密地融合词串和句法结构的方法是否优于多任务学习的方法?以及,目标语言句法信息的引入是否真的帮助到了机器翻译?\upcite{DBLP:conf/wmt/NadejdeRSDJKB17}。为了回答这些问题,研究人员尝试利用{\small\bfnew{CCG}}\index{CCG}(Combinatory Categorial Grammar)\index{Combinatory Categorial Grammar}中的supertags作为句法信息,并发现CCG标签与句子进行融合更加有效。
\parinterval 虽然融合树结构和目标语言词串的方法有其优越性,但是这种方法会导致目标语言端的序列过长,使得模型难以训练。为了缓解这个问题,可以使用两个模型(如循环神经网络),一个生成句子,另一个生成树结构\upcite{DBLP:conf/acl/WuZYLZ17,DBLP:journals/corr/abs-1808-09374}。以生成目标语言依存树为例,生成依存树的模型仍然是一个移进-规约序列的生成模型,称为动作模型。另一个模型负责预测目标语言词序列,称为词预测模型。它只有在第一个模型进行移进操作的时候才会预测下一个词,同时会将当前词的状态作为信息送入到第一个模型中。整个过程如图\ref{fig:15-26}所示,这里使用循环神经网络构建了动作模型和词预测模型。$\mathbi{h}_i^\textrm{action}$表示动作模型的隐藏层状态,$\mathbi{h}_i^\textrm{word}$表示词预测模型的隐藏层状态。动作模型会结合词预测模型的状态预测出“移位”、“左规约”、“右规约”三种动作,只有当动作模型预测出“移位”操作时,词预测模型才会预测下一时刻的词语;而动作模型预测“左规约”和“右规约”相当于完成了依存关系的预测(依存树见图\ref{fig:15-26}右侧)。最后词预测模型预测出结束符号<eos>时,整个过程结束。
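\parinterval 这一控制流程可以用如下的代码草图来说明(这里用预先写好的动作序列和词序列代替真实的神经网络预测,\texttt{action\_model}与\texttt{word\_model}只是为演示而假设的函数,目的仅是展示“动作模型驱动、移位时才产生新词”的过程):
\begin{verbatim}
# 演示:移进-规约式的目标语言依存树与词串联合生成(玩具示例)
scripted_actions = ["shift", "shift", "reduce_left",
                    "shift", "reduce_right", "shift"]
scripted_words = ["I", "love", "dogs", "<eos>"]

def action_model(step):            # 假设的动作模型:按脚本返回动作
    return scripted_actions[step]

def word_model(step):              # 假设的词预测模型:按脚本返回下一个词
    return scripted_words[step]

stack, words, arcs = [], [], []
a_step, w_step = 0, 0
while True:
    act = action_model(a_step); a_step += 1
    if act == "shift":                       # 只有移位时才预测新词
        w = word_model(w_step); w_step += 1
        if w == "<eos>":
            break
        stack.append(w); words.append(w)
    elif act == "reduce_left" and len(stack) >= 2:
        dep, head = stack[-2], stack[-1]     # 左规约:栈顶词作中心词
        arcs.append((head, dep)); del stack[-2]
    elif act == "reduce_right" and len(stack) >= 2:
        head, dep = stack[-2], stack[-1]     # 右规约:次栈顶词作中心词
        arcs.append((head, dep)); stack.pop()

print(words)   # ['I', 'love', 'dogs']
print(arcs)    # 依存弧 (中心词, 依存词)
\end{verbatim}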
%----------------------------------------------
\begin{figure}[htp]
\centering
\includegraphics[scale=0.5]{./Chapter15/Figures/figure-parallel-RNN-structure.png}
\caption{平行RNN结构{\color{red} 看前面的文字:一个叫动作模型,一个叫词预测模型。还有右侧图的依存的箭头方向一定再确定一下!}}
\label{fig:15-26}
\end{figure}
%-------------------------------------------
\parinterval 相较于在编码端融入句法信息,在解码端融入句法信息更为困难。由于树结构与单词的生成是一个相互影响的过程,如果先生成树结构,再根据树得到译文单词串,那么一旦树结构有误,翻译结果就会有问题。在统计机器翻译中,句法信息究竟应该使用到什么程度已经有一些讨论\upcite{Tong2016Syntactic}。而在神经机器翻译中,如何更有效地引入树结构信息以及如何平衡树结构信息与词串的作用还有待确认。如前文所述,基于词串的神经机器翻译模型已经能够捕捉到一些句法结构信息,虽然有些信息是不容易通过人的先验知识进行解释的。这时,使用人工总结的句法结构来约束或者强化翻译模型,是否可以补充模型无法学到的信息,还是需要进一步研究。
%----------------------------------------------------------------------------------------
% NEW SECTION
@@ -1060,21 +1057,21 @@ lr &=& d_{model}^{-0.5}\cdot step\_num^{-0.5}
\sectionnewpage
\section{基于结构搜索的翻译模型优化}
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\subsection{神经网络结构搜索}
\parinterval 目前为止,对模型的很多改良都来自于研究人员自身的经验及灵感。从某种意义上说,很多时候,模型结构的优化依赖于研究人员的经验,包括其对任务的理解以及自身的想象力。此外所设计出的模型结构还需要在对应任务上进行实验,优秀的模型往往需要很长时间的探索与验证。因此,人们希望在无需外部干预的情况下,让计算机自动地找到最适用于当前任务的神经网络模型结构,这种方法被称作{\small\bfnew{神经架构搜索}}\index{神经架构搜索}(Neural Architecture Search)\index{Neural Architecture Search},在神经网络模型中有时也被称作{\small\bfnew{神经网络结构搜索}}\index{神经网络结构搜索}{\small\bfnew{网络结构搜索}}\index{网络结构搜索}\upcite{DBLP:conf/iclr/ZophL17,DBLP:conf/cvpr/ZophVSL18,Real2019AgingEF}
\parinterval 网络结构搜索属于{\small\bfnew{自动机器学习}}\index{自动机器学习}(Automated Machine Learning)\index{Automated Machine Learning}的范畴,其目的是根据对应任务上的数据找到最合适的模型结构。在这个过程中,模型结构就像神经网络中的参数一样被自动地学习出来。以机器翻译任务为例,通过网络结构搜索方法能够以Transformer模型为基础,对神经网络结构进行自动优化,找到更适用于机器翻译任务的模型结构。图\ref{fig:15-27}(a) 展示了人工设计的Transformer编码器的局部结构,图\ref{fig:15-27}(b) 给出对该结构使用进化算法优化后得到的结构图\upcite{DBLP:conf/icml/SoLL19}。可以看到,使用网络结构搜索方法得到的模型中,出现了与人工设计的结构不同的跨层连接,同时还搜索到了全新的多分支网络结构。这种结构也是人工不易设计出来的。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter15/Figures/figure-encoder-structure-of-transformer-model-optimized-by-nas}
\caption{传统Transformer以及通过网络结构搜索方法优化后的Transformer模型编码器结构{\color{red} 如果图是别的论文的,需要引用!层正则化写全!正则化=层正则化?}}
\label{fig:15-27}
\end{figure}
%-------------------------------------------
...
@@ -31,7 +31,7 @@
\draw [->,thick](node2-3.east)--(node2-6.west)node[pos=0.5,above,font=\scriptsize]{重排序};
\draw [->,thick](node2-6.north)--(node2-7.south);
\node [anchor=east] (node1) at ([xshift=-2.0em]node1-1.west) {\small{$x,y$:双语数据}};
\node [anchor=north](pos1) at ([yshift=0em]node1-4.south) {\small{(a)单任务学习}};
\node [anchor=west](pos2) at ([xshift=10.0em]pos1.east) {\small{(b)多任务学习}};
...
@@ -179,22 +179,22 @@
%----------------------------------------------------------------------------------------
\subsubsection{2. 预训练词嵌入}
\parinterval 神经机器翻译模型所使用的编码器-解码器框架天然就包含了对输入(源语言)和输出(目标语言)进行表示学习的过程。在编码端,需要学习一种分布式表示来表示源语言句子的信息,这种分布式表示可以包含序列中每个位置的表示结果(见{\chapternine})。从结构上看,神经机器翻译所使用的编码器与语言模型无异,或者说神经机器翻译的编码器其实就是一个源语言的语言模型。唯一的区别在于,神经机器翻译的编码器并不直接输出源语言句子的生成概率,而传统语言模型是建立在序列生成任务上的。既然神经机器翻译的编码器可以与解码器一起在双语数据上联合训练,那为什么不使用更大规模的数据单独对编码器进行训练呢?或者说,直接使用一个预先训练好的编码器,与机器翻译的解码器配合完成翻译过程。
\parinterval 实现上述想法的一种手段是{\small\sffamily\bfnew{预训练}}\index{预训练}(Pre-training)\index{Pre-training}\upcite{DBLP:conf/nips/DaiL15,DBLP:journals/corr/abs-1802-05365,radford2018improving,devlin2019bert}。预训练的做法相当于将表示模型的学习任务从目标任务中分离出来,这样可以利用额外的更大规模的数据进行学习。常用的一种方法是使用语言建模等方式在大规模单语数据上进行训练,来得到神经机器翻译模型中的一部分(比如词嵌入和编码器等)的模型参数,作为模型的初始值。然后,神经机器翻译模型在双语数据上进行{\small\sffamily\bfnew{微调}}\index{微调}(Fine-tuning)\index{Fine-tuning},以得到最终的翻译模型。
\parinterval 词嵌入可以被看作是对每个独立单词进行的表示学习,在自然语言处理的众多任务中都扮演着重要角色\upcite{DBLP:conf/icml/CollobertW08,2011Natural,DBLP:journals/corr/abs-1901-09069}。到目前为止已经有大量的词嵌入学习方法被提出(见{\chapternine}),因此可以直接应用这些方法在海量的单语数据上训练得到词嵌入,用来初始化神经机器翻译模型的词嵌入参数矩阵\upcite{DBLP:conf/aclwat/NeishiSTIYT17,2018When}。
\parinterval 需要注意的是,在神经机器翻译中使用预训练词嵌入有两种方法。一种方法是直接将词嵌入作为固定的输入,也就是在训练神经机器翻译模型的过程中,并不调整词嵌入的参数。这样做的目的是完全将词嵌入模块独立出来,机器翻译可以被看作是在固定的词嵌入输入上进行的建模,从而降低了机器翻译模型学习的难度。另一种方法是仍然遵循``预训练+微调''的策略,将词嵌入作为机器翻译模型的初始值。在之后机器翻译训练过程中,词嵌入模型结果会被进一步更新。近些年,在词嵌入预训练的基础上进行微调的方法越来越受到研究者的青睐。因为在实践中发现,完全用单语数据学习的单词表示,与双语数据上的翻译任务并不完全匹配。同时目标语言的信息也会影响源语言的表示学习,在预训练词嵌入的基础上进一步进行微调是更加有效的方案。
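\parinterval 下面用一个简单的NumPy草图说明“用预训练词嵌入初始化翻译模型词嵌入矩阵”的过程(词表、向量和\texttt{freeze}开关均为演示用的假设,实际系统中预训练向量通常由word2vec、GloVe等工具在大规模单语数据上训练得到):
\begin{verbatim}
import numpy as np

np.random.seed(0)
dim = 4
vocab = ["<pad>", "<unk>", "we", "love", "nlp"]          # 翻译模型的词表(假设)
pretrained = {"we":   np.array([0.1, 0.2, 0.3, 0.4]),    # 预训练得到的词向量(假设)
              "love": np.array([0.5, 0.1, 0.0, 0.2])}

emb = np.random.normal(0, 0.01, (len(vocab), dim))       # 未命中的词仍随机初始化
for i, w in enumerate(vocab):
    if w in pretrained:
        emb[i] = pretrained[w]

freeze = False   # True:训练中固定词嵌入;False:仅作为初始值,之后继续微调
print(emb[vocab.index("love")])
\end{verbatim}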
\parinterval 虽然预训练词嵌入在海量的单语数据上学习到了丰富的表示,但词嵌入很主要的一个缺点是无法解决一词多义问题。在不同的上下文中,同一个单词经常表示不同的意思,但词嵌入是完全相同的。模型需要在编码过程中通过上下文去理解每个词在当前语境下的含义,从而增加了建模的复杂度。因此,上下文词向量在近些年得到了广泛的关注\upcite{DBLP:conf/acl/PetersABP17,mccann2017learned,DBLP:conf/naacl/PetersNIGCLZ18}。上下文词嵌入是指一个词的表示不仅依赖于单词自身,还依赖于上下文语境。由于在不同的上下文中,每个词对应的词嵌入是不同的,因此无法简单地通过词嵌入矩阵来表示,通常的做法是使用海量的单语数据预训练语言模型任务,使模型具备丰富的特征提取能力\upcite{DBLP:conf/naacl/PetersNIGCLZ18,radford2018improving,devlin2019bert}。
%----------------------------------------------------------------------------------------
% NEW SUB-SUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{3. 预训练模型}
\parinterval 相比固定的词嵌入,上下文词嵌入包含了在当前语境中的语义信息,丰富了模型的输入表示,降低了训练难度。但是,模型仍有大量的参数需要从零学习,来进一步提取整个句子的表示。那么,能不能在预训练阶段中直接得到预训练好的模型参数,在下游任务中仅仅通过任务特定的数据对模型参数进行微调,来得到一个较强的模型呢?{\small\bfnew{生成式预训练}}(Generative Pre-training,GPT)\index{生成式预训练}\index{GPT}和{\small\bfnew{来自Transformer的双向编码器表示}}(Bidirectional Encoder Representations From Transformers,BERT)\index{双向编码器表示}\index{BERT}对这个问题进行了探索,图\ref{fig:16-5}对比了GPT和BERT的模型结构。
%----------------------------------------------
\begin{figure}[htp]
@@ -205,17 +205,17 @@
\end{figure}
%----------------------------------------------
\parinterval GPT\upcite{radford2018improving}通过Transformer模型自回归地训练单向语言模型,类似于神经机器翻译模型的解码器,相比双向LSTM等模型,Transformer架构的表示能力更强。在大规模单语数据上预训练得到的模型结构只需要进行简单的修改,通过任务特定的训练数据进行微调,就可以很好地适配到下游任务中。之后提出的BERT模型更是将预训练的作用提升到了新的水平\upcite{devlin2019bert}。GPT模型的一个缺陷在于模型只能进行单向编码,也就是前面的文本在建模时无法获取到后面的信息。而BERT提出了一种自编码的方式,使模型在预训练阶段可以通过双向编码的方式进行建模,进一步增强了模型的表示能力。
\parinterval BERT的核心思想是通过{\small\bfnew{掩码语言模型}}(Masked Language Model,MLM)\index{掩码语言模型}\index{MLM}任务进行预训练。掩码语言模型的思想类似于完形填空,随机选择输入句子中的部分词进行掩码,之后让模型预测这些被掩码的词。掩码的具体做法是将被选中的词替换为一个特殊的词[Mask],这样模型在训练过程中,无法得到掩码位置词的信息,需要联合上下文内容进行预测,因此提高了模型对上下文的特征提取能力。实验表明,相比在下游任务中仅利用上下文词嵌入,在大规模单语数据上预训练的模型具有更强的表示能力。而使用掩码的方式训练也给神经机器翻译提供了新的思路,在本章的部分内容中也会使用到类似方法。
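\parinterval 掩码语言模型的训练数据构造过程可以用下面的草图来说明(这里采用BERT论文中常被提及的15\%采样与80/10/10替换比例,具体数值只是演示;[MASK]等符号以字符串形式表示):
\begin{verbatim}
import random

random.seed(1)
vocab = ["we", "love", "natural", "language", "processing", "very", "much"]
sentence = ["we", "love", "natural", "language", "processing"]

inputs, labels = [], []
for tok in sentence:
    if random.random() < 0.15:            # 以15%的概率选中一个词
        labels.append(tok)                # 被选中的词作为预测目标
        r = random.random()
        if r < 0.8:
            inputs.append("[MASK]")       # 80%:替换为[MASK]
        elif r < 0.9:
            inputs.append(random.choice(vocab))  # 10%:替换为随机词
        else:
            inputs.append(tok)            # 10%:保持原词
    else:
        inputs.append(tok)
        labels.append("-")                # "-" 表示该位置不计算损失

print(inputs)
print(labels)
\end{verbatim}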
\parinterval 在神经机器翻译任务中,预训练模型可以用于初始化编码器的模型参数\upcite{DBLP:conf/emnlp/ClinchantJN19,DBLP:conf/emnlp/ImamuraS19,DBLP:conf/naacl/EdunovBA19}。之所以用在编码器端而不是解码器端,主要原因是编码器的作用主要在于特征提取,训练难度相对较高,而解码器的作用主要在于生成,和编码器提取到的表示是强依赖的,相对比较脆弱\upcite{DBLP:journals/corr/abs-1908-06259}。模型在预训练阶段的生成过程中并没有考虑到额外的表示信息,因此和神经机器翻译的编码器存在着明显的不一致问题,所以目前主流的做法是仅利用预训练模型对编码器的模型参数进行初始化。
\parinterval 然而,在实践中发现,参数初始化的方法在一些富资源语种上提升效果并不明显,甚至会带来性能的下降\upcite{DBLP:journals/corr/abs-2002-06823}。原因可能在于,预训练阶段的训练数据规模是非常大的,因此在下游任务数据量较少的情况下帮助较大。而在一些富资源语种上,双语句对的数据量可以达到千万级别,因此简单通过预训练模型来初始化模型参数无法带来明显的提升。此外,预训练模型的训练目标并没有考虑到序列到序列的生成,与神经机器翻译的训练目标并不完全一致,两者训练得到的模型参数可能存在一些区别。
\parinterval 因此,一种做法是将预训练模型和翻译模型进行融合,把预训练模型作为一个独立的模块来为编码器或者解码器提供句子级表示信息\upcite{DBLP:journals/corr/abs-2002-06823,DBLP:conf/aaai/YangW0Z00020}。另外一种做法是针对生成任务进行预训练。机器翻译是一种典型的语言生成任务,不仅包含源语言表示学习的问题,还有序列到序列的映射,以及目标语言端序列生成的问题,这些知识是无法单独通过(源语言)单语数据学习到的。因此,可以使用单语数据对编码器-解码器结构进行预训练\upcite{song2019mass,DBLP:conf/acl/LewisLGGMLSZ20,DBLP:conf/emnlp/QiYGLDCZ020}。
\parinterval 以{\small\bfnew{掩码端到端预训练}}(Masked Sequence To Sequence Pre-training,MASS)\index{掩码端到端预训练}\index{MASS}方法为例\upcite{song2019mass},其思想与BERT十分相似,也是在预训练过程中采用掩码的方式,随机选择编码器输入句子中的连续片段替换为特殊词[Mask],然后在解码器端预测这个连续片段,如图\ref{fig:16-6} 所示。这种做法可以使得编码器捕捉上下文信息,同时迫使解码器依赖于编码器进行自回归的生成,从而学习到编码器和解码器之间的注意力。为了适配下游的机器翻译任务,使预训练模型可以学习到不同语言的表示,MASS对不同语言的句子采用共享词汇表和模型参数的方法,利用同一个预训练模型来进行不同语言句子的预训练。通过这种方式,模型既学到了对源语言句子的编码,也学习到了对目标语言句子的生成方法,之后通过使用双语句对来对预训练模型的参数进行微调,模型可以快速收敛到较好的水平。
%----------------------------------------------
\begin{figure}[htp]
@@ -226,16 +226,16 @@
\end{figure}
%----------------------------------------------
\parinterval 此外,还有很多工作对如何将语言模型应用到神经机器翻译模型中进行了研究。研究人员分析了预训练词嵌入为何在神经机器翻译模型中有效\upcite{2018When};如何在神经机器翻译模型中利用预训练的BERT模型\upcite{DBLP:conf/emnlp/ClinchantJN19,DBLP:conf/emnlp/ImamuraS19,DBLP:conf/aaai/YangW0Z00020,DBLP:conf/aaai/WengYHCL20,DBLP:conf/emnlp/ImamuraS19};如何针对神经机器翻译任务进行预训练\upcite{DBLP:journals/corr/abs-2001-08210,DBLP:conf/aaai/JiZDZCL20,DBLP:conf/acl/LewisLGGMLSZ20};如何针对机器翻译中的Code-switching问题进行预训练\upcite{DBLP:journals/corr/abs-2009-08088};如何在微调过程中避免遗忘原始的语言模型任务\upcite{DBLP:journals/corr/abs-2010-09403}。
%----------------------------------------------------------------------------------------
% NEW SUB-SUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{4. 多任务学习}
\parinterval 在训练一个神经网络的时候,会给定模型一个训练目标,希望模型通过不断训练在这个目标上表现得越来越好。同时也希望模型在训练过程中可以自动提取到与训练目标相关的所有信息。然而,过分地关注单个训练目标,可能使模型忽略掉其他可能有帮助的信息,这些信息可能来自于一些其他相关的任务\upcite{DBLP:journals/corr/Ruder17a}。通过联合多个独立但相关的任务共同学习,任务之间相互``促进'',就是{\small\sffamily\bfnew{多任务学习}}\index{多任务学习}(Multitask Learning)\index{Multitask Learning}方法\upcite{DBLP:journals/corr/Ruder17a,DBLP:books/sp/98/Caruana98,liu2019multi}。多任务学习的常用做法是针对多个相关的任务,共享模型的部分参数来学习不同任务之间相似的特征,并通过特定的模块来学习每个任务独立的特征(见\chapterfifteen)。常用的策略是对底层的模型参数进行共享,顶层的模型参数用于独立学习各个不同的任务。
\parinterval 在神经机器翻译中,应用多任务学习的主要策略是将翻译任务作为主任务,同时设置一些仅使用单语数据的子任务,通过这些子任务来捕捉单语数据中的语言知识\upcite{DBLP:conf/emnlp/DomhanH17,DBLP:conf/emnlp/ZhangZ16,DBLP:journals/corr/LuongLSVK15}。一种多任务学习的方法是利用源语言单语数据,通过单个编码器对源语言数据进行建模,分别使用两个解码器来学习源语言排序和翻译任务。源语言排序任务是指对句子的顺序进行调整,可以通过单语数据来构造训练数据,从而使编码器被训练得更加充分\upcite{DBLP:conf/emnlp/ZhangZ16},如图\ref{fig:16-7}所示。
%----------------------------------------------
\begin{figure}[htp]
\centering
@@ -245,7 +245,7 @@
\end{figure}
%----------------------------------------------
\parinterval 虽然神经机器翻译模型可以看作一种语言生成模型,但生成过程中却依赖于源语言信息,因此无法直接利用目标语言单语数据进行多任务学习。针对这个问题,可以对原有翻译模型结构进行修改,在解码器底层增加一个语言模型子层,这个子层用于学习语言模型任务,与编码器端是完全独立的,如图\ref{fig:16-8}所示\upcite{DBLP:conf/emnlp/DomhanH17}。在训练过程中,可以分别将双语数据和单语数据送入翻译模型和语言模型进行计算,双语数据训练产生的梯度用于对整个模型进行参数更新,而单语数据产生的梯度只对语言模型子层进行参数更新。通过这种方式,可以有效利用单语数据使解码器端的底层网络训练得更加充分,从而提取到更有效的特征来生成翻译结果。
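\parinterval 这种“双语数据更新全部参数、单语数据只更新语言模型子层参数”的训练方式,可以用下面的草图来示意(其中的参数和梯度都用简单的数值代替,\texttt{grad\_bilingual}、\texttt{grad\_monolingual}是为演示而假设的函数):
\begin{verbatim}
# 玩具示例:用参数名前缀区分语言模型子层(lm_)与其余翻译模型参数
params = {"enc_layer": 1.0, "dec_layer": 1.0, "lm_sublayer": 1.0}
lr = 0.1

def grad_bilingual(p):    # 假设:双语batch对所有参数都产生梯度
    return {name: 0.5 for name in p}

def grad_monolingual(p):  # 假设:单语batch只对语言模型子层产生梯度
    return {name: 0.5 for name in p if name.startswith("lm_")}

for batch_type in ["bilingual", "monolingual", "bilingual"]:
    grads = grad_bilingual(params) if batch_type == "bilingual" \
            else grad_monolingual(params)
    for name, g in grads.items():        # 只更新产生了梯度的参数
        params[name] -= lr * g

print(params)   # lm_sublayer 被更新3次,其余参数只被更新2次
\end{verbatim}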
%----------------------------------------------
\begin{figure}[htp]
@@ -256,7 +256,7 @@
\end{figure}
%----------------------------------------------
\parinterval 此外,还有一些工作对多任务学习进行了探讨。一种策略是利用多任务学习思想来训练多到一模型(多个编码器,单个解码器)、一到多模型(单个编码器、多个解码器)和多到多模型(多个编码器、多个解码器),从而借助单语数据或其他数据来使编码器或解码器训练得更加充分\upcite{DBLP:journals/corr/LuongLSVK15},任务的形式包括翻译任务、成分句法分析任务、图像标注等。另外一种策略是利用多任务学习的思想同时训练多个语言的翻译任务\upcite{DBLP:conf/acl/DongWHYW15,DBLP:journals/tacl/JohnsonSLKWCTVW17},同样包括多到一翻译(多个语种到一个语种)、一到多翻译(一个语种到多个语种)以及多到多翻译(多个语种到多个语种),这种方法可以利用多种语言的训练数据进行学习,具有较大的潜力,逐渐受到了研究人员们的关注,具体内容可以参考\ref{multilingual-translation-model}一节。
%----------------------------------------------------------------------------------------
% NEW SECTION 16.2
@@ -277,11 +277,11 @@
\label{eq:16-2}
\end{eqnarray}
\parinterval 这里可以把$\seq{x}$和$\seq{y}$都看作分布式的向量表示;$\seq{W}$应当是一个满秩矩阵,否则对于任意一个$\seq{x}$经过$\seq{W}$变换得到的$\seq{y}$只落在所有可能的$\seq{y}$的一个子空间内,即在给定$\seq{W}$的情况下有些$\seq{y}$不能被任何一个$\seq{x}$表达,而这不符合常识,因为不管是什么句子,总能找到它的一种译文。若$\seq{W}$是满秩矩阵说明$\seq{W}$可逆,也就是给定$\seq{x}$到$\seq{y}$的变换$\seq{W}$下,$\seq{y}$到$\seq{x}$的变换必然是$\seq{W}$的逆而不是其他矩阵。
\parinterval 这个例子说明$\funp{P}(\seq{y}|\seq{x})$和$\funp{P}(\seq{x}|\seq{y})$直觉上应当存在联系。当然,$\seq{x}$和$\seq{y}$之间是否存在简单的线性变换关系并没有结论,但是上面的例子给出了一种对源语言句子和目标语言句子进行相互转化的思路。实际上,研究人员已经通过一些数学技巧用目标函数来把$\funp{P}(\seq{y}|\seq{x})$和$\funp{P}(\seq{x}|\seq{y})$联系起来,这样训练神经机器翻译系统一次就可以同时得到两个方向的翻译模型,使得训练变得更加高效\upcite{Hassan2018AchievingHP,DBLP:conf/aaai/Zhang0LZC18,DBLP:conf/wmt/SunJXHWW19}。双向联合训练的基本思想是:使用两个方向的翻译模型对单语数据进行解码,之后用解码后的翻译结果与原始的单语数据作为训练语料,通过多次迭代更新两个方向上的机器翻译模型。
\parinterval 图\ref{fig:16-9}给出了一个双向训练的详细流程,其中$M_{x \rightarrow y}^{k}$表示第$k$轮得到的$x$到$y$的翻译模型,$M_{y \rightarrow x}^{k}$表示第$k$轮得到的$y$到$x$的翻译模型。这里只展示了前两轮迭代。在第一次迭代开始之前,首先使用双语数据对两个初始翻译模型进行预训练。为了保持一致性,这里称之为第0 轮迭代。在第一轮迭代中,首先使用这两个翻译模型$M_{x \rightarrow y}^{0}$和$M_{y \rightarrow x}^{0}$ 翻译单语数据$X=\{ x_i \}$和$Y= \{ y_i \}$ 后得到译文$\{\hat{y}_i^{0} \}$和$\{ \hat{x}_i^{0}\}$。进一步,构建伪训练数据集$\{ x_i,\hat{y}_i^{0}\}$与$\{ \hat{x}_i^{0},y_i \}$。然后使用上面的两个伪训练集和原始双语数据混合训练得到模型$M_{x \rightarrow y}^{1}$和$M_{y \rightarrow x}^{1}$并进行参数更新,即用$\{ x_i,\hat{y}_i^{0}\} \bigcup \{ x_i,y_i\}$训练$M_{x \rightarrow y}^{1}$,用$\{ y_i,\hat{x}_i^{0}\} \bigcup \{ y_i,x_i\}$训练$M_{y \rightarrow x}^{1}$。第二轮迭代继续重复上述过程,使用更新参数后的翻译模型$M_{x \rightarrow y}^{1}$和$M_{y \rightarrow x}^{1}$ 得到新的伪数据集$\{ x_i,\hat{y}_i^{1}\}$与$\{ \hat{x}_i^{1},y_i \}$。然后,进一步得到翻译模型$M_{x \rightarrow y}^{2}$和$M_{y \rightarrow x}^{2}$。这种方式本质上也是一种自学习的过程,通过逐步生成更好的伪数据来提升模型质量。
%----------------------------------------------
\begin{figure}[h]
@@ -296,7 +296,7 @@
%----------------------------------------------------------------------------------------
\subsection{对偶学习}
\parinterval 对称,也许是人类最喜欢的美,其始终贯穿在整个人类文明的诞生与发展之中。古语“夫美者,上下、内外、大小、远近皆无害焉,故曰美”描述的即是这样的美。在人工智能的任务中,也存在着这样的对称结构,比如机器翻译中英译汉和汉译英、图像处理中的图像标注和图像生成以及语音处理中的语音识别和文字合成等。利用这些任务的对称性质(也称对偶性),可以使互为对偶的两个任务获得更有效的反馈,从而使对应的模型相互学习、相互提高。目前,对偶学习的思想已经广泛应用于低资源机器翻译领域,不仅能够提升在有限双语资源下的翻译模型性能({\small\bfnew{有监督对偶学习}},Dual Supervised Learning\index{Dual Supervised Learning})\upcite{DBLP:conf/icml/XiaQCBYL17,DBLP:conf/acl/SuHC19,DBLP:journals/ejasmp/RadzikowskiNWY19},而且能够利用未标注的单语数据来进行学习({\small\bfnew{无监督对偶学习}},Dual Unsupervised Learning\index{Dual Unsupervised Learning})\upcite{qin2020dual,DBLP:conf/iccv/YiZTG17,DBLP:journals/access/DuRZH20}。下面将一一展开讨论。
%----------------------------------------------------------------------------------------
% NEW SUB-SUB-SECTION
@@ -316,7 +316,7 @@
\label{eq:16-4}
\end{eqnarray}
\parinterval 通过该正则化项,互为对偶的两个任务可以被放在一块学习,通过任务对偶性加强监督学习的过程,就是有监督对偶学习\upcite{DBLP:conf/icml/XiaQCBYL17,qin2020dual}。这里,$\funp{P}(\seq{x})$和$\funp{P}(\seq{y})$这两个语言模型是预先训练好的,并不参与翻译模型的训练。可以看到,对于单独的一个模型来说,其目标函数增加了与另外一个方向的模型相关的项。这样的形式与L1/L2正则化非常类似(见{\chapterthirteen}),因此可以把这个方法看作是一种任务特定的正则化的手段(由翻译任务本身的性质所启发而来)。有监督对偶学习实际上要优化如下的损失函数:
\begin{eqnarray}
{L} & = & \log{\funp{P}(\seq{y}|\seq{x})}+\log{\funp{P}(\seq{x}|\seq{y})}+{L}_{\rm{dual}}
\label{eq:16-5}
@@ -331,7 +331,7 @@
\parinterval 如上一节所述,有监督的对偶学习需要使用双语数据来训练两个翻译模型。幸运的是,存在大量的单语数据可供使用。因此,如何使用这些单语数据来提升翻译模型的性能是一个关键问题。
\parinterval 无监督对偶学习提供了一个解决问题的思路\upcite{qin2020dual}。假设目前有两个比较弱的翻译模型,一个原始任务模型$f$将源语言句子$\seq{x}$翻译成目标语言句子$\seq{y}$,一个对偶任务模型$g$将目标语言句子$\seq{y}$翻译成源语言句子$\seq{x}$。翻译模型可由有限的双语训练或者使用无监督机器翻译的方法得到。如图\ref{fig:16-10}所示,无监督对偶学习的做法是,先通过原始任务模型$f$将一个源语言单语句子$x$翻译为目标语言句子$y$,由于没有参考译文,无法判断$y$的正确性。但通过语言模型,可以判断这个句子是否通顺、符合语法规范,这些信息可用来评估翻译模型$f$的翻译流畅性。随后,再通过对偶任务模型$g$将目标语言句子$y$翻译为源语言句子$x^{'}$。如果模型$f$和$g$的翻译性能较好,那么$x^{'}$和$x$会十分相似。通过计算二者的{\small\bfnew{重构损失}}\index{重构损失}(Reconstruction Loss)\index{Reconstruction Loss},就可以优化模型$f$和$g$的参数。这个过程可以多次迭代,从大量的无标注单语数据上不断提升性能。
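\parinterval 无监督对偶学习中“翻译$\to$回译$\to$重构”的循环可以用下面的草图来示意(其中翻译模型$f$、$g$、语言模型打分和重构损失都用极简的占位函数代替,仅为说明数据流动方向,并非真实模型):
\begin{verbatim}
def f(src):                    # 占位:原始任务模型 f,x -> y
    return ["tok_" + w for w in src]

def g(tgt):                    # 占位:对偶任务模型 g,y -> x
    return [w.replace("tok_", "") for w in tgt]

def lm_score(sent):            # 占位:目标语言语言模型打分,用于评价流畅度
    return -len(sent)

def reconstruction_loss(x, x_rec):   # 占位:重构损失,这里用不相同词的个数
    return sum(a != b for a, b in zip(x, x_rec)) + abs(len(x) - len(x_rec))

mono_x = [["你", "好"], ["谢", "谢"]]   # 源语言单语数据
for x in mono_x:
    y = f(x)                          # 前向翻译
    fluency = lm_score(y)             # 语言模型评价 y 是否通顺
    x_rec = g(y)                      # 回译重构
    loss = reconstruction_loss(x, x_rec)
    # 真实系统中,可依据 fluency 与 loss 计算梯度(或作为奖励)更新 f 和 g
    print(y, fluency, x_rec, loss)
\end{verbatim}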
%---------------------------------------------- %----------------------------------------------
\begin{figure}[htp] \begin{figure}[htp]
...@@ -349,7 +349,7 @@ ...@@ -349,7 +349,7 @@
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\section{多语言翻译模型}\label{multilingual-translation-model} \section{多语言翻译模型}\label{multilingual-translation-model}
\parinterval 低资源机器翻译面临的主要挑战是缺乏大规模高质量的双语数据。这个问题往往伴随着多语言的翻译任务\upcite{dabre2019brief,dabre2020survey}。也就是,要同时开发多个不同语言之间的机器翻译系统,其中少部分语言是富资源语言,而其它语言是低资源语言。针对低资源语言双语数据稀少或者缺失的情况,一种常见的思路是利用富资源语种的数据或者系统帮助低资源机器翻译系统。这也构成了多语言翻译的思想,并延伸出大量的研究工作,有三个典型研究方向: \parinterval 低资源机器翻译面临的主要挑战是缺乏大规模高质量的双语数据。这个问题往往伴随着多语言的翻译任务\upcite{dabre2019brief,dabre2020survey}。也就是,要同时开发多个不同语言之间的机器翻译系统,其中少部分语言是富资源语言,而其它语言是低资源语言。针对低资源语言双语数据稀少或者缺失的情况,一种常见的思路是利用富资源语言的数据或者系统帮助低资源机器翻译系统。这也构成了多语言翻译的思想,并延伸出大量的研究工作,其中有三个典型研究方向:
\begin{itemize}
\vspace{0.5em}
\subsection{基于枢轴语言的方法}
\parinterval 传统的多语言翻译中,广泛使用的是{\small\bfnew{基于枢轴语言的翻译}}(Pivot-based Translation)\upcite{DBLP:conf/emnlp/KimPPKN19,DBLP:journals/mt/WuW07}。这种方法会使用一种数据丰富的语言作为{\small\bfnew{中介语言}}\index{中介语言}或者{\small\bfnew{枢轴语言}}\index{枢轴语言}(Pivot Language)\index{Pivot Language},之后让源语言和目标语言向枢轴语言进行翻译。这样,通过资源丰富的枢轴语言将源语言和目标语言桥接在一起,从而解决源语言-目标语言双语数据缺乏的问题。比如,想要得到泰语到波兰语的翻译,可以通过英语做枢轴语言,即通过“泰语$\to$英语$\to$波兰语”的翻译过程完成泰语到波兰语的转换。
\parinterval 基于枢轴语言的方法很早就出现在统计机器翻译中。在基于短语的机器翻译中,已经有很多方法建立了源语言到枢轴语言和枢轴语言到目标语言的短语/单词级别特征,并基于这些特征开发了源语言到目标语言的系统\upcite{DBLP:conf/naacl/UtiyamaI07,DBLP:conf/acl/ZahabiBK13,DBLP:conf/emnlp/ZhuHWZWZ14,DBLP:conf/acl/MiuraNSTN15},这些系统也已经广泛用于翻译低资源语言对\upcite{DBLP:conf/acl/CohnL07,DBLP:journals/mt/WuW07,DBLP:conf/acl/WuW09,de2006catalan}。由于基于枢轴语言的方法与模型结构无关,因此也适用于神经机器翻译,并且取得了不错的效果\upcite{DBLP:conf/emnlp/KimPPKN19,DBLP:journals/corr/ChengLYSX16}。
\parinterval 基于枢轴语言的方法可以被描述为如图\ref{fig:16-11}所示的过程。这里,使用虚线表示具有双语平行语料库的语言对,并使用带有箭头的实线表示翻译方向,令$\seq{x}$、$\seq{y}$和$\seq{p}$分别表示源语言、目标语言和枢轴语言,对于输入源语言句子$\seq{x}$和目标语言句子$\seq{y}$,其翻译过程可以被建模为公式\eqref{eq:16-7}。
%----------------------------------------------
\begin{figure}[h]
\centering
\begin{eqnarray}
{\funp{P}(\seq{y}|\seq{x})} & = & \sum_{\seq{p}}{\funp{P}(\seq{p}|\seq{x})\funp{P}(\seq{y}|\seq{p})}
\label{eq:16-7}
\end{eqnarray}
\noindent 其中,$\seq{p}$表示一个枢轴语言句子,$\funp{P(\seq{y}|\seq{x})}$为从源语言句子$\seq{x}$翻译到目标语言句子$\seq{y}$的概率,$\funp{P}(\seq{p}|\seq{x})$为从源语言句子$\seq{x}$翻译到枢轴语言句子$\seq{p}$的概率,$\funp{P}(\seq{y}|\seq{p})$为从枢轴语言句子$\seq{p}$翻译到目标语言句子$\seq{y}$的概率。
\parinterval $\funp{P}(\seq{p}|\seq{x})$和$\funp{P}(\seq{y}|\seq{p})$可以直接复用既有的模型和方法。不过,枚举所有的枢轴语言句子$\seq{p}$是不可行的。因此一部分研究工作也探讨了如何选择有效的路径,从$\seq{x}$经过少量$\seq{p}$到达$\seq{y}$\upcite{DBLP:conf/naacl/PaulYSN09}。
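\parinterval 在实际系统中,由于无法枚举所有枢轴语言句子,一种常见的近似是只使用最可能的枢轴译文(或$n$-best枢轴译文)进行两步解码。下式仅给出这种近似的示意形式,并非书中介绍的具体方法:
\begin{eqnarray}
\hat{\seq{p}} & = & \arg\max_{\seq{p}}{\funp{P}(\seq{p}|\seq{x})} \nonumber \\
\hat{\seq{y}} & = & \arg\max_{\seq{y}}{\funp{P}(\seq{y}|\hat{\seq{p}})} \nonumber
\end{eqnarray}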
\parinterval 虽然基于枢轴语言的方法简单且易于实现,但该方法也有一些不足。例如,它需要两次翻译过程,因此增加了翻译时间。而且在两次翻译中,翻译错误会不断累积从而产生错误传播问题,导致模型翻译准确性降低。此外,基于枢轴语言的方法仍然假设源语言和枢轴语言(或者目标语言和枢轴语言)之间存在一定规模的双语平行数据,但是这个假设在很多情况下并不成立。比如,对于一些资源极度稀缺的语言,其到英语或者汉语的双语数据仍然十分匮乏,这时使用基于枢轴语言的方法的效果往往也并不理想。虽然存在以上问题,但是基于枢轴语言的方法仍然受到工业界的青睐,很多在线翻译引擎也在大量使用这种方法进行多语言的翻译。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
\parinterval 和基于枢轴语言的方法相比,基于教师-学生框架的方法无需训练源语言到枢轴语言的翻译模型,也就无需经历两次翻译过程,翻译效率有所提升,又避免了两次翻译所面临的错误传播问题。举个例子,假设图\ref{fig:16-12}中$\seq{x}$为源语言德语 “hallo”,$\seq{p}$为中间语言英语 “hello”,$\seq{y}$为目标语言法语“bonjour”,则德语“hallo”翻译为法语“bonjour”的概率应该与英语“hello”翻译为法语“bonjour”的概率相近。
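\parinterval 形式化地,这类方法基于“$\funp{P}(\seq{y}|\seq{x})$应与$\funp{P}(\seq{y}|\seq{p})$相近”的假设,用枢轴语言到目标语言的教师模型输出来指导源语言到目标语言的学生模型。下面给出一种常见的词级知识蒸馏目标的示意形式(其中$y_{j}$表示第$j$个目标语言单词,$w$遍历目标语言词表,$\theta_{\rm{teacher}}$和$\theta_{\rm{student}}$分别为教师模型和学生模型的参数,具体形式可能与书中方法有所差异):
\begin{eqnarray}
{L}_{\rm{KD}} & = & - \sum_{j}\sum_{w}{\funp{P}(y_{j}=w|\seq{p};\theta_{\rm{teacher}})}\log{\funp{P}(y_{j}=w|\seq{x};\theta_{\rm{student}})} \nonumber
\end{eqnarray}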
\parinterval 不过,基于知识蒸馏的方法仍然需要显性使用枢轴语言进行桥接,因此仍然面临着“源语言$\to$枢轴语言$\to$目标语言”转换中信息丢失的问题。比如,当枢轴语言到目标语言翻译效果较差时,由于教师模型无法提供准确的指导,学生模型也无法取得很好的学习效果。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
\subsection{基于迁移学习的方法}
\parinterval {\small\bfnew{迁移学习}}\index{迁移学习}(Transfer Learning)\index{Transfer Learning}是一种机器学习的方法,指的是一个预训练的模型被重新用在另一个任务中,而并不是从头训练一个新的模型\upcite{DBLP:conf/ijcnlp/Costa-JussaHB11,DBLP:journals/corr/HintonVD15}。迁移学习的目标是将某个领域或任务上学习到的知识应用到不同但相关的领域或问题中。在机器翻译中,可以用富资源语言的知识来改进低资源语言上的机器翻译性能,即将富资源语言中的知识迁移到低资源语言中。
\parinterval 基于枢轴语言的方法需要显性地建立“源语言$\to$枢轴语言$\to$目标语言”的路径。这时,如果路径中某处出现了问题,就会成为整个路径的瓶颈。如果使用多个枢轴语言,这个问题就会更加严重。不同于基于枢轴语言的方法,迁移学习无需进行两步解码,也就避免了翻译路径中累积错误的问题。
\parinterval 基于迁移学习的方法思想非常简单,如图\ref{fig:16-13}所示。这种方法无需像传统的机器学习一样为每个任务单独训练一个模型,它将所有任务分类为源任务和目标任务,目标就是将源任务中的知识迁移到目标任务当中。
%----------------------------------------------
\begin{figure}[h]
\centering
%----------------------------------------------------------------------------------------
\subsubsection{1. 参数初始化方法}
\parinterval 在解决多语言翻译问题时,首先需要在富资源语言上训练一个翻译模型,将其称之为{\small\bfnew{父模型}}\index{父模型}(Parent Model)\index{Parent Model}。在对父模型的参数进行初始化的基础上,训练低资源语言的翻译模型,称之为{\small\bfnew{子模型}}\index{子模型}(Child Model)\index{Child Model},这意味着低资源翻译模型将不会从随机初始化的参数开始学习,而是从父模型的参数开始\upcite{gu2018meta,DBLP:conf/icml/FinnAL17,DBLP:conf/naacl/GuHDL18}。这时,也可以把参数初始化看作是迁移学习。在图\ref{fig:16-14}中,左侧模型为父模型,右侧模型为子模型。这里假设从英语到汉语的翻译为富资源翻译,从英语到德语的翻译为低资源翻译,则首先用英中双语平行语料库训练出一个初始化的父模型,之后再用英语到德语的数据在父模型上微调得到子模型,这个子模型即为迁移学习的模型。此过程可以看作是在富资源语言上训练的模型基础上对低资源语言进行微调,将富资源语言中的知识迁移到低资源语言中,从而提升低资源语言的模型性能。
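\parinterval 下面用一小段Python代码示意“用父模型参数初始化子模型,再在低资源数据上微调”的流程。这里假设模型参数以检查点文件的形式保存,load\_checkpoint、build\_model、train等均为假设的示例接口,并非某个具体工具包的实现:

\begin{verbatim}
# 基于参数初始化的迁移学习流程(示意,接口均为假设)
def transfer_init_and_finetune(parent_ckpt, child_data,
                               load_checkpoint, build_model, train):
    # 1. 读取在富资源语言对(如英语-汉语)上训练好的父模型参数
    parent_params = load_checkpoint(parent_ckpt)
    # 2. 用父模型参数初始化子模型,而不是随机初始化
    child_model = build_model(init_params=parent_params)
    # 3. 在低资源语言对(如英语-德语)的数据上继续微调
    train(child_model, child_data)
    return child_model
\end{verbatim}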
%----------------------------------------------
\begin{figure}[h]
\end{figure}
%----------------------------------------------
\parinterval 这种方法尽管在某些低资源语言上取得了成功,但在资源极度匮乏或零资源的翻译任务中仍然表现不佳。具体而言,如果没有任何子模型的训练数据,则父模型在子模型的测试集上的性能会很糟糕\upcite{DBLP:conf/wmt/KocmiB18}。
%----------------------------------------------------------------------------------------
% NEW SUB-SUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{3. 零资源翻译}
\parinterval 零资源翻译是一种较为特殊的情况:源语言和目标语言之间没有任何对齐的数据。这时,需要学习一个模型,使其即使没有见过该翻译任务的训练数据,仍然能在该翻译任务上得到译文结果。本质上,零资源翻译也是一种迁移学习\upcite{DBLP:books/crc/aggarwal14/Pan14,DBLP:journals/tkde/PanY10},只是迁移的目标任务没有直接可以用的双语平行数据。
\parinterval 以德语到西班牙语的翻译为例,假设此翻译语言方向为零资源,即没有德语到西班牙语的双语平行数据,但是有德语到其他语言的双语平行数据,也有其他语言到西班牙语的双语平行数据。在模型训练时,训练数据的源语言句子可以增加一个语言标签。若没有语言标签,具有相同拼写但属于不同源语言的不同含义的单词可能难以翻译,但整个多语言翻译的流程更简单。假设,多语言单模型系统已经学习到了德语到英语和英语到西班牙语的翻译能力,那么这个系统也可以进行德语到西班牙语的翻译。从这个角度说,零资源神经机器翻译也需要枢轴语言,只是这些枢轴语言数据仅在训练期间使用\upcite{DBLP:journals/tacl/JohnsonSLKWCTVW17},而无需生成伪并行语料库。
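\parinterval 多语言单模型系统中添加语言标签的做法可以用如下Python代码示意。这里以在源语言句子前添加指示目标语言的标签为例(也有系统使用源语言标签),标签形式“<2xx>”只是一个示例约定,具体形式可能因系统而异:

\begin{verbatim}
# 为训练样本的源语言句子添加目标语言标签(示意)
def add_language_tag(src_sentence, tgt_lang):
    # 例如 tgt_lang 为 "es" 时,返回 "<2es> Guten Morgen"
    return "<2{}> {}".format(tgt_lang, src_sentence)

# 德语-英语、英语-西班牙语的句对混合训练;
# 解码时给德语输入加上 "<2es>" 标签,即可尝试零资源的德语-西班牙语翻译
print(add_language_tag("Guten Morgen", "en"))
print(add_language_tag("Good morning", "es"))
\end{verbatim}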
\subsection{无监督词典归纳}\label{unsupervised-dictionary-induction}
\parinterval {\small\bfnew{词典归纳}}\index{词典归纳}(Bilingual Dictionary Induction,BDI\index{Bilingual Dictionary Induction}),也叫{\small\bfnew{词典推断}},是实现语种间单词级别翻译的任务。在统计机器翻译中,词典归纳是一项核心的任务,它从双语平行语料中发掘互为翻译的单词,是翻译知识的主要来源\upcite{黄书剑0统计机器翻译中的词对齐研究}。在端到端神经机器翻译中,词典归纳通常被用到无监督机器翻译、多语言机器翻译等任务中。在神经机器翻译中,单词通过实数向量来表示,即词嵌入。所有单词分布在一个多维空间中,而且研究人员发现:单词嵌入空间在各种语言中显示出类似的结构,这使得直接利用词嵌入来构建双语词典成为可能\upcite{DBLP:journals/corr/MikolovLS13}。其基本想法是先将来自不同语言的词嵌入投影到共享嵌入空间中,然后在这个共享空间中归纳出双语词典,原理如图\ref{fig:16-16}所示。较早的尝试是使用一个包含数千词对的种子词典作为锚点来学习从源语言到目标语言词嵌入空间的线性映射,将两个语言的单词投影到共享的嵌入空间之后,执行一些对齐算法即可得到双语词典\upcite{DBLP:journals/corr/MikolovLS13}。最近的研究表明,词典归纳可以在更弱的监督信号下完成,这些监督信号来自更小的种子词典\upcite{DBLP:conf/acl/VulicK16}、相同的字符串\upcite{DBLP:conf/iclr/SmithTHH17},甚至仅仅是共享的数字\upcite{DBLP:conf/acl/ArtetxeLA17}。
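\parinterval 对于“学习从源语言到目标语言词嵌入空间的线性映射”这一步,在给定种子词典的情况下存在简单的解析解(带正交约束的Procrustes问题)。下面的Python代码给出一个简单示意,其中矩阵形状和函数名均为示例:

\begin{verbatim}
# 基于种子词典的线性映射学习(Procrustes 求解,示意)
import numpy as np

def learn_mapping(X, Y):
    # X: 种子词典中源语言单词的词嵌入矩阵,形状为 (d, n)
    # Y: 对应目标语言翻译词的词嵌入矩阵,形状为 (d, n)
    # 在正交约束下最小化 ||WX - Y||_F,闭式解由 SVD 给出
    U, _, Vt = np.linalg.svd(Y.dot(X.T))
    return U.dot(Vt)    # W 为 (d, d) 的正交映射矩阵

# 得到 W 之后,在共享空间中做最近邻检索即可归纳出双语词典
W = learn_mapping(np.random.rand(50, 1000), np.random.rand(50, 1000))
\end{verbatim}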
%----------------------------------------------
\begin{figure}[h]
\centering
\vspace{0.5em}
\end{itemize}
\parinterval 其具体流程如图\ref{fig:16-17}所示,包括:
\begin{itemize}
\vspace{0.5em}
\vspace{0.5em}
\item 基于GAN的方法\upcite{DBLP:conf/iclr/LampleCRDJ18,DBLP:conf/acl/ZhangLLS17,DBLP:conf/emnlp/XuYOW18,DBLP:conf/naacl/MohiuddinJ19}。在这个方法中,通过生成器来产生映射$\mathbi{W}$,鉴别器负责区分随机抽样的元素$\mathbi{W} \mathbi{X}$和$\mathbi{Y}$,两者共同优化收敛后即可得到映射$\mathbi{W}$。
\vspace{0.5em}
\item 基于Gromov-Wasserstein 的方法\upcite{DBLP:conf/emnlp/Alvarez-MelisJ18,DBLP:conf/lrec/GarneauGBDL20,DBLP:journals/corr/abs-1811-01124,DBLP:conf/emnlp/XuYOW18}。Wasserstein距离是度量空间中定义两个概率分布之间距离的函数。在这个任务中,它用来衡量不同语言中单词对之间的相似性,利用空间近似同构的信息可以定义出一些目标函数,之后通过优化该目标函数也可以得到映射$\mathbi{W}$。
\vspace{0.5em}
\end{itemize}
\tikzstyle{process} = [rectangle,very thick,rounded corners,minimum width=4.7cm,minimum height=2.5cm,text centered,draw=black!70,fill=red!20]
\tikzstyle{cir} = [circle,thick,rounded corners,minimum width=0.7cm,text centered,draw=black,fill=green!25]
\begin{tikzpicture}[node distance = 0,scale = 0.7]
\tikzstyle{every node}=[scale=0.7]
\node(voice)[scale=1.0]{声波};
\node(microphone)[rectangle,right of = voice,xshift=1.4cm,yshift=-1cm,minimum width=0.32cm,minimum height=0.35cm,fill=black!85,draw=black!85]{};
\draw[black!85,line width=1.8]([yshift=0.38cm,xshift=-0.4cm]microphone.north)arc(180:360:0.4cm);
\node(microphone_1)[rectangle,minimum width=0.4cm,minimum height=0.8cm,rounded corners=3pt,above of =microphone,yshift=0.75cm,draw=black!85,line width=2.5]{};
\draw[-,black!85,very thick]([yshift=0.4cm,xshift=-0.2cm]microphone.north)--([yshift=0.4cm,xshift=-0cm]microphone.north);
\draw[-,black!85,very thick]([yshift=0.5cm,xshift=-0.2cm]microphone.north)--([yshift=0.5cm,xshift=-0cm]microphone.north);
\draw[-,black!85,very thick]([yshift=0.6cm,xshift=-0.2cm]microphone.north)--([yshift=0.6cm,xshift=-0cm]microphone.north);
\draw[-,black!85,line width=1.8]([yshift=0.6cm,xshift=-0.4cm]microphone.north)--([yshift=0.37cm,xshift=-0.4cm]microphone.north);
\draw[-,black!85,line width=1.8]([yshift=0.6cm,xshift=0.4cm]microphone.north)--([yshift=0.37cm,xshift=0.4cm]microphone.north);
\draw[black!85,line width=1]([yshift=0.8cm,xshift=-0.8cm]microphone.north)arc(-45:45:0.3cm);
\draw[black!85,line width=1]([yshift=0.75cm,xshift=-0.7cm]microphone.north)arc(-45:45:0.4cm);
\draw[black!85,line width=1]([yshift=0.7cm,xshift=-0.6cm]microphone.north)arc(-45:45:0.5cm);
\node(process_1)[process,right of = microphone,xshift=4.7cm,yshift=0.5cm]{};
\node(text_1)[below of = process_1,yshift=-2cm,scale=1.3]{采样};
\draw [very thick,rounded corners=10pt]([xshift=-2.2cm,yshift=-1cm]process_1.center)--([xshift=-1.8cm,yshift=1cm]process_1.center)--([xshift=-1.4cm,yshift=0cm]process_1.center)--([xshift=-1.1cm,yshift=0.8cm]process_1.center)--([xshift=-0.8cm,yshift=-0.4cm]process_1.center)--([xshift=-0.5cm,yshift=0.4cm]process_1.center);
\draw [->,very thick]([xshift=-0.3cm]process_1.center)to([xshift=0.3cm]process_1.center);
\draw [very thick,rounded corners=10pt,densely dotted]([xshift=0.5cm,yshift=-1cm]process_1.center)--([xshift=0.9cm,yshift=1cm]process_1.center)--([xshift=1.3cm,yshift=0cm]process_1.center)--([xshift=1.6cm,yshift=0.8cm]process_1.center)--([xshift=1.9cm,yshift=-0.4cm]process_1.center)--([xshift=2.2cm,yshift=0.4cm]process_1.center);
\node(process_2)[process,right of = process_1,xshift=6.6cm]{};
\node(text_2)[below of = process_2,yshift=-2cm,scale=1.3]{量化};
\draw [very thick,rounded corners=10pt,densely dotted]([xshift=-2.2cm,yshift=-1cm]process_2.center)--([xshift=-1.8cm,yshift=1cm]process_2.center)--([xshift=-1.4cm,yshift=0cm]process_2.center)--([xshift=-1.1cm,yshift=0.8cm]process_2.center)--([xshift=-0.8cm,yshift=-0.4cm]process_2.center)--([xshift=-0.5cm,yshift=0.4cm]process_2.center);
\draw [->,very thick]([xshift=-0.3cm]process_2.center)to([xshift=0.3cm]process_2.center);
\draw [very thick,]([xshift=0.5cm,yshift=-0.8cm]process_2.center)--([xshift=0.5cm,yshift=0.3cm]process_2.center)--([xshift=0.7cm,yshift=0.3cm]process_2.center)--([xshift=0.7cm,yshift=0.8cm]process_2.center)--([xshift=1cm,yshift=0.8cm]process_2.center)--([xshift=1cm,yshift=0.2cm]process_2.center)--([xshift=1.3cm,yshift=0.2cm]process_2.center)--([xshift=1.3cm,yshift=0.6cm]process_2.center)--([xshift=1.6cm,yshift=0.6cm]process_2.center)--([xshift=1.6cm,yshift=-0.3cm]process_2.center)--([xshift=1.8cm,yshift=-0.3cm]process_2.center)--([xshift=1.8cm,yshift=0.3cm]process_2.center)--([xshift=2cm,yshift=0.3cm]process_2.center);
\node(text1)[left of = process_1,xshift=-3.2cm,yshift=-0.5cm,align=center]{模拟\\语音信号};
\node(text2)[right of = process_1,xshift=3.3cm,yshift=-0.5cm,align=center]{离散\\时间信号};
\node(text3)[right of = process_2,xshift=3.2cm,yshift=-0.5cm,align=center]{数字离散\\时间信号};
\draw[->,very thick](process_1.east)to(process_2.west);
\draw[->,very thick]([xshift=-1.8cm]process_1.west)to(process_1.west);
\draw[->,very thick](process_2.east)to([xshift=1.8cm]process_2.east);
%%%%音频
\node(signal)[right of = process_2,xshift=5.5cm]{};
\draw[-,thick,]([xshift=-1.2cm]signal.center)--([xshift=1.2cm]signal.center);
\draw[-,thick]([xshift=-1cm,yshift=-0.8cm]signal.center)--([xshift=-0.9cm,yshift=0.4cm]signal.center)--([xshift=-0.8cm,yshift=-0.3cm]signal.center)--([xshift=-0.7cm,yshift=0.7cm]signal.center)--([xshift=-0.6cm,yshift=-0.1cm]signal.center)--([xshift=-0.5cm,yshift=0.3cm]signal.center)--([xshift=-0.4cm,yshift=-0.5cm]signal.center)--([xshift=-0.3cm,yshift=0.7cm]signal.center)--([xshift=-0.2cm,yshift=-0.2cm]signal.center)--([xshift=-0.1cm,yshift=0.4cm]signal.center)--([xshift=0cm,yshift=-0.9cm]signal.center)--([xshift=0.1cm,yshift=0.5cm]signal.center)--([xshift=0.2cm,yshift=-0.4cm]signal.center)--([xshift=0.3cm,yshift=0.3cm]signal.center)--([xshift=0.4cm,yshift=-0.2cm]signal.center)--([xshift=0.5cm,yshift=0.1cm]signal.center)--([xshift=0.6cm,yshift=-0.8cm]signal.center)--([xshift=0.7cm,yshift=0.4cm]signal.center)--([xshift=0.8cm,yshift=-0.6cm]signal.center)--([xshift=0.9cm,yshift=0.7cm]signal.center)--([xshift=1cm,yshift=-0.2cm]signal.center);
\end{tikzpicture}
\tikzstyle{process} = [rectangle,very thick,rounded corners,minimum width=3.2cm,minimum height=3cm,text centered,draw=black!70,fill=red!20]
\tikzstyle{cir} = [circle,thick,rounded corners,minimum width=0.7cm,text centered,draw=black,fill=green!25]
\begin{tikzpicture}[node distance = 0,scale = 0.5]
\tikzstyle{every node}=[scale=0.5]
\node(process_1)[process]{};
\draw[-,thick]([xshift=-1.2cm]process_1.center)--([xshift=1.2cm]process_1.center);
\draw[-,thick]([xshift=-1cm,yshift=-0.8cm]process_1.center)--([xshift=-0.9cm,yshift=0.4cm]process_1.center)--([xshift=-0.8cm,yshift=-0.3cm]process_1.center)--([xshift=-0.7cm,yshift=0.7cm]process_1.center)--([xshift=-0.6cm,yshift=-0.1cm]process_1.center)--([xshift=-0.5cm,yshift=0.3cm]process_1.center)--([xshift=-0.4cm,yshift=-0.5cm]process_1.center)--([xshift=-0.3cm,yshift=0.7cm]process_1.center)--([xshift=-0.2cm,yshift=-0.2cm]process_1.center)--([xshift=-0.1cm,yshift=0.4cm]process_1.center)--([xshift=0cm,yshift=-0.9cm]process_1.center)--([xshift=0.1cm,yshift=0.5cm]process_1.center)--([xshift=0.2cm,yshift=-0.4cm]process_1.center)--([xshift=0.3cm,yshift=0.3cm]process_1.center)--([xshift=0.4cm,yshift=-0.2cm]process_1.center)--([xshift=0.5cm,yshift=0.1cm]process_1.center)--([xshift=0.6cm,yshift=-0.8cm]process_1.center)--([xshift=0.7cm,yshift=0.4cm]process_1.center)--([xshift=0.8cm,yshift=-0.6cm]process_1.center)--([xshift=0.9cm,yshift=0.7cm]process_1.center)--([xshift=1cm,yshift=-0.2cm]process_1.center);
\node(text_1)[below of = process_1,yshift=-2cm,scale=1.5]{语音信号};
\node(process_2)[process,right of = process_1,xshift=7.0cm,text width=4cm,align=center]{\baselineskip=4pt\LARGE{[[0.2,...,0.3], \qquad ..., \qquad 0.3,...,0.5]]}\par};
\node(text_2)[below of = process_2,yshift=-2cm,scale=1.5]{语音特征};
\node(process_3)[process,minimum width=6cm,minimum height=5cm,right of = process_2,xshift=8.2cm,text width=4cm,align=center]{};
\node(text_3)[below of = process_3,yshift=-3cm,scale=1.5]{源语文本及其词格};
\node(cir_s)[cir,very thick, below of = process_3,xshift=-2.2cm,yshift=1.1cm]{\LARGE S};
\node(cir_a)[cir,right of = cir_s,xshift=1cm,yshift=0.8cm]{\LARGE a};
\node(cir_c)[cir,right of = cir_a,xshift=1.2cm,yshift=0cm]{\LARGE c};
\node(cir_f)[cir,right of = cir_c,xshift=1.2cm,yshift=0cm]{\LARGE f};
\node(cir_E)[cir,very thick,right of = cir_f,xshift=1cm,yshift=-0.8cm]{\LARGE E};
\node(cir_b)[cir,right of = cir_s,xshift=1cm,yshift=-0.8cm]{\Large b};
\node(cir_d)[cir,right of = cir_b,xshift=1cm,yshift=0.6cm]{\Large d};
\node(cir_e)[cir, right of = cir_b,xshift=1cm,yshift=-0.8cm]{\LARGE e};
\node(cir_g)[cir,right of = cir_e,xshift=1cm,yshift=0.8cm]{\LARGE g};
\draw[-latex](cir_s)node[above,xshift=0.3cm,yshift=0.4cm]{0.4}to(cir_a);
\draw[-latex](cir_a)node[above,xshift=0.6cm,yshift=0cm]{1}to(cir_c);
\draw[-latex](cir_c)node[above,xshift=0.6cm,yshift=0cm]{1}to(cir_f);
\draw[-latex](cir_f)node[above,xshift=0.6cm,yshift=-0.3cm]{1}to(cir_E);
\draw[-latex](cir_s)node[above,xshift=0.7cm,yshift=-0.4cm]{0.6}to(cir_b);
\draw[-latex](cir_b)node[above,xshift=0.3cm,yshift=0.3cm]{0.8}to(cir_d);
\draw[-latex](cir_b)node[above,xshift=0.7cm,yshift=-0.4cm]{0.2}to(cir_e);
\draw[-latex](cir_e)node[above,xshift=0.3cm,yshift=0.3cm]{1}to(cir_g);
\draw[-latex](cir_d)node[above,xshift=0.7cm,yshift=0cm]{1}to(cir_f);
\draw[-latex](cir_g)node[above,xshift=0.6cm,yshift=0.3cm]{1}--(cir_E);
\node(text)[below of = process_3,yshift=-1.8cm,scale=1.8]{你是谁};
\node(process_4)[process,right of = process_3,xshift=8.2cm,text width=4cm,align=center]{\Large\textbf{Who are you?}};
\node(text_4)[below of = process_4,yshift=-2cm,scale=1.5]{翻译译文};
\draw[->,very thick](process_1.east)to(process_2.west);
\draw[->,very thick](process_2.east)to(process_3.west);
\draw[->,very thick](process_3.east)to(process_4.west);
\node(arrow_text1)[right of = process_1,xshift=3.2cm,yshift=0.7cm,scale=1.4,align=center]{音频\\特征提取};
\node(arrow_text2)[right of = process_2,xshift=3.6cm,yshift=0.7cm,scale=1.4,align=center]{语音\\识别系统};
\node(arrow_text3)[right of = process_3,xshift=4.5cm,yshift=0.4cm,scale=1.4]{翻译系统};
\end{tikzpicture}
\tikzstyle{process} = [rectangle,very thick,rounded corners,minimum width=5cm,minimum height=2.5cm,text centered,draw=black!70,fill=red!25]
\tikzstyle{cir} = [circle,thick,rounded corners,minimum width=0.7cm,text centered,draw=black,fill=green!25]
\begin{tikzpicture}[node distance = 0,scale = 1]
\tikzstyle{every node}=[scale=1]
\node [anchor=center](ori) at (-0.2,-0.2) {$O$};
\draw[->,thick](-0.5,0)--(5,0)node[below]{$t$};
\draw[->,thick](0,-2)--(0,2)node[left,scale=0.8]{量化值};
\draw[-,thick](0,0)sin(0.7,1.5)cos(1.4,0)sin(2.1,-1.5)cos(2.8,0)sin(3.5,1.5)cos(4.2,0);
\draw[-,thick,dashed](0.5,-1.8)--(0.5,1.8);
\draw[-](1.2,-1.8)--(1.2,1.8);
\draw[-,thick,dashed](1.9,-1.8)--(1.9,1.8);
\draw[<->,thick](0,-1.1)--(1.2,-1.1)node[left,xshift=-0.05cm,yshift=0.15cm,scale=0.6]{帧长};
\draw[<->,thick](0,-1.4)--(0.5,-1.4)node[left,xshift=0.05cm,yshift=-0.25cm,scale=0.6]{帧移};
\draw[<->,thick](0.5,-1.4)--(1.9,-1.4);
\end{tikzpicture}
\tikzstyle{coder} = [rectangle,thick,rounded corners,minimum width=2.3cm,minimum height=1cm,text centered,draw=black!70,fill=red!25]
\begin{tikzpicture}[node distance = 0,scale = 0.75]
\tikzstyle{every node}=[scale=0.75]
\node(encoder)[coder]{\large{编码器}};
\node(decoder_1)[coder,above of =encoder,xshift=-1.6cm,yshift=2.4cm,fill=blue!20]{\large{解码器}};
\node(decoder_2)[coder,above of =encoder, xshift=1.6cm,yshift=2.4cm,fill=yellow!25]{\large{解码器}};
\node(s)[below of = encoder,yshift=-1.8cm,scale=1.6]{$s$};
\node(y)[above of = decoder_2,yshift=1.8cm,scale=1.6]{$y$};
\draw[->,thick](s.north)to(encoder.south);
\draw[->,thick](decoder_1.east)to(decoder_2.west);
\draw[->,thick](decoder_2.north)to(y.south);
\draw[->,thick](encoder.north)--([yshift=0.6725cm]encoder.north)--([yshift=-0.7cm]decoder_1.south)--(decoder_1.south);
\draw[->,thick](encoder.north)--([yshift=0.6725cm]encoder.north)--([yshift=-0.7cm]decoder_2.south)--(decoder_2.south);
\end{tikzpicture}
\tikzstyle{coder} = [rectangle,thick,rounded corners,minimum width=2.3cm,minimum height=1cm,text centered,draw=black!70,fill=red!20]
\begin{tikzpicture}[node distance = 0,scale = 0.75]
\tikzstyle{every node}=[scale=0.75]
\node(encoder)[coder]at (0,0){\large{编码器}};
\node(decoder_1)[coder,above of =encoder,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
\node(decoder_2)[coder,above of =encoder, xshift=1.6cm,yshift=2.8cm,fill=yellow!20]{\large{解码器}};
\node(s)[below of = encoder,yshift=-1.8cm,scale=1.6]{$s$};
\node(x)[above of = decoder_1,yshift=1.8cm,scale=1.6]{$x$};
\node(y)[above of = decoder_2,yshift=1.8cm,scale=1.6]{$y$};
\draw[->,thick](s.north)to(encoder.south);
\draw[->,thick](decoder_1.north)to(x.south);
\draw[->,thick](decoder_2.north)to(y.south);
\draw[->,thick](encoder.north)--([yshift=0.7cm]encoder.north)--([xshift=-4.16em,yshift=0.7cm]encoder.north)--(decoder_1.south);
\draw[->,thick](encoder.north)--([yshift=0.7cm]encoder.north)--([xshift=4.16em,yshift=0.7cm]encoder.north)--(decoder_2.south);
\node [anchor=north](pos1) at (s.south) {(a) 单编码器-双解码器方式};
%%%%%%%%%%%%%%%%%%%%%%%%级联
\node(encoder-2)[coder]at ([xshift=10.0em]encoder.east){\large{编码器}};
\node(decoder_1-2)[coder,above of =encoder-2,yshift=1.4cm,fill=blue!20]{\large{解码器}};
\node(decoder_2-2)[coder,above of =decoder_1-2, yshift=1.4cm,fill=yellow!20]{\large{解码器}};
\node(s-2)[below of = encoder-2,yshift=-1.8cm,scale=1.6]{$s$};
\node(y-2)[above of = decoder_2-2,yshift=1.8cm,scale=1.6]{$y$};
\draw[->,thick](s-2.north)to(encoder-2.south);
\draw[->,thick](encoder-2.north)to(decoder_1-2.south);
\draw[->,thick](decoder_1-2.north)to(decoder_2-2.south);
\draw[->,thick](decoder_2-2.north)to(y-2.south);
\node [anchor=north](pos2) at (s-2.south) {(b) 级联编码器方式};
%%%%%%%%%%%%%%%%%%%%%%%%联合
\node(encoder-3)[coder]at([xshift=10.0em]encoder-2.east){\large{编码器}};
\node(decoder_1-3)[coder,above of =encoder-3,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
\node(decoder_2-3)[coder,above of =encoder-3, xshift=1.6cm,yshift=2.8cm,fill=yellow!20]{\large{解码器}};
\node(s-3)[below of = encoder-3,yshift=-1.8cm,scale=1.6]{$s$};
\node(y-3)[above of = decoder_2-3,yshift=1.8cm,scale=1.6]{$y$};
\draw[->,thick](s-3.north)to(encoder-3.south);
\draw[->,thick](decoder_1-3.east)to(decoder_2-3.west);
\draw[->,thick](decoder_2-3.north)to(y-3.south);
\draw[->,thick](encoder-3.north)--([yshift=0.7cm]encoder-3.north)--([xshift=-4.16em,yshift=0.7cm]encoder-3.north)--(decoder_1-3.south);
\draw[->,thick](encoder-3.north)--([yshift=0.7cm]encoder-3.north)--([xshift=4.16em,yshift=0.7cm]encoder-3.north)--(decoder_2-3.south);
\node [anchor=north](pos3) at (s-3.south) {(c) 联合编码器方式};
\end{tikzpicture}
\subsection{音频处理}
\parinterval 不同于文本,音频本质上是经过若干信号处理之后的{\small\bfnew{波形}}(Waveform)\index{Waveform}。具体来说,声音是一种空气的震动,因此可以被转换为模拟信号。模拟信号是一段连续的信号,经过采样变为离散数字信号。采样是每隔固定的时间记录一下声音的振幅,采样率表示每秒的采样点数,单位是赫兹(Hz)。采样率越高,结果的损失则越小。通常来说,采样的标准是能够通过离散化的数字信号重现原始语音。我们日常生活中使用的手机和电脑设备的采样率一般为16kHz,表示每秒16000个采样点;而音频CD的采样率可以达到44.1kHz。经过进一步的量化,将采样点的值转换为整型数值保存,从而减少占用的存储空间,通常采用的是16位量化。将采样率和量化位数相乘,就可以得到{\small\bfnew{比特率}}\index{比特率}(Bits Per Second,BPS)\index{Bits Per Second},表示音频每秒占用的位数。16kHz采样率和16位量化的音频,比特率为256kb/s。整体流程如图\ref{fig:17-1}所示\upcite{洪青阳2020语音识别原理与应用,陈果果2020语音识别实战}。
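\parinterval 以上面的设置为例,比特率可由采样率与量化位数直接相乘得到:
\begin{eqnarray}
16000 \ {\rm{Hz}} \times 16 \ {\rm{bit}} & = & 256000 \ {\rm{bit/s}} \ = \ 256 \ {\rm{kb/s}} \nonumber
\end{eqnarray}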
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter17/Figures/figure-audio-processing}
\caption{音频处理过程}
\label{fig:17-1}
\end{figure}
%----------------------------------------------------------------------------------------------------
\parinterval 从上面的描述中可以看出,音频的表示实际上是一个非常长的采样点序列,这导致了直接使用现有的深度学习技术处理音频序列较为困难。并且,原始的音频信号中可能包含较多的噪声、环境声或冗余信息,这些信息也会对模型产生干扰。因此,一般会对音频序列进行处理来提取声学特征,具体为将长序列的采样点序列转换为短序列的特征向量序列,再用于下游系统模块。虽然已有一些工作不依赖特征提取,直接在原始的采样点序列上进行声学建模和模型训练\upcite{DBLP:conf/interspeech/SainathWSWV15},但目前的主流方法仍然是基于声学特征进行建模\upcite{DBLP:conf/icassp/MohamedHP12}。
\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分,目的是使频谱更加平滑。分帧是基于短时平稳假设,即根据生物学特征,语音信号是一个缓慢变化的过程,10ms~30ms的信号片段是相对平稳的。基于这个假设,一般将每25ms作为一帧来提取特征,这个时间称为{\small\bfnew{帧长}}\index{帧长}(Frame Length)\index{Frame Length}。同时,为了保证不同帧之间的信号平滑性,使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧,这个时长称为{\small\bfnew{帧移}}\index{帧移}(Frame Shift)\index{Frame Shift}。为了缓解分帧带来的频谱泄漏问题,需要对每帧的信号进行加窗处理,使其幅度在两端渐变到0,一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}(Hamming)\index{Hamming}。
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter17/Figures/figure-framing-schematic}
\caption{分帧原理图}
\label{fig:17-2}
\end{figure}
%----------------------------------------------------------------------------------------------------
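\parinterval 按照上述帧长25ms、帧移10ms的设置,分帧和加窗的过程可以用如下Python代码示意(仅为说明原理的简化实现):

\begin{verbatim}
# 按帧长 25ms、帧移 10ms 分帧并加汉明窗(示意)
import numpy as np

def enframe(samples, sample_rate=16000,
            frame_len_ms=25, frame_shift_ms=10):
    frame_len = int(sample_rate * frame_len_ms / 1000)     # 每帧采样点数
    frame_shift = int(sample_rate * frame_shift_ms / 1000) # 帧移采样点数
    num_frames = 1 + (len(samples) - frame_len) // frame_shift
    window = np.hamming(frame_len)                         # 汉明窗
    frames = [samples[i * frame_shift: i * frame_shift + frame_len] * window
              for i in range(num_frames)]
    return np.stack(frames)

# 1 秒 16kHz 的音频约可得到 (16000-400)//160 + 1 = 98 帧
frames = enframe(np.random.randn(16000))
\end{verbatim}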
\parinterval 经过了上述的预处理操作,可以得到音频对应的帧序列,之后通过不同的操作来提取不同类型的声学特征。常用的声学特征包括{\small\bfnew{Mel频率倒谱系数}}\index{Mel频率倒谱系数}(Mel-Frequency Cepstral Coefficient,MFCC)\index{Mel-Frequency Cepstral Coefficient}、{\small\bfnew{感知线性预测系数}}\index{感知线性预测系数}(Perceptual Linear Predictive,PLP)\index{Perceptual Linear Predictive}、{\small\bfnew{滤波器组}}\index{滤波器组}(Filter-bank,Fbank)\index{Filter-bank}等。MFCC、PLP和Fbank特征都需要对预处理后的音频做{\small\bfnew{短时傅里叶变换}}\index{短时傅里叶变换}(Short-time Fourier Transform,STFT)\index{Short-time Fourier Transform},得到具有规律的线性分辨率。之后再经过特定的操作,得到各种声学特征。不同声学特征的特点是不同的,MFCC去相关性较好,PLP抗噪性强,Fbank可以保留更多的语音原始特征。在语音翻译中,比较常用的声学特征为Fbank或MFCC\upcite{洪青阳2020语音识别原理与应用}。
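\parinterval 作为参考,下面的代码用开源工具librosa提取Fbank(对数Mel谱)和MFCC特征,帧长与帧移和上文设置一致。这只是众多可行实现中的一种,并非书中系统的具体做法:

\begin{verbatim}
# 用 librosa 提取 Fbank(对数 Mel 谱)与 MFCC 特征(示意)
import numpy as np
import librosa

y, sr = np.random.randn(16000), 16000      # 1 秒的示例音频
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=400,
                                     hop_length=160, n_mels=80)
fbank = np.log(mel + 1e-6)                  # 80 维 Fbank 特征
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13,
                            n_fft=400, hop_length=160)
\end{verbatim}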
\subsection{级联式语音翻译}
\parinterval 实现语音翻译最简单的思路是基于级联的方式,即:先通过{\small\bfnew{自动语音识别}}\index{自动语音识别}(Automatic Speech Recognition,ASR)\index{Automatic Speech Recognition}系统将语音识别为源语言文本,然后利用机器翻译系统将源语言文本翻译为目标语言文本。这种做法的好处在于语音识别和机器翻译模型可以分别进行训练,有很多数据资源以及成熟技术可以分别运用到两个系统中。因此,级联语音翻译是很长时间以来的主流方法,深受工业界的青睐。级联语音翻译主要的流程如图\ref{fig:17-3}所示。
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter17/Figures/figure-cascading-speech-translation}
\caption{级联语音翻译}
\label{fig:17-3}
\end{figure}
%----------------------------------------------------------------------------------------------------
\parinterval 针对这两个问题,研究人员也提出了很多方法进行缓解,包括多任务学习、迁移学习等,主要思想都是利用语音识别或文本翻译数据来指导语音翻译模型的学习。并且,文本翻译中的很多方法和思想都对语音翻译技术的发展提供了思路。如何将其他领域现有的工作在语音翻译任务上验证,并针对语音这一信息载体进行特定的建模适应,是语音翻译任务当前的研究重点\upcite{DBLP:conf/mtsummit/GangiNCDT19}。
%----------------------------------------------------------------------------------------------------
\noindent{\small\bfnew{1)多任务学习}}

\parinterval 针对语音翻译模型建模复杂度较高的问题,常用的一个方法是进行多任务学习,使模型在训练过程中有更多的监督信息,从而使模型收敛得更加充分。语音翻译中的多任务学习主要借助语音对应的标注信息,也就是源语言文本。{\small\bfnew{连接时序分类}}\index{连接时序分类}(Connectionist Temporal Classification,CTC)\index{Connectionist Temporal Classification}\upcite{DBLP:conf/icml/GravesFGS06}是语音处理中最简单有效的一种多任务学习方法\upcite{DBLP:journals/jstsp/WatanabeHKHH17,DBLP:conf/icassp/KimHW17},也被广泛应用于文本识别任务中\upcite{DBLP:journals/pami/ShiBY17}。CTC可以将输入序列的每一个位置都对应到标注文本中,学习语音和文字之间的软对齐关系。比如,对于下面的音频序列,CTC可以将每个位置分别对应到同一个词。需要注意的是,CTC会额外新增一个词$\epsilon$,类似于一个空白词,表示这个位置没有声音或者没有任何对应的预测结果。然后,将相同且连续的词合并,去除$\epsilon$,就可以得到预测结果,如图\ref{fig:17-2-6}所示。
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
\end{figure}
%----------------------------------------------------------------------------------------------------
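\parinterval CTC推断时“先合并相同且连续的词、再去除空白词$\epsilon$”的规则可以用几行Python代码直观地表示(这里用一个特殊字符串代表空白词,仅为示意):

\begin{verbatim}
# CTC 输出的折叠规则:合并相同且连续的词,再去除空白词(示意)
def ctc_collapse(path, blank="<eps>"):
    # 1. 合并相同且连续的词
    merged = [t for i, t in enumerate(path) if i == 0 or t != path[i - 1]]
    # 2. 去除空白词
    return [t for t in merged if t != blank]

# 例如:["你", "你", "<eps>", "是", "<eps>", "谁", "谁"] -> ["你", "是", "谁"]
print(ctc_collapse(["你", "你", "<eps>", "是", "<eps>", "谁", "谁"]))
\end{verbatim}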
\parinterval 另外一种多任务学习的思想是通过两个解码器,分别预测语音对应的源语言句子和目标语言句子,具体有图\ref{fig:17-9}展示的三种方式\upcite{DBLP:conf/naacl/AnastasopoulosC18,DBLP:conf/asru/BaharBN19}。图\ref{fig:17-9}(a)中采用单编码器-双解码器的方式,两个解码器根据编码器的表示,分别预测源语言句子和目标语言句子,从而使编码器训练得更加充分。这种做法的好处在于仅仅增加了训练代价,解码时只需要生成目标语言句子即可。图\ref{fig:17-9}(b)则通过使用两个级联的解码器,先利用第一个解码器生成源语言句子,然后再利用第一个解码器的表示,通过第二个解码器生成目标语言句子。这种方法通过增加一个中间输出,降低了模型的训练难度,但同时也会带来额外的解码耗时,因为两个解码器需要串行地进行生成。图\ref{fig:17-9}(c)中模型更进一步,第二个解码器联合编码器和第一个解码器的表示进行生成,更充分地利用了已有信息。
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter17/Figures/figure-three-ways-of-dual-decoder-speech-translation}
\caption{双解码器语音翻译的三种方式}
\label{fig:17-9}
\end{figure}
%----------------------------------------------------------------------------------------------------
\noindent{\small\bfnew{2)迁移学习}}

\parinterval 相比语音识别和文本翻译,端到端语音翻译的训练数据量要小很多,因此,如何利用其它数据来增加可用的数据量是语音翻译的一个重要方向。和文本翻译中的方法相似,一种思路是利用迁移学习或预训练,利用其他语言的双语数据预训练模型参数,然后迁移到目标语言任务上\upcite{DBLP:conf/naacl/BansalKLLG19},或者是利用语音识别数据或文本翻译数据,分别预训练编码器和解码器参数,用于初始化语音翻译模型参数\upcite{DBLP:conf/icassp/BerardBKP18}。预训练的编码器对语音翻译模型的学习尤为重要\upcite{DBLP:conf/naacl/BansalKLLG19},相比文本数据,语音数据的复杂性更高,如果仅从小规模的语音翻译数据上学习,则很难学习充分。此外,模型对声学特征的学习与语言并不是强相关的,因此在其他语种上预训练的编码器对模型学习也是有帮助的。
\noindent{\small\bfnew{3)数据增强}}
\parinterval 数据增强是增加训练数据最简单直观的一种方法。在文本翻译中,可以利用回译的方法生成伪数据(见{\chaptersixteen});而语音翻译的正向翻译模型是通过源语言语音生成目标语言文本,如果直接利用回译的思想,需要一个将目标语言文本翻译为目标语言语音的模型,而这种模型并不能简单得到。因此,一个简单的思路是通过一个反向翻译模型和语音合成模型级联来生成伪数据\upcite{DBLP:conf/icassp/JiaJMWCCALW19}。另外,正向翻译模型生成的伪数据在文本翻译中也被验证了对模型训练有一定的帮助,因此同样可以利用语音识别和文本翻译模型级联,将源语言语音翻译为目标语言文本,得到伪平行语料。
%----------------------------------------------------------------------------------------------------
\parinterval 此外,研究人员们还探索了很多其他方法来提高语音翻译模型的性能。利用在海量的无标注语音数据上预训练的{\small\bfnew{自监督}}\index{自监督}(Self-supervised)\index{Self-supervised}模型作为一个特征提取器,将从语音中提取的特征作为语音翻译模型的输入,可以有效提高模型的性能\upcite{DBLP:conf/interspeech/WuWPG20}。相比语音翻译模型,文本翻译模型任务更加简单,因此一种思想是利用文本翻译模型来指导语音翻译模型,比如通过知识蒸馏\upcite{DBLP:conf/interspeech/LiuXZHWWZ19}、正则化\upcite{DBLP:conf/emnlp/AlinejadS20}等方法。为了简化语音翻译模型的学习,可以通过课程学习的策略,使模型从语音识别任务逐渐过渡到语音翻译任务,这种由易到难的训练策略可以使模型训练更加充分\upcite{DBLP:journals/corr/abs-1802-06003,DBLP:conf/acl/WangWLZY20}。
...@@ -19,10 +19,10 @@ ...@@ -19,10 +19,10 @@
year={2019} year={2019}
} }
@article{赵军峰2019深化改革, @inproceedings{赵军峰2019深化改革,
title ={深化改革 探讨创新 推进发展——全国翻译专业学位研究生教育2019年会综述}, title ={深化改革 探讨创新 推进发展——全国翻译专业学位研究生教育2019年会综述},
author ={赵军峰,姚恺璇}, author ={赵军峰,姚恺璇},
journal ={中国翻译}, publisher ={中国翻译},
year ={2019}, year ={2019},
} }
...@@ -33,28 +33,28 @@ ...@@ -33,28 +33,28 @@
publisher={University of Toronto Press} publisher={University of Toronto Press}
} }
@article{DBLP:journals/bstj/Shannon48, @inproceedings{DBLP:journals/bstj/Shannon48,
author = {Claude E. Shannon}, author = {Claude E. Shannon},
title = {A mathematical theory of communication}, title = {A mathematical theory of communication},
journal = {Bell System Technical Journal}, publisher = {Bell System Technical Journal},
volume = {27}, volume = {27},
number = {3}, number = {3},
pages = {379--423}, pages = {379--423},
year = {1948} year = {1948}
} }
@article{shannon1949the, @inproceedings{shannon1949the,
title={The mathematical theory of communication}, title={The mathematical theory of communication},
author={Claude E. {Shannon} and Warren {Weaver}}, author={Claude E. {Shannon} and Warren {Weaver}},
journal={IEEE Transactions on Instrumentation and Measurement}, publisher={IEEE Transactions on Instrumentation and Measurement},
volume={13}, volume={13},
year={1949} year={1949}
} }
@article{weaver1955translation, @inproceedings{weaver1955translation,
title={Translation}, title={Translation},
author={Weaver, Warren}, author={Weaver, Warren},
journal={Machine translation of languages}, publisher={Machine translation of languages},
volume={14}, volume={14},
number={15-23}, number={15-23},
pages={10}, pages={10},
...@@ -62,10 +62,10 @@ ...@@ -62,10 +62,10 @@
publisher={Cambridge: Technology Press, MIT} publisher={Cambridge: Technology Press, MIT}
} }
@article{chomsky1957syntactic, @inproceedings{chomsky1957syntactic,
title={Syntactic Structures}, title={Syntactic Structures},
author={Chomsky, Noam}, author={Chomsky, Noam},
journal={Language}, publisher={Language},
volume={33}, volume={33},
number={3}, number={3},
year={1957}, year={1957},
...@@ -80,23 +80,23 @@ ...@@ -80,23 +80,23 @@
year = {1990} year = {1990}
} }
@article{DBLP:journals/coling/BrownPPM94, @inproceedings{DBLP:journals/coling/BrownPPM94,
author = {Peter F. Brown and author = {Peter F. Brown and
Stephen Della Pietra and Stephen Della Pietra and
Vincent J. Della Pietra and Vincent J. Della Pietra and
Robert L. Mercer}, Robert L. Mercer},
title = {The Mathematics of Statistical Machine Translation: Parameter Estimation}, title = {The Mathematics of Statistical Machine Translation: Parameter Estimation},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {19}, volume = {19},
number = {2}, number = {2},
pages = {263--311}, pages = {263--311},
year = {1993} year = {1993}
} }
@article{nirenburg1989knowledge, @inproceedings{nirenburg1989knowledge,
title={Knowledge-based machine translation}, title={Knowledge-based machine translation},
author={Nirenburg, Sergei}, author={Nirenburg, Sergei},
journal={Machine Translation}, publisher={Machine Translation},
volume={4}, volume={4},
number={1}, number={1},
pages={5--24}, pages={5--24},
...@@ -111,10 +111,10 @@ ...@@ -111,10 +111,10 @@
publisher={Ellis Horwood Chichester} publisher={Ellis Horwood Chichester}
} }
@article{zarechnak1979history, @inproceedings{zarechnak1979history,
title={The history of machine translation}, title={The history of machine translation},
author={Zarechnak, Michael}, author={Zarechnak, Michael},
journal={Machine Translation}, publisher={Machine Translation},
volume={1979}, volume={1979},
pages={1--87}, pages={1--87},
year={1979} year={1979}
...@@ -127,44 +127,44 @@ ...@@ -127,44 +127,44 @@
year={2004}, year={2004},
} }
@article{王宝库1991机器翻译系统中一种规则描述语言, @inproceedings{王宝库1991机器翻译系统中一种规则描述语言,
title={机器翻译系统中一种规则描述语言(CTRDL)}, title={机器翻译系统中一种规则描述语言(CTRDL)},
author={王宝库,张中义,姚天顺}, author={王宝库,张中义,姚天顺},
journal={中文信息学报}, publisher={中文信息学报},
volume={5}, volume={5},
number={4}, number={4},
year={1991}, year={1991},
} }
@article{唐泓英1995基于搭配词典的词汇语义驱动算法, @inproceedings{唐泓英1995基于搭配词典的词汇语义驱动算法,
title={基于搭配词典的词汇语义驱动算法}, title={基于搭配词典的词汇语义驱动算法},
author={唐泓英,姚天顺}, author={唐泓英,姚天顺},
journal={软件学报}, publisher={软件学报},
volume={6}, volume={6},
number={A01}, number={A01},
pages={78-85}, pages={78-85},
year={1995}, year={1995},
} }
@article{nagao1984framework, @inproceedings{nagao1984framework,
title={A framework of a mechanical translation between Japanese and English by analogy principle}, title={A framework of a mechanical translation between Japanese and English by analogy principle},
author={Nagao, Makoto}, author={Nagao, Makoto},
journal={Artificial and human intelligence}, publisher={Artificial and human intelligence},
pages={351--354}, pages={351--354},
year={1984} year={1984}
} }
@article{gale1993a, @inproceedings{gale1993a,
title={A program for aligning sentences in bilingual corpora}, title={A program for aligning sentences in bilingual corpora},
author={William A. {Gale} and Kenneth W. {Church}}, author={William A. {Gale} and Kenneth W. {Church}},
journal={Computational Linguistics}, publisher={Computational Linguistics},
volume={19}, volume={19},
number={1}, number={1},
pages={75--102}, pages={75--102},
year={1993} year={1993}
} }
@article{Wu2016GooglesNM, @inproceedings{Wu2016GooglesNM,
author = {Yonghui Wu and author = {Yonghui Wu and
Mike Schuster and Mike Schuster and
Zhifeng Chen and Zhifeng Chen and
...@@ -198,7 +198,7 @@ ...@@ -198,7 +198,7 @@
Jeffrey Dean}, Jeffrey Dean},
title = {Google's Neural Machine Translation System: Bridging the Gap between title = {Google's Neural Machine Translation System: Bridging the Gap between
Human and Machine Translation}, Human and Machine Translation},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1609.08144}, volume = {abs/1609.08144},
year = {2016} year = {2016}
} }
...@@ -276,10 +276,10 @@ ...@@ -276,10 +276,10 @@
year = {2016} year = {2016}
} }
@article{goldberg2017neural, @inproceedings{goldberg2017neural,
title={Neural network methods for natural language processing}, title={Neural network methods for natural language processing},
author={Goldberg, Yoav}, author={Goldberg, Yoav},
journal={Synthesis Lectures on Human Language Technologies}, publisher={Synthesis Lectures on Human Language Technologies},
volume={10}, volume={10},
number={1}, number={1},
pages={1--309}, pages={1--309},
...@@ -328,14 +328,14 @@ ...@@ -328,14 +328,14 @@
publisher ={北京: 高等教育出版社} publisher ={北京: 高等教育出版社}
} }
@article{resnick1992adventures, @inproceedings{resnick1992adventures,
author = {Barbour, A. and Resnick, Sidney}, author = {Barbour, A. and Resnick, Sidney},
year = {1993}, year = {1993},
month = {12}, month = {12},
pages = {1474}, pages = {1474},
title = {Adventures in Stochastic Processes.}, title = {Adventures in Stochastic Processes.},
volume = {88}, volume = {88},
journal = {Journal of the American Statistical Association} publisher = {Journal of the American Statistical Association}
} }
@book{liuke-markov-2004, @book{liuke-markov-2004,
...@@ -345,21 +345,21 @@ ...@@ -345,21 +345,21 @@
publisher ={清华大学出版社} publisher ={清华大学出版社}
} }
@article{gale1995good, @inproceedings{gale1995good,
author = {William A. Gale and author = {William A. Gale and
Geoffrey Sampson}, Geoffrey Sampson},
title = {Good-Turing Frequency Estimation Without Tears}, title = {Good-Turing Frequency Estimation Without Tears},
journal = {Journal of Quantitative Linguistics}, publisher = {Journal of Quantitative Linguistics},
volume = {2}, volume = {2},
number = {3}, number = {3},
pages = {217--237}, pages = {217--237},
year = {1995} year = {1995}
} }
@article{good1953population, @inproceedings{good1953population,
title ={The population frequencies of species and the estimation of population parameters}, title ={The population frequencies of species and the estimation of population parameters},
author ={Good, Irving J}, author ={Good, Irving J},
journal ={Biometrika}, publisher ={Biometrika},
volume ={40}, volume ={40},
number ={3-4}, number ={3-4},
pages ={237--264}, pages ={237--264},
...@@ -391,23 +391,23 @@ ...@@ -391,23 +391,23 @@
year = {2002} year = {2002}
} }
@article{chen1999empirical, @inproceedings{chen1999empirical,
author = {Stanley F. Chen and author = {Stanley F. Chen and
Joshua Goodman}, Joshua Goodman},
title = {An empirical study of smoothing techniques for language modeling}, title = {An empirical study of smoothing techniques for language modeling},
journal = {Computer Speech \& Language}, publisher = {Computer Speech \& Language},
volume = {13}, volume = {13},
number = {4}, number = {4},
pages = {359--393}, pages = {359--393},
year = {1999} year = {1999}
} }
@article{ney1994structuring, @inproceedings{ney1994structuring,
author = {Hermann Ney and author = {Hermann Ney and
Ute Essen and Ute Essen and
Reinhard Kneser}, Reinhard Kneser},
title = {On structuring probabilistic dependences in stochastic language modelling}, title = {On structuring probabilistic dependences in stochastic language modelling},
journal = {Computer Speech \& Language}, publisher = {Computer Speech \& Language},
volume = {8}, volume = {8},
number = {1}, number = {1},
pages = {1--38}, pages = {1--38},
...@@ -440,22 +440,22 @@ ...@@ -440,22 +440,22 @@
publisher={Cambridge University Press} publisher={Cambridge University Press}
} }
@article{tarjan1972depth, @inproceedings{tarjan1972depth,
title={Depth-First Search and Linear Graph Algorithms}, title={Depth-First Search and Linear Graph Algorithms},
author={Robert Endre {Tarjan}}, author={Robert Endre {Tarjan}},
journal={SIAM Journal on Computing}, publisher={SIAM Journal on Computing},
volume={1}, volume={1},
number={2}, number={2},
pages={146--160}, pages={146--160},
year={1972} year={1972}
} }
@article{DBLP:journals/ai/SabharwalS11, @inproceedings{DBLP:journals/ai/SabharwalS11,
author = {Ashish Sabharwal and author = {Ashish Sabharwal and
Bart Selman}, Bart Selman},
title = {S. Russell, P. Norvig, Artificial Intelligence: {A} Modern Approach, title = {S. Russell, P. Norvig, Artificial Intelligence: {A} Modern Approach,
Third Edition}, Third Edition},
journal = {Artificial Intelligence}, publisher = {Artificial Intelligence},
volume = {175}, volume = {175},
number = {5-6}, number = {5-6},
pages = {935--937}, pages = {935--937},
...@@ -469,10 +469,10 @@ ...@@ -469,10 +469,10 @@
publisher={Computer Science Press} publisher={Computer Science Press}
} }
@article{hart1968a, @inproceedings{hart1968a,
title={A Formal Basis for the Heuristic Determination of Minimum Cost Paths}, title={A Formal Basis for the Heuristic Determination of Minimum Cost Paths},
author={Peter E. {Hart} and Nils J. {Nilsson} and Bertram {Raphael}}, author={Peter E. {Hart} and Nils J. {Nilsson} and Bertram {Raphael}},
journal={IEEE Transactions on Systems Science and Cybernetics}, publisher={IEEE Transactions on Systems Science and Cybernetics},
volume={4}, volume={4},
number={2}, number={2},
pages={100--107}, pages={100--107},
...@@ -493,20 +493,20 @@ ...@@ -493,20 +493,20 @@
publisher={Oxford university press} publisher={Oxford university press}
} }
@article{åström1965optimal, @inproceedings{åström1965optimal,
title={Optimal control of Markov processes with incomplete state information}, title={Optimal control of Markov processes with incomplete state information},
author={Karl Johan {Åström}}, author={Karl Johan {Åström}},
journal={Journal of Mathematical Analysis and Applications}, publisher={Journal of Mathematical Analysis and Applications},
volume={10}, volume={10},
number={1}, number={1},
pages={174--205}, pages={174--205},
year={1965} year={1965}
} }
@article{korf1990real, @inproceedings{korf1990real,
title={Real-time heuristic search}, title={Real-time heuristic search},
author={Richard E. {Korf}}, author={Richard E. {Korf}},
journal={Artificial Intelligence}, publisher={Artificial Intelligence},
volume={42}, volume={42},
number={2}, number={2},
pages={189--211}, pages={189--211},
...@@ -535,28 +535,28 @@ ...@@ -535,28 +535,28 @@
year = {2018} year = {2018}
} }
@article{jelinek1980interpolated, @inproceedings{jelinek1980interpolated,
title={Interpolated estimation of Markov source parameters from sparse data}, title={Interpolated estimation of Markov source parameters from sparse data},
author={F. {Jelinek}}, author={F. {Jelinek}},
journal={Pattern Recognition in Practice}, publisher={Pattern Recognition in Practice},
pages={381--397}, pages={381--397},
year={1980} year={1980}
} }
@article{katz1987estimation, @inproceedings{katz1987estimation,
title={Estimation of probabilities from sparse data for the language model component of a speech recognizer}, title={Estimation of probabilities from sparse data for the language model component of a speech recognizer},
author={S. {Katz}}, author={S. {Katz}},
journal={IEEE Transactions on Acoustics, Speech, and Signal Processing}, publisher={IEEE Transactions on Acoustics, Speech, and Signal Processing},
volume={35}, volume={35},
number={3}, number={3},
pages={400--401}, pages={400--401},
year={1987} year={1987}
} }
@article{witten1991the, @inproceedings{witten1991the,
title={The zero-frequency problem: estimating the probabilities of novel events in adaptive text compression}, title={The zero-frequency problem: estimating the probabilities of novel events in adaptive text compression},
author={I.H. {Witten} and T.C. {Bell}}, author={I.H. {Witten} and T.C. {Bell}},
journal={IEEE Transactions on Information Theory}, publisher={IEEE Transactions on Information Theory},
volume={37}, volume={37},
number={4}, number={4},
pages={1085--1094}, pages={1085--1094},
...@@ -570,10 +570,10 @@ ...@@ -570,10 +570,10 @@
publisher={Prentice Hall} publisher={Prentice Hall}
} }
@article{goodman2001a, @inproceedings{goodman2001a,
title={A bit of progress in language modeling}, title={A bit of progress in language modeling},
author={Joshua T. {Goodman}}, author={Joshua T. {Goodman}},
journal={Computer Speech \& Language}, publisher={Computer Speech \& Language},
volume={15}, volume={15},
number={4}, number={4},
pages={403--434}, pages={403--434},
...@@ -644,17 +644,17 @@ ...@@ -644,17 +644,17 @@
year={2007} year={2007}
} }
@article{jing2019a, @inproceedings{jing2019a,
title={A Survey on Neural Network Language Models.}, title={A Survey on Neural Network Language Models.},
author={Kun {Jing} and Jungang {Xu}}, author={Kun {Jing} and Jungang {Xu}},
journal={arXiv preprint arXiv:1906.03591}, publisher={arXiv preprint arXiv:1906.03591},
year={2019} year={2019}
} }
@article{bengio2003a, @inproceedings{bengio2003a,
title={A neural probabilistic language model}, title={A neural probabilistic language model},
author={Yoshua {Bengio} and Réjean {Ducharme} and Pascal {Vincent} and Christian {Janvin}}, author={Yoshua {Bengio} and Réjean {Ducharme} and Pascal {Vincent} and Christian {Janvin}},
journal={Journal of Machine Learning Research}, publisher={Journal of Machine Learning Research},
volume={3}, volume={3},
number={6}, number={6},
pages={1137--1155}, pages={1137--1155},
...@@ -772,11 +772,11 @@ ...@@ -772,11 +772,11 @@
year={2001} year={2001}
} }
@article{DBLP:journals/mt/BangaloreR02, @inproceedings{DBLP:journals/mt/BangaloreR02,
author = {Srinivas Bangalore and author = {Srinivas Bangalore and
Giuseppe Riccardi}, Giuseppe Riccardi},
title = {Stochastic Finite-State Models for Spoken Language Machine Translation}, title = {Stochastic Finite-State Models for Spoken Language Machine Translation},
journal = {Machine Translation}, publisher = {Machine Translation},
volume = {17}, volume = {17},
number = {3}, number = {3},
pages = {165--184}, pages = {165--184},
...@@ -2246,7 +2246,7 @@ year = {2012} ...@@ -2246,7 +2246,7 @@ year = {2012}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 5------------------------------------------------------ %%%%% chapter 5------------------------------------------------------
@article{brown1990statistical, @inproceedings{brown1990statistical,
author = {Peter F. Brown and author = {Peter F. Brown and
John Cocke and John Cocke and
Stephen Della Pietra and Stephen Della Pietra and
...@@ -2256,26 +2256,26 @@ year = {2012} ...@@ -2256,26 +2256,26 @@ year = {2012}
Robert L. Mercer and Robert L. Mercer and
Paul S. Roossin}, Paul S. Roossin},
title = {A Statistical Approach to Machine Translation}, title = {A Statistical Approach to Machine Translation},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {16}, volume = {16},
number = {2}, number = {2},
pages = {79--85}, pages = {79--85},
year = {1990} year = {1990}
} }
@article{knight1999decoding, @inproceedings{knight1999decoding,
author = {Kevin Knight}, author = {Kevin Knight},
title = {Decoding Complexity in Word-Replacement Translation Models}, title = {Decoding Complexity in Word-Replacement Translation Models},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {25}, volume = {25},
number = {4}, number = {4},
pages = {607--615}, pages = {607--615},
year = {1999} year = {1999}
} }
@article{shannon1949communication, @inproceedings{shannon1949communication,
title ={Communication theory of secrecy systems}, title ={Communication theory of secrecy systems},
author ={Claude Elwood Shannon}, author ={Claude Elwood Shannon},
journal ={Bell System Technical Journal}, publisher ={Bell System Technical Journal},
volume ={28}, volume ={28},
number ={4}, number ={4},
pages ={656--715}, pages ={656--715},
...@@ -2289,14 +2289,14 @@ year = {2012} ...@@ -2289,14 +2289,14 @@ year = {2012}
Linguistics}, Linguistics},
year = {2004} year = {2004}
} }
@article{肖桐1991面向统计机器翻译的重对齐方法研究, @inproceedings{肖桐1991面向统计机器翻译的重对齐方法研究,
title={面向统计机器翻译的重对齐方法研究}, title={面向统计机器翻译的重对齐方法研究},
author={肖桐 and author={肖桐 and
李天宁 and 李天宁 and
陈如山 and 陈如山 and
朱靖波 and 朱靖波 and
王会珍}, 王会珍},
journal={中文信息学报}, publisher ={中文信息学报},
volume={24}, volume={24},
pages={110--116}, pages={110--116},
year={2010}, year={2010},
...@@ -2311,7 +2311,7 @@ year = {2012} ...@@ -2311,7 +2311,7 @@ year = {2012}
publisher = {International Joint Conference on Natural Language Processing}, publisher = {International Joint Conference on Natural Language Processing},
year = {2005} year = {2005}
} }
@article{1998Grammar, @inproceedings{1998Grammar,
title={Grammar Inference and Statistical Machine Translation}, title={Grammar Inference and Statistical Machine Translation},
author={Ye-Yi Wang and Wayne Ward}, author={Ye-Yi Wang and Wayne Ward},
year={1999}, year={1999},
...@@ -2367,36 +2367,36 @@ year = {2012} ...@@ -2367,36 +2367,36 @@ year = {2012}
year = {2007} year = {2007}
} }
@article{黄书剑2009一种错误敏感的词对齐评价方法, @inproceedings{黄书剑2009一种错误敏感的词对齐评价方法,
title={一种错误敏感的词对齐评价方法}, title={一种错误敏感的词对齐评价方法},
author={黄书剑 and author={黄书剑 and
奚宁 and 奚宁 and
赵迎功 and 赵迎功 and
戴新宇 and 戴新宇 and
陈家骏}, 陈家骏},
journal={中文信息学报}, publisher ={中文信息学报},
volume={23}, volume={23},
pages={88--94}, pages={88--94},
year={2009} year={2009}
} }
@article{DBLP:journals/coling/FraserM07, @inproceedings{DBLP:journals/coling/FraserM07,
author = {Alexander Fraser and author = {Alexander Fraser and
Daniel Marcu}, Daniel Marcu},
title = {Measuring Word Alignment Quality for Statistical Machine Translation}, title = {Measuring Word Alignment Quality for Statistical Machine Translation},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {33}, volume = {33},
number = {3}, number = {3},
pages = {293--303}, pages = {293--303},
year = {2007} year = {2007}
} }
@article{DBLP:journals/corr/FengLLZ16, @inproceedings{DBLP:journals/corr/FengLLZ16,
author = {Shi Feng and author = {Shi Feng and
Shujie Liu and Shujie Liu and
Mu Li and Mu Li and
Ming Zhou}, Ming Zhou},
title = {Implicit Distortion and Fertility Models for Attention-based Encoder-Decoder title = {Implicit Distortion and Fertility Models for Attention-based Encoder-Decoder
{NMT} Model}, {NMT} Model},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1601.03317}, volume = {abs/1601.03317},
year = {2016} year = {2016}
} }
...@@ -2432,12 +2432,12 @@ year = {2012} ...@@ -2432,12 +2432,12 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2008} year = {2008}
} }
@article{DBLP:journals/mt/FlemingKN15, @inproceedings{DBLP:journals/mt/FlemingKN15,
author = {Noah Fleming and author = {Noah Fleming and
Antonina Kolokolova and Antonina Kolokolova and
Renesa Nizamee}, Renesa Nizamee},
title = {Complexity of alignment and decoding problems: restrictions and approximations}, title = {Complexity of alignment and decoding problems: restrictions and approximations},
journal = {Machine Translation}, publisher = {Machine Translation},
volume = {29}, volume = {29},
number = {3-4}, number = {3-4},
pages = {163--187}, pages = {163--187},
...@@ -2470,21 +2470,21 @@ year = {2012} ...@@ -2470,21 +2470,21 @@ year = {2012}
year ={1999}, year ={1999},
publisher ={Massachusetts Institute of Technology Press} publisher ={Massachusetts Institute of Technology Press}
} }
@article{och2003systematic, @inproceedings{och2003systematic,
author = {Franz Josef Och and author = {Franz Josef Och and
Hermann Ney}, Hermann Ney},
title = {A Systematic Comparison of Various Statistical Alignment Models}, title = {A Systematic Comparison of Various Statistical Alignment Models},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {29}, volume = {29},
number = {1}, number = {1},
pages = {19--51}, pages = {19--51},
year = {2003} year = {2003}
} }
@article{och2004alignment, @inproceedings{och2004alignment,
author = {Franz Josef Och and author = {Franz Josef Och and
Hermann Ney}, Hermann Ney},
title = {The Alignment Template Approach to Statistical Machine Translation}, title = {The Alignment Template Approach to Statistical Machine Translation},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {30}, volume = {30},
number = {4}, number = {4},
pages = {417--449}, pages = {417--449},
...@@ -2499,10 +2499,10 @@ year = {2012} ...@@ -2499,10 +2499,10 @@ year = {2012}
pages = {836--841}, pages = {836--841},
year = {1996} year = {1996}
} }
@article{xiao2013unsupervised, @inproceedings{xiao2013unsupervised,
title ={Unsupervised sub-tree alignment for tree-to-tree translation}, title ={Unsupervised sub-tree alignment for tree-to-tree translation},
author ={Tong Xiao and Jingbo Zhu}, author ={Tong Xiao and Jingbo Zhu},
journal ={Journal of Artificial Intelligence Research}, publisher ={Journal of Artificial Intelligence Research},
volume ={48}, volume ={48},
pages ={733--782}, pages ={733--782},
year ={2013} year ={2013}
...@@ -2542,11 +2542,11 @@ year = {2012} ...@@ -2542,11 +2542,11 @@ year = {2012}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 7------------------------------------------------------ %%%%% chapter 7------------------------------------------------------
@article{DBLP:journals/tit/Viterbi67, @inproceedings{DBLP:journals/tit/Viterbi67,
author = {Andrew J. Viterbi}, author = {Andrew J. Viterbi},
title = {Error bounds for convolutional codes and an asymptotically optimum title = {Error bounds for convolutional codes and an asymptotically optimum
decoding algorithm}, decoding algorithm},
journal = {IEEE Transactions on Information Theory}, publisher = {IEEE Transactions on Information Theory},
volume = {13}, volume = {13},
number = {2}, number = {2},
pages = {260--269}, pages = {260--269},
...@@ -2631,11 +2631,11 @@ year = {2012} ...@@ -2631,11 +2631,11 @@ year = {2012}
year = {2014} year = {2014}
} }
@article{powell1964an, @inproceedings{powell1964an,
author = {M. J. D. Powell}, author = {M. J. D. Powell},
title = {An efficient method for finding the minimum of a function of several title = {An efficient method for finding the minimum of a function of several
variables without calculating derivatives}, variables without calculating derivatives},
journal = {The Computer Journal}, publisher = {The Computer Journal},
volume = {7}, volume = {7},
number = {2}, number = {2},
pages = {155--162}, pages = {155--162},
...@@ -2841,12 +2841,12 @@ year = {2012} ...@@ -2841,12 +2841,12 @@ year = {2012}
publisher = {Machine Translation Summit XII}, publisher = {Machine Translation Summit XII},
year = {2009} year = {2009}
} }
@article{DBLP:journals/coling/BisazzaF16, @inproceedings{DBLP:journals/coling/BisazzaF16,
author = {Arianna Bisazza and author = {Arianna Bisazza and
Marcello Federico}, Marcello Federico},
title = {A Survey of Word Reordering in Statistical Machine Translation: Computational title = {A Survey of Word Reordering in Statistical Machine Translation: Computational
Models and Language Phenomena}, Models and Language Phenomena},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {42}, volume = {42},
number = {2}, number = {2},
pages = {163--205}, pages = {163--205},
...@@ -3039,13 +3039,13 @@ year = {2012} ...@@ -3039,13 +3039,13 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2015} year = {2015}
} }
@article{XiaoA, @inproceedings{XiaoA,
author = {Tong Xiao and author = {Tong Xiao and
Derek F. Wong and Derek F. Wong and
Jingbo Zhu}, Jingbo Zhu},
title = {A Loss-Augmented Approach to Training Syntactic Machine Translation title = {A Loss-Augmented Approach to Training Syntactic Machine Translation
Systems}, Systems},
journal = {IEEE Transactions on Audio, Speech, and Language Processing}, publisher = {IEEE Transactions on Audio, Speech, and Language Processing},
volume = {24}, volume = {24},
number = {11}, number = {11},
pages = {2069--2083}, pages = {2069--2083},
...@@ -3093,7 +3093,7 @@ year = {2012} ...@@ -3093,7 +3093,7 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2006} year = {2006}
} }
@article{DBLP:journals/coling/MarinoBCGLFC06, @inproceedings{DBLP:journals/coling/MarinoBCGLFC06,
author = {Jos{\'{e}} B. Mari{\~{n}}o and author = {Jos{\'{e}} B. Mari{\~{n}}o and
Rafael E. Banchs and Rafael E. Banchs and
Josep Maria Crego and Josep Maria Crego and
...@@ -3102,7 +3102,7 @@ year = {2012} ...@@ -3102,7 +3102,7 @@ year = {2012}
Jos{\'{e}} A. R. Fonollosa and Jos{\'{e}} A. R. Fonollosa and
Marta R. Costa-juss{\`{a}}}, Marta R. Costa-juss{\`{a}}},
title = {\emph{N}-gram-based Machine Translation}, title = {\emph{N}-gram-based Machine Translation},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {32}, volume = {32},
number = {4}, number = {4},
pages = {527--549}, pages = {527--549},
...@@ -3195,19 +3195,19 @@ year = {2012} ...@@ -3195,19 +3195,19 @@ year = {2012}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 8------------------------------------------------------ %%%%% chapter 8------------------------------------------------------
@article{Chiang2012Hope, @inproceedings{Chiang2012Hope,
author = {David Chiang}, author = {David Chiang},
title = {Hope and Fear for Discriminative Training of Statistical Translation title = {Hope and Fear for Discriminative Training of Statistical Translation
Models}, Models},
journal = {Journal of Machine Learning Research}, publisher = {Journal of Machine Learning Research},
volume = {13}, volume = {13},
pages = {1159--1187}, pages = {1159--1187},
year = {2012} year = {2012}
} }
@article{chiang2007hierarchical, @inproceedings{chiang2007hierarchical,
title={Hierarchical Phrase-Based Translation}, title={Hierarchical Phrase-Based Translation},
author ={David Chiang}, author ={David Chiang},
journal ={Computational Linguistics}, publisher ={Computational Linguistics},
volume ={33}, volume ={33},
number ={2}, number ={2},
pages ={201--228}, pages ={201--228},
...@@ -3219,19 +3219,19 @@ year = {2012} ...@@ -3219,19 +3219,19 @@ year = {2012}
year ={1970}, year ={1970},
publisher ={Courant Institute of Mathematical Sciences, New York University} publisher ={Courant Institute of Mathematical Sciences, New York University}
} }
@article{younger1967recognition, @inproceedings{younger1967recognition,
author = {Daniel H. Younger}, author = {Daniel H. Younger},
title = {Recognition and Parsing of Context-Free Languages in Time $n^3$}, title = {Recognition and Parsing of Context-Free Languages in Time $n^3$},
journal = {Information and Control}, publisher = {Information and Control},
volume = {10}, volume = {10},
number = {2}, number = {2},
pages = {189--208}, pages = {189--208},
year = {1967} year = {1967}
} }
@article{kasami1966efficient, @inproceedings{kasami1966efficient,
author ={Tadao Kasami}, author ={Tadao Kasami},
title ={An efficient recognition and syntax-analysis algorithm for context-free languages}, title ={An efficient recognition and syntax-analysis algorithm for context-free languages},
journal ={Coordinated Science Laboratory Report no. R-257}, publisher ={Coordinated Science Laboratory Report no. R-257},
year ={1966} year ={1966}
} }
@inproceedings{huang2005better, @inproceedings{huang2005better,
...@@ -3242,11 +3242,11 @@ year = {2012} ...@@ -3242,11 +3242,11 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2005} year = {2005}
} }
@article{wu1997stochastic, @inproceedings{wu1997stochastic,
author = {Dekai Wu}, author = {Dekai Wu},
title = {Stochastic Inversion Transduction Grammars and Bilingual Parsing of title = {Stochastic Inversion Transduction Grammars and Bilingual Parsing of
Parallel Corpora}, Parallel Corpora},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {23}, volume = {23},
number = {3}, number = {3},
pages = {377--403}, pages = {377--403},
...@@ -3312,21 +3312,21 @@ year = {2012} ...@@ -3312,21 +3312,21 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2006} year = {2006}
} }
@article{xue2005building, @inproceedings{xue2005building,
title={Building a large annotated Chinese corpus: the Penn Chinese treebank}, title={Building a large annotated Chinese corpus: the Penn Chinese treebank},
author={Xue, Nianwen and Xia, Fei and Chiou, Fu-Dong and Palmer, Martha}, author={Xue, Nianwen and Xia, Fei and Chiou, Fu-Dong and Palmer, Martha},
journal={Natural Language Engineering}, publisher ={Natural Language Engineering},
volume={11}, volume={11},
number={2}, number={2},
pages={207--238}, pages={207--238},
year={2005} year={2005}
} }
@article{DBLP:journals/coling/MarcusSM94, @inproceedings{DBLP:journals/coling/MarcusSM94,
author = {Mitchell P. Marcus and author = {Mitchell P. Marcus and
Beatrice Santorini and Beatrice Santorini and
Mary Ann Marcinkiewicz}, Mary Ann Marcinkiewicz},
title = {Building a Large Annotated Corpus of English: The Penn Treebank}, title = {Building a Large Annotated Corpus of English: The Penn Treebank},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {19}, volume = {19},
number = {2}, number = {2},
pages = {313--330}, pages = {313--330},
...@@ -3397,19 +3397,19 @@ year = {2012} ...@@ -3397,19 +3397,19 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2010} year = {2010}
} }
@article{ilprints729, @inproceedings{ilprints729,
title={Parsing and Hypergraphs}, title={Parsing and Hypergraphs},
author={Klein, Dan and Manning, Christopher D.}, author={Klein, Dan and Manning, Christopher D.},
journal={New Developments in Parsing Technology}, publisher ={New Developments in Parsing Technology},
volume={65}, volume={65},
number={3}, number={3},
pages={123--134}, pages={123--134},
year={2001}, year={2001},
} }
@article{goodman1999semiring, @inproceedings{goodman1999semiring,
author = {Joshua Goodman}, author = {Joshua Goodman},
title = {Semiring Parsing}, title = {Semiring Parsing},
journal = {Computational Linguistics}, publisher = {Computational Linguistics},
volume = {25}, volume = {25},
number = {4}, number = {4},
pages = {573--605}, pages = {573--605},
...@@ -3609,7 +3609,7 @@ year = {2012} ...@@ -3609,7 +3609,7 @@ year = {2012}
publisher = {Chinese Information Processing Society of China}, publisher = {Chinese Information Processing Society of China},
year = {2010} year = {2010}
} }
@article{Zhai2012Treebased, @inproceedings{Zhai2012Treebased,
title={Tree-based translation without using parse trees}, title={Tree-based translation without using parse trees},
author = {Feifei Zhai and author = {Feifei Zhai and
Jiajun Zhang and Jiajun Zhang and
...@@ -3618,24 +3618,24 @@ year = {2012} ...@@ -3618,24 +3618,24 @@ year = {2012}
publisher = {International Conference on Computational Linguistics}, publisher = {International Conference on Computational Linguistics},
year={2012}, year={2012},
} }
@article{DBLP:journals/tacl/ZhaiZZZ13, @inproceedings{DBLP:journals/tacl/ZhaiZZZ13,
author = {Feifei Zhai and author = {Feifei Zhai and
Jiajun Zhang and Jiajun Zhang and
Yu Zhou and Yu Zhou and
Chengqing Zong}, Chengqing Zong},
title = {Unsupervised Tree Induction for Tree-based Translation}, title = {Unsupervised Tree Induction for Tree-based Translation},
journal = {Transactions of the Association for Computational Linguistics}, publisher = {Transactions of the Association for Computational Linguistics},
volume = {1}, volume = {1},
pages = {243--254}, pages = {243--254},
year = {2013} year = {2013}
} }
@article{DBLP:journals/mt/QuirkM06, @inproceedings{DBLP:journals/mt/QuirkM06,
author = {Christopher Quirk and author = {Christopher Quirk and
Arul Menezes}, Arul Menezes},
title = {Dependency treelet translation: the convergence of statistical and title = {Dependency treelet translation: the convergence of statistical and
example-based machine-translation?}, example-based machine-translation?},
journal = {Machine Translation}, publisher = {Machine Translation},
volume = {20}, volume = {20},
number = {1}, number = {1},
pages = {43--65}, pages = {43--65},
...@@ -3747,7 +3747,7 @@ year = {2012} ...@@ -3747,7 +3747,7 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2007} year = {2007}
} }
@article{xiao2013bagging, @inproceedings{xiao2013bagging,
title ={Bagging and boosting statistical machine translation systems}, title ={Bagging and boosting statistical machine translation systems},
author ={Tong Xiao and Jingbo Zhu and Tongran Liu }, author ={Tong Xiao and Jingbo Zhu and Tongran Liu },
publisher ={Artificial Intelligence}, publisher ={Artificial Intelligence},
...@@ -3837,7 +3837,7 @@ year = {2012} ...@@ -3837,7 +3837,7 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2008} year = {2008}
} }
@article{xiao2011language, @inproceedings{xiao2011language,
title ={Language Modeling for Syntax-Based Machine Translation Using Tree Substitution Grammars: A Case Study on Chinese-English Translation}, title ={Language Modeling for Syntax-Based Machine Translation Using Tree Substitution Grammars: A Case Study on Chinese-English Translation},
author ={Xiao, Tong and Zhu, Jingbo and Zhu, Muhua}, author ={Xiao, Tong and Zhu, Jingbo and Zhu, Muhua},
volume ={10}, volume ={10},
...@@ -4387,32 +4387,32 @@ year = {2012} ...@@ -4387,32 +4387,32 @@ year = {2012}
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2013} year = {2013}
} }
@article{HochreiterThe, @inproceedings{HochreiterThe,
author = {Sepp Hochreiter}, author = {Sepp Hochreiter},
title = {The Vanishing Gradient Problem During Learning Recurrent Neural Nets title = {The Vanishing Gradient Problem During Learning Recurrent Neural Nets
and Problem Solutions}, and Problem Solutions},
journal = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems}, publisher = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems},
volume = {6}, volume = {6},
number = {2}, number = {2},
pages = {107--116}, pages = {107--116},
year = {1998} year = {1998}
} }
@article{BENGIO1994Learning, @inproceedings{BENGIO1994Learning,
author = {Yoshua Bengio and author = {Yoshua Bengio and
Patrice Y. Simard and Patrice Y. Simard and
Paolo Frasconi}, Paolo Frasconi},
title = {Learning long-term dependencies with gradient descent is difficult}, title = {Learning long-term dependencies with gradient descent is difficult},
journal = {IEEE Transactions on Neural Networks}, publisher = {IEEE Transactions on Neural Networks},
volume = {5}, volume = {5},
number = {2}, number = {2},
pages = {157--166}, pages = {157--166},
year = {1994} year = {1994}
} }
@article{StahlbergNeural, @inproceedings{StahlbergNeural,
title={Neural Machine Translation: A Review}, title={Neural Machine Translation: A Review},
author={Felix Stahlberg}, author={Felix Stahlberg},
journal={Journal of Artificial Intelligence Research}, publisher={Journal of Artificial Intelligence Research},
year={2020}, year={2020},
volume={69}, volume={69},
pages={343-418} pages={343-418}
...@@ -4427,7 +4427,7 @@ author = {Yoshua Bengio and ...@@ -4427,7 +4427,7 @@ author = {Yoshua Bengio and
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2016} year = {2016}
} }
@article{Hassan2018AchievingHP, @inproceedings{Hassan2018AchievingHP,
author = {Hany Hassan and author = {Hany Hassan and
Anthony Aue and Anthony Aue and
Chang Chen and Chang Chen and
...@@ -4453,7 +4453,7 @@ author = {Yoshua Bengio and ...@@ -4453,7 +4453,7 @@ author = {Yoshua Bengio and
Zhirui Zhang and Zhirui Zhang and
Ming Zhou}, Ming Zhou},
title = {Achieving Human Parity on Automatic Chinese to English News Translation}, title = {Achieving Human Parity on Automatic Chinese to English News Translation},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1803.05567}, volume = {abs/1803.05567},
year = {2018}, year = {2018},
} }
...@@ -4481,14 +4481,14 @@ author = {Yoshua Bengio and ...@@ -4481,14 +4481,14 @@ author = {Yoshua Bengio and
publisher = {AAAI Conference on Artificial Intelligence}, publisher = {AAAI Conference on Artificial Intelligence},
year = {2020} year = {2020}
} }
@article{HochreiterLong, @inproceedings{HochreiterLong,
author = {Hochreiter, Sepp and Schmidhuber, Jürgen}, author = {Hochreiter, Sepp and Schmidhuber, Jürgen},
year = {1997}, year = {1997},
month = {12}, month = {12},
pages = {1735--1780}, pages = {1735--1780},
title = {Long Short-term Memory}, title = {Long Short-term Memory},
volume = {9}, volume = {9},
journal = {Neural Computation} publisher = {Neural Computation}
} }
@inproceedings{Cho2014Learning, @inproceedings{Cho2014Learning,
author = {Kyunghyun Cho and author = {Kyunghyun Cho and
...@@ -4580,12 +4580,12 @@ author = {Yoshua Bengio and ...@@ -4580,12 +4580,12 @@ author = {Yoshua Bengio and
pages = {1538--1548}, pages = {1538--1548},
year = {2019} year = {2019}
} }
@article{Lei2017TrainingRA, @inproceedings{Lei2017TrainingRA,
author = {Tao Lei and author = {Tao Lei and
Yu Zhang and Yu Zhang and
Yoav Artzi}, Yoav Artzi},
title = {Training RNNs as Fast as CNNs}, title = {Training RNNs as Fast as CNNs},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1709.02755}, volume = {abs/1709.02755},
year = {2017} year = {2017}
} }
...@@ -4693,15 +4693,15 @@ author = {Yoshua Bengio and ...@@ -4693,15 +4693,15 @@ author = {Yoshua Bengio and
pages = {4772--4777}, pages = {4772--4777},
year = {2018} year = {2018}
} }
@article{DBLP:journals/corr/ZhangZ16c, @inproceedings{DBLP:journals/corr/ZhangZ16c,
author = {Jiajun Zhang and author = {Jiajun Zhang and
Chengqing Zong}, Chengqing Zong},
title = {Bridging Neural Machine Translation and Bilingual Dictionaries}, title = {Bridging Neural Machine Translation and Bilingual Dictionaries},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1610.07272}, volume = {abs/1610.07272},
year = {2016} year = {2016}
} }
@article{Dai2019TransformerXLAL, @inproceedings{Dai2019TransformerXLAL,
author = {Zihang Dai and author = {Zihang Dai and
Zhilin Yang and Zhilin Yang and
Yiming Yang and Yiming Yang and
...@@ -4709,7 +4709,7 @@ author = {Yoshua Bengio and ...@@ -4709,7 +4709,7 @@ author = {Yoshua Bengio and
Quoc V. Le and Quoc V. Le and
Ruslan Salakhutdinov}, Ruslan Salakhutdinov},
title = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context}, title = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context},
journal = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
pages = {2978--2988}, pages = {2978--2988},
year = {2019} year = {2019}
} }
...@@ -4746,7 +4746,7 @@ author = {Yoshua Bengio and ...@@ -4746,7 +4746,7 @@ author = {Yoshua Bengio and
pages = {1264--1274}, pages = {1264--1274},
year = {2018} year = {2018}
} }
@article{DBLP:journals/corr/abs-1906-00532, @inproceedings{DBLP:journals/corr/abs-1906-00532,
author = {Aishwarya Bhandare and author = {Aishwarya Bhandare and
Vamsi Sripathi and Vamsi Sripathi and
Deepthi Karkada and Deepthi Karkada and
...@@ -4756,7 +4756,7 @@ author = {Yoshua Bengio and ...@@ -4756,7 +4756,7 @@ author = {Yoshua Bengio and
Vikram Saletore}, Vikram Saletore},
title = {Efficient 8-Bit Quantization of Transformer Neural Machine Language title = {Efficient 8-Bit Quantization of Transformer Neural Machine Language
Translation Model}, Translation Model},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1906.00532}, volume = {abs/1906.00532},
year = {2019} year = {2019}
} }
...@@ -4791,12 +4791,12 @@ author = {Yoshua Bengio and ...@@ -4791,12 +4791,12 @@ author = {Yoshua Bengio and
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2017} year = {2017}
} }
@article{Hinton2015Distilling, @inproceedings{Hinton2015Distilling,
author = {Geoffrey Hinton and author = {Geoffrey Hinton and
Oriol Vinyals and Oriol Vinyals and
Jeffrey Dean}, Jeffrey Dean},
title = {Distilling the Knowledge in a Neural Network}, title = {Distilling the Knowledge in a Neural Network},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1503.02531}, volume = {abs/1503.02531},
year = {2015} year = {2015}
} }
...@@ -4827,10 +4827,10 @@ author = {Yoshua Bengio and ...@@ -4827,10 +4827,10 @@ author = {Yoshua Bengio and
publisher = {Conference on Empirical Methods in Natural Language Processing}, publisher = {Conference on Empirical Methods in Natural Language Processing},
year = {2016} year = {2016}
} }
@article{Akaike1969autoregressive, @inproceedings{Akaike1969autoregressive,
author = {Hirotugu Akaike}, author = {Hirotugu Akaike},
title = {Fitting autoregressive models for prediction}, title = {Fitting autoregressive models for prediction},
journal = {Annals of the Institute of Statistical Mathematics}, publisher = {Annals of the Institute of Statistical Mathematics},
volume = {21}, volume = {21},
number = {1}, number = {1},
year = {1969}, year = {1969},
pages = {243--247}, pages = {243--247},
...@@ -4958,14 +4958,14 @@ author = {Yoshua Bengio and ...@@ -4958,14 +4958,14 @@ author = {Yoshua Bengio and
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 11------------------------------------------------------ %%%%% chapter 11------------------------------------------------------
@article{DBLP:journals/pami/RenHG017, @inproceedings{DBLP:journals/pami/RenHG017,
author = {Shaoqing Ren and author = {Shaoqing Ren and
Kaiming He and Kaiming He and
Ross Girshick and Ross Girshick and
Jian Sun}, Jian Sun},
title = {Faster {R-CNN:} Towards Real-Time Object Detection with Region Proposal title = {Faster {R-CNN:} Towards Real-Time Object Detection with Region Proposal
Networks}, Networks},
journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence}, publisher = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
volume = {39}, volume = {39},
number = {6}, number = {6},
pages = {1137--1149}, pages = {1137--1149},
...@@ -5101,10 +5101,10 @@ author = {Yoshua Bengio and ...@@ -5101,10 +5101,10 @@ author = {Yoshua Bengio and
year = {2017} year = {2017}
} }
@article{2011Natural, @inproceedings{2011Natural,
title={Natural Language Processing (almost) from Scratch}, title={Natural Language Processing (almost) from Scratch},
author={Collobert, Ronan and Weston, Jason and Bottou, Léon and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel}, author={Collobert, Ronan and Weston, Jason and Bottou, Léon and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel},
journal={Journal of Machine Learning Research}, publisher={Journal of Machine Learning Research},
volume={12}, volume={12},
number={1}, number={1},
pages={2493-2537}, pages={2493-2537},
...@@ -5171,16 +5171,16 @@ author = {Yoshua Bengio and ...@@ -5171,16 +5171,16 @@ author = {Yoshua Bengio and
year = {2015} year = {2015}
} }
@article{StahlbergNeural, @inproceedings{StahlbergNeural,
title={Neural Machine Translation: A Review}, title={Neural Machine Translation: A Review},
author={Felix Stahlberg}, author={Felix Stahlberg},
journal={Journal of Artificial Intelligence Research}, publisher={Journal of Artificial Intelligence Research},
year={2020}, year={2020},
volume={69}, volume={69},
pages={343-418} pages={343-418}
} }
@article{Sennrich2016ImprovingNM, @inproceedings{Sennrich2016ImprovingNM,
author = {Rico Sennrich and author = {Rico Sennrich and
Barry Haddow and Barry Haddow and
Alexandra Birch}, Alexandra Birch},
...@@ -5198,27 +5198,27 @@ author = {Yoshua Bengio and ...@@ -5198,27 +5198,27 @@ author = {Yoshua Bengio and
year = {2015} year = {2015}
} }
@article{Waibel1989PhonemeRU, @inproceedings{Waibel1989PhonemeRU,
title={Phoneme recognition using time-delay neural networks}, title={Phoneme recognition using time-delay neural networks},
author={Alexander Waibel and Toshiyuki Hanazawa and Geoffrey Hinton and Kiyohiro Shikano and Kevin J. Lang}, author={Alexander Waibel and Toshiyuki Hanazawa and Geoffrey Hinton and Kiyohiro Shikano and Kevin J. Lang},
journal={IEEE Transactions on Acoustics, Speech, and Signal Processing}, publisher={IEEE Transactions on Acoustics, Speech, and Signal Processing},
year={1989}, year={1989},
volume={37}, volume={37},
pages={328-339} pages={328-339}
} }
@article{LeCun1989BackpropagationAT, @inproceedings{LeCun1989BackpropagationAT,
title={Backpropagation Applied to Handwritten Zip Code Recognition}, title={Backpropagation Applied to Handwritten Zip Code Recognition},
author={Yann LeCun and Bernhard Boser and John Denker and Don Henderson and Richard E. Howard and Wayne E. Hubbard and Larry Jackel}, author={Yann LeCun and Bernhard Boser and John Denker and Don Henderson and Richard E. Howard and Wayne E. Hubbard and Larry Jackel},
journal={Neural Computation}, publisher={Neural Computation},
year={1989}, year={1989},
volume={1}, volume={1},
pages={541-551} pages={541-551}
} }
@article{726791, @inproceedings{726791,
author={Yann {LeCun} and Leon {Bottou} and Yoshua {Bengio} and Patrick {Haffner}}, author={Yann {LeCun} and Leon {Bottou} and Yoshua {Bengio} and Patrick {Haffner}},
journal={Proceedings of the IEEE}, publisher={Proceedings of the IEEE},
title={Gradient-based learning applied to document recognition}, title={Gradient-based learning applied to document recognition},
year={1998}, year={1998},
volume={86}, volume={86},
...@@ -5248,18 +5248,18 @@ author = {Yoshua Bengio and ...@@ -5248,18 +5248,18 @@ author = {Yoshua Bengio and
year = {2017} year = {2017}
} }
@article{Girshick2015FastR, @inproceedings{Girshick2015FastR,
title={Fast R-CNN}, title={Fast R-CNN},
author={Ross Girshick}, author={Ross Girshick},
journal={International Conference on Computer Vision}, publisher={International Conference on Computer Vision},
year={2015}, year={2015},
pages={1440-1448} pages={1440-1448}
} }
@article{He2020MaskR, @inproceedings{He2020MaskR,
title={Mask R-CNN}, title={Mask R-CNN},
author={Kaiming He and Georgia Gkioxari and Piotr Doll{\'a}r and Ross B. Girshick}, author={Kaiming He and Georgia Gkioxari and Piotr Doll{\'a}r and Ross B. Girshick},
journal={International Conference on Computer Vision}, publisher={International Conference on Computer Vision},
pages={2961--2969}, pages={2961--2969},
year={2017} year={2017}
} }
...@@ -5336,12 +5336,12 @@ author = {Yoshua Bengio and ...@@ -5336,12 +5336,12 @@ author = {Yoshua Bengio and
year = {2017} year = {2017}
} }
@article{Kaiser2018DepthwiseSC, @inproceedings{Kaiser2018DepthwiseSC,
title={Depthwise Separable Convolutions for Neural Machine Translation}, title={Depthwise Separable Convolutions for Neural Machine Translation},
author = {Lukasz Kaiser and author = {Lukasz Kaiser and
Aidan N. Gomez and Aidan N. Gomez and
Fran{\c{c}}ois Chollet}, Fran{\c{c}}ois Chollet},
journal = {International Conference on Learning Representations}, publisher = {International Conference on Learning Representations},
year={2018}, year={2018},
} }
...@@ -5365,7 +5365,7 @@ author = {Yoshua Bengio and ...@@ -5365,7 +5365,7 @@ author = {Yoshua Bengio and
year = {2013} year = {2013}
} }
@article{Wu2016GooglesNM, @inproceedings{Wu2016GooglesNM,
title={Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation}, title={Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
author = {Yonghui Wu and author = {Yonghui Wu and
Mike Schuster and Mike Schuster and
...@@ -5398,7 +5398,7 @@ author = {Yoshua Bengio and ...@@ -5398,7 +5398,7 @@ author = {Yoshua Bengio and
Greg Corrado and Greg Corrado and
Macduff Hughes and Macduff Hughes and
Jeffrey Dean}, Jeffrey Dean},
journal = {CoRR}, publisher = {CoRR},
year={2016}, year={2016},
volume={abs/1609.08144} volume={abs/1609.08144}
} }
...@@ -5444,32 +5444,32 @@ author = {Yoshua Bengio and ...@@ -5444,32 +5444,32 @@ author = {Yoshua Bengio and
year={2013} year={2013}
} }
@article{Bengio2013AdvancesIO, @inproceedings{Bengio2013AdvancesIO,
title={Advances in optimizing recurrent networks}, title={Advances in optimizing recurrent networks},
author={Yoshua Bengio and Nicolas Boulanger-Lewandowski and Razvan Pascanu}, author={Yoshua Bengio and Nicolas Boulanger-Lewandowski and Razvan Pascanu},
journal={IEEE International Conference on Acoustics, Speech and Signal Processing}, publisher={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2013}, year={2013},
pages={8624-8628} pages={8624-8628}
} }
@article{JMLR:v15:srivastava14a, @inproceedings{JMLR:v15:srivastava14a,
author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov}, author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting}, title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
journal = {Journal of Machine Learning Research}, publisher = {Journal of Machine Learning Research},
year = {2014}, year = {2014},
volume = {15}, volume = {15},
pages = {1929-1958}, pages = {1929-1958},
} }
@article{Chollet2017XceptionDL, @inproceedings{Chollet2017XceptionDL,
title={Xception: Deep Learning with Depthwise Separable Convolutions}, title={Xception: Deep Learning with Depthwise Separable Convolutions},
author = {Fran{\c{c}}ois Chollet}, author = {Fran{\c{c}}ois Chollet},
journal={IEEE Conference on Computer Vision and Pattern Recognition}, publisher={IEEE Conference on Computer Vision and Pattern Recognition},
year={2017}, year={2017},
pages={1800-1807} pages={1800-1807}
} }
@article{Howard2017MobileNetsEC, @inproceedings{Howard2017MobileNetsEC,
title={MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications}, title={MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications},
author = {Andrew Howard and author = {Andrew Howard and
Menglong Zhu and Menglong Zhu and
...@@ -5483,17 +5483,17 @@ author = {Yoshua Bengio and ...@@ -5483,17 +5483,17 @@ author = {Yoshua Bengio and
year={2017}, year={2017},
} }
@article{sifre2014rigid, @inproceedings{sifre2014rigid,
title={Rigid-motion scattering for image classification}, title={Rigid-motion scattering for image classification},
author={Sifre, Laurent and Mallat, St{\'e}phane}, author={Sifre, Laurent and Mallat, St{\'e}phane},
year={2014}, year={2014},
journal={Citeseer} publisher={Citeseer}
} }
@article{Taigman2014DeepFaceCT, @inproceedings{Taigman2014DeepFaceCT,
title={DeepFace: Closing the Gap to Human-Level Performance in Face Verification}, title={DeepFace: Closing the Gap to Human-Level Performance in Face Verification},
author={Yaniv Taigman and Ming Yang and Marc'Aurelio Ranzato and Lior Wolf}, author={Yaniv Taigman and Ming Yang and Marc'Aurelio Ranzato and Lior Wolf},
journal={IEEE Conference on Computer Vision and Pattern Recognition}, publisher={IEEE Conference on Computer Vision and Pattern Recognition},
year={2014}, year={2014},
pages={1701-1708} pages={1701-1708}
} }
...@@ -5511,7 +5511,7 @@ author = {Yoshua Bengio and ...@@ -5511,7 +5511,7 @@ author = {Yoshua Bengio and
year={2015} year={2015}
} }
@article{Chen2020DynamicCA, @inproceedings{Chen2020DynamicCA,
title={Dynamic Convolution: Attention Over Convolution Kernels}, title={Dynamic Convolution: Attention Over Convolution Kernels},
author = {Yinpeng Chen and author = {Yinpeng Chen and
Xiyang Dai and Xiyang Dai and
...@@ -5519,7 +5519,7 @@ author = {Yoshua Bengio and ...@@ -5519,7 +5519,7 @@ author = {Yoshua Bengio and
Dongdong Chen and Dongdong Chen and
Lu Yuan and Lu Yuan and
Zicheng Liu}, Zicheng Liu},
journal = {IEEE Conference on Computer Vision and Pattern Recognition}, publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
year={2020}, year={2020},
pages={11027-11036} pages={11027-11036}
} }
...@@ -5579,7 +5579,7 @@ author = {Yoshua Bengio and ...@@ -5579,7 +5579,7 @@ author = {Yoshua Bengio and
year = {2018} year = {2018}
} }
@article{DBLP:journals/corr/abs-1802-05751, @inproceedings{DBLP:journals/corr/abs-1802-05751,
author = {Niki Parmar and author = {Niki Parmar and
Ashish Vaswani and Ashish Vaswani and
Jakob Uszkoreit and Jakob Uszkoreit and
...@@ -5587,7 +5587,7 @@ author = {Yoshua Bengio and ...@@ -5587,7 +5587,7 @@ author = {Yoshua Bengio and
Noam Shazeer and Noam Shazeer and
Alexander Ku}, Alexander Ku},
title = {Image Transformer}, title = {Image Transformer},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1802.05751}, volume = {abs/1802.05751},
year = {2018} year = {2018}
} }
...@@ -5612,17 +5612,17 @@ author = {Yoshua Bengio and ...@@ -5612,17 +5612,17 @@ author = {Yoshua Bengio and
year = {2020} year = {2020}
} }
@article{DBLP:journals/corr/abs-2004-05150, @inproceedings{DBLP:journals/corr/abs-2004-05150,
author = {Iz Beltagy and author = {Iz Beltagy and
Matthew E. Peters and Matthew E. Peters and
Arman Cohan}, Arman Cohan},
title = {Longformer: The Long-Document Transformer}, title = {Longformer: The Long-Document Transformer},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/2004.05150}, volume = {abs/2004.05150},
year = {2020} year = {2020}
} }
@article{DBLP:journals/corr/abs-2005-00743, @inproceedings{DBLP:journals/corr/abs-2005-00743,
author = {Yi Tay and author = {Yi Tay and
Dara Bahri and Dara Bahri and
Donald Metzler and Donald Metzler and
...@@ -5630,7 +5630,7 @@ author = {Yoshua Bengio and ...@@ -5630,7 +5630,7 @@ author = {Yoshua Bengio and
Zhe Zhao and Zhe Zhao and
Che Zheng}, Che Zheng},
title = {Synthesizer: Rethinking Self-Attention in Transformer Models}, title = {Synthesizer: Rethinking Self-Attention in Transformer Models},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/2005.00743}, volume = {abs/2005.00743},
year = {2020} year = {2020}
} }
...@@ -5691,10 +5691,10 @@ author = {Yoshua Bengio and ...@@ -5691,10 +5691,10 @@ author = {Yoshua Bengio and
pages = {770--778}, pages = {770--778},
year = {2016}, year = {2016},
} }
@article{JMLR:v15:srivastava14a, @inproceedings{JMLR:v15:srivastava14a,
author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov}, author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting}, title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
journal = {Journal of Machine Learning Research}, publisher = {Journal of Machine Learning Research},
year = {2014}, year = {2014},
volume = {15}, volume = {15},
pages = {1929-1958}, pages = {1929-1958},
...@@ -5719,12 +5719,12 @@ author = {Yoshua Bengio and ...@@ -5719,12 +5719,12 @@ author = {Yoshua Bengio and
pages = {1789--1798}, pages = {1789--1798},
year = {2018}, year = {2018},
} }
@article{DBLP:journals/corr/CourbariauxB16, @inproceedings{DBLP:journals/corr/CourbariauxB16,
author = {Matthieu Courbariaux and author = {Matthieu Courbariaux and
Yoshua Bengio}, Yoshua Bengio},
title = {BinaryNet: Training Deep Neural Networks with Weights and Activations title = {BinaryNet: Training Deep Neural Networks with Weights and Activations
Constrained to +1 or -1}, Constrained to +1 or -1},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1602.02830}, volume = {abs/1602.02830},
year = {2016}, year = {2016},
} }
...@@ -5751,10 +5751,10 @@ author = {Yoshua Bengio and ...@@ -5751,10 +5751,10 @@ author = {Yoshua Bengio and
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2019} year = {2019}
} }
@article{Liu2020LearningTE, @inproceedings{Liu2020LearningTE,
title={Learning to Encode Position for Transformer with Continuous Dynamical Model}, title={Learning to Encode Position for Transformer with Continuous Dynamical Model},
author={Xuanqing Liu and Hsiang-Fu Yu and Inderjit Dhillon and Cho-Jui Hsieh}, author={Xuanqing Liu and Hsiang-Fu Yu and Inderjit Dhillon and Cho-Jui Hsieh},
journal={ArXiv}, publisher={ArXiv},
year={2020}, year={2020},
volume={abs/2003.09229} volume={abs/2003.09229}
} }
...@@ -5787,10 +5787,10 @@ author = {Yoshua Bengio and ...@@ -5787,10 +5787,10 @@ author = {Yoshua Bengio and
publisher = {Annual Meeting of the Association for Computational Linguistics}, publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2019}, year = {2019},
} }
@article{Wang2018MultilayerRF, @inproceedings{Wang2018MultilayerRF,
title={Multi-layer Representation Fusion for Neural Machine Translation}, title={Multi-layer Representation Fusion for Neural Machine Translation},
author={Qiang Wang and Fuxue Li and Tong Xiao and Yanyang Li and Yinqiao Li and Jingbo Zhu}, author={Qiang Wang and Fuxue Li and Tong Xiao and Yanyang Li and Yinqiao Li and Jingbo Zhu},
journal={International Conference on Computational Linguistics}, publisher={International Conference on Computational Linguistics},
year={2018}, year={2018},
volume={abs/2002.06714} volume={abs/2002.06714}
} }
...@@ -5863,27 +5863,27 @@ author = {Yoshua Bengio and ...@@ -5863,27 +5863,27 @@ author = {Yoshua Bengio and
year = {2018} year = {2018}
} }
@article{Kitaev2020ReformerTE, @inproceedings{Kitaev2020ReformerTE,
author = {Nikita Kitaev and author = {Nikita Kitaev and
Lukasz Kaiser and Lukasz Kaiser and
Anselm Levskaya}, Anselm Levskaya},
title = {Reformer: The Efficient Transformer}, title = {Reformer: The Efficient Transformer},
journal = {International Conference on Learning Representations}, publisher = {International Conference on Learning Representations},
year = {2020} year = {2020}
} }
@article{Lin2020WeightDT, @inproceedings{Lin2020WeightDT,
title={Weight Distillation: Transferring the Knowledge in Neural Network Parameters}, title={Weight Distillation: Transferring the Knowledge in Neural Network Parameters},
author={Ye Lin and Yanyang Li and Ziyang Wang and Bei Li and Quan Du and Tong Xiao and Jingbo Zhu}, author={Ye Lin and Yanyang Li and Ziyang Wang and Bei Li and Quan Du and Tong Xiao and Jingbo Zhu},
journal={ArXiv}, publisher={ArXiv},
year={2020}, year={2020},
volume={abs/2009.09152} volume={abs/2009.09152}
} }
@article{li2020shallow, @inproceedings{li2020shallow,
title={Shallow-to-Deep Training for Neural Machine Translation}, title={Shallow-to-Deep Training for Neural Machine Translation},
author={Li, Bei and Wang, Ziyang and Liu, Hui and Jiang, Yufan and Du, Quan and Xiao, Tong and Wang, Huizhen and Zhu, Jingbo}, author={Li, Bei and Wang, Ziyang and Liu, Hui and Jiang, Yufan and Du, Quan and Xiao, Tong and Wang, Huizhen and Zhu, Jingbo},
journal={Conference on Empirical Methods in Natural Language Processing}, publisher={Conference on Empirical Methods in Natural Language Processing},
year={2020} year={2020}
} }
%%%%% chapter 12------------------------------------------------------ %%%%% chapter 12------------------------------------------------------
...@@ -5909,7 +5909,7 @@ author = {Yoshua Bengio and ...@@ -5909,7 +5909,7 @@ author = {Yoshua Bengio and
year = {2018} year = {2018}
} }
@article{DBLP:journals/jmlr/RaffelSRLNMZLL20, @inproceedings{DBLP:journals/jmlr/RaffelSRLNMZLL20,
author = {Colin Raffel and author = {Colin Raffel and
Noam Shazeer and Noam Shazeer and
Adam Roberts and Adam Roberts and
...@@ -5921,7 +5921,7 @@ author = {Yoshua Bengio and ...@@ -5921,7 +5921,7 @@ author = {Yoshua Bengio and
Peter J. Liu}, Peter J. Liu},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text
Transformer}, Transformer},
journal = {Journal of Machine Learning Research}, publisher = {Journal of Machine Learning Research},
volume = {21}, volume = {21},
pages = {140:1--140:67}, pages = {140:1--140:67},
year = {2020} year = {2020}
...@@ -5936,10 +5936,10 @@ author = {Yoshua Bengio and ...@@ -5936,10 +5936,10 @@ author = {Yoshua Bengio and
year = {2012} year = {2012}
} }
@article{JMLR:v15:srivastava14a, @inproceedings{JMLR:v15:srivastava14a,
author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov}, author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting}, title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
journal = {Journal of Machine Learning Research}, publisher = {Journal of Machine Learning Research},
year = {2014}, year = {2014},
volume = {15}, volume = {15},
pages = {1929-1958}, pages = {1929-1958},
...@@ -6067,10 +6067,10 @@ author = {Yoshua Bengio and ...@@ -6067,10 +6067,10 @@ author = {Yoshua Bengio and
year = {2019} year = {2019}
} }
@article{Gong2018AdversarialTW, @inproceedings{Gong2018AdversarialTW,
title={Adversarial Texts with Gradient Methods}, title={Adversarial Texts with Gradient Methods},
author={Zhitao Gong and Wenlu Wang and B. Li and D. Song and W. Ku}, author={Zhitao Gong and Wenlu Wang and B. Li and D. Song and W. Ku},
journal={ArXiv}, publisher={ArXiv},
year={2018}, year={2018},
volume={abs/1801.07175} volume={abs/1801.07175}
} }
...@@ -6107,11 +6107,11 @@ author = {Yoshua Bengio and ...@@ -6107,11 +6107,11 @@ author = {Yoshua Bengio and
year = {2018} year = {2018}
} }
@article{DBLP:journals/corr/SamantaM17, @inproceedings{DBLP:journals/corr/SamantaM17,
author = {Suranjana Samanta and author = {Suranjana Samanta and
Sameep Mehta}, Sameep Mehta},
title = {Towards Crafting Text Adversarial Samples}, title = {Towards Crafting Text Adversarial Samples},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1707.02812}, volume = {abs/1707.02812},
year = {2017} year = {2017}
} }
...@@ -8112,12 +8112,12 @@ author = {Zhuang Liu and ...@@ -8112,12 +8112,12 @@ author = {Zhuang Liu and
year = {2017} year = {2017}
} }
@article{DBLP:journals/corr/GreffSS16, @inproceedings{DBLP:journals/corr/GreffSS16,
author = {Klaus Greff and author = {Klaus Greff and
Rupesh Kumar Srivastava and Rupesh Kumar Srivastava and
J{\"{u}}rgen Schmidhuber}, J{\"{u}}rgen Schmidhuber},
title = {Highway and Residual Networks learn Unrolled Iterative Estimation}, title = {Highway and Residual Networks learn Unrolled Iterative Estimation},
journal = {International Conference on Learning Representations}, publisher = {International Conference on Learning Representations},
year = {2017} year = {2017}
} }
...@@ -8148,7 +8148,7 @@ author = {Zhuang Liu and ...@@ -8148,7 +8148,7 @@ author = {Zhuang Liu and
year = {2019} year = {2019}
} }
@article{DBLP:journals/corr/abs-2002-04745, @inproceedings{DBLP:journals/corr/abs-2002-04745,
author = {Ruibin Xiong and author = {Ruibin Xiong and
Yunchang Yang and Yunchang Yang and
Di He and Di He and
...@@ -8160,7 +8160,7 @@ author = {Zhuang Liu and ...@@ -8160,7 +8160,7 @@ author = {Zhuang Liu and
Liwei Wang and Liwei Wang and
Tie-Yan Liu}, Tie-Yan Liu},
title = {On Layer Normalization in the Transformer Architecture}, title = {On Layer Normalization in the Transformer Architecture},
journal = {International Conference on Machine Learning}, publisher = {International Conference on Machine Learning},
volume = {abs/2002.04745}, volume = {abs/2002.04745},
year = {2020} year = {2020}
} }
...@@ -8188,12 +8188,12 @@ author = {Zhuang Liu and ...@@ -8188,12 +8188,12 @@ author = {Zhuang Liu and
year = {2016}, year = {2016},
} }
@article{Ba2016LayerN, @inproceedings{Ba2016LayerN,
author = {Lei Jimmy Ba and author = {Lei Jimmy Ba and
Jamie Ryan Kiros and Jamie Ryan Kiros and
Geoffrey E. Hinton}, Geoffrey E. Hinton},
title = {Layer Normalization}, title = {Layer Normalization},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/1607.06450}, volume = {abs/1607.06450},
year = {2016} year = {2016}
} }
...@@ -8231,10 +8231,10 @@ author = {Zhuang Liu and ...@@ -8231,10 +8231,10 @@ author = {Zhuang Liu and
year = {2019} year = {2019}
} }
@article{Wang2018MultilayerRF, @inproceedings{Wang2018MultilayerRF,
title={Multi-layer Representation Fusion for Neural Machine Translation}, title={Multi-layer Representation Fusion for Neural Machine Translation},
author={Qiang Wang and Fuxue Li and Tong Xiao and Yanyang Li and Yinqiao Li and Jingbo Zhu}, author={Qiang Wang and Fuxue Li and Tong Xiao and Yanyang Li and Yinqiao Li and Jingbo Zhu},
journal={International Conference on Computational Linguistics}, publisher={International Conference on Computational Linguistics},
year={2018}, year={2018},
volume={abs/2002.06714} volume={abs/2002.06714}
} }
...@@ -8425,11 +8425,11 @@ author = {Zhuang Liu and ...@@ -8425,11 +8425,11 @@ author = {Zhuang Liu and
year = {1989} year = {1989}
} }
@article{DBLP:journals/compsys/Kitano90, @inproceedings{DBLP:journals/compsys/Kitano90,
author = {Hiroaki Kitano}, author = {Hiroaki Kitano},
title = {Designing Neural Networks Using Genetic Algorithms with Graph Generation title = {Designing Neural Networks Using Genetic Algorithms with Graph Generation
System}, System},
journal = {Complex Systems}, publisher = {Complex Systems},
volume = {4}, volume = {4},
number = {4}, number = {4},
year = {1990} year = {1990}
...@@ -8691,14 +8691,14 @@ author = {Zhuang Liu and ...@@ -8691,14 +8691,14 @@ author = {Zhuang Liu and
year = {2020} year = {2020}
} }
@article{DBLP:journals/jmlr/LiJDRT17, @inproceedings{DBLP:journals/jmlr/LiJDRT17,
author = {Lisha Li and author = {Lisha Li and
Kevin G. Jamieson and Kevin G. Jamieson and
Giulia DeSalvo and Giulia DeSalvo and
Afshin Rostamizadeh and Afshin Rostamizadeh and
Ameet Talwalkar}, Ameet Talwalkar},
title = {Hyperband: {A} Novel Bandit-Based Approach to Hyperparameter Optimization}, title = {Hyperband: {A} Novel Bandit-Based Approach to Hyperparameter Optimization},
journal = {Journal of Machine Learning Research}, publisher = {Journal of Machine Learning Research},
volume = {18}, volume = {18},
pages = {185:1--185:52}, pages = {185:1--185:52},
year = {2017} year = {2017}
...@@ -8722,7 +8722,7 @@ author = {Zhuang Liu and ...@@ -8722,7 +8722,7 @@ author = {Zhuang Liu and
year = {2018} year = {2018}
} }
@article{DBLP:journals/taslp/FanTXQLL20, @inproceedings{DBLP:journals/taslp/FanTXQLL20,
author = {Yang Fan and author = {Yang Fan and
Fei Tian and Fei Tian and
Yingce Xia and Yingce Xia and
...@@ -8730,7 +8730,7 @@ author = {Zhuang Liu and ...@@ -8730,7 +8730,7 @@ author = {Zhuang Liu and
Xiang-Yang Li and Xiang-Yang Li and
Tie-Yan Liu}, Tie-Yan Liu},
title = {Searching Better Architectures for Neural Machine Translation}, title = {Searching Better Architectures for Neural Machine Translation},
journal = {IEEE Transactions on Audio, Speech, and Language Processing}, publisher = {IEEE Transactions on Audio, Speech, and Language Processing},
volume = {28}, volume = {28},
pages = {1574--1585}, pages = {1574--1585},
year = {2020} year = {2020}
...@@ -8782,59 +8782,57 @@ author = {Zhuang Liu and ...@@ -8782,59 +8782,57 @@ author = {Zhuang Liu and
year = {2018} year = {2018}
} }
@article{DBLP:journals/corr/abs-2003-03384, @inproceedings{DBLP:journals/corr/abs-2003-03384,
author = {Esteban Real and author = {Esteban Real and
Chen Liang and Chen Liang and
David R. So and David R. So and
Quoc V. Le}, Quoc V. Le},
title = {AutoML-Zero: Evolving Machine Learning Algorithms From Scratch}, title = {AutoML-Zero: Evolving Machine Learning Algorithms From Scratch},
journal = {CoRR}, publisher = {CoRR},
volume = {abs/2003.03384}, volume = {abs/2003.03384},
year = {2020} year = {2020}
} }
@article{Chollet2017XceptionDL, @inproceedings{Chollet2017XceptionDL,
title={Xception: Deep Learning with Depthwise Separable Convolutions}, title={Xception: Deep Learning with Depthwise Separable Convolutions},
author = {Fran{\c{c}}ois Chollet}, author = {Fran{\c{c}}ois Chollet},
journal={IEEE Conference on Computer Vision and Pattern Recognition}, publisher={IEEE Conference on Computer Vision and Pattern Recognition},
year={2017}, year={2017},
pages={1800-1807} pages={1800-1807}
} }
@article{DBLP:journals/tnn/AngelineSP94, @inproceedings{DBLP:journals/tnn/AngelineSP94,
author = {Peter J. Angeline and author = {Peter J. Angeline and
Gregory M. Saunders and Gregory M. Saunders and
Jordan B. Pollack}, Jordan B. Pollack},
title = {An evolutionary algorithm that constructs recurrent neural networks}, title = {An evolutionary algorithm that constructs recurrent neural networks},
journal = {IEEE Transactions on Neural Networks}, publisher = {IEEE Transactions on Neural Networks},
volume = {5}, volume = {5},
number = {1}, number = {1},
pages = {54--65}, pages = {54--65},
year = {1994} year = {1994}
} }
@article{stanley2002evolving, @inproceedings{stanley2002evolving,
title={Evolving neural networks through augmenting topologies}, title={Evolving neural networks through augmenting topologies},
author={Stanley, Kenneth O and Miikkulainen, Risto}, author={Stanley, Kenneth O and Miikkulainen, Risto},
journal={Evolutionary Computation}, publisher={Evolutionary Computation},
volume={10}, volume={10},
number={2}, number={2},
pages={99--127}, pages={99--127},
year={2002}, year={2002}
publisher={MIT Press}
} }
@article{DBLP:journals/alife/StanleyDG09, @inproceedings{DBLP:journals/alife/StanleyDG09,
author = {Kenneth O. Stanley and author = {Kenneth O. Stanley and
David B. D'Ambrosio and David B. D'Ambrosio and
Jason Gauci}, Jason Gauci},
title = {A Hypercube-Based Encoding for Evolving Large-Scale Neural Networks}, title = {A Hypercube-Based Encoding for Evolving Large-Scale Neural Networks},
journal = {Artificial Life}, publisher = {Artificial Life},
volume = {15}, volume = {15},
number = {2}, number = {2},
pages = {185--212}, pages = {185--212},
year = {2009}, year = {2009}
publisher = {MIT Press}
} }
@inproceedings{DBLP:conf/ijcai/SuganumaSN18, @inproceedings{DBLP:conf/ijcai/SuganumaSN18,
...@@ -8905,21 +8903,21 @@ author = {Zhuang Liu and ...@@ -8905,21 +8903,21 @@ author = {Zhuang Liu and
year = {2016} year = {2016}
} }
-@article{DBLP:journals/corr/abs-1807-06906,
+@inproceedings{DBLP:journals/corr/abs-1807-06906,
author = {Arber Zela and
Aaron Klein and
Stefan Falkner and
Frank Hutter},
title = {Towards Automated Deep Learning: Efficient Joint Neural Architecture
and Hyperparameter Search},
-journal = {International Conference on Machine Learning},
+publisher = {International Conference on Machine Learning},
year = {2018}
}
-@article{li2020automated,
+@inproceedings{li2020automated,
title={Automated and Lightweight Network Design via Random Search for Remote Sensing Image Scene Classification},
author={Li, Jihao and Diao, Wenhui and Sun, Xian and Feng, Yingchao and Zhang, Wenkai and Chang, Zhonghan and Fu, Kun},
-journal={The International Archives of Photogrammetry, Remote Sensing and Spatial Information Sciences},
+publisher={The International Archives of Photogrammetry, Remote Sensing and Spatial Information Sciences},
volume={43},
pages={1217--1224},
year={2020}
@@ -8954,13 +8952,13 @@ author = {Zhuang Liu and
year = {2017}
}
-@article{DBLP:journals/corr/ChrabaszczLH17,
+@inproceedings{DBLP:journals/corr/ChrabaszczLH17,
author = {Patryk Chrabaszcz and
Ilya Loshchilov and
Frank Hutter},
title = {A Downsampled Variant of ImageNet as an Alternative to the {CIFAR}
datasets},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1707.08819},
year = {2017}
}
@@ -9060,7 +9058,7 @@ author = {Zhuang Liu and
year = {2018}
}
-@article{DBLP:journals/corr/abs-2009-02070,
+@inproceedings{DBLP:journals/corr/abs-2009-02070,
author = {Wei Zhu and
Xiaoling Wang and
Xipeng Qiu and
@@ -9068,7 +9066,7 @@ author = {Zhuang Liu and
Guotong Xie},
title = {AutoTrans: Automating Transformer Design via Reinforced Architecture
Search},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2009.02070},
year = {2020}
}
@@ -9088,7 +9086,7 @@ author = {Zhuang Liu and
year = {2020}
}
-@article{DBLP:journals/corr/abs-2008-06808,
+@inproceedings{DBLP:journals/corr/abs-2008-06808,
author = {Henry Tsai and
Jayden Ooi and
Chun-Sung Ferng and
@@ -9096,7 +9094,7 @@ author = {Zhuang Liu and
Jason Riesa},
title = {Finding Fast Transformers: One-Shot Neural Architecture Search by
Component Composition},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2008.06808},
year = {2020}
}
@@ -9115,20 +9113,20 @@ author = {Zhuang Liu and
year={2020}
}
-@article{li2020shallow,
+@inproceedings{li2020shallow,
title={Shallow-to-Deep Training for Neural Machine Translation},
author={Li, Bei and Wang, Ziyang and Liu, Hui and Jiang, Yufan and Du, Quan and Xiao, Tong and Wang, Huizhen and Zhu, Jingbo},
-journal={Conference on Empirical Methods in Natural Language Processing},
+publisher={Conference on Empirical Methods in Natural Language Processing},
year={2020}
}
-@article{DBLP:journals/corr/abs-2007-06257,
+@inproceedings{DBLP:journals/corr/abs-2007-06257,
author = {Hongfei Xu and
Qiuhui Liu and
Deyi Xiong and
Josef van Genabith},
title = {Transformer with Depth-Wise {LSTM}},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2007.06257},
year = {2020}
}
@@ -9145,7 +9143,7 @@ author = {Zhuang Liu and
year = {2020}
}
-@article{DBLP:journals/corr/abs-2006-10369,
+@inproceedings{DBLP:journals/corr/abs-2006-10369,
author = {Jungo Kasai and
Nikolaos Pappas and
Hao Peng and
@@ -9153,12 +9151,12 @@ author = {Zhuang Liu and
Noah A. Smith},
title = {Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff
in Machine Translation},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2006.10369},
year = {2020}
}
-@article{DBLP:journals/corr/abs-1806-01261,
+@inproceedings{DBLP:journals/corr/abs-1806-01261,
author = {Peter W. Battaglia and
Jessica B. Hamrick and
Victor Bapst and
@@ -9187,7 +9185,7 @@ author = {Zhuang Liu and
Yujia Li and
Razvan Pascanu},
title = {Relational inductive biases, deep learning, and graph networks},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1806.01261},
year = {2018}
}
@@ -9202,7 +9200,7 @@ author = {Zhuang Liu and
year = {2018},
}
-@article{Dai2019TransformerXLAL,
+@inproceedings{Dai2019TransformerXLAL,
author = {Zihang Dai and
Zhilin Yang and
Yiming Yang and
@@ -9210,7 +9208,7 @@ author = {Zhuang Liu and
Quoc V. Le and
Ruslan Salakhutdinov},
title = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context},
-journal = {Annual Meeting of the Association for Computational Linguistics},
+publisher = {Annual Meeting of the Association for Computational Linguistics},
pages = {2978--2988},
year = {2019}
}
@@ -9309,11 +9307,11 @@ author = {Zhuang Liu and
year = {2020}
}
-@article{Aharoni2017TowardsSN,
+@inproceedings{Aharoni2017TowardsSN,
title={Towards String-To-Tree Neural Machine Translation},
author={Roee Aharoni and
Yoav Goldberg},
-journal={Annual Meeting of the Association for Computational Linguistics},
+publisher={Annual Meeting of the Association for Computational Linguistics},
year={2017}
}
@@ -9415,10 +9413,10 @@ author = {Zhuang Liu and
year = {2019}
}
-@article{Liu2020LearningTE,
+@inproceedings{Liu2020LearningTE,
title={Learning to Encode Position for Transformer with Continuous Dynamical Model},
author={Xuanqing Liu and Hsiang-Fu Yu and Inderjit Dhillon and Cho-Jui Hsieh},
-journal={ArXiv},
+publisher={ArXiv},
year={2020},
volume={abs/2003.09229}
}
@@ -9510,17 +9508,17 @@ author = {Zhuang Liu and
year = {2017}
}
-@article{DBLP:journals/corr/abs-1711-02132,
+@inproceedings{DBLP:journals/corr/abs-1711-02132,
author = {Karim Ahmed and
Nitish Shirish Keskar and
Richard Socher},
title = {Weighted Transformer Network for Machine Translation},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1711.02132},
year = {2017}
}
-@article{DBLP:journals/corr/abs-2006-10270,
+@inproceedings{DBLP:journals/corr/abs-2006-10270,
author = {Yang Fan and
Shufang Xie and
Yingce Xia and
@@ -9529,7 +9527,7 @@ author = {Zhuang Liu and
Xiang-Yang Li and
Tie-Yan Liu},
title = {Multi-branch Attentive Transformer},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2006.10270},
year = {2020}
}
@@ -9544,10 +9542,10 @@ author = {Zhuang Liu and
year = {2020}
}
-@article{李北2019面向神经机器翻译的集成学习方法分析,
+@inproceedings{李北2019面向神经机器翻译的集成学习方法分析,
title={面向神经机器翻译的集成学习方法分析},
author={李北 and 王强 and 肖桐 and 姜雨帆 and 张哲旸 and 刘继强 and 张俐 and 于清},
-journal={中文信息学报},
+publisher={中文信息学报},
volume={33},
number={3},
year={2019},
@@ -9575,10 +9573,10 @@ author = {Zhuang Liu and
year = {2019}
}
-@article{Lan2020ALBERTAL,
+@inproceedings{Lan2020ALBERTAL,
title={ALBERT: A Lite BERT for Self-supervised Learning of Language Representations},
author={Zhenzhong Lan and Mingda Chen and Sebastian Goodman and Kevin Gimpel and Piyush Sharma and Radu Soricut},
-journal={International Conference on Learning Representations},
+publisher={International Conference on Learning Representations},
year={2020}
}
@@ -9621,45 +9619,45 @@ author = {Zhuang Liu and
year = {2018}
}
-@article{DBLP:journals/corr/abs-2004-05150,
+@inproceedings{DBLP:journals/corr/abs-2004-05150,
author = {Iz Beltagy and
Matthew E. Peters and
Arman Cohan},
title = {Longformer: The Long-Document Transformer},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2004.05150},
year = {2020}
}
-@article{Kitaev2020ReformerTE,
+@inproceedings{Kitaev2020ReformerTE,
author = {Nikita Kitaev and
Lukasz Kaiser and
Anselm Levskaya},
title = {Reformer: The Efficient Transformer},
-journal = {International Conference on Learning Representations},
+publisher = {International Conference on Learning Representations},
year = {2020}
}
-@article{DBLP:journals/corr/abs-2003-05997,
+@inproceedings{DBLP:journals/corr/abs-2003-05997,
author = {Aurko Roy and
Mohammad Saffar and
Ashish Vaswani and
David Grangier},
title = {Efficient Content-Based Sparse Attention with Routing Transformers},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2003.05997},
year = {2020}
}
-@article{Katharopoulos2020TransformersAR,
+@inproceedings{Katharopoulos2020TransformersAR,
title={Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention},
author={Angelos Katharopoulos and Apoorv Vyas and Nikolaos Pappas and Fran{\c{c}}ois Fleuret},
-journal={CoRR},
+publisher={CoRR},
year={2020},
volume={abs/2006.16236}
}
-@article{DBLP:journals/corr/abs-2009-14794,
+@inproceedings{DBLP:journals/corr/abs-2009-14794,
author = {Krzysztof Choromanski and
Valerii Likhosherstov and
David Dohan and
@@ -9674,7 +9672,7 @@ author = {Zhuang Liu and
Lucy Colwell and
Adrian Weller},
title = {Rethinking Attention with Performers},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2009.14794},
year = {2020}
}
@@ -9704,14 +9702,14 @@ author = {Zhuang Liu and
year = {2018}
}
-@article{DBLP:journals/corr/abs-2006-04768,
+@inproceedings{DBLP:journals/corr/abs-2006-04768,
author = {Sinong Wang and
Belinda Z. Li and
Madian Khabsa and
Han Fang and
Hao Ma},
title = {Linformer: Self-Attention with Linear Complexity},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2006.04768},
year = {2020}
}
@@ -14049,50 +14047,50 @@ author = {Zhuang Liu and
year = {2017}
}
-@article{DBLP:journals/corr/abs-1805-01553,
+@inproceedings{DBLP:journals/corr/abs-1805-01553,
author = {Tsz Kin Lam and
Julia Kreutzer and
Stefan Riezler},
title = {A Reinforcement Learning Approach to Interactive-Predictive Neural
Machine Translation},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1805.01553},
year = {2018}
}
-@article{DBLP:journals/mt/DomingoPC17,
+@inproceedings{DBLP:journals/mt/DomingoPC17,
author = {Miguel Domingo and
{\'{A}}lvaro Peris and
Francisco Casacuberta},
title = {Segment-based interactive-predictive machine translation},
-journal = {Machine Translation},
+publisher = {Machine Translation},
volume = {31},
number = {4},
pages = {163--185},
year = {2017}
}
-@article{Peris2017InteractiveNM,
+@inproceedings{Peris2017InteractiveNM,
title={Interactive neural machine translation},
author={{\'A}lvaro Peris and Miguel Domingo and F. Casacuberta},
-journal={Computer Speech and Language},
+publisher={Computer Speech and Language},
year={2017},
volume={45},
pages={201-220}
}
-@article{DBLP:journals/csl/PerisC19,
+@inproceedings{DBLP:journals/csl/PerisC19,
author = {{\'{A}}lvaro Peris and
Francisco Casacuberta},
title = {Online learning for effort reduction in interactive neural machine
translation},
-journal = {Computer Speech and Language},
+publisher = {Computer Speech and Language},
volume = {58},
pages = {98--126},
year = {2019}
}
-@article{DBLP:journals/coling/BarrachinaBCCCKLNTVV09,
+@inproceedings{DBLP:journals/coling/BarrachinaBCCCKLNTVV09,
author = {Sergio Barrachina and
Oliver Bender and
Francisco Casacuberta and
@@ -14105,7 +14103,7 @@ author = {Zhuang Liu and
Enrique Vidal and
Juan Miguel Vilar},
title = {Statistical Approaches to Computer-Assisted Translation},
-journal = {Computational Linguistics},
+publisher = {Computational Linguistics},
volume = {35},
number = {1},
pages = {3--28},
@@ -14131,14 +14129,14 @@ author = {Zhuang Liu and
year = {2016}
}
-@article{DBLP:journals/corr/abs-2010-05680,
+@inproceedings{DBLP:journals/corr/abs-2010-05680,
author = {Jiarui Fang and
Yang Yu and
Chengduo Zhao and
Jie Zhou},
title = {TurboTransformers: An Efficient {GPU} Serving System For Transformer
Models},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/2010.05680},
year = {2020}
}
@@ -14155,13 +14153,13 @@ author = {Zhuang Liu and
year = {2018}
}
-@article{DBLP:journals/corr/BolukbasiWDS17,
+@inproceedings{DBLP:journals/corr/BolukbasiWDS17,
author = {Tolga Bolukbasi and
Joseph Wang and
Ofer Dekel and
Venkatesh Saligrama},
title = {Adaptive Neural Networks for Fast Test-Time Prediction},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1702.07811},
year = {2017}
}
@@ -14188,13 +14186,13 @@ author = {Zhuang Liu and
year = {2020}
}
-@article{DBLP:journals/corr/abs-1912-00567,
+@inproceedings{DBLP:journals/corr/abs-1912-00567,
author = {Tao Wang and
Shaohui Kuang and
Deyi Xiong and
Ant{\'{o}}nio Branco},
title = {Merging External Bilingual Pairs into Neural Machine Translation},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1912.00567},
year = {2019}
}
@@ -14292,20 +14290,20 @@ author = {Zhuang Liu and
year = {2018}
}
-@article{barone2017regularization,
+@inproceedings{barone2017regularization,
title={Regularization techniques for fine-tuning in neural machine translation},
author={Barone, Antonio Valerio Miceli and Haddow, Barry and Germann, Ulrich and Sennrich, Rico},
-journal={arXiv preprint arXiv:1707.09920},
+publisher={arXiv preprint arXiv:1707.09920},
year={2017}
}
-@article{DBLP:journals/corr/ChuDK17,
+@inproceedings{DBLP:journals/corr/ChuDK17,
author = {Chenhui Chu and
Raj Dabre and
Sadao Kurohashi},
title = {An Empirical Comparison of Simple Domain Adaptation Methods for Neural
Machine Translation},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1701.03214},
year = {2017}
}
@@ -14393,7 +14391,7 @@ author = {Zhuang Liu and
year = {2010}
}
-@article{vilar2012jane,
+@inproceedings{vilar2012jane,
title={Jane: an advanced freely available hierarchical machine translation toolkit},
author={Vilar, David and Stein, Daniel and Huck, Matthias and Ney, Hermann},
publisher={Machine Translation},
@@ -14413,7 +14411,7 @@ author = {Zhuang Liu and
year = {2013}
}
-@article{al2016theano,
+@inproceedings{al2016theano,
author = {Rami Al-Rfou and
Guillaume Alain and
Amjad Almahairi and
@@ -14528,7 +14526,7 @@ author = {Zhuang Liu and
Ying Zhang},
title = {Theano: {A} Python framework for fast computation of mathematical
expressions},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1605.02688},
year = {2016}
}
@@ -14583,11 +14581,11 @@ author = {Zhuang Liu and
year = {2007}
}
-@article{och2003systematic,
+@inproceedings{och2003systematic,
author = {Franz Josef Och and
Hermann Ney},
title = {A Systematic Comparison of Various Statistical Alignment Models},
-journal = {Computational Linguistics},
+publisher = {Computational Linguistics},
volume = {29},
number = {1},
pages = {19--51},
@@ -14661,7 +14659,7 @@ author = {Zhuang Liu and
year = {2016}
}
-@article{ZhangTHUMT,
+@inproceedings{ZhangTHUMT,
author = {Jiacheng Zhang and
Yanzhuo Ding and
Shiqi Shen and
@@ -14670,7 +14668,7 @@ author = {Zhuang Liu and
Huan-Bo Luan and
Yang Liu},
title = {{THUMT:} An Open Source Toolkit for Neural Machine Translation},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1706.06415},
year = {2017}
}
@@ -14694,7 +14692,7 @@ author = {Zhuang Liu and
year = {2018}
}
-@article{hieber2017sockeye,
+@inproceedings{hieber2017sockeye,
author = {Felix Hieber and
Tobias Domhan and
Michael Denkowski and
@@ -14703,7 +14701,7 @@ author = {Zhuang Liu and
Ann Clifton and
Matt Post},
title = {Sockeye: {A} Toolkit for Neural Machine Translation},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1712.05690},
year = {2017}
}
@@ -14719,7 +14717,7 @@ author = {Zhuang Liu and
year = {2018}
}
-@article{DBLP:journals/corr/abs-1805-10387,
+@inproceedings{DBLP:journals/corr/abs-1805-10387,
author = {Oleksii Kuchaiev and
Boris Ginsburg and
Igor Gitman and
@@ -14728,12 +14726,12 @@ author = {Zhuang Liu and
Paulius Micikevicius},
title = {OpenSeq2Seq: extensible toolkit for distributed and mixed precision
training of sequence-to-sequence models},
-journal = {CoRR},
+publisher = {CoRR},
volume = {abs/1805.10387},
year = {2018}
}
-@article{nmtpy2017,
+@inproceedings{nmtpy2017,
author = {Ozan Caglayan and
Mercedes Garc{\'{\i}}a-Mart{\'{\i}}nez and
Adrien Bardet and
@@ -14742,7 +14740,7 @@ author = {Zhuang Liu and
Lo{\"{\i}}c Barrault},
title = {{NMTPY:} {A} Flexible Toolkit for Advanced Neural Machine Translation
Systems},
-journal = {The Prague Bulletin of Mathematical Linguistics},
+publisher = {The Prague Bulletin of Mathematical Linguistics},
volume = {109},
pages = {15--28},
year = {2017}
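Every change in these bibliography hunks follows the same mechanical pattern: the entry type @article becomes @inproceedings, and the journal field is renamed to publisher (entries that already carried a separate publisher field, such as publisher={MIT Press}, have that line dropped). The sketch below shows how such a bulk rewrite could be scripted; it is a minimal illustration only, the file name references.bib and the helper convert_entries are hypothetical and not part of this merge request, and the sketch does not handle the dropped publisher={MIT Press} lines.

import re

def convert_entries(bibtex: str) -> str:
    """Rewrite @article entries as @inproceedings and rename the journal
    field to publisher, mirroring the pattern shown in the diff above."""
    text = re.sub(r"@article\{", "@inproceedings{", bibtex)
    # Rename only field names at the start of a line, not words inside titles.
    return re.sub(r"^(\s*)journal(\s*=)", r"\1publisher\2", text, flags=re.MULTILINE)

if __name__ == "__main__":
    # Hypothetical usage: rewrite a local copy of the bibliography in place.
    with open("references.bib", encoding="utf-8") as f:
        converted = convert_entries(f.read())
    with open("references.bib", "w", encoding="utf-8") as f:
        f.write(converted)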