合并分支 'caorunzhe' 到 'zengxin'

Caorunzhe 查看合并请求 !774

合并分支 'caorunzhe' 到 'zengxin'
Caorunzhe 查看合并请求 !774
186a8657 · zengxin · a23d073a · dc4cb935 · 186a8657 · 186a8657
Commit 186a8657 authored Jan 04, 2021 by zengxin
--- a/Chapter13/Figures/figure-a-predefined-course-planning.tex
+++ b/Chapter13/Figures/figure-a-predefined-course-planning.tex
+% Two-part figure: (a) a bar of five cells running from the label "易" to the label "难",
+% (b) a grid of the same cells whose axes are labelled "数据块" and "轮次".
+\begin{tikzpicture}
+% All cells share one geometry; node1..node5 differ only in the fill shade.
+\tikzset{
+  difficulty cell/.style={inner sep=0mm,minimum height=1em,minimum width=3em},
+  node1/.style={difficulty cell,fill=ugreen!10!blue!10},
+  node2/.style={difficulty cell,fill=ugreen!20!blue!20},
+  node3/.style={difficulty cell,fill=ugreen!30!blue!30},
+  node4/.style={difficulty cell,fill=ugreen!40!blue!40},
+  node5/.style={difficulty cell,fill=ugreen!50!blue!50},
+}
+% ---- panel (a) ----
+\begin{scope}
+  \node[anchor=north west] (l1) at (0,0) {易};
+  \node[anchor=west] (l2) at ([xshift=10.3em]l1.east) {难};
+  % Arrow 1em below the two labels, extended 2em past each of them.
+  \draw[->,thick] ([xshift=-2em,yshift=-1em]l1.south) -- ([xshift=2em,yshift=-1em]l2.south);
+  % Five abutting cells: c1 hangs below l1, each following cell is glued to the previous east edge.
+  \node[anchor=north,node1] (c1) at ([yshift=-2em]l1.south) {};
+  \node[anchor=west,node2] (c2) at (c1.east) {};
+  \node[anchor=west,node3] (c3) at (c2.east) {};
+  \node[anchor=west,node4] (c4) at (c3.east) {};
+  \node[anchor=west,node5] (c5) at (c4.east) {};
+  % Caption centred under the middle cell.
+  \node[anchor=north] (lb1) at ([yshift=-1.5em]c3.south) {(a)样本难易程度图示};
+\end{scope}
+% ---- panel (b), moved 1.7in down ----
+\begin{scope}[yshift = -1.7in]
+  % Each triple is row/column/style. Cell n<row><column> is placed 3em to the right
+  % per column and 1em down per row.
+  \foreach \i / \j / \z in
+		{0/0/node1, 1/0/node1, 2/0/node1, 3/0/node1, 4/0/node1, 5/0/node1, 6/0/node1,
+		 1/1/node2, 2/1/node2, 3/1/node2, 4/1/node2, 5/1/node2, 6/1/node2,
+		 2/2/node3, 3/2/node3, 4/2/node3, 5/2/node3, 6/2/node3,
+		 3/3/node4, 4/3/node4, 5/3/node4, 6/3/node4,
+		 4/4/node5, 5/4/node5, 6/4/node5,
+		}
+    \node[anchor=north west,\z,draw=white] (n\i\j) at (3em*\j,-1em*\i) {};
+  % Not part of the loop: one extra node1 cell drawn at the origin, on top of n00.
+  \node[anchor=north west,node1,draw=white] (nc) at (0,0) {};
+  % Both axes start 1em left of and 1em above the top-left corner of n00.
+  \draw[->,very thick] ([xshift=-1em,yshift=1em]n00.north west) -- ([xshift=16em,yshift=1em]n00.north west);
+  \draw[->,very thick] ([xshift=-1em,yshift=1em]n00.north west) -- ([xshift=-1em,yshift=-9em]n00.north west);
+  % Axis labels.
+  \node[anchor=west] (x1) at ([xshift=12em,yshift=2em]n00.north west) {数据块};
+  \node[anchor=east] (y1) at ([xshift=-1em,yshift=-7.5em]n00.north west) {轮次};
+  % Note below the last row of the first column, then the panel caption.
+  \node[anchor=west,font=\small] (t1) at ([yshift=-1em]n60.south west) {$\ldots$直到模型收敛};
+  \node[anchor=north] (lb2) at ([yshift=-3em]n62.south) {(b)不同训练阶段使用的数据};
+\end{scope}
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter13/Figures/figure-bpe.tex
+++ b/Chapter13/Figures/figure-bpe.tex
@@ -17,11 +17,11 @@
 	\draw[->,line width=.03cm] ([yshift=0em]node5.south) -- ([yshift=0em]node6.north);
 	\node[node,anchor = west] (node8) at ([xshift = 2em,yshift = 2em]node7.east) {对于词表外的词lowest};
-	\node[node,anchor = north west] (node9) at ([yshift = 0.3em]node8.south west) {可以被分割为low est};
+	\node[node,anchor = north west] (node9) at ([yshift = 0.3em]node8.south west) {可以被合并为low est};
 	\node[node,font=\scriptsize,anchor = north,fill=ugreen!5,drop shadow] (dict) at ([xshift = 5em,yshift = -5em]node6.south){\begin{tabular}{llllll}
 		\multirow{3}{*}{符号合并表:} & ('e','s')  & ('es','t') & ('est','$<$e$>$') & ('l','o') & ('lo','w')   \\
-        & ('low','$<$e$>$') & ('n','e')  & ('ne','w') & ('new','est$<$e$>$') & ('w','i') \\
+        & ('n','e')  & ('ne','w') & ('new','est$<$e$>$') & ('low','$<$e$>$') & ('w','i') \\
        & ('wi','d') & ('wid','est$<$e$>$') & ('low','e') & ('lowe','r') & ('lower','$<$e$>$')
 		\end{tabular}};

--- a/Chapter13/Figures/figure-computation-of-dropout.tex
+++ b/Chapter13/Figures/figure-computation-of-dropout.tex
@@ -7,7 +7,7 @@
 \tikzstyle{neuronnode} = [minimum size=1.8em,circle,draw,very thick,ublue,inner sep=0pt, fill=white,align=center]
 %standard
-\node [neuronnode] (neuron_b) at (0,0) {\scriptsize{$b_{i}^{l}$}};
+\node [neuronnode] (neuron_b) at (0,0) {\scriptsize{$b^{l}$}};
 \node [neuronnode] (neuron_y3) at (0,-1*\neuronsep) {\scriptsize{$x_{3}^{l}$}};
 \node [neuronnode] (neuron_y2) at (0,-2*\neuronsep) {\scriptsize{$x_{2}^{l}$}};
 \node [neuronnode] (neuron_y1) at (0,-3*\neuronsep) {\scriptsize{$x_{1}^{l}$}};
@@ -25,7 +25,7 @@
 \draw [->,line width=0.3mm] (neuron_z.east) -- (neuron_y'.west);
 %dropout
-\node [neuronnode] (drop_neuron_b) at (5*\nodespace,0) {\scriptsize{$b_{i}^{l}$}};
+\node [neuronnode] (drop_neuron_b) at (5*\nodespace,0) {\scriptsize{$b^{l}$}};
 \node [neuronnode] (drop_neuron_y3') at (5*\nodespace,-1*\neuronsep) {\scriptsize{$\tilde{x}_{3}^{l}$}};
 \node [neuronnode] (drop_neuron_y2') at (5*\nodespace,-2*\neuronsep) {\scriptsize{$\tilde{x}_{2}^{l}$}};
 \node [neuronnode] (drop_neuron_y1') at (5*\nodespace,-3*\neuronsep) {\scriptsize{$\tilde{x}_{1}^{l}$}};
@@ -60,12 +60,12 @@
 %equ
 \node [anchor=west,inner sep = 2pt] (line1) at (9*\nodespace,0) {未应用Dropout：};
-\node [anchor=north west,inner sep = 2pt] (line2) at (line1.south west) {$z_{i}^{l+1}=\mathbf{w}^{l} \mathbf{x}^{l} + b_{i}^{l}$};
+\node [anchor=north west,inner sep = 2pt] (line2) at (line1.south west) {$z_{i}^{l+1}=\mathbf{w}^{l} \mathbf{x}^{l} + b^{l}$};
 \node [anchor=north west,inner sep = 2pt] (line3) at (line2.south west) {$x_{i}^{l+1}=f\left(z_{i}^{l+1}\right)$};
 \node [anchor=north west,inner sep = 2pt] (line4) at (line3.south west) {应用Dropout：};
 \node [anchor=north west,inner sep = 2pt] (line5) at (line4.south west) {$r_{j}^{l} \sim$ Bernoulli $(1-p)$};
 \node [anchor=north west,inner sep = 2pt] (line6) at (line5.south west) {$\tilde{\mathbf{x}}=\mathbf{r} * \mathbf{x}$};
-\node [anchor=north west,inner sep = 2pt] (line7) at (line6.south west) {$z_{i}^{l+1}=\mathbf{w}^{l} \widetilde{\mathbf{x}}^{l} + b_{i}^{l}$};
+\node [anchor=north west,inner sep = 2pt] (line7) at (line6.south west) {$z_{i}^{l+1}=\mathbf{w}^{l} \widetilde{\mathbf{x}}^{l} + b^{l}$};
 \node [anchor=north west,inner sep = 2pt] (line8) at (line7.south west) {$x_{i}^{l+1}=f\left(z_{i}^{l+1}\right)$};
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter13/Figures/figure-curriculum-learning-framework.tex
+++ b/Chapter13/Figures/figure-curriculum-learning-framework.tex
+% Pipeline of four boxes joined left to right by arrows:
+% "训练集" -> "难度评估器" -> "训练调度器" -> "模型训练器",
+% with a dotted arrow from the last box back to the third and two shaded group frames.
+\begin{tikzpicture}
+\tikzset{
+  node/.style={inner sep=0mm,minimum height=3em,minimum width=6em,rounded corners=5pt},
+  flow label/.style={anchor=west,inner sep=0mm,minimum height=2em,minimum width=3em,font=\small,align=left},
+  group frame/.style={rectangle,inner sep=5pt,rounded corners=5pt},
+}
+% The four stages.
+\node[anchor=west,node,fill=ugreen!30] (n1) at (0,0) {训练集};
+\node[anchor=west,node,fill=yellow!30] (n2) at ([xshift=2em]n1.east) {难度评估器};
+\node[anchor=west,node,fill=red!30] (n3) at ([xshift=4em]n2.east) {训练调度器};
+\node[anchor=west,node,fill=blue!30] (n4) at ([xshift=4em]n3.east) {模型训练器};
+% Forward arrows between neighbouring stages.
+\draw[->,very thick] (n1.east) -- (n2.west);
+\draw[->,very thick] (n2.east) -- (n3.west);
+\draw[->,very thick] (n3.east) -- (n4.west);
+% Two-line labels placed below the n2->n3 and n3->n4 arrows.
+\node[flow label] (n5) at ([xshift=0.5em,yshift=-1.5em]n2.east) {排序过\\的数据};
+\node[flow label] (n6) at ([xshift=1em,yshift=-1.5em]n3.east) {采样\\批次$t$};
+% Labels above n3/n4 and the caption-like label under n3.
+\node[anchor=south west,inner sep=0mm,minimum height=2em,minimum width=4em,font=\small] (n7) at ([xshift=2.5em,yshift=1em]n3.north) {如果模型收敛};
+\node[anchor=south west,inner sep=0mm] (n8) at ([yshift=1.5em]n4.north) {批次$t$};
+\node[anchor=north east,inner sep=0mm] (n9) at ([yshift=-1em]n3.south east) {课程设计};
+% Dotted arrow: up 1em from n4, across, then down into the top of n3.
+\draw[->,dotted,very thick] (n4.north) -- ([yshift=1em]n4.north) -- ([yshift=1em]n3.north) -- (n3.north);
+% Group frames on the background layer; the orange one is drawn after (on top of) the gray one.
+\begin{pgfonlayer}{background}
+  \node[group frame,fill=gray!30,fit=(n3) (n4) (n6) (n8)] (g2) {};
+  \node[group frame,fill=orange!30,fit=(n2) (n3) (n9)] (g1) {};
+\end{pgfonlayer}
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter13/Figures/figure-exposure-bias.tex
+++ b/Chapter13/Figures/figure-exposure-bias.tex
+%------------------------------------------------------------
+% Two stacked panels with the same backbone (hidden states h_1 ... h_{j-1}, h_j, each
+% feeding a Softmax box and an output symbol) plus a bar chart on the right:
+%   (a) "训练阶段": the inputs below the states are y_{j-2}, y_{j-1}; bars are green.
+%   (b) "推断阶段": the inputs are \tilde{y}_{j-2}, \tilde{y}_{j-1}; bars are blue.
+% Node names are reused by both panels, so after panel (b) they refer to panel (b).
+\begin{tikzpicture}
+% Styles used by both panels, defined once at picture level.
+\tikzset{
+  rnnnode/.style={draw,inner sep=2pt,minimum width=3em,minimum height=1.5em,rounded corners=1pt,fill=red!20},
+  snode/.style={draw,inner sep=2pt,minimum width=3em,minimum height=1.5em,rounded corners=1pt,fill=blue!20},
+  ynode/.style={inner sep=2pt,minimum width=3em,minimum height=1.5em,rounded corners=1pt},
+}
+% ======================= panel (a) =======================
+\begin{scope}[]
+% Hidden states, left to right.
+\node [anchor=west,rnnnode] (n1) at (0,0) {$\mathbi{h}_{1}$};
+\node [anchor=west] (n2) at ([xshift=2em]n1.east) {$\cdots$};
+\node [anchor=west,rnnnode] (n3) at ([xshift=2em]n2.east) {$\mathbi{h}_{j-1}$};
+\node [anchor=west,rnnnode] (n4) at ([xshift=2em]n3.east) {$\mathbi{h}_{j}$};
+% Softmax box and output symbol stacked above each state.
+\node [anchor=south,snode,font=\footnotesize] (n5) at ([yshift=1em]n3.north) {Softmax};
+\node [anchor=south,ynode] (n6) at ([yshift=1em]n5.north) {$\tilde{{y}}_{j-1}$};
+\node [anchor=south,snode,font=\footnotesize] (n7) at ([yshift=1em]n4.north) {Softmax};
+\node [anchor=south,ynode] (n8) at ([yshift=1em]n7.north) {$\tilde{{y}}_{j}$};
+\node [anchor=south,snode,font=\footnotesize] (n13) at ([yshift=1em]n1.north) {Softmax};
+\node [anchor=south,ynode] (n14) at ([yshift=1em]n13.north) {$\tilde{{y}}_{1}$};
+% Inputs below the states and the panel caption.
+\node [anchor=north] (n11) at ([yshift=-1em]n3.south) {${{y}}_{j-2}$};
+\node [anchor=north] (n12) at ([yshift=-1em]n4.south) {${{y}}_{j-1}$};
+\node [anchor=north] (n9) at ([yshift=-3em]n4.south) {\small{(a) 训练阶段}};
+\node [anchor=north] (x1) at ([yshift=-1em]n1.south) {$\langle$sos$\rangle$};
+% Step tags on top of the output symbols.
+\node [anchor=south,inner sep=2pt] (st1) at (n6.north) {\scriptsize{\textbf{[step $j-1$]}}};
+\node [anchor=south,inner sep=2pt] (st2) at (n8.north) {\scriptsize{\textbf{[step $j$]}}};
+\node [anchor=south,inner sep=2pt] (st3) at (n14.north) {\scriptsize{\textbf{[step $1$]}}};
+% Solid arrows, in the original drawing order.
+\foreach \src/\dst in {x1.north/n1.south, n1.east/n2.west, n2.east/n3.west, n3.east/n4.west}
+  \draw [->,thick] (\src)--(\dst);
+\draw [->,thick] (n4.east)--([xshift=2em]n4.east);
+\foreach \src/\dst in {n3.north/n5.south, n5.north/n6.south, n4.north/n7.south, n7.north/n8.south,
+                       n1.north/n13.south, n13.north/n14.south, n11.north/n3.south, n12.north/n4.south}
+  \draw [->,thick] (\src)--(\dst);
+% Bar chart: title, a row of symbols, and one bar standing on each symbol (none on the dots, y7).
+\node [anchor=west] (n10) at ([xshift=4em]n8.east) {真实数据$\seq{y}$服从的分布：};
+\node [anchor=north west] (y1) at ([yshift=-4em]n10.south west) {${y}_{1}$};
+\node [anchor=west] (y2) at ([xshift=-0.25em]y1.east) {${y}_{2}$};
+\node [anchor=west] (y3) at ([xshift=-0.25em]y2.east) {${y}_{3}$};
+\node [anchor=west] (y4) at ([xshift=-0.25em]y3.east) {${y}_{4}$};
+\node [anchor=west] (y5) at ([xshift=-0.25em]y4.east) {${y}_{5}$};
+\node [anchor=west] (y6) at ([xshift=-0.25em]y5.east) {${y}_{6}$};
+\node [anchor=west] (y7) at ([xshift=-0.25em]y6.east) {$\ldots$};
+\node [anchor=west] (y8) at ([xshift=-0.25em]y7.east) {${y}_{n}$};
+\foreach \k/\h in {1/1.4em, 2/2.2em, 3/2.8em, 4/3.2em, 5/2.4em, 6/1.7em, 8/0.4em}
+  \node [anchor=south,minimum width=0.8em,minimum height=\h,fill=ugreen!50,inner sep=0pt] (label\k) at ([xshift=-0.1em]y\k.north) {};
+% Dotted frames around the inputs (b1) and the chart (b2), joined by a dotted curve.
+\begin{pgfonlayer}{background}
+\node [rectangle,inner sep=0.1em,rounded corners=5pt,very thick,dotted,draw=ugreen,fit=(n11) (n12)] (b1) {};
+\node [rectangle,inner sep=0.5em,rounded corners=5pt,very thick,dotted,draw=ugreen,fit=(n10) (y1) (y8)] (b2) {};
+\draw [->,dotted,very thick,ugreen] (b1.east) .. controls +(east:1.7) and +(west:1) .. ([xshift=-0.1em]b2.west);
+\node [anchor=east] (inputlabel1) at ([yshift=-0.2em]b1.west) {{\color{red} \footnotesize{人工标注数据}}};
+\end{pgfonlayer}
+\end{scope}
+% ======================= panel (b), moved 2in down =======================
+\begin{scope}[yshift=-2in]
+% Hidden states, left to right.
+\node [anchor=west,rnnnode] (n1) at (0,0) {$\mathbi{h}_{1}$};
+\node [anchor=west] (n2) at ([xshift=2em]n1.east) {$\cdots$};
+\node [anchor=west,rnnnode] (n3) at ([xshift=2em]n2.east) {$\mathbi{h}_{j-1}$};
+\node [anchor=west,rnnnode] (n4) at ([xshift=2em]n3.east) {$\mathbi{h}_{j}$};
+% Softmax box and output symbol stacked above each state.
+\node [anchor=south,snode,font=\footnotesize] (n5) at ([yshift=1em]n3.north) {Softmax};
+\node [anchor=south,ynode] (n6) at ([yshift=1em]n5.north) {$\tilde{{y}}_{j-1}$};
+\node [anchor=south,snode,font=\footnotesize] (n7) at ([yshift=1em]n4.north) {Softmax};
+\node [anchor=south,ynode] (n8) at ([yshift=1em]n7.north) {$\tilde{{y}}_{j}$};
+\node [anchor=south,snode,font=\footnotesize] (n13) at ([yshift=1em]n1.north) {Softmax};
+\node [anchor=south,ynode] (n14) at ([yshift=1em]n13.north) {$\tilde{{y}}_{1}$};
+% Panel caption and the inputs below the states (here the tilde symbols).
+\node [anchor=north] (n9) at ([yshift=-3em]n4.south) {\small{(b) 推断阶段}};
+\node [anchor=north] (n11) at ([yshift=-1em]n3.south) {$\tilde{{y}}_{j-2}$};
+\node [anchor=north] (n12) at ([yshift=-1em]n4.south) {$\tilde{{y}}_{j-1}$};
+\node [anchor=north] (x1) at ([yshift=-1em]n1.south) {$\langle$sos$\rangle$};
+% Step tags on top of the output symbols.
+\node [anchor=south,inner sep=2pt] (st1) at (n6.north) {\scriptsize{\textbf{[step $j-1$]}}};
+\node [anchor=south,inner sep=2pt] (st2) at (n8.north) {\scriptsize{\textbf{[step $j$]}}};
+\node [anchor=south,inner sep=2pt] (st3) at (n14.north) {\scriptsize{\textbf{[step $1$]}}};
+% Solid arrows, in the original drawing order.
+\foreach \src/\dst in {x1.north/n1.south, n1.east/n2.west, n2.east/n3.west, n3.east/n4.west}
+  \draw [->,thick] (\src)--(\dst);
+\draw [->,thick] (n4.east)--([xshift=2em]n4.east);
+\foreach \src/\dst in {n3.north/n5.south, n5.north/n6.south, n4.north/n7.south, n7.north/n8.south,
+                       n1.north/n13.south, n13.north/n14.south}
+  \draw [->,thick] (\src)--(\dst);
+% Dotted curves: one ending at the left of n11, one leaving n14 to the right,
+% and one from the output n6 to the left of the next input n12.
+\draw [->,thick,dotted] ([xshift=-2.5em,yshift=1em]n11.north) .. controls +(south:2em) and +(west:0.1em) .. ([xshift=0.2em]n11.west);
+\draw [->,thick,dotted] (n14.east) .. controls +(east:0.3em) and +(north:2em) .. ([xshift=2.5em,yshift=-0.5em]n14.south);
+\draw [->,thick,dotted] (n6.east) .. controls ([xshift=2em,yshift=1em]n6.east) and ([xshift=-2em,yshift=-2em]n4.south west) .. ([xshift=0.2em]n12.west);
+\draw [->,thick] (n11.north)--(n3.south);
+\draw [->,thick] (n12.north)--(n4.south);
+% Bar chart: title, a row of symbols, and one bar standing on each symbol (none on the dots, y7).
+\node [anchor=west] (n10) at ([xshift=4em]n8.east) {模型输出$\tilde{{y}}$服从的分布：};
+\node [anchor=north west] (y1) at ([yshift=-4em]n10.south west) {$\tilde{y}_{1}$};
+\node [anchor=west] (y2) at ([xshift=-0.25em]y1.east) {$\tilde{y}_{2}$};
+\node [anchor=west] (y3) at ([xshift=-0.25em]y2.east) {$\tilde{y}_{3}$};
+\node [anchor=west] (y4) at ([xshift=-0.25em]y3.east) {$\tilde{y}_{4}$};
+\node [anchor=west] (y5) at ([xshift=-0.25em]y4.east) {$\tilde{y}_{5}$};
+\node [anchor=west] (y6) at ([xshift=-0.25em]y5.east) {$\tilde{y}_{6}$};
+\node [anchor=west] (y7) at ([xshift=-0.25em]y6.east) {$\ldots$};
+\node [anchor=west] (y8) at ([xshift=-0.25em]y7.east) {$\tilde{y}_{n}$};
+\foreach \k/\h in {1/1.5em, 2/2.8em, 3/0.9em, 4/3.6em, 5/2.2em, 6/0.2em, 8/1.2em}
+  \node [anchor=south,minimum width=0.8em,minimum height=\h,fill=ublue!80,inner sep=0pt] (label\k) at ([xshift=-0.1em]y\k.north) {};
+% Dotted frames around the inputs (b1) and the chart (b2), joined by a dotted curve.
+\begin{pgfonlayer}{background}
+\node [rectangle,inner sep=0.1em,rounded corners=5pt,very thick,dotted,draw=ublue,fit=(n11) (n12)] (b1) {};
+% Fitted to y8 (was y5) so the frame always encloses the whole symbol row, as in panel (a).
+\node [rectangle,inner sep=0.5em,rounded corners=5pt,very thick,dotted,draw=ublue,fit=(n10) (y1) (y8)] (b2) {};
+\draw [->,dotted,very thick,ublue] (b1.east) .. controls +(east:1.7) and +(west:1) .. ([xshift=-0.1em]b2.west);
+\node [anchor=east] (inputlabel1) at ([yshift=-0.2em]b1.west) {{\color{red} \footnotesize{系统预测结果}}};
+\end{pgfonlayer}
+\end{scope}
+\end{tikzpicture}
--- a/Chapter13/Figures/figure-framework-of-Adversarial-Neural-machine-translation.jpg
+++ b/Chapter13/Figures/figure-framework-of-Adversarial-Neural-machine-translation.jpg
--- a/Chapter13/Figures/figure-framework-of-Adversarial-Neural-machine-translation.tex
+++ b/Chapter13/Figures/figure-framework-of-Adversarial-Neural-machine-translation.tex
+%------------------------------------------------------------
+% Diagram with a box "生成模型G" (n3), a box "判别网络D" (n5), three symbol sequences
+% (n1: y_1..y_n, n2: x_1..x_m, n4: \tilde{y}_1..\tilde{y}_J) and a two-line note (n6).
+\begin{tikzpicture}
+\tikzset{
+  rnnnode/.style={draw,inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt,fill=yellow!20},
+  snode/.style={draw,inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt,fill=red!20},
+  wode/.style={inner sep=0pt,minimum width=4em,minimum height=2em,rounded corners=0pt},
+}
+% Sequences and boxes; every node is positioned relative to an earlier one.
+\node [anchor=west,wode] (n1) at (0,0) {${y}_1,{y}_2,\ldots,{y}_n$};
+\node [anchor=north west,wode] (n2) at ([xshift=1em,yshift=0.5em]n1.south east) {${x}_1,{x}_2,\ldots,{x}_m$};
+\node [anchor=south west,rnnnode] (n3) at ([xshift=8em,yshift=0.5em]n2.north east) {生成模型G};
+% \ldots instead of a literal "..." so the ellipsis matches n1 and n2.
+\node [anchor=south east,wode] (n4) at ([xshift=-2em]n3.north west) {$\tilde{{y}}_{1},\tilde{{y}}_{2},\ldots,\tilde{{y}}_{J}$};
+\node [anchor=south,snode] (n5) at ([yshift=6em]n2.north) {判别网络D};
+\node [anchor=west,align=left,font=\small] (n6) at ([xshift=15em,yshift=-3em]n5.east) {根据$(\seq{x},\seq{\tilde{y}})$生\\成奖励信号};
+% The three sequences all point at the bottom of n5.
+\draw [->,thick] (n1.north)--(n5.south);
+\draw [->,thick] (n2.north)--(n5.south);
+\draw [->,thick] (n4.west)--(n5.south);
+% n3 -> n4: up 1em from the top of n3, then to the east side of n4.
+\draw [->,thick] (n3.north)--([yshift=1em]n3.north)--(n4.east);
+% n5 -> n3: 14.5em to the right of n5, then to a point 1em right of n3, then into its east side.
+\draw [->,thick] (n5.east) -- ([xshift=14.5em]n5.east) -- ([xshift=1em]n3.east) -- (n3.east);
+% n2 -> n3: to a point 1.5em below n3, then up into its south side.
+\draw [->,thick] (n2.east) -- ([yshift=-1.5em]n3.south) -- (n3.south);
+% Background frames are currently disabled; the layer is kept empty.
+\begin{pgfonlayer}{background}
+%\node [rectangle,inner sep=0.5em,rounded corners=5pt,very thick,dotted,draw=yellow] [fit = (n3) (n4)] (b1) {};
+%\node [rectangle,inner sep=0.5em,rounded corners=5pt,very thick,dotted,draw=red] [fit = (n5)] (b2) {};
+\end{pgfonlayer}
+%\draw [->,thick] ([xshift=0em,yshift=0em]n3.south) .. controls +(south:1em) and +(north:1em) .. ([xshift=0em,yshift=0em]n5.north);
+\end{tikzpicture}
--- a/Chapter13/Figures/figure-of-scheduling-sampling-method.tex
+++ b/Chapter13/Figures/figure-of-scheduling-sampling-method.tex
+%------------------------------------------------------------
+% Backbone of hidden states h_1 ... h_{j-1}, h_j, each with a Softmax box and an output
+% symbol above it. Below h_{j-1} and h_j sit two candidate inputs each (a tilde symbol on
+% the left, a plain symbol on the right), every one joined to the state by a line that
+% passes a black dot; labels e1 and e4 are set vertically beside two of those lines.
+\begin{tikzpicture}
+\tikzset{
+  rnnnode/.style={draw,inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt,fill=red!20},
+  snode/.style={draw,inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt,fill=blue!20},
+  ynode/.style={inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt},
+  switch dot/.style={circle,fill=black,minimum size=1pt,inner sep=1.2pt},
+}
+% Hidden states, left to right.
+\node [anchor=west,rnnnode] (n1) at (0,0) {$\mathbi{h}_{1}$};
+\node [anchor=west] (n2) at ([xshift=3em]n1.east) {$\cdots$};
+\node [anchor=west,rnnnode] (n3) at ([xshift=3em]n2.east) {$\mathbi{h}_{j-1}$};
+\node [anchor=west,rnnnode] (n4) at ([xshift=3em]n3.east) {$\mathbi{h}_{j}$};
+% Softmax box and output symbol stacked above each state.
+\node [anchor=south,snode] (n5) at ([yshift=1em]n3.north) {Softmax};
+\node [anchor=south,ynode] (n6) at ([yshift=1em]n5.north) {$\tilde{{y}}_{j-1}$};
+\node [anchor=south,snode] (n7) at ([yshift=1em]n4.north) {Softmax};
+\node [anchor=south,ynode] (n8) at ([yshift=1em]n7.north) {$\tilde{{y}}_{j}$};
+% Same font size as n5 and n7 (the stray \footnotesize on this box was removed).
+\node [anchor=south,snode] (n13) at ([yshift=1em]n1.north) {Softmax};
+\node [anchor=south,ynode] (n14) at ([yshift=1em]n13.north) {$\tilde{{y}}_{1}$};
+% Inputs: <sos> under h_1; two candidates under each of h_{j-1} and h_j.
+\node [anchor=north] (x1) at ([yshift=-1em]n1.south) {$\langle$sos$\rangle$};
+\node [anchor=north,font=\small] (x2) at ([xshift=-1.3em,yshift=-2.3em]n3.south) {$\tilde{{y}}_{j-2}$};
+\node [anchor=north,font=\small] (x3) at ([xshift=1.3em,yshift=-2.5em]n3.south) {${y}_{j-2}$};
+\node [anchor=north,font=\small] (x4) at ([xshift=1.3em,yshift=-2.5em]n4.south) {${y}_{j-1}$};
+\node [anchor=north,font=\small] (x5) at ([xshift=-1.3em,yshift=-2.3em]n4.south) {$\tilde{{y}}_{j-1}$};
+% Step tags on top of the output symbols.
+\node [anchor=south,inner sep=2pt] (st1) at (n6.north) {\scriptsize{\textbf{[step $j-1$]}}};
+\node [anchor=south,inner sep=2pt] (st2) at (n8.north) {\scriptsize{\textbf{[step $j$]}}};
+\node [anchor=south,inner sep=2pt] (st3) at (n14.north) {\scriptsize{\textbf{[step $1$]}}};
+% Vertical labels beside the left input of h_{j-1} and the right input of h_j.
+\node [anchor=north,font=\tiny,rotate=90] (e1) at ([xshift=-2.7em,yshift=-1.1em]n3.south) {${(1-\epsilon_i)}^2$};
+%\node [anchor=north,font=\scriptsize] (e2) at ([xshift=2em,yshift=-0.1em]n3.south) {$\funp{P}=\epsilon_i$};
+%\node [anchor=north,font=\scriptsize] (e3) at ([xshift=-2em,yshift=-1em]n4.south) {$\funp{P}={(1-\epsilon_i)}^2$};
+\node [anchor=north,font=\tiny,rotate=90] (e4) at ([xshift=1.5em,yshift=-1.2em]n4.south) {$\epsilon_i$};
+%\node [anchor=south east,font=\small] (l1) at ([xshift=-1em,yshift=0.5em]n5.north west) {Loss};
+%\node [anchor=south west,font=\small] (l2) at ([xshift=1em,yshift=0.5em]n7.north east) {Loss};
+% Solid arrows of the backbone, in the original drawing order.
+\foreach \src/\dst in {x1.north/n1.south, n1.east/n2.west, n2.east/n3.west, n3.east/n4.west}
+  \draw [->,thick] (\src)--(\dst);
+\draw [->,thick] (n4.east)--([xshift=3em]n4.east);
+\foreach \src/\dst in {n3.north/n5.south, n5.north/n6.south, n4.north/n7.south, n7.north/n8.south,
+                       n1.north/n13.south, n13.north/n14.south}
+  \draw [->,thick] (\src)--(\dst);
+%\draw [->,thick] ([xshift=0em,yshift=0em]l1.south) .. controls +(south:1em) and +(west:0.1em) .. ([xshift=0em,yshift=0em]n5.west);
+%\draw [->,thick] ([xshift=0em,yshift=0em]l2.south) .. controls +(south:1em) and +(east:0.1em) .. ([xshift=0em,yshift=0em]n7.east);
+% One black dot just above each candidate input.
+\node [switch dot] (c1) at ([yshift=0.6em]x2.north) {};
+\node [switch dot] (c2) at ([yshift=0.8em]x3.north) {};
+\node [switch dot] (c3) at ([yshift=0.8em]x4.north) {};
+\node [switch dot] (c4) at ([yshift=0.6em]x5.north) {};
+% x2 -> h_{j-1}: one unbroken line.
+\draw [-,thick] (x2.north)-- ([xshift=-1.3em]n3.south);
+% x3 -> h_{j-1}: line up to the dot, a short slanted stub, then a separate line to the state.
+\draw [-,thick] ([yshift=0.2em]x3.north)-- (c2.south);
+\draw [-,thick] (c2.north)-- ([xshift=0.3em,yshift=0.6em]c2.north);
+\draw [-,thick] ([yshift=0.6em]c2.north)-- ([xshift=1.3em]n3.south);
+% x5 -> h_j: same dot/stub/line construction as x3.
+\draw [-,thick] (x5.north)-- (c4.south);
+\draw [-,thick] (c4.north)-- ([xshift=0.3em,yshift=0.6em]c4.north);
+\draw [-,thick] ([yshift=0.6em]c4.north)-- ([xshift=-1.3em]n4.south);
+% x4 -> h_j: one unbroken line.
+\draw [-,thick] ([yshift=0.2em]x4.north)-- ([xshift=1.3em]n4.south);
+% Dotted curves: one ending at the left of x2, one from the output n6 to the left of x5,
+% and one leaving n14 to the right.
+\draw [->,thick,dotted] ([xshift=-2.5em,yshift=1em]x2.north) .. controls +(south:2em) and +(west:0.1em) .. ([xshift=0.2em]x2.west);
+\draw [->,thick,dotted] (n6.east) .. controls ([xshift=2em,yshift=1em]n6.east) and ([xshift=-2.5em,yshift=-4em]n4.south west) .. (x5.west);
+\draw [->,thick,dotted] (n14.east) .. controls +(east:0.3em) and +(north:2em) .. ([xshift=3em,yshift=-0.5em]n14.south);
+\end{tikzpicture}
--- a/Chapter13/Figures/figure-reinforcement-learning-method-based-on-actor-critic.tex
+++ b/Chapter13/Figures/figure-reinforcement-learning-method-based-on-actor-critic.tex
+% Two rounded boxes, each holding the lines "Decoder" / "Encoder" split by a horizontal rule:
+% n1 is tagged "演员$p_{\theta}$" and n2 "评论家$Q$". Two opposite arrows run between them,
+% with the Q sequence written above and the \tilde{y} sequence below.
+\begin{tikzpicture}
+	\tikzset{
+	  module/.style={anchor=west,inner sep=0mm,minimum height=4em,minimum width=5.5em,rounded corners=15pt,align=left,draw},
+	}
+	% The two boxes, 10em apart.
+	\node[module] (n1) at (0,0) {Decoder\\Encoder};
+	\node[module] (n2) at ([xshift=10em]n1.east) {Decoder\\Encoder};
+	% Tag above and sequence below the left box.
+	\node[anchor=south,inner sep=0mm,font=\small] (a1) at ([yshift=1em]n1.north) {演员$p_{\theta}$};
+	\node[anchor=north,inner sep=0mm] (a2) at ([yshift=-1em]n1.south) {${x}_1,{x}_2,\ldots,{x}_m$};
+	% Tag above and sequence below the right box.
+	\node[anchor=south,inner sep=0mm,font=\small] (c1) at ([yshift=1em]n2.north) {评论家$Q$};
+	\node[anchor=north,inner sep=0mm] (c2) at ([yshift=-1em]n2.south) {${y}_1,{y}_2,\ldots,{y}_J$};
+%	\node[anchor=west,inner sep=0mm] (n3) at ([xshift=2.1em,yshift=2em]n1.east) {$Q_1,Q_2,\ldots,Q_J$};
+%	\node[anchor=west,inner sep=0mm] (n4) at ([xshift=2.9em,yshift=-0.4em]n1.east) {$\hat{\mathbi{y}}_1,\hat{\mathbi{y}}_2,\ldots,\hat{\mathbi{y}}_J$};
+%	\node[anchor=west,inner sep=0mm,font=\small] (n5) at ([xshift=3em,yshift=-3em]n1.east) {演员状态};
+	% Horizontal rule through the middle of each box.
+	\draw[-,thick] (n1.west) -- (n1.east);
+	\draw[-,thick] (n2.west) -- (n2.east);
+%\draw [->,thick] ([xshift=0em,yshift=1em]n2.west) -- ([xshift=0em,yshift=1em]n1.east);
+%\draw [->,thick] ([xshift=0em,yshift=0.5em]n1.east) -- ([xshift=0em,yshift=0.5em]n2.west);
+%\draw [->,dotted,very thick] ([xshift=0em,yshift=0em]n1.east)  .. controls ([xshift=3em,yshift=-1em]n1.-90) and ([xshift=-3em,yshift=-1em]n2.-90) .. (n2.west);
+	% Sequences written between the boxes: Q above the arrows, \tilde{y} below them.
+	\node[anchor=west,inner sep=0mm] (n3) at ([xshift=2.1em,yshift=1em]n1.east) {$Q_1,Q_2,\ldots,Q_J$};
+	\node[anchor=west,inner sep=0mm] (n4) at ([xshift=2.9em,yshift=-1em]n1.east) {$\tilde{{y}}_1,\tilde{{y}}_2,\ldots,\tilde{{y}}_J$};
+	% Upper arrow runs n2 -> n1, lower arrow n1 -> n2; each is offset 0.2em from the centre line.
+	\draw[->,thick] ([yshift=0.2em]n2.west) -- ([yshift=0.2em]n1.east);
+	\draw[->,thick] ([yshift=-0.2em]n1.east) -- ([yshift=-0.2em]n2.west);
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter13/Figures/figure-unk-of-bpe.tex
+++ b/Chapter13/Figures/figure-unk-of-bpe.tex
 	\begin{tikzpicture}
-		\node[rounded corners=3pt,minimum width=1.0em,minimum height=2.0em,font=\scriptsize,fill=green!5,drop shadow,thick,draw](top) at (0,0) {
+		\node[rounded corners=3pt,minimum width=1.0em,minimum height=2.0em,font=\scriptsize,fill=green!5,drop shadow,thick](top) at (0,0) {
-		\begin{tabular}{ll}
+		\begin{tabular}{lllllll}
-			\multicolumn{2}{c}{BPE词表:}  \\
+			\multicolumn{7}{c}{符号合并表}  \\
-			errrr$<$e$>$ & tain$<$e$>$ \\
+			r\ $<$e$>$, & e\ s, & l\ o,  & es\ t, & lo\ w, & est\ $<$e$>$, & e\ r$<$e$>$,                             
-			moun  & est$<$e$>$  \\
-			high & the$<$e$>$  \\
-			a$<$e$>$ &                               
 			\end{tabular}
 		};
-		\node[font=\scriptsize,anchor=west] (node1) at ([xshift=0.5em,yshift=1em]top.east) {原始序列：};
+		\node[font=\footnotesize,anchor=north] (l1) at ([xshift=0em,yshift=-1em]top.south) {(a) 符号合并表};
-		\node[font=\scriptsize,anchor=west] (this) at (node1.east) {"this$<$e$>$" ,};
+		\node[font=\scriptsize,anchor=west] (n1) at ([xshift=-6em,yshift=-6em]top.west) {l\ o\ w\ e\ r\ $<$e$>$};
-		\node[font=\scriptsize,anchor=west] (highest) at (this.east) {"highest$<$e$>$",};
+		\node[font=\scriptsize,anchor=west] (n2) at ([xshift=2.6em]n1.east) {l\ o\ w\ e\ {\red r$<$e$>$}};
-		\node[font=\scriptsize,anchor=west] (mountain) at (highest.east) { "mountain$<$e$>$"};
+		\node[font=\scriptsize,anchor=west] (n3) at ([xshift=2.6em]n2.east) {{\red lo}\ w\ e\ r$<$e$>$};
+		\node[font=\scriptsize,anchor=west] (n4) at ([xshift=2.6em]n3.east) {{\red low}\ e\ r$<$e$>$};
+		\node[font=\scriptsize,anchor=west] (n5) at ([xshift=2.6em]n4.east) {low\ {\red er$<$e$>$}};
-		\node[font=\scriptsize,anchor=west] (node2) at ([yshift=-1.5em]node1.south west) {BPE切分：};
+		\node[font=\scriptsize,anchor=west] (t1) at ([yshift=-1.5em]n1.south west) {l\ o\ w\ e\ s\ t\ $<$e$>$};
-		\node[font=\scriptsize,anchor=west] (unk) at (node2.east) {"$<$unk$>$",};
+		\node[font=\scriptsize,anchor=west] (t2) at ([xshift=0.8em]t1.east) {l\ o\ w\ {\red es}\ t\ $<$e$>$};
-		\node[font=\scriptsize,anchor=west] (high) at (unk.east) {"high",};
+		\node[font=\scriptsize,anchor=west] (t3) at ([xshift=0.8em]t2.east) {{\red lo}\ w\ es\ t\ $<$e$>$};
-		\node[font=\scriptsize,anchor=west] (est) at (high.east) {"est$<$e$>$",};
+		\node[font=\scriptsize,anchor=west] (t4) at ([xshift=0.8em]t3.east) {lo\ w\ {\red est}\ $<$e$>$};
-		\node[font=\scriptsize,anchor=west] (moun) at (est.east) {"moun",};
+		\node[font=\scriptsize,anchor=west] (t5) at ([xshift=0.8em]t4.east) {{\red low}\ est\ $<$e$>$};
-		\node[font=\scriptsize,anchor=west] (tain) at (moun.east) {"tain$<$e$>$"};
+		\node[font=\scriptsize,anchor=west] (t6) at ([xshift=0.8em]t5.east) {low\ {\red est$<$e$>$}};
+		\node[font=\footnotesize,anchor=north] (l2) at ([xshift=1.5em,yshift=-1em]t3.south) {(b) 合并样例};
-		%\draw[->,thick](node1.south) -- ([xshift=-1.0em]node2.north);
+		\draw[->,thick](n1.east) -- (n2.west);
-		\draw[->,thick]([xshift=-0.2em]this.south) -- (unk);
+		\draw[->,thick](n2.east) -- (n3.west);
-		\draw[->,thick](highest.south) -- (high);
+		\draw[->,thick](n3.east) -- (n4.west);
-		\draw[->,thick](highest.south) -- (est);
+		\draw[->,thick](n4.east) -- (n5.west);
-		\draw[->,thick](mountain.south) -- (moun);
-		\draw[->,thick](mountain.south) -- (tain);
+		\draw[->,thick](t1.east) -- (t2.west);
+		\draw[->,thick](t2.east) -- (t3.west);
+		\draw[->,thick](t3.east) -- (t4.west);
+		\draw[->,thick](t4.east) -- (t5.west);
+		\draw[->,thick](t5.east) -- (t6.west);
 	\end{tikzpicture}
\ No newline at end of file
--- a/Chapter13/Figures/figure-word-root.tex
+++ b/Chapter13/Figures/figure-word-root.tex
@@ -11,8 +11,8 @@
 \node[anchor = north] (new_root) at ([yshift = -1.5em]newer.south) {new};
 \draw [->] ([yshift=0.2em]do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(do.south);
 \draw [->] (do_root.north) -- (does.south);
-\draw [->] ([yshift=0.2em]do_root.north) .. controls +(north:0.4) and +(south:0.6) ..([yshift=0.1em]doing.south);
+\draw [->] ([yshift=0.2em]do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(doing.south);
 \draw [->] ([yshift=0.2em]new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(new.south);
 \draw [->] (new_root.north) -- (newer.south);
-\draw [->] ([yshift=0.2em]new_root.north) .. controls +(north:0.4) and +(south:0.6) ..([yshift=0.08em]newest.south);
+\draw [->] ([yshift=0.2em]new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(newest.south);
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter13/chapter13.tex
+++ b/Chapter13/chapter13.tex
--- a/Chapter14/Figures/figure-3vs.tex
+++ b/Chapter14/Figures/figure-3vs.tex
-\tikzstyle{yy} = [circle,minimum height=1cm,text centered,draw=black,thick,drop shadow={shadow xshift=0.3em,yshift=0.8em},fill=white]
+\begin{tikzpicture}
-\begin{tikzpicture}[node distance = 0,scale = 1]
+\tikzstyle{decoder} = [rectangle,thick,rounded corners,minimum width=5cm,minimum height=0.6cm,text centered,draw=black,fill=blue!15]
-\begin{scope}[xshift=0.2in]
-\tikzstyle{every node}=[scale=1]
+\begin{scope}
-\node (y1)[yy]{\large$y_1$};
+\node (aa)[decoder] at (0,0) {};
-\node (y2)[yy,right of = y1,xshift=1.5cm]{\large$y_2$};
+\node (y2b)[anchor=south] at ([yshift=-2.5em]aa.south) {$y_2$};
-\node (y3)[yy,right of = y2,xshift=1.5cm]{\large$y_3$};
+\node (label)[anchor=south] at ([yshift=-1.8em]y2b.south) {\small{(a) 自回归解码}};
-\node (y4)[yy,right of = y3,xshift=1.5cm]{\large$y_4$};
+\node (y1b)[anchor=east] at ([xshift=-2.5em]y2b.east) {$y_1$};
-\node (y5)[yy,right of = y4,xshift=1.5cm]{\large$y_5$};
+\node (sos)[anchor=east] at ([xshift=-4.3em]y2b.east) {\small{<sos>}};
-\node (y6)[yy,right of = y5,xshift=1.5cm]{\large$y_6$};
+\node (y3b)[anchor=west] at ([xshift=2.5em]y2b.west) {$y_3$};
-\node [anchor=north,font=\scriptsize] (labela) at ([xshift=1.8em,yshift=-2em]y3.south) {(a) 自回归模型};
+\node (y4b)[anchor=west] at ([xshift=5em]y2b.west) {$y_4$};
-\draw[->,thick] (y1.north) .. controls ([yshift=1.5em]y1.north) and ([yshift=1.5em]y3.north)..  (y3.north);
+\node (y3a)[anchor=north] at ([yshift=2.5em]aa.north) {$y_3$};
-\draw[->,thick] (y1.north) .. controls ([yshift=2em]y1.north) and ([yshift=2em]y4.north)..  (y4.north);
+\node (y2a)[anchor=east] at ([xshift=-2.5em]y3a.east) {$y_2$};
-\draw[->,thick] (y1.south) .. controls ([yshift=-1.5em]y1.south) and ([yshift=-1.5em]y5.south)..  (y5.south);
+\node (y1a)[anchor=east] at ([xshift=-5em]y3a.east) {$y_1$};
-\draw[->,thick] (y1.south) .. controls ([yshift=-2em]y1.south) and ([yshift=-2em]y6.south)..  (y6.south);
+\node (y4a)[anchor=west] at ([xshift=2.5em]y3a.west) {$y_4$};
+\node (eos)[anchor=west] at ([xshift=4.3em]y3a.west) {\small{<eos>}};
-\draw[->,thick] (y2.north) .. controls ([yshift=1.5em]y2.north) and ([yshift=1.5em]y4.north)..  (y4.north);
-\draw[->,thick] (y2.south) .. controls ([yshift=-1.5em]y2.south) and ([yshift=-1.6em]y5.south)..  (y5.south);
+\draw [->,very thick,dotted] ([xshift=-0.3em]y1a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0.3em]y1b.west);
-\draw[->,thick] (y2.south) .. controls ([yshift=-2em]y2.south) and ([yshift=-2em]y6.south)..  (y6.south);
+\draw [->,very thick,dotted] ([xshift=-0.3em]y2a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0.3em]y2b.west);
+\draw [->,very thick,dotted] ([xshift=-0.3em]y3a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0.3em]y3b.west);
+\draw [->,very thick,dotted] ([xshift=-0.3em]y4a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0.3em]y4b.west);
-\draw[->,thick] (y3.south) .. controls ([yshift=-1.5em]y3.south) and ([yshift=-1.5em]y5.south)..  (y5.south);
-\draw[->,thick] (y3.south) .. controls ([yshift=-2em]y3.south) and ([yshift=-2em]y6.south)..  (y6.south);
+\node (autodecoder)[decoder] at (0,0) {自回归解码器};
-\draw[->,thick] (y4.south) .. controls ([yshift=-1.5em]y4.south) and ([yshift=-1.5em]y6.south)..  (y6.south);
+\draw [->,thick]([yshift=0em]y1b.north) to ([yshift=1.15em]y1b.north); 
+\draw [->,thick]([yshift=0em]y2b.north) to ([yshift=1.15em]y2b.north);
-\draw[->,red,very thick](y1.east)to(y2.west);
+\draw [->,thick]([yshift=0em]y3b.north) to ([yshift=1.15em]y3b.north);
-\draw[->,red,very thick](y2.east)to(y3.west);
+\draw [->,thick]([yshift=0em]y4b.north) to ([yshift=1.15em]y4b.north);
-\draw[->,red,very thick](y3.east)to(y4.west);
+\draw [->,thick]([yshift=0em]sos.north) to ([yshift=1.15em]sos.north);
-\draw[->,red,very thick](y4.east)to(y5.west);
-\draw[->,red,very thick](y5.east)to(y6.west);
+\draw [->,thick]([yshift=-1.15em]y1a.south) to (y1a.south); 
+\draw [->,thick]([yshift=-1.15em]y2a.south) to (y2a.south);
+\draw [->,thick]([yshift=-1.15em]y3a.south) to (y3a.south);
+\draw [->,thick]([yshift=-1.15em]y4a.south) to (y4a.south);
+\draw [->,thick]([yshift=-1.2em]eos.south) to (eos.south);
 \end{scope}
-\begin{scope}[yshift=-1.45in]
+\begin{scope}[yshift=-1.55in]
-\tikzstyle{rec} = [rectangle,minimum width=2.8cm,minimum height=1.5cm,text centered,draw=black,dashed]
+\node (aa) [decoder] at (0,0) {};
-\tikzstyle{every node}=[scale=1]
+\node (y1y2b)[rectangle,anchor=south,inner sep=0.25em,densely dashed,draw] at ([yshift=-2.6em]aa.south) {$y_1\;y_2$};
-\node (y1)[yy]{\large$y_1$};
+\node (label)[anchor=south] at ([yshift=-2.1em]y1y2b.south) {\small{(b) 半自回归解码}};
-\node (y2)[yy,right of = y1,xshift=1.5cm]{\large$y_2$};
+\node (sos)[anchor=east] at ([xshift=-4.55em]y1y2b.east) {\small{<sos>}};
-\node (y3)[yy,right of = y2,xshift=2cm]{\large$y_3$};
+\node (y3y4b)[rectangle,anchor=west,inner sep=0.25em,densely dashed,draw] at ([xshift=4.7em]y1y2b.west) {$y_3\;y_4$};
-\node (y4)[yy,right of = y3,xshift=1.5cm]{\large$y_4$};
-\node (y5)[yy,right of = y4,xshift=2cm]{\large$y_5$};
+\node (y3y4a)[rectangle,anchor=north,inner sep=0.25em,densely dashed,draw] at ([yshift=2.6em]aa.north) {$y_3\;y_4$};
-\node (y6)[yy,right of = y5,xshift=1.5cm]{\large$y_6$};
+\node (y1y2a)[rectangle,anchor=west,inner sep=0.25em,densely dashed,draw] at ([xshift=-4.7em]y3y4a.west) {$y_1\;y_2$};
-\node (rec1)[rec,right of = y1,xshift=0.75cm]{};
+\node (eos)[anchor=east] at ([xshift=4.85em]y3y4a.east) {\small{<eos>}};
-\node (rec2)[rec,right of = y3,xshift=0.75cm]{};
-\node (rec3)[rec,right of = y5,xshift=0.75cm]{};
+\draw [->,very thick,dotted] ([xshift=-0em]y1y2a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0em]y1y2b.west);
-\node [anchor=north,font=\scriptsize] (labelb) at ([xshift=2.2em,yshift=-1em]y3.south) {(b) 半自回归模型};
+\draw [->,very thick,dotted] ([xshift=-0em]y3y4a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0em]y3y4b.west);
-\draw[->,red,very thick](rec1.east)to(rec2.west);
+\node (autodecoder)[decoder] at (0,0) {半自回归解码器};
-\draw[->,red,very thick](rec2.east)to(rec3.west);
+\draw [->,thick]([yshift=0.05em]sos.north) to ([yshift=1.38em]sos.north);
-\draw[->,thick] (rec1.north) .. controls ([yshift=2.5em]rec1.north) and ([yshift=2.5em]rec3.north)..  (rec3.north);
+\draw [->,thick]([yshift=0em]y1y2b.north) to ([yshift=1.38em]y1y2b.north);
+\draw [->,thick]([yshift=0em]y3y4b.north) to ([yshift=1.38em]y3y4b.north);
+\draw [->,thick]([yshift=-1.5em]y1y2a.south) to ([yshift=-0.02em]y1y2a.south);
+\draw [->,thick]([yshift=-1.5em]y3y4a.south) to ([yshift=-0.02em]y3y4a.south);
+\draw [->,thick]([yshift=-1.4em]eos.south) to ([yshift=-0.05em]eos.south);
 \end{scope}
-\begin{scope}[xshift=0.3in,yshift=-2.35in]
-\tikzstyle{every node}=[scale=1]
+\begin{scope}[yshift=-3.1in]
-\node (y1)[yy]{\large$y_1$};
+\node (aa) [decoder]at (0,0) {非自回归模型};
-\node (y2)[yy,right of = y1,xshift=1.5cm]{\large$y_2$};
+\node (y2b)[anchor=south] at ([xshift=-1.5em,yshift=-2.5em]aa.south) {$y_2$};
-\node (y3)[yy,right of = y2,xshift=1.5cm]{\large$y_3$};
+\node (label)[anchor=south] at ([yshift=-4.3em]aa.south) {\small{(c) 非自回归解码}};
-\node (y4)[yy,right of = y3,xshift=1.5cm]{\large$y_4$};
+\node (y1b)[anchor=east] at ([xshift=-3em]y2b.east) {$y_1$};
-\node (y5)[yy,right of = y4,xshift=1.5cm]{\large$y_5$};
+\node (y3b)[anchor=west] at ([xshift=3em]y2b.west) {$y_3$};
-\node (y6)[yy,right of = y5,xshift=1.5cm]{\large$y_6$};
+\node (y4b)[anchor=west] at ([xshift=6em]y2b.west) {$y_4$};
-\node [anchor=north,font=\scriptsize] (labelc) at ([xshift=1.5em,yshift=-0.5em]y3.south) {(c) 非自回归模型};
+\node (y2a)[anchor=north] at ([xshift=-1.5em,yshift=2.5em]aa.north) {$y_2$};
+\node (y1a)[anchor=east] at ([xshift=-3em]y2a.east) {$y_1$};
+\node (y3a)[anchor=west] at ([xshift=3em]y2a.west) {$y_3$};
+\node (y4a)[anchor=west] at ([xshift=6em]y2a.west) {$y_4$};
+\draw [->,thick]([yshift=0em]y1b.north) to ([yshift=1.15em]y1b.north); 
+\draw [->,thick]([yshift=0em]y2b.north) to ([yshift=1.15em]y2b.north);
+\draw [->,thick]([yshift=0em]y3b.north) to ([yshift=1.15em]y3b.north);
+\draw [->,thick]([yshift=0em]y4b.north) to ([yshift=1.15em]y4b.north);
+\draw [->,thick]([yshift=-1.2em]y1a.south) to (y1a.south); 
+\draw [->,thick]([yshift=-1.2em]y2a.south) to (y2a.south);
+\draw [->,thick]([yshift=-1.2em]y3a.south) to (y3a.south);
+\draw [->,thick]([yshift=-1.2em]y4a.south) to (y4a.south);
 \end{scope}
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter14/Figures/figure-syntax.tex
+++ b/Chapter14/Figures/figure-syntax.tex
-\definecolor{Melon}{rgb}{0.99, 0.74, 0.71}
+\begin{tikzpicture}
-\definecolor{Goldenrod}{rgb}{0.85, 0.65, 0.13}
+\tikzstyle{encoder} = [rectangle,thick,rounded corners,minimum width=1.9cm,minimum height=1.2cm,text centered,draw=black,fill=red!25]
-\definecolor{Cerulean}{rgb}{0, 0.48, 0.65}
+\tikzstyle{autodecoder} = [rectangle,thick,rounded corners,minimum width=3cm,minimum height=1.2cm,text centered,draw=black,fill=blue!15]
-\definecolor{Gray}{rgb}{0.5, 0.5, 0.5}
+\tikzstyle{nonautodecoder} = [rectangle,thick,rounded corners,minimum width=3.4cm,minimum height=1.2cm,text centered,draw=black!70,fill=blue!15]
-\definecolor{aliceblue}{rgb}{0.94, 0.97, 1.0}
-\tikzstyle{emb} = [rectangle,very thick,rounded corners,minimum width=3cm,minimum height=0.85cm,text centered,draw=black!70,fill=red!15]
+\node (encoder)[encoder] at (0,0) {编码器};
-\tikzstyle{sa} = [rectangle,very thick,rounded corners,minimum width=3cm,minimum height=1cm,text centered,draw=black!70,fill=yellow!20]
+\node (text_left)[anchor=south] at ([yshift=-3em]encoder.south) {\footnotesize{猫\ 在\ 熟睡}};
-\tikzstyle{edsa} = [rectangle,very thick,rounded corners,minimum width=3cm,minimum height=1.5cm,text centered,align=center,draw=black!70,fill=yellow!20]
+\node (autodecoder)[autodecoder,right of=encoder,xshift=6em ] {自回归解码器};
-\tikzstyle{an} = [rectangle,very thick,rounded corners,minimum width=3.5cm,minimum height=0.7cm,text centered,draw=black!70,fill=aliceblue]
+\node (text_mid1)[anchor=north] at ([yshift=3em]autodecoder.north) {\scriptsize{NP1\ VP3\ <eos>}};
-\tikzstyle{ff} = [rectangle,very thick,rounded corners,minimum width=3cm,minimum height=1cm,text centered,align=center,draw=black!70,fill=orange!20]
+\node (text_mid2)[anchor=south] at ([yshift=-3em]autodecoder.south) {\scriptsize{<sos>\ NP1\ VP3}};
-\tikzstyle{linear} = [rectangle,very thick,rounded corners,minimum width=3cm,minimum height=0.7cm,text centered,draw=black!70,fill=green!20]
+\node (nonautodecoder)[nonautodecoder,right of=autodecoder,xshift=10.5em ] {非自回归解码器};
-\tikzstyle{softmax} = [rectangle,very thick,rounded corners,minimum width=3cm,minimum height=0.7cm,text centered,draw=black!70,fill=blue!20]
+\node (text_right1)[anchor=north] at ([yshift=3em]nonautodecoder.north) {\scriptsize{NP1\;Cats\;VP3\;sleep\;a\;lot}};
-\begin{tikzpicture}[node distance = 0,scale = 0.7]
+\node (text_right2)[anchor=south] at ([yshift=-3em]nonautodecoder.south) {\scriptsize{NP1\;<Mask>\;VP3\;<Mask>\;<Mask>\;<Mask>}};
-\tikzstyle{every node}=[scale=0.7]
-%left
-\node(left_Emb)[emb]{\footnotesize{\textbf{Input Embedding}}};
+\draw[->,thick] ([yshift=0.1em]text_left.north) to (encoder.south);
-\node(left_cir)[circle,very thick,minimum width=0.5cm,draw=black!70,above of = left_Emb,yshift=1.1cm]{};
+\draw[->,thick] ([yshift=0.1em]text_mid2.north) to (autodecoder.south);
-\draw[-,very thick,draw=black!70]([xshift=0.03cm]left_cir.west)--([xshift=-0.03cm]left_cir.east);
+\draw[->,thick] (autodecoder.north) to ([yshift=-0.1em]text_mid1.south);
-\draw[-,very thick,draw=black!70]([yshift=-0.03cm]left_cir.north)--([yshift=0.03cm]left_cir.south);
+\draw[->,thick] ([yshift=0.1em]text_right2.north) to (nonautodecoder.south);
-\node(left_cir2)[circle,very thick,minimum width=0.5cm,draw=black!70,left of = left_cir,xshift=-1.5cm]{};
+\draw[->,thick] (nonautodecoder.north) to ([yshift=-0.1em]text_right1.south);
-\draw[very thick,draw=black!70]([xshift=0.04cm]left_cir2.west)sin([xshift=0.14cm,yshift=0.08cm]left_cir2.west)cos([xshift=0.25cm]left_cir2.west)sin([xshift=0.36cm,yshift=-0.08cm]left_cir2.west)cos([xshift=-0.03cm]left_cir2.east);
+\draw[->,thick] (text_mid1.east) -- ([xshift=2.1em]text_mid1.east) -- ([xshift=-1.2em]text_right2.west)-- (text_right2.west);
-\draw [->,very thick,draw=black!70](left_cir2.east)--(left_cir.west);
+\draw[-,thick] (encoder.north) to ([yshift=0.8em]encoder.north);
-\node(left_Self)[sa,above of = left_cir,yshift=1.6cm]{\textbf{Self-Attention}};
+\draw[-,thick,dashed] ([yshift=0.8em]encoder.north) -- ([xshift=-7em,yshift=0.8em]nonautodecoder.north) -- ([xshift=-2.5em]nonautodecoder.west);
-\node(left_Add_bottom)[an,above of = left_Self,yshift=1.1cm]{\textbf{Add \& LayerNorm}};
+\draw[->,thick]([xshift=-2.5em]nonautodecoder.west) to (nonautodecoder.west);
-\node(left_Feed)[ff,above of = left_Add_bottom,yshift=1.2cm]{\textbf{Feed Forward}\\\textbf{Network}};
-\node(left_Add_top)[an,above of = left_Feed,yshift=1.1cm]{\textbf{Add \& LayerNorm}};
-\node(left_text_bottom)[below of = left_Emb,xshift=0cm,yshift=-1.2cm,scale=1]{\small\sffamily\bfseries{我\quad 爱\quad  我的\quad  狗}};
-\draw [->,very thick,draw=black!70]([yshift=-0.5cm]left_Emb.south)--(left_Emb.south);
-\draw [->,very thick,draw=black!70](left_Emb.north)--(left_cir.south);
-\draw [->,very thick,draw=black!70](left_cir.north)--(left_Self.south);
-\draw [->,very thick,draw=black!70](left_Self.north)--(left_Add_bottom.south);
-\draw [->,very thick,draw=black!70](left_Add_bottom.north)--(left_Feed.south);
-\draw [->,very thick,draw=black!70](left_Feed.north)--(left_Add_top.south);
-\draw [->,very thick,draw=black!70]([yshift=0.35cm]left_cir.north)--([xshift=-2.27cm,yshift=0.35cm]left_cir.north)--([xshift=-0.5cm]left_Add_bottom.west)--(left_Add_bottom.west);
-\draw [->,very thick,draw=black!70]([yshift=0.1cm]left_Add_bottom.north)--([xshift=-2.27cm,yshift=0.1cm]left_Add_bottom.north)--([xshift=-0.5cm]left_Add_top.west)--(left_Add_top.west);
-\draw[->,very thick,draw=black!70,in=250,out=0] ([yshift=0.5cm]left_cir.north)to([xshift=0.9cm]left_Self.south);
-\draw[->,very thick,draw=black!70,in=290,out=180] ([yshift=0.5cm]left_cir.north)to([xshift=-0.9cm]left_Self.south);
-%middle
-\node(Emb)[emb,right of = left_Emb,xshift=5cm]{\footnotesize{\textbf{Parser Embedding}}};
-\node(cir)[circle,very thick,draw=black!70,minimum width=0.5cm,above of = Emb,yshift=1.1cm]{};
-\draw[-,very thick,draw=black!70]([xshift=0.03cm]cir.west)--([xshift=-0.03cm]cir.east);
-\draw[-,very thick,draw=black!70]([yshift=-0.03cm]cir.north)--([yshift=0.03cm]cir.south);
-\node(cir2)[circle,very thick,minimum width=0.5cm,draw=black!70,right of = cir,xshift=1.5cm]{};
-\draw[very thick,draw=black!70]([xshift=0.04cm]cir2.west)sin([xshift=0.14cm,yshift=0.08cm]cir2.west)cos([xshift=0.25cm]cir2.west)sin([xshift=0.36cm,yshift=-0.08cm]cir2.west)cos([xshift=-0.03cm]cir2.east);
-\node(Self)[sa,above of = cir,yshift=1.6cm]{\textbf{Self-Attention}};
-\node(Add_bottom)[an,above of = Self,yshift=1.1cm]{\textbf{Add \& LayerNorm}};
-\node(ED_Self)[edsa,above of = Add_bottom,yshift=1.8cm]{\textbf{Encoder-Decoder}\\ \textbf{Attention}};
-\node(Add_mid)[an,above of = ED_Self,yshift=1.35cm]{\textbf{Add \& LayerNorm}};
-\node(Feed)[ff,above of = Add_mid,yshift=1.2cm]{\textbf{Feed Forward}\\ \textbf{Network}};
-\node(Add_top)[an,above of = Feed,yshift=1.1cm]{\textbf{Add \& LayerNorm}};
-\node(Linear)[linear,above of = Add_top,yshift=1.3cm]{\textbf{Linear}};
-\node(Softmax)[softmax,above of = Linear,yshift=1cm]{\textbf{Softmax}};
-\node(text_bottom)[below of = Emb,xshift=0.2cm,yshift=-1.2cm,scale=0.9]{\textbf{VP1\ \ VP3\ \  <eos>}};
-\node(text_top)[above of = Softmax,xshift=0.2cm,yshift=1.2cm,scale=0.9]{\textbf{VP1\ \ VP3\ \  <eos>}};
-\draw [->,very thick,draw=black!70]([yshift=-0.5cm]Emb.south)--(Emb.south);
-\draw [->,very thick,draw=black!70]([xshift=0.9cm,yshift=-0.5cm]Emb.south)--([xshift=0.9cm]Emb.south);
-\draw [->,very thick,draw=black!70]([xshift=-0.9cm,yshift=-0.5cm]Emb.south)--([xshift=-0.9cm]Emb.south);
-\draw [->,very thick,draw=black!70](Emb.north)--(cir.south);
-\draw [->,very thick,draw=black!70](cir.north)--(Self.south);
-\draw [->,very thick,draw=black!70](cir2.west)--(cir.east);
-\draw[->,very thick,draw=black!70,in=250,out=0] ([yshift=0.5cm]cir.north)to([xshift=0.9cm]Self.south);
-\draw[->,very thick,draw=black!70,in=290,out=180] ([yshift=0.5cm]cir.north)to([xshift=-0.9cm]Self.south);
-\draw [->,very thick,draw=black!70](Self.north)--(Add_bottom.south);
-\draw [->,very thick,draw=black!70](ED_Self.north)--(Add_mid.south);
-\draw [->,very thick,draw=black!70](Add_mid.north)--(Feed.south);
-\draw [->,very thick,draw=black!70](Feed.north)--(Add_top.south);
-\draw [->,very thick,draw=black!70](Add_top.north)--(Linear.south);
-\draw [->,very thick,draw=black!70](Linear.north)--(Softmax.south);
-\draw [->,very thick,draw=black!70](Softmax.north)--([yshift=0.5cm]Softmax.north);
-\draw [->,very thick,draw=black!70]([yshift=0.35cm]cir.north)--([xshift=2.27cm,yshift=0.35cm]cir.north)--([xshift=0.5cm]Add_bottom.east)--(Add_bottom.east);
-\draw [->,very thick,draw=black!70]([yshift=0.1cm]Add_mid.north)--([xshift=2.27cm,yshift=0.1cm]Add_mid.north)--([xshift=0.5cm]Add_top.east)--(Add_top.east);
-\draw [->,very thick,draw=black!70](left_Add_top.north)--([yshift=0.6cm]left_Add_top.north)--([xshift=2.27cm,yshift=0.6cm]left_Add_top.north)--([xshift=2.27cm,yshift=-2cm]left_Add_top.north)--([xshift=5cm,yshift=-2cm]left_Add_top.north)--(ED_Self.south);
-\draw [->,very thick,draw=black!70]([xshift=0.9cm,yshift=-0.45cm]ED_Self.south)--([xshift=0.9cm]ED_Self.south);
-\draw [->,very thick,draw=black!70]([xshift=-0.9cm,yshift=-0.35cm]ED_Self.south)--([xshift=-0.9cm]ED_Self.south);
-\draw [->,very thick,draw=black!70](Add_bottom.north)--([yshift=0.2cm]Add_bottom.north)--([xshift=2.27cm,yshift=0.2cm]Add_bottom.north)--([xshift=0.5cm]Add_mid.east)--(Add_mid.east);
-%right
-\node(right_Emb)[emb,right of = Emb,xshift=5.5cm]{\footnotesize{\textbf{Parser Embedding}}};
-\node(right_cir)[circle,very thick,minimum width=0.5cm,draw=black!70,above of = right_Emb,yshift=1.1cm]{};
-\draw[-,very thick,draw=black!70]([xshift=0.03cm]right_cir.west)--([xshift=-0.03cm]right_cir.east);
-\draw[-,very thick,draw=black!70]([yshift=-0.03cm]right_cir.north)--([yshift=0.03cm]right_cir.south);
-\node(right_cir2)[circle,very thick,minimum width=0.5cm,draw=black!70,right of = right_cir,xshift=1.5cm]{};
-\draw[very thick,draw=black!70]([xshift=0.04cm]right_cir2.west)sin([xshift=0.14cm,yshift=0.08cm]right_cir2.west)cos([xshift=0.25cm]right_cir2.west)sin([xshift=0.36cm,yshift=-0.08cm]right_cir2.west)cos([xshift=-0.03cm]right_cir2.east);
-\node(right_Self)[sa,above of = right_cir,yshift=1.6cm]{\textbf{Self-Attention}};
-\node(right_Add_bottom)[an,above of = right_Self,yshift=1.1cm]{\textbf{Add \& LayerNorm}};
-\node(right_ED_Self)[edsa,above of = right_Add_bottom,yshift=1.8cm]{\textbf{Encoder-Decoder}\\\textbf{Attention}};
-\node(right_Add_mid)[an,above of = right_ED_Self,yshift=1.35cm]{\textbf{Add \& LayerNorm}};
-\node(right_Feed)[ff,above of = right_Add_mid,yshift=1.2cm]{\textbf{Feed Forward}\\\textbf{Network}};
-\node(right_Add_top)[an,above of = right_Feed,yshift=1.1cm]{\textbf{Add \& LayerNorm}};
-\node(right_Linear)[linear,above of = right_Add_top,yshift=1.3cm]{\textbf{Linear}};
-\node(right_Softmax)[softmax,above of = right_Linear,yshift=1cm]{\textbf{Softmax}};
-\node(right_text_bottom)[below of = right_Emb,xshift=1.2cm,yshift=-1.2cm,scale=0.8]{\textbf{VP1 <Mask> VP3 <Mask> <Mask> <Mask>}};
-\node(right_text_top)[above of = right_Softmax,xshift=0cm,yshift=1.2cm,scale=0.9]{\textbf{VP1 I VP3 love my dog}};
-\draw [->,very thick,draw=black!70]([yshift=-0.5cm]right_Emb.south)--(right_Emb.south);
-\draw [->,very thick,draw=black!70](right_Emb.north)--(right_cir.south);
-\draw [->,very thick,draw=black!70](right_cir.north)--(right_Self.south);
-\draw [->,very thick,draw=black!70](right_cir2.west)--(right_cir.east);
-\draw[->,very thick,draw=black!70,in=250,out=0] ([yshift=0.5cm]right_cir.north)to([xshift=0.9cm]right_Self.south);
-\draw[->,very thick,draw=black!70,in=290,out=180] ([yshift=0.5cm]right_cir.north)to([xshift=-0.9cm]right_Self.south);
-\draw [->,very thick,draw=black!70](right_Self.north)--(right_Add_bottom.south);
-\draw [->,very thick,draw=black!70](right_ED_Self.north)--(right_Add_mid.south);
-\draw [->,very thick,draw=black!70](right_Add_mid.north)--(right_Feed.south);
-\draw [->,very thick,draw=black!70](right_Feed.north)--(right_Add_top.south);
-\draw [->,very thick,draw=black!70](right_Add_top.north)--(right_Linear.south);
-\draw [->,very thick,draw=black!70](right_Linear.north)--(right_Softmax.south);
-\draw [->,very thick,draw=black!70](right_Softmax.north)--([yshift=0.5cm]right_Softmax.north);
-\draw [->,very thick,draw=black!70]([yshift=0.35cm]right_cir.north)--([xshift=2.27cm,yshift=0.35cm]right_cir.north)--([xshift=0.5cm]right_Add_bottom.east)--(right_Add_bottom.east);
-\draw [->,very thick,draw=black!70]([yshift=0.1cm]right_Add_mid.north)--([xshift=2.27cm,yshift=0.1cm]right_Add_mid.north)--([xshift=0.5cm]right_Add_top.east)--(right_Add_top.east);
-\draw [->,very thick,draw=black!70]([xshift=0.9cm,yshift=-0.45cm]right_ED_Self.south)--([xshift=0.9cm]right_ED_Self.south);
-\draw [->,very thick,draw=black!70]([xshift=-0.9cm,yshift=-0.35cm]right_ED_Self.south)--([xshift=-0.9cm]right_ED_Self.south);
-\draw [-,very thick,dashed,draw=black!70]([xshift=2.2cm,yshift=0.6cm]left_Add_top.north)--([xshift=2.2cm,yshift=3.5cm]left_Add_top.north)--([xshift=8cm,yshift=3.5cm]left_Add_top.north)--([xshift=8cm,yshift=-2cm]left_Add_top.north);
-\draw [->,very thick,draw=black!70](right_Add_bottom.north)--([yshift=0.2cm]right_Add_bottom.north)--([xshift=2.27cm,yshift=0.2cm]right_Add_bottom.north)--([xshift=0.5cm]right_Add_mid.east)--(right_Add_mid.east);
-\draw [->,very thick,draw=black!70]([xshift=8cm,yshift=-2cm]left_Add_top.north)--([yshift=0.3cm]right_Add_bottom.north)--(right_ED_Self.south);
-\draw [->,very thick,draw=black!70](Softmax.east)--([xshift=1.2cm]Softmax.east)--([xshift=1.2cm,yshift=-12.75cm]Softmax.east)--([xshift=2cm,yshift=-12.75cm]Softmax.east);
-%module
-\node(left_module)[rectangle,very thick,rounded corners,minimum width=4.5cm,minimum height=5.3cm,text centered,draw=black!70,above of = left_Emb,xshift=-0.25cm,yshift=4.1cm]{};
-\node(module)[rectangle,very thick,rounded corners,minimum width=4.5cm,minimum height=8.4cm,text centered,draw=black!70,above of = Emb,xshift=0.25cm,yshift=5.65cm]{};
-\node(right_module)[rectangle,very thick,rounded corners,minimum width=4.5cm,minimum height=8.4cm,text centered,draw=black!70,above of = right_Emb,xshift=0.25cm,yshift=5.65cm]{};
-\node(N)[right of = right_ED_Self,xshift=3cm,yshift=-3.7cm,scale=1.3]{\textbf{$N\times$}};
-\node(left_N)[left of = left_Feed,xshift=-3cm,yshift=-3cm,scale=1.3]{\textbf{$N\times$}};
-\node(M)[left of = ED_Self,xshift=-2.5cm,yshift=-3.7cm,scale=1.3]{\textbf{$M\times$}};
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter14/chapter14.tex
+++ b/Chapter14/chapter14.tex
@@ -60,7 +60,7 @@
 \vspace{0.5em}
 \end{itemize}
-\parinterval 预测模块是由模型决定的，而搜索模块可以与模型无关。也就是说，不同的模型可以共享同一个搜索模块完成推断。比如，对于基于循环神经网络的模型，预测模块需要读入前一个状态的信息和前一个位置的译文单词，然后预测当前位置单词的概率分布；对于Transformer，预测模块需要对前面的所有位置做注意力运算，之后预测当前位置的单词概率分布。不过，这两个模型都可以使用同一个搜索模块。图\ref{fig:14-1}给出了这种架构的示意图。
+\parinterval 预测模块是由模型决定的，而搜索模块可以与模型无关。也就是说，不同的模型可以共享同一个搜索模块完成推断。比如，对于基于循环神经网络的模型，预测模块需要读入前一个状态的信息和前一个位置的译文单词，然后预测当前位置单词的概率分布；对于Transformer，预测模块需要对前面的所有位置做注意力运算，之后预测当前位置单词的概率分布。不过，这两个模型都可以使用同一个搜索模块。图\ref{fig:14-1}给出了这种架构的示意图。
 %----------------------------------------------
 \begin{figure}[htp]
@@ -73,13 +73,13 @@
 \parinterval 这是一个非常通用的框架，同样适用于基于统计的机器翻译模型。因此，神经机器翻译推断中的很多问题与统计机器翻译是一致的，比如：束搜索的宽度、解码终止条件等等。
-\parinterval 一般来说，机器翻译推断系统的设计要考虑三个因素：搜索的准确性、搜索的时延、搜索所需要的存储。通常，准确性是研究人员最关心的问题，比如可以通过增大搜索空间来找到模型得分更高的结果。而搜索的时延和存储消耗是实践中必须要考虑的问题，比如可以设计合理的搜索终止条件降低搜索时延。
+\parinterval 一般来说，机器翻译推断系统的设计要考虑三个因素：搜索的准确性、搜索的时延、搜索所需要的存储。通常，准确性是研究人员最关心的问题，比如可以通过增大搜索空间来找到模型得分更高的结果。而搜索的时延和存储消耗是实践中必须要考虑的问题，因此可以设计合理的搜索终止条件降低搜索时延。
 \parinterval 虽然，上述问题在统计机器翻译中都有讨论，但是在神经机器翻译中又面临着新的挑战。
 \begin{itemize}
 \vspace{0.5em}
-\item 搜索的基本问题在神经机器翻译中有着特殊的现象。比如，在统计机器翻译中，降低搜索错误是提升翻译效果的一种手段。但是神经机器翻译中，简单的降低搜索错误可能无法带来性能的提升，甚至会造成翻译品质的下降\upcite{li-etal-2018-simple,Stahlberg2019OnNS}；
+\item 搜索的基本问题在神经机器翻译中有着特殊的现象。比如，在统计机器翻译中，降低搜索错误是提升翻译品质的一种手段。但是在神经机器翻译中，简单地降低搜索错误可能无法带来性能的提升，甚至会造成翻译品质的下降\upcite{li-etal-2018-simple,Stahlberg2019OnNS}；
 \vspace{0.5em}
 \item 搜索的时延很高，系统实际部署的成本很高。与统计机器翻译系统不同的是，神经机器翻译依赖大量的浮点运算。这导致神经机器翻译系统的推断会比统计机器翻译系统慢很多。虽然可以使用GPU来加快神经机器翻译的推断速度，但是也大大增加了成本；
 \vspace{0.5em}
@@ -87,7 +87,7 @@
 \vspace{0.5em}
 \end{itemize}
-\parinterval 研究人员也针对以上问题开展了大量的研究工作。在\ref{sec:14-2}节中，本章会对神经机器翻译推断中所涉及的一些基本问题进行讨论。虽然这些问题在统计机器翻译中均有涉及，但是在神经机器翻译中却有着不同的现象和解决思路。在\ref{sec:14-3}-\ref{sec:14-5}节中，会针对如何改进神经机器翻译推断效率和怎样进行多模型融合这两个问题展开讨论。
+\parinterval 研究人员也针对以上问题开展了大量的研究工作。在\ref{sec:14-2}节中，会对神经机器翻译推断中所涉及的一些基本问题进行讨论。虽然这些问题在统计机器翻译中均有涉及，但是在神经机器翻译中却有着不同的现象和解决思路。在\ref{sec:14-3}-\ref{sec:14-5}节中，会针对如何改进神经机器翻译推断效率和怎样进行多模型融合这两个问题展开讨论。
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
@@ -114,7 +114,7 @@
 \parinterval 自右向左的翻译可以被描述为公式\eqref{eq:14-2}：
 \begin{eqnarray}
-\funp{P}(\seq{y}\vert\seq{x}) &=&\prod_{j=1}^n \funp{P}(y_{j}\vert\seq{y}_{>j},\seq{x})
+\funp{P}(\seq{y}\vert\seq{x}) &=&\prod_{j=1}^n \funp{P}(y_{n+1-j}\vert\seq{y}_{>n+1-j},\seq{x})
 \label{eq:14-2}
 \end{eqnarray}
@@ -124,7 +124,7 @@
 \vspace{0.5em}
 \item {\small\sffamily\bfseries{重排序}}\index{重排序}（Reranking）\index{Reranking}。可以用一个基础模型（比如自左向右的模型）得到每个源语言句子的$n$-best翻译结果，之后同时用基础模型的得分和自右向左模型的得分对$n$-best翻译结果进行重排序\upcite{Liu2016AgreementOT,DBLP:conf/wmt/SennrichHB16,DBLP:conf/wmt/LiLXLLLWZXWFCLL19}。也有研究人员利用最小贝叶斯风险的方法进行重排序\upcite{Stahlberg2018TheUO}。由于这类方法不会改变基础模型的翻译过程，因此相对“安全”，不会对系统性能造成副作用。
 \vspace{0.5em}
-\item {\small\sffamily\bfseries{双向推断}}\index{双向推断}（Bidirectional Inference）\index{Bidirectional Inference}。除了自左向右推断和自右向左推断，另一种方法让自左向右和自右向左模型同步进行，也就是同时考虑译文左侧和右侧的文字信息\upcite{DBLP:conf/aaai/ZhangSQLJW18,Zhou2019SynchronousBN,DBLP:conf/aaai/ZhangSQLJW18}。例如，可以同时对左边和右边生成的译文进行注意力计算，得到当前位置的单词预测结果。这种方法能够更加充分地融合双向翻译的优势。
+\item {\small\sffamily\bfseries{双向推断}}\index{双向推断}（Bidirectional Inference）\index{Bidirectional Inference}。除了自左向右推断和自右向左推断，另一种方法是让自左向右和自右向左模型同步进行，也就是同时考虑译文左侧和右侧的文字信息\upcite{DBLP:conf/aaai/ZhangSQLJW18,Zhou2019SynchronousBN}。例如，可以同时对左侧和右侧生成的译文进行注意力计算，得到当前位置的单词预测结果。这种方法能够更加充分地融合双向翻译的优势。
 \vspace{0.5em}
 \item {\small\sffamily\bfseries{多阶段推断}}\index{多阶段推断}（Multi-stage Inference）\index{Multi-stage Inference}。在第一阶段，通过一个基础模型生成一个初步的翻译结果。在第二阶段，同时使用第一阶段生成的翻译结果和源语言句子，进一步生成更好的译文\upcite{Li2017EnhancedNM,ElMaghraby2018EnhancingTF,Geng2018AdaptiveMD}。由于第一阶段的结果已经包含了完整的译文信息，因此在第二阶段中，系统实际上已经同时使用了整个译文串的两端信息。上述过程可以扩展为迭代式的译文生成方法，配合掩码等技术，可以在生成每个译文单词时，同时考虑左右两端的上下文信息\upcite{Lee2018DeterministicNN,Gu2019LevenshteinT,Guo2020JointlyMS}。
 \vspace{0.5em}
@@ -140,7 +140,7 @@
 \subsection{译文长度控制}
-\parinterval 机器翻译推断的一个特点是译文长度需要额外的机制进行控制\upcite{Kikuchi2016ControllingOL,Takase2019PositionalET,Murray2018CorrectingLB,Sountsov2016LengthBI}。这是因为机器翻译在建模时仅考虑了将训练样本（即标准答案）上的损失最小化，但是推断的时候会看到从未见过的样本，而且这些未见样本占据了样本空间的绝大多数。这时，模型会产生偏置，即模型仅仅能够对见过的样本进行准确建模，而对于未见样本的建模并不准确。该问题导致的一个现象是：直接使用训练好的模型会翻译出长度短得离谱的译文。神经机器翻译模型使用单词概率的乘积表示整个句子的翻译概率，它天然就倾向生成短译文，因为概率为大于0小于1的常数，短译文会使用更少的概率因式相乘，倾向于得到更高的句子得分，而模型只关心每个目标语位置是否被正确预测，对于译文长度没有考虑。译文长度不合理的问题也出现在统计机器翻译模型中，当时的策略是在推断过程中引入译文长度控制机制\upcite{Koehn2007Moses}。神经机器翻译也借用了类似的思想来控制译文长度，有以下几种方法：
+\parinterval 机器翻译推断的一个特点是译文长度需要额外的机制进行控制\upcite{Kikuchi2016ControllingOL,Takase2019PositionalET,Murray2018CorrectingLB,Sountsov2016LengthBI}。这是因为机器翻译在建模时仅考虑了将训练样本（即标准答案）上的损失最小化，但是推断的时候会看到从未见过的样本，而且这些未见样本占据了样本空间的绝大多数。这时，模型会产生偏置，即模型仅仅能够对见过的样本进行准确建模，而对于未见样本的建模并不准确。该问题导致的一个现象是：直接使用训练好的模型会翻译出长度短得离谱的译文。神经机器翻译模型使用单词概率的乘积表示整个句子的翻译概率，它天然就倾向生成短译文，因为概率为大于0小于1的常数，短译文会使用更少的概率因式相乘，倾向于得到更高的句子得分，而模型只关心每个目标语言位置是否被正确预测，对于译文长度没有考虑。译文长度不合理的问题也出现在统计机器翻译模型中，当时的策略是在推断过程中引入译文长度控制机制\upcite{Koehn2007Moses}。神经机器翻译也借用了类似的思想来控制译文长度，有以下几种方法：
 \begin{itemize}
 \vspace{0.5em}
@@ -151,7 +151,7 @@
 \label{eq:14-12}
 \end{eqnarray}
-其中$\vert\seq{y}\vert$表示译文长度，通常$\textrm{lp}(\seq{y})$随$\vert\seq{y}\vert$的增大而增大，因此这种方式相当于对$\log \funp{P}(\seq{y}\vert\seq{x})$按长度进行归一化\upcite{Jean2015MontrealNM}。$\textrm{lp}(\seq{y})$的定义方式有很多，表\ref{tab:14-1}列出了一些常用的形式，其中$\alpha$是需要人为设置的参数。
+通常$\textrm{lp}(\seq{y})$随$\vert\seq{y}\vert$的增大而增大（其中$\vert\seq{y}\vert$表示译文长度），因此这种方式相当于对$\log \funp{P}(\seq{y}\vert\seq{x})$按长度进行归一化\upcite{Jean2015MontrealNM}。$\textrm{lp}(\seq{y})$的定义方式有很多，表\ref{tab:14-1}列出了一些常用的形式，其中$\alpha$是需要人为设置的参数。
 %----------------------------------------------------------------------------------------------------
 \begin{table}[htp]
@@ -246,7 +246,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \parinterval 机器翻译的错误分为两类：搜索错误和模型错误。搜索错误是指由于搜索算法的限制，即使潜在的搜索空间中有更好的解，模型也无法找到。比较典型的例子是，在对搜索结果进行剪枝的时候，如果剪枝过多，找到的结果很有可能不是最优的。这时就出现了搜索错误。而模型错误则是指由于模型学习能力的限制，潜在的搜索空间过小而无法将最优解包含其中。
-\parinterval 在统计机器翻译中，搜索错误可以通过减少剪枝进行缓解。比较简单的方式是增加搜索束宽度，这往往会带来一定的性能提升\upcite{Xiao2016ALA}。也可以对搜索问题进行单独建模，以保证学习到的模型出现更少的搜索错误\upcite{Liu2014SearchAwareTF,Yu2013MaxViolationPA}。但是，在神经机器翻译中，这个问题却表现出不同的现象：在很多神经机器翻译系统中，随着搜索束的增大，系统的BLEU不升反降。图\ref{fig:14-3}展示了BLEU随束大小的变化曲线\footnote{为了使该图更加规整直观，横坐标处将束大小进行了取对数操作。}。这个现象与传统的常识是相违背的，因此也有一些研究尝试解释这个现象\upcite{Stahlberg2019OnNS,Niehues2017AnalyzingNM}。在实验中，研究人员也发现增加搜索束的大小会导致翻译生成的结果变得更短。他们将这个现象归因于：神经机器翻译的建模基于局部归一的最大似然估计，增加搜索束的大小，会导致更多的模型错误\upcite{Sountsov2016LengthBI,Murray2018CorrectingLB,StahlbergNeural}。因为此外，也有研究人员把这种翻译过短的现象归因于搜索错误\upcite{Stahlberg2019OnNS}。由于搜索时所面临的搜索空间是十分巨大的，因此搜索时可能无法找到模型定义的“最好”的译文。在某种意义上，这也体现了训练和推断不一致的问题。
+\parinterval 在统计机器翻译中，搜索错误可以通过减少剪枝进行缓解。比较简单的方式是增加搜索束宽度，这往往会带来一定的性能提升\upcite{Xiao2016ALA}。也可以对搜索问题进行单独建模，以保证学习到的模型出现更少的搜索错误\upcite{Liu2014SearchAwareTF,Yu2013MaxViolationPA}。但是，在神经机器翻译中，这个问题却表现出不同的现象：在很多神经机器翻译系统中，随着搜索束的增大，系统的BLEU不升反降。图\ref{fig:14-3}展示了BLEU随束大小的变化曲线\footnote{为了使该图更加规整直观，横坐标处将束大小进行了取对数操作。}。这个现象与传统的常识是相违背的，因此也有一些研究尝试解释这个现象\upcite{Stahlberg2019OnNS,Niehues2017AnalyzingNM}。在实验中，研究人员也发现增加搜索束的大小会导致翻译生成的结果变得更短。他们将这个现象归因于：神经机器翻译的建模基于局部归一的最大似然估计，增加搜索束的大小，会导致更多的模型错误\upcite{Sountsov2016LengthBI,Murray2018CorrectingLB,StahlbergNeural}。此外，也有研究人员把这种翻译过短的现象归因于搜索错误\upcite{Stahlberg2019OnNS}。由于搜索时所面临的搜索空间是十分巨大的，因此搜索时可能无法找到模型定义的“最好”的译文。在某种意义上，这也体现了训练和推断不一致的问题。
 %----------------------------------------------------------------------
 \begin{figure}[htp]
@@ -257,7 +257,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \end{figure}
 %----------------------------------------------------------------------
-\parinterval 一种解决问题的思路是从训练和推断的行为和目标不一致的角度切入。比如，为了解决{\small\sffamily\bfseries{曝光偏置}}\index{曝光偏置}（Exposure Bias）\index{Exposure Bias}问题\upcite{Ranzato2016SequenceLT}，可以让系统使用前面步骤的预测结果作为预测下一个词所需要的历史信息，而不是依赖于标准答案\upcite{Bengio2015ScheduledSF,Zhang2019BridgingTG}。此外，为了解决训练和推断目标不一致的问题，可以在训练的时候模拟推断的行为，同时让模型训练的目标与评价系统的标准尽可能一致\upcite{DBLP:conf/acl/ShenCHHWSL16}。
+\parinterval 一种解决问题的思路是从训练和推断的行为和目标不一致的角度切入。比如，为了解决曝光偏置问题\upcite{Ranzato2016SequenceLT}，可以让系统使用前面步骤的预测结果作为预测下一个词所需要的历史信息，而不是依赖于标准答案\upcite{Bengio2015ScheduledSF,Zhang2019BridgingTG}。为了解决训练和推断目标不一致的问题，可以在训练的时候模拟推断的行为，同时让模型训练的目标与评价系统的标准尽可能一致\upcite{DBLP:conf/acl/ShenCHHWSL16}。
 \parinterval 需要注意的是，前面提到的搜索束变大造成的翻译品质下降的问题还有其它解决方法。比如，可以通过对结果重排序来缓解这个问题\upcite{DBLP:conf/emnlp/Yang0M18}，也可以通过设计更好的覆盖度模型来生成长度更加合理的译文\upcite{li-etal-2018-simple}。从这个角度说，上述问题的成因也较为复杂，因此需要同时考虑模型错误和搜索错误。
@@ -329,7 +329,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsection{轻量解码器及小模型}
-\parinterval 在推断时，神经机器翻译的解码器是最耗时的，因为每个目标语言位置需要单独输出单词的分布，同时在搜索过程中每一个翻译假设都要被扩展成多个翻译假设，进一步增加了计算量。因此，另一种思路是使用更加轻量的解码器加快翻译假设的生成速度\upcite{DBLP:journals/corr/HintonVD15,Munim2019SequencelevelKD}。
+\parinterval 在推断时，神经机器翻译的解码器是最耗时的，因为每个目标语言位置需要单独输出单词的分布，同时在搜索过程中每一个翻译假设都要被扩展成多个翻译假设，进一步增加了计算量。因此，另一种思路是使用更加轻量的解码器加快翻译假设的生成速度\upcite{Hinton2015Distilling,Munim2019SequencelevelKD}。
 \parinterval 比较简单的做法是把解码器的网络变得更“浅”、更“窄”。所谓浅网络是指使用更少的层构建神经网络，比如，使用3 层，甚至1 层网络的Transformer 解码器。所谓窄网络是指将网络中某些层中神经元的数量减少。不过，直接训练这样的小模型会带来翻译品质的下降。这时会考虑使用知识蒸馏（也称作知识精炼）等技术来提升小模型的品质。
@@ -443,7 +443,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \end{figure}
 %----------------------------------------------------------------------
-\parinterval 完全独立地对每个词建模，会出现什么问题呢？来看一个例子，将汉语句子“干/得/好/！”翻译成英文，可以翻译成“Good job !”或者“Well done !”。假设生成这两种翻译的概率是相等的，即一半的概率是“Good job !”，另一半的概率是“Well done !”。由于非自回归模型的条件独立性假设，解码时第一个词“Good”和“Well”的概率是差不多大的，第二个词“job”和“done”的概率差不多大的，会使得模型生成出“Good done !”或者“Well job !”这样错误的翻译，如图\ref{fig:14-13}所示。这便是影响句子质量的关键问题，称之为{\small\sffamily\bfseries{多峰问题}}\index{多峰问题}（Multi-modality Problem）\index{Multi-modality Problem}\upcite{Gu2017NonAutoregressiveNM}。如何有效处理非自回归模型中的多峰问题  是提升非自回归模型质量的关键。
+\parinterval 完全独立地对每个词建模，会出现什么问题呢？来看一个例子，将汉语句子“干/得/好/！”翻译成英文，可以翻译成“Good job !”或者“Well done !”。假设生成这两种翻译的概率是相等的，即一半的概率是“Good job !”，另一半的概率是“Well done !”。由于非自回归模型的条件独立性假设，推断时第一个词“Good”和“Well”的概率是差不多大的，第二个词“job”和“done”的概率也是差不多大的，会使得模型生成出“Good done !”或者“Well job !”这样错误的翻译，如图\ref{fig:14-13}所示。这便是影响句子质量的关键问题，称之为{\small\sffamily\bfseries{多峰问题}}\index{多峰问题}（Multi-modality Problem）\index{Multi-modality Problem}\upcite{Gu2017NonAutoregressiveNM}。如何有效处理非自回归模型中的多峰问题是提升非自回归模型质量的关键。
 %----------------------------------------------------------------------
 \begin{figure}[htp]
@@ -462,7 +462,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsubsection{1. 基于繁衍率的非自回归模型}
-\parinterval 图\ref{fig:14-14}给出了基于繁衍率的Transformer非自回归模型的结构\upcite{Gu2017NonAutoregressiveNM}，由三个模块组成:编码器，解码器，繁衍率预测器，其中解码器中新增了位置注意力模块。与自回归翻译模型类似，Transformer模型的编码器和解码器都完全由前馈神经网络和多头注意力模块组成。在解码开始之前，非自回归模型需要知道译文的长度，以便并行生成所有单词。更重要的是，非自回归模型需要一次性生成出所有的译文单词，因此不能像自回归模型那样用已生成的词作为第一个解码器层的输入。
+\parinterval 图\ref{fig:14-14}给出了基于繁衍率的Transformer非自回归模型的结构\upcite{Gu2017NonAutoregressiveNM}，由三个模块组成:编码器，解码器，繁衍率预测器，其中解码器中新增了位置注意力模块。与自回归翻译模型类似，Transformer模型的编码器和解码器都完全由前馈神经网络和多头注意力模块组成。在推断开始之前，非自回归模型需要知道译文的长度，以便并行生成所有单词。更重要的是，非自回归模型需要一次性生成出所有的译文单词，因此不能像自回归模型那样用已生成的词作为第一个解码器层的输入。
 \parinterval 那么非自回归模型解码器的输入是什么呢？如果完全省略第一个解码器层的输入，或者仅使用位置嵌入，将会导致性能非常差。这里使用繁衍率来解决这个问题，繁衍率指的是：根据每个源语言单词预测出其对应的目标语言单词的个数（见\chaptersix），如图\ref{fig:14-14}所示，翻译过程中英语单词“We”对应一个汉语单词“我们”，其繁衍率为1。翻译过程取决于繁衍率序列（图\ref{fig:14-14}中的数字1\ 1\ 2\ 0\ 1），最终译文长度则由源语言单词的繁衍率之和决定。这个繁衍率序列可以通过外部词对齐工具得到， 来训练这个繁衍率预测器。但由于外部词对齐系统会出现错误，因此在模型收敛之后，需要在繁衍率预测器上加一个强化学习的损失来进行微调。
@@ -475,7 +475,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \end{figure}
 %----------------------------------------------------------------------
-\parinterval 另外，在每个解码器层中还新增了额外的位置注意力模块，该模块与Transformer模型的其它部分中使用的多头注意力机制相同。其仍然基于$\mathbi{Q}$、$\mathbi{K}$、$\mathbi{V}$之间的计算（见{\chaptertwelve}），只是把位置编码作为$\mathbi{Q}$ 和$\mathbi{K}$, 解码端上一层的输出作为$\mathbi{V}$。这种方法提供了更强的位置信息。
+\parinterval 另外，在每个解码器层中还新增了额外的位置注意力模块，该模块与Transformer模型的其它部分中使用的多头注意力机制相同。其仍然基于$\mathbi{Q}$、$\mathbi{K}$、$\mathbi{V}$之间的计算（见{\chaptertwelve}），只是把位置编码作为$\mathbi{Q}$ 和$\mathbi{K}$, 解码器端上一层的输出作为$\mathbi{V}$。这种方法提供了更强的位置信息。
 %----------------------------------------------------------------------------------------
 %    NEW SUBSUB-SECTION
@@ -499,7 +499,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsection{更好的训练目标}
-\parinterval 虽然非自回归翻译可以显著提升翻译速度，但是很多情况下其翻译质量还是低于传统的自回归翻译\upcite{Gu2017NonAutoregressiveNM,Kaiser2018FastDI,Guo2020FineTuningBC}。因此，很多工作致力于缩小自回归模型和非自回归模型的性能差距\upcite{Ran2020LearningTR,Tu2020ENGINEEI,Shu2020LatentVariableNN}。其中一些通常通过修改训练目标来达到提升非自回归翻译品质的目的，例如：
+\parinterval 虽然非自回归翻译可以显著提升翻译速度，但是很多情况下其翻译质量还是低于传统的自回归翻译\upcite{Gu2017NonAutoregressiveNM,Kaiser2018FastDI,Guo2020FineTuningBC}。因此，很多工作致力于缩小自回归模型和非自回归模型的性能差距\upcite{Ran2020LearningTR,Tu2020ENGINEEI,Shu2020LatentVariableNN}。其中一些通过修改训练目标来达到提升非自回归翻译品质的目的，例如：
 \begin{itemize}
 \vspace{0.5em}
@@ -530,18 +530,18 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \end{figure}
 %----------------------------------------------
-\parinterval 另一种做法是半自回归地生成译文\upcite{Wang2018SemiAutoregressiveNM}。如图\ref{fig:14-20}所示，自回归模型从左到右依次生成译文，具有“最强”的自回归性；而非自回归模型完全独立的生成每个译文单词，具有“最弱”的自回归性；半自回归模型则是将整个译文分成$k$个块，在组内执行非自回归解码，在组间则执行自回归的解码，能够在每个时间步并行产生多个连续的单词。通过调整块的大小，半自回归模型可以灵活的调整到自回归模型（当$k$等于1）和非自回归模型（当$k$大于等于最大的译文长度）上来。
+\parinterval 另一种做法是半自回归地生成译文\upcite{Wang2018SemiAutoregressiveNM}。如图\ref{fig:14-20}所示，自回归模型从左到右依次生成译文，具有“最强”的自回归性；而非自回归模型完全独立地生成每个译文单词，具有“最弱”的自回归性；半自回归模型则是将整个译文分成若干个大小为$k$的块，在块内执行非自回归解码，在块间则执行自回归的解码，能够在每个时间步并行产生多个连续的单词。通过调整块的大小，半自回归模型可以灵活地调整到自回归模型（当$k$等于1）和非自回归模型（当$k$大于等于最大的译文长度）上来。
 %----------------------------------------------
 \begin{figure}[htp]
 \centering
 \input{./Chapter14/Figures/figure-3vs}
-\caption{自回归、半自回归和非自回归模型\upcite{Wang2018SemiAutoregressiveNM}}
+\caption{自回归、半自回归和非自回归解码对比\upcite{Wang2018SemiAutoregressiveNM}}
 \label{fig:14-20}
 \end{figure}
 %----------------------------------------------
-\parinterval 还有一种做法引入了轻量级的自回归{\small\sffamily\bfseries{调序}}\index{调序}（Reordering\index{Reordering}）模块\upcite{Ran2019GuidingNN}。为了解决非自回归模型解码搜索空间过大的问题，可以使用调序技术在相对较少的翻译候选上进行自回归模型的计算。如图\ref{fig:14-22}所示，该方法对源语言句子进行重新排列转换成由源语言单词组成但位于目标语言结构中的伪译文，然后将伪译文进一步转换成目标语言以获得最终的翻译。其中，这个调序模块可以是一个轻量自回归模型，例如，一层的循环神经网络。
+\parinterval 还有一种做法引入了轻量级的自回归调序模块\upcite{Ran2019GuidingNN}。为了解决非自回归模型解码搜索空间过大的问题，可以使用调序技术在相对较少的翻译候选上进行自回归模型的计算。如图\ref{fig:14-22}所示，该方法对源语言句子进行重新排列转换成由源语言单词组成但位于目标语言结构中的伪译文，然后将伪译文进一步转换成目标语言以获得最终的翻译。其中，这个调序模块可以是一个轻量自回归模型，例如，一层的循环神经网络。
 %----------------------------------------------------------------------
 \begin{figure}[htp]
@@ -604,7 +604,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \begin{itemize}
 \vspace{0.5em}
-\item 假设生成。构建翻译假设集合是假设选择的第一步，也是最重要的一步。理想的情况下，这个集合应该尽可能包含更多高质量的翻译假设，这样后面有更大的几率选出更好的结果。不过，由于单个模型的性能是有上限的，因此无法期望这些翻译假设的品质超越单个模型的上限。研究人员更加关心的是翻译假设的多样性，因为已经证明多样的翻译假设非常有助于提升系统融合的性能\upcite{DBLP:journals/corr/LiMJ16,xiao2013bagging}。为了生成多样的翻译假设，通常有两种思路：1）使用不同的模型生成翻译假设；2）使用同一个模型的不同参数和设置生成翻译假设。图\ref{fig:14-8}展示了二者的区别。比如，可以使用基于循环神经网络和基于注意力机制的Transformer模型生成不同的翻译假设，之后都放入集合中；也可以只用Transformer 模型，但是用不同的模型参数构建多个系统，之后分别生成翻译假设。在神经机器翻译中，经常采用的是第二种方式，因为系统开发的成本更低。比如，很多研究工作都是基于一个基础模型，用不同的初始参数、不同层数、不同解码方式生成多个模型，进行翻译假设生成。
+\item 假设生成。构建翻译假设集合是假设选择的第一步，也是最重要的一步。理想的情况下，这个集合应该尽可能包含更多高质量的翻译假设，这样后面有更大的几率选出更好的结果。不过，由于单个模型的性能是有上限的，因此无法期望这些翻译假设的品质超越单个模型的上限。研究人员更加关心的是翻译假设的多样性，因为已经证明多样的翻译假设非常有助于提升系统融合的性能\upcite{DBLP:journals/corr/LiMJ16,xiao2013bagging}。为了生成多样的翻译假设，通常有两种思路：1）使用不同的模型生成翻译假设；2）使用同一个模型的不同参数和设置生成翻译假设。图\ref{fig:14-8}展示了二者的区别。比如，可以使用基于循环神经网络和基于注意力机制的Transformer模型生成不同的翻译假设，之后都放入集合中；也可以只用Transformer 模型，但是用不同的模型参数构建多个系统，之后分别生成翻译假设。在神经机器翻译中，经常采用的是第二种方式，因为系统开发的成本更低。比如，很多研究工作都是基于一个基础模型，用不同的初始参数、不同层数、不同推断方式生成多个模型，进行翻译假设生成。
 %----------------------------------------------
 \begin{figure}[htp]
@@ -682,7 +682,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \label{fig:14-10}
 \end{figure}
 %----------------------------------------------------------------------
-\parinterval 有了Lattice 这样的结构，多模型融合又有了新的思路。首先，可以将多个模型的译文融合为Lattice。注意，这个Lattice 会包含这些模型无法生成的完整译文句子。之后，用一个更强的模型在Lattice 上搜索最优的结果。这个过程有可能找到一些“新”的译文，即结果可能是从多个模型的结果中重组而来的。Lattice 上的搜索模型可以基于多模型的融合，也可以使用一个简单的模型，这里需要考虑的是将神经机器翻译模型适应到Lattice 上进行推断\upcite{DBLP:conf/aaai/SuTXJSL17}。其过程基本与原始的模型推断没有区别，只是需要把模型预测的结果附着到Lattice 中的每条边上，再进行推断。
+\parinterval 有了Lattice 这样的结构，多模型集成又有了新的思路。首先，可以将多个模型的译文融合为Lattice。注意，这个Lattice 会包含这些模型无法生成的完整译文句子。之后，用一个更强的模型在Lattice 上搜索最优的结果。这个过程有可能找到一些“新”的译文，即结果可能是从多个模型的结果中重组而来的。Lattice 上的搜索模型可以基于多模型的融合，也可以使用一个简单的模型，这里需要考虑的是将神经机器翻译模型适应到Lattice 上进行推断\upcite{DBLP:conf/aaai/SuTXJSL17}。其过程基本与原始的模型推断没有区别，只是需要把模型预测的结果附着到Lattice 中的每条边上，再进行推断。
 \parinterval 图\ref{fig:14-11}对比了不同模型集成方法的区别。从系统开发的角度看，假设选择和模型预测融合的复杂度较低，适合快速开发原型系统，而且性能稳定。译文重组需要更多的模块，系统调试的复杂度较高，但是由于看到了更大的搜索空间，因此系统性能提升的潜力较大\footnote{一般来说Lattice 上的Oracle 要比$n$-best译文上的Oracle 的质量高。}。
@@ -707,13 +707,13 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \vspace{0.5em}
 \item 机器翻译系统中的推断也借用了{\small\sffamily\bfseries{统计推断}}\index{统计推断}（Statistical Inference）\index{Statistical Inference}的概念。传统意义上讲，这类方法都是在利用样本数据去推测总体的趋势和特征。因此，从统计学的角度也有很多不同的思路。例如，贝叶斯学习等方法就在自然语言处理中得到广泛应用\upcite{Held2013AppliedSI,Silvey2018StatisticalI}。其中比较有代表性的是{\small\sffamily\bfseries{变分方法}}\index{变分方法}（Variational Methods）\index{Variational Methods}。这类方法通过引入新的隐含变量来对样本的分布进行建模，从某种意义上说它是在描述“分布的分布”，因此这种方法对事物的统计规律描述得更加细致\upcite{Beal2003VariationalAF}。这类方法也被成功地用于统计机器翻译\upcite{Li2009VariationalDF,xiao2011language}和神经机器翻译\upcite{Bastings2019ModelingLS,Shah2018GenerativeNM,Su2018VariationalRN,Zhang2016VariationalNM}。
 \vspace{0.5em}
-\item 推断系统也可以受益于更加高效的网络结构。这方面工作集中在结构化剪枝、减少模型的冗余计算、低秩分解等方向。结构化剪枝中的代表性工作是LayerDrop\upcite{DBLP:conf/iclr/FanGJ20,DBLP:conf/emnlp/WangXZ20,DBLP:journals/corr/abs-2002-02925}，这类方法在训练时随机选择部分子结构，在推断时根据输入来选择模型中的部分层进行计算，而跳过其余层，达到加速和减少参数量的目的。有关减少冗余计算的研究主要集中在改进注意力机制上，本章已经有所介绍。低秩分解则针对词向量或者注意力的映射矩阵进行改进，例如词频自适应表示\upcite{DBLP:conf/iclr/BaevskiA19}，词频越高则对应的向量维度越大，反之则越小，或者层数越高注意力映射矩阵维度越小\upcite{DBLP:journals/corr/abs-2006-04768,DBLP:journals/corr/abs-1911-12385,DBLP:journals/corr/abs-1906-09777,DBLP:conf/nips/YangLSL19}。在实践中比较有效的是较深的编码器与较浅的解码器结合的方式，极端情况下解码器仅使用1层神经网络即可取得与多层神经网络相媲美的翻译精度，而极大地提升翻译效率\upcite{DBLP:journals/corr/abs-2006-10369,DBLP:conf/aclnmt/HuLLLLWXZ20,DBLP:journals/corr/abs-2010-02416}。在{\chapterfifteen}还会进一步对高效神经机器翻译的模型结构进行讨论。
+\item 推断系统也可以受益于更加高效的网络结构。这方面工作集中在结构化剪枝、减少模型的冗余计算、低秩分解等方向。结构化剪枝中的代表性工作是LayerDrop\upcite{DBLP:conf/iclr/FanGJ20,DBLP:conf/emnlp/WangXZ20,DBLP:journals/corr/abs-2002-02925}，这类方法在训练时随机选择部分子结构，在推断时根据输入来选择模型中的部分层进行计算，而跳过其余层，达到加速的目的。有关减少冗余计算的研究主要集中在改进注意力机制上，本章已经有所介绍。低秩分解则针对词向量或者注意力的映射矩阵进行改进，例如词频自适应表示\upcite{DBLP:conf/iclr/BaevskiA19}，词频越高则对应的向量维度越大，反之则越小，或者层数越高注意力映射矩阵维度越小\upcite{DBLP:journals/corr/abs-2006-04768,DBLP:journals/corr/abs-1911-12385,DBLP:journals/corr/abs-1906-09777,DBLP:conf/nips/YangLSL19}。在实践中比较有效的是较深的编码器与较浅的解码器结合的方式，极端情况下解码器仅使用1层神经网络即可取得与多层神经网络相媲美的翻译品质，从而极大地提升翻译效率\upcite{DBLP:journals/corr/abs-2006-10369,DBLP:conf/aclnmt/HuLLLLWXZ20,DBLP:journals/corr/abs-2010-02416}。在{\chapterfifteen}还会进一步对高效神经机器翻译的模型结构进行讨论。
 \vspace{0.5em}
-\item 在对机器翻译推断系统进行实际部署时，对存储的消耗也是需要考虑的因素。因此如何让模型变得更小也是研发人员所关注的方向。当前的模型压缩方法主要可以分为几类：剪枝、量化、知识蒸馏和轻量方法，其中轻量方法主要是基于更轻量模型结构的设计，这类方法已经在上文进行了介绍。剪枝主要包括权重大小剪枝\upcite{Han2015LearningBW,Lee2019SNIPSN,Frankle2019TheLT,Brix2020SuccessfullyAT}、面向多头注意力的剪枝\upcite{Michel2019AreSH,DBLP:journals/corr/abs-1905-09418}、网络层以及其他部分的剪枝等\upcite{Liu2017LearningEC,Liu2019RethinkingTV}，还有一些方法也通过在训练期间采用正则化的方式来提升剪枝能力\upcite{DBLP:conf/iclr/FanGJ20}。量化方法主要通过截断浮点数来减少模型的存储大小，使其仅使用几个比特位的数字表示方法便能存储整个模型，虽然会导致舍入误差，但压缩效果显著\upcite{DBLP:journals/corr/abs-1906-00532,Cheong2019transformersZ,Banner2018ScalableMF,Hubara2017QuantizedNN}。一些方法利用知识蒸馏手段还将Transformer模型蒸馏成如LSTMs 等其他各种推断速度更快的结构\upcite{DBLP:journals/corr/HintonVD15,Munim2019SequencelevelKD,Tang2019DistillingTK}。另外还有一些方法不仅在输出上，还在权重矩阵和隐藏的激活层上对“教师模型”知识进行更深入的挖掘\upcite{Jiao2020TinyBERTDB}。
+\item 在对机器翻译推断系统进行实际部署时，对存储的消耗也是需要考虑的因素。因此如何让模型变得更小也是研发人员所关注的方向。当前的模型压缩方法主要可以分为几类：剪枝、量化、知识蒸馏和轻量方法，其中轻量方法主要是基于更轻量模型结构的设计，这类方法已经在上文进行了介绍。剪枝主要包括权重大小剪枝\upcite{Han2015LearningBW,Lee2019SNIPSN,Frankle2019TheLT,Brix2020SuccessfullyAT}、面向多头注意力的剪枝\upcite{Michel2019AreSH,DBLP:journals/corr/abs-1905-09418}、网络层以及其他部分的剪枝等\upcite{Liu2017LearningEC,Liu2019RethinkingTV}，还有一些方法也通过在训练期间采用正则化的方式来提升剪枝能力\upcite{DBLP:conf/iclr/FanGJ20}。量化方法主要通过截断浮点数来减少模型的存储大小，使其仅使用几个比特位的数字表示方法便能存储整个模型，虽然会导致舍入误差，但压缩效果显著\upcite{DBLP:journals/corr/abs-1906-00532,Cheong2019transformersZ,Banner2018ScalableMF,Hubara2017QuantizedNN}。一些方法利用知识蒸馏手段还将Transformer模型蒸馏成如LSTMs 等其他各种推断速度更快的结构\upcite{Hinton2015Distilling,Munim2019SequencelevelKD,Tang2019DistillingTK}。另外还有一些方法不仅在输出上，还在权重矩阵和隐藏的激活层上对“教师模型”知识进行更深入的挖掘\upcite{Jiao2020TinyBERTDB}。
 \vspace{0.5em}
-\item 目前的翻译模型使用交叉熵损失作为优化函数，这在自回归模型上取得了非常优秀的性能。交叉熵是一个严格的损失函数，每个预测错误的单词所对应的位置都会受到惩罚，即使是编辑距离很小的输出序列。自回归模型会很大程度上避免这种惩罚，因为当前位置的单词是根据先前生成的词得到的，然而非自回归模型无法获得这种信息。如果在预测时漏掉一个单词，就可能会将正确的单词放在错误的位置上。为此，一些研究工作通过改进损失函数来提高非自回归模型的性能。一种做法使用一种新的交叉熵函数\upcite{Ghazvininejad2020AlignedCE}，它通过忽略绝对位置、关注相对顺序和词汇匹配来为非自回归模型提供更精确的训练信号。另外，也可以使用基于$n$-gram的训练目标\upcite{Shao2020MinimizingTB}来最小化模型与参考译文之间的$n$-gram差异。该训练目标在$n$-gram 的层面上评估预测结果，因此能够建模目标序列之间的依赖关系。
+\item 目前的翻译模型使用交叉熵损失作为优化函数，这在自回归模型上取得了非常优秀的性能。交叉熵是一个严格的损失函数，每个预测错误的单词所对应的位置都会受到惩罚，即使是编辑距离很小的输出序列。自回归模型会很大程度上避免这种惩罚，因为当前位置的单词是根据先前生成的词得到的，然而非自回归模型无法获得这种信息。如果在预测时漏掉一个单词，就可能会将正确的单词放在错误的位置上。为此，一些研究工作通过改进损失函数来提高非自回归模型的性能。一种做法使用一种新的交叉熵函数\upcite{Ghazvininejad2020AlignedCE}，它通过忽略绝对位置、关注相对顺序和词汇匹配来为非自回归模型提供更精确的训练信号。另外，也可以使用基于$n$-gram的训练目标\upcite{Shao2020MinimizingTB}来最小化模型与参考译文之间的$n$-gram差异。该训练目标在$n$-gram 的层面上评估预测结果，因此能够建模目标序列单词之间的依赖关系。
 \vspace{0.5em}
-\item 自回归模型解码时，当前位置单词的生成依赖于先前生成的单词，已生成的单词提供了较强的目标端上下文信息。然而，非自回归模型并行地生成所有词，因此缺乏这样的信息。与自回归模型相比，非自回归模型的解码器需要在信息更少的情况下执行翻译任务。因此可以为非自回归模型的解码器端引入更多的信息，来缩小模型的搜索空间。一些研究工作通过将条件随机场引入非自回归模型中来对结构依赖进行建模\upcite{Ma2019FlowSeqNC}。也有工作引入了词嵌入转换矩阵来将源语言端的词嵌入转换为目标语言端的词嵌入来为解码器提供更好的输入\upcite{Guo2019NonAutoregressiveNM}。此外，研究人员也提出了轻量级的调序模块来显式地建模调序信息，以指导非自回归模型的解码\upcite{Ran2019GuidingNN}。
+\item 自回归模型解码时，当前位置单词的生成依赖于先前生成的单词，已生成的单词提供了较强的目标端上下文信息。然而，非自回归模型并行地生成所有词，因此缺乏这样的信息。与自回归模型相比，非自回归模型的解码器需要在信息更少的情况下执行翻译任务。因此可以为非自回归模型的解码器端引入更多的信息，来缩小模型的搜索空间。一些研究工作通过将条件随机场引入非自回归模型中来对结构依赖进行建模\upcite{Ma2019FlowSeqNC}。也有工作引入了词嵌入转换矩阵来将源语言端的词嵌入转换为目标语言端的词嵌入来为解码器提供更好的输入\upcite{Guo2019NonAutoregressiveNM}。此外，研究人员也提出了轻量级的调序模块来显式地建模调序信息，以指导非自回归模型的推断\upcite{Ran2019GuidingNN}。
 \vspace{0.5em}
 \end{itemize}

--- a/Chapter15/chapter15.tex
+++ b/Chapter15/chapter15.tex
--- a/Chapter16/Figures/figure-comparison-of-structure-between-gpt-and-bert-model.tex
+++ b/Chapter16/Figures/figure-comparison-of-structure-between-gpt-and-bert-model.tex
@@ -103,7 +103,7 @@
 \node [anchor=north] (pos1) at ([xshift=1.5em,yshift=-1.0em]node0-2.south) {\small{(a) GPT模型结构}};
 \node [anchor=north] (pos2) at ([xshift=1.5em,yshift=-1.0em]node0-6.south) {\small{(b) BERT模型结构}};
-\node [anchor=south] (ex) at ([xshift=2.1em,yshift=0.5em]node3-1.north) {\small{TRM：transformer}};
+\node [anchor=south] (ex) at ([xshift=2.1em,yshift=0.5em]node3-1.north) {\small{TRM：Transformer}};

--- a/Chapter16/Figures/figure-example-of-iterative-back-translation.tex
+++ b/Chapter16/Figures/figure-example-of-iterative-back-translation.tex
@@ -60,7 +60,7 @@
 \node [anchor=west,fill=red!20,minimum width=1.5em](d2-1) at ([xshift=-0.0em]d2.east){};
 \node [anchor=west,fill=yellow!20,minimum width=1.5em](d3-1) at ([xshift=-0.0em]d3.east){};
 \node [anchor=north] (d4) at ([xshift=1em]d1.south) {\small{训练：}};
-\node [anchor=north] (d5) at ([xshift=0.5em]d2.south) {\small{推理：}};
+\node [anchor=north] (d5) at ([xshift=0.5em]d2.south) {\small{推断：}};
 \draw [->,thick] ([xshift=0em]d4.east)--([xshift=1.5em]d4.east);
 \draw [->,thick,dashed] ([xshift=0em]d5.east)--([xshift=1.5em]d5.east);

--- a/Chapter16/Figures/figure-examples-of-comparable-corpora.tex
+++ b/Chapter16/Figures/figure-examples-of-comparable-corpora.tex
 \begin{tikzpicture}
 \begin{scope}
-\node [anchor=center] (node1) at (0,0) {\textbf{Machine translation}, sometiomes referred to by the abbreviation \textbf{MT} (not to be };
+\node [anchor=center] (node1) at (0,0) {\textbf{Machine Translation}, sometimes referred to by the abbreviation \textbf{MT} (not to be };
-\node [anchor=north] (node2) at (node1.south) {confused with computer-aided translation,,machine-aided human translation inter};
+\node [anchor=north] (node2) at (node1.south) {confused with computer-aided translation, machine-aided human translation inter};
 \node [anchor=north] (node3) at (node2.south) {-active translation), is a subfield of computational linguistics that investigates the};
 \node [anchor=north] (node4) at ([xshift=-1.8em]node3.south) {use of software to translate text or speech from one language to another.};
 \node [anchor=south] (node5) at ([xshift=-12.8em,yshift=0.5em]node1.north) {\Large{WIKIPEDIA}};

--- a/Chapter16/Figures/figure-parameter-initialization-method-diagram.tex
+++ b/Chapter16/Figures/figure-parameter-initialization-method-diagram.tex
@@ -12,8 +12,8 @@
 \node[node,anchor=west,minimum width=6em,minimum height=2.4em,fill=blue!20,line width=0.6pt] (decoder2) at ([xshift=4em,yshift=0em]decoder1.east){\small 解码器};
 \node[node,anchor=west,minimum width=6em,minimum height=2.4em,fill=blue!30,line width=0.6pt] (decoder3) at ([xshift=3em]decoder2.east){\small 解码器};
-\node[anchor=north,font=\scriptsize,fill=yellow!20] (w1) at ([yshift=-1.6em]decoder1.south){知识 \ 就是 \ 力量 \ 。 \ <EOS>};
+\node[anchor=north,font=\scriptsize,fill=yellow!20] (w1) at ([yshift=-1.6em]decoder1.south){知识 \ 就是 \ 力量 \ 。 \ <eos>};
-\node[anchor=north,font=\scriptsize,fill=green!20] (w3) at ([yshift=-1.6em]decoder3.south){Wissen  \ ist \ Machit \ . \ <EOS>};
+\node[anchor=north,font=\scriptsize,fill=green!20] (w3) at ([yshift=-1.6em]decoder3.south){Wissen  \ ist \ Macht \ . \ <eos>};
 \node[anchor=south,font=\scriptsize,fill=orange!20] (w2) at ([yshift=1.6em]encoder1.north){Knowledge \ is \ power \ . };
 \node[anchor=south,font=\scriptsize,fill=orange!20] (w4) at ([yshift=1.6em]encoder3.north){Knowledge \ is \ power \ . };

--- a/Chapter16/chapter16.tex
+++ b/Chapter16/chapter16.tex
--- a/Chapter17/Figures/figure-application-of-multimodal-machine-translation-to-multitask-learning.tex
+++ b/Chapter17/Figures/figure-application-of-multimodal-machine-translation-to-multitask-learning.tex
 \tikzstyle{coder} = [rectangle,rounded corners,minimum height=2.2em,minimum width=4.3em,text centered,draw=black,fill=red!25]
 \begin{tikzpicture}[node distance = 0,scale = 0.75]
 \tikzstyle{every node}=[scale=0.75]
-\node(x)[]{x};
+\node(x)[]{$x$};
 \node(encoder)[coder, above of = x,yshift=4em]{{编码器}};
 \node(decoder_left)[coder, above of = encoder, yshift=6em,fill=blue!25]{{解码器}};
-\node(y_hat)[above of = decoder_left, yshift=4em]{{$\rm y'$}};
+\node(y_hat)[above of = decoder_left, yshift=4em]{{$y'$}};
-\node(y)[above of = decoder_left, xshift=-6em]{{$\rm y$}};
+\node(y)[above of = decoder_left, xshift=-6em]{{$y$}};
 \node(decoder_right)[coder, above of = encoder, xshift=11em,fill=yellow!25]{{解码器}};
 \node(figure)[draw=white,above of = decoder_right,yshift=6.5em,scale=0.25] {\includegraphics[width=0.62\textwidth]{./Chapter17/Figures/figure-bank-without-attention.png}};

--- a/Chapter17/Figures/figure-cache.tex
+++ b/Chapter17/Figures/figure-cache.tex
@@ -18,7 +18,7 @@
 \node[anchor=south,font=\footnotesize,inner sep=0pt] (cache)at ([yshift=2em,xshift=1.5em]key.north){\small\bfnew{Cache}};
 \node[draw,anchor=east,minimum size=1.8em,fill=orange!15] (dt) at ([yshift=2.1em,xshift=-4em]key.west){${\mathbi{d}}_{t}$};
-\node[anchor=north,font=\footnotesize] (readlab) at ([xshift=2.8em,yshift=0.3em]dt.north){\red{reading}};
+\node[anchor=north,font=\footnotesize] (readlab) at ([xshift=2.8em,yshift=0.3em]dt.north){\red{读取}};
 \node[draw,anchor=east,minimum size=1.8em,fill=ugreen!15] (st) at ([xshift=-3.7em]dt.west){${\mathbi{s}}_{t}$};
 \node[draw,anchor=east,minimum size=1.8em,fill=red!15] (st2) at ([xshift=-0.85em,yshift=3.5em]dt.west){$ \widetilde{\mathbi{s}}_{t}$};
@@ -27,10 +27,10 @@
 \draw[-,thick] (add.0) -- (add.180);
 \draw[-,thick] (add.90) -- (add.-90);
-\node[anchor=north,inner sep=0pt,font=\footnotesize,text=red] at ([xshift=-0.08em,yshift=-1em]add.south){combining};
+\node[anchor=north,inner sep=0pt,font=\footnotesize,text=red] at ([xshift=-0em,yshift=-0.5em]add.south){融合};
 \node[draw,anchor=east,minimum size=1.8em,fill=yellow!15] (ct) at ([xshift=-2em,yshift=-3.5em]st.west){$ {\mathbi{C}}_{t}$};
-\node[anchor=north,font=\footnotesize] (matchlab) at ([xshift=6.7em,yshift=-0.1em]ct.north){\red{mathching}};
+\node[anchor=north,font=\footnotesize] (matchlab) at ([xshift=6.7em,yshift=-0.1em]ct.north){\red{匹配}};
 \node[anchor=east] (y) at ([xshift=-6em,yshift=1em]st.west){$\mathbi{y}_{t-1}$};

--- a/Chapter17/Figures/figure-framing-schematic.tex
+++ b/Chapter17/Figures/figure-framing-schematic.tex
@@ -3,7 +3,7 @@
 \begin{tikzpicture}[node distance = 0,scale = 1]
 \tikzstyle{every node}=[scale=1]
-\node [anchor=center](ori) at (-0.2,-0.2) {$O$};
+\node [anchor=center](ori) at (-0.2,-0.2) {0};
 \draw[->,thick](-0.5,0)--(5,0)node[below,scale=0.8]{时间};
 \draw[->,thick](0,-2)--(0,2)node[left,scale=0.8]{量化值};
 \draw[-,thick](0,0)sin(0.7,1.5)cos(1.4,0)sin(2.1,-1.5)cos(2.8,0)sin(3.5,1.5)cos(4.2,0);

--- a/Chapter17/Figures/figure-traditional-methods-of-image-description.tex
+++ b/Chapter17/Figures/figure-traditional-methods-of-image-description.tex
@@ -26,7 +26,7 @@
 \draw[->, thick,color=black!60](figure.east)to([xshift=-0.1cm]dog.west)node[left,xshift=-0.2cm,yshift=-0.1cm,color=black]{图片检测};
 \draw[->, thick,color=black!60]([yshift=-0.1cm]hat.south)to([yshift=0.1cm]ground.north)node[right,xshift=-0.2cm,yshift=0.5cm,color=black]{模板填充};
-\node [anchor=north](pos1)at ([xshift=-3.8em,yshift=-0.5em]ground-1.south){(a) 基于检索的图像描述生成范式};
+\node [anchor=north](pos1)at ([xshift=-3.8em,yshift=-0.5em]ground-1.south){(a) 基于检索的图像描述生成};
-\node [anchor=north](pos2)at ([xshift=-3.8em,yshift=-0.5em]ground.south){(b) 基于模板的图像描述生成范式};
+\node [anchor=north](pos2)at ([xshift=-3.8em,yshift=-0.5em]ground.south){(b) 基于模板的图像描述生成};
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/chapter17.tex
+++ b/Chapter17/chapter17.tex
--- a/Chapter18/Figures/figure-memory-multi-use.tex
+++ b/Chapter18/Figures/figure-memory-multi-use.tex
@@ -39,7 +39,7 @@
 \draw [->,dashed,line width=0.7pt] ([yshift=0.5em,xshift=0.5em]three.north) .. controls +(north:3.5em) and +(south:4.5em) .. ([yshift=-0.2em]mthree.south);
 \draw [->,dashed,line width=0.7pt] ([yshift=0.5em]four.north) .. controls +(north:4.5em) and +(south:4.5em) .. ([yshift=-0.2em]mfour.south);
-\node [word] at ([yshift=-6em]two.south) {（a）显存不复用};
+\node [word] at ([yshift=-6em]two.south) {(a) 显存不复用};
 %占位
 \node[word] at ([xshift=1em]four.east) {};
@@ -95,7 +95,7 @@
 \node [word] at ([xshift=1.5em,yshift=5.6em]one.north) {\scriptsize 显存};
-\node [word] at ([yshift=-6em]two.south) {（b）显存复用};
+\node [word] at ([yshift=-6em]two.south) {(b) 显存复用};
 \begin{pgfonlayer}{background}
 \node [rectangle,inner sep=0.5em,rounded corners=1pt,minimum width=10em,minimum height=3.6em,fill=gray!10,drop shadow] at ([yshift=6.6em,xshift=1em]two.north) {};

--- a/Chapter18/chapter18.tex
+++ b/Chapter18/chapter18.tex
@@ -25,7 +25,7 @@
 \parinterval 随着机器翻译品质的不断提升，越来越多的应用需求被挖掘出来。但是，一个优秀的机器翻译引擎并不意味着机器翻译可以被成功应用。机器翻译技术落地需要“额外”考虑很多因素，例如，数据加工方式、交互方式、应用的领域等，甚至机器翻译模型也要经过改造才能适应到不同的场景中。
-\parinterval 本章将重点介绍机器翻译应用中所面临的问题，以及解决这些问题可以采用的策略。本章所涉及的内容较为广泛，一方面会大量使用本书前十七章的模型和方法，另一方面也会介绍新的技术手段。最终，本章会结合机器翻译的特点展示一些机器翻译可能的应用场景。
+\parinterval 本章将重点介绍机器翻译应用中所面临的一些实际问题，以及解决这些问题可以采用的策略。本章所涉及的内容较为广泛，一方面会大量使用本书前十七章的模型和方法，另一方面也会介绍新的技术手段。最终，本章会结合机器翻译的特点展示一些机器翻译可以应用的场景。
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
@@ -33,17 +33,17 @@
 \section{机器翻译的应用并不简单}
-\parinterval 机器翻译一直是自然语言处理的热点，无论从评测比赛的结果，还是论文发表数量上看，机器翻译的研究可谓火热。但是，客观的说，我们离机器翻译完美的应用还有相当的距离。这主要是因为，成熟的系统需要很多技术的融合。因此，机器翻译系统研发也是一项复杂的系统工程。而机器翻译研究大多是对局部模型和方法的调整，这也会造成一个现象：很多论文里报道的技术方法可能无法直接应用于真实场景的系统。因此，需要关注如何对具体的机器翻译应用问题进行求解，使机器翻译技术能够落地。有几方面挑战：
+\parinterval 近几年，无论从评测比赛的结果，还是论文发表数量上看，机器翻译的研究可谓火热。但是，客观地说，我们离机器翻译完美的应用还有相当的距离。这主要是因为，成熟的系统需要很多技术的融合。因此，机器翻译系统研发也是一项复杂的系统工程。而机器翻译研究大多是对局部模型和方法的调整，这也会造成一个现象：很多论文里报道的技术方法可能无法直接应用于真实场景的系统。这里，有几方面挑战：
 \begin{itemize}
 \vspace{0.5em}
-\item {\small\bfnew{机器翻译模型很脆弱}}。实验环境下，给定翻译任务，甚至给定训练和测试数据，机器翻译模型可以表现得很好。但是，应用场景是不断变化的。经常会出现训练数据缺乏、应用领域与训练数据不匹配、用户的测试方法与开发者不同等等一系列问题。特别是，对于不同的任务，神经机器翻译模型需要进行非常细致的调整，理想中“一套包打天下”的模型和设置是不存在的。这些都导致一个结果：直接使用既有机器翻译模型很难满足不断变化的应用需求。
+\item 机器翻译模型很脆弱。实验环境下，给定翻译任务，甚至给定训练和测试数据，机器翻译模型可以表现得很好。但是，应用场景是不断变化的。经常会出现训练数据缺乏、应用领域与训练数据不匹配、用户的测试方法与开发者不同等等一系列问题。特别是，对于不同的任务，神经机器翻译模型需要进行非常细致的调整，理想中“一套包打天下”的模型和设置是不存在的。这些都导致一个结果：直接使用既有机器翻译模型很难满足不断变化的应用需求。
 \vspace{0.5em}
-\item {\small\bfnew{机器翻译缺少针对场景的应用技术}}。目前为止，机器翻译的研究进展已经为我们提供很好的机器翻译基础模型。但是，用户并不是简单的与这些模型“打交道”，他们更加关注如何解决自身的业务需求，例如，机器翻译应用的交互方式、系统是否可以自己预估翻译可信度等等。甚至，在某些场景中，用户对翻译模型的体积和速度都有非常严格的要求。
+\item 机器翻译缺少针对场景的应用技术。目前为止，机器翻译的研究进展已经为我们提供了很好的机器翻译基础模型。但是，用户并不是简单地与这些模型“打交道”，他们更加关注如何解决自身的业务需求，例如，机器翻译应用的交互方式、系统是否可以自己预估翻译可信度等等。甚至，在某些场景中，用户对翻译模型的体积和速度都有非常严格的要求。
 \vspace{0.5em}
-\item {\small\bfnew{优秀系统的研发需要长时间的打磨}}。工程打磨也是研发优秀机器翻译系统的必备条件，有些时候甚至是决定性的。从科学研究的角度看，我们需要对更本质的科学问题进行探索，而非简单的工程开发与调试。但是，对一个初级的系统进行研究往往会掩盖掉“真正的问题”，因为很多问题在更优秀的系统中并不存在。
+\item 优秀系统的研发需要长时间的打磨。工程打磨也是研发优秀机器翻译系统的必备条件，有些时候甚至是决定性的。从科学研究的角度看，我们需要对更本质的科学问题进行探索，而非简单的工程开发与调试。但是，对一个初级的系统进行研究往往会掩盖掉“真正的问题”，因为很多问题在更优秀的系统中并不存在。
 \vspace{0.5em}
 \end{itemize}
@@ -59,7 +59,7 @@
 \begin{itemize}
 \vspace{0.5em}
-\item 应用的目标领域和场景可能是系统研发时无法预见的，但是用户会有一定量自有数据，可以用于系统优化。
+\item 应用的目标领域和场景可能是研发系统时无法预见的，但是用户会有一定量自有数据，可以用于系统优化。
 \vspace{0.5em}
 \item 系统在应用中会产生新的数据，这些数据经过一些筛选和修改也可以用于模型训练。
 \vspace{0.5em}
@@ -69,7 +69,7 @@
 \parinterval 增量训练就是满足上述需求的一种方法。{\chapterthirteen}已经就增量训练这个概念展开了一些讨论，这里重点介绍一些具体的实践手段。本质上，神经机器翻译中使用的随机梯度下降方法就是典型的增量训练方法，其基本思想是：每次选择一个样本对模型进行更新，这个过程反复不断执行，每次模型更新都是一次增量训练。当多个样本构成了一个新数据集时，可以把这些新样本作为训练数据，把当前的模型作为初始模型，之后正常执行机器翻译的训练过程即可。如果新增加的数据量不大（比如，几万句对），训练的代价非常低。
-\parinterval 这里面的一个问题是，新的数据虽然能代表一部分的翻译现象，但是如果仅仅依赖新数据进行更新，会使模型对新数据过分拟合，进而造成无法很好地处理新数据之外的样本。这也可以被看做是一种灾难性遗忘的问题\upcite{DBLP:conf/coling/GuF20}，即：模型过分注重对新样本的拟合，丧失了旧模型的一部分能力。在应用系统开发中，有几种常用的增量训练方法：
+\parinterval 这里面的一个问题是，新的数据虽然能代表一部分的翻译现象，但是如果仅仅依赖新数据进行更新，会使模型对新数据过分拟合，从而无法很好地处理新数据之外的样本。这也可以被看做是一种灾难性遗忘的问题\upcite{DBLP:conf/coling/GuF20}，即：模型过分注重对新样本的拟合，丧失了旧模型的一部分能力。在实际系统开发中，有几种常用的增量训练方法：
 \begin{itemize}
 \vspace{0.5em}
@@ -79,11 +79,13 @@
 \item 模型插值\upcite{DBLP:conf/emnlp/WangULCS17}。在增量训练之后，将新模型与旧模型进行插值。
 \vspace{0.5em}
-\item 多目标训练\upcite{barone2017regularization,DBLP:conf/aclnmt/KhayrallahTDK18,DBLP:conf/naacl/ThompsonGKDK19}。在增量训练时，除了在新数据上定义损失函数之外，可以再定义一个在旧数据上的损失函数，这样确保模型可以在两个数据上都有较好的表现。也可以引入正则化项，使新模型的参数不会偏离旧模型的参数太远。
+\item 多目标训练\upcite{barone2017regularization,DBLP:conf/aclnmt/KhayrallahTDK18,DBLP:conf/naacl/ThompsonGKDK19}。在增量训练时，除了在新数据上定义损失函数之外，可以再定义一个在旧数据上的损失函数，这样确保模型可以在两个数据上都有较好的表现。也可以在损失函数中引入正则化项，使新模型的参数不会偏离旧模型的参数太远。
 \vspace{0.5em}
 \end{itemize}
+\parinterval 图\ref{fig:18-1}给出了上述方法的对比。在实际应用中，还有很多细节会影响增量训练的效果，比如，学习率大小的选择等。另外，新的数据积累到何种规模可以进行增量训练也是实践中需要解决的问题。一般来说，增量训练使用的数据量越大，训练的效果越稳定。但是，这并不是说数据量少就不可以进行增量训练，而是数据量过少时，需要考虑训练代价和效果之间的平衡。而且，过于频繁的增量训练也会带来更多的灾难性遗忘的风险，因此合理进行增量训练也是机器翻译应用中需要实践的。
 %----------------------------------------------
 \begin{figure}[htp]
 \centering
@@ -94,9 +96,7 @@
 \end{figure}
 %----------------------------------------------
-\parinterval 图\ref{fig:18-1}给出了上述方法的对比。在实际应用中，还有很多细节会影响增量训练的效果，比如，学习率大小的选择等。另外，新的数据积累到何种规模可以进行增量训练也是实践中需要解决问题。一般来说，增量训练使用的数据量越大，训练的效果越稳定。但是，这并不是说数据量少就不可以进行增量训练，而是如果数据量过少时，需要考虑训练代价和效果之间的平衡。而且，过于频繁的增量训练也会带来更多的灾难性遗忘的风险，因此合理进行增量训练也是应用中需要实践的。
+\parinterval 需要注意的是，理想状态下，系统使用者会希望系统看到少量句子就可以很好地解决一类翻译问题，即：进行真正的小样本学习。但是，现实的情况是，现在的机器翻译系统还无法很好地做到“举一反三”。增量训练也需要专业人士完成才能得到相对较好的效果。
-\parinterval 主要注意的是，理想状态下，系统使用者会希望系统看到少量句子就可以很好地解决一类翻译问题，即：进行真正的小样本学习。但是，现实的情况是，现在的机器翻译系统还无法很好的做到“举一反三”。增量训练也需要专业人士完成才能得到相对较好的效果。
 \parinterval 另一个实际的问题是，当应用场景没有双语句对时是否可以优化系统？这个问题在{\chaptersixteen}的领域适应部分进行了一些讨论。一般来说，如果目标任务没有双语数据，仍然可以使用单语数据进行优化。常用的方法有数据增强、基于语言模型的方法等。具体方法可以参考{\chaptersixteen}的内容。
@@ -125,18 +125,18 @@
 \parinterval 交互式机器翻译系统主要通过用户的反馈来提升译文的质量，不同类型的反馈信息则影响着系统最终的性能。根据反馈形式的不同，可以将交互式机器翻译分为以下几种：
 \begin{itemize}
 \vspace{0.5em}
-\item {\small\bfnew 基于前缀的交互式机器翻译}。早期的交互式机器翻译系统都是采用基于前缀的方式。基于翻译系统生成的初始译文，翻译人员从左到右检查翻译的正确性，并在第一个错误的位置进行更正。这为系统提供了一种双重信号：表明该位置上单词必须是译员修改过后的单词，并且该位置之前的单词都是正确的。之后系统根据已经检查过的前缀再生成后面的译文\upcite{DBLP:conf/acl/WuebkerGDHL16,Zens2003EfficientSF,DBLP:journals/coling/BarrachinaBCCCKLNTVV09,DBLP:journals/csl/PerisC19}。
+\item 基于前缀的交互式机器翻译。早期的交互式机器翻译系统都是采用基于前缀的方式。基于翻译系统生成的初始译文，翻译人员从左到右检查翻译的正确性，并在第一个错误的位置进行更正。这为系统提供了一种双重信号：表明该位置上单词必须是译员修改过后的单词，并且该位置之前的单词都是正确的。之后系统根据已经检查过的前缀再生成后面的译文\upcite{DBLP:conf/acl/WuebkerGDHL16,Zens2003EfficientSF,DBLP:journals/coling/BarrachinaBCCCKLNTVV09,DBLP:journals/csl/PerisC19}。
 \vspace{0.5em}
-\item {\small\bfnew 基于片段的交互式机器翻译}。根据用户提供的反馈来生成更好的翻译结果是交互式翻译系统的关键。而基于前缀的系统则存在一个严重的缺陷，当翻译系统获得确定的翻译前缀之后，再重新生成译文时会将原本正确的翻译后缀遗漏了，因此会引入新的错误。在基于片段的交互式机器翻译系统中，翻译人员除了纠正第一个错误的单词，还可以指定在未来迭代中保留的单词序列。之后系统根据这些反馈信号再生成新的译文\upcite{Peris2017InteractiveNM,DBLP:journals/mt/DomingoPC17}。
+\item 基于片段的交互式机器翻译。根据用户提供的反馈来生成更好的翻译结果是交互式翻译系统的关键。而基于前缀的系统则存在一个严重的缺陷，当翻译系统获得确定的翻译前缀之后，再重新生成译文时会将原本正确的翻译后缀遗漏了，因此会引入新的错误。在基于片段的交互式机器翻译系统中，翻译人员除了纠正第一个错误的单词，还可以指定在未来迭代中保留的单词序列。之后系统根据这些反馈信号再生成新的译文\upcite{Peris2017InteractiveNM,DBLP:journals/mt/DomingoPC17}。
 \vspace{0.5em}
-\item {\small\bfnew 基于评分的交互式机器翻译}。随着计算机算力的提升，有时会出现“机器等人”的现象，因此需要提升人参与交互的效率也是需要考虑的。与之前的系统不同，基于评分的交互式机器翻译系统不需要译员选择、纠正或删除某个片段，而是使用译员对译文的评分来强化机器翻译的学习\upcite{DBLP:journals/corr/abs-1805-01553,DBLP:conf/emnlp/NguyenDB17}。
+\item 基于评分的交互式机器翻译。随着计算机算力的提升，有时会出现“机器等人”的现象，因此如何提升人参与交互的效率也是需要考虑的。与之前的系统不同，基于评分的交互式机器翻译系统不需要译员选择、纠正或删除某个片段，而是使用译员对译文的评分来强化机器翻译的学习\upcite{DBLP:journals/corr/abs-1805-01553,DBLP:conf/emnlp/NguyenDB17}。
 \vspace{0.5em}
 \end{itemize}
-\parinterval 除此之外，基于在线学习的方法也受到了关注，这类方法也可以被看作是交互式翻译与增量训练的一种结合。用户总是希望翻译系统能从反馈中自动纠正以前的错误。当用户最终确认一个修改过后的译文后，翻译系统将源语与该修正后的译文作为训练语料继续训练\upcite{DBLP:conf/acl/DomingoGEBHPCH19}。实际上，交互式机器翻译是机器翻译大规模应用的重要路径之一，它为打通译员和机器翻译系统之间的障碍提供了手段。不过，交互式机器翻译也有许多挑战等待解决。一个是如何设计交互方式？理想的交互方式应该式更加贴近译员输入文字的习惯，比如，利用输入法完成交互；另一个是如何把交互式翻译嵌入到翻译的生产流程里？这本身不完全是一个技术问题，可能需要更多的产品手段来求解。
+\parinterval 除此之外，基于在线学习的方法也受到了关注，这类方法也可以被看作是交互式翻译与增量训练的一种结合。用户总是希望翻译系统能从反馈中自动纠正以前的错误。当用户最终确认一个修改过后的译文后，翻译系统将源语言句子与该修正后的译文作为训练语料继续训练\upcite{DBLP:conf/acl/DomingoGEBHPCH19}。实际上，交互式机器翻译是机器翻译大规模应用的重要路径之一，它为打通译员和机器翻译系统之间的障碍提供了手段。不过，交互式机器翻译也有许多挑战等待解决。一个是如何设计交互方式？理想的交互方式应该是更加贴近译员输入文字的习惯，比如，利用输入法完成交互；另一个是如何把交互式翻译嵌入到翻译的生产流程里？这本身不完全是一个技术问题，可能需要更多的产品手段来求解。
 %----------------------------------------------------------------------------------------
@@ -145,12 +145,12 @@
 \section{翻译结果可干预性}
-\parinterval 交互式机器翻译体现了一种用户的行为“干预”机器翻译结果的思想。实际上，在机器翻译出现错误时，人们总是希望用一种直接有效的方式“改变”译文，最短时间内达到改善翻译质量的目的。比如，如果机器翻译系统可以输出多个候选译文，用户可以在其中挑选最好的译文进行输出。也就是，人干预了译文候选的排序过程。另一个例子是使用{\small\bfnew{翻译记忆}}\index{翻译记忆}（Translation Memory\index{Translation Memory}）改善机器翻译系统的性能。翻译记忆记录了高质量的源语言-目标语言句对，有时也可以被看作是一种先验知识或“记忆”。因此，当进行机器翻译时，使用翻译记忆指导翻译过程也可以被看作是一种干预手段\upcite{DBLP:conf/acl/WangZS13,DBLP:conf/aaai/XiaHLS19}。
+\parinterval 交互式机器翻译体现了一种用户的行为“干预”机器翻译结果的思想。实际上，在机器翻译出现错误时，人们总是希望用一种直接有效的方式“改变”译文，最短时间内达到改善翻译质量的目的。比如，如果机器翻译系统可以输出多个候选译文，用户可以在其中挑选最好的译文进行输出。也就是，人干预了译文候选的排序过程。另一个例子是{\small\bfnew{翻译记忆}}\index{翻译记忆}（Translation Memory\index{Translation Memory}）。翻译记忆记录了高质量的源语言-目标语言句对，有时也可以被看作是一种先验知识或“记忆”。因此，当进行机器翻译时，使用翻译记忆指导翻译过程也可以被看作是一种干预手段\upcite{DBLP:conf/acl/WangZS13,DBLP:conf/aaai/XiaHLS19}。
-\parinterval 虽然干预机器翻译系统的方式很多，最常用的还是对源语言特定片段翻译的干预，以期望最终句子的译文满足某些约束。这个问题也被称作{\small\bfnew{基于约束的翻译}}\index{基于约束的翻译} （Constraint-based Translation\index{Constraint-based Translation}）。比如，在翻译网页时，需要保持译文中的网页标签与源文一致。另一个典型例子是术语翻译。在实际应用中，经常会遇到公司名称、品牌名称、产品名称等专有名词和行业术语，以及不同含义的缩写，比如，对于“小牛翻译”这个专有名词，不同的机器翻译系统给出的结果不一样:“Maverick translation”、“Calf translation”、“The mavericks translation”…… 而它正确的翻译应该为“NiuTrans”。 对于这些类似的特殊词汇，大多数机器翻译引擎很难翻译得准确。一方面，因为模型大多是在通用数据集上训练出来的，并不能保证数据集能涵盖所有的语言现象。另一方面，即使是这些术语在训练数据中出现，它们通常也是低频的，模型不容易捕捉它们的规律。为了保证翻译的准确性，对术语翻译进行干预是十分有必要的，这对领域适应等问题的求解也是非常有意义的。
+\parinterval 虽然干预机器翻译系统的方式很多，最常用的还是对源语言特定片段翻译的干预，以期望最终句子的译文满足某些约束。这个问题也被称作{\small\bfnew{基于约束的翻译}}\index{基于约束的翻译} （Constraint-based Translation\index{Constraint-based Translation}）。比如，在翻译网页时，需要保持译文中的网页标签与源文一致。另一个典型例子是术语翻译。在实际应用中，经常会遇到公司名称、品牌名称、产品名称等专有名词和行业术语，以及不同含义的缩写，比如，对于“小牛翻译”这个专有名词，不同的机器翻译系统给出的结果不一样:“Maverick translation”、“Calf translation”、“The mavericks translation”…… 而它正确的翻译应该为“NiuTrans”。 对于这些类似的特殊词汇，机器翻译引擎很难翻译得准确。一方面，因为模型大多是在通用数据集上训练出来的，并不能保证数据集能涵盖所有的语言现象。另一方面，即使是这些术语在训练数据中出现，它们通常也是低频的，模型不容易捕捉它们的规律。为了保证翻译的准确性，对术语翻译进行干预是十分有必要的，这对领域适应等问题的求解也是非常有意义的。
-\parinterval 就{\small\bfnew 词汇约束翻译}\index{词汇约束翻译}（Lexically Constrained Translation）\index{Lexically Constrained Translation}而言，在不干预的情况下让模型直接翻译出正确术语是很难的，因为目标术语翻译词很可能是未登录词，因此必须人为提供额外的术语词典，那么我们的目标就是让模型的翻译输出遵守用户提供的术语约束。这个过程如图\ref{fig:18-3}所示。
+\parinterval 就{\small\bfnew 词汇约束翻译}\index{词汇约束翻译}（Lexically Constrained Translation）\index{Lexically Constrained Translation}而言，在不干预的情况下让模型直接翻译出正确术语是很难的，因为术语的译文很可能是未登录词，因此必须人为提供额外的术语词典，那么我们的目标就是让模型的翻译输出遵守用户提供的术语约束。这个过程如图\ref{fig:18-3}所示。
 %----------------------------------------------
 \begin{figure}[htp]
 \centering
@@ -161,21 +161,21 @@
 \end{figure}
 %----------------------------------------------
-\parinterval 在统计机器翻译中，翻译本质上是由短语和规则构成的推导，因此修改译文比较容易，比如，可以在一个源语言片段所对应的翻译候选集中添加希望得到的译文即可。而神经机器翻译是一个端到端模型，翻译过程本质上是连续空间中元素的一系列映射、组合和代数运算，因此无法像修改符号系统那样直接修改模型并加入离散化的约束来影响译文生成。目前主要有两种解决思路：
+\parinterval 在统计机器翻译中，翻译本质上是由短语和规则构成的推导，因此修改译文比较容易，比如，在一个源语言片段所对应的翻译候选集中添加希望得到的译文即可。而神经机器翻译是一个端到端模型，翻译过程本质上是连续空间中元素的一系列映射、组合和代数运算。虽然在模型训练阶段仍然可以通过修改损失函数等手段引入约束，但是在推断阶段进行直接干预并不容易，因为我们无法像修改符号系统那样直接修改模型（如短语翻译表）来影响译文生成。实践中主要有两种解决思路：
 \begin{itemize}
 \vspace{0.5em}
-\item 强制生成。这种方法并不改变模型，而是在解码过程中按照一定的策略来实施约束，一般是修改束搜索算法以确保输出必须包含指定的词或者短语\upcite{DBLP:conf/acl/HokampL17,DBLP:conf/naacl/PostV18,DBLP:conf/wmt/ChatterjeeNTFSB17,DBLP:conf/naacl/HaslerGIB18}，例如，在获得译文输出后，利用注意力机制获取词对齐，之后通过词对齐得到源语言和目标语言片段的对应关系，最后对指定译文片段进行强制替换。或者，对包含正确术语翻译的翻译候选进行额外的加分，以确保解码时这样的翻译候选的排名足够靠前。
+\item 强制生成。这种方法并不改变模型，而是在推断过程中按照一定的策略来实施约束，一般是修改束搜索算法以确保输出必须包含指定的词或者短语\upcite{DBLP:conf/acl/HokampL17,DBLP:conf/naacl/PostV18,DBLP:conf/wmt/ChatterjeeNTFSB17,DBLP:conf/naacl/HaslerGIB18}，例如，在获得译文输出后，利用注意力机制获取词对齐，之后通过词对齐得到源语言和目标语言片段的对应关系，最后对指定译文片段进行强制替换。或者，对包含正确术语翻译的翻译候选进行额外的加分，以确保推断时这样的翻译候选的排名足够靠前。
 \vspace{0.5em}
-\item 数据增强。这类方法通过修改机器翻译模型的数据和训练过程来实现约束。通常是根据术语词典对源语言句子进行一定的修改，例如，将术语的译文编辑到源语言句子中，之后将原始语料库和合成语料库进行混合训练，期望模型能够自动利用术语信息来指导解码，或者是利用占位符来替换源语中的术语，待翻译完成后再进行还原\upcite{DBLP:conf/naacl/SongZYLWZ19,DBLP:conf/acl/DinuMFA19,DBLP:journals/corr/abs-1912-00567,DBLP:conf/ijcai/ChenCWL20}。
+\item 数据增强。这类方法通过修改机器翻译模型的数据来实现推断阶段的约束。通常是根据术语词典对数据进行一定的修改，例如，将术语的译文编辑到源语言句子中，之后将原始语料库和合成语料库进行混合训练，期望模型能够自动利用术语信息来指导解码，或者是利用占位符来替换术语，待翻译完成后再进行还原\upcite{DBLP:conf/naacl/SongZYLWZ19,DBLP:conf/acl/DinuMFA19,DBLP:journals/corr/abs-1912-00567,DBLP:conf/ijcai/ChenCWL20}。
 \vspace{0.5em}
 \end{itemize}
 \parinterval 强制生成的方法是在搜索策略上进行限制，与模型无关，这类方法能保证输出满足约束，但是会影响翻译速度。数据增强的方法是通过构造特定格式的数据让模型训练，从而让模型具有一定的泛化能力，通常不会影响翻译速度，但并不能保证输出能满足约束。
-\parinterval 此外，机器翻译在应用时通常还需要进行译前译后的处理，译前处理指的是在翻译前对源语言句子进行修改和规范，从而能生成比较顺畅的译文，提高译文的可读性和准确率。在实际应用时，由于用户输入的形式多样，可能会包含比如术语、缩写、数学公式等，有些甚至可能还包含网页标签，因此对源文进行预处理是很有必要的。常见的处理工作包括格式转换、标点符号检査、术语编辑、标签识别等，待翻译完成后，则需要对机器译文进行进一步的编辑和修正，从而使其符合使用规范，比如进行标点、格式检查，术语、标签还原等，这些过程通常都是按照设定的处理策略自动完成的。另外,译文长度的控制、译文多样性的控制等也可以丰富机器翻译系统干预的手段（见{\chapterfourteen}）。
+\parinterval 此外，机器翻译在应用时通常还需要进行译前译后的处理，译前处理指的是在翻译前对源语言句子进行修改和规范，从而能生成比较通顺的译文，提高译文的可读性和准确率。在实际应用时，由于用户输入的形式多样，可能会包含比如术语、缩写、数学公式等，有些甚至可能还包含网页标签，因此对源文进行预处理是很有必要的。常见的处理工作包括格式转换、标点符号检查、术语编辑、标签识别等，待翻译完成后，则需要对机器译文进行进一步的编辑和修正，从而使其符合使用规范，比如进行标点、格式检查，术语、标签还原等，这些过程通常都是按照设定的处理策略自动完成的。另外，译文长度的控制、译文多样性的控制等也可以丰富机器翻译系统干预的手段（见{\chapterfourteen}）。
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
@@ -195,13 +195,13 @@
 \item 低精度存储及计算（{\chapterfourteen}）。可以使用量化的方式将模型压缩，同时整数型计算也非常适合在CPU等设备上执行。
 \vspace{0.5em}
-\item 轻量模型结构（{\chapterfourteen}和{\chapterfifteen}）。对机器翻译模型的局部结构进行优化也是非常有效的手段，比如，使用更加轻量的卷积计算模块，或者使用深编码器-浅解码器等结构等。
+\item 轻量模型结构（{\chapterfourteen}和{\chapterfifteen}）。对机器翻译模型的局部结构进行优化也是非常有效的手段，比如，使用更加轻量的卷积计算模块，或者使用深编码器-浅解码器等高效的结构。
 \vspace{0.5em}
-\item 面向设备的结构学习（{\chapterfifteen}）。可以把设备的存储及延时作为目标函数的一部分，自动搜索高效的翻译模型结构。
+\item 面向设备的模型结构学习（{\chapterfifteen}）。可以把设备的存储及延时作为目标函数的一部分，自动搜索高效的翻译模型结构。
 \vspace{0.5em}
-\item 动态适应性模型\upcite{DBLP:conf/emnlp/WangXZ20,DBLP:journals/corr/BolukbasiWDS17,DBLP:conf/iclr/HuangCLWMW18}。模型可以动态调整大小或者计算规模，以达到在不同设备上平衡延时和精度的目的。比如，可以根据延时的要求，动态生成合适深度的神经网络模型进行翻译。
+\item 动态适应性模型\upcite{DBLP:conf/emnlp/WangXZ20,DBLP:journals/corr/BolukbasiWDS17,DBLP:conf/iclr/HuangCLWMW18}。模型可以动态调整大小或者计算规模，以达到在不同设备上平衡延时和精度的目的。比如，可以根据延时的要求，动态生成合适深度的神经网络进行翻译。
 \vspace{0.5em}
 \end{itemize}
@@ -210,7 +210,7 @@
 \parinterval 另一个工程问题是，在很多系统中，机器翻译模块并不是单独执行，而是与其他的模块并发执行。这时，由于多个计算密集型任务存在竞争，处理器要进行更多的上下文切换，会造成程序变慢。比如，机器翻译和语音识别两个模块一起运行时{\footnote{在一些语音翻译场景中，由于采用了语音识别和翻译异步执行的方式，两个程序可能会并发。}}，机器翻译的速度会有较明显的下降。对于这种情况，需要设计更好的调度机制。而且在一些同时具有CPU和GPU的设备上，可以考虑合理调度CPU和GPU的资源，增加两种设备可并行处理的内容，避免在某个处理器上的拥塞。
-\parinterval 除了运行速度，模型过大也是限制其在小设备上运行的障碍。在模型体积上，神经机器翻译具有天然的优势。因此，在对模型规模有苛刻要求的场景中，神经机器翻译是不二的选择。通过量化、剪枝、参数共享等方式，还可以将模型压缩一个数量级以上。
+\parinterval 除了运行速度，模型过大也是限制其在小设备上运行的因素。在模型体积上，神经机器翻译具有天然的优势。因此，在对模型规模有苛刻要求的场景中，神经机器翻译是不二的选择。通过量化、剪枝、参数共享等方式，还可以将模型压缩一个数量级以上。
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
@@ -238,7 +238,7 @@
 \centering
 \input{./Chapter18/Figures/figure-memory-multi-use}
 %\setlength{\abovecaptionskip}{-0.2cm}
-\caption{显存复用示例}
+\caption{显存不复用与显存复用的示例}
 \label{fig:18-4}
 \end{figure}
 %----------------------------------------------
@@ -260,11 +260,11 @@
 \parinterval （一）网页翻译
-\parinterval 进入信息爆炸的时代之后，互联网上海量的数据随处可得，然而由于国家和地区语言的不同，网络上的数据也呈现出多语言的特性。当人们在遇到包含不熟悉语言的网页时，无法及时有效地获取其中的信息。因此，对不同语言的网页进行翻译是必不可少的一步。由于网络上网页的数量数不胜数，依靠人工对网页进行翻译是不切实际的，相反，机器翻译十分适合这个任务。目前，市场上有很多浏览器提供网页翻译的服务，极大地简化了人们从网络上获取不同语言信息的难度。
+\parinterval 进入信息爆炸的时代之后，互联网上海量的数据随处可得，然而由于不同国家和地区语言的差异，网络上的数据也呈现出多语言的特性。当人们遇到包含不熟悉语言的网页时，无法及时有效地获取其中的信息。因此，对不同语言的网页进行翻译是必不可少的一步。由于网络上网页的数量数不胜数，依靠人工对网页进行翻译是不切实际的，相反，机器翻译十分适合这个任务。目前，市场上有很多浏览器提供网页翻译的服务，极大地降低了人们从网络上获取不同语言信息的难度。
 \parinterval （二）科技文献翻译
-\parinterval 在专利等科技文献翻译中，往往需要将文献翻译为英语或者其他语言，比如摘要翻译。以往这种翻译工作通常由人工来完成。由于对翻译结果的质量要求较高，因此要求翻译人员具有相关背景知识，这导致译员资源稀缺。特别是，近几年国内专利申请数不断增加，这给人工翻译带来了很大的负担。相比于人工翻译，机器翻译可以在短时间内完成大量的专利翻译，同时结合术语词典和人工校对等方式，可以保证专利的翻译质量。同时，以专利为代表的科技文献往往具有很强的领域性，针对各类领域文本进行单独优化，机器翻译的品质可以大大提高。因此，机器翻译在专利翻译等行业有十分广泛的应用前景。
+\parinterval 在专利等科技文献翻译中，往往需要将文献翻译为英语或者其他语言，比如摘要翻译。以往这种翻译工作通常由人工来完成。由于对翻译结果的质量要求较高，因此要求翻译人员具有相关专业的背景知识，这导致译员资源稀缺。特别是，近几年国内专利申请数不断增加，这给人工翻译带来了很大的负担。相比于人工翻译，机器翻译可以在短时间内完成大量的专利翻译，同时结合术语词典和人工校对等方式，可以保证专利的翻译质量。同时，以专利为代表的科技文献往往具有很强的领域性，针对各类领域文本进行单独优化，机器翻译的品质可以大大提高。因此，机器翻译在专利翻译等行业有十分广泛的应用前景。
 \parinterval （三）视频字幕翻译
@@ -276,7 +276,7 @@
 \parinterval （五）同声传译
-\parinterval 在一些国际会议中，与会者来自许多不同的国家，为了保证会议的流畅，通常需要专业译员进行同声传译。同声传译需要在不打断演讲的同时，不间断地将讲话内容进行口译，对翻译人员的素质要求极高，成本高昂。现在，一些会议开始采用语音识别来将语音转换成文本，同时使用机器翻译技术进行翻译的方式，达到同步翻译的目的。这项技术已经得到了多个企业的关注，并在很多重要会议上进行尝试，取得了很好的反响。不过同声传译达到真正的使用还需一定时间的打磨，特别是会议场景下，准确进行语音识别和翻译仍然具有挑战性。
+\parinterval 在一些国际会议中，与会者来自许多不同的国家，为了保证会议的流畅，通常需要专业译员进行同声传译。同声传译需要在不打断演讲的同时，不间断地将讲话内容进行口译，对翻译人员的要求极高。现在，一些会议开始采用语音识别来将语音转换成文本，同时使用机器翻译技术进行翻译的方式，达到同步翻译的目的。这项技术已经得到了多个企业的关注，并在很多重要会议上进行尝试，取得了很好的反响。不过同声传译达到真正的实用还需一定时间的打磨，特别是会议场景下，准确进行语音识别和翻译仍然具有挑战性。
 \parinterval （六）中国传统语言文化的翻译

--- a/Chapter4/chapter4.tex
+++ b/Chapter4/chapter4.tex
@@ -519,7 +519,7 @@ His house is on the south bank of the river.
 \parinterval {\small\bfnew{词嵌入}}\index{词嵌入}（Word Embedding\index{Word Embedding}）技术是近些年自然语言处理中的重要成果，其思想是把每个单词映射为多维实数空间中的一个点（具体表现为一个实数向量），这种技术也被称作单词的{\small\bfnew{分布式表示}}\index{分布式表示}（Distributed Representation\index{Distributed Representation}）。在这项技术中，单词之间的关系可以通过空间的几何性质进行刻画，意义相近的单词之间的欧氏距离也比较小（单词分布式表示的具体内容，将在书的{\chapternine} 详细介绍，在此不再赘述）。
-\parinterval 受词嵌入技术的启发，研究人员尝试借助参考答案和机器译文的分布式表示来进行译文质量评价，为译文质量评价提供了新思路。在自然语言的上下文中，表示是与每个单词、句子或文档相关联的数学对象。这个对象通常是一个向量，其中每个元素的值在某种程度上描述了相关单词、句子或文档的语义或句法属性。基于这个想法，研究人员提出了{\small\sffamily\bfseries{分布式表示评价度量}}\index{分布式表示评价度量}（Distributed Representations Evaluation Metrics，DREEM）\index{Distributed Representations Evaluation Metrics}\upcite{DBLP:conf/acl/ChenG15}。这种方法将单词或句子的分布式表示映射到连续的低维空间，发现在该空间中，具有相似句法和语义属性的单词彼此接近，类似的结论也出现在相关工作中，如参考文献\cite{bengio2003a,DBLP:conf/emnlp/SocherPHNM11,DBLP:conf/emnlp/SocherPWCMNP13}所示。而这个特点可以被应用到译文质量评估中。
+\parinterval 受词嵌入技术的启发，研究人员尝试借助参考答案和机器译文的分布式表示来进行译文质量评价，为译文质量评价提供了新思路。在自然语言的上下文中，表示是与每个单词、句子或文档相关联的数学对象。这个对象通常是一个向量，其中每个元素的值在某种程度上描述了相关单词、句子或文档的语义或句法属性。基于这个想法，研究人员提出了{\small\sffamily\bfseries{分布式表示评价度量}}\index{分布式表示评价度量}（Distributed Representations Evaluation Metrics，DREEM）\index{DREEM}\upcite{DBLP:conf/acl/ChenG15}。这种方法将单词或句子的分布式表示映射到连续的低维空间，发现在该空间中，具有相似句法和语义属性的单词彼此接近，类似的结论也出现在相关工作中，如参考文献\cite{bengio2003a,DBLP:conf/emnlp/SocherPHNM11,DBLP:conf/emnlp/SocherPWCMNP13}所示。而这个特点可以被应用到译文质量评估中。
 \parinterval 在DREEM中，分布式表示的选取是一个十分关键的问题，理想的情况下，分布式表示应该涵盖句子在词汇、句法、语法、语义、依存关系等各个方面的信息。目前常见的分布式表示方式如表\ref{tab:4-2}所示。除此之外，还可以通过词袋模型、循环神经网络等将词向量表示转换为句子向量表示。
@@ -532,9 +532,9 @@ His house is on the south bank of the river.
 \hline
 \rule{0pt}{10pt} One-hot词向量 & RAE编码\upcite{DBLP:conf/emnlp/SocherPHNM11} \\
 \rule{0pt}{10pt} Word2Vec词向量\upcite{DBLP:journals/corr/abs-1301-3781} & Doc2Vec向量\upcite{DBLP:conf/icml/LeM14}  \\
-\rule{0pt}{10pt} Prob-fasttext词向量\upcite{DBLP:conf/acl/AthiwaratkunW17} & ELMO预训练句子表示\upcite{DBLP:conf/naacl/PetersNIGCLZ18} \\
+\rule{0pt}{10pt} Prob-fasttext词向量\upcite{DBLP:conf/acl/AthiwaratkunW17} & ELMO预训练句子表示\upcite{Peters2018DeepCW} \\
 \rule{0pt}{10pt} GloVe词向量\upcite{DBLP:conf/emnlp/PenningtonSM14} & GPT句子表示\upcite{radford2018improving} \\
-\rule{0pt}{10pt} ELMO预训练词向量\upcite{DBLP:conf/naacl/PetersNIGCLZ18} & BERT预训练句子表示\upcite{devlin2019bert} \\
+\rule{0pt}{10pt} ELMO预训练词向量\upcite{Peters2018DeepCW} & BERT预训练句子表示\upcite{devlin2019bert} \\
 \rule{0pt}{10pt} BERT预训练词向量\upcite{devlin2019bert} & Skip-thought向量\upcite{DBLP:conf/nips/KirosZSZUTF15} \\
 \end{tabular}
 \label{tab:4-2}
@@ -874,7 +874,7 @@ d&=&t \frac{s}{\sqrt{n}}
 \vspace{0.5em}
 \end{itemize}
-\parinterval 随着深度学习技术的发展，另一种思路是使用表示学习技术生成句子的分布式表示，并在此基础上利用神经网络自动提取高度抽象的句子特征\upcite{DBLP:conf/wmt/KreutzerSR15,DBLP:conf/wmt/MartinsAHK16,DBLP:conf/wmt/ChenTZXZLW17}，这样就避免了人工设计特征所带来的时间以及人工代价，同时表示学习所得到的分布式表示可以涵盖更多人工设计难以捕获到的特征，更加全面地反映句子的特点，因此在质量评估任务上也取得了很好的效果\upcite{kreutzer2015quality,DBLP:conf/wmt/ShahLPBBBS15,DBLP:conf/wmt/ScartonBSSS16,DBLP:conf/wmt/AbdelsalamBE16,DBLP:conf/wmt/BasuPN18,DBLP:conf/wmt/Lo19,DBLP:conf/wmt/YankovskayaTF19}。比如，最近的一些工作中大量使用了神经机器翻译模型来获得双语句子的表示结果，并用于质量评估\upcite{DBLP:conf/wmt/Qi19,DBLP:conf/wmt/ZhouZH19,DBLP:conf/wmt/Hokamp17,wang2019niutrans}。这样做的好处在于，质量评估可以直接复用机器翻译的模型，从某种意义上降低了质量评估系统开发的代价。此外，随着近几年各种预训练模型的出现，使用预训练模型来获取用于质量评估的句子表示也成为一大流行趋势，这种方法大大减少了质量评估模型自身的训练时间，在该领域内的表现也十分亮眼\upcite{kepler2019unbabel,DBLP:conf/wmt/YankovskayaTF19,DBLP:conf/wmt/KimLKN19}。关于表示学习、神经机器翻译、预训练模型的内容在第九章和第十章会有进一步介绍。
+\parinterval 随着深度学习技术的发展，另一种思路是使用表示学习技术生成句子的分布式表示，并在此基础上利用神经网络自动提取高度抽象的句子特征\upcite{DBLP:conf/wmt/KreutzerSR15,DBLP:conf/wmt/MartinsAHK16,DBLP:conf/wmt/ChenTZXZLW17}，这样就避免了人工设计特征所带来的时间以及人工代价，同时表示学习所得到的分布式表示可以涵盖更多人工设计难以捕获到的特征，更加全面地反映句子的特点，因此在质量评估任务上也取得了很好的效果\upcite{kreutzer2015quality,DBLP:conf/wmt/ShahLPBBBS15,DBLP:conf/wmt/ScartonBSSS16,DBLP:conf/wmt/AbdelsalamBE16,DBLP:conf/wmt/BasuPN18}。比如，最近的一些工作中大量使用了神经机器翻译模型来获得双语句子的表示结果，并用于质量评估\upcite{DBLP:conf/wmt/Qi19,DBLP:conf/wmt/ZhouZH19,DBLP:conf/wmt/Hokamp17,wang2019niutrans}。这样做的好处在于，质量评估可以直接复用机器翻译的模型，从某种意义上降低了质量评估系统开发的代价。此外，随着近几年各种预训练模型的出现，使用预训练模型来获取用于质量评估的句子表示也成为一大流行趋势，这种方法大大减少了质量评估模型自身的训练时间，在该领域内的表现也十分亮眼\upcite{kepler2019unbabel,DBLP:conf/wmt/YankovskayaTF19,DBLP:conf/wmt/KimLKN19}。关于表示学习、神经机器翻译、预训练模型的内容在第九章和第十章会有进一步介绍。
 \parinterval 在得到句子表示之后，可以使用质量评估模块对译文质量进行预测。质量评估模型通常由回归算法或分类算法实现：

--- a/Chapter9/chapter9.tex
+++ b/Chapter9/chapter9.tex
@@ -1884,7 +1884,7 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot  \f
 \subsubsection{1. 模型结构}
-\parinterval  最具代表性的神经语言模型是{\small\sffamily\bfseries{前馈神经网络语言模型}}\index{前馈神经网络语言模型}（Feed-forward Neural Network Language Model\index{Feed-forward Neural Network Language Model}，简称FNNLM）。这种语言模型的目标是用神经网络计算$ \funp{P}(w_m|w_{m-n+1}\dots w_{m-1}) $，之后将多个$n$-gram的概率相乘得到整个序列的概率\upcite{bengio2003a}。
+\parinterval  最具代表性的神经语言模型是{\small\sffamily\bfseries{前馈神经网络语言模型}}\index{前馈神经网络语言模型}（Feed-forward Neural Network Language Model，FNNLM\index{FNNLM}）。这种语言模型的目标是用神经网络计算$ \funp{P}(w_m|w_{m-n+1}\dots w_{m-1}) $，之后将多个$n$-gram的概率相乘得到整个序列的概率\upcite{bengio2003a}。
 %----------------------------------------------
 \begin{figure}[htp]

--- a/bibliography.bib
+++ b/bibliography.bib
@@ -2400,15 +2400,6 @@ year = {2012}
  pages     = {260--269},
  year      = {1967}
 }
-@inproceedings{DBLP:conf/acl/OchN02,
-  author    = {Franz Josef Och and
-               Hermann Ney},
-  title     = {Discriminative Training and Maximum Entropy Models for Statistical
-               Machine Translation},
-  pages     = {295--302},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2002}
-}
 @inproceedings{koehn2000estimating,
  author    = {Philipp Koehn and
               Kevin Knight},
@@ -3853,15 +3844,6 @@ year = {2012}
  pages     = {701--710},
  year      = {2014}
 }
-@inproceedings{2011Natural,
-  title={Natural Language Processing (almost) from Scratch},
-  author={ Collobert, Ronan  and  Weston, Jason  and Bottou, Léon and  Karlen, Michael  and  Kavukcuoglu, Koray  and  Kuksa, Pavel },
-  publisher={Journal of Machine Learning Research},
-  volume={12},
-  number={1},
-  pages={2493-2537},
-  year={2011}
-}
 @inproceedings{mccann2017learned,
  author    = {Bryan Mccann and
               James Bradbury and
@@ -3874,16 +3856,17 @@ year = {2012}
 }
 %%%%%%%%%%%%%%%%%%%%%%%神经语言模型，已检查修改%%%%%%%%%%%%%%%%%%%%%%%%%
 @inproceedings{Peters2018DeepCW,
-  title={Deep contextualized word representations},
+  author    = {Matthew Peters and
-  author={Matthew Peters and 
+               Mark Neumann and
-          Mark Neumann and 
+               Mohit Iyyer and
-		  Mohit Iyyer and 
+               Matt Gardner and
-		  Matt Gardner and 
+               Christopher Clark and
-		  Christopher Clark and 
+               Kenton Lee and
-		  Kenton Lee and 
+               Luke Zettlemoyer},
-		  Luke Zettlemoyer},
+  title     = {Deep Contextualized Word Representations},
-  publisher={Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics},
+  pages     = {2227--2237},
-  year={2018}
+  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
+  year      = {2018}
 }
 @inproceedings{Graves2013HybridSR,
  title={Hybrid speech recognition with Deep Bidirectional LSTM},
@@ -4116,13 +4099,6 @@ year = {2012}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%% chapter 10------------------------------------------------------
-@inproceedings{vaswani2017attention,
-	title={Attention is All You Need},
-	author={Ashish {Vaswani} and Noam {Shazeer} and Niki {Parmar} and Jakob {Uszkoreit} and Llion {Jones} and Aidan N. {Gomez} and Lukasz {Kaiser} and Illia {Polosukhin}},
-	publisher={International Conference on Neural Information Processing},
-	pages={5998--6008},
-	year={2017}
-}
 @inproceedings{DBLP:conf/acl/LiLWJXZLL20,
  author    = {Bei Li and
               Hui Liu and
@@ -4679,15 +4655,6 @@ author    = {Yoshua Bengio and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2015}
 }
-@inproceedings{DBLP:journals/corr/LuongPM15,
-  author    = {Thang Luong and
-               Hieu Pham and
-               Christopher D. Manning},
-  title     = {Effective Approaches to Attention-based Neural Machine Translation},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  pages     = {1412--1421},
-  year      = {2015}
-}
 @inproceedings{He2016ImprovedNM,
  author    = {Wei He and
               Zhongjun He and
@@ -4775,19 +4742,6 @@ author    = {Yoshua Bengio and
  pages     = {21--37},
  year      = {2016}
 }
-@inproceedings{devlin-etal-2014-fast,
-  author    = {Jacob Devlin and
-               Rabih Zbib and
-               Zhongqiang Huang and
-               Thomas Lamar and
-               Richard M. Schwartz and
-               John Makhoul},
-  title     = {Fast and Robust Neural Network Joint Models for Statistical Machine
-               Translation},
-  pages     = {1370--1380},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2014}
-}
 @inproceedings{DBLP:conf/acl/WangLLJL15,
  author    = {Mingxuan Wang and
               Zhengdong Lu and
@@ -4818,15 +4772,6 @@ author    = {Yoshua Bengio and
  publisher = {International Conference on Acoustics, Speech and Signal Processing},
  year      = {2013}
 }
-@inproceedings{DBLP:journals/corr/LuongPM15,
-  author    = {Thang Luong and
-               Hieu Pham and
-               Christopher D. Manning},
-  title     = {Effective Approaches to Attention-based Neural Machine Translation},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  pages     = {1412--1421},
-  year      = {2015}
-}
 @inproceedings{DBLP:conf/acl-codeswitch/WangCK18,
  author    = {Changhan Wang and
               Kyunghyun Cho and
@@ -4879,15 +4824,6 @@ author    = {Yoshua Bengio and
  publisher = {Springer},
  year      = {2017}
 }
-@inproceedings{2011Natural,
-  title={Natural Language Processing (almost) from Scratch},
-  author={ Collobert, Ronan  and  Weston, Jason  and Bottou, Léon and  Karlen, Michael  and  Kavukcuoglu, Koray  and  Kuksa, Pavel },
-  publisher={Journal of Machine Learning Research},
-  volume={12},
-  number={1},
-  pages={2493-2537},
-  year={2011},
-}
 @inproceedings{DBLP:conf/acl/NguyenG15,
  author    = {Thien Huu Nguyen and
               Ralph Grishman},
@@ -4943,14 +4879,6 @@ author    = {Yoshua Bengio and
  publisher = {Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2015}
 }
-@inproceedings{StahlbergNeural,
-  title={Neural Machine Translation: A Review},
-  author={Felix Stahlberg},
-  publisher={Journal of Artificial Intelligence Research},
-  year={2020},
-  volume={69},
-  pages={343-418}
-}
 @inproceedings{Sennrich2016ImprovingNM,
  author    = {Rico Sennrich and
               Barry Haddow and
@@ -4959,14 +4887,6 @@ author    = {Yoshua Bengio and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
-@inproceedings{bahdanau2014neural,
-  author    = {Dzmitry Bahdanau and
-               Kyunghyun Cho and
-               Yoshua Bengio},
-  title     = {Neural Machine Translation by Jointly Learning to Align and Translate},
-  publisher = {International Conference on Learning Representations},
-  year      = {2015}
-}
 @inproceedings{Waibel1989PhonemeRU,
  title={Phoneme recognition using time-delay neural networks},
  author={Alexander Waibel and Toshiyuki Hanazawa and Geoffrey Hinton and Kiyohiro Shikano and Kevin J. Lang},
@@ -5002,30 +4922,6 @@ author    = {Yoshua Bengio and
  pages     = {770--778},
  year      = {2016}
 }
-@inproceedings{DBLP:conf/cvpr/HuangLMW17,
-  author    = {Gao Huang and
-               Zhuang Liu and
-               Laurens van der Maaten and
-               Kilian Q. Weinberger},
-  title     = {Densely Connected Convolutional Networks},
-  pages     = {2261--2269},
-  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
-  year      = {2017}
-}
-@inproceedings{Girshick2015FastR,
-  title={Fast R-CNN},
-  author={Ross Girshick},
-  publisher={International Conference on Computer Vision},
-  year={2015},
-  pages={1440-1448}
-}
-@inproceedings{He2020MaskR,
-  title={Mask R-CNN},
-  author={Kaiming He and Georgia Gkioxari and Piotr Doll{\'a}r and Ross B. Girshick},
-  publisher={International Conference on Computer Vision},
-  pages={2961--2969},
-  year={2017}
-}
 @inproceedings{Kalchbrenner2014ACN,
  title={A Convolutional Neural Network for Modelling Sentences},
  author={Nal Kalchbrenner and Edward Grefenstette and Phil Blunsom},
@@ -5079,18 +4975,6 @@ author    = {Yoshua Bengio and
  pages     = {123--135},
  year={2017}
 }
-@inproceedings{DBLP:journals/corr/GehringAGYD17,
-  author    = {Jonas Gehring and
-               Michael Auli and
-               David Grangier and
-               Denis Yarats and
-               Yann N. Dauphin},
-  title     = {Convolutional Sequence to Sequence Learning},
-  publisher = {International Conference on Machine Learning},
-  volume    = {70},
-  pages     = {1243--1252},
-  year      = {2017}
-}
 @inproceedings{Kaiser2018DepthwiseSC,
  title={Depthwise Separable Convolutions for Neural Machine Translation},
  author    = {Lukasz Kaiser and
@@ -5109,14 +4993,6 @@ author    = {Yoshua Bengio and
 publisher = {International Conference on Learning Representations},
 year = {2019}
 }
-@inproceedings{kalchbrenner-blunsom-2013-recurrent,
-  author    = {Nal Kalchbrenner and
-               Phil Blunsom},
-  title     = {Recurrent Continuous Translation Models},
-  pages     = {1700--1709},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2013}
-}
 @inproceedings{Wu2016GooglesNM,
  title={Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
  author    = {Yonghui Wu and
@@ -5154,16 +5030,6 @@ author    = {Yoshua Bengio and
  year={2016},
  volume={abs/1609.08144}
 }
-@inproceedings{DBLP:journals/corr/HeZRS15,
-  author    = {Kaiming He and
-               Xiangyu Zhang and
-               Shaoqing Ren and
-               Jian Sun},
-  title     = {Deep Residual Learning for Image Recognition},
-  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
-  pages     = {770--778},
-  year      = {2016},
-}
 @inproceedings{Sukhbaatar2015EndToEndMN,
  title={End-To-End Memory Networks},
  author    = {Sainbayar Sukhbaatar and
@@ -5326,13 +5192,6 @@ author    = {Yoshua Bengio and
  volume    = {abs/1802.05751},
  year      = {2018}
 }
-@inproceedings{vaswani2017attention,
-	title={Attention is All You Need},
-	author={Ashish {Vaswani} and Noam {Shazeer} and Niki {Parmar} and Jakob {Uszkoreit} and Llion {Jones} and Aidan N. {Gomez} and Lukasz {Kaiser} and Illia {Polosukhin}},
-	publisher={International Conference on Neural Information Processing},
-	pages={5998--6008},
-	year={2017}
-}
 @inproceedings{DBLP:conf/iclr/WuLLLH20,
  author    = {Zhanghao Wu and
               Zhijian Liu and
@@ -5377,24 +5236,6 @@ author    = {Yoshua Bengio and
  pages     = {464--468},
  year      = {2018},
 }
-@inproceedings{DBLP:journals/corr/HeZRS15,
-  author    = {Kaiming He and
-               Xiangyu Zhang and
-               Shaoqing Ren and
-               Jian Sun},
-  title     = {Deep Residual Learning for Image Recognition},
-  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
-  pages     = {770--778},
-  year      = {2016},
-}
-@inproceedings{JMLR:v15:srivastava14a,
-  author  = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
-  title   = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
-  publisher = {Journal of Machine Learning Research},
-  year    = {2014},
-  volume  = {15},
-  pages   = {1929-1958},
-}
 @inproceedings{Szegedy_2016_CVPR,
  author    = {Christian Szegedy and
               Vincent Vanhoucke and
@@ -5424,28 +5265,6 @@ author    = {Yoshua Bengio and
  volume    = {abs/1602.02830},
  year      = {2016},
 }
-@inproceedings{Wu2019PayLA,
- author = {Felix Wu and
-		 Angela Fan and
-		 Alexei Baevski and
-		 Yann N. Dauphin and
-		 Michael Auli},
- title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
- publisher = {International Conference on Learning Representations},
- year = {2019},
-}
-@inproceedings{dai-etal-2019-transformer,
- author    = {Zihang Dai and
-               Zhilin Yang and
-               Yiming Yang and
-               Jaime G. Carbonell and
-               Quoc Viet Le and
-               Ruslan Salakhutdinov},
-  title     = {Transformer-XL: Attentive Language Models beyond a Fixed-Length Context},
-  pages     = {2978--2988},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019}
-}
 @inproceedings{Liu2020LearningTE,
 	title={Learning to Encode Position for Transformer with Continuous Dynamical Model},
 	author={Xuanqing Liu and Hsiang-Fu Yu and Inderjit Dhillon and Cho-Jui Hsieh},
@@ -5620,14 +5439,6 @@ author    = {Yoshua Bengio and
  publisher = {IEEE International Conference on Acoustics, Speech and Signal Processing},
  year      = {2012}
 }
-@inproceedings{JMLR:v15:srivastava14a,
-  author  = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
-  title   = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
-  publisher = {Journal of Machine Learning Research},
-  year    = {2014},
-  volume  = {15},
-  pages   = {1929-1958},
-}
 @inproceedings{DBLP:conf/amta/MullerRS20,
  author    = {Mathias M{\"{u}}ller and
               Annette Rios and
@@ -5834,21 +5645,6 @@ author    = {Yoshua Bengio and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/acl/LiLWJXZLL20,
-  author    = {Bei Li and
-               Hui Liu and
-               Ziyang Wang and
-               Yufan Jiang and
-               Tong Xiao and
-               Jingbo Zhu and
-               Tongran Liu and
-               Changliang Li},
-  title     = {Does Multi-Encoder Help? {A} Case Study on Context-Aware Neural Machine
-               Translation},
-  pages     = {3512--3518},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2020}
-}
 @techreport{chen1999gaussian,
  title={A Gaussian prior for smoothing maximum entropy models},
  author={Chen, Stanley F and Rosenfeld, Ronald},
@@ -5863,14 +5659,6 @@ author    = {Yoshua Bengio and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2018}
 }
-@inproceedings{DBLP:conf/icassp/SchusterN12,
-  author    = {Mike Schuster and
-               Kaisuke Nakajima},
-  title     = {Japanese and Korean voice search},
-  pages     = {5149--5152},
-  publisher = {IEEE International Conference on Acoustics, Speech and Signal Processing},
-  year      = {2012}
-}
 @inproceedings{kudo2018sentencepiece,
 	title={SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing},
 	author={Taku {Kudo} and John {Richardson}},
@@ -6166,12 +5954,6 @@ author    = {Yoshua Bengio and
  publisher = {Conference on Computational Learning Theory},
  year      = {1992}
 }
-@book{mitchell1996m,
-  title={Machine Learning},
-  author={Mitchell, Tom},
-  journal={McCraw Hill},
-  year={1996}
-}
 @inproceedings{DBLP:conf/icml/AbeM98,
  author    = {Naoki Abe and
               Hiroshi Mamitsuka},
@@ -6195,15 +5977,6 @@ author    = {Yoshua Bengio and
  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
  year      = {2005}
 }
-@inproceedings{726791,
-  author={Yann {Lecun} and Leon {Bottou} and Yoshua {Bengio} and Patrick {Haffner}},
-  publisher={Proceedings of the IEEE}, 
-  title={Gradient-based learning applied to document recognition}, 
-  year={1998},
-  volume={86},
-  number={11},
-  pages={2278-2324}
-}
 @book{atkinson2007optimum,
  title={Optimum experimental designs, with SAS},
  author={Atkinson, Anthony and Donev, Alexander and Tobias, Randall and others},
@@ -6245,16 +6018,6 @@ author    = {Yoshua Bengio and
  publisher = {{IEEE} Winter Conference on Applications of Computer Vision},
  year      = {2020}
 }
-@inproceedings{DBLP:conf/acl/JeanCMB15,
-  author    = {S{\'{e}}bastien Jean and
-               KyungHyun Cho and
-               Roland Memisevic and
-               Yoshua Bengio},
-  title     = {On Using Very Large Target Vocabulary for Neural Machine Translation},
-  pages     = {1--10},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2015}
-}
 @inproceedings{2015OnGulcehre,
  title = {On Using Monolingual Corpora in Neural Machine Translation},
  author = {Gulcehre Caglar  and  
@@ -6269,14 +6032,6 @@ author    = {Yoshua Bengio and
  publisher = {Computer Science},
  year = {2015},
 }
-@inproceedings{Sennrich2016ImprovingNM,
-  author    = {Rico Sennrich and
-               Barry Haddow and
-               Alexandra Birch},
-  title     = {Improving Neural Machine Translation Models with Monolingual Data},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016}
-}
 @inproceedings{DBLP:conf/aaai/Zhang0LZC18,
  author    = {Zhirui Zhang and
               Shujie Liu and
@@ -6641,16 +6396,6 @@ author    = {Yoshua Bengio and
  pages     = {1171--1179},
  year      = {2015}
 }
-@inproceedings{Bengio2015ScheduledSF,
-  title={Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks},
-  author={Samy Bengio and
-               Oriol Vinyals and
-               Navdeep Jaitly and
-               Noam Shazeer},
-  publisher = {Annual Conference on Neural Information Processing Systems},
-  pages     = {1171--1179},
-  year      = {2015}
-}
 @inproceedings{Ranzato2016SequenceLT,
  title={Sequence Level Training with Recurrent Neural Networks},
  author={Marc'Aurelio Ranzato and
@@ -6674,43 +6419,6 @@ author    = {Yoshua Bengio and
  pages     = {2672--2680},
  year      = {2014}
 }
-@inproceedings{DBLP:conf/acl/ShenCHHWSL16,
-  author    = {Shiqi Shen and
-               Yong Cheng and
-               Zhongjun He and
-               Wei He and
-               Hua Wu and
-               Maosong Sun and
-               Yang Liu},
-  title     = {Minimum Risk Training for Neural Machine Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016},
-}
-@inproceedings{DBLP:conf/acl/PapineniRWZ02,
-  author    = {Kishore Papineni and
-               Salim Roukos and
-               Todd Ward and
-               Wei-jing Zhu},
-  title     = {Bleu: a Method for Automatic Evaluation of Machine Translation},
-  pages     = {311--318},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2002}
-}
-@inproceedings{doddington2002automatic,
-  title={Automatic evaluation of machine translation quality using n-gram co-occurrence statistics},
-  publisher={Proceedings of the second international conference on Human Language Technology Research},
-  author={Doddington, George},
-  pages={138--145},
-  year={2002}
-}
-@inproceedings{snover2006study,
-  title={A study of translation edit rate with targeted human annotation},
-  author={Snover, Matthew and Dorr, Bonnie and Schwartz, Richard and Micciulla, Linnea and Makhoul, John},
-  publisher={Proceedings of association for machine translation in the Americas},
-  volume={200},
-  number={6},
-  year={2006}
-}
 @inproceedings{lavie2009meteor,
  title={The METEOR metric for automatic evaluation of machine translation},
  author={Lavie, Alon and Denkowski, Michael J},
@@ -6720,36 +6428,6 @@ author    = {Yoshua Bengio and
  pages={105--115},
  year={2009}
 }
-@inproceedings{bahdanau2014neural,
-  author    = {Dzmitry Bahdanau and
-               Kyunghyun Cho and
-               Yoshua Bengio},
-  title     = {Neural Machine Translation by Jointly Learning to Align and Translate},
-  publisher = {International Conference on Learning Representations},
-  year      = {2015}
-}
-@inproceedings{koehn2003statistical,
-  author    = {Philipp Koehn and
-               Franz Josef Och and
-               Daniel Marcu},
-  title     = {Statistical Phrase-Based Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2003}
-}
-@inproceedings{smith2006minimum,
-  author    = {David A. Smith and
-               Jason Eisner},
-  title     = {Minimum Risk Annealing for Training Log-Linear Models},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2006}
-}
-@inproceedings{he2012maximum,
-title={Maximum expected bleu training of phrase and lexicon translation models},
-author={He, Xiaodong and Deng, Li},
-publisher={Annual Meeting of the Association for Computational Linguistics},
-pages={292--301},
-year={2012}
-}
 @inproceedings{DBLP:conf/acl/GaoHYD14,
  author    = {Jianfeng Gao and
               Xiaodong He and
@@ -6907,14 +6585,6 @@ year={2012}
  volume    = {abs/2002.11794},
  year      = {2020}
 }
-@inproceedings{kim-rush-2016-sequence,
-    author    = {Yoon Kim and
-               Alexander M. Rush},
-  title     = {Sequence-Level Knowledge Distillation},
-  pages     = {1317--1327},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2016}
-}
 @inproceedings{Jiao2020TinyBERTDB,
  author    = {Xiaoqi Jiao and
               Yichun Yin and
@@ -6952,34 +6622,6 @@ year={2012}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%% chapter 14------------------------------------------------------
-@inproceedings{Koehn2007Moses,
-  author    = {Philipp Koehn and
-               Hieu Hoang and
-			    Alexandra Birch and
-               Chris Callison-Burch and
-               Marcello Federico and
-               Nicola Bertoldi and
-               Brooke Cowan and
-               Wade Shen and
-               Christine Moran and
-               Richard Zens and
-               Chris Dyer and
-               Ondrej Bojar and
-               Alexandra Constantin and
-               Evan Herbst},
-  title     = {Moses: Open Source Toolkit for Statistical Machine Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2007}
-}
-@inproceedings{DBLP:conf/amta/Koehn04,
-  author    = {Philipp Koehn},
-  title     = {Pharaoh: {A} Beam Search Decoder for Phrase-Based Statistical Machine
-               Translation Models},
-  volume    = {3265},
-  pages     = {115--124},
-  publisher = {	Association for Machine Translation in the Americas},
-  year      = {2004}
-}
 @inproceedings{DBLP:conf/emnlp/StahlbergHSB17,
  author    = {Felix Stahlberg and
               Eva Hasler and
@@ -7189,16 +6831,6 @@ year={2012}
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2016}
 }
-@inproceedings{DBLP:conf/emnlp/HuangZM17,
-  author    = {Liang Huang and
-               Kai Zhao and
-               Mingbo Ma},
-  title     = {When to Finish? Optimal Beam Search for Neural Text Generation (modulo
-               beam size)},
-  pages     = {2134--2139},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2017}
-}
 @inproceedings{Wiseman2016SequencetoSequenceLA,
  title={Sequence-to-Sequence Learning as Beam-Search Optimization},
  author={Sam Wiseman and Alexander M. Rush},
@@ -7206,16 +6838,6 @@ year={2012}
  pages={1296--1306},
  year={2016}
 }
-@inproceedings{DBLP:conf/emnlp/Yang0M18,
-  author    = {Yilin Yang and
-               Liang Huang and
-               Mingbo Ma},
-  title     = {Breaking the Beam Search Curse: {A} Study of (Re-)Scoring Methods
-               and Stopping Criteria for Neural Machine Translation},
-  pages     = {3054--3059},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2018}
-}
 @inproceedings{Ma2019LearningTS,
  title={Learning to Stop in Structured Prediction for Neural Machine Translation},
  author={Mingbo Ma and
@@ -7236,14 +6858,6 @@ year={2012}
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-@inproceedings{bahdanau2014neural,
-  author    = {Dzmitry Bahdanau and
-               Kyunghyun Cho and
-               Yoshua Bengio},
-  title     = {Neural Machine Translation by Jointly Learning to Align and Translate},
-  publisher = {International Conference on Learning Representations},
-  year      = {2015}
-}
 @inproceedings{Jiang2012LearnedPF,
  title={Learned Prioritization for Trading Off Accuracy and Speed},
  author={Jiarong Jiang and Adam R. Teichert and Hal Daum{\'e} and Jason Eisner},
@@ -7379,33 +6993,6 @@ year={2012}
  publisher={Annual Meeting of the Association for Computational Linguistics},
  year={2017}
 }
-@inproceedings{StahlbergNeural,
-  title={Neural Machine Translation: A Review},
-  author={Felix Stahlberg},
-  publisher={Journal of Artificial Intelligence Research},
-  year={2020},
-  volume={69},
-  pages={343-418}
-}
-@inproceedings{Ranzato2016SequenceLT,
-  title={Sequence Level Training with Recurrent Neural Networks},
-  author={Marc'Aurelio Ranzato and
-               Sumit Chopra and
-               Michael Auli and
-               Wojciech Zaremba},
-  publisher={International Conference on Learning Representations},
-  year={2016}
-}
-@inproceedings{Bengio2015ScheduledSF,
-  title={Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks},
-  author={Samy Bengio and
-               Oriol Vinyals and
-               Navdeep Jaitly and
-               Noam Shazeer},
-  publisher = {Annual Conference on Neural Information Processing Systems},
-  pages     = {1171--1179},
-  year      = {2015}
-}
 @inproceedings{Zhang2019BridgingTG,
  title={Bridging the Gap between Training and Inference for Neural Machine Translation},
  author={Wen Zhang and Yang Feng and Fandong Meng and Di You and Qun Liu},
@@ -7413,55 +7000,6 @@ year={2012}
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/acl/ShenCHHWSL16,
-  author    = {Shiqi Shen and
-               Yong Cheng and
-               Zhongjun He and
-               Wei He and
-               Hua Wu and
-               Maosong Sun and
-               Yang Liu},
-  title     = {Minimum Risk Training for Neural Machine Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016},
-}
-@inproceedings{DBLP:conf/acl/SennrichHB16a,
-  author    = {Rico Sennrich and
-               Barry Haddow and
-               Alexandra Birch},
-  title     = {Neural Machine Translation of Rare Words with Subword Units},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016},
-}
-@inproceedings{DBLP:conf/emnlp/ZensSX12,
-  author    = {Richard Zens and
-               Daisy Stanton and
-               Peng Xu},
-  title     = {A Systematic Comparison of Phrase Table Pruning Techniques},
-  pages     = {972--983},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2012}
-}
-@inproceedings{DBLP:conf/emnlp/JohnsonMFK07,
-  author    = {Howard Johnson and
-               Joel D. Martin and
-               George F. Foster and
-               Roland Kuhn},
-  title     = {Improving Translation Quality by Discarding Most of the Phrasetable},
-  pages     = {967--975},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2007}
-}
-@inproceedings{DBLP:conf/emnlp/LingGTB12,
-  author    = {Wang Ling and
-               Jo{\~{a}}o Gra{\c{c}}a and
-               Isabel Trancoso and
-               Alan W. Black},
-  title     = {Entropy-based Pruning for Phrase-based Machine Translation},
-  pages     = {962--971},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2012}
-}
 @inproceedings{Narang2017BlockSparseRN,
  title={Block-Sparse Recurrent Neural Networks},
  author={Sharan Narang and Eric Undersander and Gregory Diamos},
@@ -7483,31 +7021,10 @@ year={2012}
  author    = {Paul Michel and
               Omer Levy and
               Graham Neubig},
-  title     = {Are Sixteen Heads Really Better than One?},
  publisher = {Annual Conference on Neural Information Processing Systems},
  pages     = {14014--14024},
  year      = {2019}
 }
-@inproceedings{DBLP:journals/corr/abs-1905-09418,
-  author    = {Elena Voita and
-               David Talbot and
-               Fedor Moiseev and
-               Rico Sennrich and
-               Ivan Titov},
-  title     = {Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy
-               Lifting, the Rest Can Be Pruned},
-  pages     = {5797--5808},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019},
-}
-@inproceedings{Kitaev2020ReformerTE,
-  author    = {Nikita Kitaev and
-               Lukasz Kaiser and
-               Anselm Levskaya},
-  title     = {Reformer: The Efficient Transformer},
-  publisher = {International Conference on Learning Representations},
-  year      = {2020}
-}
 @inproceedings{Katharopoulos2020TransformersAR,
  title={Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention},
  author={Angelos Katharopoulos and Apoorv Vyas and Nikolaos Pappas and Franccois Fleuret},
@@ -7515,15 +7032,6 @@ year={2012}
  year={2020},
  volume={abs/2006.16236}
 }
-@inproceedings{xiao2011language,
-  title ={Language Modeling for Syntax-Based Machine Translation Using Tree Substitution Grammars: A Case Study on Chinese-English Translation},
-  author ={Xiao, Tong and Zhu, Jingbo and Zhu, Muhua},
-  volume ={10},
-  number ={4},
-  pages ={1--29},
-  year ={2011},
-  publisher ={ACM Transactions on Asian Language Information Processing (TALIP)}
-}
 @inproceedings{Li2009VariationalDF,
  title={Variational Decoding for Statistical Machine Translation},
  author={Zhifei Li and
@@ -7558,18 +7066,6 @@ year={2012}
  pages={5488--5495},
  year={2018}
 }
-@inproceedings{DBLP:journals/corr/GehringAGYD17,
-  author    = {Jonas Gehring and
-               Michael Auli and
-               David Grangier and
-               Denis Yarats and
-               Yann N. Dauphin},
-  title     = {Convolutional Sequence to Sequence Learning},
-  publisher = {International Conference on Machine Learning},
-  volume    = {70},
-  pages     = {1243--1252},
-  year      = {2017}
-}
 @inproceedings{Wei2019ImitationLF,
  title={Imitation Learning for Non-Autoregressive Neural Machine Translation},
  author={Bingzhen Wei and Mingxuan Wang and Hao Zhou and Junyang Lin and Xu Sun},
@@ -7717,15 +7213,6 @@ author    = {Zhuang Liu and
  volume={18},
  pages={187:1-187:30}
 }
-@inproceedings{DBLP:journals/corr/HintonVD15,
-  author    = {Geoffrey E. Hinton and
-               Oriol Vinyals and
-               Jeffrey Dean},
-  title     = {Distilling the Knowledge in a Neural Network},
-  publisher   = {CoRR},
-  volume    = {abs/1503.02531},
-  year      = {2015}
-}
 @inproceedings{Munim2019SequencelevelKD,
  title={Sequence-level Knowledge Distillation for Model Compression of Attention-based Sequence-to-sequence Speech Recognition},
  author={Raden Mu'az Mun'im and Nakamasa Inoue and Koichi Shinoda},
@@ -7746,20 +7233,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1903.12136},
  year      = {2019}
 }
-@inproceedings{Jiao2020TinyBERTDB,
-  author    = {Xiaoqi Jiao and
-               Yichun Yin and
-               Lifeng Shang and
-               Xin Jiang and
-               Xiao Chen and
-               Linlin Li and
-               Fang Wang and
-               Qun Liu},
-  title     = {TinyBERT: Distilling {BERT} for Natural Language Understanding},
-  pages     = {4163--4174},
-  publisher={Conference on Empirical Methods in Natural Language Processing},
-  year={2020}
-}
 @inproceedings{Ghazvininejad2020AlignedCE,
  author    = {Marjan Ghazvininejad and
               Vladimir Karpukhin and
@@ -7816,23 +7289,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1911.02215},
  year      = {2019}
 }
-@inproceedings{vaswani2017attention,
-	title={Attention is All You Need},
-	author={Ashish {Vaswani} and Noam {Shazeer} and Niki {Parmar} and Jakob {Uszkoreit} and Llion {Jones} and Aidan N. {Gomez} and Lukasz {Kaiser} and Illia {Polosukhin}},
-	publisher={International Conference on Neural Information Processing},
-	pages={5998--6008},
-	year={2017}
-}
-@inproceedings{Gu2017NonAutoregressiveNM,
-  author    = {Jiatao Gu and
-               James Bradbury and
-               Caiming Xiong and
-               Victor O. K. Li and
-               Richard Socher},
-  title     = {Non-Autoregressive Neural Machine Translation},
-  publisher = {International Conference on Learning Representations},
-  year      = {2018}
-}
 @inproceedings{Zhou2020UnderstandingKD,
  title={Understanding Knowledge Distillation in Non-autoregressive Machine Translation},
  author={Chunting Zhou and Graham Neubig and Jiatao Gu},
@@ -7941,13 +7397,6 @@ author    = {Zhuang Liu and
  volume={7},
  pages={91-105}
 }
-@inproceedings{devlin2019bert,
-  title={Bert: Pre-training of deep bidirectional transformers for language understanding},
-  author={Devlin Jacob and Chang Ming-Wei and Lee Kenton and Toutanova Kristina},
-  year={2019},
-  pages = {4171--4186},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-}
 @inproceedings{Feng2016ImprovingAM,
  title={Improving Attention Modeling with Implicit Distortion and Fertility for Machine Translation},
  author={Shi Feng and Shujie Liu and Nan Yang and Mu Li and Ming Zhou and Kenny Q. Zhu},
@@ -7955,65 +7404,6 @@ author    = {Zhuang Liu and
  pages={3082--3092},
  year={2016}
 }
-@inproceedings{TuModeling,
-  author    = {Zhaopeng Tu and
-               Zhengdong Lu and
-               Yang Liu and
-               Xiaohua Liu and
-               Hang Li},
-  title     = {Modeling Coverage for Neural Machine Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016}
-}
-@inproceedings{Wu2016GooglesNM,
-  title={Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
-  author    = {Yonghui Wu and
-               Mike Schuster and
-               Zhifeng Chen and
-               Quoc V. Le and
-               Mohammad Norouzi and
-               Wolfgang Macherey and
-               Maxim Krikun and
-               Yuan Cao and
-               Qin Gao and
-               Klaus Macherey and
-               Jeff Klingner and
-               Apurva Shah and
-               Melvin Johnson and
-               Xiaobing Liu and
-               Lukasz Kaiser and
-               Stephan Gouws and
-               Yoshikiyo Kato and
-               Taku Kudo and
-               Hideto Kazawa and
-               Keith Stevens and
-               George Kurian and
-               Nishant Patil and
-               Wei Wang and
-               Cliff Young and
-               Jason Smith and
-               Jason Riesa and
-               Alex Rudnick and
-               Oriol Vinyals and
-               Greg Corrado and
-               Macduff Hughes and
-               Jeffrey Dean},
-  publisher   = {CoRR},
-  year={2016},
-  volume={abs/1609.08144}
-}
-@inproceedings{li-etal-2018-simple,
-  author    = {Yanyang Li and
-               Tong Xiao and
-               Yinqiao Li and
-               Qiang Wang and
-               Changming Xu and
-               Jingbo Zhu},
-  title     = {A Simple and Effective Approach to Coverage-Aware Neural Machine Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  pages     = {292--297},
-  year      = {2018}
-}
 @inproceedings{Peris2017InteractiveNM,
  title={Interactive neural machine translation},
  author={{\'A}lvaro Peris and Miguel Domingo and F. Casacuberta},
@@ -8022,13 +7412,6 @@ author    = {Zhuang Liu and
  volume={45},
  pages={201-220}
 }
-@inproceedings{Peris2018ActiveLF,
-  title={Active Learning for Interactive Neural Machine Translation of Data Streams},
-  author={{\'A}lvaro Peris and Francisco Casacuberta},
-  publisher={The SIGNLL Conference on Computational Natural Language Learning},
-  pages={151--160},
-  year={2018}
-}
 @inproceedings{Xiao2016ALA,
  title={A Loss-Augmented Approach to Training Syntactic Machine Translation Systems},
  author={Tong Xiao and Derek F. Wong and Jingbo Zhu},
@@ -8037,16 +7420,6 @@ author    = {Zhuang Liu and
  volume={24},
  pages={2069-2083}
 }
-@inproceedings{DBLP:conf/acl/JeanCMB15,
-  author    = {S{\'{e}}bastien Jean and
-               KyungHyun Cho and
-               Roland Memisevic and
-               Yoshua Bengio},
-  title     = {On Using Very Large Target Vocabulary for Neural Machine Translation},
-  pages     = {1--10},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2015}
-}
 @inproceedings{61115,
  author={Jianhua Lin},
  publisher={IEEE Transactions on Information Theory}, 
@@ -8065,73 +7438,6 @@ author    = {Zhuang Liu and
  publisher = {	AAAI Conference on Artificial Intelligence},
  year      = {2019}
 }
-@inproceedings{DBLP:journals/corr/abs-1805-00631,
-  author    = {Biao Zhang and
-               Deyi Xiong and
-               Jinsong Su},
-  title     = {Accelerating Neural Transformer via an Average Attention Network},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  pages     = {1789--1798},
-  year      = {2018},
-}
-@inproceedings{Wu2019PayLA,
- author = {Felix Wu and
-		 Angela Fan and
-		 Alexei Baevski and
-		 Yann N. Dauphin and
-		 Michael Auli},
- title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
- publisher = {International Conference on Learning Representations},
- year = {2019},
-}
-@inproceedings{Xiao2019SharingAW,
-  author    = {Tong Xiao and
-               Yinqiao Li and
-               Jingbo Zhu and
-               Zhengtao Yu and
-               Tongran Liu},
-  title     = {Sharing Attention Weights for Fast Transformer},
-  publisher = {International Joint Conference on Artificial Intelligence},
-  pages     = {5292--5298},
-  year      = {2019}
-}
-@inproceedings{Chen2018TheBO,
-  author    = {Mia Xu Chen and
-               Orhan Firat and
-               Ankur Bapna and
-               Melvin Johnson and
-               Wolfgang Macherey and
-               George F. Foster and
-               Llion Jones and
-               Mike Schuster and
-               Noam Shazeer and
-               Niki Parmar and
-               Ashish Vaswani and
-               Jakob Uszkoreit and
-               Lukasz Kaiser and
-               Zhifeng Chen and
-               Yonghui Wu and
-               Macduff Hughes},
-  title     = {The Best of Both Worlds: Combining Recent Advances in Neural Machine
-               Translation},
-  pages     = {76--86},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2018}
-}
-@inproceedings{DBLP:journals/corr/abs-1906-00532,
-  author    = {Aishwarya Bhandare and
-               Vamsi Sripathi and
-               Deepthi Karkada and
-               Vivek Menon and
-               Sun Choi and
-               Kushal Datta and
-               Vikram Saletore},
-  title     = {Efficient 8-Bit Quantization of Transformer Neural Machine Language
-               Translation Model},
-  publisher   = {CoRR},
-  volume    = {abs/1906.00532},
-  year      = {2019}
-}
 @inproceedings{DBLP:conf/cvpr/JacobKCZTHAK18,
  author    = {Benoit Jacob and
               Skirmantas Kligys and
@@ -8239,14 +7545,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1611.08562},
  year      = {2016}
 }
-@inproceedings{xiao2013bagging,
-  title ={Bagging and boosting statistical machine translation systems},
-  author ={Tong Xiao and Jingbo Zhu and Tongran Liu },
-  publisher ={Artificial Intelligence},
-  volume ={195},
-  pages ={496--527},
-  year ={2013}
-}
 @inproceedings{DBLP:conf/emnlp/TrombleKOM08,
  author    = {Roy Tromble and
               Shankar Kumar and
@@ -8270,29 +7568,6 @@ author    = {Zhuang Liu and
  pages     = {3302--3308},
  year      = {2017}
 }
-@inproceedings{Shaw2018SelfAttentionWR,
-  author    = {Peter Shaw and
-               Jakob Uszkoreit and
-               Ashish Vaswani},
-  title     = {Self-Attention with Relative Position Representations},
-  publisher = {Proceedings of the Human Language Technology Conference of 
-               the North American Chapter of the Association for Computational Linguistics},
-  pages     = {464--468},
-  year      = {2018}
-}
-@inproceedings{WangLearning,
-  author    = {Qiang Wang and
-               Bei Li and
-               Tong Xiao and
-               Jingbo Zhu and
-               Changliang Li and
-               Derek F. Wong and
-               Lidia S. Chao},
-  title     = {Learning Deep Transformer Models for Machine Translation},
-  pages     = {1810--1822},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019}
-}
 @inproceedings{DBLP:conf/iclr/FanGJ20,
  author    = {Angela Fan and
               Edouard Grave and
@@ -8409,25 +7684,6 @@ author    = {Zhuang Liu and
  volume    = {abs/2010.02416},
  year      = {2020}
 }
-@inproceedings{Vaswani2018Tensor2TensorFN,
-   author    = {Ashish Vaswani and
-               Samy Bengio and
-               Eugene Brevdo and
-               Fran{\c{c}}ois Chollet and
-               Aidan N. Gomez and
-               Stephan Gouws and
-               Llion Jones and
-               Lukasz Kaiser and
-               Nal Kalchbrenner and
-               Niki Parmar and
-               Ryan Sepassi and
-               Noam Shazeer and
-               Jakob Uszkoreit},
-  title     = {Tensor2Tensor for Neural Machine Translation},
-  pages     = {193--199},
-  publisher = {Association for Machine Translation in the Americas},
-  year      = {2018}
-}
 @inproceedings{Sun2019BaiduNM,
  title={Baidu Neural Machine Translation Systems for WMT19},
  author    = {Meng Sun and
@@ -8610,17 +7866,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{KleinOpenNMT,
-  author    = {Guillaume Klein and
-               Yoon Kim and
-               Yuntian Deng and
-               Jean Senellart and
-               Alexander M. Rush},
-  title     = {OpenNMT: Open-Source Toolkit for Neural Machine Translation},
-  pages     = {67--72},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2017}
-}
 @inproceedings{DBLP:conf/acl/WuWXTGQLL19,
  author    = {Lijun Wu and
               Yiren Wang and
@@ -8635,16 +7880,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/cvpr/HuangLMW17,
-  author    = {Gao Huang and
-               Zhuang Liu and
-               Laurens van der Maaten and
-               Kilian Q. Weinberger},
-  title     = {Densely Connected Convolutional Networks},
-  pages     = {2261--2269},
-  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
-  year      = {2017}
-}
 @inproceedings{DBLP:journals/corr/GreffSS16,
  author    = {Klaus Greff and
               Rupesh Kumar Srivastava and
@@ -8653,31 +7888,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Learning Representations},
  year      = {2017}
 }
-@inproceedings{Bapna2018TrainingDN,
-  author    = {Ankur Bapna and
-               Mia Xu Chen and
-               Orhan Firat and
-               Yuan Cao and
-               Yonghui Wu},
-  title     = {Training Deeper Neural Machine Translation Models with Transparent
-               Attention},
-  pages     = {3028--3033},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2018}
-}
-@inproceedings{WangLearning,
-  author    = {Qiang Wang and
-               Bei Li and
-               Tong Xiao and
-               Jingbo Zhu and
-               Changliang Li and
-               Derek F. Wong and
-               Lidia S. Chao},
-  title     = {Learning Deep Transformer Models for Machine Translation},
-  pages     = {1810--1822},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019}
-}
 @inproceedings{DBLP:journals/corr/abs-2002-04745,
  author    = {Ruibin Xiong and
               Yunchang Yang and
@@ -8705,86 +7915,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-@inproceedings{DBLP:journals/corr/HeZRS15,
-  author    = {Kaiming He and
-               Xiangyu Zhang and
-               Shaoqing Ren and
-               Jian Sun},
-  title     = {Deep Residual Learning for Image Recognition},
-  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
-  pages     = {770--778},
-  year      = {2016},
-}
-@inproceedings{Ba2016LayerN,
-  author    = {Lei Jimmy Ba and
-               Jamie Ryan Kiros and
-               Geoffrey E. Hinton},
-  title     = {Layer Normalization},
-  publisher   = {CoRR},
-  volume    = {abs/1607.06450},
-  year      = {2016}
-}
-@inproceedings{Vaswani2018Tensor2TensorFN,
-   author    = {Ashish Vaswani and
-               Samy Bengio and
-               Eugene Brevdo and
-               Fran{\c{c}}ois Chollet and
-               Aidan N. Gomez and
-               Stephan Gouws and
-               Llion Jones and
-               Lukasz Kaiser and
-               Nal Kalchbrenner and
-               Niki Parmar and
-               Ryan Sepassi and
-               Noam Shazeer and
-               Jakob Uszkoreit},
-  title     = {Tensor2Tensor for Neural Machine Translation},
-  pages     = {193--199},
-  publisher = {Association for Machine Translation in the Americas},
-  year      = {2018}
-}
-@inproceedings{Dou2019DynamicLA,
-  author    = {Zi-Yi Dou and
-               Zhaopeng Tu and
-               Xing Wang and
-               Longyue Wang and
-               Shuming Shi and
-               Tong Zhang},
-  title     = {Dynamic Layer Aggregation for Neural Machine Translation with Routing-by-Agreement},
-  pages     = {86--93},
-  publisher = {AAAI Conference on Artificial Intelligence},
-  year      = {2019}
-}
-@inproceedings{Wang2018MultilayerRF,
-  title={Multi-layer Representation Fusion for Neural Machine Translation},
-  author={Qiang Wang and Fuxue Li and Tong Xiao and Yanyang Li and Yinqiao Li and Jingbo Zhu},
-  publisher={International Conference on Computational Linguistics},
-  year={2018},
-  volume={abs/2002.06714}
-}
-@inproceedings{Dou2018ExploitingDR,
-   author    = {Zi-Yi Dou and
-               Zhaopeng Tu and
-               Xing Wang and
-               Shuming Shi and
-               Tong Zhang},
-  title     = {Exploiting Deep Representations for Neural Machine Translation},
-  pages     = {4253--4262},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2018}
-}
-@inproceedings{DBLP:journals/corr/LinFSYXZB17,
-  author    = {Zhouhan Lin and
-               Minwei Feng and
-               C{\'{\i}}cero Nogueira dos Santos and
-               Mo Yu and
-               Bing Xiang and
-               Bowen Zhou and
-               Yoshua Bengio},
-  title     = {A Structured Self-Attentive Sentence Embedding},
-  publisher = {International Conference on Learning Representations},
-  year      = {2017},
-}
 @inproceedings{DBLP:conf/nips/SrivastavaGS15,
  author    = {Rupesh Kumar Srivastava and
               Klaus Greff and
@@ -8830,15 +7960,6 @@ author    = {Zhuang Liu and
  pages     = {1675--1685},
  year      = {2019}
 }
-@inproceedings{pmlr-v9-glorot10a,
-  author    = {Xavier Glorot and
-               Yoshua Bengio},
-  title     = {Understanding the difficulty of training deep feedforward neural networks},
-  publisher = {International Conference on Artificial Intelligence and Statistics},
-  volume    = {9},
-  pages     = {249--256},
-  year      = {2010}
-}
 @inproceedings{DBLP:conf/iccv/HeZRS15,
  author    = {Kaiming He and
               Xiangyu Zhang and
@@ -9269,13 +8390,6 @@ author    = {Zhuang Liu and
  volume    = {abs/2003.03384},
  year      = {2020}
 }
-@inproceedings{Chollet2017XceptionDL,
-  title={Xception: Deep Learning with Depthwise Separable Convolutions},
-  author    = {Fran{\c{c}}ois Chollet},
-  publisher={IEEE Conference on Computer Vision and Pattern Recognition},
-  year={2017},
-  pages={1800-1807}
-}
 @inproceedings{DBLP:journals/tnn/AngelineSP94,
  author    = {Peter J. Angeline and
               Gregory M. Saunders and
@@ -9523,20 +8637,6 @@ author    = {Zhuang Liu and
  volume    = {abs/2009.02070},
  year      = {2020}
 }
-@inproceedings{DBLP:conf/acl/WangWLCZGH20,
-  author    = {Hanrui Wang and
-               Zhanghao Wu and
-               Zhijian Liu and
-               Han Cai and
-               Ligeng Zhu and
-               Chuang Gan and
-               Song Han},
-  title     = {{HAT:} Hardware-Aware Transformers for Efficient Natural Language
-               Processing},
-  pages     = {7675--7688},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2020}
-}
 @inproceedings{DBLP:journals/corr/abs-2008-06808,
  author    = {Henry Tsai and
               Jayden Ooi and
@@ -9549,24 +8649,6 @@ author    = {Zhuang Liu and
  volume    = {abs/2008.06808},
  year      = {2020}
 }
-@inproceedings{Wang2019ExploitingSC,
-  title={Exploiting Sentential Context for Neural Machine Translation},
-  author={Xing Wang and Zhaopeng Tu and Longyue Wang and Shuming Shi},
-  publisher={Annual Meeting of the Association for Computational Linguistics},
-  year={2019}
-}
-@inproceedings{Wei2020MultiscaleCD,
-  title={Multiscale Collaborative Deep Models for Neural Machine Translation},
-  author={Xiangpeng Wei and Heng Yu and Yue Hu and Yue Zhang and Rongxiang Weng and Weihua Luo},
-  publisher={Annual Meeting of the Association for Computational Linguistics},
-  year={2020}
-}
-@inproceedings{li2020shallow,
-  title={Shallow-to-Deep Training for Neural Machine Translation},
-  author={Li, Bei and Wang, Ziyang and Liu, Hui and Jiang, Yufan and Du, Quan and Xiao, Tong and Wang, Huizhen and Zhu, Jingbo},
-  publisher={Conference on Empirical Methods in Natural Language Processing},
-  year={2020}
-}
 @inproceedings{DBLP:journals/corr/abs-2007-06257,
  author    = {Hongfei Xu and
               Qiuhui Liu and
@@ -9588,18 +8670,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-@inproceedings{DBLP:journals/corr/abs-2006-10369,
-  author    = {Jungo Kasai and
-               Nikolaos Pappas and
-               Hao Peng and
-               James Cross and
-               Noah A. Smith},
-  title     = {Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff
-               in Machine Translation},
-  publisher   = {CoRR},
-  volume    = {abs/2006.10369},
-  year      = {2020}
-}
 @inproceedings{DBLP:journals/corr/abs-1806-01261,
  author    = {Peter W. Battaglia and
               Jessica B. Hamrick and
@@ -9633,34 +8703,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1806.01261},
  year      = {2018}
 }
-@inproceedings{Shaw2018SelfAttentionWR,
-  author    = {Peter Shaw and
-               Jakob Uszkoreit and
-               Ashish Vaswani},
-  title     = {Self-Attention with Relative Position Representations},
-  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
-  pages     = {464--468},
-  year      = {2018},
-}
-@inproceedings{Dai2019TransformerXLAL,
-  author    = {Zihang Dai and
-               Zhilin Yang and
-               Yiming Yang and
-               Jaime G. Carbonell and
-               Quoc V. Le and
-               Ruslan Salakhutdinov},
-  title     = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context},
-  publisher   = {Annual Meeting of the Association for Computational Linguistics},
-  pages     = {2978--2988},
-  year      = {2019}
-}
-@inproceedings{vaswani2017attention,
-	title={Attention is All You Need},
-	author={Ashish {Vaswani} and Noam {Shazeer} and Niki {Parmar} and Jakob {Uszkoreit} and Llion {Jones} and Aidan N. {Gomez} and Lukasz {Kaiser} and Illia {Polosukhin}},
-	publisher={International Conference on Neural Information Processing},
-	pages={5998--6008},
-	year={2017}
-}
 @inproceedings{DBLP:conf/acl/LiXTZZZ17,
  author    = {Junhui Li and
               Deyi Xiong and
@@ -9681,18 +8723,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
-@inproceedings{Yang2017TowardsBH,
-  author    = {Baosong Yang and
-               Derek F. Wong and
-               Tong Xiao and
-               Lidia S. Chao and
-               Jingbo Zhu},
-  title     = {Towards Bidirectional Hierarchical Representations for Attention-based
-               Neural Machine Translation},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  pages     = {1432--1441},
-  year      = {2017}
-}
 @inproceedings{DBLP:conf/acl/ChenHCC17,
  author    = {Huadong Chen and
               Shujian Huang and
@@ -9704,16 +8734,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-@inproceedings{TuModeling,
-  author    = {Zhaopeng Tu and
-               Zhengdong Lu and
-               Yang Liu and
-               Xiaohua Liu and
-               Hang Li},
-  title     = {Modeling Coverage for Neural Machine Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016}
-}
 @inproceedings{DBLP:conf/wmt/SennrichH16,
  author    = {Rico Sennrich and
               Barry Haddow},
@@ -9739,13 +8759,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-@inproceedings{Aharoni2017TowardsSN,
-  title={Towards String-To-Tree Neural Machine Translation},
-  author={Roee Aharoni and 
-          Yoav Goldberg},
-  publisher={Annual Meeting of the Association for Computational Linguistics},
-  year={2017}
-}
 @inproceedings{DBLP:conf/iclr/Alvarez-MelisJ17,
  author    = {David Alvarez-Melis and
               Tommi S. Jaakkola},
@@ -9763,13 +8776,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
-@book{aho1972theory,
-  author    = {Aho, Alfred V and
-               Ullman, Jeffrey D},
-  title     = {The theory of parsing, translation, and compiling},
-  publisher = {Prentice-Hall Englewood Cliffs, NJ},
-  year      = {1973},
-}
 @inproceedings{DBLP:journals/corr/LuongLSVK15,
  author    = {Minh-Thang Luong and
               Quoc V. Le and
@@ -9805,26 +8811,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-@inproceedings{DBLP:journals/corr/abs-1808-09374,
-  author    = {Xinyi Wang and
-               Hieu Pham and
-               Pengcheng Yin and
-               Graham Neubig},
-  title     = {A Tree-based Decoder for Neural Machine Translation},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  pages     = {4772--4777},
-  year      = {2018}
-}
-@inproceedings{Tong2016Syntactic,
-  author    = {Tong Xiao and
-               Jingbo Zhu and
-               Chunliang Zhang and
-               Tongran Liu},
-  title     = {Syntactic Skeleton-Based Translation},
-  pages     = {2856--2862},
-  publisher = {AAAI Conference on Artificial Intelligence},
-  year      = {2016},
-}
 @inproceedings{DBLP:conf/emnlp/WangTWS19a,
  author    = {Xing Wang and
               Zhaopeng Tu and
@@ -9835,13 +8821,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2019}
 }
-@inproceedings{Liu2020LearningTE,
-	title={Learning to Encode Position for Transformer with Continuous Dynamical Model},
-	author={Xuanqing Liu and Hsiang-Fu Yu and Inderjit Dhillon and Cho-Jui Hsieh},
-	publisher={ArXiv},
-	year={2020},
-	volume={abs/2003.09229}
-}
 @inproceedings{DBLP:conf/nips/ChenRBD18,
  author    = {Tian Qi Chen and
               Yulia Rubanova and
@@ -9852,27 +8831,6 @@ author    = {Zhuang Liu and
  pages     = {6572--6583},
  year      = {2018}
 }
-@inproceedings{DBLP:journals/corr/LuongPM15,
-  author    = {Thang Luong and
-               Hieu Pham and
-               Christopher D. Manning},
-  title     = {Effective Approaches to Attention-based Neural Machine Translation},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  pages     = {1412--1421},
-  year      = {2015}
-}
-@inproceedings{Yang2018ModelingLF,
-	author    = {Baosong Yang and
-               Zhaopeng Tu and
-               Derek F. Wong and
-               Fandong Meng and
-               Lidia S. Chao and
-               Tong Zhang},
-  title     = {Modeling Localness for Self-Attention Networks},
-  pages     = {4449--4458},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2018}
-}
 @inproceedings{DBLP:conf/aaai/GuoQLXZ20,
  author    = {Qipeng Guo and
               Xipeng Qiu and
@@ -9884,33 +8842,6 @@ author    = {Zhuang Liu and
  publisher = {AAAI Conference on Artificial Intelligence},
  year      = {2020}
 }
-@inproceedings{Wu2019PayLA,
- author = {Felix Wu and
-		 Angela Fan and
-		 Alexei Baevski and
-		 Yann N. Dauphin and
-		 Michael Auli},
- title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
- publisher = {International Conference on Learning Representations},
- year = {2019},
-}
-@inproceedings{DBLP:conf/interspeech/GulatiQCPZYHWZW20,
-  author    = {Anmol Gulati and
-               James Qin and
-               Chung-Cheng Chiu and
-               Niki Parmar and
-               Yu Zhang and
-               Jiahui Yu and
-               Wei Han and
-               Shibo Wang and
-               Zhengdong Zhang and
-               Yonghui Wu and
-               Ruoming Pang},
-  title     = {Conformer: Convolution-augmented Transformer for Speech Recognition},
-  pages     = {5036--5040},
-  publisher = {International Speech Communication Association},
-  year      = {2020}
-}
 @inproceedings{DBLP:conf/cvpr/XieGDTH17,
  author    = {Saining Xie and
               Ross B. Girshick and
@@ -9961,16 +8892,6 @@ author    = {Zhuang Liu and
  number={3},
  year={2019},
 }
-@inproceedings{DBLP:conf/iclr/WuLLLH20,
-  author    = {Zhanghao Wu and
-               Zhijian Liu and
-               Ji Lin and
-               Yujun Lin and
-               Song Han},
-  title     = {Lite Transformer with Long-Short Range Attention},
-  publisher = {International Conference on Learning Representations},
-  year      = {2020}
-}
 @inproceedings{DBLP:conf/iclr/DehghaniGVUK19,
  author    = {Mostafa Dehghani and
               Stephan Gouws and
@@ -9981,12 +8902,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Learning Representations},
  year      = {2019}
 }
-@inproceedings{Lan2020ALBERTAL,
-  title={ALBERT: A Lite BERT for Self-supervised Learning of Language Representations},
-  author={Zhenzhong Lan and Mingda Chen and Sebastian Goodman and Kevin Gimpel and Piyush Sharma and Radu Soricut},
-  publisher={International Conference on Learning Representations},
-  year={2020}
-}
 @inproceedings{DBLP:conf/naacl/HaoWYWZT19,
  author    = {Jie Hao and
               Xing Wang and
@@ -10032,14 +8947,6 @@ author    = {Zhuang Liu and
  volume    = {abs/2004.05150},
  year      = {2020}
 }
-@inproceedings{Kitaev2020ReformerTE,
-  author    = {Nikita Kitaev and
-               Lukasz Kaiser and
-               Anselm Levskaya},
-  title     = {Reformer: The Efficient Transformer},
-  publisher = {International Conference on Learning Representations},
-  year      = {2020}
-}
 @inproceedings{DBLP:journals/corr/abs-2003-05997,
  author    = {Aurko Roy and
               Mohammad Saffar and
@@ -10050,13 +8957,6 @@ author    = {Zhuang Liu and
  volume    = {abs/2003.05997},
  year      = {2020}
 }
-@inproceedings{Katharopoulos2020TransformersAR,
-  title={Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention},
-  author={Angelos Katharopoulos and Apoorv Vyas and Nikolaos Pappas and Franccois Fleuret},
-  publisher={CoRR},
-  year={2020},
-  volume={abs/2006.16236}
-}
 @inproceedings{DBLP:journals/corr/abs-2009-14794,
  author    = {Krzysztof Choromanski and
               Valerii Likhosherstov and
@@ -10099,17 +8999,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2018}
 }
-@inproceedings{DBLP:journals/corr/abs-2006-04768,
-  author    = {Sinong Wang and
-               Belinda Z. Li and
-               Madian Khabsa and
-               Han Fang and
-               Hao Ma},
-  title     = {Linformer: Self-Attention with Linear Complexity},
-  publisher   = {CoRR},
-  volume    = {abs/2006.04768},
-  year      = {2020}
-}
 @inproceedings{DBLP:conf/nips/BergstraBBK11,
  author    = {James Bergstra and
               R{\'{e}}mi Bardenet and
@@ -10131,18 +9020,6 @@ author    = {Zhuang Liu and
  publisher = {Learning and Intelligent Optimization},
  year      = {2011}
 }
-@inproceedings{DBLP:conf/icml/BergstraYC13,
-  author    = {James Bergstra and
-               Daniel Yamins and
-               David D. Cox},
-  title     = {Making a Science of Model Search: Hyperparameter Optimization in Hundreds
-               of Dimensions for Vision Architectures},
-  series    = {{JMLR} Workshop and Conference Proceedings},
-  volume    = {28},
-  pages     = {115--123},
-  publisher = {International Conference on Machine Learning},
-  year      = {2013}
-}
 @inproceedings{DBLP:conf/iccv/ChenXW019,
  author    = {Xin Chen and
               Lingxi Xie and
@@ -10165,122 +9042,34 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Machine Learning},
  year      = {2020}
 }
-@inproceedings{Jawahar2019WhatDB,
-	title={What Does BERT Learn about the Structure of Language?},
-	author={Ganesh Jawahar and Beno{\^{\i}}t Sagot and Djam{\'e} Seddah},
-	publisher={Annual Meeting of the Association for Computational Linguistics},
-	year={2019}
-}
 @inproceedings{DBLP:conf/emnlp/Ethayarajh19,
  author    = {Kawin Ethayarajh},
  title     = {How Contextual are Contextualized Word Representations? Comparing
               the Geometry of BERT, ELMo, and {GPT-2} Embeddings},
  pages     = {55--65},
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2019}
 }
-@inproceedings{DBLP:journals/corr/abs-1905-09418,
+@inproceedings{DBLP:conf/emnlp/LiTYLZ18,
-  author    = {Elena Voita and
+  author    = {Jian Li and
-               David Talbot and
+               Zhaopeng Tu and
-               Fedor Moiseev and
+               Baosong Yang and
-               Rico Sennrich and
+               Michael R. Lyu and
-               Ivan Titov},
+               Tong Zhang},
-  title     = {Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy
+  title     = {Multi-Head Attention with Disagreement Regularization},
-               Lifting, the Rest Can Be Pruned},
+  pages     = {2897--2903},
-  pages     = {5797--5808},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019},
-}
-@inproceedings{Michel2019AreSH,
-  title={Are Sixteen Heads Really Better than One?},
-  author    = {Paul Michel and
-               Omer Levy and
-               Graham Neubig},
-  title     = {Are Sixteen Heads Really Better than One?},
-  publisher = {Annual Conference on Neural Information Processing Systems},
-  pages     = {14014--14024},
-  year      = {2019}
-}
-@inproceedings{DBLP:conf/emnlp/LiTYLZ18,
-  author    = {Jian Li and
-               Zhaopeng Tu and
-               Baosong Yang and
-               Michael R. Lyu and
-               Tong Zhang},
-  title     = {Multi-Head Attention with Disagreement Regularization},
-  pages     = {2897--2903},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2018}
-}
-@inproceedings{Su2018VariationalRN,
-  title={Variational Recurrent Neural Machine Translation},
-  author={Jinsong Su and Shan Wu and Deyi Xiong and Yaojie Lu and Xianpei Han and Biao Zhang},
-  publisher={AAAI Conference on Artificial Intelligence},
-  pages={5488--5495},
-  year={2018}
-}
-@inproceedings{DBLP:conf/acl/SetiawanSNP20,
-  author    = {Hendra Setiawan and
-               Matthias Sperber and
-               Udhyakumar Nallasamy and
-               Matthias Paulik},
-  title     = {Variational Neural Machine Translation with Normalizing Flows},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2020}
-}
-@inproceedings{Li2020NeuralMT,
-  author    = {Yanyang Li and
-               Qiang Wang and
-               Tong Xiao and
-               Tongran Liu and
-               Jingbo Zhu},
-  title     = {Neural Machine Translation with Joint Representation},
-  pages     = {8285--8292},
-  publisher = {AAAI Conference on Artificial Intelligence},
-  year      = {2020}
-}
-@inproceedings{JMLR:v15:srivastava14a,
-  author  = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
-  title   = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
-  publisher = {Journal of Machine Learning Research},
-  year    = {2014},
-  volume  = {15},
-  pages   = {1929-1958},
-}
-@inproceedings{Szegedy_2016_CVPR,
-  author    = {Christian Szegedy and
-               Vincent Vanhoucke and
-               Sergey Ioffe and
-               Jonathon Shlens and
-               Zbigniew Wojna},
-  title     = {Rethinking the Inception Architecture for Computer Vision},
-  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
-  pages     = {2818--2826},
-  year      = {2016},
-}
-@inproceedings{Chen2018TheBO,
-  author    = {Mia Xu Chen and
-               Orhan Firat and
-               Ankur Bapna and
-               Melvin Johnson and
-               Wolfgang Macherey and
-               George F. Foster and
-               Llion Jones and
-               Mike Schuster and
-               Noam Shazeer and
-               Niki Parmar and
-               Ashish Vaswani and
-               Jakob Uszkoreit and
-               Lukasz Kaiser and
-               Zhifeng Chen and
-               Yonghui Wu and
-               Macduff Hughes},
-  title     = {The Best of Both Worlds: Combining Recent Advances in Neural Machine
-               Translation},
-  pages     = {76--86},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
+@inproceedings{DBLP:conf/acl/SetiawanSNP20,
+  author    = {Hendra Setiawan and
+               Matthias Sperber and
+               Udhyakumar Nallasamy and
+               Matthias Paulik},
+  title     = {Variational Neural Machine Translation with Normalizing Flows},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2020}
+}
 @inproceedings{DBLP:conf/naacl/GuoQLSXZ19,
  author    = {Qipeng Guo and
               Xipeng Qiu and
@@ -10312,6 +9101,16 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Learning Representations},
  year      = {2018}
 }
+@inproceedings{DBLP:conf/cvpr/HuangLMW17,
+  author    = {Gao Huang and
+               Zhuang Liu and
+               Laurens van der Maaten and
+               Kilian Q. Weinberger},
+  title     = {Densely Connected Convolutional Networks},
+  pages     = {2261--2269},
+  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
+  year      = {2017}
+}
 %%%%% chapter 15------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -10575,20 +9374,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{2015OnGulcehre,
-  title = {On Using Monolingual Corpora in Neural Machine Translation},
-  author = {Gulcehre Caglar  and  
-           Firat Orhan  and  
-           Xu Kelvin  and  
-           Cho Kyunghyun  and  
-           Barrault Loic  and  
-           Lin Huei Chi  and  
-           Bougares Fethi  and  
-           Schwenk Holger  and  
-           Bengio  Yoshua},
-  publisher = {Computer Science},
-  year = {2015},
-}
 @inproceedings{黄书剑0统计机器翻译中的词对齐研究,
  title={统计机器翻译中的词对齐研究},
  author={黄书剑},
@@ -11087,18 +9872,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2016}
 }
-@inproceedings{DBLP:conf/emnlp/KimPPKN19,
-  author    = {Yunsu Kim and
-               Petre Petrov and
-               Pavel Petrushkov and
-               Shahram Khadivi and
-               Hermann Ney},
-  title     = {Pivot-based Transfer Learning for Neural Machine Translation between
-               Non-English Languages},
-  pages     = {866--876},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019}
-}
 @inproceedings{DBLP:journals/mt/WuW07,
  author    = {Hua Wu and
               Haifeng Wang},
@@ -11211,15 +9984,6 @@ author    = {Zhuang Liu and
  publisher = {International Joint Conference on Natural Language Processing},
  year      = {2011}
 }
-@inproceedings{DBLP:journals/corr/HintonVD15,
-  author    = {Geoffrey E. Hinton and
-               Oriol Vinyals and
-               Jeffrey Dean},
-  title     = {Distilling the Knowledge in a Neural Network},
-  publisher   = {CoRR},
-  volume    = {abs/1503.02531},
-  year      = {2015}
-}
 @inproceedings{gu2018meta,
  author    = {Jiatao Gu and
               Yong Wang and
@@ -11541,25 +10305,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/emnlp/FiratSAYC16,
-  author    = {Orhan Firat and
-               Baskaran Sankaran and
-               Yaser Al-Onaizan and
-               Fatos T. Yarman-Vural and
-               Kyunghyun Cho},
-  title     = {Zero-Resource Translation with Multi-Lingual Neural Machine Translation},
-  pages     = {268--277},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2016}
-}
-@inproceedings{DBLP:conf/emnlp/CurreyH19,
-  author    = {Anna Currey and
-               Kenneth Heafield},
-  title     = {Zero-Resource Neural Machine Translation with Monolingual Pivot Data},
-  pages     = {99--107},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2019}
-}
 @inproceedings{DBLP:conf/acl/FadaeeBM17a,
  author    = {Marzieh Fadaee and
               Arianna Bisazza and
@@ -11609,15 +10354,6 @@ author    = {Zhuang Liu and
  year      = {2008},
  publisher = {International Conference on Machine Learning}
 }
-@inproceedings{DBLP:conf/iclr/LampleCDR18,
-  author    = {Guillaume Lample and
-               Alexis Conneau and
-               Ludovic Denoyer and
-               Marc'Aurelio Ranzato},
-  title     = {Unsupervised Machine Translation Using Monolingual Corpora Only},
-  publisher = {International Conference on Learning Representations},
-  year      = {2018}
-}
 @inproceedings{DBLP:journals/coling/BhagatH13,
  author    = {Rahul Bhagat and
               Eduard Hovy},
@@ -11684,16 +10420,6 @@ author    = {Zhuang Liu and
  pages     = {569--631},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/acl/TuLLLL16,
-  author    = {Zhaopeng Tu and
-               Zhengdong Lu and
-               Yang Liu and
-               Xiaohua Liu and
-               Hang Li},
-  title     = {Modeling Coverage for Neural Machine Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016}
-}
 @inproceedings{DBLP:journals/tacl/TuLLLL17,
  author    = {Zhaopeng Tu and
               Yang Liu and
@@ -11748,29 +10474,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
-@inproceedings{DBLP:conf/wmt/LiLXLLLWZXWFCLL19,
-  author    = {Bei Li and
-               Yinqiao Li and
-               Chen Xu and
-               Ye Lin and
-               Jiqiang Liu and
-               Hui Liu and
-               Ziyang Wang and
-               Yuhao Zhang and
-               Nuo Xu and
-               Zeyang Wang and
-               Kai Feng and
-               Hexuan Chen and
-               Tengbo Liu and
-               Yanyang Li and
-               Qiang Wang and
-               Tong Xiao and
-               Jingbo Zhu},
-  title     = {The NiuTrans Machine Translation Systems for {WMT19}},
-  pages     = {257--266},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019}
-}
 @inproceedings{DBLP:conf/nips/DaiL15,
  author    = {Andrew Dai and
               Quoc Le},
@@ -11779,19 +10482,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference on Neural Information Processing Systems},
  year      = {2015}
 }
-@inproceedings{DBLP:journals/corr/abs-1802-05365,
-  author    = {Matthew Peters and
-               Mark Neumann and
-               Mohit Iyyer and
-               Matt Gardner and
-               Christopher Clark and
-               Kenton Lee and
-               Luke Zettlemoyer},
-  title     = {Deep Contextualized Word Representations},
-  pages     = {2227--2237},
-  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
-  year      = {2018}
-}
 @inproceedings{DBLP:conf/icml/CollobertW08,
  author    = {Ronan Collobert and
               Jason Weston},
@@ -11889,16 +10579,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:journals/corr/LuongLSVK15,
-  author    = {Minh-Thang Luong and
-               Quoc V. Le and
-               Ilya Sutskever and
-               Oriol Vinyals and
-               Lukasz Kaiser},
-  title     = {Multi-task Sequence to Sequence Learning},
-  publisher = {International Conference on Learning Representations},
-  year      = {2016}
-}
 @inproceedings{DBLP:conf/emnlp/ZhangZ16,
  author    = {Jiajun Zhang and
               Chengqing Zong},
@@ -12094,13 +10774,6 @@ author    = {Zhuang Liu and
  pages={117},
  year={2015}
 }
-@inproceedings{chen2016bilingual,
-  title={Bilingual methods for adaptive training data selection for machine translation},
-  author={Chen, Boxing and Kuhn, Roland and Foster, George and Cherry, Colin and Huang, Fei},
-  publisher={Association for Machine Translation in the Americas},
-  pages={93--103},
-  year={2016}
-}
 @inproceedings{DBLP:conf/iwslt/Ueffing06,
  author    = {Nicola Ueffing},
  title     = {Using monolingual source-language data to improve {MT} performance},
@@ -12272,15 +10945,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-@inproceedings{DBLP:conf/emnlp/AxelrodHG11,
-  author    = {Amittai Axelrod and
-               Xiaodong He and
-               Jianfeng Gao},
-  title     = {Domain Adaptation via Pseudo In-Domain Data Selection},
-  pages     = {355--362},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2011}
-}
 @inproceedings{DBLP:conf/icdm/Remus12,
  author    = {Robert Remus},
  title     = {Domain Adaptation Using Domain Similarity- and Domain Complexity-Based
@@ -12309,13 +10973,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{britz2017effective,
-  title={Effective domain mixing for neural machine translation},
-  author={Britz, Denny and Le, Quoc and Pryzant, Reid},
-  publisher={Proceedings of the Second Conference on Machine Translation},
-  pages={118--126},
-  year={2017}
-}
 @inproceedings{DBLP:conf/ranlp/KobusCS17,
  author    = {Catherine Kobus and
               Josep Maria Crego and
@@ -12326,27 +10983,6 @@ author    = {Zhuang Liu and
               Language Processing},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/emnlp/WangULCS17,
-  author    = {Rui Wang and
-               Masao Utiyama and
-               Lemao Liu and
-               Kehai Chen and
-               Eiichiro Sumita},
-  title     = {Instance Weighting for Neural Machine Translation Domain Adaptation},
-  pages     = {1482--1488},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2017}
-}
-@inproceedings{DBLP:conf/aclnmt/ChenCFL17,
-  author    = {Boxing Chen and
-               Colin Cherry and
-               George F. Foster and
-               Samuel Larkin},
-  title     = {Cost Weighting for Neural Machine Translation Domain Adaptation},
-  pages     = {40--46},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2017}
-}
 @inproceedings{DBLP:journals/corr/abs-1906-03129,
  author    = {Shen Yan and
               Leonard Dahlmann and
@@ -12432,15 +11068,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/wmt/BritzLP17,
-  author    = {Denny Britz and
-               Quoc V. Le and
-               Reid Pryzant},
-  title     = {Effective Domain Mixing for Neural Machine Translation},
-  pages     = {118--126},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2017}
-}
 @inproceedings{DBLP:journals/ibmrd/Luhn58,
  author    = {Hans Peter Luhn},
  title     = {The Automatic Creation of Literature Abstracts},
@@ -12468,27 +11095,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/emnlp/WeesBM17,
-  author    = {Marlies van der Wees and
-               Arianna Bisazza and
-               Christof Monz},
-  title     = {Dynamic Data Selection for Neural Machine Translation},
-  pages     = {1400--1410},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2017}
-}
-@inproceedings{DBLP:conf/naacl/ZhangSKMCD19,
-  author    = {Xuan Zhang and
-               Pamela Shapiro and
-               Gaurav Kumar and
-               Paul McNamee and
-               Marine Carpuat and
-               Kevin Duh},
-  title     = {Curriculum Learning for Domain Adaptation in Neural Machine Translation},
-  pages     = {1903--1915},
-  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
-  year      = {2019}
-}
 @inproceedings{DBLP:conf/acl/ChuDK17,
  author    = {Chenhui Chu and
               Raj Dabre and
@@ -12564,18 +11170,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference of the European Association for Machine Translation},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/aaai/Zhang0LZC18,
-  author    = {Zhirui Zhang and
-               Shujie Liu and
-               Mu Li and
-               Ming Zhou and
-               Enhong Chen},
-  title     = {Joint Training for Neural Machine Translation Models with Monolingual
-               Data},
-  pages     = {555--562},
-  publisher = {AAAI Conference on Artificial Intelligence},
-  year      = {2018}
-}
 @inproceedings{DBLP:conf/wmt/SunJXHWW19,
  author    = {Meng Sun and
               Bojian Jiang and
@@ -12794,19 +11388,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Machine Learning},
  year      = {2018}
 }
-@inproceedings{DBLP:conf/nips/HeXQWYLM16,
-  author    = {Di He and
-               Yingce Xia and
-               Tao Qin and
-               Liwei Wang and
-               Nenghai Yu and
-               Tie-Yan Liu and
-               Wei-Ying Ma},
-  title     = {Dual Learning for Machine Translation},
-  publisher = {Annual Conference on Neural Information Processing Systems},
-  pages     = {820--828},
-  year      = {2016}
-}
 @article{zhao2020dual,
  title={Dual Learning: Theoretical Study and an Algorithmic Extension},
  author={Zhao, Zhibing and Xia, Yingce and Qin, Tao and Xia, Lirong and Liu, Tie-Yan},
@@ -12832,12 +11413,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1901.09115},
  year      = {2019}
 }
-@book{jurafsky2000speech,
-  title={Speech \& language processing},
-  author={Jurafsky, Dan},
-  year={2000},
-  publisher={Pearson Education India}
-}
 @inproceedings{DBLP:conf/anlp/MarcuCW00,
  author    = {Daniel Marcu and
               Lynn Carlson and
@@ -12936,19 +11511,10 @@ author    = {Zhuang Liu and
  author    = {Thomas Meyer and
               Andrei Popescu-Belis},
  title     = {Using Sense-labeled Discourse Connectives for Statistical Machine
               Translation},
  pages     = {129--138},
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2012}
-}
-@inproceedings{DBLP:conf/nips/SutskeverVL14,
-  author    = {Ilya Sutskever and
-               Oriol Vinyals and
-               Quoc V. Le},
-  title     = {Sequence to Sequence Learning with Neural Networks},
-  pages     = {3104--3112},
-  year      = {2014},
-  publisher = {Annual Conference on Neural Information Processing Systems}
 }
 @inproceedings{DBLP:conf/emnlp/LaubliS018,
  author    = {Samuel L{\"{a}}ubli and
@@ -12995,16 +11561,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1704.05135},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/acl/TitovSSV18,
-  author    = {Elena Voita and
-               Pavel Serdyukov and
-               Rico Sennrich and
-               Ivan Titov},
-  title     = {Context-Aware Neural Machine Translation Learns Anaphora Resolution},
-  pages     = {1264--1274},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2018}
-}
 @inproceedings{DBLP:conf/acl/HaffariM18,
  author    = {Sameen Maruf and
               Gholamreza Haffari},
@@ -13129,14 +11685,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
-@inproceedings{DBLP:conf/iclr/KitaevKL20,
-  author    = {Nikita Kitaev and
-               Lukasz Kaiser and
-               Anselm Levskaya},
-  title     = {Reformer: The Efficient Transformer},
-  publisher = {International Conference on Learning Representations},
-  year      = {2020}
-}
 @inproceedings{agrawal2018contextual,
  title={Contextual handling in neural machine translation: Look behind, ahead and on both sides},
  author={Agrawal, Ruchit Rajeshkumar and Turchi, Marco and Negri, Matteo},
@@ -13144,17 +11692,6 @@ author    = {Zhuang Liu and
  pages={11--20},
  year={2018}
 }
-@inproceedings{DBLP:conf/emnlp/WerlenRPH18,
-  author    = {Lesly Miculicich Werlen and
-               Dhananjay Ram and
-               Nikolaos Pappas and
-               James Henderson},
-  title     = {Document-Level Neural Machine Translation with Hierarchical Attention
-               Networks},
-  pages     = {2947--2954},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2018}
-}
 @inproceedings{DBLP:conf/naacl/MarufMH19,
  author    = {Sameen Maruf and
               Andr{\'{e}} F. T. Martins and
@@ -13230,21 +11767,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/acl/LiLWJXZLL20,
-  author    = {Bei Li and
-               Hui Liu and
-               Ziyang Wang and
-               Yufan Jiang and
-               Tong Xiao and
-               Jingbo Zhu and
-               Tongran Liu and
-               Changliang Li},
-  title     = {Does Multi-Encoder Help? {A} Case Study on Context-Aware Neural Machine
-               Translation},
-  pages     = {3512--3518},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2020}
-}
 @inproceedings{DBLP:conf/discomt/KimTN19,
  author    = {Yunsu Kim and
               Duc Thanh Tran and
@@ -13364,21 +11886,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1911.03110},
  year      = {2019}
 }
-@article{DBLP:journals/tacl/LiuGGLEGLZ20,
-  author    = {Yinhan Liu and
-               Jiatao Gu and
-               Naman Goyal and
-               Xian Li and
-               Sergey Edunov and
-               Marjan Ghazvininejad and
-               Mike Lewis and
-               Luke Zettlemoyer},
-  title     = {Multilingual Denoising Pre-training for Neural Machine Translation},
-  journal   = {Transactions of the Association for Computational Linguistics},
-  volume    = {8},
-  pages     = {726--742},
-  year      = {2020}
-}
 @inproceedings{DBLP:conf/wmt/MarufMH18,
  author    = {Sameen Maruf and
               Andr{\'{e}} F. T. Martins and
@@ -13480,17 +11987,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/acl/LiuTMCZ18,
-  author    = {Yong Cheng and
-               Zhaopeng Tu and
-               Fandong Meng and
-               Junjie Zhai and
-               Yang Liu},
-  title     = {Towards Robust Neural Machine Translation},
-  pages     = {1756--1766},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2018}
-}
 @inproceedings{DBLP:conf/naacl/DuongACBC16,
  author    = {Long Duong and
               Antonios Anastasopoulos and
@@ -14262,20 +12758,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Learning Representations},
  year      = {2020}
 }
-@inproceedings{DBLP:conf/nips/GoodfellowPMXWOCB14,
-  author    = {Ian J. Goodfellow and
-               Jean Pouget-Abadie and
-               Mehdi Mirza and
-               Bing Xu and
-               David Warde-Farley and
-               Sherjil Ozair and
-               Aaron C. Courville and
-               Yoshua Bengio},
-  title     = {Generative Adversarial Nets},
-  publisher = {Conference on Neural Information Processing Systems},
-  pages     = {2672--2680},
-  year      = {2014}
-}
 @inproceedings{DBLP:conf/nips/ZhuZPDEWS17,
  author    = {Jun-Yan Zhu and
               Richard Zhang and
@@ -14320,16 +12802,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Computer Vision},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/iccv/YiZTG17,
-  author    = {Zili Yi and
-               Hao (Richard) Zhang and
-               Ping Tan and
-               Minglun Gong},
-  title     = {DualGAN: Unsupervised Dual Learning for Image-to-Image Translation},
-  pages     = {2868--2876},
-  publisher = {International Conference on Computer Vision},
-  year      = {2017}
-}
 @inproceedings{DBLP:conf/nips/LiuBK17,
  author    = {Ming-Yu Liu and
               Thomas Breuel and
@@ -14584,24 +13056,6 @@ author    = {Zhuang Liu and
  pages     = {163--185},
  year      = {2017}
 }
-@inproceedings{Peris2017InteractiveNM,
-  title={Interactive neural machine translation},
-  author={{\'A}lvaro Peris and Miguel Domingo and F. Casacuberta},
-  publisher={Computer Speech and Language},
-  year={2017},
-  volume={45},
-  pages={201-220}
-}
-@inproceedings{DBLP:journals/csl/PerisC19,
-  author    = {{\'{A}}lvaro Peris and
-               Francisco Casacuberta},
-  title     = {Online learning for effort reduction in interactive neural machine
-               translation},
-  publisher   = {Computer Speech Language},
-  volume    = {58},
-  pages     = {98--126},
-  year      = {2019}
-}
 @inproceedings{DBLP:journals/coling/BarrachinaBCCCKLNTVV09,
  author    = {Sergio Barrachina and
               Oliver Bender and
@@ -14670,16 +13124,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1702.07811},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/emnlp/WangXZ20,
-  author    = {Qiang Wang and
-               Tong Xiao and
-               Jingbo Zhu},
-  title     = {Training Flexible Depth Model by Multi-Task Learning for Neural Machine
-               Translation},
-  pages     = {4307--4312},
-  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2020}
-}
 @inproceedings{DBLP:conf/ijcai/ChenCWL20,
  author    = {Guanhua Chen and
               Yun Chen and
@@ -14762,18 +13206,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/naacl/ThompsonGKDK19,
-  author    = {Brian Thompson and
-               Jeremy Gwinnup and
-               Huda Khayrallah and
-               Kevin Duh and
-               Philipp Koehn},
-  title     = {Overcoming Catastrophic Forgetting During Domain Adaptation of Neural
-               Machine Translation},
-  pages     = {2062--2068},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019}
-}
 @inproceedings{DBLP:conf/aclnmt/KhayrallahTDK18,
  author    = {Huda Khayrallah and
               Brian Thompson and
@@ -14785,12 +13217,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
-@inproceedings{barone2017regularization,
-  title={Regularization techniques for fine-tuning in neural machine translation},
-  author={Barone, Antonio Valerio Miceli and Haddow, Barry and Germann, Ulrich and Sennrich, Rico},
-  publisher={arXiv preprint arXiv:1707.09920},
-  year={2017}
-}
 @inproceedings{DBLP:journals/corr/ChuDK17,
  author    = {Chenhui Chu and
               Raj Dabre and
@@ -14801,15 +13227,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1701.03214},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/coling/GuF20,
-  author    = {Shuhao Gu and
-               Yang Feng},
-  title     = {Investigating Catastrophic Forgetting During Continual Training for
-               Neural Machine Translation},
-  pages     = {4315--4326},
-  publisher = {International Committee on Computational Linguistics},
-  year      = {2020}
-}
 %%%%% chapter 18------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -14885,15 +13302,6 @@ author    = {Zhuang Liu and
  pages={197--216},
  year={2012}
 }
-@inproceedings{DBLP:conf/naacl/DyerCS13,
-  author    = {Chris Dyer and
-               Victor Chahuneau and
-               Noah A. Smith},
-  title     = {A Simple, Fast, and Effective Reparameterization of {IBM} Model 2},
-  pages     = {644--648},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2013}
-}
 @inproceedings{al2016theano,
  author    = {Rami Al-Rfou and
               Guillaume Alain and
@@ -15013,63 +13421,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1605.02688},
  year      = {2016}
 }
-@inproceedings{DBLP:journals/corr/SennrichFCBHHJL17,
-  author    = {Rico Sennrich and
-               Orhan Firat and
-               Kyunghyun Cho and
-               Barry Haddow and
-			   Alexandra Birch and
-               Julian Hitschler and
-               Marcin Junczys-Dowmunt and
-               Samuel L{\"{a}}ubli and
-               Antonio Valerio Miceli Barone and
-               Jozef Mokry and
-               Maria Nadejde},
-  title     = {Nematus: a Toolkit for Neural Machine Translation},
-  publisher = {Annual Conference of the European Association for Machine Translation},
-  pages     = {65--68},
-  year      = {2017}
-}
-@inproceedings{Koehn2007Moses,
-  author    = {Philipp Koehn and
-               Hieu Hoang and
-			    Alexandra Birch and
-               Chris Callison-Burch and
-               Marcello Federico and
-               Nicola Bertoldi and
-               Brooke Cowan and
-               Wade Shen and
-               Christine Moran and
-               Richard Zens and
-               Chris Dyer and
-               Ondrej Bojar and
-               Alexandra Constantin and
-               Evan Herbst},
-  title     = {Moses: Open Source Toolkit for Statistical Machine Translation},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2007}
-}
-@inproceedings{zollmann2007the,
-  author    = {Andreas Zollmann and
-               Ashish Venugopal and
-               Matthias Paulik and
-               Stephan Vogel},
-  title     = {The Syntax Augmented {MT} {(SAMT)} System at the Shared Task for the
-               2007 {ACL} Workshop on Statistical Machine Translation},
-  pages     = {216--219},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2007}
-}
-@inproceedings{och2003systematic,
-  author    = {Franz Josef Och and
-               Hermann Ney},
-  title     = {A Systematic Comparison of Various Statistical Alignment Models},
-  publisher   = {Computational Linguistics},
-  volume    = {29},
-  number    = {1},
-  pages     = {19--51},
-  year      = {2003}
-}
 @inproceedings{zoph2016simple,
  author    = {Barret Zoph and
               Ashish Vaswani and
@@ -15080,50 +13431,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
-@inproceedings{Ottfairseq,
-  author    = {Myle Ott and
-               Sergey Edunov and
-               Alexei Baevski and
-               Angela Fan and
-               Sam Gross and
-               Nathan Ng and
-               David Grangier and
-               Michael Auli},
-  title     = {fairseq: {A} Fast, Extensible Toolkit for Sequence Modeling},
-  pages     = {48--53},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019}
-}
-@inproceedings{Vaswani2018Tensor2TensorFN,
-   author    = {Ashish Vaswani and
-               Samy Bengio and
-               Eugene Brevdo and
-               Fran{\c{c}}ois Chollet and
-               Aidan N. Gomez and
-               Stephan Gouws and
-               Llion Jones and
-               Lukasz Kaiser and
-               Nal Kalchbrenner and
-               Niki Parmar and
-               Ryan Sepassi and
-               Noam Shazeer and
-               Jakob Uszkoreit},
-  title     = {Tensor2Tensor for Neural Machine Translation},
-  pages     = {193--199},
-  publisher = {Association for Machine Translation in the Americas},
-  year      = {2018}
-}
-@inproceedings{KleinOpenNMT,
-  author    = {Guillaume Klein and
-               Yoon Kim and
-               Yuntian Deng and
-               Jean Senellart and
-               Alexander M. Rush},
-  title     = {OpenNMT: Open-Source Toolkit for Neural Machine Translation},
-  pages     = {67--72},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2017}
-}
 @inproceedings{luong2016acl_hybrid,
  author    = {Minh-Thang Luong and
               Christopher D. Manning},