16的图和文字

95648175 · 孟霞 · 845a1a6c · 95648175 · 95648175 · 95648175
Commit 95648175 authored Jan 08, 2021 by 孟霞
--- a/Chapter14/Figures/figure-3vs.tex
+++ b/Chapter14/Figures/figure-3vs.tex
@@ -2,9 +2,10 @@
 \tikzstyle{decoder} = [rectangle,thick,rounded corners,minimum width=5cm,minimum height=0.6cm,text centered,draw=black,fill=blue!15]
 \begin{scope}
-\node (aa)[decoder] at (0,0) {};
+\node (encoder) at (0,0) {来自编码器的信息};
+\node (aa)[decoder,anchor=east] at ([xshift=5.5cm]encoder.east) {};
 \node (y2b)[anchor=south] at ([yshift=-2.5em]aa.south) {$y_2$};
-\node (label)[anchor=south] at ([yshift=-1.8em]y2b.south) {\small{(a) 自回归解码}};
+\node (label)[anchor=south] at ([xshift=-4.5em,yshift=-1.8em]y2b.south) {\small{(a) 自回归解码}};
 \node (y1b)[anchor=east] at ([xshift=-2.5em]y2b.east) {$y_1$};
 \node (sos)[anchor=east] at ([xshift=-4.3em]y2b.east) {\small{<sos>}};
 \node (y3b)[anchor=west] at ([xshift=2.5em]y2b.west) {$y_3$};
@@ -21,8 +22,9 @@
 \draw [->,very thick,dotted] ([xshift=-0.3em]y3a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0.3em]y3b.west);
 \draw [->,very thick,dotted] ([xshift=-0.3em]y4a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0.3em]y4b.west);
-\node (autodecoder)[decoder] at (0,0) {自回归编码器};
+\node (autodecoder)[decoder,anchor=east] at ([xshift=5.5cm]encoder.east) {自回归解码器};
+%\node (encoder)[decoder,anchor=west,fill=red!20] at ([xshift=-2cm]autodecoder.west) {编码器};
+\draw [->,thick](encoder.east) to (autodecoder.west); 
 \draw [->,thick]([yshift=0em]y1b.north) to ([yshift=1.15em]y1b.north); 
 \draw [->,thick]([yshift=0em]y2b.north) to ([yshift=1.15em]y2b.north);
 \draw [->,thick]([yshift=0em]y3b.north) to ([yshift=1.15em]y3b.north);
@@ -37,9 +39,10 @@
 \end{scope}
 \begin{scope}[yshift=-1.55in]
-\node (aa) [decoder] at (0,0) {};
+\node (encoder) at (0,0) {来自编码器的信息};
+\node (aa)[decoder,anchor=east] at ([xshift=5.5cm]encoder.east) {};
 \node (y1y2b)[rectangle,anchor=south,inner sep=0.25em,densely dashed,draw] at ([yshift=-2.6em]aa.south) {$y_1\;y_2$};
-\node (label)[anchor=south] at ([yshift=-2.1em]y1y2b.south) {\small{(b) 半自回归解码}};
+\node (label)[anchor=south] at ([xshift=-4em,yshift=-2.1em]y1y2b.south) {\small{(b) 半自回归解码}};
 \node (sos)[anchor=east] at ([xshift=-4.55em]y1y2b.east) {\small{<sos>}};
 \node (y3y4b)[rectangle,anchor=west,inner sep=0.25em,densely dashed,draw] at ([xshift=4.7em]y1y2b.west) {$y_3\;y_4$};
@@ -50,7 +53,9 @@
 \draw [->,very thick,dotted] ([xshift=-0em]y1y2a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0em]y1y2b.west);
 \draw [->,very thick,dotted] ([xshift=-0em]y3y4a.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0em]y3y4b.west);
-\node (autodecoder)[decoder] at (0,0) {半自回归编码器};
+\node (autodecoder)[decoder,anchor=east] at ([xshift=5.5cm]encoder.east)  {半自回归解码器};
+%\node (encoder)[decoder,anchor=west,fill=red!20] at ([xshift=-2cm]autodecoder.west) {编码器};
+\draw [->,thick](encoder.east) to (autodecoder.west); 
 \draw [->,thick]([yshift=0.05em]sos.north) to ([yshift=1.38em]sos.north);
 \draw [->,thick]([yshift=0em]y1y2b.north) to ([yshift=1.38em]y1y2b.north);
@@ -62,22 +67,19 @@
 \end{scope}
 \begin{scope}[yshift=-3.1in]
-\node (aa) [decoder]at (0,0) {非自回归模型};
+\node (encoder) at (0,0) {来自编码器的信息};
-\node (y2b)[anchor=south] at ([xshift=-1.5em,yshift=-2.5em]aa.south) {$y_2$};
+\node (aa)[decoder,anchor=east] at ([xshift=5.5cm]encoder.east) {非自回归解码器};
-\node (label)[anchor=south] at ([yshift=-4.3em]aa.south) {\small{(c) 非自回归解码}};
+%\node (encoder)[decoder,anchor=west,fill=red!20] at ([xshift=-2cm]aa.west) {编码器};
-\node (y1b)[anchor=east] at ([xshift=-3em]y2b.east) {$y_1$};
+\draw [->,thick](encoder.east) to (aa.west); 
-\node (y3b)[anchor=west] at ([xshift=3em]y2b.west) {$y_3$};
-\node (y4b)[anchor=west] at ([xshift=6em]y2b.west) {$y_4$};
+\node (label)[anchor=south] at ([xshift=-4em,yshift=-4.3em]aa.south) {\small{(c) 非自回归解码}};
 \node (y2a)[anchor=north] at ([xshift=-1.5em,yshift=2.5em]aa.north) {$y_2$};
 \node (y1a)[anchor=east] at ([xshift=-3em]y2a.east) {$y_1$};
 \node (y3a)[anchor=west] at ([xshift=3em]y2a.west) {$y_3$};
 \node (y4a)[anchor=west] at ([xshift=6em]y2a.west) {$y_4$};
-\draw [->,thick]([yshift=0em]y1b.north) to ([yshift=1.15em]y1b.north); 
-\draw [->,thick]([yshift=0em]y2b.north) to ([yshift=1.15em]y2b.north);
-\draw [->,thick]([yshift=0em]y3b.north) to ([yshift=1.15em]y3b.north);
-\draw [->,thick]([yshift=0em]y4b.north) to ([yshift=1.15em]y4b.north);
 \draw [->,thick]([yshift=-1.2em]y1a.south) to (y1a.south); 
 \draw [->,thick]([yshift=-1.2em]y2a.south) to (y2a.south);

--- a/Chapter14/Figures/figure-batch-time-mem.tex
+++ b/Chapter14/Figures/figure-batch-time-mem.tex
@@ -3,7 +3,6 @@
 \tikzstyle{snode} = [draw,inner sep=1pt,minimum width=3em,minimum height=0.5em,rounded corners=1pt,fill=green!20!white]
 \tikzstyle{pnode} = [draw,inner sep=1pt,minimum width=1em,minimum height=0.5em,rounded corners=1pt]
-\node [anchor=west] (des) at (1.5,3) {\normalsize\bfnew{$\bm{m}$：显存\quad$\bm{t}$：时间\quad$\bm{m_1>m_2}$\quad$\bm{t_1>t_2}$}};
 \node [anchor=west,snode] (s1) at (0,0) {\tiny{}};
 \node [anchor=north west,snode,minimum width=6.3em] (s2) at ([yshift=-0.3em]s1.south west) {\tiny{}};
 \node [anchor=north west,snode,minimum width=2em] (s3) at ([yshift=-0.3em]s2.south west) {\tiny{}};
@@ -12,6 +11,9 @@
 \node [anchor=north west,snode,minimum width=3em] (s6) at ([yshift=-0.3em]s5.south west) {\tiny{}};
 \node [anchor=east] (label1) at ([xshift=-0.8em,yshift=-2em]s1.west) {{句子:}};
+\node [anchor=east,draw,dashed,minimum height=7.5cm,minimum width=7.3cm,thick] (box) at ([xshift=10.9cm]label1.east) {};
+%\node [anchor=north] (label6) at ([xshift=3em,yshift=7em]label1.north) {{$m$：显存}};
+%\node [anchor=north] (label7) at ([xshift=3.3em,yshift=5.5em]label1.north) {{$t$：延迟}};
 \node [anchor=west,pnode,minimum width=3em] (p1) at ([xshift=0.3em]s1.east) {\tiny{}};
 \node [anchor=west,pnode,minimum width=4em] (p3) at ([xshift=0.3em]s3.east) {\tiny{}};
 \node [anchor=west,pnode,minimum width=0.5em] (p4) at ([xshift=0.3em]s4.east) {\tiny{}};
@@ -19,17 +21,20 @@
 \node [anchor=west,pnode,minimum width=3em] (p6) at ([xshift=0.3em]s6.east) {\tiny{}};
 \node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (s1) (s6) (p1) (p6)] (box0) {};
-\node[rectangle,inner sep=0.5em,rounded corners=1pt,draw,fill=blue!20] (model) at ([xshift=3.5em]box0.east){{模型}};
+\node[anchor=east] (model) at ([xshift=2em]box0.east){{}};
 % big batch
-\node [anchor=west,snode] (sbi1) at ([xshift=3em,yshift=6em]model.east) {\tiny{}};
+\node [anchor=west,snode] (sbi1) at ([xshift=3.5em,yshift=6.7em]model.east) {\tiny{}};
 \node [anchor=north west,snode,minimum width=6.3em] (sbi2) at ([yshift=-0.3em]sbi1.south west) {\tiny{}};
 \node [anchor=north west,snode,minimum width=2em] (sbi3) at ([yshift=-0.3em]sbi2.south west) {\tiny{}};
 \node [anchor=north west,snode,minimum width=5.5em] (sbi4) at ([yshift=-0.3em]sbi3.south west) {\tiny{}};
 \node [anchor=north west,snode,minimum width=5.8em] (sbi5) at ([yshift=-0.3em]sbi4.south west) {\tiny{}};
 \node [anchor=north west,snode,minimum width=3em] (sbi6) at ([yshift=-0.3em]sbi5.south west) {\tiny{}};
+\node [anchor=south] (label2) at ([xshift=0.3em,yshift=-3em]sbi5.south) {\footnotesize{批次1}};
-\node [anchor=east] (label1) at ([xshift=-0.8em,yshift=-1em]sbi1.west) {{大批次}};
+\node [anchor=west] (label12) at ([xshift=-2.3em,yshift=-0.2em]sbi3.west) {{批}};
+\node [anchor=north] (label11) at ([yshift=1.1em]label12.north) {{大}};
+\node [anchor=south] (label13) at ([yshift=-1.1em]label12.south) {{次}};
 \node [anchor=west,pnode,minimum width=3em] (pbi1) at ([xshift=0.3em]sbi1.east) {\tiny{}};
 \node [anchor=west,pnode,minimum width=4em] (pbi3) at ([xshift=0.3em]sbi3.east) {\tiny{}};
 \node [anchor=west,pnode,minimum width=0.5em] (pbi4) at ([xshift=0.3em]sbi4.east) {\tiny{}};
@@ -39,11 +44,15 @@
 \node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (sbi1) (sbi6) (pbi1) (pbi6)] (box1) {};
 % small batch
-\node [anchor=west,snode,minimum width=5.5em] (sma1) at ([xshift=3em,yshift=-3em]model.east) {\tiny{}};
+\node [anchor=west,snode,minimum width=5.5em] (sma1) at ([xshift=3.5em,yshift=-3.7em]model.east) {\tiny{}};
 \node [anchor=north west,snode,minimum width=5.8em] (sma2) at ([yshift=-0.3em]sma1.south west) {\tiny{}};
 \node [anchor=north west,snode,minimum width=6.3em] (sma3) at ([yshift=-0.3em]sma2.south west) {\tiny{}};
+\node [anchor=south] (label3) at ([xshift=0.3em,yshift=-3em]sma2.south) {\footnotesize{批次1}};
+\node [anchor=south] (label5) at ([xshift=2.5em,yshift=-1.8em]label3.south) {{Transformer模型处理}};
-\node [anchor=east] (label1) at ([xshift=-0.8em,yshift=-2em]sma1.west) {{小批次}};
+\node [anchor=west] (label22) at ([xshift=-2.3em]sma2.west) {{批}};
+\node [anchor=north] (label21) at ([yshift=1.1em]label22.north) {{小}};
+\node [anchor=south] (label23) at ([yshift=-1.1em]label22.south) {{次}};
 \node [anchor=west,pnode,minimum width=0.5em] (pma1) at ([xshift=0.3em]sma1.east) {\tiny{}};
 \node [anchor=west,pnode,minimum width=0.2em] (pma2) at ([xshift=0.3em]sma2.east) {\tiny{}};
@@ -53,15 +62,18 @@
 \node [anchor=west,snode,minimum width=2em] (sma4) at ([xshift=3.5em,yshift=0em]sma1.east) {\tiny{}};
 \node [anchor=north west,snode,minimum width=3em] (sma5) at ([yshift=-0.3em]sma4.south west) {\tiny{}};
 \node [anchor=north west,snode,minimum width=3em] (sma6) at ([yshift=-0.3em]sma5.south west) {\tiny{}};
+\node [anchor=south] (label4) at ([yshift=-3em]sma5.south) {\footnotesize{批次2}};
 \node [anchor=west,pnode,minimum width=0.7em] (pma4) at ([xshift=0.3em]sma4.east) {\tiny{}};
 \node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (sma4) (sma6) (pma4)] (box3) {};
-\draw [->,very thick] (box0.east) -- (model.west);
+\draw [->,very thick] (box0.east) -- ([xshift=0.3em]model.west);
-\draw [->,thick] (model.east) .. controls +(east:0.5) and +(west:0.5) .. ([xshift=-1em]box1.west);
+\draw [->,thick] (model.east) .. controls +(east:0.5) and +(west:0.5) .. ([xshift=-1.8em]box1.west);
-\draw [->,thick] (model.east) .. controls +(east:0.5) and +(west:0.5) .. ([xshift=-1em]box2.west);
+\draw [->,thick] (model.east) .. controls +(east:0.5) and +(west:0.5) .. ([xshift=-1.8em]box2.west);
-\draw [->,very thick] (box2.east) -- (box3.west);
+\draw [-,very thick] ([xshift=0.3em]box2.east) -- ([xshift=-0.3em]box3.west);
+\draw [-,very thick] ([xshift=0.782em,yshift=0.5em]box2.east) -- ([xshift=0.782em,yshift=-0.5em]box2.east);
 %%%%%
 \node [] (t10) at ([yshift=1.5em]box1.north) {$t_1$};

--- a/Chapter14/Figures/figure-beamsize-bleu.tex
+++ b/Chapter14/Figures/figure-beamsize-bleu.tex
@@ -5,19 +5,27 @@
 width=8cm,
 height=5cm,
 yticklabel style={/pgf/number format/.cd,fixed,precision=2},
-xticklabel style={/pgf/number format/.cd,fixed,precision=2},
+xticklabel style={color=white},
-xlabel={\footnotesize{搜索束大小（取log）}},ylabel={\footnotesize{BLEU\ (\%)}},
+xlabel={\footnotesize{搜索束大小（取$\log$）}},ylabel={\footnotesize{BLEU\ （\%）}},
 ymin=28.8,ymax=30.4,
 xmin=0,xmax=7,
-xtick={0,1,2,3,4,5,6,7},
+xtick={0,1,2.32,3.32,4.91,6.64},
 ytick={28.8,29.0,29.2,29.4,29.6,29.8,30.0,30.2,30.4},
 xticklabels={0,1,2,3,4,5,6,7},
 yticklabels={28.8,29.0,29.2,29.4,29.6,29.8,30.0,30.2,30.4},
 legend style={yshift=-5em,xshift=0em,legend cell align=left,legend plot pos=right}
 ]
-\addplot[purple,mark=square,mark=star,very thick] coordinates {(0,29.3) (1,29.7) (1.58,30.05) (2.32,30.1) (2.73,30.2) (3.32,30.3) (3.84,30.2) (4.23,30.08) (4.91,29.98) (5.81,29.6)(6.64,28.8) };
+\addplot[purple,mark=square,mark=star,very thick] coordinates {(0,29.3) (1,29.7) (1.58,30.05) (2.32,30.1) (3,30.2) (3.32,30.3) (3.9,30.2) (4.32,30.08) (4.91,29.98) (5.91,29.6)(6.64,28.8) };
 \end{axis}
+\node[inner sep=0pt] at (0,-1em) {$\log$1};
+\node[inner sep=0pt] at (1,-1em) {$\log$2};
+%\node[inner sep=0pt] at (1.58,-1em) {$\log$3};
+\node[inner sep=0pt] at (2.15,-1em) {$\log$5};
+\node[inner sep=0pt] at (3.05,-1em) {$\log$10};
+\node[inner sep=0pt] at (4.45,-1em) {$\log$30};
+\node[inner sep=0pt] at (6,-1em) {$\log$100};
 }
 \end{tikzpicture}

--- a/Chapter14/Figures/figure-different-integration-model.tex
+++ b/Chapter14/Figures/figure-different-integration-model.tex
@@ -6,29 +6,29 @@
    \subfigure[\small{假设选择}]
    {
        \begin{tikzpicture}[scale=0.5]
-            \tikzstyle{system} = [rectangle,very thick,minimum width=1cm,font=\tiny];
+            \tikzstyle{system} = [rectangle,thick,minimum width=1.2cm,font=\scriptsize];
-            \tikzstyle{output} = [rectangle,very thick,rounded corners=3pt,minimum width=1cm,align=center,font=\tiny];
+            \tikzstyle{output} = [rectangle,thick,rounded corners=3pt,minimum width=1.2cm,align=center,font=\scriptsize];
            \begin{scope}
-                \node [system,draw=orange!70,text=orange] (model3) at (0,0) {模型 $3$};
+                \node [system,fill=orange!20,draw] (model3) at (0,0) {模型 $3$};
-                \node [system,draw=ugreen!70,text=ugreen,anchor=south] (model2) at ([yshift=0.3cm]model3.north) {模型 $2$};
+                \node [system,fill=ugreen!20,draw,anchor=south] (model2) at ([yshift=0.5cm]model3.north) {模型 $2$};
-                \node [system,draw=red!70,text=red,anchor=south] (model1) at ([yshift=0.3cm]model2.north) {模型 $1$};
+                \node [system,fill=red!20,draw,anchor=south] (model1) at ([yshift=0.5cm]model2.north) {模型 $1$};
-                \node [output,draw=orange!70,text=orange,anchor=west] (output3) at ([xshift=0.5cm]model3.east) {输出 $3$};
+                \node [output,fill=orange!20,draw,anchor=west] (output3) at ([xshift=0.8cm]model3.east) {输出 $3$};
-                \node [output,draw=ugreen!70,text=ugreen,anchor=west] (output2) at ([xshift=0.5cm]model2.east) {输出 $2$};
+                \node [output,fill=ugreen!20,draw,anchor=west] (output2) at ([xshift=0.8cm]model2.east) {输出 $2$};
-                \node [output,draw=red!70,text=red,anchor=west] (output1) at ([xshift=0.5cm]model1.east) {输出 $1$};
+                \node [output,fill=red!20,draw,anchor=west] (output1) at ([xshift=0.8cm]model1.east) {输出 $1$};
                \begin{pgfonlayer}{background}
-                    \node [draw,thick,dashed,rounded corners=3pt,inner sep=2pt,fit=(output1) (output2) (output3)] (output) {};
+                    \node [draw,thick,dashed,rounded corners=3pt,inner sep=3pt,fit=(output1) (output2) (output3)] (output) {};
                \end{pgfonlayer}
-                \node [output,draw=cocoabrown!70,text=cocoabrown,minimum width=1cm,right=1cm of output] (final) {最终\\输出};
+                \node [output,fill=cocoabrown!20,draw,minimum width=1.2cm,right=1cm of output] (final) {最终\\输出};
                \draw [->,very thick] (model1) to (output1);
                \draw [->,very thick] (model2) to (output2);
                \draw [->,very thick] (model3) to (output3);
-                \draw [->,very thick] (output) to node [above,pos=0.5,font=\tiny] {选择} (final);
+                \draw [->,very thick] (output) to node [above,pos=0.5,font=\scriptsize] {选择} (final);
            \end{scope}
        \end{tikzpicture}
    }
@@ -36,23 +36,23 @@
    \subfigure[\small{预测融合}]
    {
        \begin{tikzpicture}[scale=0.5]
-            \tikzstyle{system} = [rectangle,very thick,minimum width=1cm,font=\tiny];
+            \tikzstyle{system} = [rectangle,thick,minimum width=1.2cm,font=\scriptsize];
-            \tikzstyle{output} = [rectangle,very thick,rounded corners=3pt,minimum width=1cm,align=center,font=\tiny];
+            \tikzstyle{output} = [rectangle,thick,rounded corners=3pt,minimum width=1.2cm,align=center,font=\scriptsize];
            \begin{scope}
-                \node [system,draw=orange!70,text=orange] (model3) at (0,0) {模型 $3$};
+                \node [system,fill=orange!20,draw] (model3) at (0,0) {模型 $3$};
-                \node [system,draw=ugreen!70,text=ugreen,anchor=south] (model2) at ([yshift=0.3cm]model3.north) {模型 $2$};
+                \node [system,fill=ugreen!20,draw,anchor=south] (model2) at ([yshift=0.5cm]model3.north) {模型 $2$};
-                \node [system,draw=red!70,text=red,anchor=south] (model1) at ([yshift=0.3cm]model2.north) {模型 $1$};
+                \node [system,fill=red!20,draw,anchor=south] (model1) at ([yshift=0.5cm]model2.north) {模型 $1$};
                \begin{pgfonlayer}{background}
-                    \node [draw,thick,dashed,inner sep=2pt,fit=(model3) (model2) (model1)] (ensemble) {};
+                    \node [draw,thick,dashed,inner sep=3pt,fit=(model3) (model2) (model1)] (ensemble) {};
                \end{pgfonlayer}
-                \node [system,draw=ugreen!70,text=ugreen,right=1cm of ensemble] (model) {模型};
+                \node [system,fill=ugreen!20,draw,right=1cm of ensemble] (model) {模型};
-                \node [output,draw=cocoabrown!70,text=cocoabrown,minimum width=1cm,anchor=west] (final) at ([xshift=0.5cm]model.east) {最终\\输出};
+                \node [output,fill=cocoabrown!20,draw,minimum width=1.2cm,anchor=west] (final) at ([xshift=0.8cm]model.east) {最终\\输出};
-                \draw [->,very thick] (ensemble) to node [above,pos=0.5,font=\tiny] {融合} (model);
+                \draw [->,very thick] (ensemble) to node [above,pos=0.5,font=\scriptsize] {融合} (model);
                \draw [->,very thick] (model) to (final);
            \end{scope}
@@ -63,25 +63,25 @@
    \subfigure[\small{译文重组}]
    {
        \begin{tikzpicture}[scale=0.5]
-            \tikzstyle{system} = [rectangle,very thick,minimum width=1cm,font=\tiny];
+            \tikzstyle{system} = [rectangle,thick,minimum width=1.2cm,font=\scriptsize];
-            \tikzstyle{output} = [rectangle,very thick,rounded corners=3pt,minimum width=1cm,align=center,font=\tiny];
+            \tikzstyle{output} = [rectangle,thick,rounded corners=3pt,minimum width=1.2cm,align=center,font=\scriptsize];
            \tikzstyle{dot} = [circle,fill=blue!40!white,minimum size=5pt,inner sep=0pt];
            \begin{scope}
-                \node [system,draw=orange!70,text=orange] (model3) at (0,0) {模型 $3$};
+                \node [system,fill=orange!20,draw] (model3) at (0,0) {模型 $3$};
-                \node [system,draw=ugreen!70,text=ugreen,anchor=south] (model2) at ([yshift=0.3cm]model3.north) {模型 $2$};
+                \node [system,fill=ugreen!20,draw,anchor=south] (model2) at ([yshift=0.5cm]model3.north) {模型 $2$};
-                \node [system,draw=red!70,text=red,anchor=south] (model1) at ([yshift=0.3cm]model2.north) {模型 $1$};
+                \node [system,fill=red!20,draw,anchor=south] (model1) at ([yshift=0.5cm]model2.north) {模型 $1$};
-                \node [output,draw=orange!70,text=orange,anchor=west] (output3) at ([xshift=0.5cm]model3.east) {输出 $3$};
+                \node [output,fill=orange!20,draw,anchor=west] (output3) at ([xshift=0.8cm]model3.east) {输出 $3$};
-                \node [output,draw=ugreen!70,text=ugreen,anchor=west] (output2) at ([xshift=0.5cm]model2.east) {输出 $2$};
+                \node [output,fill=ugreen!20,draw,anchor=west] (output2) at ([xshift=0.8cm]model2.east) {输出 $2$};
-                \node [output,draw=red!70,text=red,anchor=west] (output1) at ([xshift=0.5cm]model1.east) {输出 $1$};
+                \node [output,fill=red!20,draw,anchor=west] (output1) at ([xshift=0.8cm]model1.east) {输出 $1$};
                \draw [->,very thick] (model1) to (output1);
                \draw [->,very thick] (model2) to (output2);
                \draw [->,very thick] (model3) to (output3);
                \begin{pgfonlayer}{background}
-                    \node [draw,thick,dashed,rounded corners=3pt,inner sep=2pt,fit=(output1) (output2) (output3)] (output) {};
+                    \node [draw,thick,dashed,rounded corners=3pt,inner sep=3pt,fit=(output1) (output2) (output3)] (output) {};
                \end{pgfonlayer}
                \node [dot,anchor=west] (lattice1) at ([shift={(1.5cm,0.5cm)}]output2.east) {};
@@ -98,14 +98,14 @@
                \draw [-latex,blue] (lattice5) to [out=-60,in=-90] (lattice3);
                \begin{pgfonlayer}{background}
-                    \node [draw=blue,fill=white,drop shadow,thick,rounded corners=3pt,inner sep=5pt,fit=(lattice1) (lattice2) (lattice3) (lattice4) (lattice5),label={[font=\tiny,label distance=0pt]90:词格}] (lattice) {};
+                    \node [draw=blue,fill=white,drop shadow,thick,rounded corners=3pt,inner sep=5pt,fit=(lattice1) (lattice2) (lattice3) (lattice4) (lattice5),label={[font=\scriptsize,label distance=0pt]90:词格}] (lattice) {};
                \end{pgfonlayer}
                \draw [->,very thick] (output) to (lattice);
-                \node [system,draw=purple,text=purple,anchor=west] (model) at ([xshift=5.3cm]output1.east) {模型};
+                \node [system,fill=purple!20,draw,anchor=west] (model) at ([xshift=5.3cm]output1.east) {模型};
-                \node [output,draw=cocoabrown!70,text=cocoabrown,minimum width=1cm,right=1.3cm of lattice] (final) {最终输出};
+                \node [output,fill=cocoabrown!20,draw,minimum width=1.2cm,right=1.5cm of lattice] (final) {最终输出};
                \draw [->,very thick] (model) |- (final);
                \draw [->,very thick] (lattice) -- (final);

--- a/Chapter14/Figures/figure-hypothesis-generation.tex
+++ b/Chapter14/Figures/figure-hypothesis-generation.tex
 \definecolor{cocoabrown}{rgb}{0.82, 0.41, 0.12}
 \begin{tikzpicture}
-\tikzstyle{system} = [rectangle,very thick,minimum width=1.5cm,font=\scriptsize];
+\tikzstyle{system} = [rectangle,thick,minimum width=1.2cm,minimum height=0.6cm,font=\scriptsize];
-\tikzstyle{output} = [rectangle,very thick,rounded corners=3pt,minimum width=1.5cm,align=center,font=\scriptsize];
+\tikzstyle{output} = [rectangle,thick,rounded corners=3pt,minimum width=1.2cm,align=center,font=\scriptsize];
 \begin{scope}[local bounding box=MULTIPLE]
-    \node [system,draw=orange!70,text=orange] (engine3) at (0,0) {系统 $n$};
+    \node [system,fill=orange!20,draw] (engine3) at (0,0) {系统 $n$};
-    \node [system,draw=ugreen!70,text=ugreen,anchor=south] (engine2) at ([yshift=0.6cm]engine3.north) {系统 $2$};
+    \node [system,fill=ugreen!20,draw,anchor=south] (engine2) at ([yshift=0.6cm]engine3.north) {系统 $2$};
-    \node [system,draw=red!70,text=red,anchor=south] (engine1) at ([yshift=0.3cm]engine2.north) {系统 $1$};
+    \node [system,fill=red!20,draw,anchor=south] (engine1) at ([yshift=0.3cm]engine2.north) {系统 $1$};
-    \node [output,draw=orange!70,text=orange,anchor=west] (output3) at ([xshift=0.5cm]engine3.east) {输出 $n$};
+    \node [output,fill=orange!20,draw,anchor=west] (output3) at ([xshift=0.5cm]engine3.east) {输出 $n$};
-    \node [output,draw=ugreen!70,text=ugreen,anchor=west] (output2) at ([xshift=0.5cm]engine2.east) {输出 $2$};
+    \node [output,fill=ugreen!20,draw,anchor=west] (output2) at ([xshift=0.5cm]engine2.east) {输出 $2$};
-    \node [output,draw=red!70,text=red,anchor=west] (output1) at ([xshift=0.5cm]engine1.east) {输出 $1$};
+    \node [output,fill=red!20,draw,anchor=west] (output1) at ([xshift=0.5cm]engine1.east) {输出 $1$};
    \draw [very thick,decorate,decoration={brace}] ([xshift=3pt]output1.north east) to node [midway,name=final] {} ([xshift=3pt]output3.south east);
-    \node [output,draw=cocoabrown!70,text=cocoabrown,minimum width=1cm,right=0pt of final,minimum height=2.5em] () {最终\\输出};
+    \node [output,fill=cocoabrown!20,draw,minimum width=1cm,right=3pt of final,minimum height=2.5em] () {最终\\输出};
    \draw [->,very thick] (engine1) to (output1);
    \draw [->,very thick] (engine2) to (output2);
@@ -25,15 +25,15 @@
 \end{scope}
 \begin{scope}[local bounding box=SINGLE]
-    \node [output,draw=ugreen!70,text=ugreen,anchor=west] (output3) at ([xshift=4cm]output3.east) {输出 $n$};
+    \node [output,fill=ugreen!20,draw,anchor=west] (output3) at ([xshift=4cm]output3.east) {输出 $n$};
-    \node [output,draw=ugreen!70,text=ugreen,anchor=west] (output2) at ([xshift=4cm]output2.east) {输出 $2$};
+    \node [output,fill=ugreen!20,draw,anchor=west] (output2) at ([xshift=4cm]output2.east) {输出 $2$};
-    \node [output,draw=ugreen!70,text=ugreen,anchor=west] (output1) at ([xshift=4cm]output1.east) {输出 $1$};
+    \node [output,fill=ugreen!20,draw,anchor=west] (output1) at ([xshift=4cm]output1.east) {输出 $1$};
-    \node [system,draw=ugreen!70,text=ugreen,anchor=east,align=center,inner sep=1.9pt] (engine) at ([xshift=-0.5cm]output2.west) {单系统};
+    \node [system,fill=ugreen!20,draw,anchor=east,align=center,inner sep=1.9pt] (engine) at ([xshift=-0.5cm]output2.west) {单系统};
    \draw [very thick,decorate,decoration={brace}] ([xshift=3pt]output1.north east) to node [midway,name=final] {} ([xshift=3pt]output3.south east);
-    \node [output,draw=cocoabrown!70,text=cocoabrown,minimum width=1cm,right=0pt of final,minimum height=2.5em] () {最终\\输出};
+    \node [output,fill=cocoabrown!20,draw,minimum width=1cm,right=3pt of final,minimum height=2.5em] () {最终\\输出};
    \draw [->,very thick] (engine.east) to (output1.west);
    \draw [->,very thick] (engine.east) to (output2.west);

--- a/Chapter14/Figures/figure-iteration.tex
+++ b/Chapter14/Figures/figure-iteration.tex
@@ -5,37 +5,44 @@
 \tikzstyle{er} = [rectangle,minimum width=2.5cm,minimum height=1.5cm,rounded corners,text centered,draw,drop shadow]
 \begin{tikzpicture}[node distance = 0,scale = 0.75]
 \tikzstyle{every node}=[scale=0.75]
-\node (encoder)[er,thick,draw,fill=ugreen!20]{\Large{编码器}};
+\node (encoder)[er,thick,draw,fill=red!20,minimum width=2.8cm]{\Large{编码器}};
-\node (decoder_1)[er,thick,draw,right of=encoder,xshift=4cm,fill=red!20]{\Large{解码器}};
+\node (lenpre)[er,anchor=north,thick,draw,fill=yellow!20,minimum height=0.8cm] at ([yshift=1.5cm]encoder.north){\Large{长度预测器}};
-\node (decoder_2)[er,thick,draw,right of=decoder_1,xshift=4cm,fill=red!20]{\Large{解码器}};
+\node (decoder_1)[er,thick,draw,right of=encoder,xshift=5cm,fill=blue!20]{\Large{解码器}};
-\node (point)[right of=decoder_2,xshift=2.5cm,]{\LARGE{...}};
+\node (decoder_2)[er,thick,draw,right of=decoder_1,xshift=3.7cm,fill=blue!20]{\Large{解码器}};
-\node (decoder_3)[er,thick,draw,right of=point,xshift=2.5cm,fill=red!20]{\Large{解码器}};
+\node (point)[right of=decoder_2,xshift=3cm,]{\LARGE{...}};
-\draw [->,very thick,draw=black!70]([xshift=0.2cm]encoder.east) --  ([xshift=-0.2cm]decoder_1.west);
+\node (decoder_3)[er,thick,draw,right of=point,xshift=3cm,fill=blue!20]{\Large{解码器}};
-%\draw [->,very thick,draw=black!70]([xshift=0.2cm]decoder_1.east) --  ([xshift=-0.2cm]decoder_2.west);
+\draw [->,very thick,draw=black!70]([xshift=0cm]encoder.east) --  ([xshift=-0cm]decoder_1.west);
-%\draw [->,very thick,draw=black!70]([xshift=0.2cm]decoder_2.east) --  ([xshift=-0.1cm]point.west);
-%\draw [->,very thick,draw=black!70]([xshift=0.1cm]point.east) --  ([xshift=-0.2cm]decoder_3.west);
+\draw [->,very thick,draw=black!70]([xshift=0,yshift=-1cm]encoder.south) --  ([xshift=0]encoder.south);
+\draw [->,very thick,draw=black!70](encoder.north) --  (lenpre.south);
-\draw [->,very thick,draw=black!70]([xshift=0,yshift=-1cm]encoder.south) --  ([xshift=0,yshift=-0.2cm]encoder.south);
-\draw [->,very thick,draw=black!70]([xshift=0,yshift=0.2cm]encoder.north) --  ([xshift=0,yshift=1cm]encoder.north);
-\node [below of = encoder,xshift=0cm,yshift=2.2cm]{预测目标长度};
 \node [below of = encoder,xshift=0cm,yshift=-2.2cm]{\Large$\seq{x}$};
-\draw [->,very thick,draw=black!70]([xshift=0,yshift=-1cm]decoder_1.south) --  ([xshift=0,yshift=-0.2cm]decoder_1.south);
+\draw [->,very thick,draw=black!70]([xshift=0,yshift=-1cm]decoder_1.south) --  ([xshift=0]decoder_1.south);
-\draw [->,very thick,draw=black!70]([xshift=0,yshift=0.2cm]decoder_1.north) --  ([xshift=0,yshift=1cm]decoder_1.north);
+\draw [->,very thick,draw=black!70]([xshift=0]decoder_1.north) --  ([xshift=0,yshift=1cm]decoder_1.north);
-\node [below of = decoder_1,xshift=0cm,yshift=-2.2cm]{\Large$\seq{x'}$};
+\node (d1x) [below of = decoder_1,xshift=0cm,yshift=-2.2cm]{\Large$\seq{x'}$};
+\draw [-,very thick,draw=black!70] (lenpre.east) --([xshift=1.26cm]lenpre.east);
+\draw [-,very thick,draw=black!70,dashed] ([xshift=1.26cm]lenpre.east) -- ([xshift=-2cm]d1x.west);
+\draw [->,very thick,draw=black!70] ([xshift=-2cm]d1x.west) -- ([xshift=0cm]d1x.west);
 \node (line1_1)[below of = decoder_1,xshift=0cm,yshift=2.2cm]{\Large$\seq{y}^{[1]}$};
-\draw [->,thick,]([xshift=0,yshift=-1cm]decoder_2.south) --  ([xshift=0,yshift=-0.2cm]decoder_2.south);
+\draw [->,thick,]([xshift=0,yshift=-1cm]decoder_2.south) --  ([xshift=0]decoder_2.south);
-\draw [->,very thick,draw=black!70]([xshift=0,yshift=0.2cm]decoder_2.north) --  ([xshift=0,yshift=1cm]decoder_2.north);
+\draw [->,very thick,draw=black!70]([xshift=0]decoder_2.north) --  ([xshift=0,yshift=1cm]decoder_2.north);
 \node (line1_2)[below of = decoder_2,xshift=0cm,yshift=-2.2cm]{\Large$\seq{y}^{[1]}$};
-\node [below of = decoder_2,xshift=0cm,yshift=2.2cm]{\Large$\seq{y}^{[2]}$};
+\node (line2_1)[below of = decoder_2,xshift=0cm,yshift=2.2cm]{\Large$\seq{y}^{[2]}$};
+\node (line2_2)[below of = point,xshift=0cm,yshift=-2.2cm]{};
+\node (line3_1)[below of = point,xshift=0cm,yshift=2.2cm]{};
-\draw [->,very thick,draw=black!70]([xshift=0,yshift=-1cm]decoder_3.south) --  ([xshift=0,yshift=-0.2cm]decoder_3.south);
+\draw [->,very thick,draw=black!70]([xshift=0,yshift=-1cm]decoder_3.south) --  ([xshift=0]decoder_3.south);
-\draw [->,very thick,draw=black!70]([xshift=0,yshift=0.2cm]decoder_3.north) --  ([xshift=0,yshift=1cm]decoder_3.north);
+\draw [->,very thick,draw=black!70]([xshift=0]decoder_3.north) --  ([xshift=0,yshift=1cm]decoder_3.north);
 \node (line3_2)[below of = decoder_3,xshift=0cm,yshift=-2.2cm]{\Large$\seq{y}^{[N-1]}$};
 \node [below of = decoder_3,xshift=0cm,yshift=2.2cm]{\Large$\seq{y}^{[N]}$};
 \draw[->,very thick,draw=black!70, out=0, in=180,dotted] (line1_1.east) to (line1_2.west);
-\draw[->,very thick,draw=black!70, out=0, in=180,dotted] ([xshift=4cm]line1_1.east) to ([xshift=3cm]line1_2.west);
+\draw[->,very thick,draw=black!70, out=0, in=180,dotted] (line2_1.east) to (line2_2.west);
-\draw[->,very thick,draw=black!70, out=0, in=180,dotted] ([xshift=6cm]line1_1.east) to (line3_2.west);
+\draw[->,very thick,draw=black!70, out=0, in=180,dotted] (line3_1.east) to (line3_2.west);
+\draw [->,very thick,draw=black!70] ([xshift=0.5cm]encoder.east) -- ([xshift=0.5cm,yshift=-2.8cm]encoder.east) --([xshift=5.55cm,yshift=-2.8cm]encoder.east) --([xshift=-0.5cm]decoder_2.west) -- (decoder_2.west);
+\draw [->,very thick,draw=black!70] ([xshift=5.55cm,yshift=-2.8cm]encoder.east) -- ([xshift=9.45cm,yshift=-2.8cm]encoder.east) --([xshift=-0.5cm]point.west) -- (point.west);
+\draw [->,very thick,draw=black!70] ([xshift=9.45cm,yshift=-2.8cm]encoder.east) -- ([xshift=11.55cm,yshift=-2.8cm]encoder.east) -- ([xshift=-0.5cm]decoder_3.west) -- (decoder_3.west);
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter14/Figures/figure-mask-predict.tex
+++ b/Chapter14/Figures/figure-mask-predict.tex
@@ -2,37 +2,33 @@
 \tikzstyle{er} = [rectangle,minimum width=7cm,minimum height=2.5cm,text centered,draw,drop shadow,rounded corners]
 \begin{tikzpicture}[node distance = 0,scale = 0.55]
 \tikzstyle{every node}=[scale=0.55]
-\node (encoder)[er,thick,minimum width=5.5cm,fill=ugreen!20]{\huge{编码器}};
+\node (encoder)[er,thick,minimum width=5.5cm,fill=red!20]{\huge{编码器}};
-\node (decoder)[er,thick,right of=encoder,xshift=7.75cm,fill=red!20]{\huge{解码器}};
+\node (decoder)[er,thick,right of=encoder,xshift=8.75cm,fill=blue!20]{\huge{解码器}};
-\node (decoder_1)[er,thick,right of=decoder,xshift=8.75cm,fill=red!20]{\huge{解码器}};
+\node (decoder_1)[er,thick,right of=decoder,xshift=8.75cm,fill=blue!20]{\huge{解码器}};
-\draw [->,very thick,draw=blue!70]([xshift=0.2cm]encoder.east) --  ([xshift=-0.2cm]decoder.west);
+\draw [->,very thick,draw=black!70]([xshift=0cm]encoder.east) --  ([xshift=-0cm]decoder.west);
-\begin{pgfonlayer}{background}
-\draw [->,very thick,draw=blue!70]([xshift=0.2cm,yshift=-0.8em]encoder.east) --  ([xshift=-0.2cm,yshift=-0.8em]decoder_1.west);
-\end{pgfonlayer}
 \foreach \x in {-2.2cm,-1.1cm,...,2.2cm}
 \draw [->,very thick,draw=black!70]([xshift=\x,yshift=-1cm]encoder.south) --  ([xshift=\x,yshift=-0.2cm]encoder.south);
-\node [below of = encoder,xshift=-2.3cm,yshift=-2.95cm,scale=1.2]{\large{<LEN>}};
+\node [below of = encoder,xshift=-2.3cm,yshift=-2.92cm,scale=1.2]{\small{<LEN>}};
-\node [below of = encoder,xshift=-1cm,yshift=-2.9cm,scale=1.2]{\large{hello}};
+\node [below of = encoder,xshift=-1cm,yshift=-2.9cm,scale=1.2]{\large{Hello}};
 \node [below of = encoder,xshift=0cm,yshift=-3.05cm,scale=1.2]{,};
 \node [below of = encoder,xshift=1.1cm,yshift=-2.9cm,scale=1.2]{\large{world}};
 \node [below of = encoder,xshift=2.2cm,yshift=-2.9cm,scale=1.2]{!};
-\draw [->,very thick,draw=black!70]([xshift=-2.2cm,yshift=0.2cm]encoder.north) --  ([xshift=-2.2cm,yshift=1cm]encoder.north);
-\node [below of = encoder,xshift=-2.2cm,yshift=2.9cm,scale=1.5]{4};
 \foreach \x in {-2.7cm,-0.9cm,...,2.8cm}
 {\draw [->,very thick,draw=black!70]([xshift=\x,yshift=-1cm]decoder.south) --  ([xshift=\x,yshift=-0.2cm]decoder.south);
 \draw [->,very thick,draw=black!70]([xshift=\x,yshift=0.2cm]decoder.north) --  ([xshift=\x,yshift=1cm]decoder.north);}
-\node [below of = decoder,xshift=-3cm,yshift=-2.9cm,scale=1.6]{\small{<Mask>}};
+\node (mask_1) [below of = decoder,xshift=-3cm,yshift=-2.9cm,scale=1.6]{\small{<Mask>}};
 \node [below of = decoder,xshift=-1cm,yshift=-2.9cm,scale=1.6]{\small{<Mask>}};
 \node [below of = decoder,xshift=1cm,yshift=-2.9cm,scale=1.6]{\small{<Mask>}};
 \node [below of = decoder,xshift=3cm,yshift=-2.9cm,scale=1.6]{\small{<Mask>}};
 \node [below of = decoder,xshift=-3cm,yshift=2.9cm,scale=1.6]{你好};
 \node [below of = decoder,xshift=-1cm,yshift=2.7cm,scale=1.6]{，};
 \node [below of = decoder,xshift=1cm,yshift=2.9cm,scale=1.6]{你好};
-\node [below of = decoder,xshift=2.9cm,yshift=2.9cm,scale=1.6]{！};
+\node (line1)[below of = decoder,xshift=2.6cm,yshift=2.9cm,scale=1.6]{！};
 \foreach \x in {-2.7cm,-0.9cm,...,2.8cm}
@@ -41,12 +37,18 @@
 \node [below of = decoder_1,xshift=-2.7cm,yshift=2.9cm,scale=1.6]{你好};
 \node [below of = decoder_1,xshift=-0.9cm,yshift=2.7cm,scale=1.6]{，};
 \node [below of = decoder_1,xshift=0.9cm,yshift=2.9cm,scale=1.6]{世界};
-\node [below of = decoder_1,xshift=2.7cm,yshift=2.9cm,scale=1.6]{！};
+\node [below of = decoder_1,xshift=2.6cm,yshift=2.8cm,scale=1.6]{！};
-\node [below of = decoder_1,xshift=-2.7cm,yshift=-2.9cm,scale=1.6]{你好};
+\node (line2)[below of = decoder_1,xshift=-2.7cm,yshift=-2.9cm,scale=1.6]{你好};
 \node [below of = decoder_1,xshift=-0.9cm,yshift=-3cm,scale=1.6]{，};
 \node [below of = decoder_1,xshift=0.9cm,yshift=-2.9cm,scale=1.6]{\small{<Mask>}};
-\node [below of = decoder_1,xshift=2.7cm,yshift=-2.9cm,scale=1.6]{！};
+\node [below of = decoder_1,xshift=2.6cm,yshift=-2.8cm,scale=1.6]{！};
+\draw [-,very thick,draw=black!70]([xshift=-2.2cm]encoder.north) --  ([xshift=-2.2cm,yshift=0.5cm]encoder.north)--  ([xshift=4.1cm,yshift=0.5cm]encoder.north);
+\draw [-,very thick,draw=black!70,dashed]([xshift=4.1cm,yshift=0.5cm]encoder.north) -- ([xshift=-0.5cm]mask_1.west);
+\draw [->,very thick,draw=black!70]([xshift=-0.5cm]mask_1.west) -- (mask_1.west);
+\draw [->,very thick,draw=black!70]([xshift=0.5cm]encoder.east) -- ([xshift=0.5cm,yshift=-3.5cm]encoder.east) -- ([xshift=10.5cm,yshift=-3.5cm]encoder.east) -- ([xshift=-0.72cm]decoder_1.west) --  (decoder_1.west);
+\draw [->,very thick,dotted] (line1.east) .. controls +(east:1.2) and +(west:1.2) ..(line2.west);
+\node [below of = encoder,xshift=1.2cm,yshift=2.4cm,scale=1.7]{译文长度：4};
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter14/Figures/figure-multi-modality.tex
+++ b/Chapter14/Figures/figure-multi-modality.tex
@@ -4,23 +4,33 @@
 %%% outline
 %-------------------------------------------------------------------------
 \begin{tikzpicture}
-	\tikzstyle{word} = [font=\scriptsize]
+	\tikzstyle{emb} = [font=\scriptsize,rounded corners=1pt, fill=orange!20, minimum width=1.8em,minimum height=1.5em,draw]
+	\tikzstyle{po} = [font=\scriptsize,rounded corners=1pt, fill=gray!20, minimum width=1.8em,minimum height=1.5em,draw]
 	\tikzstyle{tgt} = [minimum height=1.6em,minimum width=5.2em,fill=black!10!yellow!30,font=\footnotesize,drop shadow={shadow xshift=0.15em,shadow yshift=-0.15em,}]
 	\tikzstyle{p} = [fill=ugreen!15,minimum width=0.4em,inner sep=0pt]
-\node[ rounded corners=3pt, fill=red!20, drop shadow, minimum width=10em,minimum height=4em,draw]  (encoder) at (0,0) {Transformer 编码器    };
+\node[ rounded corners=3pt, fill=red!20, drop shadow, minimum width=12em,minimum height=4em,draw]  (encoder) at (0,0) {编码器};
-\node[anchor=west, rounded corners=3pt, fill=blue!20, drop shadow, minimum width=14em,minimum height=4em,draw] (decoder) at ([xshift=0.8cm]encoder.east) {Transformer 解码器};
+\node[anchor=north,rounded corners=3pt, fill=yellow!20, drop shadow, minimum width=12em,minimum height=2em,draw] (lenpre) at([yshift=3em]encoder.north){长度预测器};
+\node[anchor=north] (lable) at([xshift=3.5em,yshift=2.5em]lenpre.north){译文长度：3};
-\node[anchor=north,word] (en1) at ([yshift=-1.3em,xshift=-3em]encoder.south) {干};
+\node[anchor=west, rounded corners=3pt, fill=blue!20, drop shadow, minimum width=13em,minimum height=4em,draw] (decoder) at ([xshift=1cm]encoder.east) {解码器};
-\node[anchor=north,word] (en2) at ([yshift=-1.3em,xshift=-1em]encoder.south) {得};
-\node[anchor=north,word] (en3) at ([yshift=-1.3em,xshift=1em]encoder.south) {好};
+\node[anchor=north,emb] (en1) at ([yshift=-1.3em,xshift=-4.5em]encoder.south) {${\mathbi e}$(干)};
-\node[anchor=north,word] (en4) at ([yshift=-1.3em,xshift=3em]encoder.south) {！};
+\node[anchor=north,emb] (en2) at ([yshift=-1.3em,xshift=-1.5em]encoder.south) {${\mathbi e}$(得)};
+\node[anchor=north,emb] (en3) at ([yshift=-1.3em,xshift=1.5em]encoder.south) {${\mathbi e}$(好)};
+\node[anchor=north,emb] (en4) at ([yshift=-1.3em,xshift=4.5em]encoder.south) {${\mathbi e}$(!)};
+\node[anchor=north,po] (po1) at ([yshift=-1.4em]en1.south) {PE(1)};
+\node[anchor=north,po] (po2) at ([yshift=-1.4em]en2.south) {PE(2)};
+\node[anchor=north,po] (po3) at ([yshift=-1.4em]en3.south) {PE(3)};
+\node[anchor=north,po] (po4) at ([yshift=-1.4em]en4.south) {PE(4)};
+\foreach \x in {1,2,3,4}{
+	\node [anchor=north] (plus\x) at ([yshift=-0.04em]en\x.south) {\large{\textbf{$\oplus$}}};
+}
-\node[anchor=north,word] (de1) at ([yshift=-1.3em,xshift=-5.2em]decoder.south) {1};
+\node[anchor=north,po] (de1) at ([yshift=-1.3em,xshift=-4.5em]decoder.south) {PE(1)};
-\node[anchor=north,word] (de2) at ([yshift=-1.3em]decoder.south) {2};
+\node[anchor=north,po] (de2) at ([yshift=-1.3em]decoder.south) {PE(2)};
-\node[anchor=north,word] (de3) at ([yshift=-1.3em,xshift=5em]decoder.south) {3};
+\node[anchor=north,po] (de3) at ([yshift=-1.3em,xshift=4.5em]decoder.south) {PE(3)};
+\node[rounded corners=3pt, minimum width=12em,minimum height=2em,draw,dashed,very thick] (box0) at ([yshift=-2.05em]decoder.south) {};
-\node[p,anchor=south, minimum height=0.5em] (w1_1) at ([xshift=-7em,yshift=1.5em]decoder.north){};
+\node[p,anchor=south, minimum height=0.5em] (w1_1) at ([xshift=-6.5em,yshift=1.5em]decoder.north){};
 \node[p,anchor=south,minimum height=2em] (w1_2) at ([xshift=0.3em]w1_1.south east){};
 \node[p,anchor=south,minimum height=0.7em] (w1_3) at ([xshift=0.3em]w1_2.south east){};
 \node[p,anchor=south,minimum height=0.6em] (w1_4) at ([xshift=0.3em]w1_3.south east){};
@@ -29,7 +39,7 @@
 \node[p,anchor=south,minimum height=0.6em] (w1_7) at ([xshift=0.3em]w1_6.south east){};
 \node[p,anchor=south,minimum height=0.8em] (w1_8) at ([xshift=0.3em]w1_7.south east){};
-\node[p,anchor=south, minimum height=0.5em] (w2_1) at ([xshift=-1.8em,yshift=1.5em]decoder.north){};
+\node[p,anchor=south, minimum height=0.5em] (w2_1) at ([xshift=-1.9em,yshift=1.5em]decoder.north){};
 \node[p,anchor=south,minimum height=2em] (w2_2) at ([xshift=0.3em]w2_1.south east){};
 \node[p,anchor=south,minimum height=0.7em] (w2_3) at ([xshift=0.3em]w2_2.south east){};
 \node[p,anchor=south,minimum height=0.6em] (w2_4) at ([xshift=0.3em]w2_3.south east){};
@@ -38,7 +48,7 @@
 \node[p,anchor=south,minimum height=0.6em] (w2_7) at ([xshift=0.3em]w2_6.south east){};
 \node[p,anchor=south,minimum height=0.8em] (w2_8) at ([xshift=0.3em]w2_7.south east){};
-\node[p,anchor=south, minimum height=0.4em] (w3_1) at ([xshift=3.2em,yshift=1.5em]decoder.north){};
+\node[p,anchor=south, minimum height=0.4em] (w3_1) at ([xshift=2.7em,yshift=1.5em]decoder.north){};
 \node[p,anchor=south,minimum height=0.5em] (w3_2) at ([xshift=0.3em]w3_1.south east){};
 \node[p,anchor=south,minimum height=0.7em] (w3_3) at ([xshift=0.3em]w3_2.south east){};
 \node[p,anchor=south,minimum height=2em] (w3_4) at ([xshift=0.3em]w3_3.south east){};
@@ -54,30 +64,31 @@
 \node[inner sep=0pt,font=\scriptsize] at ([yshift=0.45em]w3_4.north){!};
-\draw[->, thick] ([yshift=0.1em]en1.north) -- ([xshift=-3em,yshift=-0.1em]encoder.south);
+\draw[->, thick] ([yshift=0.1em]en1.north) -- ([xshift=-4.5em,yshift=-0.1em]encoder.south);
-\draw[->, thick] ([yshift=0.1em]en2.north) -- ([xshift=-1em,yshift=-0.1em]encoder.south);
+\draw[->, thick] ([yshift=0.1em]en2.north) -- ([xshift=-1.5em,yshift=-0.1em]encoder.south);
-\draw[->, thick] ([yshift=0.1em]en3.north) -- ([xshift=1em,yshift=-0.1em]encoder.south);
+\draw[->, thick] ([yshift=0.1em]en3.north) -- ([xshift=1.5em,yshift=-0.1em]encoder.south);
-\draw[->, thick] ([yshift=0.1em]en4.north) -- ([xshift=3em,yshift=-0.1em]encoder.south);
+\draw[->, thick] ([yshift=0.1em]en4.north) -- ([xshift=4.5em,yshift=-0.1em]encoder.south);
-\draw[->, thick] ([yshift=0.1em]de1.north) -- ([xshift=-5.2em,yshift=-0.1em]decoder.south);
+\draw[->, thick] ([yshift=0.1em]de1.north) -- ([xshift=-4.5em,yshift=-0.1em]decoder.south);
 \draw[->, thick] ([yshift=0.1em]de2.north) -- ([xshift=0em,yshift=-0.1em]decoder.south);
-\draw[->, thick] ([yshift=0.1em]de3.north) -- ([xshift=5em,yshift=-0.1em]decoder.south);
+\draw[->, thick] ([yshift=0.1em]de3.north) -- ([xshift=4.5em,yshift=-0.1em]decoder.south);
 \draw[->, line width=1.5pt] (encoder.east) -- (decoder.west);
-\begin{pgfonlayer}{background}
-{
 \node[inner sep=2pt] [fit =(w1_1)(w1_2)(w1_8)](box1) {};
 \node[inner sep=2pt] [fit =(w2_1)(w2_2)(w2_8)] (box2){};
 \node[inner sep=2pt] [fit =(w3_1)(w3_2)(w3_8)] (box3){};
-}
+\draw[->, thick] ([yshift=0em]encoder.north) -- ([yshift=0em]lenpre.south);
-\end{pgfonlayer}
+\draw[-, thick] ([yshift=0em]lenpre.north) -- ([yshift=1em]lenpre.north) -- ([xshift=7.1em,yshift=1em]lenpre.north);
+\draw[-, thick,dashed] ([xshift=7.1em,yshift=1em]lenpre.north) -- ([xshift=-2em]box0.west);
+\draw[->, thick] ([xshift=-2em]box0.west) -- (box0.west);
 \draw[->,thick] ([yshift=-1.2em]box1.south) -- (box1.south);
 \draw[->, thick] ([yshift=-1.2em]box2.south) -- (box2.south);
 \draw[->, thick] ([yshift=-1.2em]box3.south) -- (box3.south);
-\node[tgt,anchor=west,align=left] (tgt1) at ([xshift=2em]box3.east) {Good job !};
+\node[tgt,anchor=west,align=left] (tgt1) at ([xshift=1.5em]box3.east) {Good job !};
 \node[tgt,,anchor=north,align=left](tgt2) at ([yshift=-1em]tgt1.south) {Well done !};
 \node[tgt,,anchor=north,align=left] (tgt3) at ([yshift=-1em]tgt2.south) {Good done !};
 \node[tgt,,anchor=north,align=left] (tgt4) at ([yshift=-1em]tgt3.south) {Well job !};

--- a/Chapter14/Figures/figure-non-autoregressive.tex
+++ b/Chapter14/Figures/figure-non-autoregressive.tex
@@ -4,54 +4,128 @@
 %%% outline
 %-------------------------------------------------------------------------
 \begin{tikzpicture}
-	\tikzstyle{word} = [font=\scriptsize]
+	\tikzstyle{emb} = [font=\scriptsize,rounded corners=1pt, fill=orange!20, minimum width=1.8em,minimum height=1.5em,draw]
-\node[rounded corners=3pt, fill=red!20, drop shadow, minimum width=10em,minimum height=4em,draw]  (encoder) at (0,0) {Transformer 编码器    };
+	\tikzstyle{po} = [font=\scriptsize,rounded corners=1pt, fill=gray!20, minimum width=1.8em,minimum height=1.5em,draw]
-\node[draw,anchor=west, rounded corners=2pt, fill=orange!20,minimum width=2.5cm,minimum height=2em] (attention) at ([xshift=0.8cm]encoder.east) {注意力模块};
+\begin{scope} 
-\node[anchor=west, rounded corners=3pt, fill=blue!20, drop shadow, minimum width=10em,minimum height=4em,draw] (decoder) at ([xshift=0.8cm]attention.east) {Transformer 解码器};
+\node[rounded corners=3pt, fill=red!20, drop shadow, minimum width=10em,minimum height=4em,draw]  (encoder) at (0,0) {编码器};
+\node[anchor=north,rounded corners=3pt, fill=yellow!20, drop shadow, minimum width=10em,minimum height=2em,draw] (lenpre) at([yshift=3em]encoder.north){长度预测器};
+\node[anchor=north] (lable) at([xshift=3.5em,yshift=2.5em]lenpre.north){译文长度：4};
+\node[anchor=west, rounded corners=3pt, fill=blue!20, drop shadow, minimum width=16em,minimum height=4em,draw] (decoder) at ([xshift=1.8cm]encoder.east) {解码器};
+\node[anchor=north,emb] (en2) at ([yshift=-1.3em]encoder.south) {${\mathbi e}(x_2)$};
+\node[anchor=north,emb] (en1) at ([yshift=-1.3em,xshift=-3em]encoder.south) {${\mathbi e}(x_1)$};
+\node[anchor=north,emb] (en3) at ([yshift=-1.3em,xshift=3em]encoder.south) {${\mathbi e}(x_3)$};
+\node[anchor=north,po] (po1) at ([yshift=-1.4em]en1.south) {PE(1)};
+\node[anchor=north,po] (po2) at ([yshift=-1.4em]en2.south) {PE(2)};
+\node[anchor=north,po] (po3) at ([yshift=-1.4em]en3.south) {PE(3)};
+\node[anchor=south,font=\small](labelb) at ([xshift=9em,yshift=-2.5em]po3.south){(b) 非自回归翻译模型};
+\foreach \x in {1,2,3}{
+	\node [anchor=north] (plus\x) at ([yshift=-0.04em]en\x.south) {\large{\textbf{$\oplus$}}};
+}
+\node[anchor=north,po] (de1) at ([yshift=-1.3em,xshift=-4.5em]decoder.south) {PE(1)};
+\node[anchor=north,po] (de2) at ([yshift=-1.3em,xshift=-1.5em]decoder.south) {PE(2)};
+\node[anchor=north,po] (de3) at ([yshift=-1.3em,xshift=1.5em]decoder.south) {PE(3)};
+\node[anchor=north,po] (de4) at ([yshift=-1.3em,xshift=4.5em]decoder.south) {PE(4)};
+\node[rounded corners=3pt, minimum width=12.5em,minimum height=2em,draw,dashed,very thick] (box1) at ([yshift=-2.05em]decoder.south) {};
+\node[anchor=south,font=\scriptsize] (out1) at ([yshift=1.3em,xshift=-4.5em]decoder.north) {$y_1$};
+\node[anchor=south,font=\scriptsize] (out2) at ([yshift=1.3em,xshift=-1.5em]decoder.north) {$y_2$};
+\node[anchor=south,font=\scriptsize] (out3) at ([yshift=1.3em,xshift=1.5em]decoder.north) {$y_3$};
+\node[anchor=south,font=\scriptsize] (out4) at ([yshift=1.3em,xshift=4.5em]decoder.north) {$y_4$};
+\draw[->, thick] ([yshift=0em]en1.north) -- ([xshift=-3em,yshift=0em]encoder.south);
+\draw[->, thick] ([yshift=0em]en2.north) -- ([xshift=0em,yshift=0em]encoder.south);
+\draw[->, thick] ([yshift=0em]en3.north) -- ([xshift=3em,yshift=0em]encoder.south);
+\draw[->, thick] ([yshift=0em]encoder.north) -- ([yshift=0em]lenpre.south);
+\draw[-, thick] ([yshift=0em]lenpre.north) -- ([yshift=1em]lenpre.north) -- ([xshift=7.4em,yshift=1em]lenpre.north);
+\draw[-, thick,dashed] ([xshift=7.4em,yshift=1em]lenpre.north) -- ([xshift=-4em]box1.west);
+\draw[->, thick] ([xshift=-4em]box1.west) -- (box1.west);
+\draw[->,thick] ([yshift=0em]de1.north) -- ([xshift=-4.5em]decoder.south);
+\draw[->, thick] ([yshift=0em]de2.north) -- ([xshift=-1.5em]decoder.south);
+\draw[->, thick] ([yshift=0em]de3.north) -- ([xshift=1.5em]decoder.south);
+\draw[->, thick] ([yshift=0em]de4.north) -- ([xshift=4.5em]decoder.south);
+\draw[->, thick] ([xshift=-4.5em]decoder.north) -- ([yshift=-0em]out1.south);
+\draw[->, thick] ([xshift=-1.5em]decoder.north) -- ([yshift=-0em]out2.south);
+\draw[->, thick] ([xshift=1.5em]decoder.north) -- ([yshift=-0em]out3.south);
+\draw[->, thick] ([xshift=4.5em]decoder.north) -- ([yshift=-0em]out4.south);
+\draw[->,line width=1pt] (encoder.east) -- (decoder.west);
+\end{scope} 
+\begin{scope}[yshift=2.8in]
+\node[rounded corners=3pt, fill=red!20, drop shadow, minimum width=10em,minimum height=4em,draw]  (encoder) at (0,0) {编码器};
+\node[anchor=west,minimum width=16em,minimum height=4em] (decoder) at ([xshift=1.8cm]encoder.east) {};
+\node[anchor=north,emb] (en2) at ([yshift=-1.3em]encoder.south) {${\mathbi e}(x_2)$};
+\node[anchor=north,emb] (en1) at ([yshift=-1.3em,xshift=-3em]encoder.south) {${\mathbi e}(x_1)$};
+\node[anchor=north,emb] (en3) at ([yshift=-1.3em,xshift=3em]encoder.south) {${\mathbi e}(x_3)$};
+\node[anchor=north,po] (po1) at ([yshift=-1.4em]en1.south) {PE(1)};
+\node[anchor=north,po] (po2) at ([yshift=-1.4em]en2.south) {PE(2)};
+\node[anchor=north,po] (po3) at ([yshift=-1.4em]en3.south) {PE(3)};
+\foreach \x in {1,2,3}{
+	\node [anchor=north] (plus\x) at ([yshift=-0.04em]en\x.south) {\large{\textbf{$\oplus$}}};
+}
+\node[anchor=north,emb] (de3) at ([yshift=-1.3em]decoder.south) {${\mathbi e}(y_2)$};
+\node[anchor=north,po] (po4) at ([yshift=-1.4em]de3.south) {PE(3)};
+\node[anchor=north,emb] (de2) at ([yshift=-1.3em,xshift=-3.5em]decoder.south) {${\mathbi e}(y_1)$};
+\node[anchor=north,po] (po5) at ([yshift=-1.4em]de2.south) {PE(2)};
+\node[anchor=north,emb] (de1) at ([yshift=-1.3em,xshift=-7em]decoder.south) {${\mathbi e}$(sos)};
+\node[anchor=north,po] (po6) at ([yshift=-1.4em]de1.south) {PE(1)};
+\node[anchor=north,emb] (de4) at ([yshift=-1.3em,xshift=3.5em]decoder.south) {${\mathbi e}(y_3)$};
+\node[anchor=north,po] (po7) at ([yshift=-1.4em]de4.south) {PE(4)};
+\node[anchor=north,emb] (de5) at ([yshift=-1.3em,xshift=7em]decoder.south) {${\mathbi e}(y_4)$};
+\node[anchor=north,po] (po8) at ([yshift=-1.4em]de5.south) {PE(5)};
+\node[anchor=south,font=\small](labela) at ([xshift=1em,yshift=-2.5em]po6.south){(a) 自回归翻译模型};
+\foreach \x in {1,2,3,4,5}{
+	\node [anchor=north] (plus\x) at ([yshift=-0.04em]de\x.south) {\large{\textbf{$\oplus$}}};
+}
+\node[anchor=south,font=\scriptsize] (out1) at ([yshift=1.3em,xshift=-7em]decoder.north) {$y_1$};
+\node[anchor=south,font=\scriptsize] (out2) at ([yshift=1.3em,xshift=-3.5em]decoder.north) {$y_2$};
+\node[anchor=south,font=\scriptsize] (out3) at ([yshift=1.3em,xshift=0em]decoder.north) {$y_3$};
+\node[anchor=south,font=\scriptsize] (out4) at ([yshift=1.3em,xshift=3.5em]decoder.north) {$y_4$};
+\node[anchor=south,font=\scriptsize] (out5) at ([yshift=1.3em,xshift=7em]decoder.north) {<eos>};
+\draw[->, thick] ([yshift=0em]en1.north) -- ([xshift=-3em,yshift=0em]encoder.south);
+\draw[->, thick] ([yshift=0em]en2.north) -- ([xshift=0em,yshift=0em]encoder.south);
+\draw[->, thick] ([yshift=0em]en3.north) -- ([xshift=3em,yshift=0em]encoder.south);
+\draw[->,thick] ([yshift=0em]de1.north) -- ([xshift=-7em]decoder.south);
+\draw[->, thick] ([yshift=0em]de2.north) -- ([xshift=-3.5em]decoder.south);
+\draw[->, thick] ([yshift=0em]de3.north) -- ([xshift=0em]decoder.south);
+\draw[->, thick] ([yshift=0em]de4.north) -- ([xshift=3.5em]decoder.south);
+\draw[->, thick] ([yshift=0em]de5.north) -- ([xshift=7em]decoder.south);
-\node[anchor=north,word] (en1) at ([yshift=-1.3em,xshift=-3em]encoder.south) {hello};
+\draw[->, thick] ([xshift=-7em]decoder.north) -- ([yshift=-0em]out1.south);
-\node[anchor=north,word] (en2) at ([yshift=-1.6em,xshift=-1em]encoder.south) {,};
+\draw[->, thick] ([xshift=-3.5em]decoder.north) -- ([yshift=-0em]out2.south);
-\node[anchor=north,word] (en3) at ([yshift=-1.3em,xshift=1em]encoder.south) {world};
+\draw[->, thick] ([xshift=0em]decoder.north) -- ([yshift=-0em]out3.south);
-\node[anchor=north,word] (en4) at ([yshift=-1.3em,xshift=3em]encoder.south) {!};
+\draw[->, thick] ([xshift=3.5em]decoder.north) -- ([yshift=-0em]out4.south);
+\draw[->, thick] ([xshift=7em]decoder.north) -- ([yshift=-0em]out5.south);
+\draw [->,very thick,dotted] ([xshift=-0.3em]out1.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0em]de2.west);
+\draw [->,very thick,dotted] ([xshift=-0.3em]out2.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0em]de3.west);
+\draw [->,very thick,dotted] ([xshift=-0.3em]out3.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0em]de4.west);
+\draw [->,very thick,dotted] ([xshift=-0.3em]out4.east) .. controls +(east:0.5) and +(west:0.5) ..([xshift=0em]de5.west);
+\node[anchor=west, rounded corners=3pt, fill=blue!20, drop shadow, minimum width=16em,minimum height=4em,draw] (decoder2) at ([xshift=1.8cm]encoder.east) {解码器};
-\node[anchor=north,word] (de1) at ([yshift=-1.3em,xshift=-3em]decoder.south) {1};
+\draw[->,line width=1pt] (encoder.east) -- (decoder.west);
-\node[anchor=north,word] (de2) at ([yshift=-1.3em,xshift=-1em]decoder.south) {2};
+\end{scope}
-\node[anchor=north,word] (de3) at ([yshift=-1.3em,xshift=1 em]decoder.south) {3};
-\node[anchor=north,word] (de4) at ([yshift=-1.3em,xshift=3em]decoder.south) {4};
-\node[anchor=south,word] (out1) at ([yshift=1.3em,xshift=-3em]decoder.north) {你好};
-\node[anchor=south,word] (out2) at ([yshift=1.3em,xshift=-1em]decoder.north) {，};
-\node[anchor=south,word] (out3) at ([yshift=1.3em,xshift=1em]decoder.north) {世界};
-\node[anchor=south,word] (out4) at ([yshift=1.3em,xshift=3em]decoder.north) {！};
-\draw[->, thick] ([yshift=0.1em]en1.north) -- ([xshift=-3em,yshift=-0.1em]encoder.south);
-\draw[->, thick] ([yshift=0.3em]en2.north) -- ([xshift=-1em,yshift=-0.1em]encoder.south);
-\draw[->, thick] ([yshift=0.1em]en3.north) -- ([xshift=1em,yshift=-0.1em]encoder.south);
-\draw[->, thick] ([yshift=0.1em]en4.north) -- ([xshift=3em,yshift=-0.1em]encoder.south);
-\draw[->,thick] ([yshift=0.1em]de1.north) -- ([xshift=-3em]decoder.south);
-\draw[->, thick] ([yshift=0.1em]de2.north) -- ([xshift=-1em]decoder.south);
-\draw[->, thick] ([yshift=0.1em]de3.north) -- ([xshift=1em]decoder.south);
-\draw[->, thick] ([yshift=0.1em]de4.north) -- ([xshift=3em]decoder.south);
-\draw[->, thick] ([xshift=-3em,yshift=0.1em]decoder.north) -- ([yshift=-0.1em]out1.south);
-\draw[->, thick] ([xshift=-1em,yshift=0.1em]decoder.north) -- ([yshift=-0.1em]out2.south);
-\draw[->, thick] ([xshift=1em,yshift=0.1em]decoder.north) -- ([yshift=-0.1em]out3.south);
-\draw[->, thick] ([xshift=3em,yshift=0.1em]decoder.north) -- ([yshift=-0.1em]out4.south);
-\draw[->,line width=1.5pt] (encoder.east) -- (attention.west);
-\draw[->,line width=1.5pt] (attention.east) -- (decoder.west);
-\draw[decorate,decoration={brace, mirror},ublue, very thick] ([xshift=0.5em,yshift=-0.4em]de1.-135) -- node[font=\scriptsize,text=black,yshift=-1em]{预测译文长度 \& 计算位置编码}([xshift=-0.5em,yshift=-0.4em]de4.-45);
-%\begin{pgfonlayer}{background}
-%{
-%\node[rectangle,draw=ublue, inner sep=0mm] [fit =(original1)(ht1)(mt1)(ht1-4)] {};
-%}
-%\end{pgfonlayer}
 \end{tikzpicture}

--- a/Chapter14/Figures/figure-reproduction-rate.tex
+++ b/Chapter14/Figures/figure-reproduction-rate.tex
@@ -8,15 +8,16 @@
 	\tikzstyle{cir} = [draw,circle,minimum size=1em, thick,inner sep=0pt]
 	%encoder
-	\node[layer,fill=red!15] (src_emb) at (0,0){\scriptsize\textbf{Input Embedding}};
+	\node[layer,fill=red!15] (src_emb) at (0,0){\scriptsize\textbf{Embedding}};
 	\node[anchor=south,layer,fill=yellow!20] (src_sa) at ([yshift=3.7em]src_emb.north){\scriptsize\textbf{Self-attention}};
 	\node[anchor=south,layer,fill=orange!20] (src_ff) at ([yshift=1em]src_sa.north){\scriptsize\textbf{Feed Forward}};
 	\node[anchor=south,layer,fill=blue!20] (src_sf) at ([yshift=2.6em]src_ff.north){\scriptsize\textbf{Softmax}};
 	%decoder
-	\node[anchor=west,layer,fill=red!15] (tgt_emb) at ([xshift=4.4em]src_emb.east){\scriptsize\textbf{Output Embedding}};
+	\node[anchor=west,layer,fill=red!15] (tgt_emb) at ([xshift=4.4em]src_emb.east){\scriptsize\textbf{Embedding}};
 	\node[anchor=south,layer,fill=yellow!20] (tgt_sa) at ([yshift=3.7em]tgt_emb.north){\scriptsize\textbf{Self-attention}};
 	\node[anchor=south,layer,fill=yellow!20] (tgt_pa) at ([yshift=1.5em]tgt_sa.north){\scriptsize\textbf{Positional Attention}};
+	\node[anchor=south,layer,draw=red,dashed,line width=2pt,minimum height=1.55em] (tgt_paa) at ([yshift=1.5em]tgt_sa.north){};
 	\node[anchor=south,layer,fill=yellow!20] (tgt_eda) at ([yshift=1.5em]tgt_pa.north){\scriptsize\textbf{Encoder-Decoder} \\  \scriptsize\textbf{Attention}};
 	\node[anchor=south,layer,fill=orange!20] (tgt_ff) at ([yshift=1em]tgt_eda.north){\scriptsize\textbf{Feed Forward}};
 	\node[anchor=south,layer,fill=green!20] (tgt_linear) at ([yshift=1.4em]tgt_ff.north){\scriptsize\textbf{Linear}};
@@ -63,6 +64,7 @@
 	\draw[line] (src_sf.north) -- (w3.south);
 	\draw[line] (tgt_sf.north) -- (output.south);
 	\draw[line] (src.north) -- (src_emb.south);
+	\draw[line] (tgt.north) -- (tgt_emb.south);
 	\draw[line,<->,out=-35,in=-145] ([xshift=-2em]src_sa.south) to ([xshift=2em]src_sa.south);
 	\draw[line, rounded corners=2pt] (src_ff.north) -- ([yshift=1.1em]src_ff.north) -- ([xshift=-2.4em,yshift=-0.8em]tgt_eda.south) -- ([xshift=-2.4em]tgt_eda.south);
 	\draw[line, rounded corners=2pt] (src_ff.north) -- ([yshift=1.1em]src_ff.north) -- ([yshift=-0.8em]tgt_eda.south) -- (tgt_eda.south);
@@ -84,8 +86,8 @@
 \node[] at ([xshift=2em]box3.east){\normalsize{解码器}};
 	\node[] at ([xshift=1em,yshift=-6em]box3.east){{$\times N$}};
-	\draw[line,dotted,rounded corners=4pt,violet] (box2.north) -- ([yshift=1em]box2.north) -- ([yshift=1em,xshift=5.8em]box2.north) -- ([xshift=-2.35em]tgt_emb.west) -- (tgt_emb.west);
+	\draw[line,dotted,rounded corners=4pt,violet] (box2.north) -- ([yshift=1em]box2.north) -- ([yshift=1em,xshift=5.8em]box2.north) -- ([xshift=-1.8em]tgt.west) -- (tgt.west);
-	\draw[line,-,dotted,rounded corners=4pt,violet,] (src_emb.east) -- ([xshift=-2em]tgt_emb.west);
+	\draw[line,-,dotted,rounded corners=4pt,violet] (src.east) -- ([xshift=-1.8em]tgt.west);
 \end{tikzpicture}

--- a/Chapter14/Figures/figure-reranking.tex
+++ b/Chapter14/Figures/figure-reranking.tex
@@ -15,10 +15,10 @@
 	\node[module, minimum width=8em] (encoder) at (0,0) {编码器组件};
 	\node[module,anchor=west, minimum width=8em] (decoder) at ([xshift=4em]encoder.east){解码器组件};
 	\node[module,anchor=west, minimum width=8em] (decoder2) at ([xshift=4em]decoder.east){解码器组件};
-	\node[module,anchor=north, minimum width=6em,font=\scriptsize,inner ysep=4pt] (deinput) at ([yshift=-2em]decoder2.south){解码端输入};
+	\node[module,anchor=north, minimum width=6em,font=\scriptsize,inner ysep=4pt] (deinput) at ([yshift=-2em]decoder2.south){解码器输入};
 	\node[anchor=south,font=\footnotesize] (mod1) at ([yshift=0.4em]encoder.north){\small\bfnew{编码器模块}};
 	\node[anchor=south,font=\footnotesize] (mod2) at ([yshift=0.4em]decoder.north){\small\bfnew{调序模块}};
-	\node[anchor=south,font=\footnotesize] (mod3) at ([yshift=0.4em]decoder2.north){\small\bfnew{解码端}};
+	\node[anchor=south,font=\footnotesize] (mod3) at ([yshift=0.4em]decoder2.north){\small\bfnew{解码器模块}};
 \begin{pgfonlayer}{background}
 {
@@ -27,14 +27,14 @@
 \node[box][fit=(decoder2)(mod3)] (box3) {};
 }
 \end{pgfonlayer}
-	\node[anchor=north,font=\scriptsize,align=center] (w1) at ([yshift=-2em]encoder.south){\scriptsize\bfnew{There exist different} \\ \scriptsize\bfnew{opinions on this question}};
+	\node[anchor=north,font=\scriptsize,align=center] (w1) at ([yshift=-2em]encoder.south){\scriptsize\bfnew{There exist different} \\ \scriptsize\bfnew{opinions on this question .}};
-	\node[anchor=north,font=\scriptsize,align=center] (w2) at ([yshift=-2em]decoder.south){\scriptsize\bfnew{There exist different} \\ \scriptsize\bfnew{opinions on this question}};
+	\node[anchor=north,font=\scriptsize,align=center] (w2) at ([yshift=-2em]decoder.south){\scriptsize\bfnew{There exist different} \\ \scriptsize\bfnew{opinions on this question .}};
 	\node[anchor=north,font=\scriptsize,text=gray] (w3) at ([yshift=0.6em]w2.south){\scriptsize\bfnew{（复制源语言句子）}};
-	\node[anchor=south,font=\scriptsize,align=center] (w4) at ([yshift=1.6em]box2.north){\scriptsize\bfnew{on this question} \\ \scriptsize\bfnew{There exist different opinions}};
+	\node[anchor=south,font=\scriptsize,align=center] (w4) at ([yshift=1.6em]box2.north){\scriptsize\bfnew{on this question} \\ \scriptsize\bfnew{There exist different opinions .}};
-	\node[anchor=south,font=\scriptsize,align=center] (w5) at ([yshift=1.6em]box3.north){\tiny\bfnew{对 \ 这个 \ 问题 \ 存在 \ 不同的 \ 看法}};
+	\node[anchor=south,font=\scriptsize,align=center] (w5) at ([yshift=1.6em]box3.north){\tiny\bfnew{对 \ 这个 \ 问题 \ 存在 \ 不同的 \ 看法 \  。}};
 	\node[font=\tiny] at ([xshift=-0.8em,yshift=-0.6em]encoder.east) {$N\times$};
 	\node[font=\tiny] at ([xshift=-0.8em,yshift=-0.6em]decoder.east) {$1\times$};
-	\node[font=\tiny] at ([xshift=-1em,yshift=-0.6em]decoder2.east) {$N-1\times$};
+	\node[font=\tiny] at ([xshift=-1.2em,yshift=-0.6em]decoder2.east) {$N-1\times$};
 	\draw[line] (w1.north) -- (box1.south);
 	\draw[line] (w2.north) -- (box2.south);
@@ -44,7 +44,7 @@
 	\draw[line] (box1.east) -- (box2.west);
 	\draw[line] (box2.east) -- (box3.west);
 	\draw[line,rounded corners=2pt,dotted,brown(traditional)] (w1.south) -- ([yshift=-1.6em]w1.south) -- ([yshift=-2.3em]deinput.south) -- (deinput.south);
-	\draw[line,rounded corners=2pt,dotted,brown(traditional)] (w4.east) -- ([xshift=0.9em]w4.east) -- ([xshift=-3em]deinput.west) -- (deinput.west);
+	\draw[line,rounded corners=2pt,dotted,brown(traditional)] (w4.east) -- ([xshift=0.9em]w4.east) -- ([xshift=-2.7em]deinput.west) -- (deinput.west);
 \end{tikzpicture}

--- a/Chapter14/Figures/figure-syntax.tex
+++ b/Chapter14/Figures/figure-syntax.tex
 \begin{tikzpicture}
 \tikzstyle{encoder} = [rectangle,thick,rounded corners,minimum width=1.9cm,minimum height=1.2cm,text centered,draw=black,fill=red!25]
 \tikzstyle{autodecoder} = [rectangle,thick,rounded corners,minimum width=3cm,minimum height=1.2cm,text centered,draw=black,fill=blue!15]
-\tikzstyle{nonautodecoder} = [rectangle,thick,rounded corners,minimum width=3.4cm,minimum height=1.2cm,text centered,draw=black!70,fill=blue!15]
+\tikzstyle{nonautodecoder} = [rectangle,thick,rounded corners,minimum width=4cm,minimum height=1.2cm,text centered,draw=black!70,fill=blue!15]
 \node (encoder)[encoder] at (0,0) {编码器};
-\node (text_left)[anchor=south] at ([yshift=-3em]encoder.south) {\footnotesize{猫\ 在\ 熟睡}};
+%\node (des)[anchor=north] at ([yshift=2cm]encoder.north) {<Mask>：<Mask>};
+\node (text_left)[anchor=south] at ([yshift=-3em]encoder.south) {\footnotesize{猫\ 在\ 熟睡\ 。}};
 \node (autodecoder)[autodecoder,right of=encoder,xshift=6em ] {自回归解码器};
-\node (text_mid1)[anchor=north] at ([yshift=3em]autodecoder.north) {\scriptsize{NP1\ VP3\ <eos>}};
+\node (text_mid1)[anchor=north] at ([yshift=3em]autodecoder.north) {\scriptsize{NP1\ VP3\ PU1\ <eos>}};
-\node (text_mid2)[anchor=south] at ([yshift=-3em]autodecoder.south) {\scriptsize{<sos>\ NP1\ VP3}};
+\node (text_mid2)[anchor=south] at ([yshift=-3em]autodecoder.south) {\scriptsize{<sos>\ NP1\ VP3\ PU1}};
-\node (nonautodecoder)[nonautodecoder,right of=autodecoder,xshift=10.5em ] {非自回归解码器};
+\node (nonautodecoder)[nonautodecoder,right of=autodecoder,xshift=12.5em] {非自回归解码器};
-\node (text_right1)[anchor=north] at ([yshift=3em]nonautodecoder.north) {\scriptsize{NP1\;Cats\;VP3\;sleep\;a\;lot}};
+\node (text_right1)[anchor=north] at ([yshift=3em]nonautodecoder.north) {\scriptsize{NP1\;Cats\;VP3\;sleep\;a\;lot\;PU1\;.}};
-\node (text_right2)[anchor=south] at ([yshift=-3em]nonautodecoder.south) {\scriptsize{NP1\;<Mask>\;VP3\;<Mask>\;<Mask>\;<Mask>}};
+\node (text_right2)[anchor=south] at ([yshift=-3em]nonautodecoder.south) {\scriptsize{NP1\;<Mask>\;VP3\;<Mask>\;<Mask>\;<Mask>\;PU1\;<Mask>}};
+\draw[->,thick] (encoder.east) to (autodecoder.west);
 \draw[->,thick] ([yshift=0.1em]text_left.north) to (encoder.south);
 \draw[->,thick] ([yshift=0.1em]text_mid2.north) to (autodecoder.south);
 \draw[->,thick] (autodecoder.north) to ([yshift=-0.1em]text_mid1.south);
 \draw[->,thick] ([yshift=0.1em]text_right2.north) to (nonautodecoder.south);
 \draw[->,thick] (nonautodecoder.north) to ([yshift=-0.1em]text_right1.south);
-\draw[->,thick] (text_mid1.east) -- ([xshift=2.1em]text_mid1.east) -- ([xshift=-1.2em]text_right2.west)-- (text_right2.west);
+\draw[->,thick] (text_mid1.east) -- ([xshift=1.4em]text_mid1.east) -- ([xshift=-1.2em]text_right2.west)-- (text_right2.west);
 \draw[-,thick] (encoder.north) to ([yshift=0.8em]encoder.north);
-\draw[-,thick,dashed] ([yshift=0.8em]encoder.north) -- ([xshift=-7em,yshift=0.8em]nonautodecoder.north) -- ([xshift=-2.5em]nonautodecoder.west);
+\draw[-,thick,dashed] ([yshift=0.8em]encoder.north) -- ([xshift=-7.7em,yshift=0.8em]nonautodecoder.north) --([xshift=-2.5em]nonautodecoder.west);
-\draw[->,thick]([xshift=-2.5em]nonautodecoder.west) to (nonautodecoder.west);
+\draw[->,thick] ([xshift=-2.5em]nonautodecoder.west) -- (nonautodecoder.west);
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter14/chapter14.tex
+++ b/Chapter14/chapter14.tex
@@ -23,9 +23,19 @@
 \chapter{神经机器翻译模型推断}
-\parinterval 推断是神经机器翻译中的核心问题。由于训练时双语句子对模型是可见的，但是在推断阶段，模型需要根据输入的源语言句子预测译文，因此神经机器翻译的推断和训练过程有着很大的不同。特别是，推断系统往往对应着机器翻译实际部署的需要，因此机器翻译推断系统的精度和速度等也是实践中需要考虑的。
+\parinterval 推断是神经机器翻译中的核心问题。训练时双语句子对模型是可见的，但是在推断阶段，模型需要根据输入的源语言句子预测译文，因此神经机器翻译的推断和训练过程有着很大的不同。特别是，推断系统往往对应着机器翻译实际部署的需要，因此机器翻译推断系统的精度和速度等也是实践中需要考虑的。
-\parinterval 本章对神经机器翻译模型推断的若干问题进行讨论。主要涉及三方面内容：1）神经机器翻译的基本问题，如推断方向、译文长度控制等；2）神经机器翻译的推断加速方法，如轻量模型、非自回归模型等；3）多模型集成推断。
+\parinterval 本章对神经机器翻译模型推断的若干问题进行讨论。主要涉及三方面内容：
+\begin{itemize}
+\vspace{0.5em}
+\item 神经机器翻译的基本问题，如推断方向、译文长度控制等。
+\vspace{0.5em}
+\item 神经机器翻译的推断加速方法，如轻量模型、非自回归模型等。
+\vspace{0.5em}
+\item 多模型集成推断。
+\vspace{0.5em}
+\end{itemize}
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
@@ -53,10 +63,10 @@
 \begin{itemize}
 \vspace{0.5em}
-\item 预测模块，也就是根据已经生成的部分译文和源语言信息，预测下一个要生成的译文单词的概率分布\footnote{在统计机器翻译中，翻译的每一步也可以同时预测若干个连续的单词，即短语。在神经机器翻译中也有类似于生成短语的方
+\item {\small\sffamily\bfseries{预测模块}}，它根据已经生成的部分译文和源语言信息，预测下一个要生成的译文单词的概率分布\footnote{在统计机器翻译中，翻译的每一步也可以同时预测若干个连续的单词，即短语。在神经机器翻译中也有类似于生成短语的方
 法，但是主流的方法还是按单词为单位进行生成。}。因此预测模块实际上就是一个模型打分装置；
 \vspace{0.5em}
-\item 搜索模块，它会利用预测结果，对当前的翻译假设进行打分，并根据模型得分对翻译假设进行排序和剪枝。
+\item {\small\sffamily\bfseries{搜索模块}}，它会利用预测结果，对当前的翻译假设进行打分，并根据模型得分对翻译假设进行排序和剪枝。
 \vspace{0.5em}
 \end{itemize}
@@ -132,7 +142,7 @@
 \parinterval 不论是自左向右推断还是自右向左推断，本质上都是在对上下文信息进行建模。此外，研究人员也提出了许多新的译文生成策略，比如，从中部向外生成\upcite{DBLP:conf/nips/MehriS18}、按源语言顺序生成\upcite{Stahlberg2018AnOS}、基于插入的方式生成\upcite{Stern2019InsertionTF,stling2017NeuralMT}等。或者将翻译问题松弛化为一个连续空间模型的优化问题，进而在推断的过程中同时使用译文左右两端的信息\upcite{Geng2018AdaptiveMD}。
-\parinterval 最近，以BERT 为代表的预训练语言模型已经证明，一个单词的“历史” 和“未来” 信息对于生成当前单词都是有帮助的\upcite{devlin2019bert}。类似的观点也在神经机器翻译编码器设计中得到验证。比如，在基于循环神经网络的模型中，经常同时使用自左向右和自右向左的方式对源语言句子进行编码。还有，Transformer 编码器会使用整个句子的信息对每一个源语言位置进行表示。因此，神经机器翻译的推断采用类似的策略是有其合理性的。
+\parinterval 最近，以BERT 为代表的预训练语言模型已经证明，一个单词的“历史” 和“未来” 信息对于生成当前单词都是有帮助的\upcite{devlin2019bert}。类似的观点也在神经机器翻译编码器设计中得到验证。比如，在基于循环神经网络的模型中，经常同时使用自左向右和自右向左的方式对源语言句子进行编码；在Transformer 模型中，编码器会使用整个句子的信息对每一个源语言位置进行表示。因此，神经机器翻译的推断采用类似的策略是有其合理性的。
 %----------------------------------------------------------------------------------------
 %    NEW SUBSUB-SECTION
@@ -140,7 +150,7 @@
 \subsection{译文长度控制}
-\parinterval 机器翻译推断的一个特点是译文长度需要额外的机制进行控制\upcite{Kikuchi2016ControllingOL,Takase2019PositionalET,Murray2018CorrectingLB,Sountsov2016LengthBI}。这是因为机器翻译在建模时仅考虑了将训练样本（即标准答案）上的损失最小化，但是推断的时候会看到从未见过的样本，而且这些未见样本占据了样本空间的绝大多数。该问题会导致的一个现象是：直接使用训练好的模型会翻译出长度短得离谱的译文。神经机器翻译模型使用单词概率的乘积表示整个句子的翻译概率，它天然就倾向生成短译文，因为概率为大于0小于1的常数，短译文会使用更少的概率因式相乘，倾向于得到更高的句子得分，而模型只关心每个目标语言位置是否被正确预测，对于译文长度没有考虑。译文长度不合理的问题也出现在统计机器翻译模型中，常见的策略是在推断过程中引入译文长度控制机制\upcite{Koehn2007Moses}。神经机器翻译也借用了类似的思想来控制译文长度，有以下几种方法：
+\parinterval 机器翻译推断的一个特点是译文长度需要额外的机制进行控制\upcite{Kikuchi2016ControllingOL,Takase2019PositionalET,Murray2018CorrectingLB,Sountsov2016LengthBI}。这是因为机器翻译在建模时仅考虑了将训练样本（即标准答案）上的损失最小化，但是推断的时候会看到从未见过的样本，而且这些未见样本占据了样本空间的绝大多数。该问题会导致的一个现象是：直接使用训练好的模型会翻译出长度短得离谱的译文。神经机器翻译模型使用单词概率的乘积表示整个句子的翻译概率，它天然就倾向生成短译文，因为概率为大于0小于1的常数，短译文会使用更少的概率因式相乘，倾向于得到更高的句子得分，而模型只关心每个目标语言位置是否被正确预测，对于译文长度没有考虑。统计机器翻译模型中也存在译文长度不合理的问题，解决该问题的常见策略是在推断过程中引入译文长度控制机制\upcite{Koehn2007Moses}。神经机器翻译也借用了类似的思想来控制译文长度，有以下几种方法：
 \begin{itemize}
 \vspace{0.5em}
@@ -217,19 +227,19 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \parinterval 机器翻译系统的输出并不仅限于单个译文。很多情况下，需要多个译文。比如，译文重排序中通常就需要系统的$n$-best输出，在交互式机器翻译中也往往需要提供多个译文供用户选择\upcite{Peris2017InteractiveNM,Peris2018ActiveLF}。但是，无论是统计机器翻译还是神经机器翻译，都面临一个同样的问题：$n$-best输出中的译文十分相似。实例\ref{eg:14-1}就展示了一个神经机器翻译输出的多个翻译结果，可以看到这些译文的区别很小。这个问题也被看做是机器翻译缺乏译文多样性的问题\upcite{Gimpel2013ASE,Li2016MutualIA,DBLP:conf/emnlp/DuanLXZ09,DBLP:conf/acl/XiaoZZW10,xiao2013bagging}。
 \begin{example}
-源语言句子：我们期待安理会尽早就此作出决定。
+源语言句子：我们\ 期待\ 安理会\ 尽早\ 就此\ 作出\ 决定\ 。
 \qquad\ 机器译文\ \,1\ ：We look forward to the Security Council making a decision on this
-\hspace{8.3em}as soon as possible.
+\hspace{8.3em}as soon as possible .
 \qquad\ 机器译文\ \,2\ ：We look forward to the Security Council making a decision on this
-\hspace{8.3em}issue as soon as possible.
+\hspace{8.3em}issue as soon as possible .
 \qquad\ 机器译文\ 3\ ：We hope that the Security Council will make a decision on this
-\hspace{8.4em}issue as soon as possible.
+\hspace{8.4em}issue as soon as possible .
 \label{eg:14-1}
 \end{example}
@@ -244,9 +254,9 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsection{搜索错误}
-\parinterval 机器翻译的错误分为两类：搜索错误和模型错误。搜索错误是指由于搜索算法的限制，即使潜在的搜索空间中有更好的解，模型也无法找到。比较典型的例子是，在对搜索结果进行剪枝的时候，如果剪枝过多，找到的结果很有可能不是最优的。这时就出现了搜索错误。而模型错误则是指由于模型学习能力的限制无法将好的结果排序在前面，即使这个结果在搜索空间中被覆盖到。
+\parinterval 机器翻译的错误分为两类：搜索错误和模型错误。搜索错误是指由于搜索算法的限制，即使潜在的搜索空间中有更好的解，模型也无法找到。比较典型的例子是，在对搜索结果进行剪枝的时候，如果剪枝过多，找到的结果很有可能不是最优的。这时就出现了搜索错误。而模型错误则是指由于模型学习能力的限制，即使搜索空间中存在最优解，模型也无法将该解排序在前面。
-\parinterval 在统计机器翻译中，搜索错误可以通过减少剪枝进行缓解。比较简单的方式是增加搜索束宽度，这往往会带来一定的性能提升\upcite{Xiao2016ALA}。也可以对搜索问题进行单独建模，以保证学习到的模型出现更少的搜索错误\upcite{Liu2014SearchAwareTF,Yu2013MaxViolationPA}。但是，在神经机器翻译中，这个问题却表现出不同的现象：在很多神经机器翻译系统中，随着搜索束的增大，系统的BLEU不升反降。图\ref{fig:14-3}展示了BLEU随束大小的变化曲线，这里为了使该图更加规整直观，横坐标处将束大小进行了取对数操作。这个现象与传统的常识是相违背的，因此也有一些研究尝试解释这个现象\upcite{Stahlberg2019OnNS,Niehues2017AnalyzingNM}。在实验中，研究人员发现增加搜索束的大小会导致翻译生成的结果变得更短。他们将这个现象归因于：神经机器翻译的建模基于局部归一的最大似然估计，增加搜索束的大小，会导致更多的模型错误\upcite{Sountsov2016LengthBI,Murray2018CorrectingLB,StahlbergNeural}。此外，也有研究人员把这种翻译过短的现象归因于搜索错误\upcite{Stahlberg2019OnNS}。 由于搜索时所面临的搜索空间是十分巨大的，因此搜索时可能无法找到模型定义的“最好”的译文。在某种意义上，这也体现了训练和推断不一致的问题（见{\chapterthirteen}）。
+\parinterval 在统计机器翻译中，搜索错误可以通过减少剪枝进行缓解。比较简单的方式是增加搜索束宽度，这往往会带来一定的性能提升\upcite{Xiao2016ALA}。也可以对搜索问题进行单独建模，以保证学习到的模型出现更少的搜索错误\upcite{Liu2014SearchAwareTF,Yu2013MaxViolationPA}。但是，在神经机器翻译中，这个问题却表现出不同的现象：在很多神经机器翻译系统中，随着搜索束的增大，系统的BLEU不升反降。图\ref{fig:14-3}展示了神经机器翻译系统中BLEU随搜索束大小的变化曲线，这里为了使该图更加规整直观，横坐标处将束大小进行了取对数操作。这个现象与传统的常识是相违背的，因此也有一些研究尝试解释这个现象\upcite{Stahlberg2019OnNS,Niehues2017AnalyzingNM}。在实验中，研究人员发现增加搜索束的大小会导致翻译生成的结果变得更短。他们将这个现象归因于：神经机器翻译的建模基于局部归一的最大似然估计，增加搜索束的大小，会导致更多的模型错误\upcite{Sountsov2016LengthBI,Murray2018CorrectingLB,StahlbergNeural}。此外，也有研究人员把这种翻译过短的现象归因于搜索错误\upcite{Stahlberg2019OnNS}。 由于搜索时所面临的搜索空间是十分巨大的，因此搜索时可能无法找到模型定义的“最好”的译文。在某种意义上，这也体现了训练和推断不一致的问题（见{\chapterthirteen}）。
 %----------------------------------------------------------------------
 \begin{figure}[htp]
@@ -257,7 +267,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \end{figure}
 %----------------------------------------------------------------------
-\parinterval 一种解决问题的思路是从训练和推断行为不一致的角度切入。比如，为了解决曝光偏置问题\upcite{Ranzato2016SequenceLT}，可以让系统使用前面步骤的预测结果作为预测下一个词所需要的历史信息，而不是依赖于标准答案\upcite{Bengio2015ScheduledSF,Zhang2019BridgingTG}。为了解决训练和推断目标不一致的问题，可以在训练的时候模拟推断的行为，同时让模型训练的目标与评价系统的标准尽可能一致\upcite{DBLP:conf/acl/ShenCHHWSL16}。
+\parinterval 一种解决问题的思路是从“训练和推断行为不一致”的角度切入。比如，为了解决曝光偏置问题\upcite{Ranzato2016SequenceLT}，可以让系统使用前面步骤的预测结果作为预测下一个词所需要的历史信息，而不是依赖于标准答案\upcite{Bengio2015ScheduledSF,Zhang2019BridgingTG}。为了解决训练和推断目标不一致的问题，可以在训练的时候模拟推断的行为，同时让模型训练的目标与评价系统的标准尽可能一致\upcite{DBLP:conf/acl/ShenCHHWSL16}。
 \parinterval 需要注意的是，前面提到的搜索束变大造成的翻译品质下降的问题还有其它解决方法。比如，可以通过对结果重排序来缓解这个问题\upcite{DBLP:conf/emnlp/Yang0M18}，也可以通过设计更好的覆盖度模型来生成长度更加合理的译文\upcite{li-etal-2018-simple}。从这个角度说，上述问题的成因也较为复杂，因此需要同时考虑模型错误和搜索错误。
@@ -267,7 +277,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \section{轻量模型}\label{sec:14-3}
-\parinterval 翻译速度和翻译精度之间的平衡是机器翻译系统研发中的常见问题。即使是以提升翻译品质为目标的任务（如用BLEU进行评价），也不得不考虑翻译速度的影响。比如，在很多任务中会构造伪数据，涉及对大规模单语数据的翻译；无监督机器翻译中也会频繁地使用神经机器翻译系统构造训练数据。如果翻译速度过慢会增大实验的周期。从应用的角度看，在很多场景下翻译速度甚至比翻译品质更重要。比如，在线翻译和一些小设备上的机器翻译系统都需要保证相对低的翻译时延，以满足用户体验的最基本要求。虽然，我们希望能有一套又好又快的翻译系统，但是现实的情况是：往往需要通过牺牲一些翻译品质来换取翻译速度的提升。下面就列举一些常用的神经机器翻译轻量模型和加速方法。这些方法通常是应用在神经机器翻译的解码器，因为相比编码器，解码器是推断过程中最耗时的部分。
+\parinterval 翻译速度和翻译精度之间的平衡是机器翻译系统研发中的常见问题。即使是以提升翻译品质为目标的任务（如用BLEU进行评价），也不得不考虑翻译速度的影响。比如，在很多任务中会构造伪数据，该过程涉及对大规模单语数据的翻译；无监督机器翻译中也会频繁地使用神经机器翻译系统构造训练数据。在这些情况下，如果翻译速度过慢会增大实验的周期。从应用的角度看，在很多场景下翻译速度甚至比翻译品质更重要。比如，在线翻译和一些小设备上的机器翻译系统都需要保证相对低的翻译时延，以满足用户体验的最基本要求。虽然，我们希望能有一套又好又快的翻译系统，但是现实的情况是：往往需要通过牺牲一些翻译品质来换取翻译速度的提升。下面就列举一些常用的神经机器翻译轻量模型和加速方法。这些方法通常应用在神经机器翻译的解码器上，因为相比编码器，解码器是推断过程中最耗时的部分。
 %----------------------------------------------------------------------------------------
 %    NEW SUBSUB-SECTION
@@ -296,7 +306,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsection{消除冗余计算}
-\parinterval 消除不必要的计算是加速机器翻译系统的另一种方法。比如，在统计机器翻译时代，假设重组就是一种典型的避免冗余计算的手段（见{\chapterseven}）。对于神经机器翻译中，消除冗余计算的一种简单有效的方法是对解码器的注意力结果进行缓存。以Transformer为例，在生成每个译文时，Transformer 模型会对当前位置之前的所有位置进行自注意力操作，但是这些计算里只有和当前位置相关的计算是“新” 的，前面位置之间的注意力结果已经在之前的解码步骤里计算过，因此可以对其进行缓存。
+\parinterval 消除不必要的计算是加速机器翻译系统的另一种方法。比如，在统计机器翻译时代，假设重组就是一种典型的避免冗余计算的手段（见{\chapterseven}）。在神经机器翻译中，消除冗余计算的一种简单有效的方法是对解码器的注意力结果进行缓存。以Transformer为例，在生成每个译文单词时，Transformer 模型会对当前位置之前的所有位置进行自注意力操作，但是这些计算里只有和当前位置相关的计算是“新” 的，前面位置之间的注意力结果已经在之前的解码步骤里计算过，因此可以对其进行缓存。
 \parinterval 此外，由于Transformer 模型较为复杂，还存在很多冗余。比如，Transformer 的每一层会包含自注意力机制、层正则化、残差连接、前馈神经网络等多种不同的结构。同时，不同结构之间还会包含一些线性变换。多层Transformer模型会更加复杂。但是，这些层可能在做相似的事情，甚至有些计算根本就是重复的。图\ref{fig:14-5}中展示了解码器自注意力和编码-解码注意力中不同层的注意力权重的相似性，这里的相似性利用Jensen-Shannon散度进行度量\upcite{61115}。可以看到，自注意力中，2-6层之间的注意力权重的分布非常相似。编码-解码注意力也有类似的现象，临近的层之间有非常相似的注意力权重。这个现象说明：在多层神经网络中有些计算是冗余的，因此很自然的想法是消除这些冗余使得机器翻译变得更“轻”。
@@ -329,11 +339,11 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsection{轻量解码器及小模型}
-\parinterval 在推断时，神经机器翻译的解码器是最耗时的，因为每个目标语言位置需要单独输出单词的分布，同时在搜索过程中每一个翻译假设都要被扩展成多个翻译假设，进一步增加了计算量。因此，另一种思路是使用更加轻量的解码器加快翻译假设的生成速度\upcite{Hinton2015Distilling,Munim2019SequencelevelKD}。
+\parinterval 在推断时，神经机器翻译的解码器是最耗时的，因为每个目标语言位置需要单独输出单词的分布，同时在搜索过程中每一个翻译假设都要被扩展成多个翻译假设，进一步增加了计算量。因此，提高推断速度的一种思路是使用更加轻量的解码器加快翻译假设的生成速度\upcite{Hinton2015Distilling,Munim2019SequencelevelKD}。
 \parinterval 比较简单的做法是把解码器的网络变得更“浅”、更“窄”。所谓浅网络是指使用更少的层构建神经网络，比如，使用3 层，甚至1 层网络的Transformer 解码器。所谓窄网络是指将网络中某些层中神经元的数量减少。不过，直接训练这样的小模型会带来翻译品质的下降。这时会考虑使用知识蒸馏等技术来提升小模型的品质（见{\chapterthirteen}）。
-\parinterval 另一种思路是化简Transformer 解码器的神经网络。比如，可以使用平均注意力机制代替原始Transformer 中的自注意力机制\upcite{DBLP:journals/corr/abs-1805-00631}，也可以使用运算更轻的卷积操作代替注意力模块\upcite{Wu2019PayLA}。前面提到的基于共享注意力机制的模型也是一种典型的轻量模型\upcite{Xiao2019SharingAW}。这些方法本质上也是对注意力模型结构的优化，这类思想在近几年也受到了很多关注 \upcite{Kitaev2020ReformerTE,Katharopoulos2020TransformersAR,DBLP:journals/corr/abs-2006-04768}，在{\chapterfifteen}也会有进一步讨论。
+\parinterval 另一种提高推断速度的思路是化简Transformer 解码器的神经网络。比如，可以使用平均注意力机制代替原始Transformer 中的自注意力机制\upcite{DBLP:journals/corr/abs-1805-00631}，也可以使用运算更轻的卷积操作代替注意力模块\upcite{Wu2019PayLA}。前面提到的基于共享注意力机制的模型也是一种典型的轻量模型\upcite{Xiao2019SharingAW}。这些方法本质上也是对注意力模型结构的优化，这类思想在近几年也受到了很多关注 \upcite{Kitaev2020ReformerTE,Katharopoulos2020TransformersAR,DBLP:journals/corr/abs-2006-04768}，在{\chapterfifteen}也会有进一步讨论。
 \parinterval 此外，使用异构神经网络也是一种平衡精度和速度的有效方法。在很多研究中发现，基于Transformer 的编码器对翻译品质的影响更大，而解码器的作用会小一些。因此，一种想法是使用速度更快的解码器结构，比如，用基于循环神经网络的解码器代替Transformer模型中基于注意力机制的解码器\upcite{Chen2018TheBO}。这样，既能发挥Transformer 在编码上的优势，同时也能利用循环神经网络在解码器速度上的优势。使用类似的思想，也可以用卷积神经网络等结构进行解码器的设计。
@@ -351,9 +361,9 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \begin{itemize}
 \vspace{0.5em}
-\item 批次生成策略。对于源语言文本预先给定的情况，通常是按句子长度组织每个批次，即：把长度相似的句子放到一个批次里。这样做的好处是可以尽可能保证一个批次中的内容是“满” 的，否则如果句长差异过大会造成批次中有很多位置用占位符填充，产生无用计算。对于实时翻译的情况，批次的组织较为复杂。由于有翻译时延的限制，可能无法等到有足够多的句子就要进行翻译。常见的做法是，设置一个等待的时间，在同一个时间段中的句子可以放到一个批次中（或者几个批次中）。对于高并发的情况，也可以考虑使用不同的{\small\sffamily\bfseries{桶}}\index{桶}（Bucket\index{Bucket}）保存不同长度范围的句子，之后将同一个桶中的句子进行批量推断。这个问题在{\chaptereighteen}中还会做进一步讨论。
+\item {\small\sffamily\bfseries{批次生成策略}}。对于源语言文本预先给定的情况，通常是按句子长度组织每个批次，即：把长度相似的句子放到一个批次里。这样做的好处是可以尽可能保证一个批次中的内容是“满” 的，否则如果句长差异过大会造成批次中有很多位置用占位符填充，产生无用计算。对于实时翻译的情况，批次的组织较为复杂。由于有翻译时延的限制，可能无法等到有足够多的句子就要进行翻译。常见的做法是，设置一个等待的时间，在同一个时间段中的句子可以放到一个批次中（或者几个批次中）。对于高并发的情况，也可以考虑使用不同的{\small\sffamily\bfseries{桶}}\index{桶}（Bucket\index{Bucket}）保存不同长度范围的句子，之后将同一个桶中的句子进行批量推断。这个问题在{\chaptereighteen}中还会做进一步讨论。
 \vspace{0.5em}
-\item 批次大小的选择。一个批次中的句子数量越多，GPU 设备的利用率越高，系统吞吐越大。但是，一个批次中所有句子翻译结束后才能拿到翻译结果，因此批次中有些句子即使已经翻译结束也要等待其它没有完成的句子。也就是说，从单个句子来看，批次越大翻译的延时越长，这也导致在翻译实时性要求较高的场景中，不能使用过大的批次。而且，大批次对GPU 显存的消耗更大，因此也需要根据具体任务合理选择批次大小。为了说明这些问题，图\ref{fig:14-7}展示了不同批次大小下的时延和显存消耗。
+\item {\small\sffamily\bfseries{批次大小的选择}}。一个批次中的句子数量越多，GPU 设备的利用率越高，系统吞吐越大。但是，一个批次中所有句子翻译结束后才能拿到翻译结果，因此批次中有些句子即使已经翻译结束也要等待其它没有完成的句子。也就是说，从单个句子来看，批次越大翻译的延时越长，这也导致在翻译实时性要求较高的场景中，不能使用过大的批次。而且，大批次对GPU 显存的消耗更大，因此也需要根据具体任务合理选择批次大小。为了说明这些问题，图\ref{fig:14-7}展示了不同批次大小下的时延和显存消耗。
 \vspace{0.5em}
 \end{itemize}
@@ -376,11 +386,11 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \begin{itemize}
 \vspace{0.5em}
-\item 半精度浮点运算。半精度浮点运算是随着近几年GPU 技术发展而逐渐流行的一种运算方式。简单来说，半精度的表示要比单精度需要更少的存储单元，所表示的浮点数范围也相应的变小。不过，实践中已经证明神经机器翻译中的许多运算用半精度计算就可以满足对精度的要求。因此，直接使用半精度运算可以大大加速系统的训练和推断进程，同时对翻译品质的影响很小。不过，需要注意的是，在分布式训练的时候，由于参数服务器需要对多个计算节点上的梯度进行累加，因此保存参数时仍然会使用单精度浮点以保证多次累加之后不会造成过大的精度损失。
+\item {\small\sffamily\bfseries{半精度浮点运算}}。半精度浮点运算是随着近几年GPU 技术发展而逐渐流行的一种运算方式。简单来说，半精度的表示要比单精度需要更少的存储单元，所表示的浮点数范围也相应地变小。不过，实践中已经证明神经机器翻译中的许多运算用半精度计算就可以满足对精度的要求。因此，直接使用半精度运算可以大大加速系统的训练和推断进程，同时对翻译品质的影响很小。不过，需要注意的是，在分布式训练时，由于参数服务器需要对多个计算节点上的梯度进行累加，因此保存参数时仍然会使用单精度浮点以保证多次累加之后不会造成过大的精度损失。
 \vspace{0.5em}
-\item  整型运算。整型运算是一种比浮点运算“轻” 很多的运算。无论是芯片占用面积、能耗还是处理单次运算的时钟周期数，整型运算相比浮点运算都有着明显的优势。不过，整数的表示和浮点数有着很大的不同。一个基本的问题是，整数是不连续的，因此无法准确的刻画浮点数中很小的小数。对于这个问题，一种解决方法是利用“量化+ 反量化+ 缩放” 的策略让整型运算达到近似浮点运算的效果\upcite{DBLP:journals/corr/abs-1906-00532,DBLP:conf/cvpr/JacobKCZTHAK18,DBLP:journals/corr/abs-1910-10485}。所谓“量化” 就是把一个浮点数离散化为一个整数，“反量化” 是这个过程的逆过程。由于浮点数可能超出整数的范围，因此会引入一个缩放因子。在量化前将浮点数缩放到整数可以表示的范围，反量化前再缩放回原始浮点数的表示范围。这种方法在理论上可以带来很好的加速效果。不过由于量化和反量化的操作本身也有时间消耗，而且在不同处理器上的表现差异较大。因此不同的实现方式带来的加速效果并不相同，需要通过实验测算。
+\item {\small\sffamily\bfseries{整型运算}}。整型运算是一种比浮点运算“轻” 很多的运算。无论是芯片占用面积、能耗还是处理单次运算的时钟周期数，相比浮点运算，整型运算都有着明显的优势。不过，整数的表示和浮点数有着很大的不同。一个基本问题是，整数是不连续的，因此无法准确地刻画浮点数中很小的小数。对于这个问题，一种解决方法是利用“量化+ 反量化+ 缩放” 的策略让整型运算达到与浮点运算近似的效果\upcite{DBLP:journals/corr/abs-1906-00532,DBLP:conf/cvpr/JacobKCZTHAK18,DBLP:journals/corr/abs-1910-10485}。所谓“量化” 就是把一个浮点数离散化为一个整数，“反量化” 是这个过程的逆过程。由于浮点数可能超出整数的范围，因此会引入一个缩放因子：在量化前将浮点数缩放到整数可以表示的范围，反量化前再缩放回原始浮点数的表示范围。这种方法在理论上可以带来很好的加速效果。不过，由于量化和反量化的操作本身也有时间消耗，而且在不同处理器上的表现差异较大，因此不同实现方式带来的加速效果并不相同，需要通过实验测算。
 \vspace{0.5em}
-\item 低精度整型运算。使用更低精度的整型运算是进一步加速的手段之一。比如使用16 位整数、8 位整数，甚至4 位整数在理论上都会带来速度的提升，如表\ref{tab:14-3}所示。不过，并不是所有处理器都支持低精度整型的运算。开发这样的系统，一般需要硬件和特殊低精度整型计算库的支持。而且相关计算大多是在CPU 上实现，应用会受到一定的限制。
+\item {\small\sffamily\bfseries{低精度整型运算}}。使用更低精度的整型运算是进一步加速的手段之一。比如使用16 位整数、8 位整数，甚至4 位整数在理论上都会带来速度的提升，如表\ref{tab:14-3}所示。不过，并不是所有处理器都支持低精度整型的运算。开发这样的系统，一般需要硬件和特殊低精度整型计算库的支持。而且相关计算大多是在CPU 上实现，应用会受到一定的限制。
 \vspace{0.5em}
 \end{itemize}
@@ -405,7 +415,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \section{非自回归翻译}
-\parinterval 目前大多数神经机器翻译模型都使用自左向右逐词生成译文的策略，也就是，第$j$个目标语言单词的生成依赖于先前生成的$j-1$ 个词。这种翻译方式也被称作{\small\sffamily\bfseries{自回归解码}}\index{自回归解码}（Autoregressive Decoding）\index{Autoregressive Decoding}。虽然以Transformer为代表的模型使得训练过程高度并行化，加快了训练速度。但由于推断过程自回归的特性，模型无法同时生成译文中的所有单词，这导致模型的推断过程非常缓慢，这对于神经机器翻译的实际应用是个很大的挑战。因此，如何设计一个在训练和推断阶段都能够并行化的模型是目前研究的热点之一。
+\parinterval 目前大多数神经机器翻译模型都使用自左向右逐词生成译文的策略，即第$j$个目标语言单词的生成依赖于先前生成的$j-1$ 个词。这种翻译方式也被称作{\small\sffamily\bfseries{自回归解码}}\index{自回归解码}（Autoregressive Decoding）\index{Autoregressive Decoding}。虽然以Transformer为代表的模型使得训练过程高度并行化，加快了训练速度，但由于推断过程自回归的特性，模型无法同时生成译文中的所有单词，这导致模型的推断过程非常缓慢，对于神经机器翻译的实际应用是个很大的挑战。因此，如何设计一个在训练和推断阶段都能够并行化的模型是目前研究的热点之一。
 %----------------------------------------------------------------------------------------
 %    NEW SUBSUB-SECTION
@@ -413,7 +423,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsection{自回归 vs 非自回归}
-\parinterval 目前主流的神经机器翻译的推断是一种{\small\sffamily\bfseries{自回归翻译}}\index{自回归翻译}（Autoregressive Translation）\index{Autoregressive Translation}过程。所谓自回归是一种描述时间序列生成的方式。对于目标序列$\seq{y}=\{y_1,\dots,y_n\}$，自回归模型假设$j$时刻状态$y_j$的生成依赖于之前的状态$\{y_1,\dots,y_{j-1}\}$，而且$y_j$与$\{y_1,\dots,y_{j-1}\}$构成线性关系，那么生成$y_j$就是自回归的序列生成过程。神经机器翻译借用了这个概念，但是并不要求使用线性模型，\ref{sec:14-2-1}节提到的自左向右翻译模型和自右向左翻译模型都属于自回归翻译模型。自回归模型在机器翻译任务上也有很好的表现，特别是配合束搜索也能够有效地寻找近似最优译文。但是，由于解码器的每个步骤必须顺序地而不是并行地运行，自回归翻译模型会阻碍不同译文单词生成的并行化。特别是在GPU 上，翻译的自回归性会大大降低计算的并行度，设备利用率低。
+\parinterval 目前主流的神经机器翻译的推断是一种{\small\sffamily\bfseries{自回归翻译}}\index{自回归翻译}（Autoregressive Translation）\index{Autoregressive Translation}过程。所谓自回归是一种描述时间序列生成的方式。对于目标序列$\seq{y}=\{y_1,\dots,y_n\}$，自回归模型假设$j$时刻状态$y_j$的生成依赖于之前的状态$\{y_1,\dots,y_{j-1}\}$，而且$y_j$与$\{y_1,\dots,y_{j-1}\}$构成线性关系，那么生成$y_j$就是自回归的序列生成过程。神经机器翻译借用了这个概念，但是并不要求使用线性模型，\ref{sec:14-2-1}节提到的自左向右翻译模型和自右向左翻译模型都属于自回归翻译模型。自回归模型在机器翻译任务上也有很好的表现，特别是配合束搜索往往能够有效地寻找近似最优译文。但是，由于解码器的每个步骤必须顺序地而不是并行地运行，自回归翻译模型会阻碍不同译文单词生成的并行化。特别是在GPU 上，翻译的自回归性会大大降低计算的并行度和设备利用率。
 \parinterval 对于这个问题，研究人员也考虑移除翻译的自回归性，进行{\small\sffamily\bfseries{非自回归翻译}}\index{非自回归翻译}（Non-Autoregressive Translation，NAT）\index{Non-Autoregressive Translation}\upcite{Gu2017NonAutoregressiveNM}。一个简单的非自回归翻译模型将问题建模为公式\eqref{eq:14-9}：
@@ -430,7 +440,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsection{非自回归翻译模型的结构}
-\parinterval 在介绍非自回归模型的具体结构之前，先来看看如何实现一个简单的非自回归翻译模型。这里用标准的Transformer来举例。首先为了一次性生成所有的词，需要丢弃解码器对未来信息屏蔽的矩阵，从而去掉模型的自回归性。此外，还要考虑生成译文的长度。自回归模型每步的输入是上一步解码出的结果，当预测到终止符<eos>时，序列的生成就自动停止了，然而非自回归模型却没有这样的特性，因此还需要一个长度预测器来预测出其长度，之后再用这个长度得到每个位置的表示，进而完成整个序列的生成。
+\parinterval 在介绍非自回归模型的具体结构之前，先来看看如何实现一个简单的非自回归翻译模型。这里用标准的Transformer来举例。首先为了一次性生成所有的词，需要丢弃解码器对未来信息屏蔽的矩阵，从而去掉模型的自回归性。此外，还要考虑生成译文的长度。自回归模型每步的输入是上一步解码出的结果，当预测到终止符<eos>时，序列的生成就自动停止了，然而非自回归模型却没有这样的特性，因此还需要一个长度预测器来预测出其长度，之后再用这个长度得到每个位置的表示，将其作为解码器的输入，进而完成整个序列的生成。
 \parinterval 图\ref{fig:14-12}对比了自回归翻译模型和简单的非自回归翻译模型。可以看到这种非自回归翻译模型可以一次性生成完整的译文。不过，高并行性也带来了翻译品质的下降。比如，在IWSLT 英德等数据上的BLEU[\%] 值只有个位数，而现在最好的自回归模型已经能够达到30左右的BLEU得分。这是因为每个位置词的预测只依赖于源语言句子$\seq{x}$，使得预测不准确。
@@ -443,7 +453,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \end{figure}
 %----------------------------------------------------------------------
-\parinterval 完全独立地对每个词建模，会出现什么问题呢？来看一个例子，将汉语句子“干/得/好/！”翻译成英文，可以翻译成“Good job !”或者“Well done !”。假设生成这两种翻译的概率是相等的，即一半的概率是“Good job !”，另一半的概率是“Well done !”。由于非自回归模型的条件独立性假设，推断时第一个词“Good”和“Well”的概率是差不多大的，如果第二个词“job”和“done”的概率也差不多大，会使得模型生成出“Good done !”或者“Well job !”这样错误的翻译，如图\ref{fig:14-13}所示。这便是影响句子质量的关键问题，称之为{\small\sffamily\bfseries{多峰问题}}\index{多峰问题}（Multi-modality Problem）\index{Multi-modality Problem}\upcite{Gu2017NonAutoregressiveNM}。如何有效处理非自回归模型中的多峰问题  是提升非自回归模型质量的关键。
+\parinterval 完全独立地对每个词建模，会出现什么问题呢？来看一个例子，将汉语句子“干/得/好/！”翻译成英文，可以翻译成“Good job !”或者“Well done !”。假设生成这两种翻译的概率是相等的，即一半的概率是“Good job !”，另一半的概率是“Well done !”。由于非自回归模型的条件独立性假设，推断时第一个词“Good”和“Well”的概率是差不多大的，如果第二个词“job”和“done”的概率也差不多大，会使得模型生成出“Good done !”或者“Well job !”这样错误的翻译，如图\ref{fig:14-13}所示。这便是影响句子质量的关键问题，称之为{\small\sffamily\bfseries{多峰问题}}\index{多峰问题}（Multimodality Problem）\index{Multimodality Problem}\upcite{Gu2017NonAutoregressiveNM}。如何有效处理非自回归模型中的多峰问题是提升非自回归模型质量的关键。
 %----------------------------------------------------------------------
 \begin{figure}[htp]
@@ -462,9 +472,9 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \subsubsection{1. 基于繁衍率的非自回归模型}
-\parinterval 图\ref{fig:14-14}给出了基于繁衍率的Transformer非自回归模型的结构\upcite{Gu2017NonAutoregressiveNM}，由三个模块组成:编码器，解码器，繁衍率预测器。类似于标准的Transformer模型，这里编码器和解码器都完全由前馈神经网络和多头注意力模块组成。唯一的不同是解码器中新增了位置注意力模块，用于更好的捕捉目标语言端的位置信息。
+\parinterval 图\ref{fig:14-14}给出了基于繁衍率的Transformer非自回归模型的结构\upcite{Gu2017NonAutoregressiveNM}，由三个模块组成：编码器、解码器和繁衍率预测器。类似于标准的Transformer模型，这里编码器和解码器都完全由前馈神经网络和多头注意力模块组成。唯一的不同是解码器中新增了位置注意力模块（图\ref{fig:14-14}中被红色虚线框住的模块），用于更好地捕捉目标语言端的位置信息。
-\parinterval 繁衍率预测器的一个作用是预测整个译文句子的长度，以便并行生成所有译文单词。可以通过对每个源语言单词计算繁衍率来估计最终译文的长度。具体来说，繁衍率指的是：根据每个源语言单词预测出其对应的目标语言单词的个数（见\chaptersix），如图\ref{fig:14-14}所示，翻译过程中英语单词“We”对应一个汉语单词“我们”，其繁衍率为1。于是，可以得到源语言句子对应的繁衍率序列（图\ref{fig:14-14}中的数字1\ 1\ 2\ 0\ 1），最终译文长度则由源语言单词的繁衍率之和决定。在模型训练阶段，繁衍率序列可以通过外部词对齐工具得到， 之后训练繁衍率预测器。但由于外部词对齐系统会出现错误，因此在模型收敛之后，可以对繁衍率预测器进行额外的微调。
+\parinterval 繁衍率预测器的一个作用是预测整个译文句子的长度，以便并行地生成所有译文单词。可以通过对每个源语言单词计算繁衍率来估计最终译文的长度。具体来说，繁衍率指的是：根据每个源语言单词预测出其对应的目标语言单词的个数（见\chaptersix），如图\ref{fig:14-14}所示，翻译过程中英语单词“We”对应一个汉语单词“我们”，其繁衍率为1。于是，可以得到源语言句子对应的繁衍率序列（图\ref{fig:14-14}中的数字1\ 1\ 2\ 0\ 1），最终译文长度则由源语言单词的繁衍率之和决定。在模型训练阶段，繁衍率序列可以通过外部词对齐工具得到， 用于之后训练繁衍率预测器。但由于外部词对齐系统会出现错误，因此在模型收敛之后，可以对繁衍率预测器进行额外的微调。
 %----------------------------------------------------------------------
 \begin{figure}[htp]
@@ -503,7 +513,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \parinterval 虽然非自回归翻译可以显著提升翻译速度，但是很多情况下其翻译质量还是低于传统的自回归翻译\upcite{Gu2017NonAutoregressiveNM,Kaiser2018FastDI,Guo2020FineTuningBC}。因此，很多工作致力于缩小自回归模型和非自回归模型的性能差距\upcite{Ran2020LearningTR,Tu2020ENGINEEI,Shu2020LatentVariableNN}。
-\parinterval 一种直接的方法是层级知识蒸馏\upcite{Li2019HintBasedTF}。由于自回归模型和非自回归模型的结构相差不大，因此可以将翻译质量更高的自回归模型作为“教师”，通过给非自回归模型提供监督信号，使其逐块地学习前者的分布。研究人员发现了两点非常有意思的现象：1）非自回归模型容易出现“重复翻译”的现象，这些相邻的重复单词所对应的位置的隐藏状态非常相似。2）非自回归模型的注意力分布比自回归模型的分布更加尖锐。这两点发现启发了研究人员使用自回归模型中的隐层状态来指导非自回归模型学习。可以计算两个模型隐层状态的距离以及注意力矩阵的KL散度\footnote{KL散度即相对熵。}，将它们作为额外的损失指导非自回归模型的训练。类似的做法也出现在基于模仿学习的方法中\upcite{Wei2019ImitationLF}，它也可以被看作是对自回归模型不同层行为的模拟。不过，基于模仿学习的方法会使用更复杂的模块来完成自回归模型对非自回归模型的指导，比如，在非自回归模型中使用一个额外的神经网络，用于接收自回归模型的层级监督信号。
+\parinterval 一种直接的方法是层级知识蒸馏\upcite{Li2019HintBasedTF}。由于自回归模型和非自回归模型的结构相差不大，因此可以将翻译质量更高的自回归模型作为“教师”，通过给非自回归模型提供监督信号，使其逐块地学习前者的分布。研究人员发现了两点非常有意思的现象：1）非自回归模型容易出现“重复翻译”的现象，这些相邻的重复单词所对应的位置的隐藏状态非常相似。2）非自回归模型的注意力分布比自回归模型的分布更加尖锐。这两点发现启发了研究人员使用自回归模型中的隐层状态和注意力矩阵等中间表示来指导非自回归模型学习。可以计算两个模型隐层状态的距离以及注意力矩阵的KL散度\footnote{KL散度即相对熵。}，将它们作为额外的损失指导非自回归模型的训练。类似的做法也出现在基于模仿学习的方法中\upcite{Wei2019ImitationLF}，它也可以被看作是对自回归模型不同层行为的模拟。不过，基于模仿学习的方法会使用更复杂的模块来完成自回归模型对非自回归模型的指导，比如，在自回归模型和非自回归模型中都使用一个额外的神经网络，用于传递自回归模型提供给非自回归模型的层级监督信号。
 \parinterval 此外，也可以使用基于正则化因子的方法\upcite{Wang2019NonAutoregressiveMT}。非自回归模型的翻译结果中存在着两种非常严重的错误：重复翻译和不完整的翻译。重复翻译问题是因为解码器隐层状态中相邻的两个位置过于相似，因此翻译出来的单词也一样。对于不完整翻译，或者说欠翻译，通常将其归咎于非自回归模型在翻译的过程中丢失了一些源语言句子的信息。针对这两个问题，可以通过在相邻隐层状态间添加相似度约束来计算一个重构损失。具体实践时，对于翻译$\seq{x}\to\seq{y}$，通过一个反向的自回归模型再将$\seq{y}$翻译成$\seq{x'}$，最后计算$\seq{x}$与$\seq{x'}$的差异性作为损失。
@@ -568,7 +578,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \end{figure}
 %----------------------------------------------
-\parinterval 除了使用上一个步骤的输出，当前解码器的输入还可以使用了添加噪声的正确目标语言句子\upcite{Lee2018DeterministicNN}。另外，对于译文长度的预测，也可以使用编码器的输出单独训练一个独立的长度预测模块，这种方法也推广到了目前大多数模型上。
+\parinterval 除了使用上一个步骤的输出，当前解码器的输入还可以使用添加噪声的正确目标语言句子\upcite{Lee2018DeterministicNN}。另外，对于译文长度的预测，也可以使用编码器的输出单独训练一个独立的长度预测模块，这种方法也推广到了目前大多数非自回归模型上。
 \parinterval 另一种方法借鉴了BERT的思想\upcite{devlin2019bert}，称为Mask-Predict\upcite{Ghazvininejad2019MaskPredictPD}。类似于BERT中的<CLS>标记，该方法在源语言句子的最前面加上了一个特殊符号<LEN>作为输入，用来预测目标句的长度$n$。之后，将特殊符<Mask>（与BERT中的<Mask>有相似的含义）复制$n$次作为解码器的输入，然后用非自回归的方式生成所有的译文单词。这样生成的翻译可能是比较差的，因此可以将第一次生成的这些词中不确定（即生成概率比较低）的一些词“擦”掉，依据剩余的译文单词以及源语言句子重新进行预测，不断迭代，直到满足停止条件为止。图\ref{fig:14-19}给出了一个示例。
@@ -589,7 +599,7 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \parinterval 在机器学习领域，把多个模型融合成一个模型是提升系统性能的一种有效方法。比如，在经典的AdaBoost 方法中\upcite{DBLP:journals/jcss/FreundS97}，用多个“弱” 分类器构建的“强” 分类器可以使模型在训练集上的分类错误率无限接近0。类似的思想也被应用到机器翻译中\upcite{DBLP:conf/acl/XiaoZZW10,DBLP:conf/icassp/SimBGSW07,DBLP:conf/acl/RostiMS07,DBLP:conf/wmt/RostiZMS08}，被称为{\small\sffamily\bfseries{系统融合}}\index{系统融合}（System Combination）\index{System Combination}。在各种机器翻译比赛中，系统融合已经成为经常使用的技术之一。因为许多模型融合方法都是在推断阶段完成，因此此类方法开发的代价较低。
-\parinterval 广义上来讲，使用多个特征组合的方式都可以被看作是一种模型的融合。融合多个神经机器翻译系统的方法有很多，可以分为假设选择、局部预测融合、译文重组三类，下面进行介绍。
+\parinterval 广义上来讲，使用多个特征组合的方式都可以被看作是一种模型的融合。融合多个神经机器翻译系统的方法有很多，可以分为假设选择、局部预测融合、译文重组三类，下面分别进行介绍。
 %----------------------------------------------------------------------------------------
 %    NEW SUBSUB-SECTION
@@ -703,9 +713,9 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
 \vspace{0.5em}
 \item 在对机器翻译推断系统进行实际部署时，对存储的消耗也是需要考虑的因素。因此如何让模型变得更小也是研发人员所关注的方向。当前的模型压缩方法主要可以分为几类：剪枝、量化、知识蒸馏和轻量方法，其中轻量方法主要是基于更轻量模型结构的设计，这类方法已经在本章进行了介绍。剪枝主要包括权重大小剪枝\upcite{Han2015LearningBW,Lee2019SNIPSN,Frankle2019TheLT,Brix2020SuccessfullyAT}、 面向多头注意力的剪枝\upcite{Michel2019AreSH,DBLP:journals/corr/abs-1905-09418}、网络层以及其他结构剪枝等\upcite{Liu2017LearningEC,Liu2019RethinkingTV}，还有一些方法也通过在训练期间采用正则化的方式来提升剪枝能力\upcite{DBLP:conf/iclr/FanGJ20}。量化方法主要通过截断浮点数来减少模型的存储大小，使其仅使用几个比特位的数字表示方法便能存储整个模型，虽然会导致舍入误差，但压缩效果显著\upcite{DBLP:journals/corr/abs-1906-00532,Cheong2019transformersZ,Banner2018ScalableMF,Hubara2017QuantizedNN}。一些方法利用知识蒸馏手段还将Transformer模型蒸馏成如LSTMs 等其他各种推断速度更快的结构\upcite{Hinton2015Distilling,Munim2019SequencelevelKD,Tang2019DistillingTK}。
 \vspace{0.5em}
-\item 目前的翻译模型使用交叉熵损失作为优化函数，这在自回归模型上取得了非常优秀的性能。交叉熵是一个严格的损失函数，每个预测错误的单词所对应的位置都会受到惩罚，即使是编辑距离很小的输出序列。自回归模型会很大程度上避免这种惩罚，因为当前位置的单词是根据先前生成的词得到的，然而非自回归模型无法获得这种信息。如果在预测时漏掉一个单词，就可能会将正确的单词放在错误的位置上。为此，一些研究工作通过改进损失函数来提高非自回归模型的性能。一种做法使用一种新的交叉熵函数\upcite{Ghazvininejad2020AlignedCE}，它通过忽略绝对位置、关注相对顺序和词汇匹配来为非自回归模型提供更精确的训练信号。另外，也可以使用基于$n$-gram的训练目标\upcite{Shao2020MinimizingTB}来最小化模型与参考译文之间的$n$-gram差异。该训练目标在$n$-gram 的层面上评估预测结果，因此能够建模目标序列单词之间的依赖关系。
+\item 目前的翻译模型使用交叉熵损失作为优化函数，这在自回归模型上取得了非常优秀的性能。交叉熵是一个严格的损失函数，每个预测错误的单词所对应的位置都会受到惩罚，即使是编辑距离很小的输出序列\upcite{Ghazvininejad2020AlignedCE}。自回归模型会很大程度上避免这种惩罚，因为当前位置的单词是根据先前生成的词得到的，然而非自回归模型无法获得这种信息。如果在预测时漏掉一个单词，就可能会将正确的单词放在错误的位置上。为此，一些研究工作通过改进损失函数来提高非自回归模型的性能。一种做法是使用一种新的交叉熵函数\upcite{Ghazvininejad2020AlignedCE}，它通过忽略绝对位置、关注相对顺序和词汇匹配来为非自回归模型提供更精确的训练信号。另外，也可以使用基于$n$-gram的训练目标\upcite{Shao2020MinimizingTB}来最小化模型与参考译文之间的$n$-gram差异。该训练目标在$n$-gram 的层面上评估预测结果，因此能够建模目标序列单词之间的依赖关系。
 \vspace{0.5em}
-\item 自回归模型解码时，当前位置单词的生成依赖于先前生成的单词，已生成的单词提供了较强的目标端上下文信息。与自回归模型相比，非自回归模型的解码器需要在信息更少的情况下执行翻译任务。一些研究工作通过将条件随机场引入非自回归模型中来对结构依赖进行建模\upcite{Ma2019FlowSeqNC}。也有工作引入了词嵌入转换矩阵来将源语言端的词嵌入转换为目标语言端的词嵌入来为解码器提供更好的输入\upcite{Guo2019NonAutoregressiveNM}。此外，研究人员也提出了轻量级的调序模块来显式地建模调序信息，以指导非自回归模型的推断\upcite{Ran2019GuidingNN}。
+\item 自回归模型解码时，当前位置单词的生成依赖于先前生成的单词，已生成的单词提供了较强的目标端上下文信息。与自回归模型相比，非自回归模型的解码器需要在信息更少的情况下执行翻译任务。一些研究工作通过将条件随机场引入非自回归模型中来对序列依赖进行建模\upcite{Ma2019FlowSeqNC}。也有工作引入了词嵌入转换矩阵来将源语言端的词嵌入转换为目标语言端的词嵌入来为解码器提供更好的输入\upcite{Guo2019NonAutoregressiveNM}。此外，研究人员也提出了轻量级的调序模块来显式地建模调序信息，以指导非自回归模型的推断\upcite{Ran2019GuidingNN}。
 \vspace{0.5em}
 \end{itemize}