update

b1d01dc3 · 曹润柘 · e0927e2a · b1d01dc3 · b1d01dc3 · b1d01dc3
Commit b1d01dc3 authored Dec 24, 2020 by 曹润柘
--- a/Chapter17/Figures/figure-an-end-to-end-voice-translation-model-based-on-transformer.tex
+++ b/Chapter17/Figures/figure-an-end-to-end-voice-translation-model-based-on-transformer.tex
@@ -17,11 +17,11 @@
 \node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.4em]de_sa.north){Multi-Head \\ Attention};
 \node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};

-\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=2em]de_ffn.north){Softmax};
+\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
 \node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){STLoss};

-\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FilterBank/MFCC)};
-\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){目标文本\\(Embedding)};
+\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FilterBank/MFCC)};
+\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){Target Text\\(Embedding)};

 \node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
 \node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){Position\\(Embedding)};
@@ -40,8 +40,8 @@
 \draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
 \draw[->,rounded corners=2pt] ([yshift=0.1em]en_ffn.90) -- ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
 \begin{pgfonlayer}{background}
-\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt][fit=(en_sa)(en_ffn)]{};
-\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt][fit=(de_sa)(de_ca)(de_ffn)]{};
+\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(en_sa)(en_ffn)]{};
+\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(de_sa)(de_ca)(de_ffn)]{};
 \end{pgfonlayer}

 \node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};

--- a/Chapter17/Figures/figure-application-of-multimodal-machine-translation-to-multitask-learning.tex
+++ b/Chapter17/Figures/figure-application-of-multimodal-machine-translation-to-multitask-learning.tex
-\tikzstyle{coder} = [rectangle,thick,rounded corners,minimum width=2.3cm,minimum height=1cm,text centered,draw=black,fill=red!25]
+\tikzstyle{coder} = [rectangle,rounded corners,minimum height=2.2em,minimum width=4.3em,text centered,draw=black,fill=red!25]
 \begin{tikzpicture}[node distance = 0,scale = 1]
 \tikzstyle{every node}=[scale=1]
 \node(x)[]{x};
@@ -6,7 +6,7 @@
 \node(decoder_left)[coder, above of = encoder, yshift=6em,fill=blue!25]{{解码器}};
 \node(y_hat)[above of = decoder_left, yshift=4em]{{$\rm y'$}};
 \node(y)[above of = decoder_left, xshift=-6em]{{$\rm y$}};
-\node(decoder_right)[coder, above of = encoder, xshift=12em,fill=yellow!25]{{解码器}};
+\node(decoder_right)[coder, above of = encoder, xshift=11em,fill=yellow!25]{{解码器}};

 \node(figure)[draw=white,above of = decoder_right,yshift=6.5em,scale=0.25] {\includegraphics[width=0.62\textwidth]{./Chapter17/Figures/figure-bank-without-attention.png}};

@@ -14,6 +14,6 @@
 \draw[->,thick](encoder)to(decoder_left)node[right,xshift=-0.1cm,yshift=-1.25cm,scale=1.0]{翻译};
 \draw[->,thick](decoder_left)to(y_hat);
 \draw[->,thick](y)to(decoder_left);
-\draw[->,thick](encoder)to(decoder_right)node[left,xshift=-3.8em,yshift=0.25cm,scale=1.0]{生成图片};
+\draw[->,thick](encoder)to(decoder_right)node[left,xshift=-3.1em,yshift=0.25cm,scale=1.0]{生成图片};
 \draw[->,thick](decoder_right)to(figure);
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/Figures/figure-image-description-of-encoder-decoder-framework.tex
+++ b/Chapter17/Figures/figure-image-description-of-encoder-decoder-framework.tex
@@ -20,11 +20,11 @@

 \foreach \x in {1,2}{
 \draw[-,line width=2pt] (A\x) -- ([xshift=3.6em]A\x) -- ([xshift=3.6em,yshift=-3em]A\x) -- ([yshift=-3em]A\x) -- (A\x) -- ([xshift=1em]A\x);
-\draw[-, very thick] (B\x) -- (C\x) -- (D\x) -- (B\x);
-\draw[-, very thick,fill=black] ([xshift=-0.6em,yshift=-1.2em]B\x)  -- ([xshift=-0.3em,yshift=-1em]B\x) -- ([yshift=-1.2em]B\x) --([xshift=0.3em,yshift=-1em]B\x) -- ([xshift=0.6em,yshift=-1.2em]B\x) -- (D\x) -- (C\x) -- ([xshift=-0.6em,yshift=-1.2em]B\x);
-\draw[-, very thick,fill=black] (E\x) -- ([xshift=0.2em,yshift=0.3em]E\x) -- ([xshift=0.33em]F\x) -- (F\x) -- (E\x);
+\draw[-, thick] (B\x) -- (C\x) -- (D\x) -- (B\x);
+\draw[-, thick,fill=black] ([xshift=-0.6em,yshift=-1.2em]B\x)  -- ([xshift=-0.3em,yshift=-1em]B\x) -- ([yshift=-1.2em]B\x) --([xshift=0.3em,yshift=-1em]B\x) -- ([xshift=0.6em,yshift=-1.2em]B\x) -- (D\x) -- (C\x) -- ([xshift=-0.6em,yshift=-1.2em]B\x);
+\draw[-, thick,fill=black] (E\x) -- ([xshift=0.2em,yshift=0.3em]E\x) -- ([xshift=0.33em]F\x) -- (F\x) -- (E\x);
 \node[circle,inner sep=0pt,minimum size=0.4em,fill=black] at ([xshift=-0.7em,yshift=-0.2em]B\x){};
-\node[draw,rounded corners=2pt,fill=yellow!20,minimum width=2.3cm,minimum height=1cm](cnn\x) at ([xshift=9.5em,yshift=-1.5em]A\x){CNN};
+\node[draw,rounded corners=2pt,fill=yellow!20,minimum height=2.2em,minimum width=4.3em](cnn\x) at ([xshift=9.5em,yshift=-1.5em]A\x){CNN};
 \node[draw,circle,fill=green!20,font=\footnotesize,anchor=west,inner sep=3pt] (h\x_2) at ([xshift=3em,yshift=0.66em]cnn\x.east){$h_2$};
 \node[draw,circle,fill=green!20,font=\footnotesize,anchor=south,inner sep=3pt] (h\x_1) at ([yshift=1em]h\x_2.north){$h_1$};
 \node[font=\footnotesize,anchor=north] (h\x_c) at ([yshift=-0.6em]h\x_2.south){$\cdots$};
@@ -36,11 +36,11 @@
 \node[draw,thick,rounded corners=2pt,densely dashed,inner ysep=1.2em,inner xsep=0.4em,label={above:图像特征向量}][fit=(h2_1)(h2_2)(h2_n)](box2){};
 \end{pgfonlayer}

-\node[anchor=west,draw,rounded corners=2pt,fill=blue!20,minimum width=2.3cm,minimum height=1cm] (decoder1)at ([xshift=3em]box1.east){解码器};
+\node[anchor=west,draw,rounded corners=2pt,fill=blue!20,minimum height=2.2em,minimum width=4.3em] (decoder1)at ([xshift=3em]box1.east){解码器};
 \node[anchor=west,draw,circle,inner sep=0pt,minimum size=1.4em] (add)at ([xshift=2em,yshift=1.6em]box2.east){};
 \draw[] (add.0) -- (add.180);
 \draw[] (add.90) -- (add.-90);
-\node[anchor=west,draw,rounded corners=2pt,fill=blue!20,minimum width=2.3cm,minimum height=1cm] (decoder2)at ([xshift=6em]box2.east){解码器};
+\node[anchor=west,draw,rounded corners=2pt,fill=blue!20,minimum height=2.2em,minimum width=4.3em] (decoder2)at ([xshift=6em]box2.east){解码器};


 \draw[->,thick] ([xshift=-2.7em]cnn1.180) -- ([xshift=-0.1em]cnn1.180);

--- a/Chapter17/Figures/figure-modeling-a-global-approach-to-visual-characteristics.tex
+++ b/Chapter17/Figures/figure-modeling-a-global-approach-to-visual-characteristics.tex
@@ -24,13 +24,12 @@
 \draw[-, very thick,fill=black] ([xshift=-0.6em,yshift=-1.2em]B\x)  -- ([xshift=-0.3em,yshift=-1em]B\x) -- ([yshift=-1.2em]B\x) --([xshift=0.3em,yshift=-1em]B\x) -- ([xshift=0.6em,yshift=-1.2em]B\x) -- (D\x) -- (C\x) -- ([xshift=-0.6em,yshift=-1.2em]B\x);
 \draw[-, very thick,fill=black] (E\x) -- ([xshift=0.2em,yshift=0.3em]E\x) -- ([xshift=0.33em]F\x) -- (F\x) -- (E\x);
 \node[circle,inner sep=0pt,minimum size=0.4em,fill=black] at ([xshift=-0.7em,yshift=-0.2em]B\x){};
-\node[draw,rounded corners=2pt,fill=yellow!20,minimum width=2.3cm,minimum height=1cm](cnn\x) at ([xshift=1.8em,yshift=3.6em]A\x){CNN};
+\node[draw,rounded corners=2pt,fill=yellow!20,minimum width=2.3cm,minimum height=2.2em](cnn\x) at ([xshift=1.8em,yshift=3.6em]A\x){CNN};
 }
-
-\node[draw,anchor=south,rounded corners=2pt,minimum width=4.0cm,minimum height=1cm,fill=red!20](encoder) at ([yshift=2.6em,xshift=2.2em]cnn1.north){编码器};
+\node[draw,anchor=south,rounded corners=2pt,minimum width=4.0cm,minimum height=2.2em,fill=red!20](encoder) at ([yshift=2.6em,xshift=2.2em]cnn1.north){编码器};
 \node[anchor=north,font=\Large](x) at ([xshift=2.5em,yshift=-3.4em]encoder.south){$\seq{x}$};

-\node[draw,anchor=south,rounded corners=2pt,minimum width=4.0cm,minimum height=1cm,fill=blue!20](decoder) at ([yshift=2.6em,xshift=2.2em]cnn2.north){解码器};
+\node[draw,anchor=south,rounded corners=2pt,minimum width=4.0cm,minimum height=2.2em,fill=blue!20](decoder) at ([yshift=2.6em,xshift=2.2em]cnn2.north){解码器};
 \node[anchor=north,font=\Large](y) at ([xshift=2.5em,yshift=-3.4em]decoder.south){$\seq{y}$};
 \node[anchor=south,font=\Large](y_1) at ([yshift=3em]decoder.north){$\seq{y}'$};


--- a/Chapter17/Figures/figure-speech-recognition-model-based-on-transformer.tex
+++ b/Chapter17/Figures/figure-speech-recognition-model-based-on-transformer.tex
@@ -17,11 +17,11 @@
 \node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.4em]de_sa.north){Multi-Head \\ Attention};
 \node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};

-\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=2em]de_ffn.north){Softmax};
+\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
 \node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){Output Probabilities};

 \node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FilterBank/MFCC)};
-\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){Transcription\\(Embedding)};
+\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){Transcription\\(Embedding)};

 \node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
 \node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){Position\\(Embedding)};
@@ -40,8 +40,8 @@
 \draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
 \draw[->,rounded corners=2pt] ([yshift=0.1em]en_ffn.90) -- ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
 \begin{pgfonlayer}{background}
-\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt][fit=(en_sa)(en_ffn)](box1){};
-\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt][fit=(de_sa)(de_ca)(de_ffn)](box2){};
+\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(en_sa)(en_ffn)](box1){};
+\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(de_sa)(de_ca)(de_ffn)](box2){};
 \end{pgfonlayer}

 \node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};

--- a/Chapter17/Figures/figure-speech-translation-model-based-on-CTC.tex
+++ b/Chapter17/Figures/figure-speech-translation-model-based-on-CTC.tex
@@ -19,11 +19,11 @@

 \node[layer,anchor=south,fill=blue!20] (en_sf) at ([yshift=3em]en_ffn.north){Softmax};
 \node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=2em]de_ffn.north){Softmax};
-\node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.4em]en_sf.north){CTC输出};
-\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){语音翻译输出};
+\node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.4em]en_sf.north){CTC Output};
+\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){ST Output};

-\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FilterBank/MFCC)};
-\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){目标文本\\(Embedding)};
+\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FilterBank/MFCC)};
+\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){Target Text\\(Embedding)};

 \node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
 \node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){Position\\(Embedding)};
@@ -44,13 +44,13 @@
 \draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
 \draw[->,rounded corners=2pt] ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
 \begin{pgfonlayer}{background}
-\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt][fit=(en_sa)(en_ffn)]{};
-\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt][fit=(de_sa)(de_ca)(de_ffn)]{};
+\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(en_sa)(en_ffn)]{};
+\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(de_sa)(de_ca)(de_ffn)]{};
 \end{pgfonlayer}

 \node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
 \node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
 \node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
-\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){语音翻译\\编码器};
-\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){语音翻译\\解码器};
+\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\Encoder};
+\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\Decoder};
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/Figures/figure-the-encoder-explicitly-incorporates-semantic-information.tex
+++ b/Chapter17/Figures/figure-the-encoder-explicitly-incorporates-semantic-information.tex
@@ -10,24 +10,24 @@
 \node(bank)[word, below of = jump, yshift=-0.75cm, fill=blue!65]{bank};
 \node(sky)[word, below of = bank, yshift=-0.75cm, fill=blue!30]{sky};
 \node(tree)[word, below of = sky, yshift=-0.75cm, fill=blue!15]{tree};
-\node(cir)[circle,very thick, minimum width=0.6cm, xshift=8cm,  draw=black]{};
-\node(decoder)[rectangle, rounded corners, minimum width=2.5cm, minimum height=1.2cm, right of = cir,xshift=3cm, draw=black, fill=blue!25]{\large{解码器}};
+\node(cir)[circle,thick, minimum width=0.6cm, xshift=8cm,  draw=black]{};
+\node(decoder)[rectangle, rounded corners, minimum height=2.2em,minimum width=4.3em, right of = cir,xshift=3cm, draw=black, fill=blue!25]{\large{解码器}};
 \node(yn_1)[below of = decoder,yshift=-2cm,scale=1.2]{$\rm y_{n-1}$};
 \node(yn_2)[above of = decoder,yshift=2cm,scale=1.2]{$\rm y'_{n-1}$(bank)};

-\draw[->, very thick]([xshift=0.1cm]figure.east)to([xshift=2cm]figure.east);
-\draw[-,very thick]([xshift=-0.03cm]cir.east)to([xshift=0.03cm]cir.west);
-\draw[-,very thick]([yshift=0.03cm]cir.south)to([yshift=-0.03cm]cir.north);
-\draw[->, very thick]([xshift=0.1cm]cir.east)to([xshift=-0.1cm]decoder.west);
-\draw[->, very thick](yn_1)to([yshift=-0.1cm]decoder.south);
-\draw[->, very thick]([yshift=0.1cm]decoder.north)to(yn_2);
+\draw[->, thick]([xshift=0.1cm]figure.east)to([xshift=2cm]figure.east);
+\draw[-,thick]([xshift=-0.03cm]cir.east)to([xshift=0.03cm]cir.west);
+\draw[-,thick]([yshift=0.03cm]cir.south)to([yshift=-0.03cm]cir.north);
+\draw[->, thick]([xshift=0.1cm]cir.east)to([xshift=-0.1cm]decoder.west);
+\draw[->, thick](yn_1)to([yshift=-0.1cm]decoder.south);
+\draw[->, thick]([yshift=0.1cm]decoder.north)to(yn_2);

 \draw[->, thick, color=blue!45]([xshift=0.05cm]river.east)to([xshift=-0.05cm]cir.west);
 \draw[->, thick, color=blue!45]([xshift=0.05cm]mountain.east)to([xshift=-0.05cm]cir.west);
 \draw[->, thick, color=blue!15]([xshift=0.05cm]child.east)to([xshift=-0.05cm]cir.west);
 \draw[->, thick, color=blue!25]([xshift=0.05cm]man.east)to([xshift=-0.05cm]cir.west);
 \draw[->, thick, color=blue!30]([xshift=0.05cm]jump.east)to([xshift=-0.05cm]cir.west);
-\draw[->, thick, color=blue!65]([xshift=0.05cm]bank.east)to([xshift=-0.05cm]cir.west);
+\draw[->, very thick, color=blue!65]([xshift=0.05cm]bank.east)to([xshift=-0.05cm]cir.west);
 \draw[->, thick, color=blue!30]([xshift=0.05cm]sky.east)to([xshift=-0.05cm]cir.west);
 \draw[->, thick, color=blue!15]([xshift=0.05cm]tree.east)to([xshift=-0.05cm]cir.west);
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/Figures/figure-three-ways-of-dual-decoder-speech-translation.tex
+++ b/Chapter17/Figures/figure-three-ways-of-dual-decoder-speech-translation.tex
-\tikzstyle{coder} = [rectangle,thick,rounded corners,minimum width=2.3cm,minimum height=1cm,text centered,draw=black!70,fill=red!20]
+\tikzstyle{coder} = [rectangle,thick,rounded corners,minimum height=2.2em,minimum width=4.3em,text centered,draw=black!70,fill=red!20]

 \begin{tikzpicture}[node distance = 0,scale = 0.75]
 \tikzstyle{every node}=[scale=0.75]
@@ -19,7 +19,7 @@
 \node [anchor=south,scale=1.2] (node1) at ([xshift=-2.0em,yshift=6em]decoder_1.north) {{$x,y$：语言数据}};
 \node [anchor=north,scale=1.2] (node2) at ([xshift=0.6em]node1.south){{$s$：语音数据}};
 %%%%%%%%%%%%%%%%%%%%%%%%级联
-\node(encoder-2)[coder]at ([xshift=10.0em]encoder.east){\large{编码器}};
+\node(encoder-2)[coder]at ([xshift=12.0em]encoder.east){\large{编码器}};
 \node(decoder_1-2)[coder,above of =encoder-2,yshift=1.4cm,fill=blue!20]{\large{解码器}};
 \node(decoder_2-2)[coder,above of =decoder_1-2, yshift=1.4cm,fill=yellow!20]{\large{解码器}};
 \node(s-2)[below of = encoder-2,yshift=-1.8cm,scale=1.6]{$s$};

--- a/Chapter17/Figures/figure-traditional-methods-of-image-description.tex
+++ b/Chapter17/Figures/figure-traditional-methods-of-image-description.tex
@@ -17,14 +17,14 @@
 \node(surd-1)[right of = text_3-1, xshift=2cm,scale=1.5]{\textcolor{red}{$\surd$}};
 \node(text_4-1)[description, right of = figure-1, xshift=5.2cm,yshift=-0.9cm,fill=color_blue]{\textcolor{white}{男人戴着眼镜。}};
 \node(point-1)[right of = figure-1, xshift=5cm,yshift=-1.4cm,scale=1.5]{...};
-\draw[->,very thick](figure-1)to([xshift=-0.1cm]ground-1.west);
+\draw[->,thick](figure-1)to([xshift=-0.1cm]ground-1.west);

 \node(figure)[draw=white,scale=0.25]at ([xshift=20.0em]figure-1.east){\includegraphics[width=0.62\textwidth]{./Chapter17/Figures/figure-dog-with-hat.png}};
 \node(ground)[rectangle,rounded corners, minimum width=5cm, minimum height=1.5cm,right of = figure, xshift=5cm,yshift=-2.6em,fill=blue!20]{\large{图片中有\underline{\textcolor{red}{狗}}，\underline{\textcolor{red}{帽子}}，\underline{\quad\ }。}};
 \node(dog)[rectangle,rounded corners, minimum width=1cm, minimum height=0.7cm,right of = figure, xshift=3cm,yshift=1.5cm,thick, draw=color_orange,fill=color_orange!50]{狗};
 \node(hat)[rectangle,rounded corners, minimum width=1.5cm, minimum height=0.7cm,right of = figure, xshift=4.5cm,yshift=1.5cm,thick, draw=color_green,fill=color_green!50]{帽子};
-\draw[->, very thick,color=black!60](figure.east)to([xshift=-0.1cm]dog.west)node[left,xshift=-0.2cm,yshift=-0.1cm,color=black]{图片检测};
-\draw[->, very thick,color=black!60]([yshift=-0.1cm]hat.south)to([yshift=0.1cm]ground.north)node[right,xshift=-0.2cm,yshift=0.5cm,color=black]{模板填充};
+\draw[->, thick,color=black!60](figure.east)to([xshift=-0.1cm]dog.west)node[left,xshift=-0.2cm,yshift=-0.1cm,color=black]{图片检测};
+\draw[->, thick,color=black!60]([yshift=-0.1cm]hat.south)to([yshift=0.1cm]ground.north)node[right,xshift=-0.2cm,yshift=0.5cm,color=black]{模板填充};

 \node [anchor=north](pos1)at ([xshift=-3.8em,yshift=-0.5em]ground-1.south){（a）基于检索的图像描述生成范式};
 \node [anchor=north](pos2)at ([xshift=-3.8em,yshift=-0.5em]ground.south){（b）基于模板的图像描述生成范式};

--- a/Chapter17/Figures/figure-word-lattice.tex
+++ b/Chapter17/Figures/figure-word-lattice.tex
@@ -18,22 +18,22 @@

 \draw[->] (n0.0) -- node[word,above]{of /0.343}(n2.180);
 \draw[->] (n0.60) -- node[word,above,rotate=40]{a /0.499}(n1.-150);
-\draw[->] (n0.-50) -- node[word,above,rotate=-20]{our /0.116}(n3.150);
-\draw[->] (n0.-70) .. controls ([xshift=-8em]n4.180) and ([xshift=-8em]n4.180) .. node[above,word,xshift=3em,yshift=-0.6em]{that /0.039} (n4.180);
-\draw[->] (n4.0) .. node[word,above,xshift=-2em,yshift=-0.4em]{hostage /1} controls ([xshift=5em]n4.0) and ([yshift=-6em]n6.-90) .. (n6.-90);
-\draw[->] (n2.-90) -- node[word,above,rotate=-18,pos=0.55]{house /0.125}(n7.180);
+\draw[->] (n0.-50) -- node[word,above,rotate=-20]{their /0.116}(n3.150);
+\draw[->] (n0.-70) .. controls ([xshift=-8em]n4.180) and ([xshift=-8em]n4.180) .. node[above,word,xshift=3em,yshift=-0.6em]{that /0.042} (n4.180);
+\draw[->] (n4.0) .. node[word,above,xshift=-2em,yshift=-0.4em]{hospital /1} controls ([xshift=5em]n4.0) and ([yshift=-6em]n6.-90) .. (n6.-90);
+\draw[->] (n2.-90) -- node[word,above,rotate=-18,pos=0.55]{house /0.127}(n7.180);
 \draw[->] (n3.-10) node[word,above,xshift=3.6em,yshift=-0.8em]{conference /1} .. controls ([xshift=4.6em,yshift=-1.8em]n3.-10) and ([yshift=-1.6em,xshift=-3em]n10.-135) .. (n10.-135);
 \draw[->] (n7.0) -- node[word,above]{which /1}(n10.180);
-\draw[->] (n2.0) -- node[word,above,pos=0.5]{hostages /0.300}(n6.180);
+\draw[->] (n2.0) -- node[word,above,pos=0.5]{hospital /0.300}(n6.180);
 \draw[->] (n2.45) -- node[word,above,rotate=18,pos=0.3]{a /0.573}(n11.-135);
-\draw[->,rounded corners=1em] (n1.-45) node[word,above,xshift=1.4em,yshift=-1.3em,rotate=-43]{house /0.078} -- ([yshift=-0.4em,xshift=-1em]n11.-90) -- (n7.100);
+\draw[->,rounded corners=1em] (n1.-45) node[word,above,xshift=1.4em,yshift=-1.3em,rotate=-43]{house /0.079} -- ([yshift=-0.4em,xshift=-1em]n11.-90) -- (n7.100);
 \draw[->] (n1.20) node[word,above,xshift=4em]{conference /0.734} .. controls ([xshift=8em]n1.20) and  ([xshift=-0.6em,yshift=2.2em]n5.110) .. (n5.110);
 \draw[->] (n11.0) -- node[word,above]{conference /1}(n5.180);
 \draw[->] (n5.-90) ..node[word,above,xshift=1.4em]{is /0.773} controls ([yshift=-1.6em]n5.-90) and ([xshift=-3em]n6.150]) .. (n6.150);
-\draw[->] (n5.0) node[word, above,xshift=1.4em]{as /0.226}.. controls ([xshift=2.6em]n5.0) and ([xshift=-0.6em,yshift=2em]n6.120) .. (n6.120);
+\draw[->] (n5.0) node[word, above,xshift=1.4em]{as /0.227}.. controls ([xshift=2.6em]n5.0) and ([xshift=-0.6em,yshift=2em]n6.120) .. (n6.120);

 \coordinate (a) at ([xshift=6em,yshift=3em]n1);
-\draw[->] (n1.60) .. controls ([xshift=3em,yshift=2em]n1.60) and ([xshift=-2em]a) .. (a) node[word,above,xshift=1em]{hostage /0.187}.. controls ([xshift=8em]a) and ([xshift=-0.6em,yshift=6em]n6.90) .. (n6.90);
+\draw[->] (n1.60) .. controls ([xshift=3em,yshift=2em]n1.60) and ([xshift=-2em]a) .. (a) node[word,above,xshift=1em]{hospital /0.187}.. controls ([xshift=8em]a) and ([xshift=-0.6em,yshift=6em]n6.90) .. (n6.90);
 \draw[->] (n10.0) -- node[above,word,pos=0.4,rotate=30]{is /1}(n6.-135);
 \draw[->] (n6.0) -- node[above,word,yshift=0.2em]{being /1}(n8.180);
 \draw[->] (n8.0) -- node[above,word,yshift=0.3em]{recorded /1}(n9.180);

--- a/Chapter17/chapter17.tex
+++ b/Chapter17/chapter17.tex
@@ -75,7 +75,7 @@

 \parinterval 经过上面的描述，音频的表示实际上是一个非常长的采样点序列，这导致了直接使用现有的深度学习技术处理音频序列较为困难。并且，原始的音频信号中可能包含着较多的噪声、环境声或冗余信息也会对模型产生干扰。因此，一般会对音频序列进行处理来提取声学特征，具体为将长序列的采样点序列转换为短序列的特征向量序列，再用于下游系统模块。虽然已有一些工作不依赖特征提取，直接在原始的采样点序列上进行声学建模和模型训练\upcite{DBLP:conf/interspeech/SainathWSWV15}，但目前的主流方法仍然是基于声学特征进行建模\upcite{DBLP:conf/icassp/MohamedHP12}。

-\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分，目的是使频谱更加平滑。分帧（原理如图\ref{fig17-2}）是基于短时平稳假设，即根据生物学特征，语音信号是一个缓慢变化的过程，10ms~30ms的信号片段是相对平稳的。基于这个假设，一般将每25ms作为一帧来提取特征，这个时间称为{\small\bfnew{帧长}}\index{帧长}（Frame Length）\index{Frame Length}。同时，为了保证不同帧之间的信号平滑性，使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧，这个时长称为{\small\bfnew{帧移}}\index{帧移}（Frame Shift）\index{Frame Shift}。为了缓解分帧带来的频谱泄漏，对每帧的信号进行加窗处理使其幅度在两段渐变到0，一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}（Hamming）\index{Hamming}。
+\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分，目的是使频谱更加平滑。分帧（原理如图\ref{fig:17-2}）是基于短时平稳假设，即根据生物学特征，语音信号是一个缓慢变化的过程，10ms$\sim$30ms的信号片段是相对平稳的。基于这个假设，一般将每25ms作为一帧来提取特征，这个时间称为{\small\bfnew{帧长}}\index{帧长}（Frame Length）\index{Frame Length}。同时，为了保证不同帧之间的信号平滑性，使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧，这个时长称为{\small\bfnew{帧移}}\index{帧移}（Frame Shift）\index{Frame Shift}。为了缓解分帧带来的频谱泄漏，对每帧的信号进行加窗处理使其幅度在两端渐变到0，一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}（Hamming）\index{Hamming}。
 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
 \centering
@@ -85,7 +85,7 @@
 \end{figure}
 %----------------------------------------------------------------------------------------------------

-\parinterval 经过了上述的预处理操作，可以得到音频对应的帧序列，之后通过不同的操作来提取不同类型的声学特征。常用的声学特征包括{\small\bfnew{Mel频率倒谱系数}}\index{Mel频率倒谱系数}（Mel-Frequency Cepstral Coefficient, MFCC）\index{Mel-Frequency Cepstral Coefficient}、{\small\bfnew{感知线性预测系数}}\index{感知线性预测系数}（Perceptual Lienar Predictive, PLP）\index{Perceptual Lienar Predictive}、{\small\bfnew{滤波器组}}\index{滤波器组}（Filter-bank, Fbank）\index{Filter-bank}等。MFCC、PLP和Fbank特征都需要对预处理后的音频做{\small\bfnew{短时傅里叶变换}}\index{短时傅里叶变换}（Short-time Fourier Tranform, STFT）\index{Short-time Fourier Tranform}，得到具有规律的线性分辨率。之后再经过特定的操作，得到各种声学特征。不同声学特征的特点是不同的，MFCC去相关性较好，PLP抗噪性强，FBank可以保留更多的语音原始特征。在语音翻译中，比较常用的声学特征为FBank或MFCC\upcite{洪青阳2020语音识别原理与应用}。
+\parinterval 经过了上述的预处理操作，可以得到音频对应的帧序列，之后通过不同的操作来提取不同类型的声学特征。常用的声学特征包括{\small\bfnew{Mel频率倒谱系数}}\index{Mel频率倒谱系数}（Mel-Frequency Cepstral Coefficient，MFCC）\index{Mel-Frequency Cepstral Coefficient}、{\small\bfnew{感知线性预测系数}}\index{感知线性预测系数}（Perceptual Linear Predictive，PLP）\index{Perceptual Linear Predictive}、{\small\bfnew{滤波器组}}\index{滤波器组}（Filter-bank，Fbank）\index{Filter-bank}等。MFCC、PLP和Fbank特征都需要对预处理后的音频做{\small\bfnew{短时傅里叶变换}}\index{短时傅里叶变换}（Short-time Fourier Transform，STFT）\index{Short-time Fourier Transform}，得到具有规律的线性分辨率。之后再经过特定的操作，得到各种声学特征。不同声学特征的特点是不同的，MFCC去相关性较好，PLP抗噪性强，FBank可以保留更多的语音原始特征。在语音翻译中，比较常用的声学特征为FBank或MFCC\upcite{洪青阳2020语音识别原理与应用}。

 \parinterval 某种程度上讲，提取到的声学特征可以理解为计算机视觉中的像素特征，或者自然语言处理中的词嵌入表示。不同之处在于，声学特征更加复杂多变，可能存在着较多的噪声和冗余信息。此外，相比对应的文字序列，音频提取到的特征序列长度要大十倍以上。比如，人类正常交流中每秒钟一般可以说2-3个字，而每秒钟的语音可以提取得到100帧的特征序列。巨大的长度比差异也为语音翻译中对声学特征建模带来了困难。

@@ -252,7 +252,7 @@

 %----------------------------------------------------------------------------------------------------

-\parinterval 此外，研究人员们还探索了很多其他方法来提高语音翻译模型的性能。利用在海量的无标注语音数据上预训练的{\small\bfnew{自监督}}\index{自监督}（Self-supervised）\index{Self-supervised}模型作为一个特征提取器，将从语音中提取的特征作为语音翻译模型的输入，可以有效提高模型的性能\upcite{DBLP:conf/interspeech/WuWPG20}。相比语音翻译模型，文本翻译模型任务更加简单，因此一种思想是利用文本翻译模型来指导语音翻译模型，比如通过知识蒸馏\upcite{DBLP:conf/interspeech/LiuXZHWWZ19}、正则化\upcite{DBLP:conf/emnlp/AlinejadS20}等方法。为了简化语音翻译模型的学习，可以通过课程学习的策略，使模型从语音识别任务，逐渐过渡到语音翻译任务，这种由易到难的训练策略可以使模型训练更加充分\upcite{DBLP:journals/corr/abs-1802-06003,DBLP:conf/acl/WangWLZY20}。
+\parinterval 此外，研究人员还探索了很多其他方法来提高语音翻译模型的性能。利用在海量的无标注语音数据上预训练的{\small\bfnew{自监督}}\index{自监督}（Self-supervised）\index{Self-supervised}模型作为一个特征提取器，将从语音中提取的特征作为语音翻译模型的输入，可以有效提高模型的性能\upcite{DBLP:conf/interspeech/WuWPG20}。相比语音翻译模型，文本翻译模型任务更加简单，因此一种思想是利用文本翻译模型来指导语音翻译模型，比如通过知识蒸馏\upcite{DBLP:conf/interspeech/LiuXZHWWZ19}、正则化\upcite{DBLP:conf/emnlp/AlinejadS20}等方法。为了简化语音翻译模型的学习，可以通过课程学习的策略，使模型从语音识别任务，逐渐过渡到语音翻译任务，这种由易到难的训练策略可以使模型训练更加充分\upcite{DBLP:journals/corr/abs-1802-06003,DBLP:conf/acl/WangWLZY20}。

 %----------------------------------------------------------------------------------------
 %    NEW SECTION
@@ -277,7 +277,7 @@

 \subsection{基于图像增强的文本翻译}

-\parinterval 在文本翻译中引入图像信息是最典型的多模态机器翻译任务。虽然多模态机器翻译还是一种从源语言文字到目标语言文字的转换，但是在转换的过程中，融入了其他模态的信息减少了歧义的产生。例如前文提到的通过与源语言相关的图像信息，将“A medium sized  child jumps off of a dusty bank”中“bank”译为“河岸”而不是“银行”，通过给定一张相关的图片，机器翻译模型就可以利用视觉信息更好的理解歧义词，避免产生歧义。换句话说，对于同一图像或者视觉场景的描述，源语言和目标语言描述的本质意义是一致的，只不过，体现在语言上会有表达方法上的差异。那么，图像就会存在一些源语言和目标语言的隐含对齐“约束”，将这种“约束”融入到机器翻译系统，会让模型加深对某些歧义词语上下文的理解，从而进一步提高机器翻译质量。
+\parinterval 在文本翻译中引入图像信息是最典型的多模态机器翻译任务。虽然多模态机器翻译还是一种从源语言文字到目标语言文字的转换，但是在转换的过程中，融入了其他模态的信息减少了歧义的产生。例如前文提到的通过与源语言相关的图像信息，将“A medium sized  child jumps off of a dusty bank”中“bank”翻译为“河岸”而不是“银行”，通过给定一张相关的图片，机器翻译模型就可以利用视觉信息更好的理解歧义词，避免产生歧义。换句话说，对于同一图像或者视觉场景的描述，源语言和目标语言描述的本质意义是一致的，只不过，体现在语言上会有表达方法上的差异。那么，图像就会存在一些源语言和目标语言的隐含对齐“约束”，将这种“约束”融入到机器翻译系统，会让模型加深对某些歧义词语上下文的理解，从而进一步提高机器翻译质量。

 \parinterval WMT机器翻译评测在2016年首次将融合图像和文本的多模态机器翻译作为机器翻译和跨语言图像描述的共享任务\upcite{DBLP:conf/wmt/SpeciaFSE16}，这项任务也受到了广泛的研究\upcite{DBLP:conf/wmt/CaglayanABGBBMH17,DBLP:conf/wmt/LibovickyHTBP16}。如何融入视觉信息，更好的理解多模态上下文语义是多模态机器翻译研究的热点，大体的研究方向包括基于特征融合的方法\upcite{DBLP:conf/emnlp/CalixtoL17,DBLP:journals/corr/abs-1712-03449,DBLP:conf/wmt/HelclLV18}、基于多任务学习的方法\upcite{DBLP:conf/ijcnlp/ElliottK17,DBLP:conf/acl/YinMSZYZL20}。接下来将从这两个方向，对多模态机器翻译的研究展开介绍。

@@ -291,7 +291,7 @@

 \begin{itemize}
    \vspace{0.5em}
-    \item 图像信息不全都是有用的，往往存在一些与源语言或目标语言无关的信息，作为全局特征会引入噪音
+    \item 图像信息不全都是有用的，往往存在一些与源语言或目标语言无关的信息，作为全局特征会引入噪音。
    \vspace{0.5em}
    \item 图像信息作为源语言的一部分或者初始化状态，间接参与目标语言单词的生成，在循环神经网络信息传递的过程中，图像信息会有一定的损失。
    \vspace{0.5em}
@@ -333,7 +333,7 @@

 \noindent 其中，${\alpha}_{i,j}$是注意力权重，它表示目标语言第$j$个位置与图片编码状态序列第$i$个位置的相关性大小，计算方式与{\chapterten}描述的注意力函数一致。

-\parinterval 这里，将每个时间步编码器的输出$\mathbi{h}_{i}$看作源图像序列位置$i$的表示结果。图3说明了模型在生成目标词“man”时，图像经过注意力机制对图像区域关注度的可视化效果，可以看到，经过注意力机制后，模型更注重的是与目标词相关的图像部分。当然，多模态机器翻译的输入还包括源语言文字序列。通常，源语言文字对于翻译的作用比图像更大\upcite{DBLP:conf/acl/YaoW20}。从这个角度说，图像信息更多的是作为文字信息的补充，而不是替代。除此之外，注意力机制在多模态机器翻译中也有很多研究，不仅仅在解码器端将经过注意力机制的文本特征和视觉特征作为解码输入的一部分，还有的工作在编码端将源语言与图像信息进行注意力建模\upcite{DBLP:journals/corr/abs-1712-03449,DBLP:conf/acl/YaoW20}，得到更好的源语言特征表示。
+\parinterval 这里，将每个时间步编码器的输出$\mathbi{h}_{i}$看作源图像序列位置$i$的表示结果。图\ref{fig:17-12}说明了模型在生成目标词“bank”时，图像经过注意力机制对图像区域关注度的可视化效果，可以看到，经过注意力机制后，模型更注重的是与目标词相关的图像部分。当然，多模态机器翻译的输入还包括源语言文字序列。通常，源语言文字对于翻译的作用比图像更大\upcite{DBLP:conf/acl/YaoW20}。从这个角度说，图像信息更多的是作为文字信息的补充，而不是替代。除此之外，注意力机制在多模态机器翻译中也有很多研究，不仅仅在解码器端将经过注意力机制的文本特征和视觉特征作为解码输入的一部分，还有的工作在编码器端将源语言与图像信息进行注意力建模\upcite{DBLP:journals/corr/abs-1712-03449,DBLP:conf/acl/YaoW20}，得到更好的源语言特征表示。

 %----------------------------------------------------------------------------------------
 %    NEW SUBSUB-SECTION
@@ -408,7 +408,7 @@

 \parinterval 要想使编码器-解码器框架在图像描述中充分发挥作用，编码器也要更好的表示图像信息。对于编码器的改进，大多也是从这个方向出发。通常，体现在向编码器中添加图像的语义信息\upcite{DBLP:conf/cvpr/YouJWFL16,DBLP:conf/cvpr/ChenZXNSLC17,DBLP:journals/pami/FuJCSZ17}和位置信息\upcite{DBLP:conf/cvpr/ChenZXNSLC17,DBLP:conf/ijcai/LiuSWWY17}。

-\parinterval 图像的语义信息一般是指图像中存在的实体、属性、场景等等。如图\ref{fig:17-16}所示，从图像中利用属性或实体检测器提取出“child”、“river”、“bank”等等的属性词和实体词作为图像的语义信息，提取全局的图像特征初始化循环神经网络，再利用注意力机制计算目标词与属性词或实体词之间的注意力权重，根据该权重计算上下文向量，从而将编码语义信息送入解码端\upcite{DBLP:conf/cvpr/YouJWFL16}，在解码‘bank’单词时，会更关注图像语义信息中的‘bank’。当然，除了图像中的实体和属性作为语义信息外，也可以将图片的场景信息也加入到编码器当中\upcite{DBLP:journals/pami/FuJCSZ17}。有关如何做属性、实体和场景的检测，涉及到目标检测任务的工作，例如Faster-RCNN\upcite{DBLP:journals/pami/RenHG017}、YOLO\upcite{DBLP:journals/corr/abs-1804-02767,DBLP:journals/corr/abs-2004-10934}等等,这里不过多赘述。
+\parinterval 图像的语义信息一般是指图像中存在的实体、属性、场景等等。如图\ref{fig:17-16}所示，从图像中利用属性或实体检测器提取出“child”、“river”、“bank”等等的属性词和实体词作为图像的语义信息，提取全局的图像特征初始化循环神经网络，再利用注意力机制计算目标词与属性词或实体词之间的注意力权重，根据该权重计算上下文向量，从而将编码语义信息送入解码器端\upcite{DBLP:conf/cvpr/YouJWFL16}，在解码“bank”单词时，会更关注图像语义信息中的“bank”。当然，除了图像中的实体和属性作为语义信息外，也可以将图片的场景信息加入到编码器当中\upcite{DBLP:journals/pami/FuJCSZ17}。有关如何做属性、实体和场景的检测，涉及到目标检测任务的工作，例如Faster-RCNN\upcite{DBLP:journals/pami/RenHG017}、YOLO\upcite{DBLP:journals/corr/abs-1804-02767,DBLP:journals/corr/abs-2004-10934}等等，这里不过多赘述。

 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
@@ -419,7 +419,7 @@
 \end{figure}
 %----------------------------------------------------------------------------------------------------

-\parinterval 以上的方法大都是将图像中的实体、属性、场景等映射到文字上，并把这些信息显式地添加到编码器端。令一种方式，把图像中的语义特征隐式地作用到编码器端\upcite{DBLP:conf/cvpr/ChenZXNSLC17}。例如，可以图像数据可以分解为三个通道（红、绿、蓝），简单来说，就是将图像的每一个像素点按照红色、绿色、蓝色分成三个部分，这样就将图像分成了三个通道。在很多图像中，不同通道随伴随的特征是不一样的，可以将其作用于编码器端。另一种方法是基于位置信息的编码器增强。位置信息指的是图像中对象（物体）的位置。利用目标检测技术检测系统获得图中的对象和对应的特征，这样就确定了图中的对象位置。显然，这些信息也可以加入到编码端，以加强编码器的表示能力\upcite{DBLP:conf/eccv/YaoPLM18}。
+\parinterval 以上的方法大都是将图像中的实体、属性、场景等映射到文字上，并把这些信息显式地添加到编码器端。另一种方式是把图像中的语义特征隐式地作用到编码器端\upcite{DBLP:conf/cvpr/ChenZXNSLC17}。例如，图像数据可以分解为三个通道（红、绿、蓝），简单来说，就是将图像的每一个像素点按照红色、绿色、蓝色分成三个部分，这样就将图像分成了三个通道。在很多图像中，不同通道伴随的特征是不一样的，可以将其作用于编码器端。另一种方法是基于位置信息的编码器增强。位置信息指的是图像中对象（物体）的位置。利用目标检测技术检测系统获得图中的对象和对应的特征，这样就确定了图中的对象位置。显然，这些信息也可以加入到编码器端，以加强编码器的表示能力\upcite{DBLP:conf/eccv/YaoPLM18}。

 %----------------------------------------------------------------------------------------
 %    NEW SUBSUB-SECTION
@@ -427,8 +427,9 @@

 \subsubsection{3. 解码器的改进}

-\parinterval 由于解码器输出的是语言文字序列，因此需要考虑语言的特点对其进行改进。 例如，解码过程中， “the”,“on”，“at”这种介词或者冠词与图像的相关性较低，这时图像信息的引入就会产生负面影响\upcite{DBLP:conf/cvpr/LuXPS17}。因此，可以通过门等结构，控制视觉信号作用于文字生成的程度。另外,在解码过程中，生成的每个单词对应着图像的区域可能是不同的。因此也可以设计更为有效的注意力机制来捕捉解码端对不同图像局部信息的关注程度\upcite{DBLP:conf/cvpr/00010BT0GZ18}。 
-\parinterval 除了在解码端更好的使生成文本与图像特征相互作用以外，还有一些其他的解码器端改进的方向。例如：用其它结构（如卷积神经网络或者Transformer）代替解码器端循环神经网络\upcite{DBLP:conf/cvpr/AnejaDS18}。或者使用更深层的神经网络学习动词或者名词等视觉中不易表现出来的单词\upcite{DBLP:journals/mta/FangWCT18}，其思想与深层神经机器翻译模型有相通之处（{\chapterfifteen}）。
+\parinterval 由于解码器输出的是语言文字序列，因此需要考虑语言的特点对其进行改进。例如，解码过程中，“the”、“on”、“at”这种介词或者冠词与图像的相关性较低，这时图像信息的引入就会产生负面影响\upcite{DBLP:conf/cvpr/LuXPS17}。因此，可以通过门等结构，控制视觉信号作用于文字生成的程度。另外，在解码过程中，生成的每个单词对应着图像的区域可能是不同的。因此也可以设计更为有效的注意力机制来捕捉解码器端对不同图像局部信息的关注程度\upcite{DBLP:conf/cvpr/00010BT0GZ18}。
+
+\parinterval 除了在解码器端更好的使生成文本与图像特征相互作用以外，还有一些其他的解码器端改进的方向。例如：用其它结构（如卷积神经网络或者Transformer）代替解码器端循环神经网络\upcite{DBLP:conf/cvpr/AnejaDS18}。或者使用更深层的神经网络学习动词或者名词等视觉中不易表现出来的单词\upcite{DBLP:journals/mta/FangWCT18}，其思想与深层神经机器翻译模型有相通之处（{\chapterfifteen}）。

 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
@@ -440,7 +441,7 @@

 \parinterval 计算机视觉领域，图像风格转移、图像语义分割、图像超分辨率等任务，都可以被视为{\small\bfnew{图像到图像的翻译}}\index{图像到图像的翻译}（Image-to-Image Translation）\index{Image-to-Image Translation}问题。与机器翻译类似，这些问题的共同目标是学习从一个对象到另一个对象的映射，只不过这里的对象是指图像，而非机器翻译中的文字。例如，给定物体的轮廓生成真实物体照片或者给定白天照片生成夜晚的照片等。图像到图像的翻译有广阔的应用场景，如图片补全、风格迁移等。

-\parinterval 对抗神经网络被广泛地应用再图像到图像的翻译任务当中\upcite{DBLP:conf/nips/GoodfellowPMXWOCB14,DBLP:conf/nips/ZhuZPDEWS17,DBLP:journals/corr/abs-1908-06616}。实际上，这类方法非常适合图像生成类的任务。简单来说，对抗生成网络包括两个部分分别是：生成器和判别器。基于输入生成器生成一个结果，而判别器要判别生成的结果和真实结果是否是相同的，对抗的思想是，通过强化生成器的生成能力和判别器的判别能力，当生成器生成的结果可以“骗”过判别器时，即判别器无法分清真实结果和生成结果，认为模型学到了这种映射关系。在图像到图像的翻译中，根据输入图像，生成器生成预测图像，判别器判别是否为目标图像，多次迭代后，生成图像被判别为目标图像时，则模型学习到了“翻译能力”。以上的工作都是有监督的，即基于对齐的图像对数据集，但是，这种数据的标注是极为费时费力的，所以有很多的工作也基于无监督的方法展开\upcite{DBLP:conf/iccv/ZhuPIE17,DBLP:conf/iccv/YiZTG17,DBLP:conf/nips/LiuBK17}，这里不过多赘述。
+\parinterval 对抗神经网络被广泛地应用在图像到图像的翻译任务当中\upcite{DBLP:conf/nips/GoodfellowPMXWOCB14,DBLP:conf/nips/ZhuZPDEWS17,DBLP:journals/corr/abs-1908-06616}。实际上，这类方法非常适合图像生成类的任务。简单来说，对抗生成网络包括两个部分分别是：生成器和判别器。基于输入生成器生成一个结果，而判别器要判别生成的结果和真实结果是否是相同的，对抗的思想是，通过强化生成器的生成能力和判别器的判别能力，当生成器生成的结果可以“骗”过判别器时，即判别器无法分清真实结果和生成结果，认为模型学到了这种映射关系。在图像到图像的翻译中，根据输入图像，生成器生成预测图像，判别器判别是否为目标图像，多次迭代后，生成图像被判别为目标图像时，则模型学习到了“翻译能力”。以上的工作都是有监督的，即基于对齐的图像对数据集，但是，这种数据的标注是极为费时费力的，所以有很多的工作也基于无监督的方法展开\upcite{DBLP:conf/iccv/ZhuPIE17,DBLP:conf/iccv/YiZTG17,DBLP:conf/nips/LiuBK17}，这里不过多赘述。

 \parinterval {\small\bfnew{文本到图像的翻译}}\index{文本到图像的翻译}（Text-to-Image Translation）\index{Text-to-Image Translation}是指给定描述物体颜色和形状等细节的一段自然语言文字，生成对应的图像。该任务也可以看作是图像描述任务的逆任务。目前方法上大部分基于对抗神经网络\upcite{DBLP:conf/icml/ReedAYLSL16,DBLP:journals/corr/DashGALA17,DBLP:conf/nips/ReedAMTSL16}。基本流程为：首先利用自然语言处理技术提取出文本信息，然后再用文本特征作为后面生成图像的约束，对抗神经网络中的生成器（Generator）根据文本特征这一约束生成图像，再由鉴别器（Discriminator）鉴定其生成效果。