update 17

3863f6c3 · 曹润柘 · 7d622fd1 · 3863f6c3 · 3863f6c3 · 3863f6c3
Commit 3863f6c3 authored Dec 27, 2020 by 曹润柘
--- a/Chapter17/Figures/figure-an-end-to-end-voice-translation-model-based-on-transformer.tex
+++ b/Chapter17/Figures/figure-an-end-to-end-voice-translation-model-based-on-transformer.tex
@@ -18,9 +18,9 @@
 \node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};

 \node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
-\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){STLoss};
+%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){STLoss};

-\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FilterBank/MFCC)};
+\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FBank/MFCC)};
 \node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){Target Text\\(Embedding)};

 \node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
@@ -35,7 +35,7 @@
 \draw[->] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_ca.-90);
 \draw[->] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_ffn.-90);
 \draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
-\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=-0.1em]output.-90);
+\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
 \draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
 \draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
 \draw[->,rounded corners=2pt] ([yshift=0.1em]en_ffn.90) -- ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);

--- a/Chapter17/Figures/figure-examples-of-CTC-predictive-word-sequences.tex
+++ b/Chapter17/Figures/figure-examples-of-CTC-predictive-word-sequences.tex
@@ -14,7 +14,7 @@
 \node[anchor=west,minimum width=1.2em,minimum height=2.2em] (w8) at ([xshift=0.2em]w7.east){{l}};
 \node[anchor=west,minimum width=1.2em,minimum height=2.2em] (w9) at ([xshift=0.2em]w8.east){{o}};
 \node[anchor=west,minimum width=1.2em,minimum height=2.2em] (w10) at ([xshift=0.2em]w9.east){{o}};
-\node[anchor=west,minimum width=1.2em,minimum height=2.2em] (w11) at ([xshift=0.2em]w10.east){{!}};
+\node[anchor=west,minimum width=1.2em,minimum height=2.2em] (w11) at ([xshift=0.2em]w10.east){{}};

 \draw[very thick] (w1.south west) -- (w1.south east);
 \draw[very thick] (w2.south west) -- (w2.south east);
@@ -24,7 +24,7 @@
 \draw[very thick] (w8.south west) -- (w8.south east);
 \draw[very thick] (w9.south west) -- (w9.south east);
 \draw[very thick] (w10.south west) -- (w10.south east);
-\draw[very thick] (w11.south west) -- (w11.south east);
+%\draw[very thick] (w11.south west) -- (w11.south east);

 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (m1) at ([yshift=-1em]w1.south){{h}};
 \node[anchor=north,minimum width=2.64em,minimum height=1.4em,fill=gray!30] (m2) at ([yshift=-1em,xshift=0.72em]w2.south){{e}};
@@ -33,14 +33,14 @@
 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (m5) at ([yshift=-1em]w6.south){};
 \node[anchor=north,minimum width=2.64em,minimum height=1.4em,fill=gray!30] (m6) at ([yshift=-1em,xshift=0.72em]w7.south){{l}};
 \node[anchor=north,minimum width=2.64em,minimum height=1.4em,fill=gray!30] (m7) at ([yshift=-1em,xshift=0.72em]w9.south){{o}};
-\node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (m8) at ([yshift=-1em]w11.south){{!}};
+\node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (m8) at ([yshift=-1em]w11.south){};

 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (o1) at ([yshift=-3.8em]w1.south){{h}};
 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (o2) at ([yshift=-3.8em]w2.south){{e}};
 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (o3) at ([yshift=-3.8em]w3.south){{l}};
 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (o4) at ([yshift=-3.8em]w4.south){{l}};
 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (o5) at ([yshift=-3.8em]w5.south){{o}};
-\node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (o6) at ([yshift=-3.8em]w6.south){{!}};
+\node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] (o6) at ([yshift=-3.8em]w6.south){};
 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] at ([yshift=-3.8em]w7.south){};
 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] at ([yshift=-3.8em]w8.south){};
 \node[anchor=north,minimum width=1.2em,minimum height=1.4em,fill=gray!30] at ([yshift=-3.8em]w9.south){};
@@ -52,7 +52,7 @@
 \draw[blue!40,fill=blue!30,opacity=0.7] (w5.south west) -- (w5.south east) -- (m4.south east) .. controls ([yshift=-0.3em]m4.south east) and ([yshift=0.3em]o3.north east) .. (o3.north east) -- (o3.south east) -- (o3.south west) -- (o3.north west) .. controls ([yshift=0.3em]o3.north west) and ([yshift=-0.3em]m4.south west) .. (m4.south west) -- (w5.south west);
 \draw[blue!40,fill=blue!30,opacity=0.7] (w7.south west) -- (w8.south east) -- (m6.south east) .. controls ([yshift=-0.3em]m6.south east) and ([yshift=0.3em]o4.north east) .. (o4.north east) -- (o4.south east) -- (o4.south west) -- (o4.north west) .. controls ([yshift=0.3em]o4.north west) and ([yshift=-0.3em]m6.south west) .. (m6.south west) -- (w7.south west);
 \draw[blue!40,fill=blue!30,opacity=0.7] (w9.south west) -- (w10.south east) -- (m7.south east) .. controls ([yshift=-0.1em]m7.south east) and ([yshift=0.2em]o5.north east) .. (o5.north east) -- (o5.south east) -- (o5.south west) -- (o5.north west) .. controls ([yshift=0.1em]o5.north west) and ([yshift=-0.5em]m7.south west) .. (m7.south west) -- (w9.south west);
-\draw[blue!40,fill=blue!30,opacity=0.7] (w11.south west) -- (w11.south east) -- (m8.south east) .. controls ([yshift=-0.4em]m8.south east) and ([yshift=0.1em]o6.north east) .. (o6.north east) -- (o6.south east) -- (o6.south west) -- (o6.north west) .. controls ([yshift=0.1em]o6.north west) and ([yshift=-0.5em]m8.south west) .. (m8.south west) -- (w11.south west);
+%\draw[blue!40,fill=blue!30,opacity=0.7] (w11.south west) -- (w11.south east) -- (m8.south east) .. controls ([yshift=-0.4em]m8.south east) and ([yshift=0.1em]o6.north east) .. (o6.north east) -- (o6.south east) -- (o6.south west) -- (o6.north west) .. controls ([yshift=0.1em]o6.north west) and ([yshift=-0.5em]m8.south west) .. (m8.south west) -- (w11.south west);

 \node[anchor=north,font=\scriptsize,align=center] (a2) at  ([yshift=-1.4em]a1.south) {预测字母序列};
 \node[anchor=north,font=\scriptsize,align=center] (a3) at  ([yshift=-1.8em]a2.south) {合并重复字母 \\ 并丢弃$\epsilon$};

--- a/Chapter17/Figures/figure-speech-recognition-model-based-on-transformer.tex
+++ b/Chapter17/Figures/figure-speech-recognition-model-based-on-transformer.tex
@@ -18,9 +18,9 @@
 \node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};

 \node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
-\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){Output Probabilities};
+%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){Output Probabilities};

-\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FilterBank/MFCC)};
+\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FBank/MFCC)};
 \node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){Transcription\\(Embedding)};

 \node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
@@ -35,7 +35,7 @@
 \draw[->] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_ca.-90);
 \draw[->] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_ffn.-90);
 \draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
-\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=-0.1em]output.-90);
+\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
 \draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
 \draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
 \draw[->,rounded corners=2pt] ([yshift=0.1em]en_ffn.90) -- ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);

--- a/Chapter17/Figures/figure-speech-translation-model-based-on-CTC.tex
+++ b/Chapter17/Figures/figure-speech-translation-model-based-on-CTC.tex
@@ -20,9 +20,9 @@
 \node[layer,anchor=south,fill=blue!20] (en_sf) at ([yshift=3em]en_ffn.north){Softmax};
 \node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=2em]de_ffn.north){Softmax};
 \node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.4em]en_sf.north){CTC Output};
-\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){ST Output};
+%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){ST Output};

-\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FilterBank/MFCC)};
+\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FBank/MFCC)};
 \node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){Target Text\\(Embedding)};

 \node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
@@ -39,7 +39,7 @@
 \draw[->] ([yshift=0.1em]en_ffn.90) -- ([yshift=-0.1em]en_sf.-90);
 \draw[->] ([yshift=0.1em]en_sf.90) -- ([yshift=-0.1em]en_output.-90);
 \draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
-\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=-0.1em]output.-90);
+\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
 \draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
 \draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
 \draw[->,rounded corners=2pt] ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);

--- a/Chapter17/Figures/figure-three-ways-of-dual-decoder-speech-translation.tex
+++ b/Chapter17/Figures/figure-three-ways-of-dual-decoder-speech-translation.tex
@@ -6,9 +6,9 @@
 \node(encoder)[coder]at (0,0){\large{编码器}};
 \node(decoder_1)[coder,above of =encoder,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
 \node(decoder_2)[coder,above of =encoder, xshift=1.6cm,yshift=2.8cm,fill=yellow!20]{\large{解码器}};
-\node(s)[below of = encoder,yshift=-1.8cm,scale=1.6]{$s$};
-\node(x)[above of = decoder_1,yshift=1.8cm,scale=1.6]{$x$};
-\node(y)[above of = decoder_2,yshift=1.8cm,scale=1.6]{$y$};
+\node(s)[below of = encoder,yshift=-1.8cm,scale=1.2]{$s$};
+\node(x)[above of = decoder_1,yshift=1.8cm,scale=1.2]{$x$};
+\node(y)[above of = decoder_2,yshift=1.8cm,scale=1.2]{$y$};

 \draw[->,thick](s.north)to(encoder.south);
 \draw[->,thick](decoder_1.north)to(x.south);
@@ -22,8 +22,8 @@
 \node(encoder-2)[coder]at ([xshift=12.0em]encoder.east){\large{编码器}};
 \node(decoder_1-2)[coder,above of =encoder-2,yshift=1.4cm,fill=blue!20]{\large{解码器}};
 \node(decoder_2-2)[coder,above of =decoder_1-2, yshift=1.4cm,fill=yellow!20]{\large{解码器}};
-\node(s-2)[below of = encoder-2,yshift=-1.8cm,scale=1.6]{$s$};
-\node(y-2)[above of = decoder_2-2,yshift=1.8cm,scale=1.6]{$y$};
+\node(s-2)[below of = encoder-2,yshift=-1.8cm,scale=1.2]{$s$};
+\node(y-2)[above of = decoder_2-2,yshift=1.8cm,scale=1.2]{$y$};

 \draw[->,thick](s-2.north)to(encoder-2.south);
 \draw[->,thick](encoder-2.north)to(decoder_1-2.south);
@@ -34,8 +34,8 @@
 \node(encoder-3)[coder]at([xshift=10.0em]encoder-2.east){\large{编码器}};
 \node(decoder_1-3)[coder,above of =encoder-3,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
 \node(decoder_2-3)[coder,above of =encoder-3, xshift=1.6cm,yshift=2.8cm,fill=yellow!20]{\large{解码器}};
-\node(s-3)[below of = encoder-3,yshift=-1.8cm,scale=1.6]{$s$};
-\node(y-3)[above of = decoder_2-3,yshift=1.8cm,scale=1.6]{$y$};
+\node(s-3)[below of = encoder-3,yshift=-1.8cm,scale=1.2]{$s$};
+\node(y-3)[above of = decoder_2-3,yshift=1.8cm,scale=1.2]{$y$};

 \draw[->,thick](s-3.north)to(encoder-3.south);
 \draw[->,thick](decoder_1-3.east)to(decoder_2-3.west);

--- a/Chapter17/Figures/figure-word-lattice.tex
+++ b/Chapter17/Figures/figure-word-lattice.tex
@@ -16,25 +16,25 @@
 \node[anchor=north,node] (n4) at ([xshift=6em,yshift=-1.6em]n3.south){4};


-\draw[->] (n0.0) -- node[word,above]{of /0.343}(n2.180);
-\draw[->] (n0.60) -- node[word,above,rotate=40]{a /0.499}(n1.-150);
-\draw[->] (n0.-50) -- node[word,above,rotate=-20]{their /0.116}(n3.150);
-\draw[->] (n0.-70) .. controls ([xshift=-8em]n4.180) and ([xshift=-8em]n4.180) .. node[above,word,xshift=3em,yshift=-0.6em]{that /0.042} (n4.180);
-\draw[->] (n4.0) .. node[word,above,xshift=-2em,yshift=-0.4em]{hospital /1} controls ([xshift=5em]n4.0) and ([yshift=-6em]n6.-90) .. (n6.-90);
-\draw[->] (n2.-90) -- node[word,above,rotate=-18,pos=0.55]{house /0.127}(n7.180);
-\draw[->] (n3.-10) node[word,above,xshift=3.6em,yshift=-0.8em]{conference /1} .. controls ([xshift=4.6em,yshift=-1.8em]n3.-10) and ([yshift=-1.6em,xshift=-3em]n10.-135) .. (n10.-135);
+\draw[->] (n0.0) -- node[word,above]{for /0.227}(n2.180);
+\draw[->] (n0.60) -- node[word,above,rotate=40]{a /0.628}(n1.-150);
+\draw[->] (n0.-50) -- node[word,above,rotate=-20]{our /0.103}(n3.150);
+\draw[->] (n0.-70) .. controls ([xshift=-8em]n4.180) and ([xshift=-8em]n4.180) .. node[above,word,xshift=3em,yshift=-0.6em]{this /0.042} (n4.180);
+\draw[->] (n4.0) .. node[word,above,xshift=-2em,yshift=-0.4em]{video /1} controls ([xshift=5em]n4.0) and ([yshift=-6em]n6.-90) .. (n6.-90);
+\draw[->] (n2.-90) -- node[word,above,rotate=-18,pos=0.55]{movie /0.127}(n7.180);
+\draw[->] (n3.-10) node[word,above,xshift=3.6em,yshift=-0.8em]{book /1} .. controls ([xshift=4.6em,yshift=-1.8em]n3.-10) and ([yshift=-1.6em,xshift=-3em]n10.-135) .. (n10.-135);
 \draw[->] (n7.0) -- node[word,above]{which /1}(n10.180);
-\draw[->] (n2.0) -- node[word,above,pos=0.5]{hospital /0.300}(n6.180);
+\draw[->] (n2.0) -- node[word,above,pos=0.5]{video /0.300}(n6.180);
 \draw[->] (n2.45) -- node[word,above,rotate=18,pos=0.3]{a /0.573}(n11.-135);
-\draw[->,rounded corners=1em] (n1.-45) node[word,above,xshift=1.4em,yshift=-1.3em,rotate=-43]{house /0.079} -- ([yshift=-0.4em,xshift=-1em]n11.-90) -- (n7.100);
-\draw[->] (n1.20) node[word,above,xshift=4em]{conference /0.734} .. controls ([xshift=8em]n1.20) and  ([xshift=-0.6em,yshift=2.2em]n5.110) .. (n5.110);
-\draw[->] (n11.0) -- node[word,above]{conference /1}(n5.180);
-\draw[->] (n5.-90) ..node[word,above,xshift=1.4em]{is /0.773} controls ([yshift=-1.6em]n5.-90) and ([xshift=-3em]n6.150]) .. (n6.150);
-\draw[->] (n5.0) node[word, above,xshift=1.4em]{as /0.227}.. controls ([xshift=2.6em]n5.0) and ([xshift=-0.6em,yshift=2em]n6.120) .. (n6.120);
+\draw[->,rounded corners=1em] (n1.-45) node[word,above,xshift=1.4em,yshift=-1.3em,rotate=-43]{movie /0.187} -- ([yshift=-0.4em,xshift=-1em]n11.-90) -- (n7.100);
+\draw[->] (n1.20) node[word,above,xshift=4em]{book /0.520} .. controls ([xshift=8em]n1.20) and  ([xshift=-0.6em,yshift=2.2em]n5.110) .. (n5.110);
+\draw[->] (n11.0) -- node[word,above]{book /1}(n5.180);
+\draw[->] (n5.-90) ..node[word,above,xshift=1.4em]{is /0.822} controls ([yshift=-1.6em]n5.-90) and ([xshift=-3em]n6.150) .. (n6.150);
+\draw[->] (n5.0) node[word, above,xshift=1.4em]{as /0.178}.. controls ([xshift=2.6em]n5.0) and ([xshift=-0.6em,yshift=2em]n6.120) .. (n6.120);

 \coordinate (a) at ([xshift=6em,yshift=3em]n1);
-\draw[->] (n1.60) .. controls ([xshift=3em,yshift=2em]n1.60) and ([xshift=-2em]a) .. (a) node[word,above,xshift=1em]{hospital /0.187}.. controls ([xshift=8em]a) and ([xshift=-0.6em,yshift=6em]n6.90) .. (n6.90);
+\draw[->] (n1.60) .. controls ([xshift=3em,yshift=2em]n1.60) and ([xshift=-2em]a) .. (a) node[word,above,xshift=1em]{video /0.293}.. controls ([xshift=8em]a) and ([xshift=-0.6em,yshift=6em]n6.90) .. (n6.90);
 \draw[->] (n10.0) -- node[above,word,pos=0.4,rotate=30]{is /1}(n6.-135);
 \draw[->] (n6.0) -- node[above,word,yshift=0.2em]{being /1}(n8.180);
-\draw[->] (n8.0) -- node[above,word,yshift=0.3em]{recorded /1}(n9.180);
+\draw[->] (n8.0) -- node[above,word,yshift=0.3em]{written /1}(n9.180);
 \end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/chapter17.tex
+++ b/Chapter17/chapter17.tex
@@ -54,7 +54,7 @@
 %----------------------------------------------------------------------------------------
 \section{语音翻译}

-\parinterval 语音，是人类日常生活与交流中最常用的一种信息载体。从日常聊天、国际旅游，到国际会议、跨国合作，对于语言进行翻译的需求不断增加。甚至在有些场景下，用语音进行交互要比用文本进行交互频繁的多。因此，{\small\bfnew{语音翻译}}\index{语音翻译}（Speech Translation）\index{Speech Translation}也成为了语音处理和机器翻译相结合的重要产物。根据目标语言的载体类型，可以将语音翻译分为{\small\bfnew{语音到文本翻译}}\index{语音到文本翻译}（Speech-to-Text Translation）\index{Speech-to-Text Translation}和{\small\bfnew{语音到语音翻译}}（Speech-to-Speech Translation）\index{Speech-to-Speech Translation}；基于翻译的实时性，还可以分为{\small\bfnew{实时语音翻译}}\index{实时语音翻译}（即同声传译，Simultaneous Translation）\index{Simultaneous Translation}和{\small\bfnew{离线语音翻译}}（Offline speech translation）\index{Offline speech translation}。本节主要关注离线语音到文本翻译方法（简称为语音翻译），分别从音频处理、级联语音翻译和端到端语音翻译进行介绍。
+\parinterval 语音，是人类日常生活与交流中最常用的一种信息载体。从日常聊天、国际旅游，到国际会议、跨国合作，对于语言进行翻译的需求不断增加。甚至在有些场景下，用语音进行交互要比用文本进行交互频繁得多。因此，{\small\bfnew{语音翻译}}\index{语音翻译}（Speech Translation）\index{Speech Translation}也成为了语音处理和机器翻译相结合的重要产物。根据目标语言的载体类型，可以将语音翻译分为{\small\bfnew{语音到文本翻译}}\index{语音到文本翻译}（Speech-to-Text Translation）\index{Speech-to-Text Translation}和{\small\bfnew{语音到语音翻译}}\index{语音到语音翻译}（Speech-to-Speech Translation）\index{Speech-to-Speech Translation}；基于翻译的实时性，还可以分为{\small\bfnew{实时语音翻译}}\index{实时语音翻译}（即同声传译，Simultaneous Translation）\index{Simultaneous Translation}和{\small\bfnew{离线语音翻译}}（Offline Speech Translation）\index{离线语音翻译}\index{Offline Speech Translation}。本节主要关注离线语音到文本翻译方法（简称为语音翻译），分别从音频处理、级联语音翻译和端到端语音翻译进行介绍。

 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
@@ -62,7 +62,7 @@

 \subsection{音频处理}

-\parinterval 不同于文本，音频本质上是经过若干信号处理之后的{\small\bfnew{波形}}（Waveform）\index{Waveform}。具体来说，声音是一种空气的震动，因此可以被转换为模拟信号。模拟信号是一段连续的信号，经过采样变为离散数字信号。采样是每隔固定的时间记录一下声音的振幅，采样率表示每秒的采样点数，单位是赫兹（Hz）。采样率越高，结果的损失则越小。通常来说，采样的标准是能够通过离散化的数字信号重现原始语音。我们日常生活中使用的手机和电脑设备的采样率一般为16kHz，表示每秒16000个采样点；而音频CD的采样率可以达到44.1kHz。经过进一步的量化，将采样点的值转换为整型数值保存，从而减少占用的存储空间，通常采用的是16位量化。将采样率和量化位数相乘，就可以得到{\small\bfnew{比特率}}\index{比特率}（Bits Per Second，BPS）\index{Bits Per Second}，表示音频每秒占用的位数。16kHz采样率和16位量化的音频，比特率为256kb/s。整体流程如图\ref{fig:17-2}所示\upcite{洪青阳2020语音识别原理与应用,陈果果2020语音识别实战}。
+\parinterval 不同于文本，音频本质上是经过若干信号处理之后的{\small\bfnew{波形}}（Waveform）\index{Waveform}。具体来说，声音是一种空气的震动，因此可以被转换为模拟信号。模拟信号是一段连续的信号，经过采样变为离散数字信号。采样是每隔固定的时间记录一下声音的振幅，采样率表示每秒的采样点数，单位是赫兹（Hz）。采样率越高，结果的损失则越小。通常来说，采样的标准是能够通过离散化的数字信号重现原始语音。我们日常生活中使用的手机和电脑设备的采样率一般为16kHz，表示每秒16000个采样点；而音频CD的采样率可以达到44.1kHz。经过进一步的量化，将采样点的值转换为整型数值保存，从而减少占用的存储空间，通常采用的是16位量化。将采样率和量化位数相乘，就可以得到{\small\bfnew{比特率}}\index{比特率}（Bits Per Second，BPS）\index{Bits Per Second}，表示音频每秒占用的位数。例如，16kHz采样率和16位量化的音频，比特率为256kb/s。整体流程如图\ref{fig:17-2}所示\upcite{洪青阳2020语音识别原理与应用,陈果果2020语音识别实战}。

 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
@@ -73,9 +73,9 @@
 \end{figure}
 %----------------------------------------------------------------------------------------------------

-\parinterval 经过上面的描述，音频的表示实际上是一个非常长的采样点序列，这导致了直接使用现有的深度学习技术处理音频序列较为困难。并且，原始的音频信号中可能包含着较多的噪声、环境声或冗余信息也会对模型产生干扰。因此，一般会对音频序列进行处理来提取声学特征，具体为将长序列的采样点序列转换为短序列的特征向量序列，再用于下游系统模块。虽然已有一些工作不依赖特征提取，直接在原始的采样点序列上进行声学建模和模型训练\upcite{DBLP:conf/interspeech/SainathWSWV15}，但目前的主流方法仍然是基于声学特征进行建模\upcite{DBLP:conf/icassp/MohamedHP12}。
+\parinterval 经过上面的描述可以看出，音频的表示实际上是一个非常长的采样点序列，这导致了直接使用现有的深度学习技术处理音频序列较为困难。并且，原始的音频信号中可能包含着较多的噪声、环境声或冗余信息，也会对模型产生干扰。因此，一般会对音频序列进行处理来提取声学特征，具体为将长序列的采样点序列转换为短序列的特征向量序列，再用于下游系统模块。虽然已有一些工作不依赖特征提取，直接在原始的采样点序列上进行声学建模和模型训练\upcite{DBLP:conf/interspeech/SainathWSWV15}，但目前的主流方法仍然是基于声学特征进行建模\upcite{DBLP:conf/icassp/MohamedHP12}。

-\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分，目的是使频谱更加平滑。分帧（原理如图\ref{fig:17-3}）是基于短时平稳假设，即根据生物学特征，语音信号是一个缓慢变化的过程，10ms~30ms的信号片段是相对平稳的。基于这个假设，一般将每25ms作为一帧来提取特征，这个时间称为{\small\bfnew{帧长}}\index{帧长}（Frame Length）\index{Frame Length}。同时，为了保证不同帧之间的信号平滑性，使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧，这个时长称为{\small\bfnew{帧移}}\index{帧移}（Frame Shift）\index{Frame Shift}。为了缓解分帧带来的频谱泄漏，对每帧的信号进行加窗处理使其幅度在两段渐变到0，一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}（Hamming）\index{Hamming}。
+\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分，目的是使频谱更加平滑。分帧（原理如图\ref{fig:17-3}）是基于短时平稳假设，即根据生物学特征，语音信号是一个缓慢变化的过程，10ms$\thicksim$30ms的信号片段是相对平稳的。基于这个假设，一般将每25ms作为一帧来提取特征，这个时间称为{\small\bfnew{帧长}}\index{帧长}（Frame Length）\index{Frame Length}。同时，为了保证不同帧之间的信号平滑性，使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧，这个时长称为{\small\bfnew{帧移}}\index{帧移}（Frame Shift）\index{Frame Shift}。为了缓解分帧带来的频谱泄漏，对每帧的信号进行加窗处理使其幅度在两端渐变到0，一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}（Hamming）\index{Hamming}。
 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
 \centering
@@ -110,7 +110,7 @@

 \parinterval 传统的语音识别模型和统计机器翻译相似，需要利用声学模型、语言模型和发音词典联合进行识别，系统较为复杂\upcite{DBLP:journals/ftsig/GalesY07,DBLP:journals/taslp/MohamedDH12,DBLP:journals/spm/X12a}。而近些年来，随着神经网络的发展，基于神经网络的端到端语音识别模型逐渐成为主流，大大简化了训练流程\upcite{DBLP:conf/nips/ChorowskiBSCB15,DBLP:conf/icassp/ChanJLV16}。目前的端到端语音识别模型主要基于序列到序列结构，编码器根据输入的声学特征进一步提取高级特征，解码器根据编码器提取的特征识别对应的文本。在后文中即将介绍的端到端语音翻译模型也是使用十分相似的结构。因此，从某种意义上说，语音识别和翻译的端到端方法与神经机器翻译是一致的。

-\parinterval 语音识别目前广泛使用基于Transformer的模型结构（见{\chaptertwelve}），如图\ref{fig:17-5}所示。可以看出，相比文本翻译，模型结构上唯一的区别在于编码器的输入为声学特征，以及编码器底层会使用额外的卷积层来减小输入序列的长度，从而降低长序列带来的显存占用以及建模困难。通过大量的语音-标注平行数据对模型进行训练，可以得到高质量的语音识别模型。
+\parinterval 语音识别目前广泛使用基于Transformer的模型结构（见{\chaptertwelve}），如图\ref{fig:17-5}所示。可以看出，相比文本翻译，模型结构上唯一的区别在于编码器的输入为声学特征，以及编码器底层会使用额外的卷积层来减小输入序列的长度，从而降低长序列带来的显存占用以及建模困难。通过大量的语音-标注平行数据对模型进行训练，可以得到高质量的语音识别模型。由于语音对应的特征序列过长，在计算Attention的时候，会占用大量的内存/显存，从而降低计算效率，过长的序列也会增加模型训练的难度。因此，通常会先对语音特征做一个下采样，缩小语音的序列长度。目前一个常用的做法，是在输入的语音特征上进行两层步长为2的卷积操作，从而将输入序列的长度缩小为之前的1/4。

 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
@@ -151,7 +151,7 @@
    \vspace{0.5em}
    \item 翻译效率问题。由于语音识别模型和文本标注模型只能串行地计算，翻译效率相对较低，而实际很多场景中都需要达到低延时的翻译。
    \vspace{0.5em}
-    \item 语音中的副语言信息丢失。将语音识别为文本的过程中，语音中包含的语气、情感、音调等信息会丢失，而同一句话在不同的语气中表达的意思很可能是不同的，导致翻译出现偏差。
+    \item 语音中的副语言信息丢失。将语音识别为文本的过程中，语音中包含的语气、情感、音调等信息会丢失，而同一句话在不同的语气中表达的意思很可能是不同的，导致翻译出现偏差。尤其是在实际使用时，由于ASR的识别结果通常并不包含标点，还需要额外的后处理模型将标点还原，也会带来额外的计算代价。
    \vspace{0.5em}
 \end{itemize}
 %----------------------------------------------------------------------------------------------------
@@ -170,7 +170,7 @@
 \end{itemize}
 %----------------------------------------------------------------------------------------------------

-\parinterval 因此，端到端模型收到了研究人员的关注。目前比较火热的，基于Transformer的语音翻译模型架构如图\ref{fig:17-7}所示（下文中语音翻译模型均指端到端的模型）。该模型采用的也是序列到序列架构，编码器的输入是从语音中提取的特征（比如FBank特征）。由于语音对应的特征序列过长，在计算Attention的时候，会占用大量的内存/显存，从而降低计算效率，过长的序列也会增加模型训练的难度。因此，通常会先对语音特征做一个下采样，缩小语音的序列长度。目前一个常用的做法，是在输入的语音特征上进行两层步长为2的卷积操作，从而将输入序列的长度缩小为之前的1/4。之后的流程和标准的机器翻译是完全一致的，编码器对语音特征进行编码，解码器根据编码表示生成目标语言的翻译结果。
+\parinterval 因此，端到端模型受到了研究人员的关注。目前比较火热的，基于Transformer的语音翻译模型架构如图\ref{fig:17-7}所示（下文中语音翻译模型均指端到端的模型）。该模型采用的也是序列到序列架构，编码器的输入是从语音中提取的特征（比如FBank特征）。编码器底层采用和ASR模型相同的卷积结构来降低序列的长度。之后的流程和标准的机器翻译是完全一致的，编码器对语音特征进行编码，解码器根据编码表示生成目标语言的翻译结果。

 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
@@ -214,7 +214,7 @@
 %----------------------------------------------------------------------------------------------------
 \begin{itemize}
    \vspace{0.5em}
-    \item 输入输出之间的对齐是单调的。也就是后面的输入只会预测与前面的序列相同或后面的输出内容。比如对于上面的例子，如果输入的位置t已经预测了字符r，那么t之后的位置不会再预测前面的字符w和o。
+    \item 输入输出之间的对齐是单调的。也就是后面的输入只会预测与前面的序列相同或后面的输出内容。比如对于上面的例子，如果输入的位置t已经预测了字符l，那么t之后的位置不会再预测前面的字符h和e。
    \vspace{0.5em}
    \item 输入和输出之间是多对一的关系。也就是多个输入会对应到同一个输出上。这对于语音序列来说是非常自然的一件事情，由于输入的每个位置只包含非常短的语音特征，因此多个输入才可以对应到一个输出字符。
    \vspace{0.5em}
@@ -677,6 +677,8 @@ D_i&\subseteq&\{X_{-i},Y_{-i}\} \label{eq:17-3-2}

 \parinterval 本章仅对音频处理和语音识别进行了简单的介绍，具体内容可以参考一些经典书籍，比如关于信号处理的基础知识\upcite{Oppenheim2001DiscretetimeSP,Quatieri2001DiscreteTimeSS}，以及语音识别的传统方法\upcite{DBLP:books/daglib/0071550,Huang2001SpokenLP}和基于深度学习的最新方法\upcite{benesty2008automatic}。此外，语音翻译的一个重要应用是机器同声传译。

+\parinterval 同声传译是指在说话人陈述过程中，实时输出语音对应的文本翻译结果。在演讲、会议、聊天等场景，同声传译可以极大地提高交流效率。同声传译主要的难点在于不同语言的句法顺序不同。比如，“普京7月在赫尔辛基与特朗普会晤”这句话对应的翻译结果为“Putin meets Trump in Helsinki in July”，只有当最后一个词“会晤”说出来时，翻译结果中的第二个词“meets”才能够被正确翻译。这个问题导致了同声传译模型需要在翻译性能和实时性之间进行取舍。目前，同声传译的一种思路是基于目前已经说出的语音进行翻译\upcite{DBLP:conf/acl/MaHXZLZZHLLWW19}，一种方式是设定固定等待源语K个词语，然后再进行翻译，同时改进束搜索方式来预测未来的词序列，从而提升准确度\upcite{DBLP:conf/emnlp/ZhengMZH19}。另一种方式是对当前语音进行翻译，但需要判断翻译的词是否能够作为最终结果。如果是则不需要重新解码，可以将确定的词作为之后解码端的输入，否则将会根据之后的语音重新进行解码\upcite{DBLP:conf/naacl/DalviDSV18,DBLP:journals/corr/ChoE16}。第二种思路是动态预测当前时刻是应该继续等待还是开始翻译，这种方式更符合人类进行同传的思路。但是这种策略的难点在于标注每一时刻的决策状态十分耗时且标准难以统一，目前主流的方式是利用强化学习方法\upcite{DBLP:conf/eacl/NeubigCGL17,DBLP:conf/emnlp/GrissomHBMD14}，对句子进行不同决策方案采样，最终学到最优的决策方案。此外，还有一些工作设计不同的学习策略\upcite{DBLP:conf/acl/ZhengLZMLH20,DBLP:conf/emnlp/ZhengZMH19,DBLP:conf/acl/ZhengZMH19}或改进注意力机制\upcite{DBLP:conf/acl/ArivazhaganCMCY19}以提升同声传译的性能。
+
 \parinterval 在篇章级翻译方面，一些研究工作对这类模型的上下文建模能力进行了探索\upcite{DBLP:conf/discomt/KimTN19,DBLP:conf/acl/LiLWJXZLL20}，发现模型性能在小数据集上的BLEU提升并不完全来自于上下文信息的利用。同时，受限于数据规模，篇章级翻译模型相对难以训练。一些研究人员通过调整训练策略来帮助模型更容易捕获上下文信息\upcite{DBLP:journals/corr/abs-1903-04715,DBLP:conf/acl/SaundersSB20,DBLP:conf/mtsummit/StojanovskiF19}。除了训练策略的调整，也可以使用数据增强\upcite{DBLP:conf/discomt/SugiyamaY19}和预训练\upcite{DBLP:journals/corr/abs-1911-03110,DBLP:journals/tacl/LiuGGLEGLZ20}的手段来缓解数据稀缺的问题。此外，区别于传统的篇章级翻译，一些对话翻译也需要使用长距离上下文信息\upcite{DBLP:conf/wmt/MarufMH18}。

 \parinterval 最近，多模态机器翻译、图像描述、视觉问答\upcite{DBLP:conf/iccv/AntolALMBZP15}（Visual Question Answering）等多模态任务受到人工智能领域的广泛关注。如何将多个模态的信息充分融合，是研究多模态任务的重要问题。Transformer框架\upcite{vaswani2017attention}在自然语言处理领域被提出后，也被应用到计算机视觉\upcite{DBLP:conf/eccv/CarionMSUKZ20}和多模态任务\upcite{DBLP:conf/acl/YaoW20,DBLP:journals/tcsv/YuLYH20,Huasong2020SelfAdaptiveNM}中，效果也有显著的提升。另外，数据稀缺是多模态任务受限之处，可以采取数据增强\upcite{DBLP:conf/emnlp/GokhaleBBY20,DBLP:conf/eccv/Tang0ZWY20}的方式缓解。但是，这时仍需要回答：在模型没有充分训练时，图像等模态信息究竟在翻译里发挥了多少作用？类似的问题在篇章级机器翻译中也存在，上下文模型在训练数据量很小的时候对翻译的作用十分微弱\upcite{DBLP:conf/acl/LiLWJXZLL20}。因此，也有必要探究究竟图像等上下文信息如何可以更有效地发挥作用。此外，受到预训练模型的启发，在多模态领域，图像和文本联合预训练\upcite{DBLP:conf/eccv/Li0LZHZWH0WCG20,DBLP:conf/aaai/ZhouPZHCG20,DBLP:conf/iclr/SuZCLLWD20}的工作也相继开展，利用Transformer框架，通过自注意力机制捕捉图像和文本的隐藏对齐，提升模型性能，同时缓解数据稀缺问题。

--- a/bibliography.bib
+++ b/bibliography.bib
@@ -10966,8 +10966,6 @@ author    = {Zhuang Liu and
  publisher   = {CoRR},
  year      = {2020}
 }
-
-
 @inproceedings{DBLP:journals/corr/abs200403672,
  author    = {Zi-Yi Dou and
               Antonios Anastasopoulos and
@@ -10977,7 +10975,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/emnlp/WuZHGQLL19,
  author    = {Lijun Wu and
               Jinhua Zhu and
@@ -10998,7 +10995,6 @@ author    = {Zhuang Liu and
  publisher   = {CoRR},
  year      = {2019}
 }
-
 @inproceedings{DBLP:journals/corr/abs-2002-06823,
  author    = {Jinhua Zhu and
               Yingce Xia and
@@ -11044,7 +11040,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Machine Learning},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/nips/HeXQWYLM16,
  author    = {Di He and
               Yingce Xia and
@@ -11384,7 +11379,6 @@ author    = {Zhuang Liu and
  publisher = {AAAI Conference on Artificial Intelligence},
  year      = {2020}
 }
-
 @inproceedings{DBLP:journals/corr/abs-2001-08210,
  author    = {Yinhan Liu and
               Jiatao Gu and
@@ -11400,8 +11394,6 @@ author    = {Zhuang Liu and
  pages     = {726--742},
  year      = {2020}
 }
-
-
 @inproceedings{DBLP:conf/aaai/JiZDZCL20,
  author    = {Baijun Ji and
               Zhirui Zhang and
@@ -11816,7 +11808,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1503.02531},
  year      = {2015}
 }
-
 @inproceedings{gu2018meta,
  author    = {Jiatao Gu and
               Yong Wang and
@@ -11828,7 +11819,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/naacl/GuHDL18,
  author    = {Jiatao Gu and
               Hany Hassan and
@@ -11905,8 +11895,6 @@ author    = {Zhuang Liu and
  publisher = {OpenReview.net},
  year      = {2019}
 }
-
-
 @inproceedings{platanios2018contextual,
  author    = {Emmanouil Antonios Platanios and
               Mrinmaya Sachan and
@@ -12109,8 +12097,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2019}
 }
-
-
 @inproceedings{firat2016zero,
  author    = {Orhan Firat and
               Baskaran Sankaran and
@@ -12122,7 +12108,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2016}
 }
-
 @inproceedings{DBLP:journals/corr/abs-1805-10338,
  author    = {Lierni Sestorain and
               Massimiliano Ciaramita and
@@ -12209,9 +12194,7 @@ author    = {Zhuang Liu and
               Yoshua Bengio and
               Pierre-Antoine Manzagol},
  title     = {Extracting and composing robust features with denoising autoencoders},
-  series    = {International Conference on Learning Representations},
-  volume    = {307},
-  pages     = {1096--1103},
+  year      = {2008},
  publisher = {International Conference on Machine Learning}
 }
 @inproceedings{DBLP:conf/iclr/LampleCDR18,
@@ -12335,8 +12318,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-
-
 @inproceedings{ng2019facebook,
  author    = {Nathan Ng and
               Kyra Yee and
@@ -12349,7 +12330,6 @@ author    = {Zhuang Liu and
  publisher = {Association for Computational Linguistics},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/wmt/WangLLJZLLXZ18,
  author    = {Qiang Wang and
               Bei Li and
@@ -12396,8 +12376,6 @@ author    = {Zhuang Liu and
  publisher = {Conference and Workshop on Neural Information Processing Systems},
  year      = {2015}
 }
-
-
 @inproceedings{DBLP:journals/corr/abs-1802-05365,
  author    = {Matthew E. Peters and
               Mark Neumann and
@@ -12411,7 +12389,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/icml/CollobertW08,
  author    = {Ronan Collobert and
               Jason Weston},
@@ -12499,7 +12476,6 @@ author    = {Zhuang Liu and
  publisher = {Springer},
  year      = {1998}
 }
-
 @inproceedings{liu2019multi,
  author    = {Xiaodong Liu and
               Pengcheng He and
@@ -12510,7 +12486,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-
 @inproceedings{DBLP:journals/corr/LuongLSVK15,
  author    = {Minh-Thang Luong and
               Quoc V. Le and
@@ -13000,8 +12975,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2019}
 }
-
-
 @inproceedings{barone2017regularization,
  author    = {Antonio Valerio Miceli Barone and
               Barry Haddow and
@@ -13012,7 +12985,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/acl/SaundersB20,
  author    = {Danielle Saunders and
               Bill Byrne},
@@ -13270,7 +13242,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/nips/YangDYCSL19,
  author    = {Zhilin Yang and
               Zihang Dai and
@@ -13420,8 +13391,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Machine Learning},
  year      = {2018}
 }
-
-
 @inproceedings{DBLP:conf/nips/HeXQWYLM16,
  author    = {Di He and
               Yingce Xia and
@@ -13441,10 +13410,8 @@ author    = {Zhuang Liu and
  journal={arXiv preprint arXiv:2005.08238},
  year={2020}
 }
-
 %%%%% chapter 16------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%% chapter 17------------------------------------------------------
 @article{DBLP:journals/ac/Bar-Hillel60,
@@ -14029,7 +13996,6 @@ author    = {Zhuang Liu and
  publisher = {International Symposium on Computer Architecture},
  year      = {2015}
 }
-
 @inproceedings{DBLP:conf/icassp/MohamedHP12,
  author    = {Abdel-rahman Mohamed and
               Geoffrey E. Hinton and
@@ -14039,7 +14005,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  year      = {2012}
 }
-
 @article{DBLP:journals/ftsig/GalesY07,
  author    = {Mark J. F. Gales and
               Steve J. Young},
@@ -14050,7 +14015,6 @@ author    = {Zhuang Liu and
  pages     = {195--304},
  year      = {2007}
 }
-
 @article{DBLP:journals/taslp/MohamedDH12,
  author    = {Abdel-rahman Mohamed and
               George E. Dahl and
@@ -14062,7 +14026,6 @@ author    = {Zhuang Liu and
  pages     = {14--22},
  year      = {2012}
 }
-
 @article{DBLP:journals/spm/X12a,
  title     = {Deep Neural Networks for Acoustic Modeling in Speech Recognition:
               The Shared Views of Four Research Groups},
@@ -14072,7 +14035,6 @@ author    = {Zhuang Liu and
  pages     = {82--97},
  year      = {2012}
 }
-
 @inproceedings{DBLP:conf/nips/ChorowskiBSCB15,
  author    = {Jan Chorowski and
               Dzmitry Bahdanau and
@@ -14084,7 +14046,6 @@ author    = {Zhuang Liu and
  pages     = {577--585},
  year      = {2015}
 }
-
 @inproceedings{DBLP:conf/icassp/ChanJLV16,
  author    = {William Chan and
               Navdeep Jaitly and
@@ -14096,7 +14057,6 @@ author    = {Zhuang Liu and
  pages     = {4960--4964},
  year      = {2016}
 }
-
 @inproceedings{DBLP:conf/acl/ZhangGCF19,
  author    = {Pei Zhang and
               Niyu Ge and
@@ -14107,7 +14067,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/acl/SperberNPW19,
  author    = {Matthias Sperber and
               Graham Neubig and
@@ -14118,7 +14077,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/acl/LiuTMCZ18,
  author    = {Yong Cheng and
               Zhaopeng Tu and
@@ -14130,7 +14088,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/naacl/DuongACBC16,
  author    = {Long Duong and
               Antonios Anastasopoulos and
@@ -14142,7 +14099,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2016}
 }
-
 @article{DBLP:journals/corr/BerardPSB16,
  author    = {Alexandre Berard and
               Olivier Pietquin and
@@ -14154,7 +14110,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1612.01744},
  year      = {2016}
 }
-
 @inproceedings{DBLP:conf/interspeech/WeissCJWC17,
  author    = {Ron J. Weiss and
               Jan Chorowski and
@@ -14166,7 +14121,6 @@ author    = {Zhuang Liu and
  publisher = {International Symposium on Computer Architecture},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/mtsummit/GangiNCDT19,
  author    = {Mattia Antonino Di Gangi and
               Matteo Negri and
@@ -14178,7 +14132,6 @@ author    = {Zhuang Liu and
  publisher = {European Association for Machine Translation},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/icml/GravesFGS06,
  author    = {Alex Graves and
               Santiago Fern{\'{a}}ndez and
@@ -14191,7 +14144,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Machine Learning},
  year      = {2006}
 }
-
 @article{DBLP:journals/jstsp/WatanabeHKHH17,
  author    = {Shinji Watanabe and
               Takaaki Hori and
@@ -14205,7 +14157,6 @@ author    = {Zhuang Liu and
  pages     = {1240--1253},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/icassp/KimHW17,
  author    = {Suyoun Kim and
               Takaaki Hori and
@@ -14216,7 +14167,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  year      = {2017}
 }
-
 @article{DBLP:journals/pami/ShiBY17,
  author    = {Baoguang Shi and
               Xiang Bai and
@@ -14229,7 +14179,6 @@ author    = {Zhuang Liu and
  pages     = {2298--2304},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/naacl/AnastasopoulosC18,
  author    = {Antonios Anastasopoulos and
               David Chiang},
@@ -14238,7 +14187,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/asru/BaharBN19,
  author    = {Parnia Bahar and
               Tobias Bieschke and
@@ -14248,7 +14196,6 @@ author    = {Zhuang Liu and
  publisher = {	IEEE Automatic Speech Recognition and Understanding Workshop},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/naacl/BansalKLLG19,
  author    = {Sameer Bansal and
               Herman Kamper and
@@ -14261,7 +14208,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/icassp/BerardBKP18,
  author    = {Alexandre Berard and
               Laurent Besacier and
@@ -14272,7 +14218,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/icassp/JiaJMWCCALW19,
  author    = {Ye Jia and
               Melvin Johnson and
@@ -14289,7 +14234,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/interspeech/WuWPG20,
  author    = {Anne Wu and
               Changhan Wang and
@@ -14300,7 +14244,6 @@ author    = {Zhuang Liu and
  publisher = {International Symposium on Computer Architecture},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/interspeech/LiuXZHWWZ19,
  author    = {Yuchen Liu and
               Hao Xiong and
@@ -14314,7 +14257,6 @@ author    = {Zhuang Liu and
  publisher = {International Symposium on Computer Architecture},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/emnlp/AlinejadS20,
  author    = {Ashkan Alinejad and
               Anoop Sarkar},
@@ -14324,7 +14266,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-
 @article{DBLP:journals/corr/abs-1802-06003,
  author    = {Takatomo Kano and
               Sakriani Sakti and
@@ -14335,7 +14276,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1802.06003},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/acl/WangWLZY20,
  author    = {Chengyi Wang and
               Yu Wu and
@@ -14347,7 +14287,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-
 @book{DBLP:books/daglib/0071550,
  author    = {Lawrence R. Rabiner and
               Biing-Hwang Juang},
@@ -14356,42 +14295,36 @@ author    = {Zhuang Liu and
  publisher = {Prentice Hall},
  year      = {1993}
 }
-
 @book{benesty2008automatic,
  title={Automatic Speech Recognition: a Deep Learning Approach},
  author={Dong Yu,Li Deng},
  year={2008},
  publisher={Springer}
 }
-
 @book{Huang2001SpokenLP,
  title={Spoken Language Processing: A Guide to Theory, Algorithm and System Development},
  author={Xuedong Huang and Alex Acero and Hsiao-Wuen Hon},
  year={2001},
  publisher={Prentice Hall PTR}
 }
-
 @book{Quatieri2001DiscreteTimeSS,
  title={Discrete-Time Speech Signal Processing: Principles and Practice},
  author={Thomas F. Quatieri},
  year={2001},
  publisher={Prentice Hall PTR}
 }
-
 @inproceedings{Oppenheim2001DiscretetimeSP,
  title={Discrete-time Signal Processing},
  author={Alan V. Oppenheim and Ronald W. Schafer},
  year={2009},
  publisher={Pearson}
 }
-
 @book{洪青阳2020语音识别原理与应用,
  title={语音识别：原理与应用},
  author={洪青阳,李琳},
  publisher={电子工业出版社},
  year={2020}
 }
-
 @book{陈果果2020语音识别实战,
  title={Kaldi语音识别实战},
  author={陈果果 and 都家宇 and 那兴宇 and 张俊博},
@@ -14413,7 +14346,6 @@ author    = {Zhuang Liu and
  publisher = {	ACM Multimedia},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/wmt/SpeciaFSE16,
  author    = {Lucia Specia and
               Stella Frank and
@@ -14425,7 +14357,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
-
 @inproceedings{DBLP:conf/wmt/ElliottFBBS17,
  author    = {Desmond Elliott and
               Stella Frank and
@@ -14438,7 +14369,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/wmt/BarraultBSLEF18,
  author    = {Lo{\"{\i}}c Barrault and
               Fethi Bougares and
@@ -14451,7 +14381,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/wmt/CaglayanABGBBMH17,
  author    = {Ozan Caglayan and
               Walid Aransa and
@@ -14467,7 +14396,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/wmt/LibovickyHTBP16,
  author    = {Jindrich Libovick{\'{y}} and
               Jindrich Helcl and
@@ -14480,7 +14408,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
-
 @inproceedings{DBLP:conf/emnlp/CalixtoL17,
  author    = {Iacer Calixto and
               Qun Liu},
@@ -14490,7 +14417,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/wmt/HuangLSOD16,
  author    = {Po-Yao Huang and
               Frederick Liu and
@@ -14502,7 +14428,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
-
 @article{Elliott2015MultilingualID,
  title={Multilingual Image Description with Neural Sequence Models},
  author={Desmond Elliott and 
@@ -14511,7 +14436,6 @@ author    = {Zhuang Liu and
  journal={arXiv: Computation and Language},
  year={2015}
 }
-
 @inproceedings{DBLP:conf/wmt/MadhyasthaWS17,
  author    = {Pranava Swaroop Madhyastha and
               Josiah Wang and
@@ -14522,7 +14446,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-
 @article{DBLP:journals/corr/CaglayanBB16,
  author    = {Ozan Caglayan and
               Lo{\"{\i}}c Barrault and
@@ -14532,7 +14455,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1609.03976},
  year      = {2016}
 }
-
 @inproceedings{DBLP:conf/acl/CalixtoLC17,
  author    = {Iacer Calixto and
               Qun Liu and
@@ -14542,7 +14464,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-
 @article{DBLP:journals/corr/DelbrouckD17,
  author    = {Jean-Benoit Delbrouck and
               St{\'{e}}phane Dupont},
@@ -14552,7 +14473,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1703.08084},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/acl/LibovickyH17,
  author    = {Jindrich Libovick{\'{y}} and
               Jindrich Helcl},
@@ -14561,7 +14481,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2017}
 }
-
 @article{DBLP:journals/corr/abs-1712-03449,
  author    = {Jean-Benoit Delbrouck and
               St{\'{e}}phane Dupont},
@@ -14571,7 +14490,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1712.03449},
  year      = {2017}
 }
-
 @article{DBLP:journals/corr/abs-1807-11605,
  author    = {Hasan Sait Arslan and
               Mark Fishel and
@@ -14581,7 +14499,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1807.11605},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/wmt/HelclLV18,
  author    = {Jindrich Helcl and
               Jindrich Libovick{\'{y}} and
@@ -14591,7 +14508,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/ijcnlp/ElliottK17,
  author    = {Desmond Elliott and
               {\'{A}}kos K{\'{a}}d{\'{a}}r},
@@ -14600,7 +14516,6 @@ author    = {Zhuang Liu and
  publisher = {International Joint Conference on Natural Language Processing},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/emnlp/ZhouCLY18,
  author    = {Mingyang Zhou and
               Runxiang Cheng and
@@ -14611,7 +14526,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/acl/CalixtoRA19,
  author    = {Iacer Calixto and
               Miguel Rios and
@@ -14621,7 +14535,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/acl/YinMSZYZL20,
  author    = {Yongjing Yin and
               Fandong Meng and
@@ -14636,7 +14549,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/acl/YaoW20,
  author    = {Shaowei Yao and
               Xiaojun Wan},
@@ -14645,7 +14557,6 @@ author    = {Zhuang Liu and
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/nips/LuYBP16,
  author    = {Jiasen Lu and
               Jianwei Yang and
@@ -14656,7 +14567,6 @@ author    = {Zhuang Liu and
  pages     = {289--297},
  year      = {2016}
 }
-
 @inproceedings{DBLP:conf/cvpr/VinyalsTBE15,
  author    = {Oriol Vinyals and
               Alexander Toshev and
@@ -14667,7 +14577,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2015}
 }
-
 @inproceedings{DBLP:conf/icml/XuBKCCSZB15,
  author    = {Kelvin Xu and
               Jimmy Ba and
@@ -14684,7 +14593,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Machine Learning},
  year      = {2015}
 }
-
 @inproceedings{DBLP:conf/cvpr/YouJWFL16,
  author    = {Quanzeng You and
               Hailin Jin and
@@ -14696,7 +14604,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2016}
 }
-
 @inproceedings{DBLP:conf/cvpr/ChenZXNSLC17,
  author    = {Long Chen and
               Hanwang Zhang and
@@ -14711,7 +14618,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2017}
 }
-
 @article{DBLP:journals/pami/FuJCSZ17,
  author    = {Kun Fu and
               Junqi Jin and
@@ -14726,7 +14632,6 @@ author    = {Zhuang Liu and
  pages     = {2321--2334},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/eccv/YaoPLM18,
  author    = {Ting Yao and
               Yingwei Pan and
@@ -14739,7 +14644,6 @@ author    = {Zhuang Liu and
  publisher = {European Conference on Computer Vision},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/ijcai/LiuSWWY17,
  author    = {Chang Liu and
               Fuchun Sun and
@@ -14751,7 +14655,6 @@ author    = {Zhuang Liu and
  publisher = {International Joint Conference on Artificial Intelligence},
  year      = {2017}
 }
-
 @article{DBLP:journals/corr/abs-1804-02767,
  author    = {Joseph Redmon and
               Ali Farhadi},
@@ -14760,7 +14663,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1804.02767},
  year      = {2018}
 }
-
 @article{DBLP:journals/corr/abs-2004-10934,
  author    = {Alexey Bochkovskiy and
               Chien-Yao Wang and
@@ -14770,7 +14672,6 @@ author    = {Zhuang Liu and
  volume    = {abs/2004.10934},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/cvpr/LuXPS17,
  author    = {Jiasen Lu and
               Caiming Xiong and
@@ -14782,7 +14683,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/cvpr/00010BT0GZ18,
  author    = {Peter Anderson and
               Xiaodong He and
@@ -14797,7 +14697,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/mm/ZhouXKC17,
  author    = {Luowei Zhou and
               Chenliang Xu and
@@ -14808,7 +14707,6 @@ author    = {Zhuang Liu and
  publisher = {ACM Multimedia},
  year      = {2017}
 }
-
 @article{DBLP:journals/mta/FangWCT18,
  author    = {Fang Fang and
               Hanli Wang and
@@ -14821,7 +14719,6 @@ author    = {Zhuang Liu and
  pages     = {31159--31175},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/cvpr/AnejaDS18,
  author    = {Jyoti Aneja and
               Aditya Deshpande and
@@ -14831,7 +14728,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2018}
 }
-
 @article{DBLP:journals/corr/abs-1805-09019,
  author    = {Qingzhong Wang and
               Antoni B. Chan},
@@ -14840,7 +14736,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1805.09019},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/eccv/DaiYL18,
  author    = {Bo Dai and
               Deming Ye and
@@ -14851,7 +14746,6 @@ author    = {Zhuang Liu and
  publisher = {European Conference on Computer Vision},
  year      = {2018}
 }
-
 @inproceedings{DBLP:conf/iccv/AntolALMBZP15,
  author    = {Stanislaw Antol and
               Aishwarya Agrawal and
@@ -14865,7 +14759,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Computer Vision},
  year      = {2015}
 }
-
 @inproceedings{DBLP:conf/eccv/CarionMSUKZ20,
  author    = {Nicolas Carion and
               Francisco Massa and
@@ -14879,7 +14772,6 @@ author    = {Zhuang Liu and
  publisher = {European Conference on Computer Vision},
  year      = {2020}
 }
-
 @article{DBLP:journals/tcsv/YuLYH20,
  author    = {Jun Yu and
               Jing Li and
@@ -14893,7 +14785,6 @@ author    = {Zhuang Liu and
  pages     = {4467--4480},
  year      = {2020}
 }
-
 @article{Huasong2020SelfAdaptiveNM,
  title={Self-Adaptive Neural Module Transformer for Visual Question Answering},
  author={Zhong Huasong and Jingyuan Chen and Chen Shen and Hanwang Zhang and Jianqiang Huang and Xian-Sheng Hua},
@@ -14901,7 +14792,6 @@ author    = {Zhuang Liu and
  year={2020},
  pages={1-1}
 }
-
 @inproceedings{DBLP:conf/emnlp/GokhaleBBY20,
  author    = {Tejas Gokhale and
               Pratyay Banerjee and
@@ -14913,7 +14803,6 @@ author    = {Zhuang Liu and
  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/eccv/Tang0ZWY20,
  author    = {Ruixue Tang and
               Chao Ma and
@@ -14927,7 +14816,6 @@ author    = {Zhuang Liu and
  publisher = {	European Conference on Computer Vision},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/eccv/Li0LZHZWH0WCG20,
  author    = {Xiujun Li and
               Xi Yin and
@@ -14947,7 +14835,6 @@ author    = {Zhuang Liu and
  publisher = {	European Conference on Computer Vision},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/aaai/ZhouPZHCG20,
  author    = {Luowei Zhou and
               Hamid Palangi and
@@ -14960,7 +14847,6 @@ author    = {Zhuang Liu and
  publisher = {AAAI Conference on Artificial Intelligence},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/iclr/SuZCLLWD20,
  author    = {Weijie Su and
               Xizhou Zhu and
@@ -14973,7 +14859,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Learning Representations},
  year      = {2020}
 }
-
 @inproceedings{DBLP:conf/nips/GoodfellowPMXWOCB14,
  author    = {Ian J. Goodfellow and
               Jean Pouget-Abadie and
@@ -14988,7 +14873,6 @@ author    = {Zhuang Liu and
  pages     = {2672--2680},
  year      = {2014}
 }
-
 @inproceedings{DBLP:conf/nips/ZhuZPDEWS17,
  author    = {Jun-Yan Zhu and
               Richard Zhang and
@@ -15002,7 +14886,6 @@ author    = {Zhuang Liu and
  pages     = {465--476},
  year      = {2017}
 }
-
 @article{DBLP:journals/corr/abs-1908-06616,
  author    = {Hajar Emami and
               Majid Moradi Aliabadi and
@@ -15013,7 +14896,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1908.06616},
  year      = {2019}
 }
-
 @article{DBLP:journals/access/XiongWG19,
  author    = {Feng Xiong and
               Qianqian Wang and
@@ -15024,7 +14906,6 @@ author    = {Zhuang Liu and
  pages     = {126651--126661},
  year      = {2019}
 }
-
 @inproceedings{DBLP:conf/iccv/ZhuPIE17,
  author    = {Jun-Yan Zhu and
               Taesung Park and
@@ -15036,7 +14917,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Computer Vision},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/iccv/YiZTG17,
  author    = {Zili Yi and
               Hao (Richard) Zhang and
@@ -15047,7 +14927,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Computer Vision},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/nips/LiuBK17,
  author    = {Ming-Yu Liu and
               Thomas Breuel and
@@ -15057,7 +14936,6 @@ author    = {Zhuang Liu and
  pages     = {700--708},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/cvpr/IsolaZZE17,
  author    = {Phillip Isola and
               Jun-Yan Zhu and
@@ -15068,7 +14946,6 @@ author    = {Zhuang Liu and
  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/icml/ReedAYLSL16,
  author    = {Scott E. Reed and
               Zeynep Akata and
@@ -15082,7 +14959,6 @@ author    = {Zhuang Liu and
  publisher = {International Conference on Machine Learning},
  year      = {2016}
 }
-
 @article{DBLP:journals/corr/DashGALA17,
  author    = {Ayushman Dash and
               John Cristian Borges Gamboa and
@@ -15095,7 +14971,6 @@ author    = {Zhuang Liu and
  volume    = {abs/1703.06412},
  year      = {2017}
 }
-
 @inproceedings{DBLP:conf/nips/ReedAMTSL16,
  author    = {Scott E. Reed and
               Zeynep Akata and
@@ -15108,7 +14983,6 @@ author    = {Zhuang Liu and
  pages     = {217--225},
  year      = {2016}
 }
-
 @inproceedings{DBLP:conf/cvpr/ZhangXY18,
  author    = {Zizhao Zhang and
               Yuanpu Xie and
@@ -15119,9 +14993,126 @@ author    = {Zhuang Liu and
  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2018}
 }
-
+@inproceedings{DBLP:conf/acl/MaHXZLZZHLLWW19,
+  author    = {Mingbo Ma and
+               Liang Huang and
+               Hao Xiong and
+               Renjie Zheng and
+               Kaibo Liu and
+               Baigong Zheng and
+               Chuanqiang Zhang and
+               Zhongjun He and
+               Hairong Liu and
+               Xing Li and
+               Hua Wu and
+               Haifeng Wang},
+  title     = {{STACL:} Simultaneous Translation with Implicit Anticipation and Controllable
+               Latency using Prefix-to-Prefix Framework},
+  pages     = {3025--3036},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2019}
+}
+@inproceedings{DBLP:conf/emnlp/ZhengMZH19,
+  author    = {Renjie Zheng and
+               Mingbo Ma and
+               Baigong Zheng and
+               Liang Huang},
+  title     = {Speculative Beam Search for Simultaneous Translation},
+  pages     = {1395--1402},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
+  year      = {2019}
+}
+@inproceedings{DBLP:conf/naacl/DalviDSV18,
+  author    = {Fahim Dalvi and
+               Nadir Durrani and
+               Hassan Sajjad and
+               Stephan Vogel},
+  title     = {Incremental Decoding and Training Methods for Simultaneous Translation
+               in Neural Machine Translation},
+  pages     = {493--499},
+  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
+  year      = {2018}
+}
+@article{DBLP:journals/corr/ChoE16,
+  author    = {Kyunghyun Cho and
+               Masha Esipova},
+  title     = {Can neural machine translation do simultaneous translation?},
+  journal   = {CoRR},
+  volume    = {abs/1606.02012},
+  year      = {2016}
+}
+@inproceedings{DBLP:conf/eacl/NeubigCGL17,
+  author    = {Jiatao Gu and
+               Graham Neubig and
+               Kyunghyun Cho and
+               Victor O. K. Li},
+  title     = {Learning to Translate in Real-time with Neural Machine Translation},
+  pages     = {1053--1062},
+  publisher = {Conference of the European Chapter of the Association for Computational Linguistics},
+  year      = {2017}
+}
+@inproceedings{DBLP:conf/emnlp/GrissomHBMD14,
+  author    = {Alvin Grissom II and
+               He He and
+               Jordan L. Boyd-Graber and
+               John Morgan and
+               Hal Daum{\'{e}} III},
+  title     = {Don't Until the Final Verb Wait: Reinforcement Learning for Simultaneous
+               Machine Translation},
+  pages     = {1342--1352},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
+  year      = {2014}
+}
+@inproceedings{DBLP:conf/acl/ZhengLZMLH20,
+  author    = {Baigong Zheng and
+               Kaibo Liu and
+               Renjie Zheng and
+               Mingbo Ma and
+               Hairong Liu and
+               Liang Huang},
+  title     = {Simultaneous Translation Policies: From Fixed to Adaptive},
+  pages     = {2847--2853},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2020}
+}
+@inproceedings{DBLP:conf/emnlp/ZhengZMH19,
+  author    = {Baigong Zheng and
+               Renjie Zheng and
+               Mingbo Ma and
+               Liang Huang},
+  title     = {Simpler and Faster Learning of Adaptive Policies for Simultaneous
+               Translation},
+  pages     = {1349--1354},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
+  year      = {2019}
+}
+@inproceedings{DBLP:conf/acl/ZhengZMH19,
+  author    = {Baigong Zheng and
+               Renjie Zheng and
+               Mingbo Ma and
+               Liang Huang},
+  title     = {Simultaneous Translation with Flexible Policy via Restricted Imitation
+               Learning},
+  pages     = {5816--5822},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2019}
+}
+@inproceedings{DBLP:conf/acl/ArivazhaganCMCY19,
+  author    = {Naveen Arivazhagan and
+               Colin Cherry and
+               Wolfgang Macherey and
+               Chung-Cheng Chiu and
+               Semih Yavuz and
+               Ruoming Pang and
+               Wei Li and
+               Colin Raffel},
+  title     = {Monotonic Infinite Lookback Attention for Simultaneous Machine Translation},
+  pages     = {1313--1323},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2019}
+}
 %%%%% chapter 17------------------------------------------------------
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%cha
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%% chapter 18------------------------------------------------------