update 17

4a458dc0 · 曹润柘 · d3ac3c85 · 4a458dc0 · 4a458dc0 · 4a458dc0
Commit 4a458dc0 authored Dec 21, 2020 by 曹润柘
--- a/Chapter17/Figures/figure-audio-processing.tex
+++ b/Chapter17/Figures/figure-audio-processing.tex
+\tikzstyle{process} = [rectangle,very thick,rounded corners,minimum width=4.7cm,minimum height=2.5cm,text centered,draw=black!70,fill=red!20]
+\tikzstyle{cir} = [circle,thick,rounded corners,minimum width=0.7cm,text centered,draw=black,fill=green!25]
+
+\begin{tikzpicture}[node distance = 0,scale = 0.7]
+\tikzstyle{every node}=[scale=0.7]
+\node(voice)[scale=1.0]{声波};
+\node(microphone)[rectangle,right of = voice,xshift=1.4cm,yshift=-1cm,minimum width=0.32cm,minimum height=0.35cm,fill=black!85,draw=black!85]{};
+\draw[black!85,line width=1.8]([yshift=0.38cm,xshift=-0.4cm]microphone.north)arc(180:360:0.4cm);
+\node(microphone_1)[rectangle,minimum width=0.4cm,minimum height=0.8cm,rounded corners=3pt,above of =microphone,yshift=0.75cm,draw=black!85,line width=2.5]{};
+\draw[-,black!85,very thick]([yshift=0.4cm,xshift=-0.2cm]microphone.north)--([yshift=0.4cm,xshift=-0cm]microphone.north);
+\draw[-,black!85,very thick]([yshift=0.5cm,xshift=-0.2cm]microphone.north)--([yshift=0.5cm,xshift=-0cm]microphone.north);
+\draw[-,black!85,very thick]([yshift=0.6cm,xshift=-0.2cm]microphone.north)--([yshift=0.6cm,xshift=-0cm]microphone.north);
+\draw[-,black!85,line width=1.8]([yshift=0.6cm,xshift=-0.4cm]microphone.north)--([yshift=0.37cm,xshift=-0.4cm]microphone.north);
+\draw[-,black!85,line width=1.8]([yshift=0.6cm,xshift=0.4cm]microphone.north)--([yshift=0.37cm,xshift=0.4cm]microphone.north);
+\draw[black!85,line width=1]([yshift=0.8cm,xshift=-0.8cm]microphone.north)arc(-45:45:0.3cm);
+\draw[black!85,line width=1]([yshift=0.75cm,xshift=-0.7cm]microphone.north)arc(-45:45:0.4cm);
+\draw[black!85,line width=1]([yshift=0.7cm,xshift=-0.6cm]microphone.north)arc(-45:45:0.5cm);
+
+\node(process_1)[process,right of = microphone,xshift=4.7cm,yshift=0.5cm]{};
+\node(text_1)[below of = process_1,yshift=-2cm,scale=1.3]{采样};
+\draw [very thick,rounded corners=10pt]([xshift=-2.2cm,yshift=-1cm]process_1.center)--([xshift=-1.8cm,yshift=1cm]process_1.center)--([xshift=-1.4cm,yshift=0cm]process_1.center)--([xshift=-1.1cm,yshift=0.8cm]process_1.center)--([xshift=-0.8cm,yshift=-0.4cm]process_1.center)--([xshift=-0.5cm,yshift=0.4cm]process_1.center);
+\draw [->,very thick]([xshift=-0.3cm]process_1.center)to([xshift=0.3cm]process_1.center);
+\draw [very thick,rounded corners=10pt,densely dotted]([xshift=0.5cm,yshift=-1cm]process_1.center)--([xshift=0.9cm,yshift=1cm]process_1.center)--([xshift=1.3cm,yshift=0cm]process_1.center)--([xshift=1.6cm,yshift=0.8cm]process_1.center)--([xshift=1.9cm,yshift=-0.4cm]process_1.center)--([xshift=2.2cm,yshift=0.4cm]process_1.center);
+\node(process_2)[process,right of = process_1,xshift=6.6cm]{};
+\node(text_2)[below of = process_2,yshift=-2cm,scale=1.3]{量化};
+\draw [very thick,rounded corners=10pt,densely dotted]([xshift=-2.2cm,yshift=-1cm]process_2.center)--([xshift=-1.8cm,yshift=1cm]process_2.center)--([xshift=-1.4cm,yshift=0cm]process_2.center)--([xshift=-1.1cm,yshift=0.8cm]process_2.center)--([xshift=-0.8cm,yshift=-0.4cm]process_2.center)--([xshift=-0.5cm,yshift=0.4cm]process_2.center);
+\draw [->,very thick]([xshift=-0.3cm]process_2.center)to([xshift=0.3cm]process_2.center);
+\draw [very thick,]([xshift=0.5cm,yshift=-0.8cm]process_2.center)--([xshift=0.5cm,yshift=0.3cm]process_2.center)--([xshift=0.7cm,yshift=0.3cm]process_2.center)--([xshift=0.7cm,yshift=0.8cm]process_2.center)--([xshift=1cm,yshift=0.8cm]process_2.center)--([xshift=1cm,yshift=0.2cm]process_2.center)--([xshift=1.3cm,yshift=0.2cm]process_2.center)--([xshift=1.3cm,yshift=0.6cm]process_2.center)--([xshift=1.6cm,yshift=0.6cm]process_2.center)--([xshift=1.6cm,yshift=-0.3cm]process_2.center)--([xshift=1.8cm,yshift=-0.3cm]process_2.center)--([xshift=1.8cm,yshift=0.3cm]process_2.center)--([xshift=2cm,yshift=0.3cm]process_2.center);
+
+\node(text1)[left of = process_1,xshift=-3.2cm,yshift=-0.5cm,align=center]{模拟\\语音信号};
+\node(text2)[right of = process_1,xshift=3.3cm,yshift=-0.5cm,align=center]{离散\\时间信号};
+\node(text3)[right of = process_2,xshift=3.2cm,yshift=-0.5cm,align=center]{数字离散\\时间信号};
+
+\draw[->,very thick](process_1.east)to(process_2.west);
+\draw[->,very thick]([xshift=-1.8cm]process_1.west)to(process_1.west);
+\draw[->,very thick](process_2.east)to([xshift=1.8cm]process_2.east);
+%%%%音频
+\node(signal)[right of = process_2,xshift=5.5cm]{};
+\draw[-,thick,]([xshift=-1.2cm]signal.center)--([xshift=1.2cm]signal.center);
+\draw[-,thick]([xshift=-1cm,yshift=-0.8cm]signal.center)--([xshift=-0.9cm,yshift=0.4cm]signal.center)--([xshift=-0.8cm,yshift=-0.3cm]signal.center)--([xshift=-0.7cm,yshift=0.7cm]signal.center)--([xshift=-0.6cm,yshift=-0.1cm]signal.center)--([xshift=-0.5cm,yshift=0.3cm]signal.center)--([xshift=-0.4cm,yshift=-0.5cm]signal.center)--([xshift=-0.3cm,yshift=0.7cm]signal.center)--([xshift=-0.2cm,yshift=-0.2cm]signal.center)--([xshift=-0.1cm,yshift=0.4cm]signal.center)--([xshift=0cm,yshift=-0.9cm]signal.center)--([xshift=0.1cm,yshift=0.5cm]signal.center)--([xshift=0.2cm,yshift=-0.4cm]signal.center)--([xshift=0.3cm,yshift=0.3cm]signal.center)--([xshift=0.4cm,yshift=-0.2cm]signal.center)--([xshift=0.5cm,yshift=0.1cm]signal.center)--([xshift=0.6cm,yshift=-0.8cm]signal.center)--([xshift=0.7cm,yshift=0.4cm]signal.center)--([xshift=0.8cm,yshift=-0.6cm]signal.center)--([xshift=0.9cm,yshift=0.7cm]signal.center)--([xshift=1cm,yshift=-0.2cm]signal.center);
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/Figures/figure-cascading-speech-translation.tex
+++ b/Chapter17/Figures/figure-cascading-speech-translation.tex
+\tikzstyle{process} = [rectangle,very thick,rounded corners,minimum width=3.2cm,minimum height=3cm,text centered,draw=black!70,fill=red!20]
+\tikzstyle{cir} = [circle,thick,rounded corners,minimum width=0.7cm,text centered,draw=black,fill=green!25]
+
+\begin{tikzpicture}[node distance = 0,scale = 0.5]
+\tikzstyle{every node}=[scale=0.5]
+\node(process_1)[process]{};
+\draw[-,thick]([xshift=-1.2cm]process_1.center)--([xshift=1.2cm]process_1.center);
+\draw[-,thick]([xshift=-1cm,yshift=-0.8cm]process_1.center)--([xshift=-0.9cm,yshift=0.4cm]process_1.center)--([xshift=-0.8cm,yshift=-0.3cm]process_1.center)--([xshift=-0.7cm,yshift=0.7cm]process_1.center)--([xshift=-0.6cm,yshift=-0.1cm]process_1.center)--([xshift=-0.5cm,yshift=0.3cm]process_1.center)--([xshift=-0.4cm,yshift=-0.5cm]process_1.center)--([xshift=-0.3cm,yshift=0.7cm]process_1.center)--([xshift=-0.2cm,yshift=-0.2cm]process_1.center)--([xshift=-0.1cm,yshift=0.4cm]process_1.center)--([xshift=0cm,yshift=-0.9cm]process_1.center)--([xshift=0.1cm,yshift=0.5cm]process_1.center)--([xshift=0.2cm,yshift=-0.4cm]process_1.center)--([xshift=0.3cm,yshift=0.3cm]process_1.center)--([xshift=0.4cm,yshift=-0.2cm]process_1.center)--([xshift=0.5cm,yshift=0.1cm]process_1.center)--([xshift=0.6cm,yshift=-0.8cm]process_1.center)--([xshift=0.7cm,yshift=0.4cm]process_1.center)--([xshift=0.8cm,yshift=-0.6cm]process_1.center)--([xshift=0.9cm,yshift=0.7cm]process_1.center)--([xshift=1cm,yshift=-0.2cm]process_1.center);
+\node(text_1)[below of = process_1,yshift=-2cm,scale=1.5]{语音信号};
+\node(process_2)[process,right of = process_1,xshift=7.0cm,text width=4cm,align=center]{\baselineskip=4pt\LARGE{[[0.2,...,0.3], \qquad ..., \qquad  [0.3,...,0.5]]}\par};
+\node(text_2)[below of = process_2,yshift=-2cm,scale=1.5]{语音特征};
+\node(process_3)[process,,minimum width=6cm,minimum height=5cm,right of = process_2,xshift=8.2cm,text width=4cm,align=center]{};
+\node(text_3)[below of = process_3,yshift=-3cm,scale=1.5]{源语文本及其词格};
+\node(cir_s)[cir,very thick, below of = process_3,xshift=-2.2cm,yshift=1.1cm]{\LARGE S};
+\node(cir_a)[cir,right of = cir_s,xshift=1cm,yshift=0.8cm]{\LARGE a};
+\node(cir_c)[cir,right of = cir_a,xshift=1.2cm,yshift=0cm]{\LARGE c};
+\node(cir_f)[cir,right of = cir_c,xshift=1.2cm,yshift=0cm]{\LARGE f};
+\node(cir_E)[cir,very thick,right of = cir_f,xshift=1cm,yshift=-0.8cm]{\LARGE E};
+\node(cir_b)[cir,right of = cir_s,xshift=1cm,yshift=-0.8cm]{\Large b};
+\node(cir_d)[cir,right of = cir_b,xshift=1cm,yshift=0.6cm]{\Large d};
+\node(cir_e)[cir, right of = cir_b,xshift=1cm,yshift=-0.8cm]{\LARGE e};
+\node(cir_g)[cir,right of = cir_e,xshift=1cm,yshift=0.8cm]{\LARGE g};
+\draw[-latex](cir_s)node[above,xshift=0.3cm,yshift=0.4cm]{0.4}to(cir_a);
+\draw[-latex](cir_a)node[above,xshift=0.6cm,yshift=0cm]{1}to(cir_c);
+\draw[-latex](cir_c)node[above,xshift=0.6cm,yshift=0cm]{1}to(cir_f);
+\draw[-latex](cir_f)node[above,xshift=0.6cm,yshift=-0.3cm]{1}to(cir_E);
+\draw[-latex](cir_s)node[above,xshift=0.7cm,yshift=-0.4cm]{0.6}to(cir_b);
+\draw[-latex](cir_b)node[above,xshift=0.3cm,yshift=0.3cm]{0.8}to(cir_d);
+\draw[-latex](cir_b)node[above,xshift=0.7cm,yshift=-0.4cm]{0.2}to(cir_e);
+\draw[-latex](cir_e)node[above,xshift=0.3cm,yshift=0.3cm]{1}to(cir_g);
+\draw[-latex](cir_d)node[above,xshift=0.7cm,yshift=0cm]{1}to(cir_f);
+\draw[-latex](cir_g)node[above,xshift=0.6cm,yshift=0.3cm]{1}--(cir_E);
+
+
+\node(text)[below of = process_3,yshift=-1.8cm,scale=1.8]{你是谁};
+\node(process_4)[process,right of = process_3,xshift=8.2cm,text width=4cm,align=center]{\Large\textbf{Who are you?}};
+\node(text_4)[below of = process_4,yshift=-2cm,scale=1.5]{翻译译文};
+
+\draw[->,very thick](process_1.east)to(process_2.west);
+\draw[->,very thick](process_2.east)to(process_3.west);
+\draw[->,very thick](process_3.east)to(process_4.west);
+\node(arrow_text1)[right of = process_1,xshift=3.2cm,yshift=0.7cm,scale=1.4,align=center]{音频\\特征提取};
+\node(arrow_text2)[right of = process_2,xshift=3.6cm,yshift=0.7cm,scale=1.4,align=center]{语音\\识别系统};
+\node(arrow_text3)[right of = process_3,xshift=4.5cm,yshift=0.4cm,scale=1.4]{翻译系统};
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/Figures/figure-framing-schematic.tex
+++ b/Chapter17/Figures/figure-framing-schematic.tex
+\tikzstyle{process} = [rectangle,very thick,rounded corners,minimum width=5cm,minimum height=2.5cm,text centered,draw=black!70,fill=red!25]
+\tikzstyle{cir} = [circle,thick,rounded corners,minimum width=0.7cm,text centered,draw=black,fill=green!25]
+
+\begin{tikzpicture}[node distance = 0,scale = 1]
+\tikzstyle{every node}=[scale=1]
+\node [anchor=center](ori) at (-0.2,-0.2) {$O$};
+\draw[->,thick](-0.5,0)--(5,0)node[below]{$t$};
+\draw[->,thick](0,-2)--(0,2)node[left,scale=0.8]{量化值};
+\draw[-,thick](0,0)sin(0.7,1.5)cos(1.4,0)sin(2.1,-1.5)cos(2.8,0)sin(3.5,1.5)cos(4.2,0);
+\draw[-,thick,dashed](0.5,-1.8)--(0.5,1.8);
+\draw[-](1.2,-1.8)--(1.2,1.8);
+\draw[-,thick,dashed](1.9,-1.8)--(1.9,1.8);
+\draw[<->,thick](0,-1.1)--(1.2,-1.1)node[left,xshift=-0.05cm,yshift=0.15cm,scale=0.6]{帧长};
+\draw[<->,thick](0,-1.4)--(0.5,-1.4)node[left,xshift=0.05cm,yshift=-0.25cm,scale=0.6]{帧移};
+\draw[<->,thick](0.5,-1.4)--(1.9,-1.4);
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/Figures/figure-joint-encoder-mode.tex
+++ b/Chapter17/Figures/figure-joint-encoder-mode.tex
+\tikzstyle{coder} = [rectangle,thick,rounded corners,minimum width=2.3cm,minimum height=1cm,text centered,draw=black!70,fill=red!25]
+
+\begin{tikzpicture}[node distance = 0,scale = 0.75]
+\tikzstyle{every node}=[scale=0.75]
+\node(encoder)[coder]{\large{编码器}};
+\node(decoder_1)[coder,above of =encoder,xshift=-1.6cm,yshift=2.4cm,fill=blue!20]{\large{解码器}};
+\node(decoder_2)[coder,above of =encoder, xshift=1.6cm,yshift=2.4cm,fill=yellow!25]{\large{解码器}};
+\node(s)[below of = encoder,yshift=-1.8cm,scale=1.6]{$s$};
+\node(y)[above of = decoder_2,yshift=1.8cm,scale=1.6]{$y$};
+
+\draw[->,thick](s.north)to(encoder.south);
+\draw[->,thick](decoder_1.east)to(decoder_2.west);
+\draw[->,thick](decoder_2.north)to(y.south);
+\draw[->,thick](encoder.north)--([yshift=0.6725cm]encoder.north)--([yshift=-0.7cm]decoder_1.south)--(decoder_1.south);
+\draw[->,thick](encoder.north)--([yshift=0.6725cm]encoder.north)--([yshift=-0.7cm]decoder_2.south)--(decoder_2.south);
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/Figures/figure-three-ways-of-dual-decoder-speech-translation.tex
+++ b/Chapter17/Figures/figure-three-ways-of-dual-decoder-speech-translation.tex
+\tikzstyle{coder} = [rectangle,thick,rounded corners,minimum width=2.3cm,minimum height=1cm,text centered,draw=black!70,fill=red!20]
+
+\begin{tikzpicture}[node distance = 0,scale = 0.75]
+\tikzstyle{every node}=[scale=0.75]
+
+\node(encoder)[coder]at (0,0){\large{编码器}};
+\node(decoder_1)[coder,above of =encoder,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
+\node(decoder_2)[coder,above of =encoder, xshift=1.6cm,yshift=2.8cm,fill=yellow!20]{\large{解码器}};
+\node(s)[below of = encoder,yshift=-1.8cm,scale=1.6]{$s$};
+\node(x)[above of = decoder_1,yshift=1.8cm,scale=1.6]{$x$};
+\node(y)[above of = decoder_2,yshift=1.8cm,scale=1.6]{$y$};
+
+\draw[->,thick](s.north)to(encoder.south);
+\draw[->,thick](decoder_1.north)to(x.south);
+\draw[->,thick](decoder_2.north)to(y.south);
+\draw[->,thick](encoder.north)--([yshift=0.7cm]encoder.north)--([xshift=-4.16em,yshift=0.7cm]encoder.north)--(decoder_1.south);
+\draw[->,thick](encoder.north)--([yshift=0.7cm]encoder.north)--([xshift=4.16em,yshift=0.7cm]encoder.north)--(decoder_2.south);
+\node [anchor=north](pos1) at (s.south) {(a) 单编码器-双解码器方式};
+%%%%%%%%%%%%%%%%%%%%%%%%级联
+\node(encoder-2)[coder]at ([xshift=10.0em]encoder.east){\large{编码器}};
+\node(decoder_1-2)[coder,above of =encoder-2,yshift=1.4cm,fill=blue!20]{\large{解码器}};
+\node(decoder_2-2)[coder,above of =decoder_1-2, yshift=1.4cm,fill=yellow!20]{\large{解码器}};
+\node(s-2)[below of = encoder-2,yshift=-1.8cm,scale=1.6]{$s$};
+\node(y-2)[above of = decoder_2-2,yshift=1.8cm,scale=1.6]{$y$};
+
+\draw[->,thick](s-2.north)to(encoder-2.south);
+\draw[->,thick](encoder-2.north)to(decoder_1-2.south);
+\draw[->,thick](decoder_1-2.north)to(decoder_2-2.south);
+\draw[->,thick](decoder_2-2.north)to(y-2.south);
+\node [anchor=north](pos2) at (s-2.south) {(b) 级联解码器方式};
+%%%%%%%%%%%%%%%%%%%%%%%%联合
+\node(encoder-3)[coder]at([xshift=10.0em]encoder-2.east){\large{编码器}};
+\node(decoder_1-3)[coder,above of =encoder-3,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
+\node(decoder_2-3)[coder,above of =encoder-3, xshift=1.6cm,yshift=2.8cm,fill=yellow!20]{\large{解码器}};
+\node(s-3)[below of = encoder-3,yshift=-1.8cm,scale=1.6]{$s$};
+\node(y-3)[above of = decoder_2-3,yshift=1.8cm,scale=1.6]{$y$};
+
+\draw[->,thick](s-3.north)to(encoder-3.south);
+\draw[->,thick](decoder_1-3.east)to(decoder_2-3.west);
+\draw[->,thick](decoder_2-3.north)to(y-3.south);
+\draw[->,thick](encoder-3.north)--([yshift=0.7cm]encoder-3.north)--([xshift=-4.16em,yshift=0.7cm]encoder-3.north)--(decoder_1-3.south);
+\draw[->,thick](encoder-3.north)--([yshift=0.7cm]encoder-3.north)--([xshift=4.16em,yshift=0.7cm]encoder-3.north)--(decoder_2-3.south);
+\node [anchor=north](pos3) at (s-3.south) {(c) 联合编码器方式};
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter17/chapter17.tex
+++ b/Chapter17/chapter17.tex
@@ -56,19 +56,28 @@

 \subsection{音频处理}

-\parinterval 不同于文本，音频本质上是经过若干信号处理之后的{\small\bfnew{波形}}（Waveform）\index{Waveform}。具体来说，声音是一种空气的震动，因此可以被转换为模拟信号。模拟信号是一段连续的信号，经过采样变为离散数字信号。采样是每隔固定的时间记录一下声音的振幅，采样率表示每秒的采样点数，单位是赫兹（Hz）。采样率越高，结果的损失则越小。通常来说，采样的标准是能够通过离散化的数字信号重现原始语音。我们日常生活中使用的手机和电脑设备的采样率一般为16kHz，表示每秒16000个采样点；而音频CD的采样率可以达到44.1kHz。经过进一步的量化，将采样点的值转换为整型数值保存，从而减少占用的存储空间，通常采用的是16位量化。将采样率和量化位数相乘，就可以得到{\small\bfnew{比特率}}\index{比特率}（Bits Per Second，BPS）\index{Bits Per Second}，表示音频每秒占用的位数。16kHz采样率和16位量化的音频，比特率为256kb/s。整体流程如图\ref{fig:17-2-1}所示\upcite{洪青阳2020语音识别原理与应用,陈果果2020语音识别实战}。
+\parinterval 不同于文本，音频本质上是经过若干信号处理之后的{\small\bfnew{波形}}（Waveform）\index{Waveform}。具体来说，声音是一种空气的震动，因此可以被转换为模拟信号。模拟信号是一段连续的信号，经过采样变为离散数字信号。采样是每隔固定的时间记录一下声音的振幅，采样率表示每秒的采样点数，单位是赫兹（Hz）。采样率越高，结果的损失则越小。通常来说，采样的标准是能够通过离散化的数字信号重现原始语音。我们日常生活中使用的手机和电脑设备的采样率一般为16kHz，表示每秒16000个采样点；而音频CD的采样率可以达到44.1kHz。经过进一步的量化，将采样点的值转换为整型数值保存，从而减少占用的存储空间，通常采用的是16位量化。将采样率和量化位数相乘，就可以得到{\small\bfnew{比特率}}\index{比特率}（Bits Per Second，BPS）\index{Bits Per Second}，表示音频每秒占用的位数。16kHz采样率和16位量化的音频，比特率为256kb/s。整体流程如图\ref{fig:17-1}所示\upcite{洪青阳2020语音识别原理与应用,陈果果2020语音识别实战}。

 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
 \centering
+\input{./Chapter17/Figures/figure-audio-processing}
 \caption{音频处理过程}
-\label{fig:17-2-1}
+\label{fig:17-1}
 \end{figure}
 %----------------------------------------------------------------------------------------------------

 \parinterval 经过上面的描述，音频的表示实际上是一个非常长的采样点序列，这导致了直接使用现有的深度学习技术处理音频序列较为困难。并且，原始的音频信号中可能包含着较多的噪声、环境声或冗余信息，这些也会对模型产生干扰。因此，一般会对音频序列进行处理来提取声学特征，具体为将长序列的采样点序列转换为短序列的特征向量序列，再用于下游系统模块。虽然已有一些工作不依赖特征提取，直接在原始的采样点序列上进行声学建模和模型训练\upcite{DBLP:conf/interspeech/SainathWSWV15}，但目前的主流方法仍然是基于声学特征进行建模\upcite{DBLP:conf/icassp/MohamedHP12}。

 \parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分，目的是使频谱更加平滑。分帧是基于短时平稳假设，即根据生物学特征，语音信号是一个缓慢变化的过程，10ms$\sim$30ms的信号片段是相对平稳的。基于这个假设，一般将每25ms作为一帧来提取特征，这个时间称为{\small\bfnew{帧长}}\index{帧长}（Frame Length）\index{Frame Length}。同时，为了保证不同帧之间的信号平滑性，使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧，这个时长称为{\small\bfnew{帧移}}\index{帧移}（Frame Shift）\index{Frame Shift}。为了缓解分帧带来的频谱泄漏，对每帧的信号进行加窗处理使其幅度在两端渐变到0，一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}（Hamming）\index{Hamming}。
+%----------------------------------------------------------------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter17/Figures/figure-framing-schematic}
+\caption{分帧原理图}
+\label{fig:17-2}
+\end{figure}
+%----------------------------------------------------------------------------------------------------

 \parinterval 经过了上述的预处理操作，可以得到音频对应的帧序列，之后通过不同的操作来提取不同类型的声学特征。常用的声学特征包括{\small\bfnew{Mel频率倒谱系数}}\index{Mel频率倒谱系数}（Mel-Frequency Cepstral Coefficient, MFCC）\index{Mel-Frequency Cepstral Coefficient}、{\small\bfnew{感知线性预测系数}}\index{感知线性预测系数}（Perceptual Linear Predictive, PLP）\index{Perceptual Linear Predictive}、{\small\bfnew{滤波器组}}\index{滤波器组}（Filter-bank, Fbank）\index{Filter-bank}等。MFCC、PLP和Fbank特征都需要对预处理后的音频做{\small\bfnew{短时傅里叶变换}}\index{短时傅里叶变换}（Short-time Fourier Transform, STFT）\index{Short-time Fourier Transform}，得到具有规律的线性分辨率。之后再经过特定的操作，得到各种声学特征。不同声学特征的特点是不同的，MFCC去相关性较好，PLP抗噪性强，FBank可以保留更多的语音原始特征。在语音翻译中，比较常用的声学特征为FBank或MFCC\upcite{洪青阳2020语音识别原理与应用}。

@@ -80,13 +89,14 @@

 \subsection{级联式语音翻译}

-\parinterval 实现语音翻译最简单的思路是基于级联的方式，即：先通过{\small\bfnew{自动语音识别}}\index{自动语音识别}（Automatic Speech Recognition，ASR）\index{Automatic Speech Recognition}系统将语音识别为源语言文本，然后利用机器翻译系统将源语言文本翻译为目标语言文本。这种做法的好处在于语音识别和机器翻译模型可以分别进行训练，有很多数据资源以及成熟技术可以分别运用到两个系统中。因此，级联语音翻译是很长时间以来的主流方法，深受工业界的青睐。级联语音翻译主要的流程如图\ref{fig:17-2-2}所示。
+\parinterval 实现语音翻译最简单的思路是基于级联的方式，即：先通过{\small\bfnew{自动语音识别}}\index{自动语音识别}（Automatic Speech Recognition，ASR）\index{Automatic Speech Recognition}系统将语音识别为源语言文本，然后利用机器翻译系统将源语言文本翻译为目标语言文本。这种做法的好处在于语音识别和机器翻译模型可以分别进行训练，有很多数据资源以及成熟技术可以分别运用到两个系统中。因此，级联语音翻译是很长时间以来的主流方法，深受工业界的青睐。级联语音翻译主要的流程如图\ref{fig:17-3}所示。

 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
 \centering
+\input{./Chapter17/Figures/figure-cascading-speech-translation}
 \caption{级联语音翻译}
-\label{fig:17-2-2}
+\label{fig:17-3}
 \end{figure}
 %----------------------------------------------------------------------------------------------------

@@ -177,9 +187,9 @@
 \parinterval 针对这两个问题，研究人员们也提出了很多方法进行缓解，包括多任务学习、迁移学习等，主要思想都是利用语音识别或文本翻译数据来指导语音模型学习。并且，文本翻译中的很多方法和思想都对语音翻译技术的发展提供了思路。如何将其他领域现有的工作在语音翻译任务上验证，并针对语音这一信息载体进行特定的建模适应，是语音翻译任务当前的研究重点\upcite{DBLP:conf/mtsummit/GangiNCDT19}。

 %----------------------------------------------------------------------------------------------------
-\begin{itemize}
-    \vspace{0.5em}
-\item 多任务学习。针对语音翻译模型建模复杂度较高问题，常用的一个方法是进行多任务学习，使模型在训练过程中有更多的监督信息，从而使模型收敛地更加充分。语音语言中多任务学习主要借助语音对应的标注信息，也就是源语言文本。{\small\bfnew{连接时序分类}}\index{连接时序分类}（Connectionist Temporal Classification，CTC）\index{Connectionist Temporal Classification}\upcite{DBLP:conf/icml/GravesFGS06}是语音处理中最简单有效的一种多任务学习方法\upcite{DBLP:journals/jstsp/WatanabeHKHH17,DBLP:conf/icassp/KimHW17}，也被广泛应用于文本识别任务中\upcite{DBLP:journals/pami/ShiBY17}。CTC可以将输入序列的每一位置都对应到标注文本中，学习语音和文字之间的软对齐关系。比如，对于下面的音频序列，CTC可以将每个位置分别对应到同一个词。需要注意的是，CTC会额外新增一个词$\epsilon$，类似于一个空白词，表示这个位置没有声音或者没有任何对应的预测结果。然后，将相同且连续的词合并，去除$\epsilon$，就可以得到预测结果，如图\ref{fig:17-2-6}所示。
+\noindent{\small\bfnew{1）多任务学习}}
+
+\parinterval 针对语音翻译模型建模复杂度较高问题，常用的一个方法是进行多任务学习，使模型在训练过程中有更多的监督信息，从而使模型收敛得更加充分。语音翻译中多任务学习主要借助语音对应的标注信息，也就是源语言文本。{\small\bfnew{连接时序分类}}\index{连接时序分类}（Connectionist Temporal Classification，CTC）\index{Connectionist Temporal Classification}\upcite{DBLP:conf/icml/GravesFGS06}是语音处理中最简单有效的一种多任务学习方法\upcite{DBLP:journals/jstsp/WatanabeHKHH17,DBLP:conf/icassp/KimHW17}，也被广泛应用于文本识别任务中\upcite{DBLP:journals/pami/ShiBY17}。CTC可以将输入序列的每一位置都对应到标注文本中，学习语音和文字之间的软对齐关系。比如，对于下面的音频序列，CTC可以将每个位置分别对应到同一个词。需要注意的是，CTC会额外新增一个词$\epsilon$，类似于一个空白词，表示这个位置没有声音或者没有任何对应的预测结果。然后，将相同且连续的词合并，去除$\epsilon$，就可以得到预测结果，如图\ref{fig:17-2-6}所示。

 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
@@ -211,21 +221,24 @@
 \end{figure}
 %----------------------------------------------------------------------------------------------------

-\parinterval 另外一种多任务学习的思想是通过两个解码器，分别预测语音对应的源语言句子和目标语言句子，具体有图XXX展示的三种方式\upcite{DBLP:conf/naacl/AnastasopoulosC18,DBLP:conf/asru/BaharBN19}。图\ref{fig:17-2-8}（a）中采用单编码器-双解码器的方式，两个解码器根据编码器的表示，分别预测源语言句子和目标语言句子，从而使编码器训练地更加充分。这种做法的好处在于仅仅增加了训练代价，解码时只需要生成目标语言句子即可。图\ref{fig:17-2-8}（b）则通过使用两个级联的解码器，先利用第一个解码器生成源语言句子，然后再利用第一个解码器的表示，通过第二个解码器生成目标语言句子。这种方法通过增加一个中间输出，降低了模型的训练难度，但同时也会带来额外的解码耗时，因为两个解码器需要串行地进行生成。图\ref{fig:17-2-8}（c）中模型更进一步，第二个编码器联合编码器和第一个解码器的表示进行生成，更充分地利用了已有信息。
-
+\parinterval 另外一种多任务学习的思想是通过两个解码器，分别预测语音对应的源语言句子和目标语言句子，具体有图\ref{fig:17-9}展示的三种方式\upcite{DBLP:conf/naacl/AnastasopoulosC18,DBLP:conf/asru/BaharBN19}。图\ref{fig:17-9}(a)中采用单编码器-双解码器的方式，两个解码器根据编码器的表示，分别预测源语言句子和目标语言句子，从而使编码器训练得更加充分。这种做法的好处在于仅仅增加了训练代价，解码时只需要生成目标语言句子即可。图\ref{fig:17-9}(b)则通过使用两个级联的解码器，先利用第一个解码器生成源语言句子，然后再利用第一个解码器的表示，通过第二个解码器生成目标语言句子。这种方法通过增加一个中间输出，降低了模型的训练难度，但同时也会带来额外的解码耗时，因为两个解码器需要串行地进行生成。图\ref{fig:17-9}(c)中模型更进一步，第二个解码器联合编码器和第一个解码器的表示进行生成，更充分地利用了已有信息。
 %----------------------------------------------------------------------------------------------------
 \begin{figure}[htp]
 \centering
+\input{./Chapter17/Figures/figure-three-ways-of-dual-decoder-speech-translation}
 \caption{双解码器语音翻译的三种方式}
-\label{fig:17-2-8}
+\label{fig:17-9}
 \end{figure}
 %----------------------------------------------------------------------------------------------------
-    \vspace{0.5em}
-    \item 迁移学习。相比语音识别和文本翻译，端到端语音翻译的训练数据量要小很多，因此，如何利用其它数据来增加可用的数据量是语音翻译的一个重要方向。和文本翻译中的方法相似，一种思路是利用迁移学习或预训练，利用其他语言的双语数据预训练模型参数，然后迁移到目标语言任务上\upcite{DBLP:conf/naacl/BansalKLLG19}，或者是利用语音识别数据或文本翻译数据，分别预训练编码器和解码器参数，用于初始化语音翻译模型参数\upcite{DBLP:conf/icassp/BerardBKP18}。预训练的编码器对语音翻译模型的学习尤为重要\upcite{DBLP:conf/naacl/BansalKLLG19}，相比文本数据，语音数据的复杂性更高，如果仅从小规模语音翻译数据上学习很难学习充分。此外，模型对声学特征的学习与语言并不是强相关的，在其他语种预训练的编码器对模型学习也是有帮助的。
-    \vspace{0.5em}
-    \item 数据增强。数据增强是增加训练数据最简单直观的一种方法。但是相比文本翻译中，可以利用回译的方法生成伪数据（见{\chaptersixteen}）。语音翻译正向翻译模型通过源语言语音生成目标语言文本，如果直接利用回译的思想，需要通过一个模型，将目标语文本翻译为目标语语音，但实际上这种模型是不能简单得到。因此，一个简单的思路是通过一个反向翻译模型和语音合成模型级联来生成伪数据\upcite{DBLP:conf/icassp/JiaJMWCCALW19}。另外，正向翻译模型生成的伪数据在文本翻译中也被验证了对模型训练有一定的帮助，因此同样可以利用语音识别和文本翻译模型，将源语言语音生成目标语言翻译，得到伪平行语料。

-\end{itemize}
+\noindent{\small\bfnew{2）迁移学习}}
+
+\parinterval 相比语音识别和文本翻译，端到端语音翻译的训练数据量要小很多，因此，如何利用其它数据来增加可用的数据量是语音翻译的一个重要方向。和文本翻译中的方法相似，一种思路是利用迁移学习或预训练，利用其他语言的双语数据预训练模型参数，然后迁移到目标语言任务上\upcite{DBLP:conf/naacl/BansalKLLG19}，或者是利用语音识别数据或文本翻译数据，分别预训练编码器和解码器参数，用于初始化语音翻译模型参数\upcite{DBLP:conf/icassp/BerardBKP18}。预训练的编码器对语音翻译模型的学习尤为重要\upcite{DBLP:conf/naacl/BansalKLLG19}，相比文本数据，语音数据的复杂性更高，如果仅从小规模语音翻译数据上学习很难学习充分。此外，模型对声学特征的学习与语言并不是强相关的，在其他语种预训练的编码器对模型学习也是有帮助的。
+
+\noindent{\small\bfnew{3）数据增强}}
+
+\parinterval 数据增强是增加训练数据最简单直观的一种方法。但是，不同于文本翻译中可以直接利用回译的方法生成伪数据（见{\chaptersixteen}），语音翻译正向翻译模型通过源语言语音生成目标语言文本，如果直接利用回译的思想，需要通过一个模型，将目标语言文本翻译为源语言语音，但实际上这种模型是不能简单得到的。因此，一个简单的思路是通过一个反向翻译模型和语音合成模型级联来生成伪数据\upcite{DBLP:conf/icassp/JiaJMWCCALW19}。另外，正向翻译模型生成的伪数据在文本翻译中也被验证了对模型训练有一定的帮助，因此同样可以利用语音识别和文本翻译模型，将源语言语音生成目标语言翻译，得到伪平行语料。
+
 %----------------------------------------------------------------------------------------------------

 \parinterval 此外，研究人员们还探索了很多其他方法来提高语音翻译模型的性能。利用在海量的无标注语音数据上预训练的{\small\bfnew{自监督}}\index{自监督}（Self-supervised）\index{Self-supervised}模型作为一个特征提取器，将从语音中提取的特征作为语音翻译模型的输入，可以有效提高模型的性能\upcite{DBLP:conf/interspeech/WuWPG20}。相比语音翻译模型，文本翻译模型任务更加简单，因此一种思想是利用文本翻译模型来指导语音翻译模型，比如通过知识蒸馏\upcite{DBLP:conf/interspeech/LiuXZHWWZ19}、正则化\upcite{DBLP:conf/emnlp/AlinejadS20}等方法。为了简化语音翻译模型的学习，可以通过课程学习的策略，使模型从语音识别任务，逐渐过渡到语音翻译任务，这种由易到难的训练策略可以使模型训练更加充分\upcite{DBLP:journals/corr/abs-1802-06003,DBLP:conf/acl/WangWLZY20}。