Commit ed87fd41 by 曹润柘

update

parent dd755fc1
\begin{tikzpicture}
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=5em]
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=7.1em]
\tikzstyle{word}=[font=\scriptsize]
%%%%encoder
\node[layer,fill=red!20] (en_sa) at (0,0){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.4em]en_sa.north){Feed Forward \\ Network};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[] (en_add.90) -- (en_add.-90);
\draw[] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.4em]en_add.south){CNN};
\node[anchor=south,layer,fill=yellow!20](en_add1) at ([yshift=1.0em]en_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.0em]en_add1.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](en_add2) at ([yshift=1.0em]en_ffn.north) {Add \& LayerNorm};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north,thick] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[thick] (en_add.90) -- (en_add.-90);
\draw[thick] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.0em]en_add.south){CNN};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){源语言语音特征\\(FBank/MFCC)};
\draw[->,thick] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->,thick] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->,thick] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->,thick] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_add1.-90);
\draw[->,thick] ([yshift=0.1em]en_add1.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->,thick] ([yshift=0.1em]en_ffn.90) --([yshift=-0.1em]en_add2.-90);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_sa.south)--([yshift=-0.6em,xshift=-4.0em]en_sa.south)--([xshift=-0.43em]en_add1.west)--(en_add1.west);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_ffn.south)--([yshift=-0.6em,xshift=-4.0em]en_ffn.south)--([xshift=-0.43em]en_add2.west)--(en_add2.west);
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west] (de_add) at ([xshift=7em]en_add.east){};
\draw[] (de_add.90) -- (de_add.-90);
\draw[] (de_add.0) -- (de_add.180);
%%%%decoder
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west,thick] (de_add) at ([xshift=9em]en_add.east){};
\draw[thick] (de_add.90) -- (de_add.-90);
\draw[thick] (de_add.0) -- (de_add.180);
\node[layer,anchor=south,fill=red!20] (de_sa) at ([yshift=1.4em]de_add.north){Masked \\Multi-Head\\Attention};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.4em]de_sa.north){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){STLoss};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){源语言语音特征\\(FBank/MFCC)};
\node[anchor=south,layer,fill=yellow!20](de_add1) at ([yshift=1.0em]de_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.0em]de_add1.north){Multi-Head \\ Attention};
\node[anchor=south,layer,fill=yellow!20](de_add2) at ([yshift=1.0em]de_ca.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.0em]de_add2.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](de_add3) at ([yshift=1.0em]de_ffn.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.2em]de_add3.north){Softmax};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){目标语言文本\\编码表示};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
\draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt] ([yshift=0.1em]en_ffn.90) -- ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\draw[->,thick] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->,thick] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->,thick] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_add1.-90);
\draw[->,thick] ([yshift=0.1em]de_add1.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->,thick] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_add2.-90);
\draw[->,thick] ([yshift=0.1em]de_add2.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->,thick] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]de_add3.-90);
\draw[->,thick] ([yshift=0.1em]de_add3.90) -- ([yshift=-0.1em]sf.-90);
\draw[->,thick] ([yshift=0.1em]sf.90) -- ([yshift=1.0em]sf.90);
\draw[->,thick] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->,thick] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_sa.south)--([yshift=-0.6em,xshift=4.0em]de_sa.south)--([xshift=0.43em]de_add1.east)--(de_add1.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ca.south)--([yshift=-0.6em,xshift=4.0em]de_ca.south)--([xshift=0.43em]de_add2.east)--(de_add2.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ffn.south)--([yshift=-0.6em,xshift=4.0em]de_ffn.south)--([xshift=0.43em]de_add3.east)--(de_add3.east);
\draw[->,rounded corners=2pt,thick] ([yshift=0.1em]en_add2.90) -- ([yshift=1.5em]en_add2.90) -- ([xshift=5.0em,yshift=1.5em]en_add2.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\begin{pgfonlayer}{background}
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(en_sa)(en_ffn)]{};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(de_sa)(de_ca)(de_ffn)]{};
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=-0.2em,yshift=-0.2em][fit=(en_add1)(en_add2)(en_sa)(en_ffn)](box1){};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=0.2em,yshift=-0.2em][fit=(de_sa)(de_ca)(de_ffn)(de_add3)](box2){};
\end{pgfonlayer}
\node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
\node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
\node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\解码器};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST \\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST \\ 解码器};
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=5em]
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=7.1em]
\tikzstyle{word}=[font=\scriptsize]
%%%%encoder
\node[layer,fill=red!20] (en_sa) at (0,0){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.4em]en_sa.north){Feed Forward \\ Network};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[] (en_add.90) -- (en_add.-90);
\draw[] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.4em]en_add.south){CNN};
\node[anchor=south,layer,fill=yellow!20](en_add1) at ([yshift=1.0em]en_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.0em]en_add1.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](en_add2) at ([yshift=1.0em]en_ffn.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=blue!20] (en_sf) at ([yshift=2.4em]en_add2.north){Softmax};
\node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.0em]en_sf.north){CTC Output};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north,thick] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[thick] (en_add.90) -- (en_add.-90);
\draw[thick] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.0em]en_add.south){CNN};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){源语言语音特征\\(FBank/MFCC)};
\draw[->,thick] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->,thick] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->,thick] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->,thick] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_add1.-90);
\draw[->,thick] ([yshift=0.1em]en_add1.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->,thick] ([yshift=0.1em]en_ffn.90) --([yshift=-0.1em]en_add2.-90);
\draw[->,thick] ([yshift=0.1em]en_add2.90) -- ([yshift=-0.1em]en_sf.-90);
\draw[->,thick] ([yshift=0.1em]en_sf.90) -- ([yshift=-0.1em]en_output.-90);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_sa.south)--([yshift=-0.6em,xshift=-4.0em]en_sa.south)--([xshift=-0.43em]en_add1.west)--(en_add1.west);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_ffn.south)--([yshift=-0.6em,xshift=-4.0em]en_ffn.south)--([xshift=-0.43em]en_add2.west)--(en_add2.west);
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west] (de_add) at ([xshift=7em]en_add.east){};
\draw[] (de_add.90) -- (de_add.-90);
\draw[] (de_add.0) -- (de_add.180);
%%%%decoder
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west,thick] (de_add) at ([xshift=9em]en_add.east){};
\draw[thick] (de_add.90) -- (de_add.-90);
\draw[thick] (de_add.0) -- (de_add.180);
\node[layer,anchor=south,fill=red!20] (de_sa) at ([yshift=1.4em]de_add.north){Masked \\Multi-Head\\Attention};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.4em]de_sa.north){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](de_add1) at ([yshift=1.0em]de_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.0em]de_add1.north){Multi-Head \\ Attention};
\node[anchor=south,layer,fill=yellow!20](de_add2) at ([yshift=1.0em]de_ca.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.0em]de_add2.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](de_add3) at ([yshift=1.0em]de_ffn.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.2em]de_add3.north){Softmax};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){目标语言文本\\编码表示};
\node[layer,anchor=south,fill=blue!20] (en_sf) at ([yshift=3em]en_ffn.north){Softmax};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=2em]de_ffn.north){Softmax};
\node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.4em]en_sf.north){CTC Output};
%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){ST Output};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){标注文本\\编码表示};
\draw[->,thick] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->,thick] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->,thick] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_add1.-90);
\draw[->,thick] ([yshift=0.1em]de_add1.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->,thick] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_add2.-90);
\draw[->,thick] ([yshift=0.1em]de_add2.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->,thick] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]de_add3.-90);
\draw[->,thick] ([yshift=0.1em]de_add3.90) -- ([yshift=-0.1em]sf.-90);
\draw[->,thick] ([yshift=0.1em]sf.90) -- ([yshift=1.0em]sf.90);
\draw[->,thick] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->,thick] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_sa.south)--([yshift=-0.6em,xshift=4.0em]de_sa.south)--([xshift=0.43em]de_add1.east)--(de_add1.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ca.south)--([yshift=-0.6em,xshift=4.0em]de_ca.south)--([xshift=0.43em]de_add2.east)--(de_add2.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ffn.south)--([yshift=-0.6em,xshift=4.0em]de_ffn.south)--([xshift=0.43em]de_add3.east)--(de_add3.east);
\draw[->,rounded corners=2pt,thick] ([yshift=0.1em]en_add2.90) -- ([yshift=1.5em]en_add2.90) -- ([xshift=5.0em,yshift=1.5em]en_add2.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->] ([yshift=0.1em]en_ffn.90) -- ([yshift=-0.1em]en_sf.-90);
\draw[->] ([yshift=0.1em]en_sf.90) -- ([yshift=-0.1em]en_output.-90);
\draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
\draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt] ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\begin{pgfonlayer}{background}
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(en_sa)(en_ffn)]{};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(de_sa)(de_ca)(de_ffn)]{};
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=-0.2em,yshift=-0.2em][fit=(en_add1)(en_add2)(en_sa)(en_ffn)](box1){};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=0.2em,yshift=-0.2em][fit=(de_sa)(de_ca)(de_ffn)(de_add3)](box2){};
\end{pgfonlayer}
\node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
\node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
\node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\解码器};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST \\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST \\ 解码器};
\end{tikzpicture}
\ No newline at end of file
......@@ -75,7 +75,7 @@
\parinterval 经过上面的描述可以看出,音频的表示实际上是一个非常长的采样点序列,这导致了直接使用现有的深度学习技术处理音频序列较为困难。并且,原始的音频信号中可能包含着较多的噪声、环境声或冗余信息,也会对模型产生干扰。因此,一般会对音频序列进行处理来提取声学特征,具体为将长序列的采样点序列转换为短序列的特征向量序列,再用于下游系统。虽然已有一些工作不依赖特征提取,直接在原始的采样点序列上进行声学建模和模型训练\upcite{DBLP:conf/interspeech/SainathWSWV15},但目前的主流方法仍然是基于声学特征进行建模\upcite{DBLP:conf/icassp/MohamedHP12}
\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分,目的是使频谱更加平滑。分帧(原理如图\ref{fig:17-3}所示)是基于短时平稳假设,即根据生物学特征,语音信号是一个缓慢变化的过程,10ms$\thicksim$30ms的信号片段是相对平稳的。基于这个假设,一般将每25ms作为一帧来提取特征,这个时间称为{\small\bfnew{帧长}}\index{帧长}(Frame Length)\index{Frame Length}。同时,为了保证不同帧之间的信号平滑性,使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧,这个时长称为{\small\bfnew{帧移}}\index{帧移}(Frame Shift)\index{Frame Shift}。为了缓解分帧带来的频谱泄漏,对每帧的信号进行加窗处理使其幅度在两段渐变到0,一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}(Hamming)\index{Hamming}\upcite{洪青阳2020语音识别原理与应用}
\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重是通过增强音频信号中的高频部分来减弱语音中对高频信号的抑制,使频谱更加顺滑。分帧(原理如图\ref{fig:17-3}所示)是基于短时平稳假设,即根据生物学特征,语音信号是一个缓慢变化的过程,10ms$\thicksim$30ms的信号片段是相对平稳的。基于这个假设,一般将每25ms作为一帧来提取特征,这个时间称为{\small\bfnew{帧长}}\index{帧长}(Frame Length)\index{Frame Length}。同时,为了保证不同帧之间的信号平滑性,使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧,这个时长称为{\small\bfnew{帧移}}\index{帧移}(Frame Shift)\index{Frame Shift}。为了缓解分帧带来的频谱泄漏,对每帧的信号进行加窗处理使其幅度在两段渐变到0,一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}(Hamming)\index{Hamming}\upcite{洪青阳2020语音识别原理与应用}
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
\centering
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论