Commit de694ea0 by 曹润柘

合并分支 'caorunzhe' 到 'master'

Caorunzhe

查看合并请求 !912
parents 78add50a ed87fd41
\begin{tikzpicture}
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=5em]
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=7.1em]
\tikzstyle{word}=[font=\scriptsize]
%%%%encoder
\node[layer,fill=red!20] (en_sa) at (0,0){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.4em]en_sa.north){Feed Forward \\ Network};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[] (en_add.90) -- (en_add.-90);
\draw[] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.4em]en_add.south){CNN};
\node[anchor=south,layer,fill=yellow!20](en_add1) at ([yshift=1.0em]en_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.0em]en_add1.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](en_add2) at ([yshift=1.0em]en_ffn.north) {Add \& LayerNorm};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north,thick] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[thick] (en_add.90) -- (en_add.-90);
\draw[thick] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.0em]en_add.south){CNN};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){源语言语音特征\\(FBank/MFCC)};
\draw[->,thick] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->,thick] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->,thick] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->,thick] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_add1.-90);
\draw[->,thick] ([yshift=0.1em]en_add1.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->,thick] ([yshift=0.1em]en_ffn.90) --([yshift=-0.1em]en_add2.-90);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_sa.south)--([yshift=-0.6em,xshift=-4.0em]en_sa.south)--([xshift=-0.43em]en_add1.west)--(en_add1.west);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_ffn.south)--([yshift=-0.6em,xshift=-4.0em]en_ffn.south)--([xshift=-0.43em]en_add2.west)--(en_add2.west);
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west] (de_add) at ([xshift=7em]en_add.east){};
\draw[] (de_add.90) -- (de_add.-90);
\draw[] (de_add.0) -- (de_add.180);
%%%%decoder
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west,thick] (de_add) at ([xshift=9em]en_add.east){};
\draw[thick] (de_add.90) -- (de_add.-90);
\draw[thick] (de_add.0) -- (de_add.180);
\node[layer,anchor=south,fill=red!20] (de_sa) at ([yshift=1.4em]de_add.north){Masked \\Multi-Head\\Attention};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.4em]de_sa.north){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](de_add1) at ([yshift=1.0em]de_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.0em]de_add1.north){Multi-Head \\ Attention};
\node[anchor=south,layer,fill=yellow!20](de_add2) at ([yshift=1.0em]de_ca.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.0em]de_add2.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](de_add3) at ([yshift=1.0em]de_ffn.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.2em]de_add3.north){Softmax};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){目标语言文本\\编码表示};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){STLoss};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){标注文本\\编码表示};
\draw[->,thick] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->,thick] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->,thick] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_add1.-90);
\draw[->,thick] ([yshift=0.1em]de_add1.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->,thick] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_add2.-90);
\draw[->,thick] ([yshift=0.1em]de_add2.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->,thick] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]de_add3.-90);
\draw[->,thick] ([yshift=0.1em]de_add3.90) -- ([yshift=-0.1em]sf.-90);
\draw[->,thick] ([yshift=0.1em]sf.90) -- ([yshift=1.0em]sf.90);
\draw[->,thick] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->,thick] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_sa.south)--([yshift=-0.6em,xshift=4.0em]de_sa.south)--([xshift=0.43em]de_add1.east)--(de_add1.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ca.south)--([yshift=-0.6em,xshift=4.0em]de_ca.south)--([xshift=0.43em]de_add2.east)--(de_add2.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ffn.south)--([yshift=-0.6em,xshift=4.0em]de_ffn.south)--([xshift=0.43em]de_add3.east)--(de_add3.east);
\draw[->,rounded corners=2pt,thick] ([yshift=0.1em]en_add2.90) -- ([yshift=1.5em]en_add2.90) -- ([xshift=5.0em,yshift=1.5em]en_add2.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
\draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt] ([yshift=0.1em]en_ffn.90) -- ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\begin{pgfonlayer}{background}
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(en_sa)(en_ffn)]{};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(de_sa)(de_ca)(de_ffn)]{};
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=-0.2em,yshift=-0.2em][fit=(en_add1)(en_add2)(en_sa)(en_ffn)](box1){};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=0.2em,yshift=-0.2em][fit=(de_sa)(de_ca)(de_ffn)(de_add3)](box2){};
\end{pgfonlayer}
\node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
\node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
\node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\解码器};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST \\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST \\ 解码器};
\end{tikzpicture}
\ No newline at end of file
......@@ -36,6 +36,7 @@
\draw[->,very thick](process_2.east)to([xshift=1.8cm]process_2.east);
%%%%音频
\node(signal)[right of = process_2,xshift=5.5cm]{};
\node(text_3)[below of = signal,yshift=-1.98cm,scale=1.3]{语音信号};
\draw[-,thick,]([xshift=-1.2cm]signal.center)--([xshift=1.2cm]signal.center);
\draw[-,thick]([xshift=-1cm,yshift=-0.8cm]signal.center)--([xshift=-0.9cm,yshift=0.4cm]signal.center)--([xshift=-0.8cm,yshift=-0.3cm]signal.center)--([xshift=-0.7cm,yshift=0.7cm]signal.center)--([xshift=-0.6cm,yshift=-0.1cm]signal.center)--([xshift=-0.5cm,yshift=0.3cm]signal.center)--([xshift=-0.4cm,yshift=-0.5cm]signal.center)--([xshift=-0.3cm,yshift=0.7cm]signal.center)--([xshift=-0.2cm,yshift=-0.2cm]signal.center)--([xshift=-0.1cm,yshift=0.4cm]signal.center)--([xshift=0cm,yshift=-0.9cm]signal.center)--([xshift=0.1cm,yshift=0.5cm]signal.center)--([xshift=0.2cm,yshift=-0.4cm]signal.center)--([xshift=0.3cm,yshift=0.3cm]signal.center)--([xshift=0.4cm,yshift=-0.2cm]signal.center)--([xshift=0.5cm,yshift=0.1cm]signal.center)--([xshift=0.6cm,yshift=-0.8cm]signal.center)--([xshift=0.7cm,yshift=0.4cm]signal.center)--([xshift=0.8cm,yshift=-0.6cm]signal.center)--([xshift=0.9cm,yshift=0.7cm]signal.center)--([xshift=1cm,yshift=-0.2cm]signal.center);
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=5em]
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=7.1em]
\tikzstyle{word}=[font=\scriptsize]
%%%%encoder
\node[layer,fill=red!20] (en_sa) at (0,0){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.4em]en_sa.north){Feed Forward \\ Network};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[] (en_add.90) -- (en_add.-90);
\draw[] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.4em]en_add.south){CNN};
\node[anchor=south,layer,fill=yellow!20](en_add1) at ([yshift=1.0em]en_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.0em]en_add1.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](en_add2) at ([yshift=1.0em]en_ffn.north) {Add \& LayerNorm};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north,thick] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[thick] (en_add.90) -- (en_add.-90);
\draw[thick] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.0em]en_add.south){CNN};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
\draw[->,thick] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->,thick] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->,thick] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->,thick] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_add1.-90);
\draw[->,thick] ([yshift=0.1em]en_add1.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->,thick] ([yshift=0.1em]en_ffn.90) --([yshift=-0.1em]en_add2.-90);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_sa.south)--([yshift=-0.6em,xshift=-4.0em]en_sa.south)--([xshift=-0.43em]en_add1.west)--(en_add1.west);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_ffn.south)--([yshift=-0.6em,xshift=-4.0em]en_ffn.south)--([xshift=-0.43em]en_add2.west)--(en_add2.west);
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west] (de_add) at ([xshift=7em]en_add.east){};
\draw[] (de_add.90) -- (de_add.-90);
\draw[] (de_add.0) -- (de_add.180);
%%%%decoder
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west,thick] (de_add) at ([xshift=9em]en_add.east){};
\draw[thick] (de_add.90) -- (de_add.-90);
\draw[thick] (de_add.0) -- (de_add.180);
\node[layer,anchor=south,fill=red!20] (de_sa) at ([yshift=1.4em]de_add.north){Masked \\Multi-Head\\Attention};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.4em]de_sa.north){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){Output Probabilities};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
\node[anchor=south,layer,fill=yellow!20](de_add1) at ([yshift=1.0em]de_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.0em]de_add1.north){Multi-Head \\ Attention};
\node[anchor=south,layer,fill=yellow!20](de_add2) at ([yshift=1.0em]de_ca.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.0em]de_add2.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](de_add3) at ([yshift=1.0em]de_ffn.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.2em]de_add3.north){Softmax};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){标注文本\\编码表示};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
\draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt] ([yshift=0.1em]en_ffn.90) -- ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\draw[->,thick] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->,thick] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->,thick] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_add1.-90);
\draw[->,thick] ([yshift=0.1em]de_add1.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->,thick] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_add2.-90);
\draw[->,thick] ([yshift=0.1em]de_add2.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->,thick] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]de_add3.-90);
\draw[->,thick] ([yshift=0.1em]de_add3.90) -- ([yshift=-0.1em]sf.-90);
\draw[->,thick] ([yshift=0.1em]sf.90) -- ([yshift=1.0em]sf.90);
\draw[->,thick] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->,thick] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_sa.south)--([yshift=-0.6em,xshift=4.0em]de_sa.south)--([xshift=0.43em]de_add1.east)--(de_add1.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ca.south)--([yshift=-0.6em,xshift=4.0em]de_ca.south)--([xshift=0.43em]de_add2.east)--(de_add2.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ffn.south)--([yshift=-0.6em,xshift=4.0em]de_ffn.south)--([xshift=0.43em]de_add3.east)--(de_add3.east);
\draw[->,rounded corners=2pt,thick] ([yshift=0.1em]en_add2.90) -- ([yshift=1.5em]en_add2.90) -- ([xshift=5.0em,yshift=1.5em]en_add2.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\begin{pgfonlayer}{background}
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(en_sa)(en_ffn)](box1){};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(de_sa)(de_ca)(de_ffn)](box2){};
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=-0.2em,yshift=-0.2em][fit=(en_add1)(en_add2)(en_sa)(en_ffn)](box1){};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=0.2em,yshift=-0.2em][fit=(de_sa)(de_ca)(de_ffn)(de_add3)](box2){};
\end{pgfonlayer}
\node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
......
\begin{tikzpicture}
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=5em]
\tikzstyle{layer}=[draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=7.1em]
\tikzstyle{word}=[font=\scriptsize]
%%%%encoder
\node[layer,fill=red!20] (en_sa) at (0,0){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.4em]en_sa.north){Feed Forward \\ Network};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[] (en_add.90) -- (en_add.-90);
\draw[] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.4em]en_add.south){CNN};
\node[anchor=south,layer,fill=yellow!20](en_add1) at ([yshift=1.0em]en_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.0em]en_add1.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](en_add2) at ([yshift=1.0em]en_ffn.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=blue!20] (en_sf) at ([yshift=2.4em]en_add2.north){Softmax};
\node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.0em]en_sf.north){CTC Output};
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=north,thick] (en_add) at ([yshift=-1.4em]en_sa.south){};
\draw[thick] (en_add.90) -- (en_add.-90);
\draw[thick] (en_add.0) -- (en_add.180);
\node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.0em]en_add.south){CNN};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){源语言语音特征\\(FBank/MFCC)};
\draw[->,thick] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->,thick] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->,thick] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->,thick] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_add1.-90);
\draw[->,thick] ([yshift=0.1em]en_add1.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->,thick] ([yshift=0.1em]en_ffn.90) --([yshift=-0.1em]en_add2.-90);
\draw[->,thick] ([yshift=0.1em]en_add2.90) -- ([yshift=-0.1em]en_sf.-90);
\draw[->,thick] ([yshift=0.1em]en_sf.90) -- ([yshift=-0.1em]en_output.-90);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_sa.south)--([yshift=-0.6em,xshift=-4.0em]en_sa.south)--([xshift=-0.43em]en_add1.west)--(en_add1.west);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]en_ffn.south)--([yshift=-0.6em,xshift=-4.0em]en_ffn.south)--([xshift=-0.43em]en_add2.west)--(en_add2.west);
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west] (de_add) at ([xshift=7em]en_add.east){};
\draw[] (de_add.90) -- (de_add.-90);
\draw[] (de_add.0) -- (de_add.180);
%%%%decoder
\node[draw,circle,inner sep=0pt, minimum size=1em,anchor=west,thick] (de_add) at ([xshift=9em]en_add.east){};
\draw[thick] (de_add.90) -- (de_add.-90);
\draw[thick] (de_add.0) -- (de_add.180);
\node[layer,anchor=south,fill=red!20] (de_sa) at ([yshift=1.4em]de_add.north){Masked \\Multi-Head\\Attention};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.4em]de_sa.north){Multi-Head \\ Attention};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](de_add1) at ([yshift=1.0em]de_sa.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.0em]de_add1.north){Multi-Head \\ Attention};
\node[anchor=south,layer,fill=yellow!20](de_add2) at ([yshift=1.0em]de_ca.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.0em]de_add2.north){Feed Forward \\ Network};
\node[anchor=south,layer,fill=yellow!20](de_add3) at ([yshift=1.0em]de_ffn.north) {Add \& LayerNorm};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.2em]de_add3.north){Softmax};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){目标语言文本\\编码表示};
\node[layer,anchor=south,fill=blue!20] (en_sf) at ([yshift=3em]en_ffn.north){Softmax};
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=2em]de_ffn.north){Softmax};
\node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.4em]en_sf.north){CTC Output};
%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){ST Output};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){标注文本\\编码表示};
\draw[->,thick] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->,thick] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->,thick] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_add1.-90);
\draw[->,thick] ([yshift=0.1em]de_add1.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->,thick] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_add2.-90);
\draw[->,thick] ([yshift=0.1em]de_add2.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->,thick] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]de_add3.-90);
\draw[->,thick] ([yshift=0.1em]de_add3.90) -- ([yshift=-0.1em]sf.-90);
\draw[->,thick] ([yshift=0.1em]sf.90) -- ([yshift=1.0em]sf.90);
\draw[->,thick] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->,thick] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_sa.south)--([yshift=-0.6em,xshift=4.0em]de_sa.south)--([xshift=0.43em]de_add1.east)--(de_add1.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ca.south)--([yshift=-0.6em,xshift=4.0em]de_ca.south)--([xshift=0.43em]de_add2.east)--(de_add2.east);
\draw[->,rounded corners=2pt,thick] ([yshift=-0.6em]de_ffn.south)--([yshift=-0.6em,xshift=4.0em]de_ffn.south)--([xshift=0.43em]de_add3.east)--(de_add3.east);
\draw[->,rounded corners=2pt,thick] ([yshift=0.1em]en_add2.90) -- ([yshift=1.5em]en_add2.90) -- ([xshift=5.0em,yshift=1.5em]en_add2.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
\draw[->] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
\draw[->] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_ffn.-90);
\draw[->] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
\draw[->] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
\draw[->] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_ca.-90);
\draw[->] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_ffn.-90);
\draw[->] ([yshift=0.1em]en_ffn.90) -- ([yshift=-0.1em]en_sf.-90);
\draw[->] ([yshift=0.1em]en_sf.90) -- ([yshift=-0.1em]en_output.-90);
\draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
\draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
\draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
\draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
\draw[->,rounded corners=2pt] ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);
\begin{pgfonlayer}{background}
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(en_sa)(en_ffn)]{};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick][fit=(de_sa)(de_ca)(de_ffn)]{};
\node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=-0.2em,yshift=-0.2em][fit=(en_add1)(en_add2)(en_sa)(en_ffn)](box1){};
\node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,xshift=0.2em,yshift=-0.2em][fit=(de_sa)(de_ca)(de_ffn)(de_add3)](box2){};
\end{pgfonlayer}
\node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
\node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
\node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\解码器};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST \\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST \\ 解码器};
\end{tikzpicture}
\ No newline at end of file
......@@ -75,7 +75,7 @@
\parinterval 经过上面的描述可以看出,音频的表示实际上是一个非常长的采样点序列,这导致了直接使用现有的深度学习技术处理音频序列较为困难。并且,原始的音频信号中可能包含着较多的噪声、环境声或冗余信息,也会对模型产生干扰。因此,一般会对音频序列进行处理来提取声学特征,具体为将长序列的采样点序列转换为短序列的特征向量序列,再用于下游系统。虽然已有一些工作不依赖特征提取,直接在原始的采样点序列上进行声学建模和模型训练\upcite{DBLP:conf/interspeech/SainathWSWV15},但目前的主流方法仍然是基于声学特征进行建模\upcite{DBLP:conf/icassp/MohamedHP12}
\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分,目的是使频谱更加平滑。分帧(原理如图\ref{fig:17-3}所示)是基于短时平稳假设,即根据生物学特征,语音信号是一个缓慢变化的过程,10ms$\thicksim$30ms的信号片段是相对平稳的。基于这个假设,一般将每25ms作为一帧来提取特征,这个时间称为{\small\bfnew{帧长}}\index{帧长}(Frame Length)\index{Frame Length}。同时,为了保证不同帧之间的信号平滑性,使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧,这个时长称为{\small\bfnew{帧移}}\index{帧移}(Frame Shift)\index{Frame Shift}。为了缓解分帧带来的频谱泄漏,对每帧的信号进行加窗处理使其幅度在两段渐变到0,一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}(Hamming)\index{Hamming}\upcite{洪青阳2020语音识别原理与应用}
\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重是通过增强音频信号中的高频部分来减弱语音中对高频信号的抑制,使频谱更加顺滑。分帧(原理如图\ref{fig:17-3}所示)是基于短时平稳假设,即根据生物学特征,语音信号是一个缓慢变化的过程,10ms$\thicksim$30ms的信号片段是相对平稳的。基于这个假设,一般将每25ms作为一帧来提取特征,这个时间称为{\small\bfnew{帧长}}\index{帧长}(Frame Length)\index{Frame Length}。同时,为了保证不同帧之间的信号平滑性,使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧,这个时长称为{\small\bfnew{帧移}}\index{帧移}(Frame Shift)\index{Frame Shift}。为了缓解分帧带来的频谱泄漏,对每帧的信号进行加窗处理使其幅度在两段渐变到0,一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}(Hamming)\index{Hamming}\upcite{洪青阳2020语音识别原理与应用}
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
\centering
......@@ -117,7 +117,7 @@
\parinterval 传统的语音识别模型和统计机器翻译相似,需要利用声学模型、语言模型和发音词典联合进行识别,系统较为复杂\upcite{DBLP:journals/ftsig/GalesY07,DBLP:journals/taslp/MohamedDH12,DBLP:journals/spm/X12a}。而近些年来,随着神经网络的发展,基于神经网络的端到端语音识别模型逐渐受到关注,大大简化了训练流程\upcite{DBLP:conf/nips/ChorowskiBSCB15,DBLP:conf/icassp/ChanJLV16}。目前的端到端语音识别模型主要基于序列到序列结构,编码器根据输入的声学特征进一步提取高级特征,解码器根据编码器提取的特征识别对应的文本。在后文中即将介绍的端到端语音翻译模型也是基于十分相似的结构。因此,从某种意义上说,语音识别和翻译所使用的端到端方法与神经机器翻译是一致的。
\parinterval 语音识别目前广泛使用基于Transformer的模型结构(见{\chaptertwelve}),如图\ref{fig:17-5}所示。可以看出,相比文本翻译,模型结构上唯一的区别在于编码器的输入为声学特征,以及编码器底层会使用额外的卷积层来减小输入序列的长度,从而降低长序列带来的显存占用以及建模难度。由于语音对应的特征序列过长,在计算注意力模型的时候,会占用大量的内存/显存,并增加训练时间。因此,通常会先对语音特征做一个下采样,缩小语音的序列长度。目前一个常用的做法,是在输入的语音特征上进行两层步长为2的卷积操作,从而将输入序列的长度缩小为之前的1/4。 通过大量的语音-标注平行数据对模型进行训练,可以得到高质量的语音识别模型。
\parinterval 语音识别目前广泛使用基于Transformer的模型结构(见{\chaptertwelve}),如图\ref{fig:17-5}所示。可以看出,相比文本翻译,模型结构上唯一的区别在于编码器的输入为声学特征,以及编码器底层会使用额外的卷积层来减小输入序列的长度。这是由于语音对应的特征序列过长,在计算注意力模型的时候,会占用大量的内存/显存,并增加训练时间。因此,一个常用的做法是在语音特征上进行两层步长为2的卷积操作,从而将输入序列的长度缩小为之前的1/4。通过大量的语音-标注平行数据对模型进行训练,可以得到高质量的语音识别模型。
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
......@@ -145,7 +145,7 @@
\end{figure}
%----------------------------------------------------------------------------------------------------
\parinterval 可以看出,词格可以保存多条搜索路径,路径中保存了输入序列的时间信息以及解码过程。翻译模型基于词格进行翻译,可以降低语音识别模型带来的误差\upcite{DBLP:conf/acl/ZhangGCF19,DBLP:conf/acl/SperberNPW19}。但在端到端语音识别模型中,一般使用基于束搜索的方法进行解码,解码序列的长度与输入序列并不匹配,相比传统声学模型解码丢失了语音的时间信息,因此这种基于词格的方法主要集中在传统语音识别系统上。
\parinterval 可以看出,词格可以保存多条搜索路径,路径中保存了输入序列的时间信息以及解码过程。翻译模型基于词格进行翻译,可以降低语音识别模型带来的误差\upcite{DBLP:conf/acl/ZhangGCF19,DBLP:conf/acl/SperberNPW19}。但在端到端语音识别模型中,一般使用基于束搜索的方法进行解码,因为解码序列的长度与输入序列并不匹配,相比传统声学模型解码丢失了语音的时间信息。因此这种基于词格的方法主要集中在传统语音识别系统上。
\parinterval 为了降低错误传播问题带来的影响,一种思路是通过一个后处理模型修正识别结果中的错误,再送给文本翻译模型进行翻译。也可以进一步对文本做{\small\bfnew{顺滑}}\index{顺滑}(Disfluency Detection\index{Disfluency Detection})处理,使得送给翻译系统的文本更加干净、流畅,比如除去一些导致停顿的语气词。这一做法在工业界得到了广泛应用,但由于每个模型只能串行地计算,也会带来额外的计算代价以及运算时间。另外一种思路是训练更加健壮的文本翻译模型,使其可以处理输入中存在的噪声或误差\upcite{DBLP:conf/acl/LiuTMCZ18}
......@@ -433,7 +433,7 @@
\parinterval 由于解码器输出的是语言文字序列,因此需要考虑语言的特点对其进行改进。 例如,解码过程中, “the”,“on”,“at”这种介词或者冠词与图像的相关性较低\upcite{DBLP:conf/cvpr/LuXPS17}。因此,可以通过门控单元,控制视觉信号作用于文字生成的程度。另外,在解码过程中,生成的每个单词对应着图像的区域可能是不同的。因此也可以设计更为有效的注意力机制来捕捉解码器端对不同图像局部信息的关注程度\upcite{DBLP:conf/cvpr/00010BT0GZ18}
\parinterval 除了更好地使生成文本与图像特征进行相互作用以外,还有一些改进方法。例如,用卷积神经网络或者Transformer代替解码器所使用的循环神经网络\upcite{DBLP:conf/cvpr/AnejaDS18}。或者使用更深层的神经网络学习动词或者词等视觉中不易表现出来的单词\upcite{DBLP:journals/mta/FangWCT18},其思想与深层神经机器翻译模型有相通之处(见{\chapterfifteen})。
\parinterval 除了更好地使生成文本与图像特征进行相互作用以外,还有一些改进方法。例如,用卷积神经网络或者Transformer代替解码器所使用的循环神经网络\upcite{DBLP:conf/cvpr/AnejaDS18}。或者使用更深层的神经网络学习动词或者形容词等视觉中不易表现出来的单词\upcite{DBLP:journals/mta/FangWCT18},其思想与深层神经机器翻译模型有相通之处(见{\chapterfifteen})。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论