% Figure: encoder/decoder layer diagram with a CTC output head on the
% encoder side.  Left column (bottom to top): input features -> CNN ->
% add position encoding -> [Multi-Head Attention -> Feed Forward Network]
% -> Softmax -> CTC Output.  Right column: decoder input -> add position
% encoding -> [Masked Multi-Head Attention -> Multi-Head Attention ->
% Feed Forward Network] -> Softmax.
%
% NOTE(review): assumes the preamble loads the TikZ `fit' library, declares
% the `background' pgf layer, and defines the colour `ugreen' -- none of
% these are visible in this file chunk; confirm.
\begin{tikzpicture}
  % Picture-local styles (\tikzset replaces the deprecated \tikzstyle).
  \tikzset{
    layer/.style={draw,rounded corners=2pt,font=\scriptsize,align=center,minimum width=5em},
    word/.style={font=\scriptsize},% not used by any node in this picture
    addnode/.style={draw,circle,inner sep=0pt,minimum size=1em},% empty circle; the cross is drawn separately
  }

  % ---------------------------------------------------------------- encoder
  \node[layer,fill=red!20] (en_sa) at (0,0){Multi-Head \\ Attention};
  \node[layer,anchor=south,fill=green!20] (en_ffn) at ([yshift=1.4em]en_sa.north){Feed Forward \\ Network};
  % "Add" symbol: circle plus a vertical and a horizontal stroke.
  \node[addnode,anchor=north] (en_add) at ([yshift=-1.4em]en_sa.south){};
  \draw (en_add.90) -- (en_add.-90);
  \draw (en_add.0) -- (en_add.180);
  \node[layer,anchor=north,fill=yellow!20] (en_cnn) at ([yshift=-1.4em]en_add.south){CNN};

  % ---------------------------------------------------------------- decoder
  % The decoder column is anchored 7em to the right of the encoder add node.
  \node[addnode,anchor=west] (de_add) at ([xshift=7em]en_add.east){};
  \draw (de_add.90) -- (de_add.-90);
  \draw (de_add.0) -- (de_add.180);
  \node[layer,anchor=south,fill=red!20] (de_sa) at ([yshift=1.4em]de_add.north){Masked \\Multi-Head\\Attention};
  \node[layer,anchor=south,fill=red!20] (de_ca) at ([yshift=1.4em]de_sa.north){Multi-Head \\ Attention};
  \node[layer,anchor=south,fill=green!20] (de_ffn) at ([yshift=1.4em]de_ca.north){Feed Forward \\ Network};

  % ----------------------------------------------------------- output heads
  \node[layer,anchor=south,fill=blue!20] (en_sf) at ([yshift=3em]en_ffn.north){Softmax};
  \node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=2em]de_ffn.north){Softmax};
  \node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.4em]en_sf.north){CTC Output};
  % Disabled decoder output box.  This comment must stay on its own line:
  % a `%' comments out everything up to the end of the physical line.
  %\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){ST Output};

  % ------------------------------------------- inputs and position encodings
  \node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
  \node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){标注文本\\编码表示};
  \node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
  \node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};

  % ------------------------------------------------------------------ arrows
  % The 0.1em shifts leave a small gap between arrow ends and node borders.
  % Encoder column, bottom to top.
  \draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
  \draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
  \draw[->] ([yshift=0.1em]en_add.90) -- ([yshift=-0.1em]en_sa.-90);
  \draw[->] ([yshift=0.1em]en_sa.90) -- ([yshift=-0.1em]en_ffn.-90);
  % Decoder column, bottom to top.
  \draw[->] (de_input.90) -- ([yshift=-0.1em]de_add.-90);
  \draw[->] ([yshift=0.1em]de_add.90) -- ([yshift=-0.1em]de_sa.-90);
  \draw[->] ([yshift=0.1em]de_sa.90) -- ([yshift=-0.1em]de_ca.-90);
  \draw[->] ([yshift=0.1em]de_ca.90) -- ([yshift=-0.1em]de_ffn.-90);
  % Into and out of the two Softmax boxes.
  \draw[->] ([yshift=0.1em]en_ffn.90) -- ([yshift=-0.1em]en_sf.-90);
  \draw[->] ([yshift=0.1em]en_sf.90) -- ([yshift=-0.1em]en_output.-90);
  \draw[->] ([yshift=0.1em]de_ffn.90) -- ([yshift=-0.1em]sf.-90);
  % Free-standing arrow leaving the decoder Softmax (no target node).
  \draw[->] ([yshift=0.1em]sf.90) -- ([yshift=1.5em]sf.90);
  % Position encodings into the add nodes.
  \draw[->] ([xshift=0.1em]en_pos.0) -- ([xshift=-0.1em]en_add.180);
  \draw[->] ([xshift=-0.1em]de_pos.180) -- ([xshift=0.1em]de_add.0);
  % Branch from the line above the encoder FFN over to the west side of the
  % decoder's second attention block.
  \draw[->,rounded corners=2pt] ([yshift=2em]en_ffn.90) -- ([xshift=4em,yshift=2em]en_ffn.90) -- ([xshift=-1.5em]de_ca.west) -- ([xshift=-0.1em]de_ca.west);

  % ------------------------------------------------- dashed grouping frames
  % The frames are named box1/box2 because the side labels below are
  % positioned relative to box1.west and box2.east.
  \begin{pgfonlayer}{background}
    \node[draw=ugreen,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,fit=(en_sa)(en_ffn)] (box1) {};
    \node[draw=red,rounded corners=2pt,inner xsep=6pt,inner ysep=8pt,dashed,thick,fit=(de_sa)(de_ca)(de_ffn)] (box2) {};
  \end{pgfonlayer}

  % ------------------------------------------- repetition and stack labels
  \node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
  \node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
  \node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
  \node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\ 编码器};
  \node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\解码器};
\end{tikzpicture}