\begin{tikzpicture}[scale=0.8]
\tikzstyle{every node}=[scale=0.8]
%figure 1
\coordinate (A1) at (0, 0);
\coordinate (B1) at ([xshift=1.5em,yshift=-0.4em]A1);
\coordinate (C1) at ([xshift=0.3em,yshift=-2.6em]A1);
\coordinate (D1) at ([xshift=2.7em,yshift=-2.6em]A1);
\coordinate (E1) at ([xshift=2.4em,yshift=-1.5em]A1);
\coordinate (F1) at ([xshift=0.3em]D1);


%figure 2
\coordinate (A2) at ([yshift=-15em]A1);
\coordinate (B2) at ([xshift=1.5em,yshift=-0.4em]A2);
\coordinate (C2) at ([xshift=0.3em,yshift=-2.6em]A2);
\coordinate (D2) at ([xshift=2.7em,yshift=-2.6em]A2);
\coordinate (E2) at ([xshift=2.4em,yshift=-1.5em]A2);
\coordinate (F2) at ([xshift=0.3em]D2);


\foreach \x in {1,2}{
\draw[-,line width=2pt] (A\x) -- ([xshift=3.6em]A\x) -- ([xshift=3.6em,yshift=-3em]A\x) -- ([yshift=-3em]A\x) -- (A\x) -- ([xshift=1em]A\x);
\draw[-, thick] (B\x) -- (C\x) -- (D\x) -- (B\x);
\draw[-, thick,fill=black] ([xshift=-0.6em,yshift=-1.2em]B\x)  -- ([xshift=-0.3em,yshift=-1em]B\x) -- ([yshift=-1.2em]B\x) --([xshift=0.3em,yshift=-1em]B\x) -- ([xshift=0.6em,yshift=-1.2em]B\x) -- (D\x) -- (C\x) -- ([xshift=-0.6em,yshift=-1.2em]B\x);
\draw[-, thick,fill=black] (E\x) -- ([xshift=0.2em,yshift=0.3em]E\x) -- ([xshift=0.33em]F\x) -- (F\x) -- (E\x);
\node[circle,inner sep=0pt,minimum size=0.4em,fill=black] at ([xshift=-0.7em,yshift=-0.2em]B\x){};
\node[draw,rounded corners=2pt,fill=yellow!20,minimum height=2.2em,minimum width=4.3em](cnn\x) at ([xshift=9.5em,yshift=-1.5em]A\x){CNN};
\node[draw,circle,fill=green!20,font=\footnotesize,anchor=west,inner sep=3pt] (h\x_2) at ([xshift=3em,yshift=0.66em]cnn\x.east){$h_2$};
\node[draw,circle,fill=green!20,font=\footnotesize,anchor=south,inner sep=3pt] (h\x_1) at ([yshift=1em]h\x_2.north){$h_1$};
\node[font=\footnotesize,anchor=north] (h\x_c) at ([yshift=-0.6em]h\x_2.south){$\cdots$};
\node[draw,circle,fill=green!20,font=\footnotesize,anchor=north,inner sep=3pt] (h\x_n) at ([yshift=-0.6em]h\x_c.south){$h_n$};
}

\begin{pgfonlayer}{background}
\node[draw,thick,rounded corners=2pt,densely dashed,inner ysep=1.2em,inner xsep=0.4em,label={above:图像特征向量}][fit=(h1_1)(h1_2)(h1_n)](box1){};
\node[draw,thick,rounded corners=2pt,densely dashed,inner ysep=1.2em,inner xsep=0.4em,label={above:图像特征向量}][fit=(h2_1)(h2_2)(h2_n)](box2){};
\end{pgfonlayer}

\node[anchor=west,draw,rounded corners=2pt,fill=blue!20,minimum height=2.2em,minimum width=4.3em] (decoder1)at ([xshift=6em]box1.east){解码器};
\node[anchor=west,draw,circle,inner sep=0pt,minimum size=1.4em] (add)at ([xshift=2em,yshift=1.6em]box2.east){};
\draw[] (add.0) -- (add.180);
\draw[] (add.90) -- (add.-90);
\node[anchor=west,draw,rounded corners=2pt,fill=blue!20,minimum height=2.2em,minimum width=4.3em] (decoder2)at ([xshift=6em]box2.east){解码器};


\draw[->,thick] ([xshift=-2.7em]cnn1.180) -- ([xshift=-0.1em]cnn1.180);
\draw[->,thick] ([xshift=-2.7em]cnn2.180) -- ([xshift=-0.1em]cnn2.180);
\draw[->,thick] ([xshift=0.1em]cnn1.0) -- ([xshift=-0.1em]box1.180);
\draw[->,thick] ([xshift=0.1em]cnn2.0) -- ([xshift=-0.1em]box2.180);
\draw[->,thick] ([xshift=0.1em]box1.0) -- ([xshift=-0.1em]decoder1.180);
\draw[->,thick] ([xshift=0.1em]h2_1.0) -- (add.180);
\draw[->,thick] ([xshift=0.1em]h2_2.0) -- (add.180);
\draw[->,thick] ([xshift=0.1em]h2_c.0) -- (add.180);
\draw[->,thick] ([xshift=0.1em]h2_n.0) -- (add.180);
\draw[->,thick,out=20,in=130] ([xshift=0.1em]add.45) to ([xshift=-0em,yshift=0.1em]decoder2.north west);
\draw[->,thick,out=200,in=-45] ([xshift=-0.1em]decoder2.west) to ([yshift=-0.1em]add.-90);

\node [anchor=north](pos1) at ([yshift=-1.0em]box1.south) {(a)未引入注意力机制};
\node [anchor=north](pos2) at ([yshift=-1.0em]box2.south) {(b)引入注意力机制};
\end{tikzpicture}
%------------------------------------------------------------------------------------------------------------