Commit 4d380108 by zengxin

合并分支 'zengxin' 到 'caorunzhe'

Zengxin

查看合并请求 !484
parents e337b1d6 ac4d14fc
......@@ -8,23 +8,20 @@
\tikzstyle{cir} = [thin,fill=blue!8,draw,circle,minimum size =0.5em,drop shadow={shadow xshift=0.15em, shadow yshift=-0.1em}]
\tikzstyle{word} = [inner sep=0pt, font=\footnotesize,minimum height=\bcc]
\draw[fill=blue!8,xshift=0.3cm,yshift=0.5cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[ugreen!60,step=\bcc,xshift=0.3cm,yshift=0.5cm,gray] (0cm,0cm) grid (0cm+6*\bcc,0cm+9*\bcc);
%\draw[line width=0.7pt,xshift=0.3cm,yshift=0.5cm] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[red!60,line width=2pt,xshift=0.3cm,yshift=0.5cm] (0cm,0cm+2*\bcc) rectangle (0cm+6*\bcc,0cm+4*\bcc);
%\draw[fill=blue!8,xshift=0.3cm,yshift=0.5cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
%\draw[ugreen!60,step=\bcc,xshift=0.3cm,yshift=0.5cm,gray] (0cm,0cm) grid (0cm+6*\bcc,0cm+9*\bcc);
%\draw[red!60,line width=2pt,xshift=0.3cm,yshift=0.5cm] (0cm,0cm+2*\bcc) rectangle (0cm+6*\bcc,0cm+4*\bcc);
% 输入矩阵
\draw[thick,fill=blue!8,line width=0.6pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[step=\bcc,gray] (0cm,0cm) grid (0cm+6*\bcc,0cm+9*\bcc);
%\draw[line width=0.7pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[red!60,line width=2pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+2*\bcc);
\draw[ugreen!60,line width=2pt] (0cm,0cm+3*\bcc) rectangle (0cm+6*\bcc,0cm+6*\bcc);
\draw[red!60,line width=2pt] (0cm,0cm+7*\bcc) rectangle (0cm+6*\bcc,0cm+9*\bcc);
% 特征图
\draw[fill=blue!8,xshift=5.0cm,yshift=1.3cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,gray,xshift=5.0cm,yshift=1.3cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[xshift=5.0cm,yshift=1.3cm,line width=0.7pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[ugreen!60,line width=2pt,xshift=5.0cm,yshift=1.3cm] (0cm,0cm+2*\bcc) rectangle (0cm+1*\bcc,0cm+3*\bcc);
\draw [gray,fill=blue!8,line width=0.6pt](8cm,2.6cm) -- (8.4cm, 2.6cm) -- (9cm,1cm) -- (8.6cm, 1cm) -- (8cm,2.6cm);
......@@ -40,15 +37,12 @@
\draw[fill=blue!8,xshift=5.2cm,yshift=1.0cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,gray,xshift=5.2cm,yshift=1.0cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.2cm,yshift=1.0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!8,xshift=5.4cm,yshift=0.3cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[step=\bcc,gray,xshift=5.4cm,yshift=0.3cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+7*\bcc);
%\draw[line width=0.7pt,xshift=5.4cm,yshift=0.3cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[fill=blue!8,xshift=5.6cm,yshift=0cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[step=\bcc,gray,xshift=5.6cm,yshift=0cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+7*\bcc);
%\draw[line width=0.7pt,xshift=5.6cm,yshift=0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+1*\bcc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm+2*\bcc) rectangle (0cm+1*\bcc,0cm+3*\bcc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm+6*\bcc) rectangle (0cm+1*\bcc,0cm+7*\bcc);
......@@ -81,17 +75,12 @@
\node[draw,rectangle callout,callout relative pointer={(0.28,-0.6)}] at (-0.3cm,4.6cm) {\textrm{卷积核}};
\node[draw,rectangle callout,callout relative pointer={(0.1,-0.5)}] at (5cm,4.6cm) {\textrm{特征图}};
%\draw [thick] (0cm, -0.3cm) -- (0cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{$m \times k$ representation of \\ sentence with static and \\ non-static channels} (2.4cm,-0.5cm) -- (2.4cm, -0.3cm);
%\draw [thick] (3.6cm, -0.3cm) -- (3.6cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{Convolutional layer with \\ multiple filter widths and \\ feature maps} (6cm,-0.5cm) -- (6cm, -0.3cm);
%\draw [thick] (7.2cm, -0.3cm) -- (7.2cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{Max-over-time\\ pooling} (9cm,-0.5cm) -- (9cm, -0.3cm);
%\draw [thick] (10cm, -0.3cm) -- (10cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{Fully connected layer \\ with dropout and \\ softmax output} (11.7cm,-0.5cm) -- (11.7cm, -0.3cm);
\draw [thick] (0cm, -0.3cm) -- (0cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{维度大小为 $m \times K$ \\ 的静态与非静态通道\\的句子表示} (2.4cm,-0.5cm) -- (2.4cm, -0.3cm);
\draw [thick] (3.6cm, -0.3cm) -- (3.6cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{具有多个不同大小\\的卷积核和特征图\\的卷积层} (6cm,-0.5cm) -- (6cm, -0.3cm);
\draw [thick] (7.2cm, -0.3cm) -- (7.2cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{最大池化} (9cm,-0.5cm) -- (9cm, -0.3cm);
\draw [thick] (10cm, -0.3cm) -- (10cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{带有Dropout\\和Softmax输出\\的全连接层} (11.7cm,-0.5cm) -- (11.7cm, -0.3cm);
%\node [font=\Large] at (5.2cm,-2cm){$h_i = dot(F,x_{i:i+l-1})+b$};
\end{scope}
......
......@@ -324,11 +324,11 @@
\begin{itemize}
\vspace{0.5em}
\item 首先,将$\mathbi{Q}$$\mathbi{K}$$\mathbi{V}$分别通过线性(Linear)变换的方式映射为$h$个子集。即$\mathbi{Q}_i = \mathbi{Q}\mathbi{W}_i^Q $$\mathbi{K}_i = \mathbi{K}\mathbi{W}_i^K $$\mathbi{V}_i = \mathbi{V}\mathbi{W}_i^V $,其中$i$表示第$i$个头, $\mathbi{W}_i^Q \in \mathbb{R}^{d_{model} \times d_k}$, $\mathbi{W}_i^K \in \mathbb{R}^{d_{model} \times d_k}$, $\mathbi{W}_i^V \in \mathbb{R}^{d_{model} \times d_v}$是参数矩阵; $d_k=d_v=d_{model} / h$,对于不同的头采用不同的变换矩阵,这里$d_{model}$表示每个隐层向量的维度;
\item 首先,将$\mathbi{Q}$$\mathbi{K}$$\mathbi{V}$分别通过线性(Linear)变换的方式映射为$h$个子集。即$\mathbi{Q}_i = \mathbi{Q}\mathbi{W}_i^{\,Q} $$\mathbi{K}_i = \mathbi{K}\mathbi{W}_i^{\,K} $$\mathbi{V}_i = \mathbi{V}\mathbi{W}_i^{\,V} $,其中$i$表示第$i$个头, $\mathbi{W}_i^{\,Q} \in \mathbb{R}^{d_{model} \times d_k}$, $\mathbi{W}_i^{\,K} \in \mathbb{R}^{d_{model} \times d_k}$, $\mathbi{W}_i^{\,V} \in \mathbb{R}^{d_{model} \times d_v}$是参数矩阵; $d_k=d_v=d_{model} / h$,对于不同的头采用不同的变换矩阵,这里$d_{model}$表示每个隐层向量的维度;
\vspace{0.5em}
\item 其次,对每个头分别执行点乘注意力操作,并得到每个头的注意力操作的输出$\mathbi{head}_i$
\vspace{0.5em}
\item 最后,将$h$个头的注意力输出在最后一维$d_v$进行拼接(Concat)重新得到维度为$h \times d_v$的输出,并通过对其左乘一个权重矩阵$\mathbi{W}^o$进行线性变换,从而对多头计算得到的信息进行融合,且将多头注意力输出的维度映射为模型的隐层大小(即$d_{model}$),这里参数矩阵$\mathbi{W}^o \in \mathbb{R}^{h \times d_v \times d_{model}}$
\item 最后,将$h$个头的注意力输出在最后一维$d_v$进行拼接(Concat)重新得到维度为$h \times d_v$的输出,并通过对其左乘一个权重矩阵$\mathbi{W}^{\,o}$进行线性变换,从而对多头计算得到的信息进行融合,且将多头注意力输出的维度映射为模型的隐层大小(即$d_{model}$),这里参数矩阵$\mathbi{W}^{\,o} \in \mathbb{R}^{h \times d_v \times d_{model}}$
\vspace{0.5em}
\end{itemize}
......@@ -343,8 +343,8 @@
\parinterval 多头机制可以被形式化描述为如下公式:
\begin{eqnarray}
\textrm{MultiHead}(\mathbi{Q}, \mathbi{K} , \mathbi{V})& = & \textrm{Concat} (\mathbi{head}_1, ... , \mathbi{head}_h ) \mathbi{W}^o \label{eq:12-48} \\
\mathbi{head}_i & = &\textrm{Attention} (\mathbi{Q}\mathbi{W}_i^Q , \mathbi{K}\mathbi{W}_i^K , \mathbi{V}\mathbi{W}_i^V )
\textrm{MultiHead}(\mathbi{Q}, \mathbi{K} , \mathbi{V})& = & \textrm{Concat} (\mathbi{head}_1, ... , \mathbi{head}_h ) \mathbi{W}^{\,o} \label{eq:12-48} \\
\mathbi{head}_i & = &\textrm{Attention} (\mathbi{Q}\mathbi{W}_i^{\,Q} , \mathbi{K}\mathbi{W}_i^{\,K} , \mathbi{V}\mathbi{W}_i^{\,V} )
\label{eq:12-49}
\end{eqnarray}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论