Commit b39fbc5c by 单韦乔

合并分支 'shanweiqiao' 到 'caorunzhe'

15章图片颜色

查看合并请求 !1006
parents 31877050 2f9ca45e
\begin{tikzpicture}
\tikzstyle{elementnode} = [anchor=center,draw,minimum size=0.6em,inner sep=0.1pt,gray!80]
\tikzstyle{elementnode} = [anchor=center,draw=gray,minimum size=0.6em,inner sep=0.1pt]
\begin{scope}[scale=1.0]
\foreach \i / \j in
......@@ -17,7 +17,7 @@
0/2, 1/2, 2/2, 3/2, 4/2, 5/2, 6/2, 7/2,
0/1, 1/1, 2/1, 3/1, 4/1, 5/1, 6/1, 7/1,
0/0, 1/0, 2/0, 3/0, 4/0, 5/0, 6/0, 7/0}
\node[elementnode,fill=gray!50] (b\i\j) at (0.6em*\i+5.5em,0.6em*\j) {};
\node[elementnode,fill=orange!15] (b\i\j) at (0.6em*\i+5.5em,0.6em*\j) {};
\node [anchor=south west,minimum height=0.5em,minimum width=4.8em,inner sep=0.1pt,very thick,blue!60,draw] (n1) at ([xshift=0em,yshift=0em]a01.south west) {};
......@@ -51,7 +51,7 @@
0/2, 1/2, 2/2, 3/2, 4/2, 5/2, 6/2, 7/2,
0/1, 1/1, 2/1, 3/1, 4/1, 5/1, 6/1, 7/1,
0/0, 1/0, 2/0, 3/0, 4/0, 5/0, 6/0, 7/0}
\node[elementnode,fill=gray!50] (b\i\j) at (0.6em*\i+5.5em,0.6em*\j) {};
\node[elementnode,fill=orange!15] (b\i\j) at (0.6em*\i+5.5em,0.6em*\j) {};
\node [anchor=south west,minimum height=0.5em,minimum width=3em,inner sep=0.1pt,very thick,blue!60,draw] (n1) at ([xshift=0em,yshift=0em]a01.south west) {};
......@@ -85,7 +85,7 @@
0/2, 1/2, 2/2, 3/2, 4/2, 5/2, 6/2, 7/2,
0/1, 1/1, 2/1, 3/1, 4/1, 5/1, 6/1, 7/1,
0/0, 1/0, 2/0, 3/0, 4/0, 5/0, 6/0, 7/0}
\node[elementnode,fill=gray!50] (b\i\j) at (0.6em*\i+5.5em,0.6em*\j) {};
\node[elementnode,fill=orange!15] (b\i\j) at (0.6em*\i+5.5em,0.6em*\j) {};
\node [anchor=south west,minimum height=1.8em,minimum width=3em,inner sep=0.1pt,very thick,blue!60,draw] (n1) at ([xshift=0em,yshift=0em]a00.south west) {};
......
% "Structure space" diagram (fig:15-30): three dashed clusters of neural-architecture
% names — attention-based, recurrent-unit-based, and convolution-unit-based — each word
% backed by a radially fading orange blob. Per the legend (l4), a darker/less-faded blob
% marks a model presumed stronger for the task at hand.
\begin{tikzpicture}
% ---- shared node styles ----
\tikzstyle{node}=[minimum height=2.5em,minimum width=8em,draw,rounded corners=2pt,thick,drop shadow]
\tikzstyle{labelnode}=[minimum height=1.8em]
\tikzstyle{word}=[minimum height=1.8em,font=\scriptsize]
% Three word/blob style families, paired by suffix:
%   "s" = small blob (6em), strongest fade (inner transparent!60), footnotesize text
%   "n" = medium blob (9em), medium fade (inner transparent!30), normal-size text
%   "l" = large blob (12em), weakest fade (inner transparent!0), \Large text
\tikzfading[name=fadeouts, inner color=transparent!60,outer color=transparent!100]
\tikzstyle{wordnodes}=[inner sep=0mm,font=\footnotesize,text=white]
\tikzstyle{cnodes}=[path fading=fadeouts,minimum size=6em,fill=orange]
\tikzfading[name=fadeoutn, inner color=transparent!30,outer color=transparent!100]
\tikzstyle{wordnoden}=[inner sep=0mm,text=white]
\tikzstyle{cnoden}=[path fading=fadeoutn,minimum size=9em,fill=orange]
\tikzfading[name=fadeoutl, inner color=transparent!0,outer color=transparent!100]
\tikzstyle{wordnodel}=[inner sep=0mm,font=\Large,text=white]
\tikzstyle{cnodel}=[path fading=fadeoutl,minimum size=12em,fill=orange]
% Per-cluster blob size overrides (attn uses the family defaults above).
\tikzstyle{attn}=[]
\tikzstyle{rnn}=[minimum size=7em]
\tikzstyle{cnn}=[minimum size=5em]
% ---- outer-box label ----
\node[anchor=north west] (label) at (0,0){\small\bfnew{结构空间}};
% ---- cluster 1: attention-based structures (w11..w19) ----
\node[anchor=north west,wordnodes] (w11) at ([xshift=-0em,yshift=-2.5em]label.south){Reformer};
\node[anchor=north west,wordnodel] (w12) at ([xshift=0.2em,yshift=-1em]w11.south east){Transformer-XL};
\node[anchor=north,wordnodel] (w13) at ([xshift=-1.5em,yshift=-0.5em]w12.south){Transformer-DLCL};
\node[anchor=north,wordnodes] (w14) at ([xshift=-1em,yshift=-0.5em]w13.south){Transformer};
\node[anchor=north west,wordnodel] (w15) at ([xshift=2em,yshift=-0.5em]w14.south east){BERT};
\node[anchor=north,wordnodes] (w16) at ([xshift=-2em,yshift=-1em]w14.south){Transformer-ANN};
\node[anchor=north west,wordnodes] (w17) at ([xshift=-0em,yshift=-1em]w16.south east){Transformer-SAN};
\node[anchor=north,wordnoden] (w18) at ([xshift=-0em,yshift=-1.5em]w16.south){ALBERT};
\node[anchor=north west,wordnodes] (w19) at ([xshift=-0em,yshift=-0.5em]w18.south east){universal Transformer};
% Invisible spacer nodes that stretch the fit box beyond the outermost words.
\node[anchor=north west,word] (we1) at ([xshift=-0em,yshift=0.3em]w11.north west){};
\node[anchor=south east,word] (we2) at ([xshift=-0em,yshift=-0.3em]w19.south east){};
% Blobs go on the background layer so the white words stay readable on top.
\begin{pgfonlayer}{background}
\node[anchor=center,cnodes] (b11) at ([xshift=-0em,yshift=-0em]w11.center){};
\node[anchor=center,cnodel] (b12) at ([xshift=-0em,yshift=-0em]w12.center){};
\node[anchor=center,cnodel] (b13) at ([xshift=-0em,yshift=-0em]w13.center){};
\node[anchor=center,cnodes] (b14) at ([xshift=-0em,yshift=-0em]w14.center){};
\node[anchor=center,cnodel] (b15) at ([xshift=-2em,yshift=-0em]w15.center){};
\node[anchor=center,cnodes] (b16) at ([xshift=-0em,yshift=-0em]w16.center){};
\node[anchor=center,cnodes] (b17) at ([xshift=-0em,yshift=-0em]w17.center){};
\node[anchor=center,cnoden] (b18) at ([xshift=-0em,yshift=-0em]w18.center){};
\node[anchor=center,cnodes] (b19) at ([xshift=-0em,yshift=-0em]w19.center){};
% Dashed box fitted around the cluster's words and spacers.
\node [rectangle,inner sep=1em,draw=black,dashed,thick,rounded corners=8pt] [fit = (w11) (w15) (w18) (w19) (we1) (we2)] (box1) {};
% Extra unlabeled filler blobs to suggest unnamed structures in the space.
\node[anchor=center,cnodes] (bb1) at ([xshift=1em,yshift=-0em]w18.east){};
\node[anchor=center,cnodes] (bb2) at ([xshift=-0.5em,yshift=-0em]w13.west){};
\node[anchor=center,cnodes] (bb3) at ([xshift=-0.5em,yshift=0.5em]w18.west){};
\end{pgfonlayer}
\node[anchor=south,word] (l1) at ([xshift=-0em,yshift=-0.5em]box1.north){基于注意力的结构};
% ---- cluster 2: recurrent-unit-based structures (w21..w25) ----
\node[anchor=south west,wordnoden] (w21) at ([xshift=6.5em,yshift=1em]w12.north east){SRU};
\node[anchor=north west,wordnodel] (w22) at ([xshift=0.2em,yshift=0.5em]w21.south east){GRU};
\node[anchor=north west,wordnoden] (w23) at ([xshift=0em,yshift=0em]w22.south east){RNN};
\node[anchor=north,wordnoden] (w24) at ([xshift=0em,yshift=-1.5em]w22.south){LSTM};
\node[anchor=north,wordnodel] (w25) at ([xshift=0em,yshift=-0.5em]w24.south){Bi-LSTM};
\begin{pgfonlayer}{background}
\node[anchor=center,cnoden,rnn] (b21) at ([xshift=-0em,yshift=-0em]w21.center){};
\node[anchor=center,cnodel,rnn] (b22) at ([xshift=-0em,yshift=-0em]w22.center){};
\node[anchor=center,cnoden,rnn] (b23) at ([xshift=-0em,yshift=-0em]w23.center){};
\node[anchor=center,cnoden,rnn] (b24) at ([xshift=-0em,yshift=-0em]w24.center){};
\node[anchor=center,cnodel,rnn] (b25) at ([xshift=-0em,yshift=-0em]w25.center){};
\node [rectangle,inner sep=1em,draw=black,dashed,thick,rounded corners=8pt] [fit = (w21) (w25) (w23)] (box2) {};
% Unlabeled filler blobs around the recurrent cluster.
\node[anchor=center,cnodes] (bb4) at ([xshift=-0.5em,yshift=-0em]w24.west){};
\node[anchor=center,cnodes] (bb5) at ([xshift=0.5em,yshift=-0em]w24.west){};
\node[anchor=south east,cnodel,minimum size=4em] (bb6) at ([xshift=0em,yshift=1em]w21.north west){};
\node[anchor=south west,cnodel,minimum size=4em] (bb7) at ([xshift=-0.5em,yshift=0.5em]w23.north east){};
\node[anchor=west,cnodel,minimum size=4em] (bb8) at ([xshift=-0em,yshift=0em]w23.east){};
\node[anchor=south,cnodel,minimum size=4em] (bb9) at ([xshift=0.2em,yshift=0em]bb8.north){};
\end{pgfonlayer}
\node[anchor=south,word] (l2) at ([xshift=-0em,yshift=-0.5em]box2.north){基于循环单元的结构};
% ---- cluster 3: convolution-unit-based structures (w31..w36) ----
\node[anchor= west,wordnoden] (w31) at ([xshift=4em,yshift=-0.2em]w17.east){GoogleNet};
\node[anchor=north,wordnodes] (w32) at ([xshift=2em,yshift=-0.2em]w31.south){ResNet};
\node[anchor=north,wordnoden] (w33) at ([xshift=0em,yshift=-1.5em]w31.south){LeNet};
\node[anchor=east,wordnoden] (w34) at ([xshift=2.5em,yshift=0.4em]w32.east){CNN};
\node[anchor=south west,wordnoden] (w35) at ([xshift=0.5em,yshift=0.3em]w34.north east){AlexNet};
\node[anchor=north,wordnodel] (w36) at ([xshift=-1em,yshift=-2em]w35.south){VGG-Net};
\begin{pgfonlayer}{background}
\node[anchor=center,cnodel,cnn] (b31) at ([xshift=-0.5em,yshift=-0em]w31.center){};
\node[anchor=center,cnoden,cnn] (b32) at ([xshift=-0em,yshift=-0em]w32.center){};
\node[anchor=center,cnodel,cnn] (b33) at ([xshift=-0em,yshift=-0em]w33.center){};
\node[anchor=center,cnodel,cnn] (b34) at ([xshift=-0em,yshift=-0em]w34.center){};
\node[anchor=center,cnoden,cnn] (b35) at ([xshift=-0em,yshift=-0em]w35.center){};
\node[anchor=center,cnodel,cnn] (b36) at ([xshift=-0em,yshift=-0em]w36.center){};
\node [rectangle,inner sep=0.5em,draw=black,dashed,thick,rounded corners=8pt] [fit = (w31) (w33) (w35) (w36)] (box3) {};
% Unlabeled filler blobs around the convolution cluster.
\node[anchor=center,cnodes] (bb10) at ([xshift=1em,yshift=-0em]w31.west){};
\node[anchor=center,cnodes] (bb11) at ([xshift=0.5em,yshift=-0em]w34.west){};
\node[anchor=center,cnodes] (bb12) at ([xshift=0em,yshift=1em]w34.north){};
\end{pgfonlayer}
\node[anchor=south,word] (l3) at ([xshift=-0em,yshift=-0.5em]box3.north){基于卷积单元的结构};
% ---- outer solid box around everything, plus the color legend ----
% NOTE(review): `ubue`-style color `ublue` is presumably defined in the book's preamble — confirm.
\node [rectangle,inner sep=1em,draw=black,very thick,rounded corners=8pt] [fit = (label) (box1) (box2) (box3)] (box4) {};
\node[anchor=south east,word,text=ublue] (l4) at ([xshift=-0em,yshift=0em]box4.north east){颜色越深表示模型对当前任务的建模能力越强};
\end{tikzpicture}
\ No newline at end of file
......@@ -85,7 +85,7 @@
\noindent 进一步,$\mathbi{e}_{ij}$被定义为:
\begin{eqnarray}
\mathbi{e}_{ij} &=& \frac{(\mathbi{x}_i \mathbi{W}_Q){(\mathbi{x}_j \mathbi{W}_K)}^{T}}{\sqrt{d_k}}
\mathbi{e}_{ij} &=& \frac{(\mathbi{x}_i \mathbi{W}_Q){(\mathbi{x}_j \mathbi{W}_K)}^{\textrm{T}}}{\sqrt{d_k}}
\label{eq:15-6}
\end{eqnarray}
......@@ -122,8 +122,8 @@
\noindent 相比于公式\eqref{eq:15-4},公式\eqref{eq:15-10}在计算$\mathbi{z}_i$时引入了额外的向量$\mathbi{a}_{ij}^V$,用它来表示位置$i$与位置$j$之间的相对位置信息。同时在计算注意力权重时对$\mathbi{K}$进行修改,同样引入了$\mathbi{a}_{ij}^K$向量表示位置$i$与位置$j$之间的相对位置。在公式\eqref{eq:15-6}的基础上,注意力权重的计算方式调整为:
\begin{eqnarray}
\mathbi{e}_{ij} &=& \frac{\mathbi{x}_i \mathbi{W}_Q{(\mathbi{x}_j \mathbi{W}_K + \mathbi{a}_{ij}^K )}^{T}}{\sqrt{d_k}} \nonumber \\
&=& \frac{\mathbi{x}_i \mathbi{W}_Q{(\mathbi{x}_j \mathbi{W}_K)}^{T} + \mathbi{x}_i \mathbi{W}_Q{(\mathbi{a}_{ij}^K )}^{T}}{\sqrt{d_k}}
\mathbi{e}_{ij} &=& \frac{\mathbi{x}_i \mathbi{W}_Q{(\mathbi{x}_j \mathbi{W}_K + \mathbi{a}_{ij}^K )}^{\textrm{T}}}{\sqrt{d_k}} \nonumber \\
&=& \frac{\mathbi{x}_i \mathbi{W}_Q{(\mathbi{x}_j \mathbi{W}_K)}^{\textrm{T}} + \mathbi{x}_i \mathbi{W}_Q{(\mathbi{a}_{ij}^K )}^{\textrm{T}}}{\sqrt{d_k}}
\label{eq:15-11}
\end{eqnarray}
......@@ -132,25 +132,25 @@
\vspace{0.5em}
\item {\small\bfnew{Transformer-XL}}\upcite{Dai2019TransformerXLAL}。在Transformer中,模型的输入由词嵌入表示与绝对位置编码组成,例如,对于输入层有,$\mathbi{x}_i = \mathbi{E}_{x_i} + \mathbi{U}_i$$\mathbi{x}_j=\mathbi{E}_{x_j} + \mathbi{U}_j$,其中$\mathbi{E}_{x_i} $$\mathbi{E}_{x_j} $表示词嵌入,$\mathbi{U}_i$$\mathbi{U}_j$表示绝对位置编码(正余弦函数)。将$\mathbi{x}_i$$\mathbi{x}_j$代入公式\eqref{eq:15-6}中可以得到:
\begin{eqnarray}
\mathbi{e}_{ij} &=& \frac{(\mathbi{E}_{x_i} + \mathbi{U}_i)\mathbi{W}_Q{((\mathbi{E}_{x_j} + \mathbi{U}_j)\mathbi{W}_K)}^{T}}{\sqrt{d_k}}
\mathbi{e}_{ij} &=& \frac{(\mathbi{E}_{x_i} + \mathbi{U}_i)\mathbi{W}_Q{((\mathbi{E}_{x_j} + \mathbi{U}_j)\mathbi{W}_K)}^{\textrm{T}}}{\sqrt{d_k}}
\label{eq:15-12}
\end{eqnarray}
\noindent 这里使用$A_{ij}^{\rm abs}$表示公式\eqref{eq:15-12}中等式右侧的分子部分,并对其进行展开:
\begin{eqnarray}
A_{ij}^{\rm abs} &=& \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{T}\mathbi{E}_{x_j}^{T}}_{\textrm{(a)}} + \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{T}\mathbi{U}_{j}^{T}}_{\textrm{(b)}} + \nonumber \\
& & \underbrace{\mathbi{U}_i\mathbi{W}_Q\mathbi{W}_{K}^{T}\mathbi{E}_{x_j}^{T}}_{\textrm{(c)}} + \underbrace{\mathbi{U}_i\mathbi{W}_Q\mathbi{W}_{K}^{T}\mathbi{U}_{j}^{T}}_{\textrm{(d)}}
A_{ij}^{\rm abs} &=& \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{\textrm{T}}\mathbi{E}_{x_j}^{\textrm{T}}}_{\textrm{(a)}} + \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{\textrm{T}}\mathbi{U}_{j}^{\textrm{T}}}_{\textrm{(b)}} + \nonumber \\
& & \underbrace{\mathbi{U}_i\mathbi{W}_Q\mathbi{W}_{K}^{\textrm{T}}\mathbi{E}_{x_j}^{\textrm{T}}}_{\textrm{(c)}} + \underbrace{\mathbi{U}_i\mathbi{W}_Q\mathbi{W}_{K}^{\textrm{T}}\mathbi{U}_{j}^{\textrm{T}}}_{\textrm{(d)}}
\label{eq:15-13}
\end{eqnarray}
\noindent 其中,${\rm abs}$代表使用绝对位置编码计算得到的$A_{ij}$$\mathbi{W}_Q$$\mathbi{W}_K$表示线性变换矩阵。为了引入相对位置信息,可以将公式\eqref{eq:15-13}修改为如下形式:
\begin{eqnarray}
A_{ij}^{\rm rel} &=& \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{T}\mathbi{E}_{x_j}^{T}}_{\textrm{(a)}} + \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{T}\mathbi{R}_{i-j}^{T}}_{\textrm{(b)}} + \nonumber \\
& & \underbrace{\mathbi{u}\mathbi{W}_{K,E}^{T}\mathbi{E}_{x_j}^{T}}_{\textrm{(c)}} + \underbrace{\mathbi{v}\mathbi{W}_{K,R}^{T}\mathbi{R}_{i-j}^{T}}_{\textrm{(d)}}
A_{ij}^{\rm rel} &=& \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{\textrm{T}}\mathbi{E}_{x_j}^{\textrm{T}}}_{\textrm{(a)}} + \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{\textrm{T}}\mathbi{R}_{i-j}^{\textrm{T}}}_{\textrm{(b)}} + \nonumber \\
& & \underbrace{\mathbi{u}\mathbi{W}_{K,E}^{\textrm{T}}\mathbi{E}_{x_j}^{\textrm{T}}}_{\textrm{(c)}} + \underbrace{\mathbi{v}\mathbi{W}_{K,R}^{\textrm{T}}\mathbi{R}_{i-j}^{\textrm{T}}}_{\textrm{(d)}}
\label{eq:15-14}
\end{eqnarray}
\noindent 其中,$A_{ij}^{\rm rel}$为使用相对位置编码后位置$i$$j$关系的表示结果,$\mathbi{R}$是一个固定的正弦矩阵。不同于公式\eqref{eq:15-13},公式\eqref{eq:15-14}对(c)中的$\mathbi{E}_{x_j}^{T}$与(d)中的$\mathbi{R}_{i-j}^{T}$采用了不同的映射矩阵,分别为$\mathbi{W}_{K,E}^{T}$$\mathbi{W}_{K,R}^{T}$,这两项分别代表了键$\mathbi{K}$中的词嵌入表示和相对位置编码表示,并且由于此时只采用了相对位置编码,因此公式\eqref{eq:15-14}在(c)与(d)部分使用了$\mathbi{u}$$\mathbi{v}$两个可学习的矩阵代替$\mathbi{U}_i\mathbi{W}_Q$$\mathbi{U}_i\mathbi{W}_Q$,即查询$\mathbi{Q}$中的绝对位置编码部分。此时公式中各项的含义为:(a)表示位置$i$与位置$j$之间词嵌入的相关性,可以看作是基于内容的表示,(b)表示基于内容的位置偏置,(c)表示全局内容的偏置,(d)表示全局位置的偏置。公式\eqref{eq:15-13}中的(a)、(b)两项与前面介绍的绝对位置编码一致\upcite{Shaw2018SelfAttentionWR},并针对相对位置编码引入了额外的线性变换矩阵。同时,这种方法兼顾了全局内容偏置和全局位置偏置,可以更好地利用正余弦函数的归纳偏置特性。
\noindent 其中,$A_{ij}^{\rm rel}$为使用相对位置编码后位置$i$$j$关系的表示结果,$\mathbi{R}$是一个固定的正弦矩阵。不同于公式\eqref{eq:15-13},公式\eqref{eq:15-14}对(c)中的$\mathbi{E}_{x_j}^{\textrm{T}}$与(d)中的$\mathbi{R}_{i-j}^{\textrm{T}}$采用了不同的映射矩阵,分别为$\mathbi{W}_{K,E}^{\textrm{T}}$$\mathbi{W}_{K,R}^{\textrm{T}}$,这两项分别代表了键$\mathbi{K}$中的词嵌入表示和相对位置编码表示,并且由于此时只采用了相对位置编码,因此公式\eqref{eq:15-14}在(c)与(d)部分使用了$\mathbi{u}$$\mathbi{v}$两个可学习的矩阵代替$\mathbi{U}_i\mathbi{W}_Q$$\mathbi{U}_i\mathbi{W}_Q$,即查询$\mathbi{Q}$中的绝对位置编码部分。此时公式中各项的含义为:(a)表示位置$i$与位置$j$之间词嵌入的相关性,可以看作是基于内容的表示,(b)表示基于内容的位置偏置,(c)表示全局内容的偏置,(d)表示全局位置的偏置。公式\eqref{eq:15-13}中的(a)、(b)两项与前面介绍的绝对位置编码一致\upcite{Shaw2018SelfAttentionWR},并针对相对位置编码引入了额外的线性变换矩阵。同时,这种方法兼顾了全局内容偏置和全局位置偏置,可以更好地利用正余弦函数的归纳偏置特性。
\vspace{0.5em}
\item {\small\bfnew{结构化位置编码}}\index{结构化位置编码}(Structural Position Representations)\index{Structural Position Representations}\upcite{DBLP:conf/emnlp/WangTWS19a}。 例如,可以通过对输入句子进行依存句法分析得到句法树,根据叶子结点在句法树中的深度来表示其绝对位置,并在此基础上利用相对位置编码的思想计算节点之间的相对位置信息。
......@@ -183,7 +183,7 @@ A_{ij}^{\rm rel} &=& \underbrace{\mathbi{E}_{x_i}\mathbi{W}_Q\mathbi{W}_{K}^{T}\
\noindent 具体的形式如下:
\begin{eqnarray}
\mathbi{e}_{ij} &=& \frac{(\mathbi{x}_i \mathbi{W}_Q){(\mathbi{x}_j \mathbi{W}_K)}^{T}}{\sqrt{d_k}} + G_{ij}
\mathbi{e}_{ij} &=& \frac{(\mathbi{x}_i \mathbi{W}_Q){(\mathbi{x}_j \mathbi{W}_K)}^{\textrm{T}}}{\sqrt{d_k}} + G_{ij}
\label{eq:15-15}
\end{eqnarray}
......@@ -201,8 +201,8 @@ G_{ij} &=& - \frac{{(j - P_i)}^2}{2\sigma_i^2}
\noindent 其中,$m$表示序列长度,$p_i$$v_i$为计算的中间结果,被定义为:
\begin{eqnarray}
p_i &=& \mathbi{I}_p^T\textrm{Tanh}(\mathbi{W}_p\mathbi{Q}_i) \\
v_i &=& \mathbi{I}_d^T\textrm{Tanh}(\mathbi{W}_d\mathbi{Q}_i)
p_i &=& \mathbi{I}_p^{\textrm{T}}\textrm{Tanh}(\mathbi{W}_p\mathbi{Q}_i) \\
v_i &=& \mathbi{I}_d^{\textrm{T}}\textrm{Tanh}(\mathbi{W}_d\mathbi{Q}_i)
\label{eq:15-19}
\end{eqnarray}
......@@ -502,7 +502,7 @@ v_i &=& \mathbi{I}_d^T\textrm{Tanh}(\mathbi{W}_d\mathbi{Q}_i)
\vspace{0.5em}
\item {\small\bfnew{基于多跳的自注意力机制}}。如图\ref{fig:15-11}所示,其做法与前馈神经网络类似,首先将不同层的表示拼接成2维的句子级矩阵表示\upcite{DBLP:journals/corr/LinFSYXZB17}。之后利用类似于前馈神经网络的思想将维度为$\mathbb{R}^{d_{\textrm{model}} \times L}$的矩阵映射到维度为$\mathbb{R}^{d_{\textrm{model}} \times n_{\rm hop}}$的矩阵,如下:
\begin{eqnarray}
\mathbi{o} &=& \sigma ([\mathbi{h}^1,\ldots,\mathbi{h}^L]^{T} \cdot \mathbi{W}_1)\mathbi{W}_2
\mathbi{o} &=& \sigma ([\mathbi{h}^1,\ldots,\mathbi{h}^L]^{\textrm{T}} \cdot \mathbi{W}_1)\mathbi{W}_2
\label{eq:15-33}
\end{eqnarray}
......@@ -1128,7 +1128,8 @@ lr &=& d_{\textrm{model}}^{-0.5}\cdot step\_num^{-0.5}
%----------------------------------------------
\begin{figure}[htp]
\centering
\includegraphics[scale=0.5]{./Chapter15/Figures/figure-relationship-between-structures-in-structural-space.jpg}
\vspace{-5em}
\input{./Chapter15/Figures/figure-relationship-between-structures-in-structural-space}
\caption{结构空间内结构之间的关系}
\label{fig:15-30}
\end{figure}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论