Commit 9156e40e by 孟霞

Merge branch 'master' into 'mengxia'

Master

See merge request !407
parents 73f7d5f1 6ae34f8f
......@@ -15,9 +15,9 @@
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$\vectorn{\emph{h}}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\vectorn{\emph{h}}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\vectorn{\emph{h}}_{m}$}};
\node[] (enclabel1) at (enc1) {\tiny{$\mathbi{h}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\mathbi{h}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\mathbi{h}_{m}$}};
\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
......@@ -29,7 +29,7 @@
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\vectorn{\emph{s}}_\x$}}};
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\mathbi{s}_\x$}}};
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
......@@ -73,7 +73,7 @@
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\vectorn{\emph{s}}_i$}};
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\mathbi{s}_i$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\funp{P}(y_i|...)$}};
......@@ -90,7 +90,7 @@
\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\vectorn{\emph{C}}$}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\mathbi{C}$}};
}
\begin{pgfonlayer}{background}
......
......@@ -8,17 +8,17 @@
\begin{scope}
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\vectorn{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\vectorn{h}_2$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\mathbi{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\mathbi{h}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\vectorn{h}_m$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\mathbi{h}_m$}};
\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east);
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\vectorn{s}_{j-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\vectorn{s}_{j}$}};
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\mathbi{s}_{j-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\mathbi{s}_{j}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.2,left] {\scriptsize{$\alpha_{1,j}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\scriptsize{$\alpha_{2,j}$}};
......@@ -27,7 +27,7 @@
\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\vectorn{C}_{j}$}};
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\mathbi{C}_{j}$}};
\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\scriptsize{输出层}};
\draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
......@@ -39,11 +39,11 @@
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\scriptsize{(位置$4$)}};
{
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\vectorn{C}_j = \sum_{i} \alpha_{i,j} \vectorn{h}_i $ \ \ \ \ };
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\mathbi{C}_j = \sum_{i} \alpha_{i,j} \mathbi{h}_i $ \ \ \ \ };
}
{
\node [anchor=north west] (math2) at ([yshift=-2em,xshift=-0.2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{i'} \exp(\beta_{i',j})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\vectorn{s}_{j-1}, \vectorn{h}_i)$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\mathbi{s}_{j-1}, \mathbi{h}_i)$};
}
\begin{pgfonlayer}{background}
......
......@@ -2,9 +2,9 @@
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\vectorn{\emph{h}}_1$}};
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\mathbi{h}_1$}};
\node [anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{...}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\vectorn{\emph{h}}_m$}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\mathbi{h}_m$}};
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [anchor=west] (e2) at ([xshift=1em]e1.east) {\tiny{...}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
......@@ -19,7 +19,7 @@
\draw [->] ([xshift=0.1em]h1.east) -- ([xshift=-0.1em]h2.west);
\draw [->] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]h3.west);
\draw [->] ([xshift=-0.8em]h1.west) -- ([xshift=-0.1em]h1.west) node [pos=0,left,inner sep=2pt] {\tiny{0}};
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\vectorn{编码器}}};
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{编码器}};
{
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]h3.east) {\tiny{$e_y()$}};
......@@ -33,14 +33,14 @@
\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
}
{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\vectorn{\emph{s}}_1$}};
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\mathbi{s}_1$}};
}
{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\vectorn{\emph{s}}_2$}};
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\mathbi{s}_2$}};
}
{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\vectorn{\emph{s}}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\vectorn{\emph{s}}_4$}};
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\mathbi{s}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\mathbi{s}_4$}};
\node [anchor=west,inner sep=2pt] (s5) at ([xshift=0.3em]s4.east) {\tiny{...}};
}
{
......@@ -131,7 +131,7 @@
}
{
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c2) at ([yshift=2em]h2.north) {\tiny{$\vectorn{\emph{C}}_2$}};
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c2) at ([yshift=2em]h2.north) {\tiny{$\mathbi{C}_2$}};
\node [anchor=south] (c2label) at (c2.north) {\tiny{\textbf{注意力机制:上下文}}};
\node [anchor=south] (c2more) at ([yshift=-1.5em]c2.south) {\tiny{...}};
\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c2.250);
......@@ -143,12 +143,12 @@
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\vectorn{\emph{C}}_3$}};
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\mathbi{C}_3$}};
\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\vectorn{\emph{C}}_4$}};
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\mathbi{C}_4$}};
\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s4.west);
}
......
......@@ -104,9 +104,9 @@
%\visible<3->
{
% coverage score formula node
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\vectorn{C}_j$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\vectorn{C}_2=0.4 \times \vectorn{h}(\textrm{“你”}) + 0.4 \times \vectorn{h}(\textrm{“什么”}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \vectorn{h}(\textrm{“都”}) + 0.1 \times \vectorn{h}(\textrm{“ 没”}) + ..$}};
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\mathbi{C}_j$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\mathbi{C}_2=0.4 \times \mathbi{h}(\textrm{“你”}) + 0.4 \times \mathbi{h}(\textrm{“什么”}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \mathbi{h}(\textrm{“都”}) + 0.1 \times \mathbi{h}(\textrm{“没”}) + ...$}};
}
%\visible<3->
......@@ -138,12 +138,12 @@
%\visible<2->
{
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$\vectorn{C}_1 = \sum_{i=1}^{8} \alpha_{i1} \vectorn{h}_{i}$};
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$\mathbi{C}_1 = \sum_{i=1}^{8} \alpha_{i1} \mathbi{h}_{i}$};
}
%\visible<3->
{
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$\vectorn{C}_2 = \sum_{i=1}^{8} \alpha_{i2} \vectorn{h}_{i}$};
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$\mathbi{C}_2 = \sum_{i=1}^{8} \alpha_{i2} \mathbi{h}_{i}$};
}
\end{tikzpicture}
\ No newline at end of file
......@@ -78,8 +78,8 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at (aux71) {$\mathbi{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\mathbi{x}_t$};
\end{scope}
......
......@@ -91,8 +91,8 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at (aux71) {$\mathbi{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\mathbi{x}_t$};
\end{scope}
......
......@@ -109,11 +109,11 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at (aux71) {$\mathbi{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\mathbi{x}_t$};
{
\node[wordnode,anchor=east] () at (aux87) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=south] () at (aux78) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=east] () at (aux87) {$\mathbi{h}_{t}$};
\node[wordnode,anchor=south] () at (aux78) {$\mathbi{h}_{t}$};
}
\end{scope}
......
......@@ -84,9 +84,9 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\mathbi{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\mathbi{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\mathbi{c}_{t-1}$};
\end{scope}
......
......@@ -99,9 +99,9 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\mathbi{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\mathbi{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\mathbi{c}_{t-1}$};
\end{scope}
......
......@@ -113,11 +113,11 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\mathbi{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\mathbi{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\mathbi{c}_{t-1}$};
{
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{\emph{c}}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\mathbi{c}_{t}$};
}
\end{scope}
......
......@@ -131,15 +131,15 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\mathbi{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\mathbi{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\mathbi{c}_{t-1}$};
{
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{\emph{c}}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\mathbi{c}_{t}$};
}
{
\node[wordnode,anchor=east] () at (aux68) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=east] () at (aux68) {$\mathbi{h}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\mathbi{h}_{t}$};
}
\end{scope}
......
......@@ -4,17 +4,17 @@
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\vectorn{h}(\textrm{你})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\vectorn{h}(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\vectorn{h}(\textrm{都})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\vectorn{h}(\textrm{没})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\mathbi{h}(\textrm{你})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\mathbi{h}(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\mathbi{h}(\textrm{都})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\mathbi{h}(\textrm{没})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\vectorn{h}(\textrm{你})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\vectorn{h}(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\vectorn{h}(\textrm{都})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\vectorn{h}(\textrm{没})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\mathbi{h}(\textrm{你})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\mathbi{h}(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\mathbi{h}(\textrm{都})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\mathbi{h}(\textrm{没})$}};
\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\vectorn{s}(\textrm{you})$}};
\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\mathbi{s}(\textrm{you})$}};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
......
......@@ -141,15 +141,15 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\mathbi{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\mathbi{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\mathbi{c}_{t-1}$};
{
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{\emph{c}}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\mathbi{c}_{t}$};
}
{
\node[wordnode,anchor=east] () at (aux68) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=east] () at (aux68) {$\mathbi{h}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\mathbi{h}_{t}$};
}
\end{scope}
......@@ -170,19 +170,19 @@
\begin{scope}
{
% forget gate formula
\node[formulanode,anchor=south east,text width=10em] () at ([shift={(4\base,1.5\base)}]aux51) {遗忘门\\$\vectorn{\emph{f}}_t=\sigma(\vectorn{\emph{W}}_f[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]+\vectorn{\emph{b}}_f)$};
\node[formulanode,anchor=south east,text width=10em] () at ([shift={(4\base,1.5\base)}]aux51) {遗忘门\\$\mathbi{f}_t=\sigma(\mathbi{W}_f[\mathbi{h}_{t-1},\mathbi{x}_t]+\mathbi{b}_f)$};
}
{
% input gate formula
\node[formulanode,anchor=north east,text width=10em] () at ([shift={(4\base,-1.5\base)}]aux21) {输入门\\$\vectorn{\emph{i}}_t=\sigma(\vectorn{\emph{W}}_i[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]+\vectorn{\emph{b}}_i)$\\$\hat{\vectorn{\emph{c}}}_t=\mathrm{tanh}(\vectorn{\emph{W}}_c[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]+\vectorn{\emph{b}}_c)$};
\node[formulanode,anchor=north east,text width=10em] () at ([shift={(4\base,-1.5\base)}]aux21) {输入门\\$\mathbi{i}_t=\sigma(\mathbi{W}_i[\mathbi{h}_{t-1},\mathbi{x}_t]+\mathbi{b}_i)$\\$\hat{\mathbi{c}}_t=\mathrm{tanh}(\mathbi{W}_c[\mathbi{h}_{t-1},\mathbi{x}_t]+\mathbi{b}_c)$};
}
{
% cell update formula
\node[formulanode,anchor=south west,text width=10em] () at ([shift={(-4\base,1.5\base)}]aux59) {记忆更新\\$\vectorn{\emph{c}}_{t}=\vectorn{\emph{f}}_t\cdot \vectorn{\emph{c}}_{t-1}+\vectorn{\emph{i}}_t\cdot \hat{\vectorn{\emph{c}}}_t$};
\node[formulanode,anchor=south west,text width=10em] () at ([shift={(-4\base,1.5\base)}]aux59) {记忆更新\\$\mathbi{c}_{t}=\mathbi{f}_t\cdot \mathbi{c}_{t-1}+\mathbi{i}_t\cdot \hat{\mathbi{c}}_t$};
}
{
% output gate formula
\node[formulanode,anchor=north west,text width=10em] () at ([shift={(-4\base,-1.5\base)}]aux29) {输出门\\$\vectorn{\emph{o}}_t=\sigma(\vectorn{\emph{W}}_o[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]+\vectorn{\emph{b}}_o)$\\$\vectorn{\emph{h}}_{t}=\vectorn{\emph{o}}_t\cdot \mathrm{tanh}(\vectorn{\emph{c}}_{t})$};
\node[formulanode,anchor=north west,text width=10em] () at ([shift={(-4\base,-1.5\base)}]aux29) {输出门\\$\mathbi{o}_t=\sigma(\mathbi{W}_o[\mathbi{h}_{t-1},\mathbi{x}_t]+\mathbi{b}_o)$\\$\mathbi{h}_{t}=\mathbi{o}_t\cdot \mathrm{tanh}(\mathbi{c}_{t})$};
}
\end{scope}
\end{tikzpicture}
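下面用一段NumPy示意代码把上面四组公式(遗忘门、输入门、记忆更新、输出门)串起来(仅为帮助理解的草图,并非本书配套实现;权重、偏置与维度均为示例假设,$[\mathbi{h}_{t-1},\mathbi{x}_t]$表示向量拼接,“$\cdot$”为按元素乘):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_cell(x_t, h_prev, c_prev, W_f, b_f, W_i, b_i, W_c, b_c, W_o, b_o):
    hx = np.concatenate([h_prev, x_t])   # [h_{t-1}, x_t]
    f = sigmoid(W_f @ hx + b_f)          # 遗忘门 f_t
    i = sigmoid(W_i @ hx + b_i)          # 输入门 i_t
    c_hat = np.tanh(W_c @ hx + b_c)      # 候选记忆 c^_t
    c = f * c_prev + i * c_hat           # 记忆更新 c_t
    o = sigmoid(W_o @ hx + b_o)          # 输出门 o_t
    h = o * np.tanh(c)                   # 输出状态 h_t
    return h, c

# 用法示例:隐层维度4、输入维度3(数值仅为演示)
rng = np.random.default_rng(1)
W = {k: rng.normal(scale=0.1, size=(4, 7)) for k in "fico"}
b = {k: np.zeros(4) for k in "fico"}
h, c = lstm_cell(rng.normal(size=3), np.zeros(4), np.zeros(4),
                 W["f"], b["f"], W["i"], b["i"], W["c"], b["c"], W["o"], b["o"])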
......

Chapter10/Figures/mt-history.png (binary image changed: 176 KB → 182 KB)
......@@ -37,7 +37,7 @@
\parinterval 不过,有人也意识到了神经机器翻译在表示学习等方面的优势。这一时期,很多研究团队对包括机器翻译在内的序列到序列问题进行了广泛而深入的研究,注意力机制等新的方法不断被推出。这使得神经机器翻译系统在翻译品质上逐渐体现出优势,甚至超越了当时的统计机器翻译系统。正当大家在讨论神经机器翻译是否能取代统计机器翻译成为下一代机器翻译范式的时候,一些互联网企业推出了以神经机器翻译技术为内核的在线机器翻译服务,在很多场景下的翻译品质显著超越了当时最好的统计机器翻译系统。这也引发了学术界和产业界对神经机器翻译的讨论。随着关注度的不断升高,神经机器翻译的研究吸引了更多的科研机构和企业的投入,神经机器翻译系统的翻译品质得到进一步提升。
\parinterval 在短短5-6年间,神经机器翻译从一个新生的概念已经成长为机器翻译领域的最前沿技术之一,在各种机器翻译评测和应用中呈全面替代统计机器翻译之势。比如,从近几年WMT、CCMT等评测的结果来看,神经机器翻译已经处于绝对的统治地位,在不同语种和领域的翻译任务中,成为各参赛系统的标配。此外,从ACL等自然语言处理顶级会议的发表论文看,神经机器翻译是毫无疑问的焦点,在论文数量上呈明显的增长趋势,这也体现了学术界对该方法的热情。至今,国内外的很多机构都推出了自己研发的神经机器翻译系统,整个研究和产业生态欣欣向荣。图\ref{fig:10-1}展示了包含神经机器翻译在内的机器翻译发展简史。
\parinterval 在短短5-6年间,神经机器翻译从一个新生的概念已经成长为机器翻译领域的最前沿技术之一,在各种机器翻译评测和应用中呈全面替代统计机器翻译之势。比如,从近几年WMT、CCMT等评测的结果来看,神经机器翻译已经处于绝对的统治地位,在不同语种和领域的翻译任务中,成为各参赛系统的标配。此外,从ACL等自然语言处理顶级会议的发表论文看,神经机器翻译在论文数量上呈明显的增长趋势,这也体现了学术界对该方法的热情。至今,国内外的很多机构都推出了自己研发的神经机器翻译系统,整个研究和产业生态欣欣向荣。图\ref{fig:10-1}展示了包含神经机器翻译在内的机器翻译发展简史。
%----------------------------------------------
\begin{figure}[htp]
......@@ -52,11 +52,11 @@
\begin{itemize}
\vspace{0.3em}
\item 自上世纪末所发展起来的基于数据驱动的方法为神经机器翻译提供了很好的基础。本质上,神经机器翻译仍然是一种基于统计建模的数据驱动的方法,因此无论是对问题的基本建模方式,还是训练统计模型所使用到的带标注数据,都可以复用机器翻译领域以前的研究成果。特别是机器翻译长期的发展已经积累了大量的双语、单语数据,这些数据在统计机器翻译时代就发挥了很大作用。随着时间的推移,数据规模和质量又得到进一步提升,包括一些评测基准、任务设置都已经非常完备,研究者可以直接在数据条件全部具备的情况下开展神经机器翻译的研究工作,这些都省去了大量的时间成本。从这个角度说,神经机器翻译是站在巨人的肩膀上才发展起来的。
\item 自上世纪末所发展起来的基于数据驱动的方法为神经机器翻译提供了很好的基础。本质上,神经机器翻译仍然是一种基于统计建模的数据驱动的方法,因此无论是对问题的基本建模方式,还是训练统计模型所使用到的带标注数据,都可以复用机器翻译领域以前的研究成果。特别是机器翻译长期的发展已经积累了大量的双语、单语数据,这些数据在统计机器翻译时代就发挥了很大作用。随着时间的推移,数据规模和质量又得到进一步提升,包括一些评测基准、任务设置都已经非常完备,研究者可以直接在数据条件全部具备的情况下开展神经机器翻译的研究工作,这些都节省了大量的时间成本。从这个角度说,神经机器翻译是站在巨人的肩膀上才发展起来的。
\vspace{0.3em}
\item 深度学习经过长时间的酝酿终于爆发,为机器翻译等自然语言处理任务提供了新的思路和技术手段。神经机器翻译的不断壮大伴随着深度学习技术的发展。在深度学习的视角下,语言文字可以被表示成抽象的实数向量这种文字的表示结果可以被自动学习,为机器翻译建模提供了更大的灵活性。相对于神经机器翻译,深度学习的发展更加曲折。虽然深度学习经过了漫长的起伏过程,但是神经机器翻译恰好出现在深度学习逐渐走向成熟的阶段。反过来说,受到深度学习及相关技术空前发展的影响,自然语言处理的范式也发生了变化,神经机器翻译的出现只是这种趋势下的一种必然。
\item 深度学习经过长时间的酝酿终于爆发,为机器翻译等自然语言处理任务提供了新的思路和技术手段。神经机器翻译的不断壮大伴随着深度学习技术的发展。在深度学习的视角下,语言文字可以被表示成抽象的实数向量,这种文字的表示结果可以被自动学习,为机器翻译建模提供了更大的灵活性。相对于神经机器翻译,深度学习的发展更加曲折。虽然深度学习经过了漫长的起伏过程,但是神经机器翻译恰好出现在深度学习逐渐走向成熟的阶段。反过来说,受到深度学习及相关技术空前发展的影响,自然语言处理的范式也发生了变化,神经机器翻译的出现只是这种趋势下的一种必然。
\vspace{0.3em}
\item 此外,计算机算力的提升也为神经机器翻译提供了很好的支撑。与很多神经网络方法一样,神经机器翻译也依赖大量的基于浮点数的矩阵运算。甚至在本世纪初,大规模的矩阵运算仍然依赖非常昂贵的CPU集群系统,但是随着GPU等相关技术的发展,在相对低成本的设备上已经可以完成非常复杂的浮点并行运算。这使得包括神经机器翻译在内的很多基于深度学习的系统可以进行大规模实验,随着实验周期的缩短,相关研究和系统的迭代周期大大缩短。实际上,计算机硬件运算能力一直是稳定提升的,神经机器翻译只是受益于运算能力的阶段性突破。
\item 此外,计算机算力的提升也为神经机器翻译提供了很好的支撑。与很多神经网络方法一样,神经机器翻译也依赖大量基于浮点数的矩阵运算。甚至在本世纪初,大规模的矩阵运算仍然依赖非常昂贵的CPU集群系统,但是随着GPU等相关技术的发展,在相对低成本的设备上已经可以完成非常复杂的浮点并行运算。这使得包括神经机器翻译在内的很多基于深度学习的系统可以进行大规模实验,随着实验周期的缩短,相关研究和系统的迭代周期也大大缩短。实际上,计算机硬件运算能力一直是稳定提升的,神经机器翻译只是受益于运算能力的阶段性突破。
\vspace{0.3em}
\item 还有,翻译需求的不断增加也为机器翻译技术提供了新的机会。在近几年,无论是翻译品质,还是翻译语种数量,甚至不同的翻译场景,都对机器翻译有了更高的要求。人们迫切需要一种品质更高、翻译效果稳定的机器翻译方法,神经机器翻译恰好满足了这些要求。当然,应用端需求的增加也会反推机器翻译技术的发展,二者相互促进。
\vspace{0.3em}
......@@ -68,9 +68,9 @@
% NEW SUB-SECTION 10.1.1
%----------------------------------------------------------------------------------------
\subsection{神经机器翻译的起源}
\parinterval 从广义上讲,神经机器翻译是一种基于人工神经网络的方法,它把翻译过程描述为可以用人工神经网络表示的函数所有的训练和推断都在这些函数上进行。由于神经机器翻译中的神经网络可以用连续可微函数表示,因此这类方法也可以用基于梯度的方法进行优化,相关技术非常成熟。更为重要的是,在神经网络的设计中,研究者引入了分布式表示的概念,这也是近些年自然语言处理领域的重要成果之一。传统统计机器翻译仍然把词序列看作离散空间里的由多个特征函数描述的点,类似于$n$-gram语言模型,这类模型对数据稀疏问题非常敏感。此外,人工设计特征也在一定程度上限制了模型对问题的表示能力。神经机器翻译把文字序列表示为实数向量,一方面避免了特征工程繁重的工作,另一方面使得系统可以对文字序列的“表示”进行学习。可以说,神经机器翻译的成功很大程度上源自“ 表示学习”这种自然语言处理的新范式的出现。在表示学习的基础上,注意力机制、深度神经网络等技术都被应用于神经机器翻译,使其得以进一步发展。
\parinterval 从广义上讲,神经机器翻译是一种基于人工神经网络的方法,它把翻译过程描述为可以用人工神经网络表示的函数,所有的训练和推断都在这些函数上进行。由于神经机器翻译中的神经网络可以用连续可微函数表示,因此这类方法也可以用基于梯度的方法进行优化,相关技术非常成熟。更为重要的是,在神经网络的设计中,研究者引入了分布式表示的概念,这也是近些年自然语言处理领域的重要成果之一。传统统计机器翻译仍然把词序列看作离散空间里的由多个特征函数描述的点,类似于$n$-gram语言模型,这类模型对数据稀疏问题非常敏感。此外,人工设计特征也在一定程度上限制了模型对问题的表示能力。神经机器翻译把文字序列表示为实数向量,一方面避免了特征工程繁重的工作,另一方面使得系统可以对文字序列的“表示”进行学习。可以说,神经机器翻译的成功很大程度上源自“表示学习”这种自然语言处理的新范式的出现。在表示学习的基础上,注意力机制、深度神经网络等技术都被应用于神经机器翻译,使其得以进一步发展。
\parinterval 虽然神经机器翻译中大量使用了人工神经网络方法,但是它并不是最早在机器翻译中使用人工神经网络的框架。实际上,人工神经网络在机器翻译中应用的历史要远早于现在的神经机器翻译。 在统计机器翻译时代,也有很多研究者利用人工神经网络进行机器翻译系统模块的构建\upcite{devlin-etal-2014-fast,Schwenk_continuousspace},比如,研究人员成功地在统计机器翻译系统中使用了基于神经网络的联合表示模型,取得了很好的效果\upcite{devlin-etal-2014-fast}
\parinterval 虽然神经机器翻译中大量使用了人工神经网络方法,但是它并不是最早在机器翻译中使用人工神经网络的框架。实际上,人工神经网络在机器翻译中应用的历史要远早于现在的神经机器翻译。 在统计机器翻译时代,也有很多研究者利用人工神经网络进行机器翻译系统模块的构建\upcite{devlin-etal-2014-fast,Schwenk_continuousspace},比如,研究人员成功地在统计机器翻译系统中使用了基于神经网络的联合表示模型,取得了很好的效果\upcite{devlin-etal-2014-fast}
\parinterval 不过,以上这些工作大多都是在系统的局部模块中使用人工神经网络和深度学习方法。与之不同的是,神经机器翻译是用人工神经网络完成整个翻译过程的建模,这样做的一个好处是,整个系统可以进行端到端学习,无需引入对任何翻译的隐含结构假设。这种利用端到端学习对机器翻译进行神经网络建模的方式也就成为了现在大家所熟知的神经机器翻译。这里简单列出部分代表性的工作:
......@@ -78,13 +78,13 @@
\vspace{0.3em}
\item 早在2013年,Nal Kalchbrenner和Phil Blunsom提出了一个基于编码器-解码器结构的新模型\upcite{kalchbrenner-blunsom-2013-recurrent}。该模型用卷积神经网络(CNN)将源语言编码成实数向量,之后用循环神经网络(RNN)将连续向量转换成目标语言。这使得模型不需要进行词对齐、特征提取等工作,就能够自动学习源语言的信息。这也是一种端到端学习的方法。不过,这项工作的实现较复杂,而且方法存在梯度消失/爆炸等问题\upcite{HochreiterThe,BENGIO1994Learning},因此并没有成为后来神经机器翻译的基础框架。
\vspace{0.3em}
\item 2014年,Ilya Sutskever等人提出了序列到序列(seq2seq)学习的方法,同时将长短记忆结构(LSTM)引入到神经机器翻译中,这个方法解决了梯度爆炸/消失的问题,并且通过遗忘门的设计让网络选择性地记忆信息,缓解了序列中长距离依赖的问题\upcite{NIPS2014_5346}。但是该模型在进行编码的过程中,将不同长度的源语言句子压缩成了一个固定长度的向量,句子越长,损失的信息越多,同时该模型无法对输入和输出序列之间的对齐进行建模,因此并不能有效的保证翻译质量。
\item 2014年,Ilya Sutskever等人提出了序列到序列(seq2seq)学习的方法,同时将长短时记忆结构(LSTM)引入到神经机器翻译中,这个方法解决了梯度消失/爆炸的问题,并且通过遗忘门的设计让网络选择性地记忆信息,缓解了序列中长距离依赖的问题\upcite{NIPS2014_5346}。但是该模型在进行编码的过程中,将不同长度的源语言句子压缩成了一个固定长度的向量,句子越长,损失的信息越多,同时该模型无法对输入和输出序列之间的对齐进行建模,因此并不能有效地保证翻译质量。
\vspace{0.3em}
\item 同年,Dzmitry Bahdanau等人首次将{\small\bfnew{注意力机制}}\index{注意力机制}(Attention Mechanism\index{Attention Mechanism})应用到机器翻译领域,在机器翻译任务上对翻译和局部翻译单元之间的对应关系同时建模\upcite{bahdanau2014neural}。Bahdanau等人工作的意义在于,使用了更加有效的模型来表示源语言的信息,同时使用注意力机制对两种语言不同部分之间的相互联系进行建模。这种方法可以有效地处理长句子的翻译,而且注意力的中间结果具有一定的可解释性\footnote{比如,目标语言和源语言句子不同单词之间的注意力强度能够在一定程度上反映单词之间的互译程度。}。然而相比于前人的神经机器翻译模型,注意力模型也引入了额外的成本,计算量较大。
\vspace{0.3em}
\item 2016年谷歌公司发布了基于多层循环神经网络方法的GNMT系统。该系统集成了当时的神经机器翻译技术,并进行了诸多的改进。它的性能显著优于基于短语的机器翻译系统\upcite{Wu2016GooglesNM},引起了研究者的广泛关注。在之后不到一年的时间里,脸书公司采用卷积神经网络(CNN)研发了新的神经机器翻译系统\upcite{DBLP:journals/corr/GehringAGYD17},实现了比基于循环神经网络(RNN)系统更高的翻译水平,并大幅提升翻译速度。
\vspace{0.3em}
\item 2017年,Ashish Vaswani等人提出了新的翻译模型Transformer。其完全抛弃了CNN、RNN等结构,仅仅通过自注意力机制和前馈神经网络,不需要使用序列对齐的循环框架就展示出强大的性能,并且巧妙解决了翻译中长距离依赖问题\upcite{NIPS2017_7181}。Transformer是第一个完全基于注意力机制搭建的模型,不仅训练速度更快,在翻译任务上也获得了更好的结果,一跃成为目前最主流的神经机器翻译框架。
\item 2017年,Ashish Vaswani等人提出了新的翻译模型Transformer。其完全抛弃了CNN、RNN等结构,仅仅通过自注意力机制和前馈神经网络,不需要使用序列对齐的循环框架就展示出强大的性能,并且巧妙解决了翻译中长距离依赖问题\upcite{NIPS2017_7181}。Transformer是第一个完全基于注意力机制搭建的模型,不仅训练速度更快,在翻译任务上也获得了更好的结果,一跃成为目前最主流的神经机器翻译框架。
\vspace{0.3em}
\end{itemize}
......@@ -117,9 +117,9 @@
%----------------------------------------------
\vspace{-0.3em}
\parinterval 可以明显地看到译文2更加通顺,意思的表达更加准确,翻译质量明显高于译文1。这个例子基本反应出统计机器翻译和神经机器翻译的差异性。当然,这里并不是要讨论统计机器翻译和神经机器翻译孰优孰劣。但是,很多场景中发现神经机器翻译系统可以生成非常流畅的译文,易于人工阅读和修改。
\parinterval 可以明显地看到译文2更加通顺,意思的表达更加准确,翻译质量明显高于译文1。这个例子基本反应出统计机器翻译和神经机器翻译的差异性。当然,这里并不是要讨论统计机器翻译和神经机器翻译孰优孰劣,只是很多场景中发现神经机器翻译系统可以生成非常流畅的译文,易于人工阅读和修改。
\parinterval 在很多量化的评价中也可以看到神经机器翻译的优势。回忆一下{\chapterfour}提到的机器翻译质量的自动评估指标中,使用最广泛的一种指标是BLEU。在2010年前,在由美国国家标准和科技机构(NIST)举办的汉英机器翻译评测中(比如汉英MT08数据集),30\%以上的BLEU值对于基于统计方法的翻译系统来说就已经是当时最顶尖的结果了。而现在的神经机器翻译系统,则可以轻松的将BLEU提高至45\%以上。
\parinterval 在很多量化的评价中也可以看到神经机器翻译的优势。回忆一下{\chapterfour}提到的机器翻译质量的自动评估指标中,使用最广泛的一种指标是BLEU。2010年前,在由美国国家标准和科技机构(NIST)举办的汉英机器翻译评测中(比如汉英MT08数据集),30\%以上的BLEU值对于基于统计方法的翻译系统来说就已经是当时最顶尖的结果了。而现在的神经机器翻译系统,则可以轻松地将BLEU提高至45\%以上。
%----------------------------------------------
\begin{figure}[htp]
......@@ -191,7 +191,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
\end{table}
%----------------------------------------------
\parinterval 在最近两年,神经机器翻译的发展更加迅速,新的模型方法层出不穷。表\ref{tab:10-3}给出了到2020年为止一些主流的神经机器翻译模型的对比。可以看到,相比2017年,2018-2020年中机器翻译仍然有明显的进步。
\parinterval 在最近两年,神经机器翻译的发展更加迅速,新的模型方法层出不穷。表\ref{tab:10-3}给出了到2020年为止一些主流的神经机器翻译模型的对比。可以看到,相比2017年,2018-2020年中机器翻译仍然有明显的进步。
\vspace{0.5em}%全局布局使用
%----------------------------------------------
......@@ -239,21 +239,21 @@ NMT & 21.7 & 18.7 & -13.7 \\
\begin{itemize}
\vspace{0.5em}
\item 分布式连续空间表示模型,能捕获更多隐藏信息。神经机器翻译与统计机器翻译最大的区别在于对语言文字串的表示方法上。在统计机器翻译中,所有词串本质上都是由更小的词串(短语、规则)组合而成,也就是统计机器翻译模型利用了词串之间的组合性来表示更大的词串。统计机器翻译使用多个特征描述翻译结果,但是其仍然对应着离散的字符串的组合,因此可以把模型对问题的表示空间看做是由一个离散结构组成的集合。在神经机器翻译中,词串的表示已经被神经网络转化为多维实数向量,而且也不依赖任何的可组合性假设等其他假设来刻画离散的语言结构,从这个角度说,所有的词串分别对应了一个连续空间上的点(比如,对应多维实数空间中一个点)。这样,模型可以更好地进行优化,而且对未见样本有更好的泛化能力。此外,基于连续可微函数的机器学习算法已经相对完备,可以很容易的对问题进行建模和优化。
\item 分布式连续空间表示模型,能捕获更多隐藏信息。神经机器翻译与统计机器翻译最大的区别在于对语言文字串的表示方法。在统计机器翻译中,所有词串本质上都是由更小的词串(短语、规则)组合而成,也就是统计机器翻译模型利用了词串之间的组合性来表示更大的词串。统计机器翻译使用多个特征描述翻译结果,但是其仍然对应着离散的字符串的组合,因此可以把模型对问题的表示空间看做是由一个离散结构组成的集合。在神经机器翻译中,词串的表示已经被神经网络转化为多维实数向量,而且也不依赖任何的可组合性假设等其他假设来刻画离散的语言结构,从这个角度说,所有的词串分别对应了一个连续空间上的点(比如,对应多维实数空间中一个点)。这样,模型可以更好地进行优化,而且对未见样本有更好的泛化能力。此外,基于连续可微函数的机器学习算法已经相对完备,可以很容易地对问题进行建模和优化。
\vspace{0.5em}
\item 不需要特征工程,特征学习更加全面。经典的统计机器翻译可以通过判别式模型引入任意特征,不过这些特征需要人工设计,因此这个过程也被称为特征工程。特征工程依赖大量的人工,特别是对不同语种、不同场景的翻译任务,所采用的特征可能不尽相同,这也使得设计有效的特征成为了统计机器翻译时代最主要的工作之一。但是,由于人类自身的思维和认知水平的限制,人工设计的特征可能不全面,甚至会遗漏一些重要的翻译现象。神经机器翻译并不依赖任何人工特征的设计,或者说它的特征都隐含在分布式表示中。这些“特征”都是自动学习得到的,因此神经机器翻译并不会受到人工思维的限制,学习到的特征对问题描述更加全面。
\vspace{0.5em}
\item 不含隐含结构假设,端到端学习对问题建模更加直接。传统的自然语言处理任务会对问题进行隐含结构假设。比如,进行翻译时,统计机器翻译会假设翻译过程由短语的拼装完成。这些假设可以大大化简问题的复杂度,但是另一方面也带来了各种各样的约束条件错误的隐含假设往往会导致建模错误。神经机器翻译是一种端到端模型,它并不依赖任何隐含结构假设。这样,模型并不会受到错误的隐含结构的引导。从某种意义上说,端到端学习可以让模型更加“ 自由”地进行学习,因此往往可以学到很多传统认知上不容易理解或者不容易观测到的现象。
\item 不含隐含结构假设,端到端学习对问题建模更加直接。传统的自然语言处理任务会对问题进行隐含结构假设。比如,进行翻译时,统计机器翻译会假设翻译过程由短语的拼装完成。这些假设可以大大化简问题的复杂度,但是另一方面也带来了各种各样的约束条件,并且错误的隐含假设往往会导致建模错误。神经机器翻译是一种端到端模型,它并不依赖任何隐含结构假设。这样,模型并不会受到错误的隐含结构的引导。从某种意义上说,端到端学习可以让模型更加“ 自由”地进行学习,因此往往可以学到很多传统认知上不容易理解或者不容易观测到的现象。
\vspace{0.5em}
\item 模型结构统一,存储消耗更小。统计机器翻译系统依赖于很多模块,比如词对齐、短语(规则)表目标语言模型等等,因为所有的信息(如$n$-gram)都是离散化表示的,因此模型需要消耗大量的存储资源。同时,由于系统模块较多,开发的难度也较大。神经机器翻译的模型都是用神经网络进行表示,模型参数大多是实数矩阵,因此存储资源的消耗很小。而且神经网络可以作为一个整体进行开发和调试,系统搭建的代价相对较低。实际上,由于模型体积小,神经机器翻译也非常合适于离线小设备上的翻译任务。
\item 模型结构统一,存储消耗更小。统计机器翻译系统依赖于很多模块,比如词对齐、短语(规则)表和目标语言模型等等,因为所有的信息(如$n$-gram)都是离散化表示的,因此模型需要消耗大量的存储资源。同时,由于系统模块较多,开发的难度也较大。神经机器翻译的模型都是用神经网络进行表示,模型参数大多是实数矩阵,因此存储资源的消耗很小。而且神经网络可以作为一个整体进行开发和调试,系统搭建的代价相对较低。实际上,由于模型体积小,神经机器翻译也非常适合离线小设备上的翻译任务。
\vspace{0.5em}
\end{itemize}
\parinterval 当然,神经机器翻译也并不完美,很多问题有待解决。首先,神经机器翻译需要大规模浮点运算的支持,模型的推断速度较低。为了获得优质的翻译结果,往往需要大量GPU设备的支持,计算资源成本很高;其次,由于缺乏人类的先验知识对翻译过程的指导,神经机器翻译的运行过程缺乏可解释性,系统的可干预性也较差;此外,虽然脱离了繁重的特征工程,神经机器翻译仍然需要人工设计网络结构,包括在模型的各种超参的设置、训练策略的选择等方面,仍然需要大量人工参与。这也导致很多实验结果不容易重现。显然,完全不依赖人工进行机器翻译还很遥远。不过,随着研究者的不断攻关,很多问题也得到了解决。
\parinterval 当然,神经机器翻译也并不完美,很多问题有待解决。首先,神经机器翻译需要大规模浮点运算的支持,模型的推断速度较低。为了获得优质的翻译结果,往往需要大量GPU设备的支持,计算资源成本很高;其次,由于缺乏人类的先验知识对翻译过程的指导,神经机器翻译的运行过程缺乏可解释性,系统的可干预性也较差;此外,虽然脱离了繁重的特征工程,神经机器翻译仍然需要人工设计网络结构,在模型的各种超参的设置、训练策略的选择等方面,仍然需要大量人工参与。这也导致很多实验结果不容易重现。显然,完全不依赖人工进行机器翻译还很遥远。不过,随着研究者的不断攻关,很多问题也得到了解决。
%----------------------------------------------------------------------------------------
% NEW SECTION 10.2
......@@ -305,7 +305,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
\end{figure}
%----------------------------------------------
\parinterval 实际上,编码器-解码器模型也并不是表示学习实现的唯一途径。比如,在{\chapternine}提到的神经语言模型实际上也是一种有效的学习句子表示的方法,它所衍生出的预训练模型可以从大规模单语数据上学习句子的表示形式。这种学习会比使用少量的双语数据进行编码端和解码端的学习更加充分。相比机器翻译任务,语言模型相当于一个编码器的学习 \footnote{相比神经机器翻译的编码器,神经语言模型会多出一个输出层,这时可以直接把神经语言模型的中间层的输出作为编码器的输出。},可以无缝嵌入到神经机器翻译模型中。不过,值得注意的是,机器翻译的目的是解决双语字符串之间的映射问题,因此它所使用的句子表示是为了更好地进行翻译。从这个角度说,机器翻译中的表示学习又和语言模型中的表示学习有不同。不过,这里不会深入讨论神经语言模型和预训练与神经机器翻译之间的异同,后续章节会有相关讨论。
\parinterval 实际上,编码器-解码器模型也并不是表示学习实现的唯一途径。比如,在{\chapternine}提到的神经语言模型实际上也是一种有效的学习句子表示的方法,它所衍生出的预训练模型可以从大规模单语数据上学习句子的表示形式。这种学习会比使用少量的双语数据进行编码端和解码端的学习更加充分。相比机器翻译任务,语言模型相当于一个编码器的学习 \footnote{相比神经机器翻译的编码器,神经语言模型会多出一个输出层,这时可以直接把神经语言模型的中间层的输出作为编码器的输出。},可以无缝嵌入到神经机器翻译模型中。不过,值得注意的是,机器翻译的目的是解决双语字符串之间的映射问题,因此它所使用的句子表示是为了更好地进行翻译。从这个角度说,机器翻译中的表示学习又和语言模型中的表示学习有不同。不过,这里不会深入讨论神经语言模型和预训练与神经机器翻译之间的异同,后续章节会有相关讨论。
\parinterval 还有一点,在神经机器翻译中,句子的表示形式可以有很多选择。使用单个向量表示一个句子是一种最简单的方法。当然,也可以用矩阵、高阶张量完成表示。甚至,在解码时动态地生成源语言的表示结果。
......@@ -415,7 +415,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
\parinterval 显然,根据上下文中提到的“没/吃饭”、“很/饿”,最佳的答案是“吃饭”或者“吃东西”。也就是,对序列中某个位置的答案进行预测时需要记忆当前时刻之前的序列信息,因此,循环神经网络应运而生。实际上循环神经网络有着极为广泛的应用,例如语音识别、语言建模以及即将要介绍的神经机器翻译。
\parinterval {\chapternine}已经对循环神经网络的基本知识进行过介绍这里再回顾一下。简单来说,循环神经网络由循环单元组成。对于序列中的任意时刻,都有一个循环单元与之对应,它会融合当前时刻的输入和上一时刻循环单元的输出,生成当前时刻的输出。这样每个时刻的信息都会被传递到下一时刻,这也间接达到了记录历史信息的目的。比如,对于序列$\seq{x}=\{x_1, x_2,..., x_m\}$,循环神经网络会按顺序输出一个序列$\seq{h}=\{ \mathbi{h}_1, \mathbi{h}_2,..., \mathbi{h}_m \}$,其中$\mathbi{h}_i$表示$i$时刻循环神经网络的输出(通常为一个向量)。
\parinterval {\chapternine}已经对循环神经网络的基本知识进行过介绍,这里再回顾一下。简单来说,循环神经网络由循环单元组成。对于序列中的任意时刻,都有一个循环单元与之对应,它会融合当前时刻的输入和上一时刻循环单元的输出,生成当前时刻的输出。这样每个时刻的信息都会被传递到下一时刻,这也间接达到了记录历史信息的目的。比如,对于序列$\seq{x}=\{x_1, x_2,..., x_m\}$,循环神经网络会按顺序输出一个序列$\seq{h}=\{ \mathbi{h}_1, \mathbi{h}_2,..., \mathbi{h}_m \}$,其中$\mathbi{h}_i$表示$i$时刻循环神经网络的输出(通常为一个向量)。
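为直观起见,这里给出按上述定义逐时刻计算$\mathbi{h}_i$的一个极简NumPy草图(仅为示意,并非本书配套实现;循环单元简化为单层tanh变换,权重、维度与数值均为示例假设,实际的循环单元可替换为下文的LSTM、GRU等):

import numpy as np

def rnn_forward(x_seq, W, U, b):
    h = np.zeros(U.shape[0])            # h_0:零向量
    h_seq = []
    for x in x_seq:                     # 按时刻顺序读入序列 x_1..x_m
        h = np.tanh(W @ x + U @ h + b)  # 融合当前输入与上一时刻输出
        h_seq.append(h)
    return h_seq                        # 对应文中的 {h_1, ..., h_m}

rng = np.random.default_rng(0)
W, U, b = rng.normal(size=(5, 3)), rng.normal(size=(5, 5)), np.zeros(5)
h_seq = rnn_forward([rng.normal(size=3) for _ in range(4)], W, U, b)
print(len(h_seq), h_seq[-1].shape)      # 输出4个时刻的h,最后时刻的输出可作为句子表示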
\parinterval\ref{fig:10-9}展示了一个循环神经网络处理序列问题的实例。当前时刻循环单元的输入由上一个时刻的输出和当前时刻的输入组成,因此也可以理解为,网络当前时刻计算得到的输出是由之前的序列共同决定的,即网络在不断地传递信息的过程中记忆了历史信息。以最后一个时刻的循环单元为例,它在对“开始”这个单词的信息进行处理时,参考了之前所有词(“<sos>\ \ 我们”)的信息。
......@@ -428,7 +428,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
\end{figure}
%----------------------------------------------
\parinterval 在神经机器翻译里使用循环神经网络也很简单。只需要把源语言句子和目标语言句子分别看作两个序列,之后使用两个循环神经网络分别对其进行建模。这个过程如图\ref{fig:10-10}所示。图中,下半部分是编码器,上半部分是解码器。编码器利用循环神经网络对源语言序列逐词进行编码处理,同时利用循环单元的记忆能力,不断累积序列信息,遇到终止符<eos>后便得到了包含源语言句子全部信息的表示结果。解码器利用编码器的输出和起始符<sos>开始逐词进行解码,即逐词翻译,每得到一个译文单词,便将其作为当前时刻解码端循环单元的输入,这也是一个典型的神经语言模型的序列生成过程。解码器通过循环神经网络不断地累积已经得到的译文的信息,并继续生成下一个单词,直到遇到结束符<eos>,便得到了最终完整的译文。
\parinterval 在神经机器翻译里使用循环神经网络也很简单。只需要把源语言句子和目标语言句子分别看作两个序列,之后使用两个循环神经网络分别对其进行建模。这个过程如图\ref{fig:10-10}所示。图中,下半部分是编码器,上半部分是解码器。编码器利用循环神经网络对源语言序列逐词进行编码处理,同时利用循环单元的记忆能力,不断累积序列信息,遇到终止符<eos>后便得到了包含源语言句子全部信息的表示结果。解码器利用编码器的输出和起始符<sos>开始逐词进行解码,即逐词翻译,每得到一个译文单词,便将其作为当前时刻解码端循环单元的输入,这也是一个典型的神经语言模型的序列生成过程。解码器通过循环神经网络不断地累积已经得到的译文的信息,并继续生成下一个单词,直到遇到结束符<eos>,便得到了最终完整的译文。
%----------------------------------------------
\begin{figure}[htp]
......@@ -459,7 +459,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
\vspace{-0.5em}
\begin{itemize}
\vspace{0.5em}
\item 如何对$\seq{{x}}$$\seq{{y}}_{<j }$进行分布式表示,即{\small\sffamily\bfseries{词嵌入}}(Word Embedding)。首先,将由one-hot向量表示的源语言单词,即由0和1构成的离散化向量表示,转化为实数向量。可以把这个过程记为$\textrm{e}_x (\cdot)$。类似,可以把目标语言序列$\seq{{y}}_{<j }$中的每个单词用同样的方式进行表示,记为$\textrm{e}_y (\cdot)$
\item 如何对$\seq{{x}}$$\seq{{y}}_{<j }$进行分布式表示,即{\small\sffamily\bfseries{词嵌入}}(Word Embedding)。首先,将由one-hot向量表示的源语言单词,即由0和1构成的离散化向量表示,转化为实数向量。可以把这个过程记为$\textrm{e}_x (\cdot)$。类似,可以把目标语言序列$\seq{{y}}_{<j }$中的每个单词用同样的方式进行表示,记为$\textrm{e}_y (\cdot)$
\vspace{0.5em}
\item 如何在词嵌入的基础上获取整个序列的表示,即句子的{\small\sffamily\bfseries{表示学习}}(Representation Learning)。可以把词嵌入的序列作为循环神经网络的输入,循环神经网络最后一个时刻的输出向量便是整个句子的表示结果。如图\ref{fig:10-11}中,编码器最后一个循环单元的输出$\mathbi{h}_m$被看作是一种包含了源语言句子信息的表示结果,记为$\mathbi{C}$
\vspace{0.5em}
......@@ -468,7 +468,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
\funp{P} (y_j | \seq{{y}}_{<j},\seq{{x}}) \equiv \funp{P} ( {y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}} )
\label{eq:10-4}
\end{eqnarray}
$\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softmax的输入是循环神经网络$j$时刻的输出。在具体实现时,$\mathbi{C}$可以被简单作为第一个时刻循环单元的输入,即,当$j=1$ 时,解码器的循环神经网络会读入编码器最后一个隐层状态$ \mathbi{h}_m$(也就是$\mathbi{C}$),而其他时刻的隐层状态不直接与$\mathbi{C}$相关。最终,$\funp{P} (y_j | \seq{{y}}_{<j},\seq{{x}})$ 被表示为:
$\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softmax的输入是循环神经网络$j$时刻的输出。在具体实现时,$\mathbi{C}$可以被简单作为第一个时刻循环单元的输入,即,当$j=1$ 时,解码器的循环神经网络会读入编码器最后一个隐层状态$ \mathbi{h}_m$(也就是$\mathbi{C}$),而其他时刻的隐层状态不直接与$\mathbi{C}$相关。最终,$\funp{P} (y_j | \seq{{y}}_{<j},\seq{{x}})$ 被表示为:
\begin{eqnarray}
\funp{P} (y_j | \seq{{y}}_{<j},\seq{{x}}) \equiv
\left \{ \begin{array}{ll}
......@@ -510,7 +510,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\parinterval RNN结构使得当前时刻循环单元的状态包含了之前时间步的状态信息。但是这种对历史信息的记忆并不是无损的,随着序列变长,RNN的记忆信息的损失越来越严重。在很多长序列处理任务中(如长文本生成)都观测到了类似现象。对于这个问题,研究者们提出了{\small\bfnew{长短时记忆}}\index{长短时记忆}(Long Short-term Memory)\index{Long Short-term Memory,LSTM}模型,也就是常说的LSTM模型\upcite{HochreiterLong}。
\parinterval LSTM模型是RNN模型的一种改进。相比RNN仅传递前一时刻的状态$\mathbi{h}_{t-1}$,LSTM会同时传递两部分信息:状态信息$\mathbi{h}_{t-1}$和记忆信息$\mathbi{c}_{t-1}$。这里,$\mathbi{c}_{t-1}$是新引入的变量,它也是循环单元的一部分,用于显性记录需要记录的历史内容,$\mathbi{h}_{t-1}$$\mathbi{c}_{t-1}$在循环单元中会相互作用。LSTM通过“门”单元来动态地选择遗忘多少以前的信息和记忆多少当前的信息。LSTM中所使用的门单元结构如图\ref{fig:10-15}所示,包括遗忘门,输入门和输出门。图中$\sigma$代表Sigmoid函数,它将函数输入映射为0-1范围内的实数,用来充当门控信号。
\parinterval LSTM模型是RNN模型的一种改进。相比RNN仅传递前一时刻的状态$\mathbi{h}_{t-1}$,LSTM会同时传递两部分信息:状态信息$\mathbi{h}_{t-1}$和记忆信息$\mathbi{c}_{t-1}$。这里,$\mathbi{c}_{t-1}$是新引入的变量,它也是循环单元的一部分,用于显性记录需要记录的历史内容,$\mathbi{h}_{t-1}$和$\mathbi{c}_{t-1}$在循环单元中会相互作用。LSTM通过“门”单元来动态地选择遗忘多少以前的信息和记忆多少当前的信息。LSTM中所使用的门单元结构如图\ref{fig:10-15}所示,包括遗忘门、输入门和输出门。图中$\sigma$代表Sigmoid函数,它将函数输入映射为0-1范围内的实数,用来充当门控信号。
%----------------------------------------------
\begin{figure}[htp]
......@@ -574,7 +574,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\subsection{门控循环单元}
\parinterval LSTM 通过门控单元控制传递状态,忘记不重要的信息,记住必要的历史信息,在长序列上取得了很好的效果,但是其进行了许多门信号的计算,较为繁琐。{\small\bfnew{门循环单元}}\index{门循环单元}(Gated Recurrent Unit,GRU)\index{Gated Recurrent Unit,GRU}作为一个LSTM的变种,继承了LSTM中利用门控单元控制信息传递的思想,并对LSTM进行了简化\upcite{Cho2014Learning}。它把循环单元状态$\mathbi{h}_t$和记忆$\mathbi{c}_t$合并成一个状态$\mathbi{h}_t$,同时使用了更少的门控单元,大大提升了计算效率。
\parinterval LSTM 通过门控单元控制传递状态,忘记不重要的信息,记住必要的历史信息,在长序列上取得了很好的效果,但是其进行了许多门信号的计算,较为繁琐。{\small\bfnew{门循环单元}}\index{门循环单元}(Gated Recurrent Unit,GRU)\index{Gated Recurrent Unit,GRU}作为一个LSTM的变种,继承了LSTM中利用门控单元控制信息传递的思想,并对LSTM进行了简化\upcite{Cho2014Learning}。它把循环单元状态$\mathbi{h}_t$和记忆$\mathbi{c}_t$合并成一个状态$\mathbi{h}_t$,同时使用了更少的门控单元,大大提升了计算效率。
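作为示意,按文献中常见的GRU定义(更新门$z$、重置门$r$;具体公式请以正文为准,不同文献中门的取向略有差异)可以写出如下NumPy草图,其中省略了偏置,权重与维度均为示例假设:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_cell(x_t, h_prev, W_z, W_r, W_h):
    hx = np.concatenate([h_prev, x_t])
    z = sigmoid(W_z @ hx)                       # 更新门:决定保留多少旧状态
    r = sigmoid(W_r @ hx)                       # 重置门:决定旧状态参与多少
    h_hat = np.tanh(W_h @ np.concatenate([r * h_prev, x_t]))  # 候选状态
    return (1.0 - z) * h_prev + z * h_hat       # 新旧状态的线性插值

rng = np.random.default_rng(2)
h = gru_cell(rng.normal(size=3), np.zeros(4),
             rng.normal(scale=0.1, size=(4, 7)), rng.normal(scale=0.1, size=(4, 7)),
             rng.normal(scale=0.1, size=(4, 7)))

相比LSTM的四组权重,这里只有三组,这也是正文所说“使用了更少的门控单元”的直接体现。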
%----------------------------------------------
\begin{figure}[htp]
......@@ -636,7 +636,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\parinterval 实际上,对于单词序列所使用的循环神经网络是一种很“深”的网络,因为从第一个单词到最后一个单词需要经过至少与句子长度相当层数的神经元。比如,一个包含几十个词的句子也会对应几十个神经元层。但是,在很多深度学习应用中,更习惯把对输入序列的同一种处理作为“一层”。比如,对于输入序列,构建一个RNN,那么这些循环单元就构成了网络的“一层”。当然,这里并不是要混淆概念。只是要明确,在随后的讨论中,“层”并不是指一组神经元的全连接,它一般指的是网络结构中逻辑上的一层。
\parinterval 单层循环神经网络对输入序列进行了抽象,为了得到更深入的抽象能力,可以把多个循环神经网络叠在一起,构成多层循环神经网络。比如,图\ref{fig:10-19}就展示基于两层循环神经网络的解码器和编码器结构。通常来说,层数越多模型的表示能力越强,因此在很多基于循环神经网络的机器翻译系统中一般会使用4$\sim$8层的网络。但是,过多的层也会增加模型训练的难度,甚至导致模型无法进行训练。{\chapterthirteen}还会对这个问题进行深入讨论。
\parinterval 单层循环神经网络对输入序列进行了抽象,为了得到更深入的抽象能力,可以把多个循环神经网络叠在一起,构成多层循环神经网络。比如,图\ref{fig:10-19}就展示了基于两层循环神经网络的解码器和编码器结构。通常来说,层数越多模型的表示能力越强,因此在很多基于循环神经网络的机器翻译系统中一般会使用4$\sim$8层的网络。但是,过多的层也会增加模型训练的难度,甚至导致模型无法进行训练。{\chapterthirteen}还会对这个问题进行深入讨论。
%----------------------------------------------
\begin{figure}[htp]
......@@ -660,7 +660,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\centerline{中午\ \ 吃饭\ \ \ \ \ \ \ 下午\ 篮球\ \ \ 现在\ \ 饿\ \ \ \underline{\quad \quad \quad}}
\vspace{0.8em}
\noindent 之所以能想到在横线处填“吃饭”、“吃东西”很有可能是因为看到了“没/吃饭”、 “很/饿”等关键信息。也就是这些关键的片段对预测缺失的单词起着关键性作用。而预测“吃饭”与前文中的“ 中午”、“又”之间的联系似乎不那么紧密。也就是说,在形成 “吃饭”的逻辑时,在潜意识里会更注意“没/吃饭”、“很饿”等关键信息。也就是我们的关注度并不是均匀地分布在整个句子上的。
\noindent 之所以能想到在横线处填“吃饭”、“吃东西”很有可能是因为看到了“没/吃饭”、 “很/饿”等关键信息。也就是这些关键的片段对预测缺失的单词起着关键性作用。而预测“吃饭”与前文中的“ 中午”、“又”之间的联系似乎不那么紧密。也就是说,在形成 “吃饭”的逻辑时,在潜意识里会更注意“没/吃饭”、“很/饿”等关键信息。也就是我们的关注度并不是均匀地分布在整个句子上的。
\parinterval 这个现象可以用注意力机制进行解释。注意力机制的概念来源于生物学的一些现象:当待接收的信息过多时,人类会选择性地关注部分信息而忽略其他信息。它在人类的视觉、听觉、嗅觉等方面均有体现,当我们在感受事物时,大脑会自动过滤或衰减部分信息,仅关注其中少数几个部分。例如,当看到图\ref{fig:10-20}时,往往不是“均匀地”看图像中的所有区域,可能最先注意到的是小狗的嘴,然后才会关注图片中其他的部分。那注意力机制是如何解决神经机器翻译的问题呢?下面就一起来看一看。
......@@ -684,7 +684,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\vspace{0.5em}
\item 首先,虽然编码器把一个源语言句子的表示传递给解码器,但是一个维度固定的向量所能包含的信息是有限的,随着源语言序列的增长,将整个句子的信息编码到一个固定维度的向量中可能会造成源语言句子信息的丢失。显然,在翻译较长的句子时,解码端可能无法获取完整的源语言信息,降低翻译性能;
\vspace{0.5em}
\item 此外,当生成某一个目标语言单词时,并不是均匀使用源语言句子中的单词信息。更普遍的情况是,系统会参考与这个目标语言单词相对应的源语言单词进行翻译。这有些类似于词对齐的作用,即翻译是基于单词之间的某种对应关系。但是,使用单一的源语言表示根本无法区分源语言句子的不同部分,更不用说对源语言单词和目标语言单词之间的联系进行建模了。
\item 此外,当生成某一个目标语言单词时,并不是均匀使用源语言句子中的单词信息。更普遍的情况是,系统会参考与这个目标语言单词相对应的源语言单词进行翻译。这有些类似于词对齐的作用,即翻译是基于单词之间的某种对应关系。但是,使用单一的源语言表示根本无法区分源语言句子的不同部分,更不用说对源语言单词和目标语言单词之间的联系进行建模了。
\vspace{0.5em}
\end{itemize}
......@@ -699,7 +699,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\end{figure}
%----------------------------------------------
\parinterval 显然,以上问题的根本原因在于所使用的表示模型还比较“弱”。因此需要一个更强大的表示模型,在生成目标语言单词时能够有选择地获取源语言句子中更有用的部分。更准确的说,对于要生成的目标语单词,相关性更高的源语言片段应该在源语言句子的表示中体现出来,而不是将所有的源语言单词一视同仁。在神经机器翻译中引入注意力机制正是为了达到这个目的\upcite{bahdanau2014neural,DBLP:journals/corr/LuongPM15}。实际上,除了机器翻译,注意力机制也被成功地应用于图像处理、语音识别、自然语言处理等其他任务。正是注意力机制的引入,使得包括机器翻译在内很多自然语言处理系统得到了飞跃发展。
\parinterval 显然,以上问题的根本原因在于所使用的表示模型还比较“弱”。因此需要一个更强大的表示模型,在生成目标语言单词时能够有选择地获取源语言句子中更有用的部分。更准确地说,对于要生成的目标语单词,相关性更高的源语言片段应该在源语言句子的表示中体现出来,而不是将所有的源语言单词一视同仁。在神经机器翻译中引入注意力机制正是为了达到这个目的\upcite{bahdanau2014neural,DBLP:journals/corr/LuongPM15}。实际上,除了机器翻译,注意力机制也被成功地应用于图像处理、语音识别、自然语言处理等其他任务。正是注意力机制的引入,使得包括机器翻译在内的很多自然语言处理系统得到了飞跃发展。
\parinterval 神经机器翻译中的注意力机制并不复杂。对于每个目标语言单词$y_j$,系统生成一个源语言表示向量$\mathbi{C}_j$与之对应,$\mathbi{C}_j$会包含生成$y_j$所需的源语言的信息,或者说$\mathbi{C}_j$是一种包含目标语言单词与源语言单词对应关系的源语言表示。相比用一个静态的表示$\mathbi{C}$,注意机制使用的是动态的表示$\mathbi{C}_j$$\mathbi{C}_j$也被称作对于目标语言位置$j${\small\bfnew{上下文向量}}\index{上下文向量}(Context Vector\index{Context Vector})。图\ref{fig:10-22}对比了未引入注意力机制和引入了注意力机制的编码器- 解码器结构。可以看出,在注意力模型中,对于每一个目标单词的生成,都会额外引入一个单独的上下文向量参与运算。
......@@ -747,13 +747,13 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\label{eq:10-23}
\end{eqnarray}
$a(\cdot)$可以被看作是目标语言表示和源语言表示的一种“统一化”,即把源语言和目标语言表示映射在同一个语义空间,进而语义相近的内容有更大的相似性。该函数有多种计算方式,比如,向量乘、向量夹角单层神经网络等,数学表达如下:
$a(\cdot)$可以被看作是目标语言表示和源语言表示的一种“统一化”,即把源语言和目标语言表示映射在同一个语义空间,进而语义相近的内容有更大的相似性。该函数有多种计算方式,比如,向量乘、向量夹角和单层神经网络等,数学表达如下:
\begin{eqnarray}
a (\mathbi{s},\mathbi{h}) = \left\{ \begin{array}{ll}
\mathbi{s} \mathbi{h}^{\textrm{T}} & \textrm{向量乘} \\
\textrm{cos}(\mathbi{s}, \mathbi{h}) & \textrm{向量夹角} \\
\mathbi{s} \mathbi{W} \mathbi{h}^{\textrm{T}} & \textrm{线性模型} \\
\textrm{TanH}(\mathbi{W}[\mathbi{s},\mathbi{h}])\mathbi{v}^{\textrm{T}} & \textrm{拼接}[\mathbi{s},\mathbi{h}]+\textrm{单层网络}
\textrm{Tanh}(\mathbi{W}[\mathbi{s},\mathbi{h}])\mathbi{v}^{\textrm{T}} & \textrm{拼接}[\mathbi{s},\mathbi{h}]+\textrm{单层网络}
\end{array}
\right.
\label{eq:10-24}
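把公式\eqref{eq:10-23}和公式\eqref{eq:10-24}连起来,注意力的一次完整计算可以写成如下NumPy草图(仅为示意;$\mathbi{H}$、$\mathbi{s}$等均为随机示例,线性模型一项不区分$\mathbi{W}$的转置方向):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())        # 减去最大值,防止指数上溢
    return e / e.sum()

def attention(s_prev, H, score="dot", W=None):
    if score == "dot":             # beta_i = s h_i^T(向量乘)
        beta = H @ s_prev
    elif score == "cos":           # beta_i = cos(s, h_i)(向量夹角)
        beta = (H @ s_prev) / (np.linalg.norm(H, axis=1) * np.linalg.norm(s_prev))
    else:                          # beta_i = s W h_i^T(线性模型)
        beta = H @ (W @ s_prev)
    alpha = softmax(beta)          # 公式(10-23)的注意力权重 alpha_{i,j}
    return alpha @ H, alpha        # 上下文向量 C_j = sum_i alpha_{i,j} h_i

rng = np.random.default_rng(0)
H = rng.normal(size=(8, 4))        # 8个源语言位置的表示 h_1..h_8
C, alpha = attention(rng.normal(size=4), H)
print(alpha.sum(), C.shape)        # 权重和为1.0,C_j 的维度与 h_i 相同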
......@@ -878,7 +878,7 @@ a (\mathbi{s},\mathbi{h}) = \left\{ \begin{array}{ll}
\subsection{实例 - GNMT}
\vspace{0.5em}
\parinterval 循环神经网络在机器翻译中有很多成功的应用,比如RNNSearch\upcite{bahdanau2014neural}、Nematus\upcite{DBLP:journals/corr/SennrichFCBHHJL17}等系统就被很多研究者作为实验系统。在众多基于循环神经网络的系统中,GNMT系统是非常成功的一个\upcite{Wu2016GooglesNM}。GNMT是谷歌2016年发布的神经机器翻译系统。
\parinterval 循环神经网络在机器翻译中有很多成功的应用,比如RNNSearch\upcite{bahdanau2014neural}、Nematus\upcite{DBLP:journals/corr/SennrichFCBHHJL17}等系统就被很多研究者作为实验系统。在众多基于循环神经网络的系统中,GNMT系统是非常成功的一个\upcite{Wu2016GooglesNM}。GNMT是谷歌2016年发布的神经机器翻译系统。
\parinterval GNMT使用了编码器-解码器结构,构建了一个8层的深度网络,每层网络均由LSTM组成,且在编码器-解码器之间使用了多层注意力连接。其结构如图\ref{fig:10-35},编码器只有最下面2层为双向LSTM。GNMT在束搜索中也加入了长度惩罚和覆盖度因子来确保输出高质量的翻译结果。
\vspace{0.5em}
......@@ -892,7 +892,7 @@ a (\mathbi{s},\mathbi{h}) = \left\{ \begin{array}{ll}
\end{figure}
%----------------------------------------------
\parinterval 实际上,GNMT的主要贡献在于集成了多种优秀的技术,而且在大规模数据上证明了神经机器翻译的有效性。在引入注意力机制之前,神经机器翻译在较大规模的任务上的性能弱于统计机器翻译。加入注意力机制和深层网络后,神经机器翻译性能有了很大的提升。在英德和英法的任务中,GNMT的BLEU值不仅超过了当时优秀的神经机器翻译系统RNNSearch和LSTM(6层),还超过了当时处于领导地位的基于短语的统计机器翻译系统(PBMT)(表\ref{tab:10-10})。相比基于短语的统计机器翻译系统,在人工评价中,GNMT能将翻译错误平均减少60\%。这一结果也充分表明了神经机器翻译带来的巨大性能提升。
\parinterval 实际上,GNMT的主要贡献在于集成了多种优秀的技术,而且在大规模数据上证明了神经机器翻译的有效性。在引入注意力机制之前,神经机器翻译在较大规模的任务上的性能弱于统计机器翻译。加入注意力机制和深层网络后,神经机器翻译性能有了很大的提升。在英德和英法的任务中,GNMT的BLEU值不仅超过了当时优秀的神经机器翻译系统RNNSearch和LSTM(6层),还超过了当时处于领导地位的基于短语的统计机器翻译系统(PBMT)(表\ref{tab:10-10})。相比基于短语的统计机器翻译系统,在人工评价中,GNMT能将翻译错误平均减少60\%。这一结果也充分表明了神经机器翻译带来的巨大性能提升。
%----------------------------------------------
\begin{table}[htp]
......@@ -966,7 +966,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\parinterval 神经网络的参数主要是各层中的线性变换矩阵和偏置。在训练开始时,需要对参数进行初始化。但是,由于神经机器翻译的网络结构复杂,因此损失函数往往不是凸函数,不同初始化会导致不同的优化结果。而且在大量实践中已经发现,神经机器翻译模型对初始化方式非常敏感,性能优异的系统往往需要特定的初始化方式。
\parinterval 因为LSTM是神经机器翻译中常用的一种模型,下面以LSTM模型为例(见\ref{sec:lstm-cell}节),介绍机器翻译模型的初始化方法这些方法也可以推广到GRU等结构。具体内容如下:
\parinterval 因为LSTM是神经机器翻译中常用的一种模型,下面以LSTM模型为例(见\ref{sec:lstm-cell}节),介绍机器翻译模型的初始化方法,这些方法也可以推广到GRU等结构。具体内容如下:
\begin{itemize}
\vspace{0.5em}
......@@ -993,7 +993,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
%\vspace{0.5em}
\parinterval 公式\eqref{eq:10-30}展示了最基本的优化策略,也被称为标准的SGD优化器。实际上,训练神经机器翻译模型时,还有非常多的优化器可以选择,在{\chapternine}也有详细介绍,这里考虑Adam优化器\upcite{kingma2014adam}。 Adam 通过对梯度的{\small\bfnew{一阶矩估计}}\index{一阶矩估计}(First Moment Estimation)\index{First Moment Estimation}{\small\bfnew{二阶矩估计}}\index{二阶矩估计}(Second Moment Estimation)\index{Second Moment Estimation}进行综合考虑,计算出更新步长。
\parinterval 通常,Adam收敛比较快,不同任务基本上可以使用一套配置进行优化,虽性能不算差,但很难达到最优效果。相反,SGD虽能通过在不同的数据集上进行调整,来达到最优的结果,但是收敛速度慢。因此需要根据不同的需求来选择合适的优化器。若需要快得到模型的初步结果,选择Adam较为合适,若是需要在一个任务上得到最优的结果,选择SGD更为合适。
\parinterval 通常,Adam收敛比较快,不同任务基本上可以使用一套配置进行优化,虽性能不算差,但很难达到最优效果。相反,SGD虽能通过在不同的数据集上进行调整,来达到最优的结果,但是收敛速度慢。因此需要根据不同的需求来选择合适的优化器。若需要快速得到模型的初步结果,选择Adam较为合适,若是需要在一个任务上得到最优的结果,选择SGD更为合适。
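两种优化器的参数更新规则可以用如下NumPy草图对比(仅为示意;Adam的超参取文献\upcite{kingma2014adam}中的常用默认值,变量名均为示例):

import numpy as np

def sgd_update(theta, grad, lr=0.1):
    return theta - lr * grad                      # 标准SGD:沿负梯度方向前进固定步长

def adam_update(theta, grad, m, v, t, lr=0.001,
                beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * grad            # 梯度的一阶矩估计
    v = beta2 * v + (1 - beta2) * grad ** 2       # 梯度的二阶矩估计
    m_hat = m / (1 - beta1 ** t)                  # 偏差修正(t从1开始计数)
    v_hat = v / (1 - beta2 ** t)
    return theta - lr * m_hat / (np.sqrt(v_hat) + eps), m, v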
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
......@@ -1030,7 +1030,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\end{figure}
%----------------------------------------------
\parinterval 不同优化器需要的学习率不同,比如Adam一般使用0.001或0.0001,而SGD则在0.1$\sim$1之间进行挑选。在梯度下降法中,都是给定的统一的学习率,整个优化过程中都以确定的步长进行更新因此无论使用哪个优化器,为了保证训练又快又好,通常都需要根据当前的更新次数来动态调整学习率的大小。
\parinterval 不同优化器需要的学习率不同,比如Adam一般使用0.001或0.0001,而SGD则在0.1$\sim$1之间进行挑选。在梯度下降法中,都是给定的统一的学习率,整个优化过程中都以确定的步长进行更新,因此无论使用哪个优化器,为了保证训练又快又好,通常都需要根据当前的更新次数来动态调整学习率的大小。
\vspace{0.5em}
......@@ -1042,7 +1042,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\end{eqnarray}
%-------
\noindent 另一方面,当模型训练逐渐接近收敛的时候,使用太大学习率会很容易让模型在局部最优解附近震荡,从而错过局部极小,因此需要通过减小学习率来调整更新的步长,以此来不断的逼近局部最优,这一阶段也称为学习率的衰减阶段。学习率衰减的方法有很多,比如指数衰减,余弦衰减等,图\ref{fig:10-29}右侧展示的是{\small\bfnew{分段常数衰减}}\index{分段常数衰减}(Piecewise Constant Decay)\index{Piecewise Constant Decay},即每经过$m$次更新,学习率衰减为原来的$\beta_m$$\beta_m<1$)倍,其中$m$$\beta_m$为经验设置的超参。
\noindent 另一方面,当模型训练逐渐接近收敛的时候,使用太大学习率会很容易让模型在局部最优解附近震荡,从而错过局部极小,因此需要通过减小学习率来调整更新的步长,以此来不断地逼近局部最优,这一阶段也称为学习率的衰减阶段。学习率衰减的方法有很多,比如指数衰减以及余弦衰减等,图\ref{fig:10-29}右侧展示的是{\small\bfnew{分段常数衰减}}\index{分段常数衰减}(Piecewise Constant Decay)\index{Piecewise Constant Decay},即每经过$m$次更新,学习率衰减为原来的$\beta_m$$\beta_m<1$)倍,其中$m$$\beta_m$为经验设置的超参。
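以分段常数衰减为例,整个学习率调度可以写成如下草图(边界步数、衰减系数与升温步数均为示例超参;升温阶段这里按常见做法采用线性增长,具体公式以正文为准):

def lr_schedule(step, base_lr=0.001, warmup=4000,
                boundaries=(10000, 20000), beta=0.5):
    if step < warmup:                      # 升温阶段:学习率从0线性增大
        return base_lr * step / warmup
    lr = base_lr
    for b in boundaries:                   # 衰减阶段:每越过一个边界乘以beta
        if step >= b:
            lr *= beta
    return lr

# 例:lr_schedule(2000)=0.0005,lr_schedule(15000)=0.0005,lr_schedule(25000)=0.00025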
%----------------------------------------------
\begin{figure}[htp]
......@@ -1147,7 +1147,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\label{eq:10-37}
\end{eqnarray}
\noindent 这里,$\{ \hat{y}_{j1},...,\hat{y}_{jk} \}$表示对于位置$j$翻译概率最大的前$k$个单词,$\{ \hat{\seq{{y}}}_{<j\ast} \}$表示前$j-1$步top-k单词组成的所有历史。${\hat{\seq{{y}}}_{<j\ast}}$可以被看作是一个集合,里面每一个元素都是一个目标语言单词序列,这个序列是前面生成的一系列top-k单词的某种组成。$\funp{P}(y_j | \{ \hat{\seq{{y}}}_{<{j^{\textrm{*}}}} \},\seq{{x}})$表示基于\{$ \hat{\seq{{y}}}_{<j\ast} $\}的某一条路径生成$y_j$的概率\footnote{严格来说,$ \funp{P} (y_j | {\hat{\seq{{y}}}_{<j\ast} })$不是一个准确的数学表达,这里通过这种写法强调$y_j$是由\{$ \hat{\seq{{y}}}_{<j\ast} $\}中的某个译文单词序列作为条件生成的。} 。这种方法也被称为束搜索,意思是搜索时始终考虑一个集束内的候选。
\noindent 这里,$\{ \hat{y}_{j1},...,\hat{y}_{jk} \}$表示对于位置$j$翻译概率最大的前$k$个单词,$\{ \hat{\seq{{y}}}_{<j\ast} \}$表示前$j-1$步top-k单词组成的所有历史。${\hat{\seq{{y}}}_{<j\ast}}$可以被看作是一个集合,里面每一个元素都是一个目标语言单词序列,这个序列是前面生成的一系列top-k单词的某种组成。$\funp{P}(y_j | \{ \hat{\seq{{y}}}_{<{j\ast}} \},\seq{{x}})$表示基于\{$ \hat{\seq{{y}}}_{<j\ast} $\}的某一条路径生成$y_j$的概率\footnote{严格来说,$ \funp{P} (y_j | {\hat{\seq{{y}}}_{<j\ast} })$不是一个准确的数学表达,这里通过这种写法强调$y_j$是由\{$ \hat{\seq{{y}}}_{<j\ast} $\}中的某个译文单词序列作为条件生成的。} 。这种方法也被称为束搜索,意思是搜索时始终考虑一个集束内的候选。
\parinterval 不论是贪婪搜索还是束搜索都是一个自左向右的过程,也就是每个位置的处理需要等前面位置处理完才能执行。这是一种典型的{\small\bfnew{自回归模型}}\index{自回归模型}(Autoregressive Model)\index{Autoregressive Model},它通常用来描述时序上的随机过程,其中每一个时刻的结果对时序上其他部分的结果有依赖\upcite{Akaike1969autoregressive}。相对应的,也有{\small\bfnew{非自回归模型}}\index{非自回归模型}(Non-autoregressive Model)\index{Non-autoregressive Model},它消除了不同时刻结果之间的直接依赖\upcite{Gu2017NonAutoregressiveNM}。由于自回归模型是当今神经机器翻译主流的推断方法,这里仍以自回归的贪婪搜索和束搜索为基础进行讨论。
......@@ -1157,7 +1157,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\vspace{1.0em}
\subsubsection{1. 贪婪搜索}
\vspace{0.6em}
\parinterval\ref{fig:10-32}展示了一个基于贪婪方法的神经机器翻译解码过程。每一个时间步的单词预测都依赖于其前一步单词的生成。在解码第一个单词时,由于没有之前的单词信息,会用<sos>进行填充作为起始的单词,且会用一个零向量(可以理解为没有之前时间步的信息)表示第0步的中间层状态。
\parinterval\ref{fig:10-32}展示了一个基于贪婪方法的神经机器翻译解码过程。每一个时间步的单词预测都依赖于其前一步单词的生成。在解码第一个单词时,由于没有之前的单词信息,会用<sos>进行填充作为起始的单词,且会用一个零向量(可以理解为没有之前时间步的信息)表示第0步的中间层状态。
\vspace{0.8em}
%----------------------------------------------
......@@ -1190,7 +1190,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\subsubsection{2. 束搜索}
\vspace{0.5em}
\parinterval 束搜索是一种启发式图搜索算法。相比于全搜索,它可以减少搜索所占用的空间和时间,在每一步扩展的时候,剪掉一些质量比较差的结点,保留下一些质量较高的结点。具体到机器翻译任务,对于每一个目标语言位置,束搜索选择了概率最大的前$K$个单词进行扩展(其中$k$叫做束宽度,或简称为束宽)。如图\ref{fig:10-34}所示,假设\{$y_1, y_2,..., y_n$\}表示生成的目标语言序列,且$k=3$,则束搜索的具体过程为:在预测第一个位置时,可以通过模型得到$y_1$的概率分布,选取概率最大的前3个单词作为候选结果(假设分别为“have”, “has”, “it”)。在预测第二个位置的单词时,模型针对已经得到的三个候选结果(“have”, “has”, “it”)计算第二个单词的概率分布。因为$y_2$对应$|V|$种可能,总共可以得到$3 \times |V|$种结果。然后从中选取使序列概率$\funp{P}(y_2,y_1| \seq{{x}})$最大的前三个$y_2$作为新的输出结果,这样便得到了前两个位置的top-3译文。在预测其他位置时也是如此,不断重复此过程直到推断结束。可以看到,束搜索的搜索空间大小与束宽度有关,也就是:束宽度越大,搜索空间越大,更有可能搜索到质量更高的译文,但同时搜索会更慢。束宽度等于3,意味着每次只考虑三个最有可能的结果,贪婪搜索实际上便是束宽度为1的情况。在神经机器翻译系统实现中,一般束宽度设置在4~8之间。
\parinterval 束搜索是一种启发式图搜索算法。相比于全搜索,它可以减少搜索所占用的空间和时间,在每一步扩展的时候,剪掉一些质量比较差的结点,保留下一些质量较高的结点。具体到机器翻译任务,对于每一个目标语言位置,束搜索选择了概率最大的前$k$个单词进行扩展(其中$k$叫做束宽度,或简称为束宽)。如图\ref{fig:10-34}所示,假设\{$y_1, y_2,..., y_n$\}表示生成的目标语言序列,且$k=3$,则束搜索的具体过程为:在预测第一个位置时,可以通过模型得到$y_1$的概率分布,选取概率最大的前3个单词作为候选结果(假设分别为“have”, “has”, “it”)。在预测第二个位置的单词时,模型针对已经得到的三个候选结果(“have”, “has”, “it”)计算第二个单词的概率分布。因为$y_2$对应$|V|$种可能,总共可以得到$3 \times |V|$种结果。然后从中选取使序列概率$\funp{P}(y_2,y_1| \seq{{x}})$最大的前三个$y_2$作为新的输出结果,这样便得到了前两个位置的top-3译文。在预测其他位置时也是如此,不断重复此过程直到推断结束。可以看到,束搜索的搜索空间大小与束宽度有关,也就是:束宽度越大,搜索空间越大,更有可能搜索到质量更高的译文,但同时搜索会更慢。束宽度等于3,意味着每次只考虑三个最有可能的结果,贪婪搜索实际上便是束宽度为1的情况。在神经机器翻译系统实现中,一般束宽度设置在4$\sim$8之间。
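与之对应,束搜索每步对每条候选路径扩展概率最大的前$k$个单词,再在全局只保留$k$条得分最高的路径。下面是一个简化草图(仅为示意;step_fn为假设的单步解码接口,输入当前部分译文、返回整个词表上的log概率,实际系统还会配合下文的长度惩罚等策略):

import numpy as np

def beam_search(step_fn, sos_id, eos_id, k=3, max_len=50):
    beams = [([sos_id], 0.0)]                  # (部分译文, 累积log概率)
    finished = []
    for _ in range(max_len):
        candidates = []
        for seq, score in beams:
            log_probs = step_fn(seq)           # log P(y_j | y_<j, x),长度为|V|
            for w in np.argsort(log_probs)[-k:]:   # 每条路径只需扩展top-k单词
                candidates.append((seq + [int(w)], score + float(log_probs[w])))
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for seq, score in candidates[:k]:      # 全局仅保留k条最优路径
            (finished if seq[-1] == eos_id else beams).append((seq, score))
        if not beams:                          # k条路径都已生成<eos>
            break
    return max(finished + beams, key=lambda c: c[1])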
%----------------------------------------------
\begin{figure}[htp]
......@@ -1207,7 +1207,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\subsubsection{3. 长度惩罚}
\parinterval 这里用$ \funp{P} (\seq{{y}} | \seq{{x}}) = \prod_{j=1}^n \funp{P}(y_j | \seq{{y}}_{<j},\seq{{x}}) $作为翻译模型。直接实现这个公式有一个明显的缺点:当句子过长时乘法运算容易产生溢出,也就是多个数相乘可能会产生浮点数无法表示的运算结果。为了解决这个问题,可以利用对数操作将乘法转换为加法,得到新的计算方式:$\textrm{log } \funp{P}(\seq{{y}} | \seq{{x}}) = \sum_{j=1}^n \textrm{log }\funp{P} (y_j | \seq{{y}}_{<j}, \seq{{x}}) $对数函数不会改变函数的单调性,因此在具体实现时,通常用$\textrm{log }\funp{P} (\seq{{y}} | \seq{{x}})$表示句子的得分,而不用$\funp{P}(\seq{{y}} | \seq{{x}})$
\parinterval 这里用$ \funp{P} (\seq{{y}} | \seq{{x}}) = \prod_{j=1}^n \funp{P}(y_j | \seq{{y}}_{<j},\seq{{x}}) $作为翻译模型。直接实现这个公式有一个明显的缺点:当句子过长时乘法运算容易产生溢出,也就是多个数相乘可能会产生浮点数无法表示的运算结果。为了解决这个问题,可以利用对数操作将乘法转换为加法,得到新的计算方式:$\textrm{log } \funp{P}(\seq{{y}} | \seq{{x}}) = \sum_{j=1}^n \textrm{log }\funp{P} (y_j | \seq{{y}}_{<j}, \seq{{x}}) $。对数函数不会改变函数的单调性,因此在具体实现时,通常用$\textrm{log }\funp{P} (\seq{{y}} | \seq{{x}})$表示句子的得分,而不用$\funp{P}(\seq{{y}} | \seq{{x}})$。
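下面的小例子直观展示了这个问题(概率值为虚构的演示数据):较长句子的概率连乘会下溢为0,而对数求和仍是一个正常的浮点数:

import math

probs = [0.1] * 400                          # 各位置单词概率(示例)
prod = 1.0
for p in probs:
    prod *= p                                # 连乘:双精度浮点下直接下溢为0.0
log_score = sum(math.log(p) for p in probs)  # 对数求和:数值稳定
print(prod, log_score)                       # 0.0 -921.03...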
\parinterval 不管是使用$\funp{P}(\seq{{y}} | \seq{{x}})$还是$\textrm{log } \funp{P}(\seq{{y}} | \seq{{x}})$计算句子得分,还面临两个问题:
......@@ -1257,7 +1257,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\vspace{0.5em}
\item 注意力机制的使用是机器翻译乃至整个自然语言处理近几年获得成功的重要因素之一\upcite{bahdanau2014neural,DBLP:journals/corr/LuongPM15}。早期,有研究者尝试将注意力机制和统计机器翻译的词对齐进行统一\upcite{WangNeural,He2016ImprovedNM,li-etal-2019-word}。最近,也有大量的研究工作对注意力机制进行改进,比如,使用自注意力机制构建翻译模型等\upcite{vaswani2017attention}。而对注意力模型的改进也成为了自然语言处理中的热点问题之一。在{\chapterfifteen}会对机器翻译中不同注意力模型进行进一步讨论。
\vspace{0.5em}
\item 一般来说,神经机器翻译的计算过程是没有人工干预的,翻译流程也无法用人类的知识直接进行解释,因此一个有趣的方向是在神经机器翻译中引入先验知识,使得机器翻译的行为更“像”人。比如,可以使用句法树来引入人类的语言学知识\upcite{Yang2017TowardsBH,Wang2019TreeTI},基于句法的神经机器翻译也包含大量的树结构的神经网络建模\upcite{DBLP:journals/corr/abs-1809-01854,DBLP:journals/corr/abs-1808-09374}。此外,也可以把用户定义的词典或者翻译记忆加入到翻译过程\upcite{DBLP:journals/corr/ZhangZ16c,zhang-etal-2017-prior,duan-etal-2020-bilingual,cao-xiong-2018-encoding},使得用户的约束可以直接反映到机器翻译的结果上来。先验知识的种类还有很多,包括词对齐\upcite{li-etal-2019-word}、 篇章信息\upcite{Werlen2018DocumentLevelNM,DBLP:journals/corr/abs-1805-10163} 等等,都是神经机器翻译中能够使用的信息。
\item 一般来说,神经机器翻译的计算过程是没有人工干预的,翻译流程也无法用人类的知识直接进行解释,因此一个有趣的方向是在神经机器翻译中引入先验知识,使得机器翻译的行为更“像”人。比如,可以使用句法树来引入人类的语言学知识\upcite{Yang2017TowardsBH,Wang2019TreeTI},基于句法的神经机器翻译也包含大量的树结构的神经网络建模\upcite{DBLP:journals/corr/abs-1809-01854,DBLP:journals/corr/abs-1808-09374}。此外,也可以把用户定义的词典或者翻译记忆加入到翻译过程\upcite{DBLP:journals/corr/ZhangZ16c,zhang-etal-2017-prior,duan-etal-2020-bilingual,cao-xiong-2018-encoding},使得用户的约束可以直接反映到机器翻译的结果上来。先验知识的种类还有很多,包括词对齐\upcite{li-etal-2019-word,DBLP:conf/emnlp/MiWI16,DBLP:conf/coling/LiuUFS16}、 篇章信息\upcite{Werlen2018DocumentLevelNM,DBLP:journals/corr/abs-1805-10163,DBLP:conf/acl/LiLWJXZLL20} 等等,都是神经机器翻译中能够使用的信息。
\end{itemize}
......@@ -28,4 +28,7 @@
\node [right of = num2,xshift= -0.7cm]{};
\node[font=\small] at ([yshift=-0.7cm,xshift=0.3cm]num12.south) {输入:4$\times$4};
\node[font=\small] at ([yshift=-1.3cm,xshift=0.3cm]num19.south) {输出:2$\times$2};
\end{tikzpicture}
\ No newline at end of file
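若以窗口2$\times$2、步长2的最大汇聚(池化)为例,图中标注的4$\times$4输入恰好得到2$\times$2的输出,可用如下NumPy草图验证(该图具体采用的汇聚方式以正文为准,数值仅为演示):

import numpy as np

def max_pool2d(x, size=2):
    n1, n2 = x.shape[0] // size, x.shape[1] // size
    return x.reshape(n1, size, n2, size).max(axis=(1, 3))  # 每个2x2窗口取最大值

print(max_pool2d(np.arange(16).reshape(4, 4)).shape)        # (2, 2)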
......@@ -34,8 +34,26 @@
\node(num8_8)[num,below of = num88,xshift= -0.5cm,yshift= -0.5cm]{-1};
\node(num9_9)[num,below of = num99,xshift= -0.5cm,yshift= -0.5cm]{-1};
%\node[] (sub) at ([xshift=3.5cm,yshift=2cm]num9_9.east) {$q$:卷积核窗口的长度};
%\node[] (sub2) at ([xshift=0cm,yshift=-0.1cm]sub.south) {$k$:卷积核窗口的宽度};
%\node[] (sub3) at ([xshift=-0.2cm,yshift=-0.1cm]sub2.south) {$o$:卷积核深度\qquad \qquad };
%\node[inner sep=2pt,fill=green!5,draw=ugreen,dotted,very thick,align=center] (sub) at ([xshift=3.8cm,yshift=2cm]num9_9.east) {
%\begin{tabular}{r l}
%$q$:& 卷积核窗口的长度 \\
%$k$:& 卷积核窗口的宽度 \\
%$o$: & 卷积核深度
%\end{tabular}};
\node[] (sub) at ([xshift=3.35cm,yshift=2cm]num1_1.east) {:卷积核窗口的长度};
\node[] (sub2) at ([xshift=-3.9cm,yshift=-2.15cm]num1.north) {卷积核窗口的宽度:};
\node[] (sub3) at ([xshift=2.85cm,yshift=0.25cm]num9_9.south) {:卷积核深度};
%\node[minimum width = 1.8cm] (sub) at ([xshift=-5.5cm,yshift=2cm]num9_9.east) {};
\draw[decorate,decoration={brace,mirror,raise=0pt,amplitude=0.3cm},black,thick] ([yshift=0.4cm,xshift=-0.1cm]num1_1.west) -- node[att,xshift=-0.5cm]{$q$} ([yshift=-0.4cm,xshift=-0.1cm]num3_3.west);
\draw[decorate,decoration={brace,raise=0pt,amplitude=0.3cm},black,thick] ([xshift=-0.4cm,yshift=0.1cm]num1.north) -- node[att,yshift=0.5cm]{$k$}([xshift=0.4cm,yshift=0.1cm]num7.north);
\draw[decorate,decoration={brace,mirror,raise=0pt,amplitude=0.3cm},black,thick] ([xshift=0.5cm,yshift=0.00cm]num9_9.south) -- node[att,xshift=0.5cm,yshift=-0.3cm]{$o$}([xshift=0.5cm,yshift=0.00cm]num9.south);
\draw[decorate,decoration={brace,mirror,raise=0pt,amplitude=0.3cm},black,thick] ([yshift=0.4cm]num1_1.west) -- node[att,xshift=-0.5cm]{\large q} ([yshift=-0.4cm]num3_3.west);
\draw[decorate,decoration={brace,raise=0pt,amplitude=0.3cm},black,thick] ([xshift=-0.4cm]num1.north) -- node[att,yshift=0.5cm]{\large k}([xshift=0.4cm]num7.north);
\draw[decorate,decoration={brace,mirror,raise=0pt,amplitude=0.3cm},black,thick] ([xshift=0.4cm]num9_9.south) -- node[att,xshift=0.4cm,yshift=-0.3cm]{\large o}([xshift=0.4cm]num9.south);
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\tikzstyle{every node}=[scale=1.5]
\tikzstyle{unit} = [inner sep=0pt, minimum size=1em,draw,thick]
\tikzstyle{unit} = [inner sep=0pt, minimum size=1em,draw,very thick]
\tikzstyle{vuale} = [inner sep=0pt,font=\tiny]
......@@ -15,10 +15,10 @@
}
\foreach \point in {1,2}{
\draw[line width=0.5pt, ugreen!80, -latex] (l\point_0.east) -- (r2_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l\point_1.east) -- (r2_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l\point_2.east) -- (r2_2.west);
\draw[line width=0.5pt, gray!80, -latex] (l\point_3.east) -- (r2_3.west);
\draw[line width=0.9pt, ugreen!80, -latex] (l\point_0.east) -- (r2_0.west);
\draw[line width=0.9pt, orange!80, -latex] (l\point_1.east) -- (r2_1.west);
\draw[line width=0.9pt, cyan!80, -latex] (l\point_2.east) -- (r2_2.west);
\draw[line width=0.9pt, gray!80, -latex] (l\point_3.east) -- (r2_3.west);
}
\node[vuale] at (-1.5em, 1.9em) {$x_2$};
......@@ -40,10 +40,10 @@
}
\foreach \point in {1,2}{
\draw[line width=0.5pt, orange!80, -latex] (l\point_0.east) -- (r2_0.west);
\draw[line width=0.5pt, cyan!80, -latex] (l\point_1.east) -- (r2_1.west);
\draw[line width=0.5pt, orange!80, -latex] (l\point_2.east) -- (r2_2.west);
\draw[line width=0.5pt, cyan!80, -latex] (l\point_3.east) -- (r2_3.west);
\draw[line width=0.9pt, orange!80, -latex] (l\point_0.east) -- (r2_0.west);
\draw[line width=0.9pt, cyan!80, -latex] (l\point_1.east) -- (r2_1.west);
\draw[line width=0.9pt, orange!80, -latex] (l\point_2.east) -- (r2_2.west);
\draw[line width=0.9pt, cyan!80, -latex] (l\point_3.east) -- (r2_3.west);
}
\node[vuale] at (-1.5em, 1.9em) {$x_2$};
......
\tikzstyle{num} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=green!2]
\tikzstyle{num} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=green!10]
\begin{tikzpicture}[node distance = 0]
\node[num] at (0,0){1};
......@@ -43,9 +43,10 @@
\node[num] at (2.4,3){2};
\node[num] at (3,3){1};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=teal,line width=0.1cm] at (0.6,2.4) {};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=purple!40,line width=0.08cm,fill=purple!40,fill opacity=0.4] at (0.6,2.4) {};
\fill (4,1.5) circle (2pt);
%\fill (4,1.5) circle (2pt);
\node [] at (4,1.5) {*};
\node[num] at (5,0.9){0};
\node[num] at (5.6,0.9){1};
......@@ -82,9 +83,13 @@
\node[num] at (10.2,2.4){8};
\node[num] at (10.8,2.4){7};
\node[minimum width = 0.6cm,minimum height = 0.6cm,draw=teal,line width=0.1cm] at (9,2.4) {};
\node[minimum width = 0.6cm,minimum height = 0.6cm,draw=purple!40,line width=0.08cm,fill=purple!40,fill opacity=0.4] at (9,2.4) {};
\node[] (in) at (1.5,-1) {\small{输入:6$\times$6}};
\node[] at (5.7,-1) {\small{卷积核:3$\times$3}};
\node[] (out) at (10,-1) {\small{输出:4$\times$4}};
%\node[font=\footnotesize,dashed,draw=teal,very thick,fill=green!5,align=center] at ([yshift=3cm,xshift=1.5cm]out.east) {*\ \ :表示\\卷积计算};
\node[] at ([yshift=-0.3cm,xshift=-2.3cm]in.east) {\ \ \ \ \ };
\draw (1.5,-1)node{\small{输入:6$\times$6}};
\draw (5.7,-1)node{\small{卷积核:3$\times$3}};
\draw (10,-1)node{\small{输出:4$\times$4}};
\end{tikzpicture}
\ No newline at end of file
......@@ -25,19 +25,28 @@
\begin{scope}
\foreach \point in {0,1,2,3,4,5}
\node[rec] (i_\point) at (0.9+\point, -0.05){};
\node[rec] (i_\point) at (0.9+\point, -0.1){};
\draw[thick] (i_0.0) -- (i_1.180);
\draw[thick] (i_1.0) -- (i_2.180);
\draw[thick] (i_2.0) -- (i_3.180);
\draw[thick] (i_3.0) -- (i_4.180);
\draw[thick] (i_4.0) -- (i_5.180);
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){$<$p$>$};
\node[anchor=north,word] (tgt_1) at ([yshift=-0.4em]i_0.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$sos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_4.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_5.south){school};
\node[anchor=north,word] (tgt_2) at ([yshift=-0.4em]i_5.south){school};
\begin{pgfonlayer}{background}
\node [draw=ugreen!30,rectangle,inner ysep=5pt,inner xsep=1.8em,rounded corners=4pt,line width=2pt,fill=ugreen!10] [fit = (tgt_1) (i_0)(tgt_2)(i_5) ] (group1_2) {};
\end{pgfonlayer}
\node[anchor=east, word] (l_0) at ([xshift=-2em,yshift=-0.5em]i_0.west){\sffamily\bfnew{词嵌入}};
\node[anchor=south, word] (l_1) at ([yshift=2em]l_0.north){\sffamily\bfnew{卷积}};
\node[anchor=south, word] (l_2) at ([yshift=2.4em]l_1.north){\sffamily\bfnew{门控}};
\node[anchor=south, word] (l_3) at ([yshift=0.06em]l_2.north){\sffamily\bfnew{线性单元}};
\foreach \point in {0,1,2}
......@@ -89,12 +98,12 @@
\node[thick,draw,fill=yellow!60,minimum height =1em,minimum width=2em](g_2_3) at (5cm+4em, 7.5cm-3em){};
\foreach \point in {0,1,2,3}
\node[rec] (i_\point) at (8+\point, -0.05){};
\node[rec] (i_\point) at (8+\point, -0.1){};
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){school};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$eos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$/s$>$};
\foreach \point in {0,1,2,3}{
\node[cir,font=\fontsize{6}{6}\selectfont,inner sep=0.8pt](c_\point) at (8.2cm+\point*2em,7.5cm-1em*\point){\bm{$\sum$}};
......@@ -131,15 +140,15 @@
\node[anchor=south,word] (src_1) at ([xshift=-2em,yshift=0.4em]r_0.north){$<$p$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_0.north){};
\node[anchor=south,word] at ([yshift=0.4em]r_1.north){上学};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$sos$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$s$>$};
\node[anchor=south,word] (src_2) at ([xshift=2em,yshift=0.4em]r_2.north){$<$p$>$};
\node[anchor=east, word] (t_1) at ([xshift=-4em,yshift=0.5em]r_0.west){\sffamily\bfnew{词嵌入}};
\node[anchor=north, word] (t_2) at ([yshift=-1em,yshift=-1em]t_1.south){\sffamily\bfnew{卷积}};
\node[anchor=north, word] (t_3) at ([yshift=-2em,yshift=-0.8em]t_2.south){\sffamily\bfnew{门控}};
\node[anchor=north, word] (t_2) at ([yshift=-2em]t_1.south){\sffamily\bfnew{卷积}};
\node[anchor=north, word] (t_3) at ([yshift=-2.8em]t_2.south){\sffamily\bfnew{门控}};
\node[anchor=north, word] (t_4) at ([yshift=-0.06em]t_3.south){\sffamily\bfnew{线性单元}};
\node[anchor=north, word] (t_5) at ([yshift=-0.06em,yshift=1.3em]t_4.south){\sffamily\bfnew{}};
\node[anchor=north, word] (t_5) at ([yshift=1.24em]t_4.south){\sffamily\bfnew{}};
\node[anchor=north, word] (t_6) at ([yshift=-1.5em]t_5.south){\sffamily\bfnew{注意力模块}};
\node[cir] (o_2) at (7.95, 8.8cm){};
......@@ -158,13 +167,13 @@
\node [single arrow,minimum height=2.4em,fill=blue!30,minimum width=4pt,rotate=-90,draw, fill=blue!20] at (8.6cm+2em, 8cm) {};
\begin{pgfonlayer}{background}
\node [draw=ugreen!30,rectangle,inner ysep=5pt,inner xsep=1.3em,rounded corners=4pt,line width=2pt,fill=ugreen!10] [fit = (src_1) (src_2)(r_2) ] (group1) {};
%\node [rectangle,rounded corners=4pt,fill=mc_2_m, minimum width=3.8cm, minimum height=2.8cm] (group2) at (5.7cm, 10.7cm) {};
%\node [rectangle,rounded corners=4pt,fill=mc_2_m, minimum width=5.2cm, minimum height=3.1cm] (group3) at (3.4cm, 1.85cm) {};
%\draw [line width=2pt,draw=mc_1,fill=mc_1_m,rounded corners=4pt] ([yshift=0.4em,xshift=6em]t_6.north west) -- ([yshift=0.4em,xshift=6.3cm]t_6.north west) -- ([yshift=2.2cm,xshift=6.3cm]t_6.north west) -- ([yshift=2.2cm,xshift=9.8cm]t_6.north west) -- ([yshift=-5.4cm,xshift=9.8cm]t_6.north west) -- ([yshift=-5.4cm,xshift=-1.6em]t_6.north west)-- ([yshift=0.4em,xshift=-1.6em]t_6.north west) -- ([yshift=0.4em,xshift=6em]t_6.north west);
\node [draw=ugreen!30,rectangle,inner ysep=5pt,inner xsep=1.3em,rounded corners=4pt,line width=2pt,fill=ugreen!10] [fit = (src_1) (src_2)(r_2) ] (group1_1) {};
\node [rectangle,rounded corners=4pt,fill=mc_2_m, minimum width=3.8cm, minimum height=2.8cm] (group2) at (5.7cm, 10.7cm) {};
\node [rectangle,rounded corners=4pt,fill=mc_2_m, minimum width=5.2cm, minimum height=3.1cm] (group3) at (3.4cm, 1.85cm) {};
\draw [line width=2pt,draw=mc_1,fill=mc_1_m,rounded corners=4pt] ([yshift=0.4em,xshift=6em]t_6.north west) -- ([yshift=0.4em,xshift=6.3cm]t_6.north west) -- ([yshift=2.2cm,xshift=6.3cm]t_6.north west) -- ([yshift=2.2cm,xshift=9.8cm]t_6.north west) -- ([yshift=-5.4cm,xshift=9.8cm]t_6.north west) -- ([yshift=-5.4cm,xshift=-1.6em]t_6.north west)-- ([yshift=0.4em,xshift=-1.6em]t_6.north west) -- ([yshift=0.4em,xshift=6em]t_6.north west);
\end{pgfonlayer}
%\node [draw=mc_2!140,rectangle,rounded corners=4pt,line width=2pt,minimum width=3.8cm, minimum height=2.8cm] at (5.7cm, 10.7cm) {};
%\node [draw=mc_2!140,rectangle,rounded corners=4pt,line width=2pt,minimum width=5.2cm, minimum height=3.1cm] at (3.4cm, 1.85cm) {};
\node [draw=mc_2!140,rectangle,rounded corners=4pt,line width=2pt,minimum width=3.8cm, minimum height=2.8cm] at (5.7cm, 10.7cm) {};
\node [draw=mc_2!140,rectangle,rounded corners=4pt,line width=2pt,minimum width=5.2cm, minimum height=3.1cm] at (3.4cm, 1.85cm) {};
\end{scope}
\end{tikzpicture}
\ No newline at end of file
......@@ -25,7 +25,7 @@
\begin{scope}
\foreach \point in {0,1,2,3,4,5}
\node[rec] (i_\point) at (0.9+\point, -0.05){};
\node[rec] (i_\point) at (0.9+\point, -0.1){};
\draw[thick] (i_0.0) -- (i_1.180);
\draw[thick] (i_1.0) -- (i_2.180);
\draw[thick] (i_2.0) -- (i_3.180);
......@@ -34,11 +34,15 @@
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$sos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_4.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_5.south){school};
\node[anchor=east, word] (l_0) at ([xshift=-2em,yshift=-0.5em]i_0.west){\sffamily\bfnew{词嵌入}};
\node[anchor=south, word] (l_1) at ([yshift=2em]l_0.north){\sffamily\bfnew{卷积}};
\node[anchor=south, word] (l_2) at ([yshift=2.4em]l_1.north){\sffamily\bfnew{门控}};
\node[anchor=south, word] (l_3) at ([yshift=0.06em]l_2.north){\sffamily\bfnew{线性单元}};
\foreach \point in {0,1,2}
\draw[thick,fill=blue!20] (5.9-\point,0.4) -- (4.9-\point,1.5) -- (3.9-\point,0.4) -- (5.9-\point,0.4);
......@@ -89,12 +93,12 @@
\node[thick,draw,fill=yellow!60,minimum height =1em,minimum width=2em](g_2_3) at (5cm+4em, 7.5cm-3em){};
\foreach \point in {0,1,2,3}
\node[rec] (i_\point) at (8+\point, -0.05){};
\node[rec] (i_\point) at (8+\point, -0.1){};
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){school};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$eos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$/s$>$};
\foreach \point in {0,1,2,3}{
\node[cir,font=\fontsize{6}{6}\selectfont,inner sep=0.8pt](c_\point) at (8.2cm+\point*2em,7.5cm-1em*\point){\bm{$\sum$}};
......@@ -131,15 +135,15 @@
\node[anchor=south,word] (src_1) at ([xshift=-2em,yshift=0.4em]r_0.north){$<$p$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_0.north){};
\node[anchor=south,word] at ([yshift=0.4em]r_1.north){上学};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$sos$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$s$>$};
\node[anchor=south,word] (src_2) at ([xshift=2em,yshift=0.4em]r_2.north){$<$p$>$};
\node[anchor=east, word] (t_1) at ([xshift=-4em,yshift=0.5em]r_0.west){\sffamily\bfnew{词嵌入}};
\node[anchor=north, word] (t_2) at ([yshift=-1em,yshift=-1em]t_1.south){\sffamily\bfnew{卷积}};
\node[anchor=north, word] (t_3) at ([yshift=-2em,yshift=-0.8em]t_2.south){\sffamily\bfnew{门控}};
\node[anchor=north, word] (t_2) at ([yshift=-2em]t_1.south){\sffamily\bfnew{卷积}};
\node[anchor=north, word] (t_3) at ([yshift=-2.8em]t_2.south){\sffamily\bfnew{门控}};
\node[anchor=north, word] (t_4) at ([yshift=-0.06em]t_3.south){\sffamily\bfnew{线性单元}};
\node[anchor=north, word] (t_5) at ([yshift=-0.06em,yshift=1.3em]t_4.south){\sffamily\bfnew{}};
\node[anchor=north, word] (t_5) at ([yshift=1.24em]t_4.south){\sffamily\bfnew{}};
\node[anchor=north, word] (t_6) at ([yshift=-1.5em]t_5.south){\sffamily\bfnew{注意力模块}};
\node[cir] (o_2) at (7.95, 8.8cm){};
......
......@@ -25,7 +25,7 @@
\begin{scope}
\foreach \point in {0,1,2,3,4,5}
\node[rec] (i_\point) at (0.9+\point, -0.05){};
\node[rec] (i_\point) at (0.9+\point, -0.1){};
\draw[thick] (i_0.0) -- (i_1.180);
\draw[thick] (i_1.0) -- (i_2.180);
\draw[thick] (i_2.0) -- (i_3.180);
......@@ -34,11 +34,16 @@
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$sos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_4.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_5.south){school};
\node[anchor=east, word] (l_0) at ([xshift=-2em,yshift=-0.5em]i_0.west){\sffamily\bfnew{词嵌入}};
\node[anchor=south, word] (l_1) at ([yshift=2em]l_0.north){\sffamily\bfnew{卷积}};
\node[anchor=south, word] (l_2) at ([yshift=2.4em]l_1.north){\sffamily\bfnew{门控}};
\node[anchor=south, word] (l_3) at ([yshift=0.06em]l_2.north){\sffamily\bfnew{线性单元}};
\foreach \point in {0,1,2}
\draw[thick,fill=blue!20] (5.9-\point,0.4) -- (4.9-\point,1.5) -- (3.9-\point,0.4) -- (5.9-\point,0.4);
......@@ -89,12 +94,12 @@
\node[thick,draw,fill=yellow!60,minimum height =1em,minimum width=2em](g_2_3) at (5cm+4em, 7.5cm-3em){};
\foreach \point in {0,1,2,3}
\node[rec] (i_\point) at (8+\point, -0.05){};
\node[rec] (i_\point) at (8+\point, -0.1){};
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){school};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$eos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$/s$>$};
\foreach \point in {0,1,2,3}{
\node[cir,font=\fontsize{6}{6}\selectfont,inner sep=0.8pt](c_\point) at (8.2cm+\point*2em,7.5cm-1em*\point){\bm{$\sum$}};
......@@ -131,15 +136,15 @@
\node[anchor=south,word] (src_1) at ([xshift=-2em,yshift=0.4em]r_0.north){$<$p$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_0.north){};
\node[anchor=south,word] at ([yshift=0.4em]r_1.north){上学};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$sos$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$s$>$};
\node[anchor=south,word] (src_2) at ([xshift=2em,yshift=0.4em]r_2.north){$<$p$>$};
\node[anchor=east, word] (t_1) at ([xshift=-4em,yshift=0.5em]r_0.west){\sffamily\bfnew{词嵌入}};
\node[anchor=north, word] (t_2) at ([yshift=-1em,yshift=-1em]t_1.south){\sffamily\bfnew{卷积}};
\node[anchor=north, word] (t_3) at ([yshift=-2em,yshift=-0.8em]t_2.south){\sffamily\bfnew{门控}};
\node[anchor=north, word] (t_2) at ([yshift=-2em]t_1.south){\sffamily\bfnew{卷积}};
\node[anchor=north, word] (t_3) at ([yshift=-2.8em]t_2.south){\sffamily\bfnew{门控}};
\node[anchor=north, word] (t_4) at ([yshift=-0.06em]t_3.south){\sffamily\bfnew{线性单元}};
\node[anchor=north, word] (t_5) at ([yshift=-0.06em,yshift=1.3em]t_4.south){\sffamily\bfnew{}};
\node[anchor=north, word] (t_5) at ([yshift=1.24em]t_4.south){\sffamily\bfnew{}};
\node[anchor=north, word] (t_6) at ([yshift=-1.5em]t_5.south){\sffamily\bfnew{注意力模块}};
\node[cir] (o_2) at (7.95, 8.8cm){};
......
......@@ -25,9 +25,12 @@
\node[data,inner sep=2pt,fill=cyan!40] at (14.4em,2.4em) {19};
\node[data,inner sep=2pt] at (16em,2.4em) {25};
\node[font=\footnotesize] at (1.6em,4.8em) {输入};
\node[font=\footnotesize] (in) at (1.6em,4.8em) {输入};
\node[font=\footnotesize] at (8.8em,4.8em) {卷积核};
\node[font=\footnotesize] at (15.2em,4.8em) {输出};
\node[font=\footnotesize] (out) at (15.2em,4.8em) {输出};
%\node[font=\footnotesize,dashed,draw=cyan,very thick,fill=cyan!5,align=center] at ([yshift=-0.3cm,xshift=1.8cm]out.east) {*\ \ :表示\\卷积计算};
%\node[] at ([yshift=-0.3cm,xshift=-2.3cm]in.east) {\ \ \ \ \ };
\node at (5.6em, 1.6em) {*};
\node at (12em, 1.6em) {=};
......
......@@ -28,4 +28,7 @@
\node [right of = num20,xshift= 0.7cm]{};
\node[font=\small] at ([yshift=-0.7cm,xshift=0.3cm]num12.south) {输入:4$\times$4};
\node[font=\small] at ([yshift=-1.3cm,xshift=0.3cm]num19.south) {输出:2$\times$2};
\end{tikzpicture}
\ No newline at end of file
\tikzstyle{num} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=green!2]
\tikzstyle{num} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=green!10]
\tikzstyle{pad} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=blue!10]
\begin{tikzpicture}[node distance = 0]
......@@ -74,10 +74,11 @@
\node[pad] at (3,3.6){0};
\node[pad] at (3.6,3.6){0};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=teal,line width=0.1cm] at (0,3) {};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=orange!80,line width=0.08cm] at (0.6,2.4) {};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=purple!40,line width=0.08cm,fill=purple!40,fill opacity=0.4] at (0,3) {};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=orange!40,line width=0.08cm,fill=orange!40,fill opacity=0.4] at (0.6,2.4) {};
\fill (4.55,1.5) circle (2pt);
%\fill (4.55,1.5) circle (2pt);
\node [] at (4.55,1.5) {*};
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\node[num] at (5.5,0.9){0};
......@@ -137,10 +138,16 @@
\node[num] at (10.8,3){2};
\node[num] at (11.4,3){4};
\node[minimum width = 0.6cm,minimum height = 0.6cm,draw=teal,line width=0.1cm] at (8.4,3) {};
\node[minimum width = 0.6cm,minimum height = 0.6cm,draw=orange!80,line width=0.08cm] at (9,2.4) {};
\node[minimum width = 0.6cm,minimum height = 0.6cm,draw=purple!40,line width=0.08cm,fill=purple!40,fill opacity=0.4] at (8.4,3) {0};
\node[minimum width = 0.6cm,minimum height = 0.6cm,draw=orange!40,line width=0.08cm,fill=orange!40,fill opacity=0.4] at (9,2.4) {1};
\draw (1.5,-1.5)node{\small{输入:8$\times$8(填充后)}};
\draw (5.7,-1.5)node{\small{卷积核:3$\times$3}};
\draw (10,-1.5)node{\small{输出:6$\times$6}};
%\draw (1.5,-1.5)node{\small{输入:8$\times$8(填充后)}};
%\draw (5.7,-1.5)node{\small{卷积核:3$\times$3}};
%\draw (10,-1.5)node{\small{输出:6$\times$6}};
\node[] (in) at (1.5,-1.5) {\small{输入:8$\times$8(填充后)}};
\node[] at (6,-1.5) {\small{卷积核:3$\times$3}};
\node[] (out) at (10,-1.5) {\small{输出:6$\times$6}};
%\node[font=\footnotesize,dashed,draw=teal,very thick,fill=green!5,align=center] at ([yshift=4cm,xshift=1.8cm]out.east) {*\ \ :表示\\卷积计算};
\node[] at ([yshift=-0.3cm,xshift=-2.3cm]in.east) {\ \ \ \ \ };
\end{tikzpicture}
\ No newline at end of file
......@@ -23,6 +23,13 @@
\draw[fill=blue!40,draw=blue!40] ([yshift=0.05em]a4.north) -- ([yshift=0.05em]a6.north) -- ([yshift=-0.05em]b4.south) -- ([yshift=0.05em]a4.north);
\draw[fill=blue!50,draw=blue!50] ([yshift=0.05em]a5.north) -- ([yshift=0.05em]a7.north) -- ([yshift=-0.05em]b5.south) -- ([yshift=0.05em]a5.north);
\node[inner sep=2pt,word,fill=blue!5,draw=blue!60,dotted,very thick,align=center,font=\scriptsize] (sub) at ([xshift=2.3cm,yshift=-0.8cm]b5.east) {
\begin{tabular}{r l}
$<$p$>$& 填充 \\
$<$s$>$& 表示序列的开始 \\
$<$/s$>$& 表示序列的终止
\end{tabular}};
\node[] at ([xshift=-6em]a1.west) {};
\end{tikzpicture}
\ No newline at end of file
......@@ -43,6 +43,15 @@
\node[word] (w6) at ([yshift=-\bd]w5) {日子};
\node[word] (w7) at ([yshift=-\bd]w6) {};
\node[word] (w8) at ([yshift=-\bd]w7) {$<$p$>$};
\node[inner sep=2pt,word,fill=green!5,draw=ugreen,dotted,very thick,minimum width=6em,align=center,minimum height=4em] (sub) at ([xshift=11cm,yshift=-0.1cm]w1.east) {
\begin{tabular}{r l}
$<$p$>$& 填充 \\
$\sigma$& sigmoid函数 \\
$\otimes$& 按位乘运算 \\
*& 卷积计算
\end{tabular}};
% $<$p$>$:填充 \\ $\sigma$:sigmoid函数 \\ $\otimes$:按位乘运算 \\ *:卷积计算
\node[circle, draw, minimum size=0.7em, inner sep=0.5pt,font=\footnotesize,fill=yellow!40] (c1) at ([yshift=1.2em]b.north) {$\sigma$};
\node[circle, draw, minimum size=0.7em, inner sep=0.5pt,font=\footnotesize,fill=yellow!40] (c2) at ([xshift=1.5em]b.east) {};
......
\begin{tikzpicture}
\tikzstyle{every node}=[scale=1.5]
\tikzstyle{unit} = [inner sep=0pt, minimum size=1em,draw,thick]
\tikzstyle{unit} = [inner sep=0pt, minimum size=1em,draw,very thick]
%, font=\scriptsize
\tikzstyle{vuale} = [inner sep=0pt,font=\tiny]
......@@ -17,19 +17,19 @@
}
\foreach \point in {0,1,2}{
\draw[line width=0.5pt, ugreen!80, -latex] (l1_\point.east) -- (r2_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l1_\point.east) -- (r2_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l1_\point.east) -- (r2_2.west);
\draw[line width=0.5pt, ugreen!80, -latex] (l2_\point.east) -- (r2_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l2_\point.east) -- (r2_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l2_\point.east) -- (r2_2.west);
\draw[line width=0.5pt, ugreen!80, -latex] (l2_\point.east) -- (r3_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l2_\point.east) -- (r3_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l2_\point.east) -- (r3_2.west);
\draw[line width=0.5pt, ugreen!80, -latex] (l3_\point.east) -- (r3_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l3_\point.east) -- (r3_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l3_\point.east) -- (r3_2.west);
\draw[line width=0.5pt, ugreen!80, -latex] (l1_\point.east) -- ([xshift=-0.1em,yshift=-0.1em]r2_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l1_\point.east) -- ([xshift=-0.1em,yshift=-0.1em]r2_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l1_\point.east) -- ([xshift=-0.1em,yshift=-0.1em]r2_2.west);
\draw[line width=0.5pt, ugreen!80, -latex] (l2_\point.east) -- ([xshift=0em,yshift=0.1em]r2_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l2_\point.east) -- ([xshift=0em,yshift=0.1em]r2_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l2_\point.east) -- ([xshift=0em,yshift=0.1em]r2_2.west);
\draw[line width=0.5pt, ugreen!80, -latex] (l2_\point.east) -- ([xshift=-0.1em,yshift=-0.1em]r3_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l2_\point.east) -- ([xshift=-0.1em,yshift=-0.1em]r3_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l2_\point.east) -- ([xshift=-0.1em,yshift=-0.1em]r3_2.west);
\draw[line width=0.5pt, ugreen!80, -latex] (l3_\point.east) -- ([xshift=0em,yshift=0.1em]r3_0.west);
\draw[line width=0.5pt, orange!80, -latex] (l3_\point.east) -- ([xshift=0em,yshift=0.1em]r3_1.west);
\draw[line width=0.5pt, cyan!80, -latex] (l3_\point.east) -- ([xshift=0em,yshift=0.1em]r3_2.west);
}
\node[vuale] at ([xshift=-0.9em]l1_1.west) {$x_3$};
......
......@@ -3,15 +3,15 @@
\begin{tikzpicture}[node distance = 0cm]
\node(num1)[num]{\textbf1};
\node(num2)[num,right of = num1,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
\node(num3)[num,right of = num2,xshift = 1.2cm]{\textbf{\textcolor{blue!70}3}};
\node(num4)[num,right of = num3,xshift = 1.2cm]{\textbf{\textcolor{blue!70}4}};
\node(num5)[num,right of = num4,xshift = 1.2cm]{\textbf{\textcolor{blue!70}5}};
\node(num6)[num,right of = num5,xshift = 1.2cm]{\textbf{\textcolor{blue!70}6}};
\node(num7)[num,right of = num6,xshift = 1.2cm]{\textbf{\textcolor{blue!70}7}};
\node(num8)[num,right of = num7,xshift = 1.2cm]{\textbf{\textcolor{blue!70}8}};
\node(num9)[num,right of = num8,xshift = 1.2cm]{\textbf9};
\node(num1)[num]{$\mathbi{e}_1$};
\node(num2)[num,right of = num1,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_2$}};
\node(num3)[num,right of = num2,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_3$}};
\node(num4)[num,right of = num3,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_4$}};
\node(num5)[num,right of = num4,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_5$}};
\node(num6)[num,right of = num5,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_6$}};
\node(num7)[num,right of = num6,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_7$}};
\node(num8)[num,right of = num7,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_8$}};
\node(num9)[num,right of = num8,xshift = 1.2cm]{$\mathbi{e}_9$};
\node(A)[below of = num2,yshift = -0.6cm]{A};
\node(B)[below of = num8,yshift = -0.6cm]{B};
......
......@@ -2,21 +2,21 @@
\begin{tikzpicture}[node distance = 0cm]
\node(num1_0)[num, fill = blue!40]{\textbf{\textcolor{white}0}};
\node(num1_1)[num,right of = num1_0,xshift = 1.2cm]{\textbf1};
\node(num1_2)[num,right of = num1_1,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
\node(num1_3)[num,right of = num1_2,xshift = 1.2cm]{\textbf{\textcolor{blue!70}3}};
\node(num1_4)[num,right of = num1_3,xshift = 1.2cm]{\textbf{\textcolor{blue!70}4}};
\node(num1_5)[num,right of = num1_4,xshift = 1.2cm]{\textbf{\textcolor{blue!70}5}};
\node(num1_6)[num,right of = num1_5,xshift = 1.2cm]{\textbf{\textcolor{blue!70}6}};
\node(num1_7)[num,right of = num1_6,xshift = 1.2cm]{\textbf{\textcolor{blue!70}7}};
\node(num1_8)[num,right of = num1_7,xshift = 1.2cm]{\textbf{\textcolor{blue!70}8}};
\node(num1_9)[num,right of = num1_8,xshift = 1.2cm]{\textbf9};
\node(num1_10)[num,right of = num1_9,xshift = 1.2cm, fill = blue!40]{\textbf0};
\node(num1_0)[num, fill = blue!40]{\textcolor{white}{$\mathbi{0}$}};
\node(num1_1)[num,right of = num1_0,xshift = 1.2cm]{$\mathbi{e}_1$};
\node(num1_2)[num,right of = num1_1,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_2$}};
\node(num1_3)[num,right of = num1_2,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_3$}};
\node(num1_4)[num,right of = num1_3,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_4$}};
\node(num1_5)[num,right of = num1_4,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_5$}};
\node(num1_6)[num,right of = num1_5,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_6$}};
\node(num1_7)[num,right of = num1_6,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_7$}};
\node(num1_8)[num,right of = num1_7,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_8$}};
\node(num1_9)[num,right of = num1_8,xshift = 1.2cm]{$\mathbi{e}_9$};
\node(num1_10)[num,right of = num1_9,xshift = 1.2cm, fill = blue!40]{$\mathbi{0}$};
\node(A)[below of = num2,yshift = -0.6cm]{A};
\node(B)[below of = num8,yshift = -0.6cm]{B};
\node(num2_0)[num,above of = num1_0,yshift = 1.2cm, fill = blue!40]{\textbf{\textcolor{white}0}};
\node(num2_0)[num,above of = num1_0,yshift = 1.2cm, fill = blue!40]{\textcolor{white}{$\mathbi{0}$}};
\node(num2_1)[num,right of = num2_0,xshift = 1.2cm]{\textbf2};
\node(num2_2)[num,right of = num2_1,xshift = 1.2cm]{\textbf2};
\node(num2_3)[num,right of = num2_2,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
......@@ -26,9 +26,9 @@
\node(num2_7)[num,right of = num2_6,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
\node(num2_8)[num,right of = num2_7,xshift = 1.2cm]{\textbf2};
\node(num2_9)[num,right of = num2_8,xshift = 1.2cm]{\textbf2};
\node(num2_10)[num,right of = num2_9,xshift = 1.2cm, fill = blue!40]{\textbf0};
\node(num2_10)[num,right of = num2_9,xshift = 1.2cm, fill = blue!40]{$\mathbi{0}$};
\node(num3_0)[num,above of = num2_0,yshift = 1.2cm, fill = blue!40]{\textbf{\textcolor{white}0}};
\node(num3_0)[num,above of = num2_0,yshift = 1.2cm, fill = blue!40]{\textcolor{white}{$\mathbi{0}$}};
\node(num3_1)[num,right of = num3_0,xshift = 1.2cm]{\textbf3};
\node(num3_2)[num,right of = num3_1,xshift = 1.2cm]{\textbf3};
\node(num3_3)[num,right of = num3_2,xshift = 1.2cm]{\textbf3};
......@@ -38,9 +38,9 @@
\node(num3_7)[num,right of = num3_6,xshift = 1.2cm]{\textbf3};
\node(num3_8)[num,right of = num3_7,xshift = 1.2cm]{\textbf3};
\node(num3_9)[num,right of = num3_8,xshift = 1.2cm]{\textbf3};
\node(num3_10)[num,right of = num3_9,xshift = 1.2cm, fill = blue!40]{\textbf0};
\node(num3_10)[num,right of = num3_9,xshift = 1.2cm, fill = blue!40]{$\mathbi{0}$};
\node(num4_0)[num,above of = num3_0,yshift = 1.2cm, fill = blue!40]{\textbf{\textcolor{white}0}};
\node(num4_0)[num,above of = num3_0,yshift = 1.2cm, fill = blue!40]{\textcolor{white}{$\mathbi{0}$}};
\node(num4_1)[num,right of = num4_0,xshift = 1.2cm]{\textbf4};
\node(num4_2)[num,right of = num4_1,xshift = 1.2cm]{\textbf4};
\node(num4_3)[num,right of = num4_2,xshift = 1.2cm]{\textbf4};
......@@ -50,61 +50,61 @@
\node(num4_7)[num,right of = num4_6,xshift = 1.2cm]{\textbf4};
\node(num4_8)[num,right of = num4_7,xshift = 1.2cm]{\textbf4};
\node(num4_9)[num,right of = num4_8,xshift = 1.2cm]{\textbf4};
\node(num4_10)[num,right of = num4_9,xshift = 1.2cm, fill = blue!40]{\textbf0};
\node(num4_10)[num,right of = num4_9,xshift = 1.2cm, fill = blue!40]{$\mathbi{0}$};
\draw [->, thick](num1_0.north)--(num2_1.south);
\draw [->, thick](num2_0.north)--(num3_1.south);
\draw [->, thick](num3_0.north)--(num4_1.south);
\draw [->, thick](num1_1.north)--(num2_2.south);
\draw [->, thick](num2_1.north)--(num3_2.south);
\draw [->, thick](num3_1.north)--(num4_2.south);
\draw [->, thick, color = blue!60](num1_2.north)--(num2_3.south);
\draw [->, thick](num2_2.north)--(num3_3.south);
\draw [->, thick](num3_2.north)--(num4_3.south);
\draw [->, thick, color = blue!60](num1_3.north)--(num2_4.south);
\draw [->, thick, color = blue!60](num2_3.north)--(num3_4.south);
\draw [->, thick](num3_3.north)--(num4_4.south);
\draw [->, thick, color = blue!60](num1_4.north)--(num2_5.south);
\draw [->, thick, color = blue!60](num2_4.north)--(num3_5.south);
\draw [->, thick, color = blue!60](num3_4.north)--(num4_5.south);
\draw [->, thick, color = blue!60](num1_5.north)--(num2_6.south);
\draw [->, thick, color = blue!60](num2_5.north)--(num3_6.south);
\draw [->, thick](num3_5.north)--(num4_6.south);
\draw [->, thick, color = blue!60](num1_6.north)--(num2_7.south);
\draw [->, thick](num2_6.north)--(num3_7.south);
\draw [->, thick](num3_6.north)--(num4_7.south);
\draw [->, thick](num1_7.north)--(num2_8.south);
\draw [->, thick](num2_7.north)--(num3_8.south);
\draw [->, thick](num3_7.north)--(num4_8.south);
\draw [->, thick](num1_8.north)--(num2_9.south);
\draw [->, thick](num2_8.north)--(num3_9.south);
\draw [->, thick](num3_8.north)--(num4_9.south);
\draw [->, thick](num1_0.north)--([xshift=-0.1em,yshift=-0.1em]num2_1.south);
\draw [->, thick](num2_0.north)--([xshift=-0.1em,yshift=-0.1em]num3_1.south);
\draw [->, thick](num3_0.north)--([xshift=-0.1em,yshift=-0.1em]num4_1.south);
\draw [->, thick](num1_1.north)--([xshift=-0.1em,yshift=-0.1em]num2_2.south);
\draw [->, thick](num2_1.north)--([xshift=-0.1em,yshift=-0.1em]num3_2.south);
\draw [->, thick](num3_1.north)--([xshift=-0.1em,yshift=-0.1em]num4_2.south);
\draw [->, thick, color = blue!60](num1_2.north)--([xshift=-0.1em,yshift=-0.1em]num2_3.south);
\draw [->, thick](num2_2.north)--([xshift=-0.1em,yshift=-0.1em]num3_3.south);
\draw [->, thick](num3_2.north)--([xshift=-0.1em,yshift=-0.1em]num4_3.south);
\draw [->, thick, color = blue!60](num1_3.north)--([xshift=-0.1em,yshift=-0.1em]num2_4.south);
\draw [->, thick, color = blue!60](num2_3.north)--([xshift=-0.1em,yshift=-0.1em]num3_4.south);
\draw [->, thick](num3_3.north)--([xshift=-0.1em,yshift=-0.1em]num4_4.south);
\draw [->, thick, color = blue!60](num1_4.north)--([xshift=-0.1em,yshift=-0.1em]num2_5.south);
\draw [->, thick, color = blue!60](num2_4.north)--([xshift=-0.1em,yshift=-0.1em]num3_5.south);
\draw [->, thick, color = blue!60](num3_4.north)--([xshift=-0.1em,yshift=-0.1em]num4_5.south);
\draw [->, thick, color = blue!60](num1_5.north)--([xshift=-0.1em,yshift=-0.1em]num2_6.south);
\draw [->, thick, color = blue!60](num2_5.north)--([xshift=-0.1em,yshift=-0.1em]num3_6.south);
\draw [->, thick](num3_5.north)--([xshift=-0.1em,yshift=-0.1em]num4_6.south);
\draw [->, thick, color = blue!60](num1_6.north)--([xshift=-0.1em,yshift=-0.1em]num2_7.south);
\draw [->, thick](num2_6.north)--([xshift=-0.1em,yshift=-0.1em]num3_7.south);
\draw [->, thick](num3_6.north)--([xshift=-0.1em,yshift=-0.1em]num4_7.south);
\draw [->, thick](num1_7.north)--([xshift=-0.1em,yshift=-0.1em]num2_8.south);
\draw [->, thick](num2_7.north)--([xshift=-0.1em,yshift=-0.1em]num3_8.south);
\draw [->, thick](num3_7.north)--([xshift=-0.1em,yshift=-0.1em]num4_8.south);
\draw [->, thick](num1_8.north)--([xshift=-0.1em,yshift=-0.1em]num2_9.south);
\draw [->, thick](num2_8.north)--([xshift=-0.1em,yshift=-0.1em]num3_9.south);
\draw [->, thick](num3_8.north)--([xshift=-0.1em,yshift=-0.1em]num4_9.south);
\draw [->, thick](num1_2.north)--(num2_1.south);
\draw [->, thick](num2_2.north)--(num3_1.south);
\draw [->, thick](num3_2.north)--(num4_1.south);
\draw [->, thick](num1_3.north)--(num2_2.south);
\draw [->, thick](num2_3.north)--(num3_2.south);
\draw [->, thick](num3_3.north)--(num4_2.south);
\draw [->, thick, color = blue!60](num1_4.north)--(num2_3.south);
\draw [->, thick](num2_4.north)--(num3_3.south);
\draw [->, thick](num3_4.north)--(num4_3.south);
\draw [->, thick, color = blue!60](num1_5.north)--(num2_4.south);
\draw [->, thick, color = blue!60](num2_5.north)--(num3_4.south);
\draw [->, thick](num3_5.north)--(num4_4.south);
\draw [->, thick, color = blue!60](num1_6.north)--(num2_5.south);
\draw [->, thick, color = blue!60](num2_6.north)--(num3_5.south);
\draw [->, thick, color = blue!60](num3_6.north)--(num4_5.south);
\draw [->, thick, color = blue!60](num1_7.north)--(num2_6.south);
\draw [->, thick, color = blue!60](num2_7.north)--(num3_6.south);
\draw [->, thick](num3_7.north)--(num4_6.south);
\draw [->, thick, color = blue!60](num1_8.north)--(num2_7.south);
\draw [->, thick](num2_8.north)--(num3_7.south);
\draw [->, thick](num3_8.north)--(num4_7.south);
\draw [->, thick](num1_9.north)--(num2_8.south);
\draw [->, thick](num2_9.north)--(num3_8.south);
\draw [->, thick](num3_9.north)--(num4_8.south);
\draw [->, thick](num1_10.north)--(num2_9.south);
\draw [->, thick](num2_10.north)--(num3_9.south);
\draw [->, thick](num3_10.north)--(num4_9.south);
\draw [->, thick](num1_2.north)--([xshift=0.1em,yshift=-0.1em]num2_1.south);
\draw [->, thick](num2_2.north)--([xshift=0.1em,yshift=-0.1em]num3_1.south);
\draw [->, thick](num3_2.north)--([xshift=0.1em,yshift=-0.1em]num4_1.south);
\draw [->, thick](num1_3.north)--([xshift=0.1em,yshift=-0.1em]num2_2.south);
\draw [->, thick](num2_3.north)--([xshift=0.1em,yshift=-0.1em]num3_2.south);
\draw [->, thick](num3_3.north)--([xshift=0.1em,yshift=-0.1em]num4_2.south);
\draw [->, thick, color = blue!60](num1_4.north)--([xshift=0.1em,yshift=-0.1em]num2_3.south);
\draw [->, thick](num2_4.north)--([xshift=0.1em,yshift=-0.1em]num3_3.south);
\draw [->, thick](num3_4.north)--([xshift=0.1em,yshift=-0.1em]num4_3.south);
\draw [->, thick, color = blue!60](num1_5.north)--([xshift=0.1em,yshift=-0.1em]num2_4.south);
\draw [->, thick, color = blue!60](num2_5.north)--([xshift=0.1em,yshift=-0.1em]num3_4.south);
\draw [->, thick](num3_5.north)--([xshift=0.1em,yshift=-0.1em]num4_4.south);
\draw [->, thick, color = blue!60](num1_6.north)--([xshift=0.1em,yshift=-0.1em]num2_5.south);
\draw [->, thick, color = blue!60](num2_6.north)--([xshift=0.1em,yshift=-0.1em]num3_5.south);
\draw [->, thick, color = blue!60](num3_6.north)--([xshift=0.1em,yshift=-0.1em]num4_5.south);
\draw [->, thick, color = blue!60](num1_7.north)--([xshift=0.1em,yshift=-0.1em]num2_6.south);
\draw [->, thick, color = blue!60](num2_7.north)--([xshift=0.1em,yshift=-0.1em]num3_6.south);
\draw [->, thick](num3_7.north)--([xshift=0.1em,yshift=-0.1em]num4_6.south);
\draw [->, thick, color = blue!60](num1_8.north)--([xshift=0.1em,yshift=-0.1em]num2_7.south);
\draw [->, thick](num2_8.north)--([xshift=0.1em,yshift=-0.1em]num3_7.south);
\draw [->, thick](num3_8.north)--([xshift=0.1em,yshift=-0.1em]num4_7.south);
\draw [->, thick](num1_9.north)--([xshift=0.1em,yshift=-0.1em]num2_8.south);
\draw [->, thick](num2_9.north)--([xshift=0.1em,yshift=-0.1em]num3_8.south);
\draw [->, thick](num3_9.north)--([xshift=0.1em,yshift=-0.1em]num4_8.south);
\draw [->, thick](num1_10.north)--([xshift=0.1em,yshift=-0.1em]num2_9.south);
\draw [->, thick](num2_10.north)--([xshift=0.1em,yshift=-0.1em]num3_9.south);
\draw [->, thick](num3_10.north)--([xshift=0.1em,yshift=-0.1em]num4_9.south);
\end{tikzpicture}
\ No newline at end of file
%\newlength{\bcc}
\setlength{\bcc}{0.4cm}
\begin{tikzpicture}
\begin{scope}
%\tikzstyle{every node}=[scale=0.8]
\tikzstyle{line} = [dash pattern=on 2pt off 1pt,line width=0.5pt]
\tikzstyle{cir} = [thin,fill=blue!8,draw,circle,minimum size =0.5em,drop shadow={shadow xshift=0.15em, shadow yshift=-0.1em}]
\tikzstyle{word} = [inner sep=0pt, font=\scriptsize,minimum height=\bc]
\tikzstyle{word} = [inner sep=0pt, font=\scriptsize,minimum height=\bcc]
\draw[fill=red!8,line width=0.2pt] (0cm,0cm+1*\bc) rectangle (0cm+4*\bc,0cm+7*\bc);
\draw[fill=cyan!14,line width=0.2pt] (0cm,0cm) rectangle (0cm+4*\bc,0cm+1*\bc);
\draw[fill=cyan!14,line width=0.2pt] (0cm,0cm+7*\bc) rectangle (0cm+4*\bc,0cm+8*\bc);
\draw[step=\bc] (0cm,0cm) grid (0cm+4*\bc,0cm+8*\bc);
%\draw[line width=0.7pt] (0cm,0cm) rectangle (0cm+4*\bc,0cm+8*\bc);
\draw[red!50,line width=1.8pt] (0cm,0cm+5*\bc) rectangle (0cm+4*\bc,0cm+8*\bc);
\draw[ugreen!50,line width=1.8pt] (0cm,0cm+1*\bc) rectangle (0cm+4*\bc,0cm+4*\bc);
\draw[fill=red!8,line width=0.2pt] (0cm,0cm+1*\bcc) rectangle (0cm+4*\bcc,0cm+7*\bcc);
\draw[fill=cyan!14,line width=0.2pt] (0cm,0cm) rectangle (0cm+4*\bcc,0cm+1*\bcc);
\draw[fill=cyan!14,line width=0.2pt] (0cm,0cm+7*\bcc) rectangle (0cm+4*\bcc,0cm+8*\bcc);
\draw[step=\bcc] (0cm,0cm) grid (0cm+4*\bcc,0cm+8*\bcc);
%\draw[line width=0.7pt] (0cm,0cm) rectangle (0cm+4*\bcc,0cm+8*\bcc);
\draw[red!50,line width=1.8pt] (0cm,0cm+5*\bcc) rectangle (0cm+4*\bcc,0cm+8*\bcc);
\draw[ugreen!50,line width=1.8pt] (0cm,0cm+1*\bcc) rectangle (0cm+4*\bcc,0cm+4*\bcc);
\draw[fill=blue!8,xshift=5.0cm,yshift=1.0cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[step=\bc,xshift=5.0cm,yshift=1.0cm] (0cm,0cm) grid (0cm+1*\bc,0cm+6*\bc);
%\draw[line width=0.7pt,xshift=5.0cm,yshift=1.0cm] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[fill=blue!8,xshift=5.0cm,yshift=1.0cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,xshift=5.0cm,yshift=1.0cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.0cm,yshift=1.0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!8,xshift=5.2cm,yshift=0.8cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[step=\bc,xshift=5.2cm,yshift=0.8cm] (0cm,0cm) grid (0cm+1*\bc,0cm+6*\bc);
%\draw[line width=0.7pt,xshift=5.2cm,yshift=0.8cm] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[ugreen!50,line width=2pt,xshift=5.2cm,yshift=0.8cm] (0cm,0cm+1*\bc) rectangle (0cm+1*\bc,0cm+2*\bc);
\draw[fill=blue!8,xshift=5.2cm,yshift=0.8cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,xshift=5.2cm,yshift=0.8cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.2cm,yshift=0.8cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[ugreen!50,line width=2pt,xshift=5.2cm,yshift=0.8cm] (0cm,0cm+1*\bcc) rectangle (0cm+1*\bcc,0cm+2*\bcc);
\draw[fill=blue!8,xshift=5.4cm,yshift=0.6cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[step=\bc,xshift=5.4cm,yshift=0.6cm] (0cm,0cm) grid (0cm+1*\bc,0cm+6*\bc);
%\draw[line width=0.7pt,xshift=5.4cm,yshift=0.6cm] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[fill=blue!8,xshift=5.4cm,yshift=0.6cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,xshift=5.4cm,yshift=0.6cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.4cm,yshift=0.6cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!8,xshift=5.6cm,yshift=0.4cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[step=\bc,xshift=5.6cm,yshift=0.4cm] (0cm,0cm) grid (0cm+1*\bc,0cm+6*\bc);
%\draw[line width=0.7pt,xshift=5.6cm,yshift=0.4cm] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[red!50,line width=2pt,xshift=5.6cm,yshift=0.4cm] (0cm,0cm+5*\bc) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[fill=blue!8,xshift=5.6cm,yshift=0.4cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,xshift=5.6cm,yshift=0.4cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.6cm,yshift=0.4cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[red!50,line width=2pt,xshift=5.6cm,yshift=0.4cm] (0cm,0cm+5*\bcc) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[red!50,line width=0.5pt] (0cm+4*\bc,0cm+8*\bc) -- ([xshift=5.6cm,yshift=0.4cm]0cm,0cm+6*\bc);
\draw[red!50,line width=0.5pt] (0cm+4*\bc,0cm+5*\bc) -- ([xshift=5.6cm,yshift=0.4cm]0cm,0cm+5*\bc);
\draw[red!50,line width=0.5pt] (0cm+4*\bcc,0cm+8*\bcc) -- ([xshift=5.6cm,yshift=0.4cm]0cm,0cm+6*\bcc);
\draw[red!50,line width=0.5pt] (0cm+4*\bcc,0cm+5*\bcc) -- ([xshift=5.6cm,yshift=0.4cm]0cm,0cm+5*\bcc);
\draw[ugreen!50,line width=0.5pt] (0cm+4*\bc,0cm+4*\bc) -- ([xshift=5.2cm,yshift=0.8cm]0cm,0cm+2*\bc);
\draw[ugreen!50,line width=0.5pt] (0cm+4*\bc,0cm+1*\bc) -- ([xshift=5.2cm,yshift=0.8cm]0cm,0cm+1*\bc);
\draw[ugreen!50,line width=0.5pt] (0cm+4*\bcc,0cm+4*\bcc) -- ([xshift=5.2cm,yshift=0.8cm]0cm,0cm+2*\bcc);
\draw[ugreen!50,line width=0.5pt] (0cm+4*\bcc,0cm+1*\bcc) -- ([xshift=5.2cm,yshift=0.8cm]0cm,0cm+1*\bcc);
\node[word] (w1) at (-0.5cm, 3.0cm) {$<$p$>$};
\node[word] (w2) at ([yshift=-\bc]w1) {今天};
\node[word] (w3) at ([yshift=-\bc]w2) {};
\node[word] (w4) at ([yshift=-\bc]w3) {};
\node[word] (w5) at ([yshift=-\bc]w4) {};
\node[word] (w6) at ([yshift=-\bc]w5) {日子};
\node[word] (w7) at ([yshift=-\bc]w6) {};
\node[word] (w8) at ([yshift=-\bc]w7) {$<$p$>$};
\node[word] (w2) at ([yshift=-\bcc]w1) {今天};
\node[word] (w3) at ([yshift=-\bcc]w2) {};
\node[word] (w4) at ([yshift=-\bcc]w3) {};
\node[word] (w5) at ([yshift=-\bcc]w4) {};
\node[word] (w6) at ([yshift=-\bcc]w5) {日子};
\node[word] (w7) at ([yshift=-\bcc]w6) {};
\node[word] (w8) at ([yshift=-\bcc]w7) {$<$p$>$};
\node[inner xsep=2pt,inner ysep=0pt,font=\footnotesize] (c1) at (0.8cm, 3.5cm) {$O$};
\draw[-latex] (c1.west) -- ([xshift=-0.4cm]c1.west);
......@@ -58,8 +59,9 @@
\draw[-latex] (c2.east) -- ([xshift=0.2cm, yshift=-0.2cm]c2.east);
%%%%%%%%%%%%%%%%%%%%%
\node[word] (sub) at ([xshift=9.5*\bc,yshift=3*\bc]w1) {$O$:卷积核大小};
\node[word] (sub2) at ([yshift=-0.5*\bc]sub.south) {$N$:卷积核数量};
\node[word] (sub) at ([xshift=9.5*\bcc,yshift=3.5*\bcc]w1) {$O$\ \ :输入通道数};
\node[word] (sub2) at ([yshift=-0.5*\bcc]sub.south) {$N$\ \ :卷积核数量};
\node[word] (sub3) at ([xshift=-1.35em,yshift=-0.5*\bcc]sub2.south) {$<$p$>$:填充};
\end{scope}
......
%\newlength{\bcc}
\setlength{\bcc}{0.4cm}
\begin{tikzpicture}
\begin{scope}
%\tikzstyle{every node}=[scale=0.8]
\tikzstyle{line} = [dash pattern=on 2pt off 1pt,line width=0.5pt]
\tikzstyle{line} = [dash pattern=on 2pt off 1pt,line width=0.6pt]
\tikzstyle{cir} = [thin,fill=blue!8,draw,circle,minimum size =0.5em,drop shadow={shadow xshift=0.15em, shadow yshift=-0.1em}]
\tikzstyle{word} = [inner sep=0pt, font=\footnotesize,minimum height=\bc]
\tikzstyle{word} = [inner sep=0pt, font=\footnotesize,minimum height=\bcc]
\draw[fill=blue!8,xshift=0.3cm,yshift=0.5cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+6*\bc,0cm+9*\bc);
\draw[ugreen!60,step=\bc,xshift=0.3cm,yshift=0.5cm,gray] (0cm,0cm) grid (0cm+6*\bc,0cm+9*\bc);
%\draw[line width=0.7pt,xshift=0.3cm,yshift=0.5cm] (0cm,0cm) rectangle (0cm+6*\bc,0cm+9*\bc);
\draw[red!60,line width=2pt,xshift=0.3cm,yshift=0.5cm] (0cm,0cm+2*\bc) rectangle (0cm+6*\bc,0cm+4*\bc);
\draw[fill=blue!8,xshift=0.3cm,yshift=0.5cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[ugreen!60,step=\bcc,xshift=0.3cm,yshift=0.5cm,gray] (0cm,0cm) grid (0cm+6*\bcc,0cm+9*\bcc);
%\draw[line width=0.7pt,xshift=0.3cm,yshift=0.5cm] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[red!60,line width=2pt,xshift=0.3cm,yshift=0.5cm] (0cm,0cm+2*\bcc) rectangle (0cm+6*\bcc,0cm+4*\bcc);
\draw[thick,fill=blue!8,line width=0.2pt] (0cm,0cm) rectangle (0cm+6*\bc,0cm+9*\bc);
\draw[step=\bc,gray] (0cm,0cm) grid (0cm+6*\bc,0cm+9*\bc);
%\draw[line width=0.7pt] (0cm,0cm) rectangle (0cm+6*\bc,0cm+9*\bc);
\draw[red!60,line width=2pt] (0cm,0cm) rectangle (0cm+6*\bc,0cm+2*\bc);
\draw[ugreen!60,line width=2pt] (0cm,0cm+3*\bc) rectangle (0cm+6*\bc,0cm+6*\bc);
\draw[red!60,line width=2pt] (0cm,0cm+7*\bc) rectangle (0cm+6*\bc,0cm+9*\bc);
\draw[thick,fill=blue!8,line width=0.6pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[step=\bcc,gray] (0cm,0cm) grid (0cm+6*\bcc,0cm+9*\bcc);
%\draw[line width=0.7pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[red!60,line width=2pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+2*\bcc);
\draw[ugreen!60,line width=2pt] (0cm,0cm+3*\bcc) rectangle (0cm+6*\bcc,0cm+6*\bcc);
\draw[red!60,line width=2pt] (0cm,0cm+7*\bcc) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[fill=blue!8,xshift=5.0cm,yshift=1.3cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[step=\bc,gray,xshift=5.0cm,yshift=1.3cm] (0cm,0cm) grid (0cm+1*\bc,0cm+6*\bc);
%\draw[xshift=5.0cm,yshift=1.3cm,line width=0.7pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[ugreen!60,line width=2pt,xshift=5.0cm,yshift=1.3cm] (0cm,0cm+2*\bc) rectangle (0cm+1*\bc,0cm+3*\bc);
\draw[fill=blue!8,xshift=5.0cm,yshift=1.3cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,gray,xshift=5.0cm,yshift=1.3cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[xshift=5.0cm,yshift=1.3cm,line width=0.7pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[ugreen!60,line width=2pt,xshift=5.0cm,yshift=1.3cm] (0cm,0cm+2*\bcc) rectangle (0cm+1*\bcc,0cm+3*\bcc);
\draw [gray,fill=blue!8](8cm,2.6cm) -- (8.4cm, 2.6cm) -- (9cm,1cm) -- (8.6cm, 1cm) -- (8cm,2.6cm);
\draw [gray,fill=blue!8,line width=0.6pt](8cm,2.6cm) -- (8.4cm, 2.6cm) -- (9cm,1cm) -- (8.6cm, 1cm) -- (8cm,2.6cm);
\draw [gray](8.15cm,2.2cm) -- (8.55cm,2.2cm);
\draw [gray](8.3cm,1.8cm) -- (8.7cm,1.8cm);
\draw [gray](8.45cm,1.4cm) -- (8.85cm,1.4cm);
\draw [gray,fill=blue!8](11cm,2.2cm) -- (11.4cm, 2.2cm) -- (11.7cm,1.4cm) -- (11.3cm, 1.4cm) -- (11cm,2.2cm);
\draw [gray,fill=blue!8,line width=0.6pt](11cm,2.2cm) -- (11.4cm, 2.2cm) -- (11.7cm,1.4cm) -- (11.3cm, 1.4cm) -- (11cm,2.2cm);
\draw [gray](11.15cm,1.8cm) -- (11.55cm,1.8cm);
\draw[ugreen!60,line] ([xshift=5.0cm,yshift=1.3cm]0cm+1*\bc,0cm+6*\bc) -- (8cm,2.6cm);
\draw[ugreen!60,line] ([xshift=5.0cm,yshift=1.3cm]0cm+1*\bc,0cm) -- (8.15cm,2.2cm);
\draw[ugreen!60,line] ([xshift=5.0cm,yshift=1.3cm]0cm+1*\bcc,0cm+6*\bcc) -- (8cm,2.6cm);
\draw[ugreen!60,line] ([xshift=5.0cm,yshift=1.3cm]0cm+1*\bcc,0cm) -- (8.15cm,2.2cm);
\draw[fill=blue!8,xshift=5.2cm,yshift=1.0cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[step=\bc,gray,xshift=5.2cm,yshift=1.0cm] (0cm,0cm) grid (0cm+1*\bc,0cm+6*\bc);
%\draw[line width=0.7pt,xshift=5.2cm,yshift=1.0cm] (0cm,0cm) rectangle (0cm+1*\bc,0cm+6*\bc);
\draw[fill=blue!8,xshift=5.2cm,yshift=1.0cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,gray,xshift=5.2cm,yshift=1.0cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.2cm,yshift=1.0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!8,xshift=5.4cm,yshift=0.3cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+7*\bc);
\draw[step=\bc,gray,xshift=5.4cm,yshift=0.3cm] (0cm,0cm) grid (0cm+1*\bc,0cm+7*\bc);
%\draw[line width=0.7pt,xshift=5.4cm,yshift=0.3cm] (0cm,0cm) rectangle (0cm+1*\bc,0cm+7*\bc);
\draw[fill=blue!8,xshift=5.4cm,yshift=0.3cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[step=\bcc,gray,xshift=5.4cm,yshift=0.3cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+7*\bcc);
%\draw[line width=0.7pt,xshift=5.4cm,yshift=0.3cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[fill=blue!8,xshift=5.6cm,yshift=0cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bc,0cm+7*\bc);
\draw[step=\bc,gray,xshift=5.6cm,yshift=0cm] (0cm,0cm) grid (0cm+1*\bc,0cm+7*\bc);
%\draw[line width=0.7pt,xshift=5.6cm,yshift=0cm] (0cm,0cm) rectangle (0cm+1*\bc,0cm+7*\bc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm) rectangle (0cm+1*\bc,0cm+1*\bc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm+2*\bc) rectangle (0cm+1*\bc,0cm+3*\bc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm+6*\bc) rectangle (0cm+1*\bc,0cm+7*\bc);
\draw[fill=blue!8,xshift=5.6cm,yshift=0cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[step=\bcc,gray,xshift=5.6cm,yshift=0cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+7*\bcc);
%\draw[line width=0.7pt,xshift=5.6cm,yshift=0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+1*\bcc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm+2*\bcc) rectangle (0cm+1*\bcc,0cm+3*\bcc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm+6*\bcc) rectangle (0cm+1*\bcc,0cm+7*\bcc);
\draw[line] (8.4cm, 2.6cm) -- (11cm,2.2cm);
\draw[line] (9cm,1cm) -- (11.3cm, 1.4cm);
\draw[red!60,line] ([xshift=5.6cm,yshift=0cm]0cm+1*\bc,0cm+7*\bc) -- (8.45cm,1.4cm);
\draw[red!60,line] ([xshift=5.6cm,yshift=0cm]0cm+1*\bc,0cm) -- (8.6cm, 1cm);
\draw[red!60,line] ([xshift=5.6cm,yshift=0cm]0cm+1*\bcc,0cm+7*\bcc) -- (8.45cm,1.4cm);
\draw[red!60,line] ([xshift=5.6cm,yshift=0cm]0cm+1*\bcc,0cm) -- (8.6cm, 1cm);
\draw[red!60,line] (0cm+6*\bc,0cm+9*\bc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+7*\bc);
\draw[red!60,line] (0cm+6*\bc,0cm+7*\bc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+6*\bc);
\draw[red!60,line] (0cm+6*\bc,0cm+2*\bc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+1*\bc);
\draw[red!60,line] (0cm+6*\bc,0cm) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm);
\draw[ugreen!60,line] (0cm+6*\bc,0cm+6*\bc) -- ([xshift=5.0cm,yshift=1.3cm]0cm,0cm+3*\bc);
\draw[ugreen!60,line] (0cm+6*\bc,0cm+3*\bc) -- ([xshift=5.0cm,yshift=1.3cm]0cm,0cm+2*\bc);
\draw[red!60,line] ([xshift=0.3cm,yshift=0.5cm]0cm+6*\bc,0cm+4*\bc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+3*\bc);
\draw[red!60,line] ([xshift=0.3cm,yshift=0.5cm]0cm+6*\bc,0cm+2*\bc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+2*\bc);
\draw[red!60,line] (0cm+6*\bcc,0cm+9*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+7*\bcc);
\draw[red!60,line] (0cm+6*\bcc,0cm+7*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+6*\bcc);
\draw[red!60,line] (0cm+6*\bcc,0cm+2*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+1*\bcc);
\draw[red!60,line] (0cm+6*\bcc,0cm) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm);
\draw[ugreen!60,line] (0cm+6*\bcc,0cm+6*\bcc) -- ([xshift=5.0cm,yshift=1.3cm]0cm,0cm+3*\bcc);
\draw[ugreen!60,line] (0cm+6*\bcc,0cm+3*\bcc) -- ([xshift=5.0cm,yshift=1.3cm]0cm,0cm+2*\bcc);
\draw[red!60,line] ([xshift=0.3cm,yshift=0.5cm]0cm+6*\bcc,0cm+4*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+3*\bcc);
\draw[red!60,line] ([xshift=0.3cm,yshift=0.5cm]0cm+6*\bcc,0cm+2*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+2*\bcc);
\node[word] (w1) at (-0.5cm, 3.4cm) {wait};
\node[word] (w2) at ([yshift=-\bc]w1) {for};
\node[word] (w3) at ([yshift=-\bc]w2) {the};
\node[word] (w4) at ([yshift=-\bc]w3) {video};
\node[word] (w5) at ([yshift=-\bc]w4) {and};
\node[word] (w6) at ([yshift=-\bc]w5) {do};
\node[word] (w7) at ([yshift=-\bc]w6) {n't};
\node[word] (w8) at ([yshift=-\bc]w7) {rent};
\node[word] (w9) at ([yshift=-\bc]w8) {it};
\node[word] (w2) at ([yshift=-\bcc]w1) {for};
\node[word] (w3) at ([yshift=-\bcc]w2) {the};
\node[word] (w4) at ([yshift=-\bcc]w3) {video};
\node[word] (w5) at ([yshift=-\bcc]w4) {and};
\node[word] (w6) at ([yshift=-\bcc]w5) {do};
\node[word] (w7) at ([yshift=-\bcc]w6) {n't};
\node[word] (w8) at ([yshift=-\bcc]w7) {rent};
\node[word] (w9) at ([yshift=-\bcc]w8) {it};
\node[draw,rectangle callout,callout relative pointer={(0.28,-0.6)}] at (-0.3cm,4.6cm) {\textrm{卷积核}};
\node[draw,rectangle callout,callout relative pointer={(0.1,-0.5)}] at (5cm,4.6cm) {\textrm{特征图}};
......@@ -86,7 +87,7 @@
%\draw [thick] (10cm, -0.3cm) -- (10cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{Fully connected layer \\ with dropout and \\ softmax output} (11.7cm,-0.5cm) -- (11.7cm, -0.3cm);
\draw [thick] (0cm, -0.3cm) -- (0cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{维度大小为 $m \times k$ \\ 的静态与非静态通道\\的句子表示} (2.4cm,-0.5cm) -- (2.4cm, -0.3cm);
\draw [thick] (3.6cm, -0.3cm) -- (3.6cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{具有多个不同大小\\的卷积核和特征图\\的卷积层} (6cm,-0.5cm) -- (6cm, -0.3cm);
\draw [thick] (7.2cm, -0.3cm) -- (7.2cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{Max-over-time\\ pooling} (9cm,-0.5cm) -- (9cm, -0.3cm);
\draw [thick] (7.2cm, -0.3cm) -- (7.2cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{最大池化} (9cm,-0.5cm) -- (9cm, -0.3cm);
\draw [thick] (10cm, -0.3cm) -- (10cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{带有dropout\\和softmax输出\\的全连接层} (11.7cm,-0.5cm) -- (11.7cm, -0.3cm);
......
......@@ -16,6 +16,7 @@
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{表}%将table改为表
\chapterimage{fig-NEU-2.jpg} % Chapter heading image
\newlength{\bcc}
%----------------------------------------------------------------------------------------
% CHAPTER 11
......@@ -33,7 +34,7 @@
\section{卷积神经网络}
\parinterval {\small\bfnew{卷积神经网络}}\index{卷积神经网络}(Convolutional Neural Network,CNN)\index{Convolutional Neural Network,CNN} 是一种前馈神经网络,由若干的卷积层与池化层组成。其最早在语音识别任务上提出\upcite{Waibel1989PhonemeRU},但之后在图像处理领域取得了很好的效果\upcite{LeCun1989BackpropagationAT,726791}。近年来,卷积神经网络已经成为众多语音、语言、图像处理任务的基础框架\upcite{DBLP:journals/corr/HeZRS15,DBLP:conf/cvpr/HuangLMW17,Girshick2015FastR,He2020MaskR}
\parinterval {\small\bfnew{卷积神经网络}}\index{卷积神经网络}(Convolutional Neural Network,CNN)\index{Convolutional Neural Network,CNN} 是一种前馈神经网络,由若干的卷积层与池化层组成。它最早是在语音识别任务中被提出的\upcite{Waibel1989PhonemeRU},之后在图像处理领域取得了很好的效果\upcite{LeCun1989BackpropagationAT,726791}。近年来,卷积神经网络已经成为众多语音、语言、图像处理任务的基础框架\upcite{DBLP:journals/corr/HeZRS15,DBLP:conf/cvpr/HuangLMW17,Girshick2015FastR,He2020MaskR}。在自然语言处理领域,卷积神经网络也已经得到广泛应用,在文本分类\upcite{Kalchbrenner2014ACN,Kim2014ConvolutionalNN,Ma2015DependencybasedCN}、情感分析\upcite{Santos2014DeepCN}、语言建模\upcite{Dauphin2017LanguageMW}、机器翻译\upcite{Gehring2017ACE,DBLP:journals/corr/GehringAGYD17,Kaiser2018DepthwiseSC,Wu2019PayLA}等任务中取得了不错的成绩。
\parinterval\ref{fig:11-1}展示了全连接层和卷积层的结构对比,可以看到在全连接层中,模型考虑了所有的输入,层输出中的每一个元素都依赖于所有输入。这种全连接层适用于大多数任务,但是当处理图像这种网格数据的时候,规模过大的数据会导致模型参数量过大,难以处理。其次,在一些网格数据中,通常具有局部不变性的特征,比如图像中不同位置的相同物体,语言序列中相同的$n$-gram等。而全连接网络很难提取这些局部不变性特征。为此,一些研究人员提出使用卷积层来替换全连接层。
......@@ -52,8 +53,6 @@
\end{figure}
%----------------------------------------------
\parinterval 在自然语言处理领域,卷积神经网络已经得到广泛应用,在文本分类\upcite{Kalchbrenner2014ACN,Kim2014ConvolutionalNN,Ma2015DependencybasedCN}、情感分析\upcite{Santos2014DeepCN,}、语言建模\upcite{Dauphin2017LanguageMW}、机器翻译\upcite{Gehring2017ACE,DBLP:journals/corr/GehringAGYD17,Kaiser2018DepthwiseSC,Wu2019PayLA}等任务中取得不错的成绩。
\parinterval\ref{fig:11-2}展示了一个标准的卷积神经网络模块,其中包括了卷积层、激活函数和池化层三个部分。本节将对卷积神经网络中的基本结构进行介绍。
%----------------------------------------------
......@@ -91,7 +90,7 @@
\begin{figure}[htp]
\centering
\input{./Chapter11/Figures/figure-image-convolution}
\caption{图像卷积操作}
\caption{图像卷积操作(*\ \ 表示卷积计算)}
\label{fig:11-4}
\end{figure}
%----------------------------------------------
......@@ -122,7 +121,7 @@
\begin{figure}[htp]
\centering
\input{./Chapter11/Figures/figure-dimension-transformation}
\caption{卷积操作的维度变换}
\caption{卷积操作的维度变换(*\ \ 表示卷积计算)}
\label{fig:11-6}
\end{figure}
%----------------------------------------------
......@@ -134,7 +133,7 @@
\begin{figure}[htp]
\centering
\input{./Chapter11/Figures/figure-padding-and-conv}
\caption{填充和卷积操作}
\caption{填充和卷积操作(*\ \ 表示卷积计算)}
\label{fig:11-7}
\end{figure}
%----------------------------------------------
......@@ -175,9 +174,9 @@
\begin{figure}[htp]
\centering
%\input{./Chapter11/Figures/figure-f }
\subfigure[O(n)]{\input{./Chapter11/Figures/figure-structural-comparison-a}}
\subfigure[O(n/k)]{\input{./Chapter11/Figures/figure-structural-comparison-b}}
\caption{串行及层级结构对比}
\subfigure[循环神经网络的串行结构($O(n)$)]{\input{./Chapter11/Figures/figure-structural-comparison-a}}
\subfigure[卷积神经网络的层级结构($O(n/k)$)]{\input{./Chapter11/Figures/figure-structural-comparison-b}}
\caption{串行及层级结构对比($\mathbi{e}_i$表示词嵌入,$\mathbi{0}$表示全零向量,数字2、3、4表示卷积层的层号)}
\label{fig:11-9}
\end{figure}
%----------------------------------------------
......@@ -218,23 +217,23 @@
\parinterval ConvS2S模型是一种高并行的序列到序列的神经计算模型。该模型分别利用卷积网络对源语言端与目标语言端的输入数据进行特征提取,并使用注意力机制来捕获两个序列之间的映射关系。相比于基于多层循环神经网络的GNMT模型\upcite{Wu2016GooglesNM},其主要优势在于每一层的网络计算是完全并行化的,避免了循环神经网络中计算顺序对时序的依赖。同时,利用多层卷积神经网络的层级结构可以有效地捕捉序列不同位置之间的依赖。即使是远距离依赖,也可以通过若干层卷积单元进行有效捕捉,而且其信息传递的路径相比循环神经网络更短。除此之外,该模型还使用了门控线性单元、残差网络和位置编码等技术来进一步提升模型性能,达到了和GNMT模型相媲美的翻译性能,同时大大缩短了训练时间。
\parinterval\ref{fig:11-12}为ConvS2S模型的整体结构示意图,其内部由若干不同的模块组成,包括:
\parinterval\ref{fig:11-12}为ConvS2S模型的结构示意图,其内部由若干不同的模块组成,包括:
\begin{itemize}
\item {\small\bfnew{位置编码}}(Position Embedding):图中绿色背景框表示源语言端词嵌入部分。相比于RNN中的词嵌入(Word Embedding),该模型还引入了位置编码,帮助模型获得词的位置信息。
\item {\small\bfnew{卷积层与门控线性单元}}(Gated Linear Units, GLU):黄色背景框是卷积模块,这里使用门控线性单元作为非线性函数,之前的研究工作\upcite{Dauphin2017LanguageMW}表明这种非线性函数更适合于序列建模任务。图中为了简化,只展示了一层卷积,但在实际中为了更好地捕获句子信息,通常使用多层卷积叠加计算。
\item {\small\bfnew{残差连接}}\index{残差连接}(Residual Connection)\index{Residual Connection}:在源语言端和目标语言端的卷积层网络之间,都存在一个从输入到输出的额外连接,即跨层连接\upcite{DBLP:journals/corr/HeZRS15}。该连接方式确保每个隐层输出都能包含输入序列中的更多信息,同时能够有效促进深层网络的信息传递效率(该部分在图\ref{fig:11-12}中没有显示,具体结构详见\ref{sec:11.2.3}节)。
\item {\small\bfnew{多跳注意力机制}}\index{多跳注意力机制}(Multi-step Attention)\index{Multi-step Attention}:蓝色框内部展示了基于多跳结构的注意力机制模块\upcite{Sukhbaatar2015EndToEndMN}使用多跳注意力机制可以帮助解码器最后一层的卷积模块关注源语言和目标语言信息;同时每一层卷积层都会执行同样的注意力机制,这样可以帮助模型获得更多历史信息。下面我们将以此模型为例对基于CNN的机器翻译模型进行介绍。
\end{itemize}
%----------------------------------------------
% 图12.
\begin{figure}[htp]
\centering
\input{./Chapter11/Figures/figure-fairseq-0}
\caption{ConvS2S结构}
\label{fig:11-12}
\end{figure}
......@@ -246,9 +245,9 @@
\subsection{位置编码}
\parinterval 和基于循环神经网络的翻译模型类似,基于卷积神经网络的翻译模型同样采用词嵌入来表示输入序列。对于输入的语言序列$\seq{w}=\{\mathbi{w}_1,\mathbi{w}_2,...,\mathbi{w}_m\}$,序列$\seq{w}$是维度大小为$m \times d$的矩阵,第$i$个单词$\mathbi{w}_i$是维度为$d$的向量,其中$m$为序列长度,$d$为词嵌入向量维度。和循环神经网络不同的是,基于卷积神经网络的模型需要对输入单词的位置进行显式表示。这是因为,受限于卷积核的大小,单层卷积神经网络只能捕捉序列中局部的相对位置信息;虽然多层卷积神经网络可以扩大感受野,但对全局位置的表示仍不充分。相较之下,基于循环神经网络的模型按时间步对输入序列进行处理,间接地对位置信息进行了建模;而词序又是自然语言处理任务中的重要信息,因此这里需要单独考虑。
\parinterval 为了更好地引入序列的词序信息,该模型引入了位置编码$\seq{p}=\{\mathbi{p}_1,\mathbi{p}_2,...,\mathbi{p}_m\}$,其中$\mathbi{p}_i$的维度大小为$d$,一般和词嵌入维度相等,其具体数值作为网络可学习的参数。简单来说,位置编码的作用就是对位置信息进行表示,不同序列中的相同位置都对应一个唯一的向量表示。之后将词嵌入矩阵和位置编码相加,得到模型的输入序列$\seq{e}=\{\mathbi{w}_1+\mathbi{p}_1,\mathbi{w}_2+\mathbi{p}_2,...,\mathbi{w}_m+\mathbi{p}_m\}$。实际上也有研究人员发现卷积神经网络本身具备一定的编码位置信息的能力\upcite{Islam2020HowMP},而这里额外的位置编码模块可以被看作是对这种能力的一种补充。
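\parinterval 下面用一段基于NumPy的示意性代码来说明“词嵌入与位置编码相加”的过程(代码为补充示例,词表大小、维度等均为假设值):

\begin{verbatim}
import numpy as np

m, d, vocab = 6, 8, 100          # 序列长度、表示维度、假设的词表大小
rng = np.random.default_rng(0)

word_emb = rng.normal(size=(vocab, d))  # 词嵌入矩阵,网络可学习的参数
pos_emb = rng.normal(size=(m, d))       # 位置编码矩阵,同样可学习

w = rng.integers(0, vocab, size=m)      # 一个长度为m的单词序列(用下标表示)
e = word_emb[w] + pos_emb[np.arange(m)] # e_i = w_i + p_i,得到模型输入
print(e.shape)                          # (6, 8)
\end{verbatim}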
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
......@@ -258,14 +257,14 @@
\parinterval 单层卷积神经网络的感受野受限于卷积核的大小,因此只能捕捉序列中局部的上下文信息,不能很好地进行序列建模。为了捕捉更长的上下文信息,最简单的做法就是堆叠多个卷积层。相比于循环神经网络的链式结构,对相同的上下文跨度,多层卷积神经网络的层级结构可以通过更少的非线性计算对其进行建模,缓解了长距离建模中的梯度消失问题。因此,卷积神经网络相对更容易进行训练优化。
\parinterval 在ConvS2S模型中,编码端和解码端分别使用堆叠的门控卷积网络对源语和目标语序列进行建模,在传统卷积网络的基础上引入了门控线性单元(Gated Linear Units,GLU)\upcite{Dauphin2017LanguageMW},通过门控机制对卷积输出进行控制,它在模型中的位置如图\ref{fig:11-13}黄色方框所示:
%----------------------------------------------
% 图13.
\begin{figure}[htp]
\centering
\input{./Chapter11/Figures/figure-fairseq-2}
\caption{门控卷积网络机制在模型中的位置(黄色背景框部分)}
\label{fig:11-13}
\end{figure}
%----------------------------------------------
......@@ -284,12 +283,12 @@
\parinterval\ref{fig:11-14}是单层门控卷积网络的基本结构,$\mathbi{X}\in \mathbb{R}^{m\times d}$为单层网络的输入,$\mathbi{Y} \in \mathbb{R}^{m \times d}$为单层网络的输出,网络结构主要包括卷积计算和GLU非线性单元两部分。如图所示,形式上,卷积操作可以分成两部分,分别使用两个卷积核来得到两个卷积结果:
\begin{eqnarray}
\mathbi{A} & = & \mathbi{X} * \mathbi{W} + \mathbi{b}_\mathbi{W} \nonumber \\
\mathbi{B} & = & \mathbi{X} * \mathbi{V} + \mathbi{b}_\mathbi{V}
\label{eq:11-1}
\end{eqnarray}
\noindent 其中$\mathbi{A},\mathbi{B}\in \mathbb{R}^d$,$\mathbi{W}\in \mathbb{R}^{K\times d \times d}$,$\mathbi{V}\in \mathbb{R}^{K\times d \times d}$,$\mathbi{b}_\mathbi{W}$、$\mathbi{b}_\mathbi{V} \in \mathbb{R}^d $。$\mathbi{W}$、$\mathbi{V}$在此表示卷积核,$\mathbi{b}_\mathbi{W}$、$\mathbi{b}_\mathbi{V}$为偏置向量。在卷积操作之后,引入非线性变换:
\begin{eqnarray}
\mathbi{Y} = \mathbi{A} \otimes \sigma ( \mathbi{B} )
\label{eq:11-2}
......@@ -314,6 +313,7 @@
%----------------------------------------------------------------------------------------
\subsection{残差网络}
\label{sec:11.2.3}
\parinterval 残差连接是一种训练深层网络的技术,其结构如图\ref{fig:11-15}所示,即在子层之间通过增加直接连接的方式,从而将底层信息直接传递给上层。
......@@ -329,15 +329,16 @@
\parinterval 残差连接从广义上讲也叫{\small\bfnew{短连接}}\index{短连接}(Short-cut Connection)\index{Short-cut Connection},指的是这种短距离的连接。它的思想很简单,就是把层和层之间的距离拉近,进而提高信息传递的效率。如图\ref{fig:11-15}所示,子层1通过残差连接跳过了子层2,直接和子层3进行信息传递。这种方式使信息传递变得更高效,有效缓解了深层网络训练过程中容易出现的梯度消失/爆炸问题,使得深层网络的训练更加容易。其计算公式为:
\begin{eqnarray}
\mathbi{h}^{l+1} = F (\mathbi{h}^l) + \mathbi{h}^l
\label{eq:11-3}
\end{eqnarray}
\noindent 其中,$\mathbi{h}^l$表示第$l$层网络的输入向量,${F} (\mathbi{h}^l)$是子层运算。如果$l=2$,那么公式\eqref{eq:11-3}可以解释为:第3层的输入($\mathbi{h}^3$)等于第2层的输出(${F}(\mathbi{h}^2)$)加上第2层的输入($\mathbi{h}^2$)。
\parinterval 在ConvS2S中残差连接主要应用于门控卷积网络和多跳自注意力机制中,比如在多层的门控卷积网络中,在每一层的输入和输出之间增加残差连接,具体的数学描述如下:
\begin{eqnarray}
\mathbi{h}^{l+1} = \mathbi{A}^{l} \otimes \sigma ( \mathbi{B}^{l} ) + \mathbi{h}^{l}
\label{eq:11-4}
\end{eqnarray}
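\parinterval 作为补充,下面的NumPy代码勾勒了公式\eqref{eq:11-4}所描述的“门控卷积+残差连接”的计算流程(示意性实现:卷积用最直接的循环写出,参数为随机初始化的假设值,并非ConvS2S的实际实现):

\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def conv1d(X, W, b):
    """序列方向的一维卷积。X: (m,d), W: (K,d,d), b: (d,);尾部补零保持长度"""
    m, d = X.shape
    K = W.shape[0]
    Xpad = np.vstack([X, np.zeros((K - 1, d))])
    Y = np.zeros((m, d))
    for i in range(m):
        Y[i] = sum(Xpad[i + k] @ W[k] for k in range(K)) + b
    return Y

def glu_block(X, Wa, ba, Wb, bb):
    """门控线性单元+残差:h^{l+1} = A ⊗ σ(B) + h^l"""
    A = conv1d(X, Wa, ba)
    B = conv1d(X, Wb, bb)
    return A * sigmoid(B) + X

rng = np.random.default_rng(0)
m, d, K = 5, 4, 3
X = rng.normal(size=(m, d))
Wa, Wb = rng.normal(size=(K, d, d)), rng.normal(size=(K, d, d))
ba, bb = np.zeros(d), np.zeros(d)
print(glu_block(X, Wa, ba, Wb, bb).shape)   # (5, 4)
\end{verbatim}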
......@@ -355,21 +356,21 @@ x_{l+1} = x_l + F (x_l)
\begin{figure}[htp]
\centering
\input{./Chapter11/Figures/figure-fairseq-3}
\caption{多跳自注意力机制在模型中的位置(蓝色背景框部分)}
\label{fig:11-16}
\end{figure}
%----------------------------------------------
\parinterval 注意力机制在基于循环神经网络的翻译模型中很早就得到了广泛使用\upcite{bahdanau2014neural},用于避免循环神经网络将源语言序列压缩成一个固定维度的向量表示所带来的信息损失。另一方面,注意力机制也能够帮助解码端确定源语言中不同位置的词对当前解码词的贡献程度,其具体的计算过程如下:
\begin{eqnarray}
\mathbi{C}_j &=& \sum_i \alpha_{i,j} \mathbi{h}_i \\
\alpha_{i,j} &=& \frac{ \textrm{exp}(a (\mathbi{s}_{j-1},\mathbi{h}_i)) }{\sum_{i'} \textrm{exp}(a(\mathbi{s}_{j-1},\mathbi{h}_{i'}))}
\label{eq:11-5}
\end{eqnarray}
\noindent 其中$\mathbi{h}_i$表示源语端第$i$个位置的隐层状态,$\mathbi{s}_j$表示目标端第$j$个位置的隐层状态。给定$\mathbi{s}_j$和$\mathbi{h}_i$,注意力机制通过函数$a(\cdot)$计算目标语言表示$\mathbi{s}_j$与源语言表示$\mathbi{h}_i$之间的注意力权重$\alpha_{i,j}$,通过加权平均得到当前目标端位置所需的上下文表示$\mathbi{C}_j$。其中$a(\cdot)$的具体计算方式在{\chapterten}已经详细讨论。
\parinterval 与基于循环神经网络的机器翻译模型(GNMT)仅在解码端的最底层采用注意力机制不同,在ConvS2S模型中,解码端的每一层都分别引入了注意力机制,同时通过残差连接的方式将结果作用于上层网络的计算,因此称之为{\small\bfnew{多跳注意力}}(Multi-step Attention)。ConvS2S模型选取向量乘的方式作为$a(\cdot)$函数,具体的数学描述为:
\begin{eqnarray}
\alpha_{i,j}^l = \frac{ \textrm{exp} (\mathbi{d}_{j}^l \cdot \mathbi{h}_i) }{\sum_{i'=1}^m \textrm{exp} (\mathbi{d}_{j}^l \cdot \mathbi{h}_{i'})}
\label{eq:11-6-1}
......@@ -391,7 +392,7 @@ x_{l+1} = x_l + F (x_l)
\parinterval 当得到上下文向量$\mathbi{C}_j^l$后,将其与$\mathbi{s}_j^l$相加,并送入下一层网络进行计算,这种机制也被称为多跳机制,具体的数学描述如下:
\begin{eqnarray}
\mathbi{s}_j^{l+1} = \mathbi{C}_j^l + \mathbi{s}_j^l
\label{eq:11-8}
\end{eqnarray}
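\parinterval 下面的NumPy代码给出了单层注意力计算及多跳传递(公式\eqref{eq:11-8})的一个简化示意(补充示例;为简单起见,这里直接用解码端表示$\mathbi{s}_j$充当查询向量$\mathbi{d}_j^l$,与ConvS2S的完整实现相比有所简化):

\begin{verbatim}
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def attention_layer(h, s):
    """h: (m,d) 源语隐层状态;s: (n,d) 当前层解码端表示
    返回送入下一层的表示 s' = C + s"""
    C = np.zeros_like(s)
    for j in range(len(s)):
        alpha = softmax(h @ s[j])   # 注意力权重,在源语各位置上归一化
        C[j] = alpha @ h            # 加权求和得到上下文向量C_j
    return C + s

rng = np.random.default_rng(0)
h, s = rng.normal(size=(6, 4)), rng.normal(size=(3, 4))
for _ in range(2):                  # 每一层都执行同样的注意力机制
    s = attention_layer(h, s)
print(s.shape)                      # (3, 4)
\end{verbatim}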
......@@ -407,12 +408,12 @@ x_{l+1} = x_l + F (x_l)
\parinterval ConvS2S同样有针对性地应用了很多工程方面的调整,主要包括:
\begin{itemize}
\item ConvS2S使用了{\small\bfnew{Nesterov加速梯度下降法}}(Nesterov Accelerated Gradient,NAG),动量累计的系数设置为0.99,当梯度范数超过0.1时重新进行规范化\upcite{Sutskever2013OnTI}
\item ConvS2S中设置学习率为0.25,每当模型在校验集上的困惑度不再下降时,便在每轮的训练后将学习率降低一个数量级,直至学习率小于一定的阈值(如0.0004)。
\end{itemize}
\parinterval Nesterov加速梯度下降法和{\chapternine}介绍的Momentum梯度下降法类似,都使用了历史梯度信息,首先回忆一下Momentum梯度下降法,公式如下:
\begin{eqnarray}
\textrm{Momentum:} \ \ \mathbi{v}_t & = & \beta \mathbi{v}_{t-1} + (1-\beta)\frac{\partial J(\mathbi{w})}{\partial \mathbi{w}_t} \\
\textrm{参数更新:} \ \ \mathbi{w}_{t+1} & = & \mathbi{w}_t - \alpha \mathbi{v}_t
......@@ -421,13 +422,13 @@ x_{l+1} = x_l + F (x_l)
\noindent 其中$\mathbi{w}_t$表示第$t$步更新时的模型参数;$J(\mathbi{w})$表示对损失函数期望的估计;$\frac{\partial J(\mathbi{w})}{\partial \mathbi{w}_t}$指向$J(\mathbi{w})$在$\mathbi{w}$处变化最大的方向,即梯度方向;$\alpha$为学习率;$\mathbi{v}_t$为损失函数在前$t-1$步更新中累积的梯度动量,利用超参数$\beta$控制累积的范围。
\parinterval 在Nesterov加速梯度下降法中,使用的梯度不是来自于当前参数位置,而是按照之前梯度方向更新一小步的位置,以便于更好的“预测未来”,提前调整更新速率,因此,其动量的更新方式为:
\begin{eqnarray}
\textrm{Nesterov:} \mathbi{v}_t = \beta \mathbi{v}_{t-1} + (1-\beta)\frac{\partial J(\mathbi{w})}{\partial (\mathbi{w}_{t} -\alpha \beta \mathbi{v}_{t-1} )}
\label{eq:11-10}
\end{eqnarray}
\parinterval 在数学本质上,Nesterov加速梯度下降法其实是利用了二阶导数的信息,因此可以做到“向前看”,加速收敛过程\upcite{Bengio2013AdvancesIO}。为了使模型训练更加稳定,ConvS2S还采用了一些网络正则化和参数初始化的策略,使得模型在前向计算和反向计算过程中方差尽可能保持一致。
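\parinterval 下面的代码对比了Momentum和Nesterov两种更新方式(示意性实现,目标函数、学习率和动量系数均为演示用的假设设置):

\begin{verbatim}
import numpy as np

def grad(w):                   # 以 J(w) = 0.5*||w||^2 为例,梯度即w本身
    return w

alpha, beta = 0.25, 0.99       # 学习率与动量累积系数
w_m, v_m = np.array([1.0, -2.0]), np.zeros(2)   # Momentum
w_n, v_n = np.array([1.0, -2.0]), np.zeros(2)   # Nesterov

for _ in range(100):
    # Momentum:在当前参数位置w_t处计算梯度
    v_m = beta * v_m + (1 - beta) * grad(w_m)
    w_m = w_m - alpha * v_m
    # Nesterov:在按动量方向“前瞻一小步”的位置处计算梯度
    v_n = beta * v_n + (1 - beta) * grad(w_n - alpha * beta * v_n)
    w_n = w_n - alpha * v_n

print(np.linalg.norm(w_m), np.linalg.norm(w_n))
\end{verbatim}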
\parinterval 此外,ConvS2S为了进一步提升模型的训练效率及性能,还使用了小批量训练,即每次从样本中选择出一小部分数据进行训练。同时,ConvS2S模型中也使用了Dropout方法\upcite{JMLR:v15:srivastava14a}。除了在词嵌入层和解码器输出层应用Dropout外,ConvS2S还对卷积块的输入层应用了Dropout。
......@@ -439,7 +440,7 @@ x_{l+1} = x_l + F (x_l)
\section{局部模型的改进}
\parinterval 在序列建模中,卷积神经网络可以通过参数共享,高效地捕捉局部上下文特征,如图\ref{fig:11-11}所示。但是通过进一步分析可以发现,在标准卷积操作中包括了不同词和不同通道之间两种信息的交互,每个卷积核都是对相邻词的不同通道进行卷积,参数量为$K \times O$。其中$K$为卷积核大小,$O$为输入的通道数,即单词表示的维度大小。因此$N$个卷积核总共的参数量为$K \times O \times N$。这里涉及卷积核大小、输入通道数和输出通道数三个维度,因此计算复杂度较高。为了进一步提升计算效率,降低参数量,一些研究人员提出{\small\bfnew{深度可分离卷积}}\index{深度可分离卷积}(Depthwise Separable Convolution)\index{Depthwise Separable Convolution},将空间维度和通道间的信息交互分离成深度卷积(也叫逐通道卷积,Depthwise Convolution)\index{逐通道卷积,Depthwise Convolution}{\small\bfnew{逐点卷积}} \index{逐点卷积}(Pointwise Convolution)\index{Pointwise Convolution} 两部分\upcite{Chollet2017XceptionDL,Howard2017MobileNetsEC}。除了直接将深度可分离卷积应用到神经机器翻译中\upcite{Kaiser2018DepthwiseSC},研究人员提出使用更高效的{\small\bfnew{轻量卷积}}\index{轻量卷积}(Lightweight Convolution)\index{Lightweight Convolution}{\small\bfnew{动态卷积}}\index{动态卷积}(Dynamic Convolution)\index{Dynamic convolution}来进行不同词之间的特征提取\upcite{Wu2019PayLA}。本节将主要介绍这些改进的卷积操作。在后续章节中也会看到这些模型在神经机器翻译中的应用。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
......@@ -462,29 +463,34 @@ x_{l+1} = x_l + F (x_l)
\parinterval 深度可分离卷积\upcite{Sifre2013RotationSA}由深度卷积和逐点卷积两部分结合而成。图\ref{fig:11-17}展示了标准卷积、深度卷积和逐点卷积的对比,为了方便显示,图中只画出了部分连接。
\parinterval 给定输入序列表示$\seq{X} = \{ \mathbi{x}_1,\mathbi{x}_2,...,\mathbi{x}_m \}$,其中$m$为序列长度,$\mathbi{x}_i \in \mathbb{R}^{O} $$O$ 即输入序列的通道数。为了获得与输入序列长度相同的卷积输出结果,首先需要进行填充。为了方便描述,这里在输入序列尾部填充 $K-1$ 个元素($K$为卷积核窗口的长度),其对应的卷积结果为$\seq{Z} = \{ \mathbi{z}_1,\mathbi{z}_2,...,\mathbi{z}_m \}$
在标准卷积中,若使用$N$表示卷积核的个数,也就是标准卷积输出序列的通道数,那么对于第$i$个位置的第$n$个通道$ \mathbi{z}_{i,n}^\textrm{std}$,其标准卷积的具体计算方式如下:
\begin{eqnarray}
\mathbi{z}_{i,n}^\textrm{std} = \sum_{o=1}^{O} \sum_{k=0}^{K-1} \mathbi{W}_{k,o,n}^\textrm{std} \mathbi{x}_{i+k,o}
\label{eq:11-11}
\end{eqnarray}
\noindent 其中$ \mathbi{z}^\textrm{std}$表示标准卷积的输出,$ \mathbi{z}_i^\textrm{std} \in \mathbb{R}^N$$\mathbi{W}^\textrm{std} \in \mathbb{R}^{K \times O \times N} $ 为标准卷积的参数。可以看出,标准卷积中每个输出元素需要考虑卷积核尺度内所有词的所有特征,参数量相对较多,对应图\ref{fig:11-17}中的连接数也最多。
\parinterval 相应的,深度卷积只考虑不同词之间的依赖性,而不考虑不同通道之间的关系,相当于使用$O$个卷积核逐个通道对不同的词进行卷积操作。因此深度卷积不改变输出的表示维度,输出序列表示的通道数与输入序列一致,其计算方式如下:
\begin{eqnarray}
\mathbi{z}_{i,o}^\textrm{dw} = \sum_{k=0}^{K-1} \mathbi{W}_{k,o}^\textrm{dw} \mathbi{x}_{i+k,o}
\label{eq:11-12}
\end{eqnarray}
\noindent 其中$\mathbi{z}^\textrm{dw}$表示深度卷积的输出,$\mathbi{z}_i^\textrm{dw} \in \mathbb{R}^{O}$$\mathbi{W}^\textrm{dw} \in \mathbb{R}^{K \times O}$为深度卷积的参数,参数量只涉及卷积核大小及输入表示维度。
\parinterval 与深度卷积互为补充的是,逐点卷积只考虑不同通道之间的依赖性,而不考虑不同词之间的依赖。换句话说,逐点卷积对每个词表示做了一次线性变换,将输入表示$\mathbi{x}_i$$\mathbb{R}^{O}$ 的空间映射到 $\mathbb{R}^{N}$的空间,计算方式如下
\begin{eqnarray}
\mathbi{z}_{i,n}^\textrm{pw} &=& \sum\nolimits_{o=1}^{O} \mathbi{x}_{i,o} \mathbi{W}_{o,n}^\textrm{pw} \nonumber \\
&=& \mathbi{x}_i \mathbi{W}^\textrm{pw}
\label{eq:11-13}
\end{eqnarray}
\noindent 其中$\mathbi{z}^\textrm{pw}$表示逐点卷积的输出,$\mathbi{z}_{i}^\textrm{pw} \in \mathbb{R}^{N}$$\mathbi{W}^\textrm{pw} \in \mathbb{R}^{O \times N}$为逐点卷积的参数。
\parinterval\ref{tab:11-1}展示了这几种不同类型卷积的参数量,深度可分离卷积通过将标准卷积进行分解,降低了整体模型的参数量。在相同参数量的情况下,深度可分离卷积可以采用更大的卷积窗口,考虑序列中更大范围的依赖关系。因此相比于标准卷积,深度可分离卷积具有更强的表示能力,在机器翻译任务中也能获得更好的性能。
%----------------------------------------------
......@@ -517,7 +523,7 @@ x_{l+1} = x_l + F (x_l)
\parinterval 在序列建模的模型中,一个很重要的模块就是对序列中不同位置信息的提取,如ConvS2S中的卷积网络等。虽然考虑局部上下文的卷积神经网络只在序列这一维度进行操作,具有线性的复杂度,但是由于标准卷积操作中考虑了不同通道的信息交互,整体复杂度依旧较高。一种简化的策略就是采取通道独立的卷积操作,也就是\ref{sec:11.3.1}节中介绍的深度卷积。
\parinterval 在神经机器翻译模型中,多层表示的维度通常一致,即$O=N=d$。因此,深度卷积可以使得卷积网络参数量从 $Kd^2$ 降到$Kd$(参考表\ref{tab:11-1})。从形式上来看,深度卷积和注意力很类似,区别在于注意力机制考虑了序列全局上下文信息,权重来自于当前位置对其他位置的“注意力”,而深度卷积中仅考虑了局部的上下文信息,权重采用了在不同通道上独立的固定参数。为了进一步降低参数量,轻量卷积共享了部分通道的卷积参数。如图\ref{fig:11-18}所示,深度卷积中4种颜色的连接代表了4个通道上独立的卷积核,而轻量卷积中,第一和第三通道,第二和第四通道采用了共享的卷积核参数。通过共享,可以将参数量压缩到$Ka$,其中压缩比例为$d/a$$a$压缩后保留的共享通道数)
%----------------------------------------------
% 图18.
......@@ -529,30 +535,38 @@ x_{l+1} = x_l + F (x_l)
\end{figure}
%----------------------------------------------
\parinterval 此外,和标准卷积不同的是,卷积之前需要先对卷积参数进行归一化,具体计算过程如下:
\begin{eqnarray}
\mathbi{z}_{i,o}^\textrm{lw} &=& \sum_{k=0}^{K-1} \textrm{Softmax}(\mathbi{W}^\textrm{lw})_{k,[\frac{oa}{d}]} \mathbi{x}_{i+k,o}
\label{eq:11-14}
\end{eqnarray}
\noindent 其中$\mathbi{z}^\textrm{lw}$表示轻量卷积的输出,$\mathbi{z}_i^\textrm{lw} \in \mathbb{R}^d $$\mathbi{W}^\textrm{lw} \in \mathbb{R}^{K\times a}$为轻量卷积的参数。在这里,轻量卷积用来捕捉相邻词的特征,通过Softmax可以在保证关注到不同词的同时,对输出大小进行限制。
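\parinterval 下面给出轻量卷积的一个NumPy示意实现(补充示例;通道分组方式按公式\eqref{eq:11-14}中的下标$[\frac{oa}{d}]$处理,具体数值为假设):

\begin{verbatim}
import numpy as np

def softmax(x, axis=0):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.default_rng(0)
m, d, K, a = 5, 4, 3, 2          # a为共享后保留的通道数,压缩比例为d/a
X = np.vstack([rng.normal(size=(m, d)), np.zeros((K - 1, d))])

W = softmax(rng.normal(size=(K, a)), axis=0)  # 卷积前先对参数做归一化
z = np.zeros((m, d))
for i in range(m):
    for o in range(d):
        g = o * a // d           # 通道o映射到共享的卷积核分组g
        z[i, o] = sum(W[k, g] * X[i + k, o] for k in range(K))
print(z.shape)                   # (5, 4);参数量为K*a而非K*d
\end{verbatim}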
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{2. 动态卷积}
\parinterval 轻量卷积和动态卷积的概念最早都是在图像领域被提出,大大减少卷积网络模型中的参数和计算量\upcite{726791,Taigman2014DeepFaceCT,Chen2015LocallyconnectedAC}。虽然轻量卷积在存储和速度上具有优势,但其参数量的减少也导致了其表示能力的下降,损失了一部分模型性能。为此,研究人员提出了动态卷积,旨在不增加网络深度和宽度的情况下来增强模型的表示能力,其思想就是根据输入来动态地生成卷积参数\upcite{Wu2019PayLA,Chen2020DynamicCA}
\parinterval 在轻量卷积中,模型使用的卷积参数是静态的,与序列位置无关, 维度大小为$K\times a$;而在动态卷积中,为了增强模型的表示能力,卷积参数来自于当前位置输入的变换,具体如下:
\begin{eqnarray}
\funp{f} (\mathbi{X}_{i}) = \sum_{c=1}^d \mathbi{W}_{:,:,c} \odot \mathbi{X}_{i,c}
\label{eq:11-15}
\end{eqnarray}
\parinterval 这里采用了最简单的线性变换,其中$\odot$表示矩阵的点乘(详见第九章介绍),$\mathbi{X}$是序列表示,$d$为通道数,$i$和$c$分别对应序列中不同的位置以及不同的通道,$\mathbi{W} \in \mathbb{R}^{K \times a \times d}$为变换矩阵,$\mathbi{W}_{:,:,c}$表示$\mathbi{W}$在通道维上的第$c$个切片,最后生成的$\funp{f} (\mathbi{X}_i)\in \mathbb{R}^{K \times a}$就是与输入相关的卷积核参数。通过这种方式,模型可以根据不同位置的表示来确定如何关注其他位置信息的“权重”,更好地提取序列信息。同时,相比于注意力机制中两两位置确定出来的注意力权重,动态卷积线性复杂度的做法具有更高的计算效率。
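\parinterval 最后,下面的代码勾勒了动态卷积“根据输入生成卷积核”的核心步骤(示意性实现,沿用上文轻量卷积的分组与归一化方式,数值均为假设):

\begin{verbatim}
import numpy as np

def softmax(x, axis=0):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.default_rng(0)
m, d, K, a = 5, 4, 3, 2
X = rng.normal(size=(m, d))
W = rng.normal(size=(K, a, d))   # 变换矩阵,用于从输入生成卷积核
Xpad = np.vstack([X, np.zeros((K - 1, d))])

z = np.zeros((m, d))
for i in range(m):
    # f(X_i) = sum_c W[:,:,c] * X[i,c]:由当前位置的输入生成K*a的卷积核
    kernel = softmax(sum(W[:, :, c] * X[i, c] for c in range(d)), axis=0)
    for o in range(d):
        z[i, o] = sum(kernel[k, o * a // d] * Xpad[i + k, o]
                      for k in range(K))
print(z.shape)                   # (5, 4)
\end{verbatim}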
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\subsection{小节及拓展阅读}
\parinterval 卷积是一种高效处理网格数据的计算方式,在图像、语音等领域取得了令人瞩目的成绩。本章介绍了卷积的概念及其特性,并对池化、填充等操作进行了详细的讨论。前面介绍的基于循环神经网络的翻译模型在引入注意力机制后已经大幅度超越了基于统计的机器翻译模型,但由于循环神经网络的计算方式导致网络整体的并行能力差,训练耗时。本章介绍了具有高并行计算的能力的模型范式,即基于卷积神经网络的编码器-解码器框架。其在机器翻译任务上取得了与基于循环神经网络的GNMT模型相当的性能,并大幅度缩短了模型的训练周期。除了基础部分,本章还针对卷积计算进行了延伸,包括逐通道卷积、逐点卷积、轻量卷积和动态卷积等。除了上述提及的内容,卷积神经网络及其变种在文本分类、命名实体识别等其他自然语言处理任务上也有许多应用。
\parinterval 和机器翻译任务不同的是,文本分类任务侧重于对序列特征的提取,然后通过压缩后的特征表示做出类别预测。卷积神经网络可以对序列中一些$n$-gram特征进行提取,也可以用在文本分类任务中,其基本结构包括输入层、卷积层、池化层和全连接层。除了在本章介绍过的TextCNN模型\upcite{Kim2014ConvolutionalNN},不少研究工作在此基础上对其进行改进。比如,通过改变输入层来引入更多特征\upcite{DBLP:conf/acl/NguyenG15,DBLP:conf/aaai/LaiXLZ15},对卷积层的改进\upcite{DBLP:conf/acl/ChenXLZ015,DBLP:conf/emnlp/LeiBJ15}以及对池化层的改进\upcite{Kalchbrenner2014ACN,DBLP:conf/acl/ChenXLZ015}。在命名实体识别任务中,同样可以使用卷积神经网络来进行特征提取\upcite{DBLP:journals/jmlr/CollobertWBKKK11,DBLP:conf/cncl/ZhouZXQBX17},或者使用更高效的空洞卷积对更长的上下文进行建模\upcite{DBLP:conf/emnlp/StrubellVBM17}。此外,也有一些研究工作尝试使用卷积神经网络来提取字符级特征\upcite{DBLP:conf/acl/MaH16,DBLP:conf/emnlp/LiDWCM17,DBLP:conf/acl-codeswitch/WangCK18}
......
......@@ -5,9 +5,9 @@
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\mathbi{h}_1$}};
\node [rnnnode,anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{$\mathbi{h}_2$}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\mathbi{h}_3$}};
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e2) at ([xshift=1em]e1.east) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
......@@ -44,18 +44,18 @@
%\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
}
{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\mathbi{s}_1$}};
\node [rnnnode,anchor=south] (f1) at ([yshift=1em]s1.north) {\tiny{$\mathbi{f}_1$}};
}
{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\mathbi{s}_2$}};
\node [rnnnode,anchor=south] (f2) at ([yshift=1em]s2.north) {\tiny{$\mathbi{f}_2$}};
}
{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\mathbi{s}_3$}};
\node [rnnnode,anchor=south] (f3) at ([yshift=1em]s3.north) {\tiny{$\mathbi{f}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\mathbi{s}_4$}};
\node [rnnnode,anchor=south] (f4) at ([yshift=1em]s4.north) {\tiny{$\mathbi{f}_4$}};
%\node [anchor=west,inner sep=2pt] (s5) at ([xshift=0.3em]s4.east) {\tiny{...}};
%\node [anchor=south] (dot3) at ([xshift=-0.4em,yshift=-0.7em]s3.south) {\tiny{...}};
\node [anchor=south] (dot4) at ([xshift=-0.4em,yshift=-0.7em]s4.south) {\tiny{...}};
......@@ -145,7 +145,7 @@
}
{
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c1) at ([yshift=2em]h2.north) {\tiny{$\mathbi{C}_1$}};
\node [anchor=south] (c1label) at (c1.north) {\tiny{\textbf{编码-解码注意力机制:上下文}}};
\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c1.250);
\draw [->] (h2.north) .. controls +(north:0.6) and +(270:0.9) .. (c1.270);
......@@ -155,19 +155,19 @@
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c2) at ([yshift=-2em]t1.south) {\tiny{$\mathbi{C}_2$}};
\draw [->] ([xshift=-0.7em]c2.west) -- ([xshift=-0.1em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f2.west);
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\mathbi{C}_3$}};
\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f3.west);
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\mathbi{C}_4$}};
\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f4.west);
}
......
......@@ -10,6 +10,7 @@
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w5.north).. controls +(100:0.85) and +(50:0.85) .. (w0.north);
\draw [->,thick,red] (w5.north).. controls +(110:0.75) and +(50:0.75) .. (w1.north);
\draw [->,thick,red] (w5.north).. controls +(110:0.75) and +(50:0.75) .. (w2.north);
\draw [->,thick,red] (w5.north).. controls +(120:0.6) and +(50:0.6) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};
......
......@@ -6,17 +6,17 @@
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear0) at (0,0) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\footnotesize{Linear}};
\node [anchor=north] (Q) at ([xshift=0em,yshift=-1em]Linear02.south) {\footnotesize{$\mathbi{Q}$}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear1) at ([xshift=1.5em]Linear0.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\footnotesize{Linear}};
\node [anchor=north] (K) at ([xshift=0em,yshift=-1em]Linear12.south) {\footnotesize{$\mathbi{K}$}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear2) at ([xshift=1.5em]Linear1.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\footnotesize{Linear}};
\node [anchor=north] (V) at ([xshift=0em,yshift=-1em]Linear22.south) {\footnotesize{$\mathbi{V}$}};
\node [anchor=south,draw=black!30,minimum width=12em,minimum height=2em,inner sep=4pt,fill=blue!20!white] (Scale) at ([yshift=1em]Linear1.north) {\footnotesize{}};
\node [anchor=south west,draw=black!50,minimum width=12em,minimum height=2em,fill=blue!20!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\footnotesize{}};
......
......@@ -7,13 +7,13 @@
\begin{scope}
\node [anchor=south west,fill=white,draw,inner sep=4pt,minimum width=4em,fill=blue!20!white] (MatMul) at (0,0) {\tiny{MatMul}};
\node [anchor=north] (Q1) at ([xshift=-1.4em,yshift=-1em]MatMul.south) {\footnotesize{$\mathbi{Q}$}};
\node [anchor=north] (K1) at ([xshift=1.4em,yshift=-1em]MatMul.south) {\footnotesize{$\mathbi{K}$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2.5em] (Scale3) at ([yshift=1em]MatMul.north) {\tiny{Scale}};
\node [anchor=south,draw,inner sep=4pt,fill=purple!20,minimum width=3.5em] (Mask) at ([yshift=0.8em]Scale3.north) {\tiny{Mask(opt.)}};
\node [anchor=south,draw,inner sep=4pt,fill=ugreen!20!white] (SoftMax) at ([yshift=1em]Mask.north) {\tiny{SoftMax}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=blue!20!white] (MatMul1) at ([xshift=1.7em,yshift=1em]SoftMax.north) {\tiny{MatMul}};
\node [anchor=north] (V1) at ([xshift=2em]K1.north) {\footnotesize{$\mathbi{V}$}};
\node [anchor=north] (null) at ([yshift=0.8em]MatMul1.north) {};
\draw [->] ([yshift=0.1em]Q1.north) -- ([xshift=-1.4em,yshift=-0.1em]MatMul.south);
......@@ -28,7 +28,7 @@
{
\node [anchor=east] (line1) at ([xshift=-4em,yshift=1em]MatMul.west) {\scriptsize{自注意力机制的Query}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{Key和Value均来自同一句}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{编码-解码注意力机制}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{与前面讲的一样}};
}
{
......@@ -59,7 +59,7 @@
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen,minimum width=10em] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (Q1) (K1) (V1)] (box0) {};
\draw [->,dotted,very thick,ugreen] ([yshift=-1.5em,xshift=1.8em]box1.east) -- ([yshift=-1.5em,xshift=0.1em]box1.east);
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!20!white,drop shadow,draw=blue] [fit = (line11) (line12) (line13)] (box2) {};
......
......@@ -12,7 +12,7 @@
\rowcolor{yellow!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbq.north){$\mathbi{Q}$};
\node(comma1) at ([xshift=0.15em,yshift=-2em]tbq.east){,};
......@@ -25,7 +25,7 @@
\rowcolor{blue!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbk.north){$\mathbi{K}$};
\node(comma2) at ([xshift=0.15em,yshift=-2em]tbk.east){,};
......@@ -38,7 +38,7 @@
\rowcolor{orange!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv.north){$\mathbi{V}$};
\node(bra) at ([xshift=0.3em,yshift=0]tbv.east){)};
......@@ -56,7 +56,7 @@
\rowcolor{yellow!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbq2.north){$\mathbi{Q}$};
% x
......@@ -69,7 +69,7 @@
\cellcolor{blue!20} & \cellcolor{blue!20} &\cellcolor{blue!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbk2.north){$\mathbi{K}^{\mathrm{T}}$};
\draw [-] (5.6,-0.2) -- (8,-0.2);
......@@ -83,7 +83,7 @@
\cellcolor{green!20} &\cellcolor{green!20} &\cellcolor{green!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]mask.north){$\mathbi{Mask}$};
%+
\node at ([xshift=-0.6em,yshift=0em]mask.west){$+$};
%)
......@@ -98,7 +98,7 @@
\rowcolor{orange!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv2.north){$\mathbi{V}$};
%------------------------------
%第二行
......@@ -116,7 +116,11 @@
% )
\node(bra2) at ([xshift=0.2em,yshift=0]mid.east){)};
%红色框
\node[rectangle,minimum width=4.0em,minimum height=1.5em,draw=red,line width=1pt](p222) at([xshift=0em,yshift=-1.0em]mid.north) {};
\node[rectangle,minimum width=4.0em,minimum height=1.5em,draw=ugreen,ultra thick,dotted,font=\footnotesize](sub) at([xshift=-12em,yshift=1.0em]p222.west) {按行进行Softmax};
\draw[->,dotted,very thick,draw=ugreen] (p222.west) .. controls +(north:0.5) and +(east:1) .. (sub.east);
%%%% v
\node(tbv3) at ([xshift=0.5em,yshift=0]bra2.east){
......@@ -127,7 +131,7 @@
\rowcolor{orange!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv3.north){$\mathbi{V}$};
%------------------------------------
%第三行
......@@ -152,7 +156,7 @@
\rowcolor{orange!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv4.north){$\mathbi{V}$};
%=
\node(eq4) at ([xshift=0.5em,yshift=0em]tbv4.east){=};
......
......@@ -3,9 +3,9 @@
\node[rounded corners=1pt,minimum width=11.0em,minimum height=2.0em,fill=pink!30,draw=black](p1) at (0,0) {\small{Self-Attention}};
\node[anchor=north](word1) at ([xshift=0.0em,yshift=-2.0em]p1.south) {\small $\mathbi{K}$};
\node[anchor=west](word2) at ([xshift=2.2em]word1.east) {\small $\mathbi{V}$};
\node[anchor=east](word3) at ([xshift=-2.2em]word1.west) {\small $\mathbi{Q}$};
\draw[->,thick](word1.north)--(p1.south);
\draw[->,thick]([xshift=-3.6em]word1.north)--([xshift=-3.6em]p1.south);
......@@ -21,9 +21,9 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\node[anchor=west,rounded corners=1pt,minimum width=14.0em,minimum height=2.0em,fill=pink!30,draw=black](p2) at ([xshift=5.0em]p1.east){\small{Encoder-Decoder Attention}};
\node[anchor=north](word1-2) at ([xshift=0.0em,yshift=-2.0em]p2.south) {\small $\mathbi{K}$};
\node[anchor=west](word2-2) at ([xshift=2.2em]word1-2.east) {\small $\mathbi{V}$};
\node[anchor=east](word3-2) at ([xshift=-2.2em]word1-2.west) {\small $\mathbi{Q}$};
\draw[->,thick](word1-2.north)--(p2.south);
\draw[->,thick]([xshift=-3.6em]word1-2.north)--([xshift=-3.6em]p2.south);
......
......@@ -25,7 +25,7 @@
循环神经网络和卷积神经网络是两种经典的神经网络结构,在机器翻译中进行应用也是较为自然的想法。但是,这些模型在处理文字序列时也有问题:它们对序列中不同位置之间的依赖关系的建模并不直接。以卷积神经网络为例,如果要对长距离依赖进行描述,需要多层卷积操作,而且不同层之间信息传递也可能有损失,这些都限制了模型的能力。
为了更好地描述文字序列,研究人员提出了一种新的模型Transformer。Transformer并不依赖任何循环单元或者卷积单元,而是使用一种被称作自注意力网络的结构来对序列进行表示。自注意力可以非常高效的描述任意距离之间的依赖关系,因此非常适合处理语言文字序列。Transformer一经提出就受到了广泛关注,现在已经成为了机器翻译中最先进的架构之一。本章将会对Transformer的基本结构和实现技术进行介绍。这部分知识也会在本书的前沿部分({\chapterthirteen}$\sim${\chaptereighteen})中大量使用。
%----------------------------------------------------------------------------------------
% NEW SECTION 12.1
......@@ -56,7 +56,7 @@
\end{figure}
%----------------------------------------------
\parinterval 自注意力机制也可以被看是一个序列表示模型。比如,对于每个目标位置$j$,都生成一个与之对应的源语句子表示,它的形式为:
\begin{eqnarray}
\mathbi{C}_j & = & \sum_i \alpha_{i,j}\mathbi{h}_i
\label{eq:12-4201}
......@@ -118,7 +118,7 @@
\end{table}
%----------------------------------------------
\parinterval Transformer在被提出之后,很快就席卷了整个自然语言处理领域。实际上,也可以把Transformer当作一种表示模型,因此也被大量地使用在自然语言处理的其他领域,甚至图像处理和语音处理中也能看到它的影子。比如,目前非常流行的BERT等预训练模型就是基于Transformer。表\ref{tab:12-12}展示了Transformer在WMT英德和英法机器翻译任务上的性能。它能用更少的计算量(FLOPS)达到比其他模型更好的翻译品质\footnote{FLOPS = floating-point operations per second,即每秒浮点运算次数。它是度量计算机运算规模的常用单位}
%----------------------------------------------
\begin{table}[htp]
......@@ -398,11 +398,11 @@
%\parinterval 残差连接从广义上讲也叫短连接,指的是这种短距离的连接。它的思想很简单,就是把层和层之间的距离拉近。如图\ref{fig:12-49}所示,子层1通过残差连接跳过了子层2,直接和子层3进行信息传递。使信息传递变得更高效,有效解决了深层网络训练过程中容易出现的梯度消失/爆炸问题,使得深层网络的训练更加容易。其计算公式为:
%\begin{eqnarray}
%x_{l+1} = x_l + {F} (x_l)
%\label{eq:12-50}
%\end{eqnarray}
%\noindent 其中,$x_l$表示$l$层网络的输入向量,${F} (x_l)$是子层运算。如果$l=2$,那么公式\eqref{eq:12-50}可以解释为,第3层的输入($x_3$)等于第2层的输出(${F}(x_2)$)加上第二层的输入($x_2$)。图\ref{fig:12-50} 中的红色方框展示了Transformer 中残差连接的位置。
%----------------------------------------------
\begin{figure}[htp]
......@@ -415,7 +415,7 @@
\parinterval 在Transformer的训练过程中,由于引入了残差操作,将前面所有层的输出加到一起,如公式:
\begin{eqnarray}
x_{l+1} = x_l + F (x_l)
\label{eq:12-50}
\end{eqnarray}
......@@ -563,7 +563,7 @@ Transformer Deep(48层) & 30.2 & 43.1 & 194$\times 10^
\parinterval Transformer解码器生成译文词序列的过程和其它神经机器翻译系统类似,都是从左往右生成,且下一个单词的预测依赖已经生成的单词。其具体推断过程如图\ref{fig:12-56}所示,其中$\mathbi{C}_i$是编码-解码注意力的结果,解码器首先根据“<eos>”和$\mathbi{C}_1$生成第一个单词“how”,然后根据“how”和$\mathbi{C}_2$生成第二个单词“are”,以此类推,当解码器生成“<eos>”时结束推断。
\parinterval 但是,Transformer在推断阶段无法对所有位置进行并行化操作,因为对于每一个目标语单词都需要对前面所有单词进行注意力操作,因此它推断速度非常慢。可以采用的加速手段有:Cache(缓存需要重复计算的变量)\upcite{Vaswani2018Tensor2TensorFN}、低精度计算\upcite{DBLP:journals/corr/CourbariauxB16,Lin2020TowardsF8}、共享注意力网络等\upcite{Xiao2019SharingAW}。关于Transformer模型的推断加速方法将会在{\chapterfourteen}进一步深入讨论。
%----------------------------------------------
\begin{figure}[htp]
......@@ -587,7 +587,7 @@ Transformer Deep(48层) & 30.2 & 43.1 & 194$\times 10^
\begin{itemize}
\vspace{0.5em}
\item 近两年,有研究已经发现注意力机制可以捕捉一些语言现象\upcite{DBLP:journals/corr/abs-1905-09418}。比如,在Transformer的多头注意力中,不同的头往往会捕捉到不同的信息:有些头对低频词更加敏感,有些头更适合词义消歧,甚至有些头可以捕捉句法信息。此外,由于注意力机制增加了模型的复杂性,而且随着网络层数的增多,神经机器翻译中也存在大量的冗余,因此研发轻量的注意力模型也是具有实践意义的方向\upcite{Xiao2019SharingAW,DBLP:journals/corr/abs-1805-00631,Lin2020WeightDT,DBLP:conf/iclr/WuLLLH20,Kitaev2020ReformerTE,DBLP:journals/corr/abs-2005-00743,dai-etal-2019-transformer,DBLP:journals/corr/abs-2004-05150,DBLP:conf/iclr/RaePJHL20}。
\vspace{0.5em}
\item 神经机器翻译依赖成本较高的GPU设备,因此对模型的裁剪和加速也是很多系统研发人员所感兴趣的方向。比如,从工程上,可以考虑减少运算强度,比如使用低精度浮点数\upcite{Ott2018ScalingNM} 或者整数\upcite{DBLP:journals/corr/abs-1906-00532,Lin2020TowardsF8}进行计算,或者引入缓存机制来加速模型的推断\upcite{Vaswani2018Tensor2TensorFN};也可以通过对模型参数矩阵的剪枝来减小整个模型的体积\upcite{DBLP:journals/corr/SeeLM16};另一种方法是知识蒸馏\upcite{Hinton2015Distilling,kim-rush-2016-sequence}。 利用大模型训练小模型,这样往往可以得到比单独训练小模型更好的效果\upcite{DBLP:journals/corr/ChenLCL17}
\vspace{0.5em}
......
......@@ -3,13 +3,13 @@
\node [anchor=center] (node1) at (4.9,1) {\small{训练:}};
\node [anchor=center] (node11) at (5.5,1) {};
\node [anchor=center] (node12) at (6.7,1) {};
\node [anchor=center] (node2) at (4.9,0.5) {\small{}};
\node [anchor=center] (node21) at (5.5,0.5) {};
\node [anchor=center] (node22) at (6.7,0.5) {};
\node [anchor=west,line width=0.6pt,draw=black,minimum width=5.6em,minimum height=2.2em,fill=blue!20,rounded corners=2pt] (node1-1) at (0,0) {\footnotesize{双语数据}};
\node [anchor=south,line width=0.6pt,draw=black,minimum width=4.5em,minimum height=2.2em,fill=blue!20,rounded corners=2pt] (node1-2) at ([yshift=-5em]node1-1.south) {\footnotesize{目标语伪数据}};
\node [anchor=west,line width=0.6pt,draw=black,minimum width=4.5em,minimum height=2.2em,fill=red!20,rounded corners=2pt] (node2-1) at ([xshift=-8.8em,yshift=-2.5em]node1-1.west) {\footnotesize{反向翻译系统}};
\node [anchor=west,line width=0.6pt,draw=black,minimum width=4.5em,minimum height=2.2em,fill=red!20,rounded corners=2pt] (node3-1) at ([xshift=3em,yshift=-2.5em]node1-1.east) {\footnotesize{前向翻译系统}};
\draw [->,line width=1pt](node1-1.west)--([xshift=3em]node2-1.north);
\draw [->,line width=1pt](node1-1.east)--([xshift=-3em]node3-1.north);
......
......@@ -235,7 +235,7 @@
\end{pgfonlayer}
{\scriptsize
\node [anchor=center] (cy00-2) at ([xshift=6.7em,yshift=0.2em]pos4-212) {\tiny{$n$-best}};
\node [anchor=center,minimum height=1.8em,minimum width=0.8em,fill=orange!30] (cy11-2) at ([xshift=0.0em,yshift=-1.8em]pos4-212) {};
\node [anchor=center,minimum height=1.5em,minimum width=0.8em,fill=blue!30] (cy12-2) at ([xshift=1.3em,yshift=-0.15em]cy11-2) {};
\node [anchor=center,minimum height=2.5em,minimum width=0.8em,fill=black!30] (cy13-2) at ([xshift=1.3em,yshift=0.5em]cy12-2) {};
......
......@@ -5,7 +5,7 @@
\node [rectangle,inner sep=2pt,font=\scriptsize] (top) at ([yshift=3em,xshift=0em]center.north) {
\begin{tabular}{c}
翻译模型 \\
$\textrm{P}(\mathbi{y}|\mathbi{x})$
\end{tabular}
};
......@@ -24,7 +24,7 @@ The weather is \\so good today.
\node [rectangle,inner sep=2pt,font=\scriptsize] (down) at ([yshift=-3em,xshift=0em]center.south) {
\begin{tabular}{c}
翻译模型 \\
$\textrm{P}(\mathbi{x}|\mathbi{y})$
\end{tabular}
};
......
\begin{tikzpicture}
\begin{scope}
\node [anchor=center] (node1-1) at (0,0) {\small{$y'$}};
\node[anchor=south,line width=0.6pt,draw,rounded corners,minimum height=1.5em,minimum width=4em,fill=blue!20](node1-2) at ([yshift=-3em]node1-1.south) {\small{softmax}};
\node[anchor=south](node1-4) at ([xshift=-0.6em,yshift=-3em]node1-3.south) {\gray{\small{$y$}}};
\node[anchor=north,line width=0.6pt,draw,rounded corners,minimum height=2.2em,minimum width=4em,fill=red!20](node1-3) at ([yshift=-2.0em]node1-2.south) {\small{Decoder}};
\node[anchor=north,line width=0.6pt,draw,rounded corners,minimum height=2.2em,minimum width=4em,fill=yellow!20](node3-3) at ([yshift=-2.0em]node1-3.south) {\small{LM}};
\node[anchor=west,line width=0.6pt,draw,rounded corners,minimum height=1.5em,minimum width=4em,fill=blue!20](node3-2) at ([xshift=2em]node3-3.east) {\small{softmax}};
\node [anchor=north] (node3-1) at ([yshift=3.0em]node3-2.north) {\small{$z'$}};
\node[anchor=north](node3-41) at ([xshift=-0.6em,yshift=-2em]node3-3.south) {\small{$y$}};
\node[anchor=north](node3-42) at ([xshift=0.6em,yshift=-2em]node3-3.south) {\small{$z$}};
\node[anchor=east,line width=0.6pt,draw,rounded corners,minimum height=2.2em,minimum width=4em,fill=red!20](node2-1) at ([xshift=-2em]node1-3.west) {\small{Encoder}};
\node[anchor=north](node2-2) at ([yshift=-2em]node2-1.south) {\small{$x$}};
\node [rectangle,rounded corners,draw=red,line width=0.2mm,densely dashed,inner sep=0.4em] [fit = (node3-2) (node3-3)] (inputshadow) {};
\draw [->,thick,draw=gray](node1-4.north)--([xshift=-0.6em]node1-3.south);
\draw [->,thick](node1-3.north)--(node1-2);
\draw [->,thick](node1-2.north)--(node1-1);
\draw [->,thick](node2-2.north)--(node2-1);
......@@ -24,11 +27,31 @@
\draw [->,thick](node3-41.north)--([xshift=-0.6em]node3-3.south);
\draw [->,thick](node3-42.north)--([xshift=0.6em]node3-3.south);
\draw [->,thick]([xshift=0.6em]node3-3.north)--([xshift=0.6em]node3-2.south);
\draw [->,thick](node3-2.north)--(node3-1);
\draw[->,thick]([xshift=-0.6em]node3-3.north)--([xshift=-0.6em,yshift=0.6em]node3-3.north)--([xshift=-3em,yshift=0.6em]node3-3.north)--([xshift=-3em,yshift=-3em]node3-3.north)--([xshift=-5.6em,yshift=-3em]node3-3.north)--([xshift=0.6em]node1-3.south);
\draw[->,thick](node3-3.east)--(node3-2.west);
\node [anchor=east] (node2-1-1) at ([xshift=-12.0em,yshift=-4.25em]node1-1.west) {\small{$y'$}};
\node[anchor=south,line width=0.6pt,draw,rounded corners,minimum height=1.5em,minimum width=4em,fill=blue!20](node2-1-2) at ([yshift=-3em]node2-1-1.south) {\small{softmax}};
\node[anchor=north,line width=0.6pt,draw,rounded corners,minimum height=2.2em,minimum width=4em,fill=red!20](node2-1-3) at ([yshift=-2.0em]node2-1-2.south) {\small{Decoder}};
\node[anchor=east,line width=0.6pt,draw,rounded corners,minimum height=2.2em,minimum width=4em,fill=red!20](node2-2-1) at ([xshift=-2em]node2-1-3.west) {\small{Encoder}};
\node[anchor=north](node2-2-2) at ([yshift=-2em]node2-2-1.south) {\small{$x$}};
\node[anchor=north](node2-2-3) at ([yshift=-2em]node2-1-3.south) {\small{$y$}};
\draw [->,thick](node2-1-2.north)--(node2-1-1);
\draw [->,thick](node2-2-2.north)--(node2-2-1);
\draw[->,thick](node2-2-1.east)--(node2-1-3.west);
\draw [->,thick](node2-1-3.north)--(node2-1-2.south);
\draw [->,thick](node2-2-3.north)--(node2-1-3);
\node [anchor=east] (node1) at ([xshift=-2.0em,yshift=4em]node2-1-1.west) {\small{$x,y$:双语数据}};
\node [anchor=north] (node2) at ([xshift=0.45em]node1.south) {\small{$z$}:单语数据};
\node [anchor=north](pos1) at ([yshift=-3.5em]node3-3.south) {\small{(b)多任务学习}};
\node [anchor=east](pos2) at ([xshift=-10.0em]pos1.west) {\small{(a)单任务学习}};
%\draw[->](node2-1.north)--([yshift=1em]node2-1.north)--([xshift=2.5em,yshift=1em]node2-1.north)--([xshift=2.5em,yshift=-0.4em]node2-1.north)--(node1-3.west);
\end{scope}
\end{tikzpicture}
\ No newline at end of file
......@@ -6,17 +6,17 @@
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=3.0em] (a14) at ([yshift=-0.2em]a13.south) {感到};
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=3.0em] (a15) at ([yshift=-0.2em]a14.south) {满意};
\node [anchor=south east,inner sep=1pt,fill=black] (pa11) at (a11.south east) {\tiny{\color{white} \textbf{1}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pa12) at (a12.south east) {\tiny{\color{white} \textbf{2}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pa13) at (a13.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pa14) at (a14.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pa15) at (a15.south east) {\tiny{\color{white} \textbf{5}}};
\node [anchor=west,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=2.0em] (a21) at ([xshift=1.0em]a11.east) {\footnotesize{$P=0.1$}};
\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=2.0em] (a22) at ([yshift=-0.2em]a21.south) {\footnotesize{$P=0.1$}};
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=2.0em] (a23) at ([yshift=-0.2em]a22.south) {\footnotesize{$P=0.1$}};
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=2.0em] (a24) at ([yshift=-0.2em]a23.south) {\footnotesize{$P=0.1$}};
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=2.0em] (a25) at ([yshift=-0.2em]a24.south) {\footnotesize{$P=0.1$}};
\node [anchor=west,inner sep=2pt] (a31) at ([xshift=0.3em]a23.east) {$\Rightarrow$};
......@@ -25,13 +25,13 @@
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=3.0em] (a43) at ([yshift=-0.2em]a42.south) {感到};
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=3.0em] (a44) at ([yshift=-0.2em]a43.south) {满意};
\node [anchor=south east,inner sep=1pt,fill=black] (pa41) at (a41.south east) {\tiny{\color{white} \textbf{1}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pa42) at (a42.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pa43) at (a43.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pa44) at (a44.south east) {\tiny{\color{white} \textbf{5}}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (a10) at (a11.north) {\scriptsize{源语言}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (a20) at (a21.north) {\small{$P$}};
\node [anchor=south,inner sep=2pt] (a30) at (a41.north) {\scriptsize{丢弃的结果}};
\node [anchor=south,inner sep=2pt] (a30-2) at (a30.north) {\scriptsize{部分词随机}};
\node [anchor=north,inner sep=2pt] (pos1) at ([xshift=0.5em,yshift=-0.5em]a25.south) {\scriptsize{(a)部分词随机丢弃的加噪方法}};
......@@ -42,17 +42,17 @@
\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=3.0em] (b14) at ([yshift=-0.2em]b13.south) {感到};
\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=3.0em] (b15) at ([yshift=-0.2em]b14.south) {满意};
\node [anchor=south east,inner sep=1pt,fill=black] (pb11) at (b11.south east) {\tiny{\color{white} \textbf{1}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb12) at (b12.south east) {\tiny{\color{white} \textbf{2}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb13) at (b13.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb14) at (b14.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb15) at (b15.south east) {\tiny{\color{white} \textbf{5}}};
\node [anchor=west,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=2.0em] (b21) at ([xshift=1.0em]b11.east) {\footnotesize{$P=0.1$}};
\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=2.0em] (b22) at ([yshift=-0.2em]b21.south) {\footnotesize{$P=0.1$}};
\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=2.0em] (b23) at ([yshift=-0.2em]b22.south) {\footnotesize{$P=0.1$}};
\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=2.0em] (b24) at ([yshift=-0.2em]b23.south) {\footnotesize{$P=0.1$}};
\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=2.0em] (b25) at ([yshift=-0.2em]b24.south) {\footnotesize{$P=0.1$}};
\node [anchor=west,inner sep=2pt] (b31) at ([xshift=0.3em]b23.east) {$\Rightarrow$};
......@@ -62,14 +62,14 @@
\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=3.0em] (b44) at ([yshift=-0.2em]b43.south) {感到};
\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=3.0em] (b45) at ([yshift=-0.2em]b44.south) {满意};
\node [anchor=south east,inner sep=1pt,fill=black] (pb41) at (b41.south east) {\tiny{\color{white} \textbf{0}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb42) at (b42.south east) {\tiny{\color{white} \textbf{1}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb43) at (b43.south east) {\tiny{\color{white} \textbf{2}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb44) at (b44.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb45) at (b45.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb41) at (b41.south east) {\tiny{\color{white} \textbf{1}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb42) at (b42.south east) {\tiny{\color{white} \textbf{2}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb43) at (b43.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb44) at (b44.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pb45) at (b45.south east) {\tiny{\color{white} \textbf{5}}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (b10) at (b11.north) {\scriptsize{源语言}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (b20) at (b21.north) {\small{P}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (b20) at (b21.north) {\small{$P$}};
\node [anchor=south,inner sep=2pt] (b30) at (b41.north) {\scriptsize{屏蔽的结果}};
\node [anchor=south,inner sep=2pt] (b30-2) at (b30.north) {\scriptsize{部分词随机}};
\node [anchor=north,inner sep=2pt] (pos2) at ([xshift=0.5em,yshift=-0.5em]b25.south) {\scriptsize{(b)部分词随机屏蔽的加噪方法}};
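Panel (b) swaps deletion for masking: each source token is overwritten by a mask symbol with probability P = 0.1, so sentence length and all position indices are preserved. A sketch under the same assumptions as before; the "[Mask]" string is a placeholder, since the hunk does not show which symbol the figure actually uses.

import random

MASK = "[Mask]"  # placeholder symbol (an assumption)

def mask_words(tokens, p=0.1, seed=None):
    """Overwrite each token with MASK with probability p (panel (b));
    unlike dropping, sentence length and positions stay unchanged."""
    rng = random.Random(seed)
    return [MASK if rng.random() < p else tok for tok in tokens]

print(mask_words(["我", "对", "此", "感到", "满意"], seed=3))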
......@@ -80,23 +80,23 @@
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c14) at ([yshift=-0.2em]c13.south) {感到};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c15) at ([yshift=-0.2em]c14.south) {满意};
\node [anchor=south east,inner sep=1pt,fill=black] (pc11) at (c11.south east) {\tiny{\color{white} \textbf{0}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc12) at (c12.south east) {\tiny{\color{white} \textbf{1}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc13) at (c13.south east) {\tiny{\color{white} \textbf{2}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc14) at (c14.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc15) at (c15.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc11) at (c11.south east) {\tiny{\color{white} \textbf{1}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc12) at (c12.south east) {\tiny{\color{white} \textbf{2}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc13) at (c13.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc14) at (c14.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc15) at (c15.south east) {\tiny{\color{white} \textbf{5}}};
\node [anchor=west,inner sep=2pt] (c21) at ([xshift=0.3em]c11.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c22) at ([xshift=0.3em]c12.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c23) at ([xshift=0.3em]c13.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c24) at ([xshift=0.3em]c14.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c25) at ([xshift=0.3em]c15.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c21) at ([xshift=0.35em]c11.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c22) at ([xshift=0.35em]c12.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c23) at ([xshift=0.35em]c13.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c24) at ([xshift=0.35em]c14.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt] (c25) at ([xshift=0.35em]c15.east) {\footnotesize{+}};
\node [anchor=west,inner sep=2pt,fill=yellow!20,minimum height=1.5em] (c31) at ([xshift=0.352em]c21.east) {\footnotesize{2.54}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em] (c32) at ([yshift=-0.2em]c31.south) {\footnotesize{0.63}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em] (c33) at ([yshift=-0.2em]c32.south) {\footnotesize{1.77}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em] (c34) at ([yshift=-0.2em]c33.south) {\footnotesize{1.33}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em] (c35) at ([yshift=-0.2em]c34.south) {\footnotesize{2.15}};
\node [anchor=west,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=2.5em] (c31) at ([xshift=0.423em]c21.east) {\footnotesize{2.54}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=2.5em] (c32) at ([yshift=-0.2em]c31.south) {\footnotesize{0.63}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=2.5em] (c33) at ([yshift=-0.2em]c32.south) {\footnotesize{1.77}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=2.5em] (c34) at ([yshift=-0.2em]c33.south) {\footnotesize{1.33}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=2.5em] (c35) at ([yshift=-0.2em]c34.south) {\footnotesize{2.15}};
\node [anchor=west,inner sep=2pt] (c41) at ([xshift=0.55em]c31.east) {\footnotesize{=}};
\node [anchor=west,inner sep=2pt] (c42) at ([xshift=0.55em]c32.east) {\footnotesize{=}};
......@@ -104,31 +104,31 @@
\node [anchor=west,inner sep=2pt] (c44) at ([xshift=0.55em]c34.east) {\footnotesize{=}};
\node [anchor=west,inner sep=2pt] (c45) at ([xshift=0.55em]c35.east) {\footnotesize{=}};
\node [anchor=west,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c51) at ([xshift=0.55em]c41.east) {\footnotesize{$S_{0}=2.54$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c52) at ([yshift=-0.2em]c51.south) {\footnotesize{$S_{1}=1.63$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c53) at ([yshift=-0.2em]c52.south) {\footnotesize{$S_{2}=3.77$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c54) at ([yshift= -0.2em]c53.south) {\footnotesize{$S_{3}=4.33$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c55) at ([yshift=-0.2em]c54.south) {\footnotesize{$S_{4}=6.15$}};
\node [anchor=west,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c51) at ([xshift=0.56em]c41.east) {\footnotesize{$S_{1}=3.54$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c52) at ([yshift=-0.2em]c51.south) {\footnotesize{$S_{2}=2.63$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c53) at ([yshift=-0.2em]c52.south) {\footnotesize{$S_{3}=4.77$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c54) at ([yshift= -0.2em]c53.south) {\footnotesize{$S_{4}=5.33$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c55) at ([yshift=-0.2em]c54.south) {\footnotesize{$S_{5}=7.15$}};
\node [anchor=west,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c61) at ([xshift=3.72em]c51.east) {\footnotesize{$S_{0}^{'}=1.63$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c62) at ([yshift=-0.2em]c61.south) {\footnotesize{$S_{1}^{'}=2.54$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c63) at ([yshift=-0.2em]c62.south) {\footnotesize{$S_{2}^{'}=3.77$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c64) at ([yshift=-0.2em]c63.south) {\footnotesize{$S_{3}^{'}=4.33$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c65) at ([yshift=-0.2em]c64.south) {\footnotesize{$S_{4}^{'}=6.15$}};
\node [anchor=west,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c61) at ([xshift=4.55em]c51.east) {\footnotesize{$S_{2}^{'}=2.63$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c62) at ([yshift=-0.2em]c61.south) {\footnotesize{$S_{1}^{'}=3.54$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c63) at ([yshift=-0.2em]c62.south) {\footnotesize{$S_{3}^{'}=4.77$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c64) at ([yshift=-0.2em]c63.south) {\footnotesize{$S_{4}^{'}=5.33$}};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c65) at ([yshift=-0.2em]c64.south) {\footnotesize{$S_{5}^{'}=7.15$}};
\node [anchor=north,inner sep=2pt] (c71) at ([yshift=-12.3em]b31.south) {$\Rightarrow$};
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=3.0em] (c81) at ([xshift=2.0em]c61.east) {};
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=3.0em] (c81) at ([xshift=1.99em]c61.east) {};
\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=3.0em] (c82) at ([yshift=-0.2em]c81.south) {};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c83) at ([yshift=-0.2em]c82.south) {};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c84) at ([yshift=-0.2em]c83.south) {感到};
\node [anchor=north,inner sep=2pt,fill=yellow!20,minimum height=1.5em,minimum width=3.0em] (c85) at ([yshift=-0.2em]c84.south) {满意};
\node [anchor=south east,inner sep=1pt,fill=black] (pc81) at (c81.south east) {\tiny{\color{white} \textbf{0}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc81) at (c81.south east) {\tiny{\color{white} \textbf{2}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc82) at (c82.south east) {\tiny{\color{white} \textbf{1}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc83) at (c83.south east) {\tiny{\color{white} \textbf{2}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc84) at (c84.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc85) at (c85.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc83) at (c83.south east) {\tiny{\color{white} \textbf{3}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc84) at (c84.south east) {\tiny{\color{white} \textbf{4}}};
\node [anchor=south east,inner sep=1pt,fill=black] (pc85) at (c85.south east) {\tiny{\color{white} \textbf{5}}};
\draw [->,dashed](c51.east)--(c62.west);
\draw [->,dashed](c52.east)--(c61.west);
......@@ -137,8 +137,8 @@
\draw [->,dashed](c55.east)--(c65.west);
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (c10) at (c11.north) {\scriptsize{源语言}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (c30) at (c31.north) {\small{n=3}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (c50) at (c51.north) {\small{S}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (c30) at (c31.north) {\small{$n=3$}};
\node [anchor=south,inner sep=2pt,minimum height=1.5em,minimum width=3.0em] (c50) at (c51.north) {\small{$S$}};
\node [anchor=south,inner sep=2pt] (c60) at (c61.north) {\scriptsize{进行排序}};
\node [anchor=south,inner sep=2pt] (c60-2) at (c60.north) {\scriptsize{由小到大}};
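Panel (c) is a bounded local shuffle: the 1-based position i of each token receives uniform noise from [0, n] with n = 3, giving a score S_i = i + noise_i (e.g. S_1 = 1 + 2.54 = 3.54); sorting the tokens by S from small to large (由小到大) yields the new order, so the scores 3.54, 2.63, 4.77, 5.33, 7.15 reorder positions 1..5 into 2, 1, 3, 4, 5. Because the noise is bounded by n, no token can move more than n positions from where it started. A minimal sketch, again with an assumed function name and example sentence:

import random

def shuffle_words(tokens, n=3, seed=None):
    """Locally shuffle tokens (panel (c)): add uniform noise from [0, n]
    to each 1-based position and reorder by the noised scores S."""
    rng = random.Random(seed)
    scores = [i + rng.uniform(0, n) for i, _ in enumerate(tokens, start=1)]
    order = sorted(range(len(tokens)), key=scores.__getitem__)
    return [tokens[i] for i in order]

print(shuffle_words(["我", "对", "此", "感到", "满意"], seed=3))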
......
......@@ -20,8 +20,8 @@
\node [anchor=west] (a15-2) at ([xshift=-4.25em]a15.west) {\tiny{$\cdots$}};
\node [anchor=east] (a13-3) at ([yshift=0.8em]a13-2.west) {\small{无监督语言}};
\node [anchor=north] (a13-4) at ([xshift=0em]a13-3.south) {\small{模型隐藏层}};
\node [anchor=east] (a13-3) at ([yshift=0.8em]a13-2.west) {\small{语言模型}};
\node [anchor=north] (a13-4) at ([xshift=0em]a13-3.south) {\small{隐藏层}};
\node [anchor=east] (a14-3) at ([yshift=0.8em]a14-2.west) {\small{神经机器翻译}};
\node [anchor=north] (a14-4) at ([xshift=0.5em]a14-3.south) {\small{模型隐藏层}};
......
......@@ -4337,6 +4337,43 @@ year = {2012}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 10------------------------------------------------------
@inproceedings{DBLP:conf/acl/LiLWJXZLL20,
author = {Bei Li and
Hui Liu and
Ziyang Wang and
Yufan Jiang and
Tong Xiao and
Jingbo Zhu and
Tongran Liu and
Changliang Li},
title = {Does Multi-Encoder Help? {A} Case Study on Context-Aware Neural Machine
Translation},
pages = {3512--3518},
publisher = {Association for Computational Linguistics},
year = {2020}
}
@inproceedings{DBLP:conf/emnlp/MiWI16,
author = {Haitao Mi and
Zhiguo Wang and
Abe Ittycheriah},
title = {Supervised Attentions for Neural Machine Translation},
pages = {2283--2288},
publisher = {The Association for Computational Linguistics},
year = {2016}
}
@inproceedings{DBLP:conf/coling/LiuUFS16,
author = {Lemao Liu and
Masao Utiyama and
Andrew M. Finch and
Eiichiro Sumita},
title = {Neural Machine Translation with Supervised Attention},
pages = {3093--3102},
publisher = {The Association for Computational Linguistics},
year = {2016}
}
@inproceedings{devlin-etal-2014-fast,
author = {Jacob Devlin and
Rabih Zbib and
......@@ -4378,13 +4415,15 @@ year = {2012}
year = {1998},
}
@article{BENGIO1994Learning,
author ={Y. {Bengio} and P. {Simard} and P. {Frasconi}},
journal ={IEEE Transactions on Neural Networks},
title ={Learning long-term dependencies with gradient descent is difficult},
year ={1994},
volume ={5},
number ={2},
pages ={157-166},
author = {Yoshua Bengio and
Patrice Y. Simard and
Paolo Frasconi},
title = {Learning long-term dependencies with gradient descent is difficult},
journal = {{IEEE} Transactions on Neural Networks},
volume = {5},
number = {2},
pages = {157--166},
year = {1994}
}
@inproceedings{NIPS2017_7181,
author = {Ashish Vaswani and
......@@ -4460,7 +4499,7 @@ pages ={157-166},
title = {Learning Deep Transformer Models for Machine Translation},
pages = {1810--1822},
publisher = {Association for Computational Linguistics},
year = {2019},
year = {2019}
}
@article{Li2020NeuralMT,
author = {Yanyang Li and
......@@ -4860,21 +4899,24 @@ pages ={157-166},
year={2018}
}
@inproceedings{Lin2020TowardsF8,
title={Towards Fully 8-bit Integer Inference for the Transformer Model},
author={Y. Lin and Yanyang Li and Tengbo Liu and Tong Xiao and T. Liu and Jingbo Zhu},
publisher={International Joint Conference on Artificial Intelligence},
year={2020}
author = {Ye Lin and
Yanyang Li and
Tengbo Liu and
Tong Xiao and
Tongran Liu and
Jingbo Zhu},
title = {Towards Fully 8-bit Integer Inference for the Transformer Model},
pages = {3759--3765},
publisher = {International Joint Conference on Artificial Intelligence},
year = {2020}
}
@inproceedings{kim-rush-2016-sequence,
title = "Sequence-Level Knowledge Distillation",
author = "Kim, Yoon and
Rush, Alexander M.",
publisher = "Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2016",
//address = "Austin, Texas",
//publisher = "Association for Computational Linguistics",
pages = "1317--1327",
author = {Yoon Kim and
Alexander M. Rush},
title = {Sequence-Level Knowledge Distillation},
pages = {1317--1327},
publisher = {The Association for Computational Linguistics},
year = {2016}
}
@article{Akaike1969autoregressive,
author = {Hirotugu Akaike},
......@@ -4914,16 +4956,14 @@ pages ={157-166},
year={2018}
}
@inproceedings{cho-etal-2014-properties,
title = "On the Properties of Neural Machine Translation: Encoder--Decoder Approaches",
author = {Cho, Kyunghyun and
van Merri{\"e}nboer, Bart and
Bahdanau, Dzmitry and
Bengio, Yoshua},
month = oct,
year = "2014",
address = "Doha, Qatar",
publisher = "Association for Computational Linguistics",
pages = "103--111",
author = {Kyunghyun Cho and
Bart van Merri{\"e}nboer and
Dzmitry Bahdanau and
Yoshua Bengio},
title = {On the Properties of Neural Machine Translation: Encoder-Decoder Approaches},
pages = {103--111},
publisher = {Association for Computational Linguistics},
year = {2014}
}
@inproceedings{DBLP:conf/acl/JeanCMB15,
......@@ -4948,10 +4988,14 @@ pages ={157-166},
year = {2015}
}
@inproceedings{He2016ImprovedNM,
title={Improved Neural Machine Translation with SMT Features},
author={W. He and Zhongjun He and Hua Wu and H. Wang},
booktitle={AAAI Conference on Artificial Intelligence},
year={2016}
author = {Wei He and
Zhongjun He and
Hua Wu and
Haifeng Wang},
title = {Improved Neural Machine Translation with {SMT} Features},
pages = {151--157},
publisher = {Association for the Advancement of Artificial Intelligence},
year = {2016}
}
@inproceedings{zhang-etal-2017-prior,
title = {Prior Knowledge Integration for Neural Machine Translation using Posterior Regularization},
......@@ -4966,57 +5010,162 @@ pages ={157-166},
}
@inproceedings{duan-etal-2020-bilingual,
title = "Bilingual Dictionary Based Neural Machine Translation without Using Parallel Sentences",
author = "Duan, Xiangyu and
Ji, Baijun and
Jia, Hao and
Tan, Min and
Zhang, Min and
Chen, Boxing and
Luo, Weihua and
Zhang, Yue",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
pages = "1570--1579",
author = {Xiangyu Duan and
Baijun Ji and
Hao Jia and
Min Tan and
Min Zhang and
Boxing Chen and
Weihua Luo and
Yue Zhang},
title = {Bilingual Dictionary Based Neural Machine Translation without Using
Parallel Sentences},
pages = {1570--1579},
publisher = {Association for Computational Linguistics},
year = {2020}
}
@inproceedings{cao-xiong-2018-encoding,
title = "Encoding Gated Translation Memory into Neural Machine Translation",
author = "Cao, Qian and
Xiong, Deyi",
month = oct,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
pages = "3042--3047",
author = {Qian Cao and
Deyi Xiong},
title = {Encoding Gated Translation Memory into Neural Machine Translation},
pages = {3042--3047},
publisher = {Association for Computational Linguistics},
year = {2018}
}
@inproceedings{yang-etal-2016-hierarchical,
title = "Hierarchical Attention Networks for Document Classification",
author = "Yang, Zichao and
Yang, Diyi and
Dyer, Chris and
He, Xiaodong and
Smola, Alex and
Hovy, Eduard",
month = jun,
year = "2016",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
pages = "1480--1489",
author = {Zichao Yang and
Diyi Yang and
Chris Dyer and
Xiaodong He and
Alexander J. Smola and
Eduard H. Hovy},
title = {Hierarchical Attention Networks for Document Classification},
pages = {1480--1489},
publisher = {The Association for Computational Linguistics},
year = {2016}
}
%%%%% chapter 10------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 11------------------------------------------------------
@inproceedings{DBLP:conf/acl-codeswitch/WangCK18,
author = {Changhan Wang and
Kyunghyun Cho and
Douwe Kiela},
title = {Code-Switched Named Entity Recognition with Embedding Attention},
pages = {154--158},
publisher = {Association for Computational Linguistics},
year = {2018}
}
@inproceedings{DBLP:conf/emnlp/LiDWCM17,
author = {Peng-Hsuan Li and
Ruo-Ping Dong and
Yu-Siang Wang and
Ju-Chieh Chou and
Wei-Yun Ma},
title = {Leveraging Linguistic Structures for Named Entity Recognition with
Bidirectional Recursive Neural Networks},
pages = {2664--2669},
publisher = {Association for Computational Linguistics},
year = {2017}
}
@inproceedings{DBLP:conf/acl/MaH16,
author = {Xuezhe Ma and
Eduard H. Hovy},
title = {End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF},
publisher = {The Association for Computer Linguistics},
year = {2016}
}
@inproceedings{DBLP:conf/emnlp/StrubellVBM17,
author = {Emma Strubell and
Patrick Verga and
David Belanger and
Andrew McCallum},
title = {Fast and Accurate Entity Recognition with Iterated Dilated Convolutions},
pages = {2670--2680},
publisher = {Association for Computational Linguistics},
year = {2017}
}
@inproceedings{DBLP:conf/cncl/ZhouZXQBX17,
author = {Peng Zhou and
Suncong Zheng and
Jiaming Xu and
Zhenyu Qi and
Hongyun Bao and
Bo Xu},
title = {Joint Extraction of Multiple Relations and Entities by Using a Hybrid
Neural Network},
volume = {10565},
pages = {135--146},
publisher = {Springer},
year = {2017}
}
@article{DBLP:journals/jmlr/CollobertWBKKK11,
author = {Ronan Collobert and
Jason Weston and
L{\'{e}}on Bottou and
Michael Karlen and
Koray Kavukcuoglu and
Pavel P. Kuksa},
title = {Natural Language Processing (Almost) from Scratch},
journal = {Journal of Machine Learning Research},
volume = {12},
pages = {2493--2537},
year = {2011}
}
@inproceedings{DBLP:conf/acl/NguyenG15,
author = {Thien Huu Nguyen and
Ralph Grishman},
title = {Event Detection and Domain Adaptation with Convolutional Neural Networks},
pages = {365--371},
publisher = {The Association for Computer Linguistics},
year = {2015}
}
@inproceedings{DBLP:conf/aaai/LaiXLZ15,
author = {Siwei Lai and
Liheng Xu and
Kang Liu and
Jun Zhao},
title = {Recurrent Convolutional Neural Networks for Text Classification},
pages = {2267--2273},
publisher = {Association for the Advancement of Artificial Intelligence},
year = {2015}
}
@inproceedings{DBLP:conf/acl/ChenXLZ015,
author = {Yubo Chen and
Liheng Xu and
Kang Liu and
Daojian Zeng and
Jun Zhao},
title = {Event Extraction via Dynamic Multi-Pooling Convolutional Neural Networks},
pages = {167--176},
publisher = {The Association for Computer Linguistics},
year = {2015}
}
@inproceedings{DBLP:conf/emnlp/LeiBJ15,
author = {Tao Lei and
Regina Barzilay and
Tommi S. Jaakkola},
title = {Molding CNNs for text: non-linear, non-consecutive convolutions},
pages = {1565--1575},
publisher = {The Association for Computational Linguistics},
year = {2015}
}
@inproceedings{DBLP:conf/naacl/Johnson015,
author = {Rie Johnson and
Tong Zhang},
editor = {Rada Mihalcea and
Joyce Yue Chai and
Anoop Sarkar},
title = {Effective Use of Word Order for Text Categorization with Convolutional
Neural Networks},
pages = {103--112},
......@@ -5027,10 +5176,6 @@ pages ={157-166},
@inproceedings{DBLP:conf/naacl/NguyenG15,
author = {Thien Huu Nguyen and
Ralph Grishman},
editor = {Phil Blunsom and
Shay B. Cohen and
Paramveer S. Dhillon and
Percy Liang},
title = {Relation Extraction: Perspective from Convolutional Neural Networks},
pages = {39--48},
publisher = {The Association for Computational Linguistics},
......@@ -5411,6 +5556,51 @@ pages ={157-166},
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 12------------------------------------------------------
@inproceedings{DBLP:conf/iclr/RaePJHL20,
author = {Jack W. Rae and
Anna Potapenko and
Siddhant M. Jayakumar and
Chloe Hillier and
Timothy P. Lillicrap},
title = {Compressive Transformers for Long-Range Sequence Modelling},
publisher = {OpenReview.net},
year = {2020}
}
@article{DBLP:journals/corr/abs-2004-05150,
author = {Iz Beltagy and
Matthew E. Peters and
Arman Cohan},
title = {Longformer: The Long-Document Transformer},
journal = {CoRR},
volume = {abs/2004.05150},
year = {2020}
}
@article{DBLP:journals/corr/abs-2005-00743,
author = {Yi Tay and
Dara Bahri and
Donald Metzler and
Da-Cheng Juan and
Zhe Zhao and
Che Zheng},
title = {Synthesizer: Rethinking Self-Attention in Transformer Models},
journal = {CoRR},
volume = {abs/2005.00743},
year = {2020}
}
@inproceedings{DBLP:conf/iclr/WuLLLH20,
author = {Zhanghao Wu and
Zhijian Liu and
Ji Lin and
Yujun Lin and
Song Han},
title = {Lite Transformer with Long-Short Range Attention},
publisher = {OpenReview.net},
year = {2020}
}
@inproceedings{DBLP:journals/corr/abs-1905-09418,
author = {Elena Voita and
David Talbot and
......@@ -5506,18 +5696,16 @@ pages ={157-166},
}
@inproceedings{dai-etal-2019-transformer,
title = "Transformer-{XL}: Attentive Language Models beyond a Fixed-Length Context",
author = "Dai, Zihang and
Yang, Zhilin and
Yang, Yiming and
Carbonell, Jaime and
Le, Quoc and
Salakhutdinov, Ruslan",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
pages = "2978--2988",
author = {Zihang Dai and
Zhilin Yang and
Yiming Yang and
Jaime G. Carbonell and
Quoc Viet Le and
Ruslan Salakhutdinov},
title = {Transformer-XL: Attentive Language Models beyond a Fixed-Length Context},
pages = {2978--2988},
publisher = {Association for Computational Linguistics},
year = {2019}
}
@article{Liu2020LearningTE,
title={Learning to Encode Position for Transformer with Continuous Dynamical Model},
......@@ -5533,10 +5721,16 @@ pages ={157-166},
year={2019}
}
@inproceedings{Yang2018ModelingLF,
title={Modeling Localness for Self-Attention Networks},
author={Baosong Yang and Zhaopeng Tu and Derek F. Wong and Fandong Meng and Lidia S. Chao and T. Zhang},
publisher={Conference on Empirical Methods in Natural Language Processing},
year={2018}
author = {Baosong Yang and
Zhaopeng Tu and
Derek F. Wong and
Fandong Meng and
Lidia S. Chao and
Tong Zhang},
title = {Modeling Localness for Self-Attention Networks},
pages = {4449--4458},
publisher = {Association for Computational Linguistics},
year = {2018}
}
@inproceedings{DBLP:journals/corr/abs-1904-03107,
author = {Baosong Yang and
......@@ -5557,16 +5751,27 @@ pages ={157-166},
volume={abs/2002.06714}
}
@inproceedings{Bapna2018TrainingDN,
title={Training Deeper Neural Machine Translation Models with Transparent Attention},
author={Ankur Bapna and M. Chen and Orhan Firat and Yuan Cao and Y. Wu},
publisher={Conference on Empirical Methods in Natural Language Processing},
year={2018}
author = {Ankur Bapna and
Mia Xu Chen and
Orhan Firat and
Yuan Cao and
Yonghui Wu},
title = {Training Deeper Neural Machine Translation Models with Transparent
Attention},
pages = {3028--3033},
publisher = {Association for Computational Linguistics},
year = {2018}
}
@inproceedings{Dou2018ExploitingDR,
title={Exploiting Deep Representations for Neural Machine Translation},
author={Zi-Yi Dou and Zhaopeng Tu and Xing Wang and Shuming Shi and T. Zhang},
publisher={Conference on Empirical Methods in Natural Language Processing},
year={2018}
author = {Zi-Yi Dou and
Zhaopeng Tu and
Xing Wang and
Shuming Shi and
Tong Zhang},
title = {Exploiting Deep Representations for Neural Machine Translation},
pages = {4253--4262},
publisher = {Association for Computational Linguistics},
year = {2018}
}
@inproceedings{Wang2019ExploitingSC,
title={Exploiting Sentential Context for Neural Machine Translation},
......@@ -5576,10 +5781,16 @@ pages ={157-166},
}
@inproceedings{Dou2019DynamicLA,
title={Dynamic Layer Aggregation for Neural Machine Translation},
author={Zi-Yi Dou and Zhaopeng Tu and Xing Wang and Longyue Wang and Shuming Shi and T. Zhang},
publisher={AAAI Conference on Artificial Intelligence},
year={2019}
author = {Zi-Yi Dou and
Zhaopeng Tu and
Xing Wang and
Longyue Wang and
Shuming Shi and
Tong Zhang},
title = {Dynamic Layer Aggregation for Neural Machine Translation with Routing-by-Agreement},
pages = {86--93},
publisher = {Association for the Advancement of Artificial Intelligence},
year = {2019}
}
@inproceedings{Wei2020MultiscaleCD,
title={Multiscale Collaborative Deep Models for Neural Machine Translation},
......@@ -5589,18 +5800,32 @@ pages ={157-166},
}
@inproceedings{Vaswani2018Tensor2TensorFN,
title={Tensor2Tensor for Neural Machine Translation},
author={Ashish Vaswani and S. Bengio and E. Brevdo and F. Chollet and Aidan N. Gomez and S. Gouws and Llion Jones and L. Kaiser and Nal Kalchbrenner and Niki Parmar and Ryan Sepassi and Noam Shazeer and Jakob Uszkoreit},
booktitle={American Mobile Telecommunications Association },
year={2018}
author = {Ashish Vaswani and
Samy Bengio and
Eugene Brevdo and
Fran{\c{c}}ois Chollet and
Aidan N. Gomez and
Stephan Gouws and
Llion Jones and
Lukasz Kaiser and
Nal Kalchbrenner and
Niki Parmar and
Ryan Sepassi and
Noam Shazeer and
Jakob Uszkoreit},
title = {Tensor2Tensor for Neural Machine Translation},
pages = {193--199},
publisher = {Association for Machine Translation in the Americas},
year = {2018}
}
@inproceedings{Kitaev2020ReformerTE,
title={Reformer: The Efficient Transformer},
author={Nikita Kitaev and L. Kaiser and Anselm Levskaya},
journal={ArXiv},
year={2020},
volume={abs/2001.04451}
author = {Nikita Kitaev and
Lukasz Kaiser and
Anselm Levskaya},
title = {Reformer: The Efficient Transformer},
publisher = {OpenReview.net},
year = {2020}
}
@article{Lin2020WeightDT,
......@@ -5614,7 +5839,7 @@ pages ={157-166},
@article{li2020shallow,
title={Shallow-to-Deep Training for Neural Machine Translation},
author={Li, Bei and Wang, Ziyang and Liu, Hui and Jiang, Yufan and Du, Quan and Xiao, Tong and Wang, Huizhen and Zhu, Jingbo},
journal={arXiv preprint arXiv:2010.03737},
publisher={Conference on Empirical Methods in Natural Language Processing},
year={2020}
}
%%%%% chapter 12------------------------------------------------------
......@@ -6072,6 +6297,364 @@ pages ={157-166},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2019}
}
@article{2015OnGulcehre,
  title = {On Using Monolingual Corpora in Neural Machine Translation},
  author = {Caglar Gulcehre and
            Orhan Firat and
            Kelvin Xu and
            Kyunghyun Cho and
            Loic Barrault and
            Huei-Chi Lin and
            Fethi Bougares and
            Holger Schwenk and
            Yoshua Bengio},
  journal = {CoRR},
  volume = {abs/1503.03535},
  year = {2015}
}
@phdthesis{黄书剑0统计机器翻译中的词对齐研究,
  title={统计机器翻译中的词对齐研究},
  author={黄书剑},
  school={南京大学},
  year={2012}
}
@article{DBLP:journals/corr/MikolovLS13,
author = {Tomas Mikolov and
Quoc V. Le and
Ilya Sutskever},
title = {Exploiting Similarities among Languages for Machine Translation},
journal = {CoRR},
volume = {abs/1309.4168},
year = {2013}
}
@inproceedings{DBLP:conf/acl/VulicK16,
author = {Ivan Vulic and
Anna Korhonen},
title = {On the Role of Seed Lexicons in Learning Bilingual Word Embeddings},
publisher = {The Association for Computer Linguistics},
year = {2016}
}
@inproceedings{DBLP:conf/iclr/SmithTHH17,
author = {Samuel L. Smith and
David H. P. Turban and
Steven Hamblin and
Nils Y. Hammerla},
title = {Offline bilingual word vectors, orthogonal transformations and the
inverted softmax},
publisher = {International Conference on Learning Representations},
year = {2017}
}
@inproceedings{DBLP:conf/acl/ArtetxeLA17,
author = {Mikel Artetxe and
Gorka Labaka and
Eneko Agirre},
title = {Learning bilingual word embeddings with (almost) no bilingual data},
pages = {451--462},
publisher = {Association for Computational Linguistics},
year = {2017}
}
@article{1966ASchnemann,
  title={A generalized solution of the orthogonal {Procrustes} problem},
  author={Sch{\"o}nemann, Peter H.},
  journal={Psychometrika},
  volume={31},
  number={1},
  pages={1--10},
  year={1966}
}
@inproceedings{DBLP:conf/iclr/LampleCRDJ18,
author = {Guillaume Lample and
Alexis Conneau and
Marc'Aurelio Ranzato and
Ludovic Denoyer and
Herv{\'{e}} J{\'{e}}gou},
title = {Word translation without parallel data},
publisher = {International Conference on Learning Representations},
year = {2018}
}
@inproceedings{DBLP:conf/acl/ZhangLLS17,
author = {Meng Zhang and
Yang Liu and
Huanbo Luan and
Maosong Sun},
title = {Adversarial Training for Unsupervised Bilingual Lexicon Induction},
pages = {1959--1970},
publisher = {Association for Computational Linguistics},
year = {2017}
}
@inproceedings{DBLP:conf/emnlp/XuYOW18,
author = {Ruochen Xu and
Yiming Yang and
Naoki Otani and
Yuexin Wu},
title = {Unsupervised Cross-lingual Transfer of Word Embedding Spaces},
pages = {2465--2474},
publisher = {Association for Computational Linguistics},
year = {2018}
}
@inproceedings{DBLP:conf/emnlp/Alvarez-MelisJ18,
author = {David Alvarez-Melis and
Tommi S. Jaakkola},
title = {Gromov-Wasserstein Alignment of Word Embedding Spaces},
pages = {1881--1890},
publisher = {Association for Computational Linguistics},
year = {2018}
}
@inproceedings{DBLP:conf/lrec/GarneauGBDL20,
author = {Nicolas Garneau and
Mathieu Godbout and
David Beauchemin and
Audrey Durand and
Luc Lamontagne},
title = {A Robust Self-Learning Method for Fully Unsupervised Cross-Lingual
Mappings of Word Embeddings: Making the Method Robustly Reproducible
as Well},
pages = {5546--5554},
publisher = {European Language Resources Association},
year = {2020}
}
@inproceedings{DBLP:conf/naacl/XingWLL15,
author = {Chao Xing and
Dong Wang and
Chao Liu and
Yiye Lin},
title = {Normalized Word Embedding and Orthogonal Transform for Bilingual Word
Translation},
pages = {1006--1011},
publisher = {The Association for Computational Linguistics},
year = {2015}
}
@inproceedings{DBLP:conf/emnlp/VulicGRK19,
author = {Ivan Vulic and
Goran Glavas and
Roi Reichart and
Anna Korhonen},
title = {Do We Really Need Fully Unsupervised Cross-Lingual Embeddings?},
pages = {4406--4417},
publisher = {Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/acl/SogaardVR18,
author = {Anders S{\o}gaard and
Sebastian Ruder and
Ivan Vulic},
title = {On the Limitations of Unsupervised Bilingual Dictionary Induction},
pages = {778--788},
publisher = {Association for Computational Linguistics},
year = {2018}
}
@article{DBLP:journals/talip/MarieF20,
author = {Benjamin Marie and
Atsushi Fujita},
title = {Iterative Training of Unsupervised Neural and Statistical Machine
Translation Systems},
journal = {{ACM} Transactions on Asian and Low-Resource Language Information Processing},
volume = {19},
number = {5},
pages = {68:1--68:21},
year = {2020}
}
@inproceedings{DBLP:conf/acl/ArtetxeLA19,
author = {Mikel Artetxe and
Gorka Labaka and
Eneko Agirre},
title = {An Effective Approach to Unsupervised Machine Translation},
pages = {194--203},
publisher = {Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/acl/PourdamghaniAGK19,
author = {Nima Pourdamghani and
Nada Aldarrab and
Marjan Ghazvininejad and
Kevin Knight and
Jonathan May},
title = {Translating Translationese: {A} Two-Step Approach to Unsupervised
Machine Translation},
pages = {3057--3062},
publisher = {Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/iclr/LampleCDR18,
author = {Guillaume Lample and
Alexis Conneau and
Ludovic Denoyer and
Marc'Aurelio Ranzato},
title = {Unsupervised Machine Translation Using Monolingual Corpora Only},
publisher = {International Conference on Learning Representations},
year = {2018}
}
@inproceedings{DBLP:conf/nips/ConneauL19,
author = {Alexis Conneau and
Guillaume Lample},
title = {Cross-lingual Language Model Pretraining},
pages = {7057--7067},
year = {2019}
}
@article{DBLP:journals/ipm/FarhanTAJATT20,
author = {Wael Farhan and
Bashar Talafha and
Analle Abuammar and
Ruba Jaikat and
Mahmoud Al-Ayyoub and
Ahmad Bisher Tarakji and
Anas Toma},
title = {Unsupervised dialectal neural machine translation},
journal = {Information Processing \& Management},
volume = {57},
number = {3},
pages = {102181},
year = {2020}
}
@article{A2020Li,
title={A Simple and Effective Approach to Robust Unsupervised Bilingual Dictionary Induction},
author={Yanyang Li and Yingfeng Luo and Ye Lin and Quan Du and Huizhen Wang and Shujian Huang and Tong Xiao and Jingbo Zhu},
publisher={International Conference on Computational Linguistics},
year={2020}
}
@inproceedings{2018When,
  title={When and Why are Pre-trained Word Embeddings Useful for Neural Machine Translation?},
  author={Ye Qi and
          Devendra Singh Sachan and
          Matthieu Felix and
          Sarguna Janani Padmanabhan and
          Graham Neubig},
  publisher={Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year={2018}
}
@inproceedings{DBLP:conf/emnlp/ClinchantJN19,
author = {St{\'{e}}phane Clinchant and
Kweon Woo Jung and
Vassilina Nikoulina},
title = {On the use of {BERT} for Neural Machine Translation},
pages = {108--117},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/emnlp/ImamuraS19,
author = {Kenji Imamura and
Eiichiro Sumita},
title = {Recycling a Pre-trained {BERT} Encoder for Neural Machine Translation},
booktitle = {Proceedings of the 3rd Workshop on Neural Generation and Translation@EMNLP-IJCNLP
2019, Hong Kong, November 4, 2019},
pages = {23--31},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/aaai/YangW0Z00020,
author = {Jiacheng Yang and
Mingxuan Wang and
Hao Zhou and
Chengqi Zhao and
Weinan Zhang and
Yong Yu and
Lei Li},
title = {Towards Making the Most of {BERT} in Neural Machine Translation},
pages = {9378--9385},
publisher = {AAAI Conference on Artificial Intelligence},
year = {2020}
}
@inproceedings{DBLP:conf/aaai/WengYHCL20,
author = {Rongxiang Weng and
Heng Yu and
Shujian Huang and
Shanbo Cheng and
Weihua Luo},
title = {Acquiring Knowledge from Pre-Trained Model to Neural Machine Translation},
pages = {9266--9273},
publisher = {AAAI Conference on Artificial Intelligence},
year = {2020}
}
@article{DBLP:journals/corr/abs-2001-08210,
author = {Yinhan Liu and
Jiatao Gu and
Naman Goyal and
Xian Li and
Sergey Edunov and
Marjan Ghazvininejad and
Mike Lewis and
Luke Zettlemoyer},
title = {Multilingual Denoising Pre-training for Neural Machine Translation},
journal = {CoRR},
volume = {abs/2001.08210},
year = {2020}
}
@inproceedings{DBLP:conf/aaai/JiZDZCL20,
author = {Baijun Ji and
Zhirui Zhang and
Xiangyu Duan and
Min Zhang and
Boxing Chen and
Weihua Luo},
title = {Cross-Lingual Pre-Training Based Transfer for Zero-Shot Neural Machine
Translation},
pages = {115--122},
publisher = {AAAI Conference on Artificial Intelligence},
year = {2020}
}
@inproceedings{DBLP:conf/acl/LewisLGGMLSZ20,
author = {Mike Lewis and
Yinhan Liu and
Naman Goyal and
Marjan Ghazvininejad and
Abdelrahman Mohamed and
Omer Levy and
Veselin Stoyanov and
Luke Zettlemoyer},
title = {{BART:} Denoising Sequence-to-Sequence Pre-training for Natural Language
Generation, Translation, and Comprehension},
pages = {7871--7880},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2020}
}
@article{DBLP:journals/corr/abs-2009-08088,
author = {Zhen Yang and
Bojie Hu and
Ambyera Han and
Shen Huang and
Qi Ju},
title = {Code-switching pre-training for neural machine translation},
journal = {CoRR},
volume = {abs/2009.08088},
year = {2020}
}
@article{DBLP:journals/corr/abs-2010-09403,
author = {Dusan Varis and
Ondrej Bojar},
title = {Unsupervised Pretraining for Neural Machine Translation Using Elastic
Weight Consolidation},
journal = {CoRR},
volume = {abs/2010.09403},
year = {2020}
}
@inproceedings{DBLP:conf/emnlp/LampleOCDR18,
author = {Guillaume Lample and
Myle Ott and
Alexis Conneau and
Ludovic Denoyer and
Marc'Aurelio Ranzato},
title = {Phrase-Based {\&} Neural Unsupervised Machine Translation},
pages = {5039--5049},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2018}
}
@article{DBLP:journals/jbd/ShortenK19,
author = {Connor Shorten and
Taghi M. Khoshgoftaar},
title = {A survey on Image Data Augmentation for Deep Learning},
journal = {Journal of Big Data},
volume = {6},
pages = {60},
year = {2019}
}
%%%%% chapter 16------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......
......@@ -666,8 +666,7 @@ addtohook={%
\usepackage[mathscr]{euscript}
%%%%%%%%%%%chapter 11---------------------------------------
\newlength{\bc}
\setlength{\bc}{0.4cm}
\newcommand{\mychapter}[1]{\ref{#1}}%chapter用
\newcommand{\mysection}[1]{\ref{#1}}%section、subsection、subsubsection用
......