Commit 7d792ef5 by 曹润柘

合并分支 'caorunzhe' 到 'master'

Caorunzhe

查看合并请求 !909
parents 6bdba3bf 6f871f60
......@@ -12,7 +12,7 @@
% RNN Encoder
\coordinate (eemb0) at (0,0);
\foreach \x [count=\y from 0] in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$\textrm{e}_x()$}};
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.6\base]eemb\y.east) {\tiny{$\textrm{e}_x()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$\mathbi{h}_{m-2}$}};
......@@ -73,7 +73,7 @@
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\mathbi{s}_i$}};
\node [anchor=east] (line1) at ([xshift=-4em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\mathbi{s}_i$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\funp{P}(y_i|...)$}};
......@@ -87,7 +87,7 @@
}
{
\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}};
\node [anchor=west] (line21) at ([xshift=3.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\mathbi{C}$}};
......@@ -97,21 +97,21 @@
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red,minimum width =9em] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=red] [fit = (softmax1) (softmax2) (softmax3)] (box4) {};
\draw [->,dotted,very thick,red] ([yshift=1em,xshift=2.5em]box1.east) -- ([yshift=1em,xshift=0.1em]box1.east);
\draw [->,dotted,very thick,red] ([yshift=1em,xshift=3.5em]box1.east) -- ([yshift=1em,xshift=0.1em]box1.east);
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen,minimum width =9em] [fit = (line11) (line12) (line13) (line14)] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (eemb1) (eemb2) (eemb3)] (box5) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (demb1) (demb2) (demb3)] (box6) {};
\draw [->,dotted,very thick,ugreen] ([yshift=-1.3em,xshift=2.5em]box2.east) -- ([yshift=-1.3em,xshift=0.1em]box2.east);
\draw [->,dotted,very thick,ugreen] ([yshift=-1.7em,xshift=3.5em]box2.east) -- ([yshift=-1.7em,xshift=0.1em]box2.east);
\draw [->,dotted,very thick,ugreen] ([xshift=0.1em]box6.west) .. controls +(west:1) and +(east:1) .. ([yshift=1.0em]box2.east) ;
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!10,drop shadow,draw=purple] [fit = (line21) (line22) (line23) (line24)] (box3) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=purple] [fit = (enc3)] (box7) {};
\draw [->,dotted,very thick,purple] ([xshift=0.1em]box7.east) -- ([xshift=0.8em]box7.east) ;
\draw [->,dotted,very thick,purple] ([xshift=0.1em]box7.east) -- ([xshift=2.8em]box7.east) ;
}
\end{pgfonlayer}
......
......@@ -10,11 +10,11 @@
% RNN Encoder
\coordinate (eemb0) at (0,0);
\foreach \x [count=\y from 0] in {1,2,...,4}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {};
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.7\base]eemb\y.east) {};
\foreach \x in {1,2,...,4}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.5\base]eemb\x.north) {};
\node[wordnode,left=0.4\base of enc1,font=\footnotesize] (init) {0};
\node[wordnode,anchor=east] (init2) at ([xshift=-3.0em]init.west){};
\node[wordnode,anchor=east] (init2) at ([xshift=-3em]init.west){};
{
\node[rnnnode,fill=purple] (repr) at (enc4) {};
\node[wordnode] (label) at ([yshift=2.5em]enc4.north) {
......@@ -35,7 +35,7 @@
% RNN Decoder
\foreach \x in {1,2,...,4}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([xshift=9.5em,yshift=-3.9em]enc\x.north) {};
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([xshift=12.5em,yshift=-3.9em]enc\x.north) {};
\foreach \x in {1,2,...,4}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.5\base]demb\x.north) {};
\foreach \x in {1,2,...,4}
......
......@@ -11,7 +11,7 @@
%\setlength{\mystep}{1.6em}
\foreach \x in {1,2,...,6}
\node[] (s\x) at (\x * 1.6em,0) {};
\node[] (s\x) at (\x * 2em,0) {};
\node [] (ws1) at (s1) {\scriptsize{}};
\node [] (ws2) at (s2) {\scriptsize{}};
......@@ -21,7 +21,7 @@
\node [] (ws6) at (s6) {\scriptsize{句子}};
\foreach \x in {1,2,...,6}
\node[] (t\x) at (\x * 1.6em + 2.4in,0) {};
\node[] (t\x) at (\x * 2em + 2.8in,0) {};
\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
......@@ -30,9 +30,9 @@
\node [] (wt5) at (t5) {\scriptsize{long}};
\node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};
\node [anchor=south west,fill=red!30,minimum width=1.6in,minimum height=1.5em] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
\node [anchor=west,fill=green!30,minimum height=1.5em] (representation) at ([xshift=1em]encoder.east) {\footnotesize{表示}};
\node [anchor=south west,fill=red!30,minimum width=1.9in,minimum height=1.5em] (encoder) at ([yshift=1.0em,xshift=-0.4em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=green!30,minimum height=1.5em] (representation) at ([xshift=2em]encoder.east) {\footnotesize{表示}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=2em]representation.east) {\footnotesize{Decoder}};
\draw [->,thick] ([xshift=1pt]encoder.east)--([xshift=-1pt]representation.west);
\draw [->,thick] ([xshift=1pt]representation.east)--([xshift=-1pt]decoder.west);
......
......@@ -5,7 +5,7 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=4em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];
......@@ -137,13 +137,13 @@
}
{
\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
\node [anchor=east] (vocab) at ([xshift=-8em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
\node [anchor=south] (vocablabel) at (vocab.north) {\scriptsize{单词的概率分布}};
\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east);
}
{
\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\scriptsize{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-1.5em,yshift=-0.5em]wo1.west) {\scriptsize{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) .. ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\scriptsize{top-3}};
{
......
......@@ -13,7 +13,7 @@
% RNN Encoder
\coordinate (eemb0) at (0,0);
\foreach \x [count=\y from 0] in {1,2,...,10}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {};
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.7\base]eemb\y.east) {};
\foreach \x in {1,2,...,10}
\node[rnnnode,fill=blue!30!white,anchor=south] (backenc\x) at ([yshift=0.5\base]eemb\x.north) {};
\foreach \x in {1,2,...,10}
......
......@@ -8,25 +8,25 @@
\begin{scope}
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\mathbi{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\mathbi{h}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\mathbi{h}_m$}};
\node [anchor=west,draw,fill=red!30!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\mathbi{h}_1$}};
\node [anchor=west,draw,fill=red!30!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=2em]h1.east) {\scriptsize{$\mathbi{h}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=1em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!30!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=1em]h3.east) {\scriptsize{$\mathbi{h}_m$}};
\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east);
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\mathbi{s}_{j-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\mathbi{s}_{j}$}};
\node [anchor=south,draw,fill=green!30!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\mathbi{s}_{j-1}$}};
\node [anchor=west,draw,fill=green!30!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2.5em]th1.east) {\scriptsize{$\mathbi{s}_{j}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.2,left] {\scriptsize{$\alpha_{1,j}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\scriptsize{$\alpha_{2,j}$}};
\draw [->] (h4.north) .. controls +(north:0.8) and +(east:1) .. (sum.-10) node [pos=0.1,left] (alphan) {\scriptsize{$\alpha_{m,j}$}};
\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=-2em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=2em]th2.east);
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\mathbi{C}_{j}$}};
\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\scriptsize{输出层}};
......@@ -39,7 +39,7 @@
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\scriptsize{(位置$m$)}};
{
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\mathbi{C}_j = \sum_{i} \alpha_{i,j} \mathbi{h}_i $ \ \ \ \ };
\node [anchor=west] (math1) at ([xshift=7em,yshift=1em]th2.east) {$\mathbi{C}_j = \sum_{i} \alpha_{i,j} \mathbi{h}_i $ \ \ \ \ };
}
{
\node [anchor=north west] (math2) at ([yshift=-2em,xshift=-0.2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{i'} \exp(\beta_{i',j})}$};
......@@ -48,10 +48,10 @@
\begin{pgfonlayer}{background}
{
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=blue!10,drop shadow,minimum width=9.5em] [fit = (math1)] (box1) {};
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=blue!20,drop shadow,minimum width=9.5em] [fit = (math1)] (box1) {};
}
{
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=orange!10,drop shadow,minimum width=9.5em] [fit = (math2) (math3)] (box2) {};
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=orange!20,drop shadow,minimum width=9.5em] [fit = (math2) (math3)] (box2) {};
}
\end{pgfonlayer}
......
......@@ -17,7 +17,7 @@
\node [samplenode,anchor=south west,font=\scriptsize] (batch\i) at ([shift={(-1em,-0.5em)}]batch\j.south west) {句子 \k};
\draw [decorate,decoration={brace}] (batch1.south east) to node [auto,rotate=30,anchor=north,font=\scriptsize] {batch大小} (batch3.south east);
\node [samplenode,anchor=west,font=\scriptsize] (sample2) at ([xshift=4em]batch2.east) {句子2};
\node [samplenode,anchor=west,font=\scriptsize] (sample2) at ([xshift=5em]batch2.east) {句子2};
\node [samplenode,anchor=south,font=\scriptsize] (sample3) at ([yshift=3em]sample2.north) {句子3};
\node [samplenode,anchor=north,font=\scriptsize] (sample1) at ([yshift=-3em]sample2.south) {句子1};
......@@ -26,18 +26,18 @@
\foreach \i in {1,2,3}
{
\coordinate (start) at ([xshift=2em]sample\i.east);
\coordinate (start) at ([xshift=3em]sample\i.east);
\node [wordnode,anchor=west] (rnn0) at (start) {$0$};
\foreach \j [count=\k from 0] in {1,2,3}
{
\node [rnnnode,anchor=west] (rnn\j) at ([xshift=1em]rnn\k.east) {};
\node [rnnnode,anchor=west] (rnn\j) at ([xshift=1.5em]rnn\k.east) {};
\draw [-latex'] (rnn\k) to (rnn\j);
\coordinate (in\j) at ([yshift=-1em]rnn\j.south);
\draw [-latex'] (in\j) to (rnn\j.south);
\coordinate (out\j) at ([yshift=1em]rnn\j.north);
\draw [-latex'] (rnn\j.north) to (out\j);
}
\node [wordnode,anchor=west] (rnn4) at ([xshift=1em]rnn3.east) {$\cdots$};
\node [wordnode,anchor=west] (rnn4) at ([xshift=1.5em]rnn3.east) {$\cdots$};
\draw [-latex'] (rnn3) to (rnn4);
\node [draw,densely dashed,thick,rounded corners=0.3em,fit=(start) (in3) (out3) (rnn4),label={[font=\footnotesize,rotate=90,anchor=north]0:设备\i}] (rnn) {};
\draw [->,double] ([xshift=3pt]sample\i.east) -- ([xshift=-3pt]rnn.west);
......
......@@ -42,7 +42,7 @@
}
{
\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
\node [anchor=east] (vocab) at ([xshift=-7em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
\node [anchor=south] (vocablabel) at (vocab.north) {\scriptsize{单词的概率分布}};
\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east);
}
......
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.7em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\mathbi{h}_1$}};
\node [anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{...}};
......@@ -139,7 +139,7 @@
\draw [->] ([yshift=-0.3em]s1.west) .. controls +(west:2) and +(-50:0.3) .. (c2.-40);
}
{
\draw [->] (c2.0) -- ([xshift=1.2in]c2.0) -- ([yshift=0.3em,xshift=-1.2em]s2.west) -- ([yshift=0.3em,xshift=-0.1em]s2.west);
\draw [->] (c2.0) -- ([xshift=1.4in]c2.0) -- ([yshift=0.3em,xshift=-1em]s2.west) -- ([yshift=0.3em,xshift=-0.1em]s2.west);
}
{
......
......@@ -10,7 +10,7 @@
% RNN Encoder
\coordinate (eemb0) at (0,0);
\foreach \x [count=\y from 0] in {1,2,...,10}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {};
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.7\base]eemb\y.east) {};
\foreach \x in {1,2,...,10}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc1\x) at ([yshift=0.3\base]eemb\x.north) {};
\foreach \x in {1,2,...,10}
......@@ -121,10 +121,10 @@
}
\coordinate (bridge) at ([yshift=1.4\base]enc16.north west);
\draw[-latex'] (enc210.north) .. controls +(north:0.4\base) and +(east:1.5\base) .. (bridge) .. controls +(west:8.0\base) and +(south west:0.8\base) .. (dec21.west);
\draw[-latex'] (enc210.north) .. controls +(north:0.6\base) and +(east:2\base) .. (bridge) .. controls +(west:8.0\base) and +(south west:2.8\base) .. (dec21.west);
\coordinate (bridge) at ([yshift=1.6\base]enc16.north west);
\draw[-latex'] (enc110.east) .. controls +(east:0.5\base) and +(east:8\base) .. (bridge) .. controls +(west:7.5\base) and +(south west:0.1\base) .. (dec11.west);
\draw[-latex'] ([yshift=-0.5em]enc110.east) .. controls +(north:1.5\base) and +(east:8\base) .. (bridge) .. controls +(west:7.5\base) and +(south west:1.2\base) .. (dec11.west);
% stack RNN
\begin{pgfonlayer}{background}
......
......@@ -11,7 +11,7 @@
%%% a simple encoder-decoder model
\begin{scope}
\foreach \x in {1,2,...,6}
\node[] (s\x) at (\x * 1.6em,0) {};
\node[] (s\x) at (\x * 2em,0) {};
\node [] (ws1) at (s1) {\scriptsize{}};
\node [] (ws2) at (s2) {\scriptsize{}};
......@@ -21,7 +21,7 @@
\node [] (ws6) at (s6) {\scriptsize{句子}};
\foreach \x in {1,2,...,6}
\node[] (t\x) at (\x * 1.6em + 2.4in,0) {};
\node[] (t\x) at (\x * 2em + 2.8in,0) {};
\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
......@@ -30,9 +30,9 @@
\node [] (wt5) at (t5) {\scriptsize{long}};
\node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};
\node [anchor=south west,fill=red!30,minimum width=1.6in,minimum height=1.5em] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
\node [anchor=west,fill=green!30,minimum height=1.5em] (representation) at ([xshift=1em]encoder.east) {\footnotesize{表示}};
\node [anchor=south west,fill=red!30,minimum width=1.9in,minimum height=1.5em] (encoder) at ([yshift=1.0em,xshift=-0.4em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=green!30,minimum height=1.5em] (representation) at ([xshift=2em]encoder.east) {\footnotesize{表示}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=2em]representation.east) {\footnotesize{Decoder}};
\draw [->,thick] ([xshift=1pt]encoder.east)--([xshift=-1pt]representation.west);
\draw [->,thick] ([xshift=1pt]representation.east)--([xshift=-1pt]decoder.west);
......@@ -42,7 +42,7 @@
\foreach \x in {1,2,...,5}
\draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\draw[<-,thick] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\node [anchor=north] (cap) at ([xshift=2em,yshift=-2.5em]encoder.south east) {\small{(a) 简单的编码器-解码器框架}};
\end{scope}
......@@ -50,7 +50,7 @@
%%% a encoder-decoder model with attention
\begin{scope}[yshift=-1.7in]
\foreach \x in {1,2,...,6}
\node[] (s\x) at (\x * 1.6em,0) {};
\node[] (s\x) at (\x * 2em,0) {};
\node [] (ws1) at (s1) {\scriptsize{}};
\node [] (ws2) at (s2) {\scriptsize{}};
......@@ -60,7 +60,7 @@
\node [] (ws6) at (s6) {\scriptsize{句子}};
\foreach \x in {1,2,...,6}
\node[] (t\x) at (\x * 1.6em + 2.4in,0) {};
\node[] (t\x) at (\x * 2em + 2.8in,0) {};
\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
......@@ -69,8 +69,8 @@
\node [] (wt5) at (t5) {\scriptsize{long}};
\node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};
\node [anchor=south west,fill=red!30,minimum width=1.6in,minimum height=1.5em] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
\node [anchor=south west,fill=red!30,minimum width=1.9in,minimum height=1.5em] (encoder) at ([yshift=1.0em,xshift=-0.4em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=6.4em]encoder.east) {\footnotesize{Decoder}};
\foreach \x in {1,2,...,6}
\draw[->] ([yshift=0.1em]s\x.north) -- ([yshift=1.2em]s\x.north);
......@@ -78,11 +78,11 @@
\foreach \x in {1,2,...,5}
\draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\draw[<-,thick] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$\mathbi{C}_1$}} -- ([yshift=3em]t1.north) ;
\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$\mathbi{C}_2$}} -- ([yshift=3em]t2.north) ;
\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$\mathbi{C}_i$}} -- ([yshift=3.5em]t4.north) ;
\draw [->,thick] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$\mathbi{C}_1$}} -- ([yshift=3em]t1.north) ;
\draw [->,thick] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$\mathbi{C}_2$}} -- ([yshift=3em]t2.north) ;
\draw [->,thick] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$\mathbi{C}_i$}} -- ([yshift=3.5em]t4.north) ;
\node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...};
\node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...};
......
......@@ -11,7 +11,7 @@
% RNN Encoder
\coordinate (eemb0) at (0,0);
\foreach \x [count=\y from 0] in {1,2,...,10}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {};
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.7\base]eemb\y.east) {};
\foreach \x in {1,2,...,10}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.5\base]eemb\x.north) {};
\node[wordnode,left=0.4\base of enc1] (init) {$0$};
......@@ -114,7 +114,7 @@
% legend
\begin{scope}[shift={(10\base,2.5\base)}]
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,label={[label distance=3pt,font=\scriptsize]0:词嵌入层}] (emb) at (0,0) {};
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,label={[label distance=3pt,font=\scriptsize]0:词嵌入层}] (emb) at (3,0) {};
\node[rnnnode,fill=blue!30!white,anchor=north west,label={[label distance=3pt,font=\scriptsize]0:循环单元}] (rnn) at ([yshift=2\base]emb.south west) {};
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=north west,label={[label distance=3pt,font=\scriptsize]0:输出层}] (softmax) at ([yshift=2\base]rnn.south west) {};
\node [anchor=north west] (softmax2) at ([xshift=0.6\base]softmax.south west) {\scriptsize{Softmax}};
......
......@@ -5,7 +5,7 @@
\begin{tikzpicture}
\begin{scope}[local bounding box=WMT]
\draw[->,thick] (0.4,0) to (9.5,0);
\draw[->,thick] (0.4,0) to (11.5,0);
\draw[->,thick] (0.4,-0) to (0.4,4.3);
\draw[thick] (0.4,2) to (0.6,2);
\draw[thick] (0.4,4) to (0.6,4);
......@@ -13,38 +13,38 @@
\node[font=\scriptsize] at (0,4) {20};
% 2015
\node[minimum width=0.5cm,thick,minimum height=7*0.2cm,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2015) at (1.5*0.7,0.5pt) {};
\node[minimum width=0.5cm,thick,minimum height=2*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2015) at (smt2015.south east) {};
\node[minimum width=0.7cm,thick,minimum height=7*0.2cm,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2015) at (1.5*0.7,0.5pt) {};
\node[minimum width=0.7cm,thick,minimum height=2*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2015) at (smt2015.south east) {};
\node[font=\scriptsize,anchor=north] () at ([yshift=-0.2em]smt2015.south east) {2015};
% 2016
\node[minimum width=0.5cm,thick,minimum height=3*0.2cm,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2016) at ($(nmt2015.south east)+(0.7,0)$) {};
\node[minimum width=0.5cm,thick,minimum height=8*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2016) at (smt2016.south east) {};
\node[minimum width=0.7cm,thick,minimum height=3*0.2cm,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2016) at ($(nmt2015.south east)+(0.7,0)$) {};
\node[minimum width=0.7cm,thick,minimum height=8*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2016) at (smt2016.south east) {};
\node[font=\scriptsize,anchor=north] () at ([yshift=-0.2em]smt2016.south east) {2016};
% 2017
\node[minimum width=0.5cm,thick,minimum height=3*0.2cm,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2017) at ($(nmt2016.south east)+(0.7,0)$) {};
\node[minimum width=0.5cm,thick,minimum height=13*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2017) at (smt2017.south east) {};
\node[minimum width=0.7cm,thick,minimum height=3*0.2cm,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2017) at ($(nmt2016.south east)+(0.7,0)$) {};
\node[minimum width=0.7cm,thick,minimum height=13*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2017) at (smt2017.south east) {};
\node[font=\scriptsize,anchor=north] () at ([yshift=-0.2em]smt2017.south east) {2017};
% 2018
\node[minimum width=0.5cm,thick,minimum height=0cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2018) at ($(nmt2017.south east)+(0.7,0)$) {};
\node[minimum width=0.5cm,thick,minimum height=14*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2018) at (smt2018.south east) {};
\node[minimum width=0.7cm,thick,minimum height=0cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2018) at ($(nmt2017.south east)+(0.7,0)$) {};
\node[minimum width=0.7cm,thick,minimum height=14*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2018) at (smt2018.south east) {};
\node[font=\scriptsize,anchor=north] () at ([yshift=-0.2em]smt2018.south east) {2018};
% 2019
\node[minimum width=0.5cm,thick,minimum height=0cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2019) at ($(nmt2018.south east)+(0.7,0)$) {};
\node[minimum width=0.5cm,thick,minimum height=21*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2019) at (smt2019.south east) {};
\node[minimum width=0.7cm,thick,minimum height=0cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2019) at ($(nmt2018.south east)+(0.7,0)$) {};
\node[minimum width=0.7cm,thick,minimum height=21*0.2cm,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2019) at (smt2019.south east) {};
\node[font=\scriptsize,anchor=north] () at ([yshift=-0.2em]smt2019.south east) {2019};
\end{scope}
% legend
\ExtractX{$(nmt2015.west)$}
\ExtractY{$(WMT.north)$}
\node[minimum width=0.5cm,rectangle,fill=blue!30!white,anchor=north west,label={[label distance=1pt,font=\scriptsize]0:统计机器翻译}] () at (\XCoord,\YCoord) {};
\node[minimum width=0.7cm,rectangle,fill=blue!30!white,anchor=north west,label={[label distance=1pt,font=\scriptsize]0:统计机器翻译}] () at (\XCoord,\YCoord) {};
\ExtractX{$(nmt2017.west)$}
\node[minimum width=0.5cm,rectangle,fill=red!30!white,anchor=north west,label={[label distance=1pt,font=\scriptsize]0:神经机器翻译}] () at (\XCoord,\YCoord) {};
\node[minimum width=0.7cm,rectangle,fill=red!30!white,anchor=north west,label={[label distance=1pt,font=\scriptsize]0:神经机器翻译}] () at (\XCoord,\YCoord) {};
% \node[font=\normalsize,rotate=90] () at ([xshift=-1em]WMT.west) {数量};
\node[font=\normalsize] () at (0.4,4.5) {数量};
\node[font=\normalsize] () at (9.5,-0.3) {年份};
\node[font=\normalsize] () at (11.5,-0.3) {年份};
\end{tikzpicture}
\ No newline at end of file
......@@ -19,13 +19,13 @@
\end{scope}
\begin{scope}[xshift=1.3in]
\begin{scope}[xshift=1.8in]
\node [anchor=south west,draw,thick,red,minimum width=0.9in,minimum height=0.7in] (space1) at (0,0) {};
\node [anchor=south west,fill=blue,minimum width=0.1in,minimum height=0.1in] (unit1) at (0.2,0.8) {};
\node [anchor=south west,fill=ugreen,minimum width=0.1in,minimum height=0.1in] (unit2) at (0.7,0.3) {};
\node [anchor=south west,draw,thick,red,minimum width=0.9in,minimum height=0.7in] (space2) at (1.1in,0) {};
\node [anchor=south west,circle,fill=orange,minimum width=0.1in,minimum height=0.1in] (unit3) at (1.5in,1.3) {};
\node [anchor=south west,draw,thick,red,minimum width=0.9in,minimum height=0.7in] (space2) at (1.3in,0) {};
\node [anchor=south west,circle,fill=orange,minimum width=0.1in,minimum height=0.1in] (unit3) at (1.7in,1.3) {};
\draw [->] ([yshift=1pt]unit1.north) .. controls +(north:0.4) and +(west:2) .. ([yshift=0.0em,xshift=-1pt]unit3.west);
\draw [->] ([xshift=1pt]unit2.east) .. controls +(east:1.5) and +(south:1) .. ([xshift=0.0em,yshift=-1pt]unit3.south);
......
......@@ -5,26 +5,26 @@
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{${h}(\textrm{})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{${h}(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{${h}(\textrm{})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{${h}(\textrm{})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=2em]value1.south east) {\scriptsize{${h}(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=2em]value2.south east) {\scriptsize{${h}(\textrm{})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=2em]value3.south east) {\scriptsize{${h}(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{${h}(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{${h}(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{${h}(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{${h}(\textrm{})$}};
\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{${s}(\textrm{you})$}};
\node [rnode,anchor=east] (query) at ([xshift=-4em]key1.west) {\scriptsize{${s}(\textrm{you})$}};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
\node [anchor=south east] (alpha1) at ([xshift=1em]key1.north east) {\scriptsize{$\alpha_1=.4$}};
\node [anchor=south east] (alpha2) at ([xshift=1em]key2.north east) {\scriptsize{$\alpha_2=.4$}};
\node [anchor=south east] (alpha3) at ([xshift=1em]key3.north east) {\scriptsize{$\alpha_3=.1$}};
\node [anchor=south east] (alpha4) at ([xshift=1em]key4.north east) {\scriptsize{$\alpha_4=.1$}};
\node [anchor=south east] (alpha1) at ([xshift=1.5em]key1.north east) {\scriptsize{$\alpha_1=0.4$}};
\node [anchor=south east] (alpha2) at ([xshift=1.5em]key2.north east) {\scriptsize{$\alpha_2=0.4$}};
\node [anchor=south east] (alpha3) at ([xshift=1.5em]key3.north east) {\scriptsize{$\alpha_3=0.1$}};
\node [anchor=south east] (alpha4) at ([xshift=1.5em]key4.north east) {\scriptsize{$\alpha_4=0.1$}};
\end{scope}
\end{tikzpicture}
\ No newline at end of file
......@@ -5,12 +5,12 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=blue!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value2) at ([xshift=2em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=2em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value4) at ([xshift=2em]value3.south east) {\scriptsize{value$_4$}};
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
......@@ -21,7 +21,7 @@
\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
\node [rnode,anchor=east,pattern=horizontal lines] (query) at ([xshift=-3em]key1.west) {};
\node [rnode,anchor=east,pattern=horizontal lines] (query) at ([xshift=-4em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north) node [pos=0.5,below,yshift=0.2em] {\scriptsize{匹配}};
......
......@@ -4,7 +4,7 @@
\begin{tikzpicture}
\footnotesize{
\begin{axis}[
width=.60\textwidth,
width=.8\textwidth,
height=.40\textwidth,
legend style={at={(0.60,0.08)}, anchor=south west},
xlabel={\scriptsize{更新次数}},
......
......@@ -2,7 +2,7 @@
%\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\begin{tikzpicture}
\begin{axis}[
width=10cm, height=5cm,
width=12cm, height=5cm,
symbolic x coords={1-15,16-25,26-35,>35},
xtick=data,
ytick={6,12,...,28},
......
......@@ -6,11 +6,11 @@
\begin{scope}[scale=0.9]
{\Large
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=3em,minimum height=0.8em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=3.6em,minimum height=0.8em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\node [anchor=west,rnnnode] (node11) at (0,0) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=1em]node11.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=1em]node12.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=1em]node13.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\scriptsize{RNN Cell}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e1) at ([yshift=-1em]node11.south) {\scriptsize{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e2) at ([yshift=-1em]node12.south) {\scriptsize{}};
......
......@@ -12,9 +12,9 @@
% Encoder
\begin{scope}
\node[rnnnode,fill=green!20] (encemb1) at (0,0) {};
\node[rnnnode,fill=green!20,right=\base of encemb1] (encemb2) {};
\node[rnnnode,draw=white,fill=white,right=\base of encemb2] (encemb3) {$\cdots$};
\node[rnnnode,fill=green!20,right=\base of encemb3] (encemb4) {};
\node[rnnnode,fill=green!20,right=1.8\base of encemb1] (encemb2) {};
\node[rnnnode,draw=white,fill=white,right=1.8\base of encemb2] (encemb3) {$\cdots$};
\node[rnnnode,fill=green!20,right=1.8\base of encemb3] (encemb4) {};
\node[rnnnode,above=\base of encemb1] (enc11) {};
\node[rnnnode,above=\base of encemb2] (enc12) {};
......@@ -103,10 +103,10 @@
\node[rnnnode,fill=orange!20,minimum width=3.5cm,anchor=south west] (attention) at ([yshift=\base]enc61.north west) {注意力机制};
\begin{scope}
\node[rnnnode,fill=green!20,right=2.5cm of encemb4] (decemb1) {};
\node[rnnnode,fill=green!20,right=\base of decemb1] (decemb2) {};
\node[rnnnode,draw=white,fill=white,right=\base of decemb2] (decemb3) {$\cdots$};
\node[rnnnode,fill=green!20,right=\base of decemb3] (decemb4) {};
\node[rnnnode,fill=green!20,right=3cm of encemb4] (decemb1) {};
\node[rnnnode,fill=green!20,right=1.8\base of decemb1] (decemb2) {};
\node[rnnnode,draw=white,fill=white,right=1.8\base of decemb2] (decemb3) {$\cdots$};
\node[rnnnode,fill=green!20,right=1.8\base of decemb3] (decemb4) {};
\node[rnnnode,above=\base of decemb1] (dec11) {};
\node[rnnnode,above=\base of decemb2] (dec12) {};
......@@ -189,7 +189,7 @@
\end{scope}
% attention connections
\draw[-latex',rounded corners=2pt] (dec11) -| ([xshift=-0.4cm]attention.south east);
\draw[-latex',rounded corners=2pt] (dec11) -| ([xshift=-0.2cm]attention.south east);
\ExtractX{$([xshift=9pt]attention.east)$}
\ExtractY{$([yshift=2pt]dec11.north)$}
......@@ -224,11 +224,11 @@
\draw[decorate,decoration={brace,mirror}] ([xshift=5pt]dec14.south east) to node [auto,swap,font=\scriptsize,name=label2] {8层} ([xshift=5pt]dec54.north east);
\begin{pgfonlayer}{background}
\coordinate (tmp) at ([xshift=-4pt]label1.west);
\node[draw,densely dashed,rounded corners=2pt,inner sep=2pt,fit=(label1) (encword1) (attention) (tmp)] (encoder) {};
\node[thick,draw,densely dashed,rounded corners=2pt,inner sep=2pt,fit=(label1) (encword1) (attention) (tmp)] (encoder) {};
\ExtractX{$([xshift=4pt]label2.east)$}
\ExtractY{$([yshift=6pt]decoutword4.north)$}
\coordinate (tmp) at (\XCoord,\YCoord);
\node[draw,densely dashed,rounded corners=2pt,inner sep=2pt,fit=(label2) (decinword1) (decoutword4) (tmp)] (decoder) {};
\node[thick,draw,densely dashed,rounded corners=2pt,inner sep=2pt,fit=(label2) (decinword1) (decoutword4) (tmp)] (decoder) {};
\end{pgfonlayer}
\node[wnode,anchor=north west] () at (encoder.north west) {编码器};
\node[wnode,anchor=north east] () at (decoder.north east) {解码器};
......
......@@ -23,7 +23,7 @@
\chapter{基于循环神经网络的模型}
\parinterval {\small\sffamily\bfseries{神经机器翻译}} \index{神经机器翻译}(Neural Machine Translation)\index{Neural Machine Translation}是机器翻译的前沿方法。近几年,随着深度学习技术的发展和在各领域中的深入应用,基于端到端表示学习的方法正在改变着我们处理自然语言的方式,神经机器翻译在这种趋势下应运而生。一方面,神经机器翻译仍然延续着统计建模和基于数据驱动的思想,因此在基本问题的定义上与前人的研究是一致的;另一方面,神经机器翻译脱离了统计机器翻译中对隐含翻译结构的假设,同时使用分布式表示来对文字序列进行建模,这使得它可以从一个全新的视角看待翻译问题。现在,神经机器翻译已经成为了机器翻译研究及应用的热点,译文质量得到了巨大的提升。
\parinterval {\small\bfnew{神经机器翻译}} \index{神经机器翻译}(Neural Machine Translation)\index{Neural Machine Translation}是机器翻译的前沿方法。近几年,随着深度学习技术的发展和在各领域中的深入应用,基于端到端表示学习的方法正在改变着我们处理自然语言的方式,神经机器翻译在这种趋势下应运而生。一方面,神经机器翻译仍然延续着统计建模和基于数据驱动的思想,因此在基本问题的定义上与前人的研究是一致的;另一方面,神经机器翻译脱离了统计机器翻译中对隐含翻译结构的假设,同时使用分布式表示来对文字序列进行建模,这使得它可以从一个全新的视角看待翻译问题。现在,神经机器翻译已经成为了机器翻译研究及应用的热点,译文质量得到了巨大的提升。
\parinterval 本章将介绍神经机器翻译中的一种基础模型\ \dash \ 基于循环神经网络的模型。该模型是神经机器翻译中最早被成功应用的模型之一。基于这个模型框架,研究人员进行了大量的探索和改进工作,包括使用LSTM等循环单元结构、引入注意力机制等。这些内容都会在本章进行讨论。
......@@ -121,6 +121,8 @@
\parinterval 在很多量化的评价中也可以看到神经机器翻译的优势。回忆一下{\chapterfour}提到的机器翻译质量的自动评估指标中,使用最广泛的一种指标是BLEU。2010年前,在由美国国家标准和科技机构(NIST)举办的汉英机器翻译评测中(比如汉英MT08数据集),30\%以上的BLEU值对于基于统计方法的翻译系统来说就已经是当时最顶尖的结果了。而现在的神经机器翻译系统,则可以轻松地将BLEU提高至45\%以上。
\parinterval 同样,在机器翻译领域中著名评测比赛WMT(Workshop of Machine Translation)中,使用统计机器翻译方法的参赛系统也在逐年减少。而现在获得比赛冠军的系统中几乎没有只使用纯统计机器翻译模型的系统\footnote{但是,仍然有大量的统计机器翻译和神经机器翻译融合的方法。比如,在无指导机器翻译中,统计机器翻译仍然被作为初始模型。} 。图\ref{fig:10-3}展示了近年来WMT比赛冠军系统中神经机器翻译系统的占比,可见神经机器翻译系统的占比在逐年提高。
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -130,17 +132,6 @@
\end{figure}
%----------------------------------------------
\parinterval 同样,在机器翻译领域中著名评测比赛WMT(Workshop of Machine Translation)中,使用统计机器翻译方法的参赛系统也在逐年减少。而现在获得比赛冠军的系统中几乎没有只使用纯统计机器翻译模型的系统\footnote{但是,仍然有大量的统计机器翻译和神经机器翻译融合的方法。比如,在无指导机器翻译中,统计机器翻译仍然被作为初始模型。} 。图\ref{fig:10-3}展示了近年来WMT比赛冠军系统中神经机器翻译系统的占比,可见神经机器翻译系统的占比在逐年提高。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter10/Figures/figure-score-of-mter}
\caption{不同系统在不同长度句子上的mTER[\%]分值(得分越低越好)\upcite{Bentivogli2016NeuralVP}}
\label{fig:10-4}
\end{figure}
%----------------------------------------------
\parinterval 神经机器翻译在其他评价指标上的表现也全面超越统计机器翻译。比如,在IWSLT 2015英语-德语任务中,研究人员搭建了四个较为先进的机器翻译系统\upcite{Bentivogli2016NeuralVP}
\begin{itemize}
......@@ -156,6 +147,15 @@
\parinterval 与这些系统相比,神经机器翻译系统的mTER得分在不同长度句子上都有明显的下降,如图\ref{fig:10-4}\footnote{mTER、HTER等都是是错误率度量,值越低表明译文越好。}。其次,神经机器翻译的单词形态错误率和单词词义错误率(用HTER度量)都远低于统计机器翻译系统(表\ref{tab:10-1} )。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter10/Figures/figure-score-of-mter}
\caption{不同系统在不同长度句子上的mTER[\%]分值(得分越低越好)\upcite{Bentivogli2016NeuralVP}}
\label{fig:10-4}
\end{figure}
%----------------------------------------------
\vspace{0.5em}%全局布局使用
%----------------------------------------------
\begin{table}[htp]
......@@ -325,9 +325,6 @@ NMT & 21.7 & 18.7 & -13.7 \\
\noindent 这里令<eos>(End of Sequence)表示序列的终止,<sos>(Start of Sequence)表示序列的开始。
\parinterval 翻译过程的神经网络结构如图\ref{fig:10-7}所示,其中左边是编码器,右边是解码器。编码器会顺序处理源语言单词,将每个单词都表示成一个实数向量,也就是每个单词的词嵌入结果(绿色方框)。在词嵌入的基础上运行循环神经网络(蓝色方框)。在编码下一个时间步状态的时候,上一个时间步的隐藏状态会作为历史信息传入循环神经网络。这样,句子中每个位置的信息都被向后传递,最后一个时间步的隐藏状态(红色方框)就包含了整个源语言句子的信息,也就得到了编码器的编码结果$\ \dash\ $源语言句子的分布式表示。
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -337,6 +334,8 @@ NMT & 21.7 & 18.7 & -13.7 \\
\end{figure}
%----------------------------------------------
\parinterval 翻译过程的神经网络结构如图\ref{fig:10-7}所示,其中左边是编码器,右边是解码器。编码器会顺序处理源语言单词,将每个单词都表示成一个实数向量,也就是每个单词的词嵌入结果(绿色方框)。在词嵌入的基础上运行循环神经网络(蓝色方框)。在编码下一个时间步状态的时候,上一个时间步的隐藏状态会作为历史信息传入循环神经网络。这样,句子中每个位置的信息都被向后传递,最后一个时间步的隐藏状态(红色方框)就包含了整个源语言句子的信息,也就得到了编码器的编码结果$\ \dash\ $源语言句子的分布式表示。
\parinterval 解码器直接把源语言句子的分布式表示作为输入的隐层状态,之后像编码器一样依次读入目标语言单词,这是一个标准的循环神经网络的执行过程。与编码器不同的是,解码器会有一个输出层,用于根据当前时间步的隐层状态生成目标语言单词及其概率分布。可以看到,解码器当前时刻的输出单词与下一个时刻的输入单词是一样的。从这个角度说,解码器也是一种神经语言模型,只不过它会从另外一种语言(源语言)获得一些信息,而不是仅仅做单语句子的生成。具体来说,当生成第一个单词“I”时,解码器利用了源语言句子表示(红色方框)和目标语言的起始词“<sos>”。在生成第二个单词“am”时,解码器利用了上一个时间步的隐藏状态和已经生成的“I”的信息。这个过程会循环执行,直到生成完整的目标语言句子。
\parinterval 从这个例子可以看出,神经机器翻译的流程其实并不复杂:首先通过编码器神经网络将源语言句子编码成实数向量,然后解码器神经网络利用这个向量逐词生成译文。现在几乎所有的神经机器翻译系统都采用类似的架构。
......@@ -376,6 +375,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
% NEW SECTION 10.3
%----------------------------------------------------------------------------------------
\sectionnewpage
\vspace{-2em}
\section{基于循环神经网络的翻译建模}
\parinterval 早期神经机器翻译的进展主要来自两个方面:1)使用循环神经网络对单词序列进行建模;2)注意力机制的使用。表\ref{tab:10-6}列出了2013-2015年间有代表性的部分研究工作。从这些工作的内容上看,当时的研究重点还是如何有效地使用循环神经网络进行翻译建模以及使用注意力机制捕捉双语单词序列间的对应关系。
......@@ -440,6 +440,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
%----------------------------------------------
\parinterval 从数学模型上看,神经机器翻译模型与统计机器翻译的目标是一样的:在给定源语言句子$\seq{x}$的情况下,找出翻译概率最大的目标语言译文$\hat{\seq{y}}$,其计算如下式:
\vspace{-1em}
\begin{eqnarray}
\hat{\seq{{y}}} &=& \argmax_{\seq{{y}}} \funp{P} (\seq{{y}} | \seq{{x}})
\label{eq:10-1}
......@@ -463,7 +464,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
\vspace{0.5em}
\item 如何在词嵌入的基础上获取整个序列的表示,即句子的表示学习。可以把词嵌入的序列作为循环神经网络的输入,循环神经网络最后一个时刻的输出向量便是整个句子的表示结果。如图\ref{fig:10-10}中,编码器最后一个循环单元的输出$\mathbi{h}_m$被看作是一种包含了源语言句子信息的表示结果,记为$\mathbi{C}$
\vspace{0.5em}
\item 如何得到每个目标语言单词的概率,即译文单词的{\small\sffamily\bfseries{生成}}\index{生成}(Generation)\index{Generation}。与神经语言模型一样,可以用一个Softmax输出层来获取当前时刻所有单词的分布,即利用Softmax 函数计算目标语言词表中每个单词的概率。令目标语言序列$j$时刻的循环神经网络的输出向量(或状态)为$\mathbi{s}_j$。根据循环神经网络的性质,$ y_j$ 的生成只依赖前一个状态$\mathbi{s}_{j-1}$和当前时刻的输入(即词嵌入$\textrm{e}_y (y_{j-1})$)。同时考虑源语言信息$\mathbi{C}$$\funp{P}(y_j | \seq{{y}}_{<j},\seq{{x}})$可以被重新定义为:
\item 如何得到每个目标语言单词的概率,即译文单词的{\small\bfnew{生成}}\index{生成}(Generation)\index{Generation}。与神经语言模型一样,可以用一个Softmax输出层来获取当前时刻所有单词的分布,即利用Softmax 函数计算目标语言词表中每个单词的概率。令目标语言序列$j$时刻的循环神经网络的输出向量(或状态)为$\mathbi{s}_j$。根据循环神经网络的性质,$ y_j$ 的生成只依赖前一个状态$\mathbi{s}_{j-1}$和当前时刻的输入(即词嵌入$\textrm{e}_y (y_{j-1})$)。同时考虑源语言信息$\mathbi{C}$$\funp{P}(y_j | \seq{{y}}_{<j},\seq{{x}})$可以被重新定义为:
\begin{eqnarray}
\funp{P} (y_j | \seq{{y}}_{<j},\seq{{x}}) &=& \funp{P} ( {y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}} )
\label{eq:10-3}
......@@ -479,7 +480,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\end{eqnarray}
\vspace{0.5em}
\end{itemize}
\vspace{-2em}
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -503,7 +504,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\vspace{-1em}
\subsection{长短时记忆网络}
\label{sec:lstm-cell}
......@@ -527,7 +528,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\begin{itemize}
\vspace{0.5em}
\item {\small\sffamily\bfseries{遗忘}}\index{遗忘}。顾名思义,遗忘的目的是忘记一些历史,在LSTM中通过遗忘门实现,其结构如图\ref{fig:10-11}(a)所示。$\mathbi{x}_{t}$表示时刻$t$的输入向量,$\mathbi{h}_{t-1}$是时刻$t-1$的循环单元的输出,$\mathbi{x}_{t}$$\mathbi{h}_{t-1}$都作为$t$时刻循环单元的输入。$\sigma$将对$\mathbi{x}_{t}$$\mathbi{h}_{t-1}$进行筛选,以决定遗忘的信息,其计算如下:
\item {\small\bfnew{遗忘}}\index{遗忘}。顾名思义,遗忘的目的是忘记一些历史,在LSTM中通过遗忘门实现,其结构如图\ref{fig:10-11}(a)所示。$\mathbi{x}_{t}$表示时刻$t$的输入向量,$\mathbi{h}_{t-1}$是时刻$t-1$的循环单元的输出,$\mathbi{x}_{t}$$\mathbi{h}_{t-1}$都作为$t$时刻循环单元的输入。$\sigma$将对$\mathbi{x}_{t}$$\mathbi{h}_{t-1}$进行筛选,以决定遗忘的信息,其计算如下:
\begin{eqnarray}
\mathbi{f}_t &=& \sigma(\mathbi{W}_f [\mathbi{h}_{t-1},\mathbi{x}_{t}] + \mathbi{b}_f )
\label{eq:10-6}
......@@ -535,7 +536,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
这里,$\mathbi{W}_f$是权值,$\mathbi{b}_f$是偏置,$[\mathbi{h}_{t-1},\mathbi{x}_{t}]$表示两个向量的拼接。该公式可以解释为,对$[\mathbi{h}_{t-1},\mathbi{x}_{t}]$进行变换,并得到一个实数向量$\mathbi{f}_t$$\mathbi{f}_t$的每一维都可以被理解为一个“门”,它决定可以有多少信息被留下(或遗忘)。
\vspace{0.5em}
\item {\small\sffamily\bfseries{记忆更新}}\index{记忆更新}。首先,要生成当前时刻需要新增加的信息,该部分由输入门完成,其结构如图\ref{fig:10-11}(b)红色线部分,图中“$\bigotimes$”表示进行点乘操作。输入门的计算分为两部分,首先利用$\sigma$决定门控参数$\mathbi{i}_t$,如公式\eqref{eq:10-7},然后通过Tanh函数得到新的信息$\hat{\mathbi{c}}_t$,如公式\eqref{eq:10-8}
\item {\small\bfnew{记忆更新}}\index{记忆更新}。首先,要生成当前时刻需要新增加的信息,该部分由输入门完成,其结构如图\ref{fig:10-11}(b)红色线部分,图中“$\bigotimes$”表示进行点乘操作。输入门的计算分为两部分,首先利用$\sigma$决定门控参数$\mathbi{i}_t$,如公式\eqref{eq:10-7},然后通过Tanh函数得到新的信息$\hat{\mathbi{c}}_t$,如公式\eqref{eq:10-8}
\begin{eqnarray}
\mathbi{i}_t & = & \sigma (\mathbi{W}_i [\mathbi{h}_{t-1},\mathbi{x}_{t}] + \mathbi{b}_i ) \label{eq:10-7} \\
\hat{\mathbi{c}}_t & = & \textrm{Tanh} (\mathbi{W}_c [\mathbi{h}_{t-1},\mathbi{x}_{t}] + \mathbi{b}_c ) \label{eq:10-8}
......@@ -547,7 +548,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\label{eq:10-9}
\end{eqnarray}
\vspace{-1.0em}
\item {\small\sffamily\bfseries{输出}}\index{输出}。该部分使用输出门计算最终的输出信息$\mathbi{h}_t$,其结构如图\ref{fig:10-11}(d)红色线部分所示。在输出门中,首先将$\mathbi{x}_t$$\mathbi{h}_{t-1}$通过$\sigma$函数变换得到$\mathbi{o}_t$,如公式\eqref{eq:10-10}。其次,将上一步得到的新记忆信息$\mathbi{c}_t$通过Tanh函数进行变换,得到值在[-1,1]范围的向量。最后将这两部分进行点乘,具体如公式\eqref{eq:10-11}
\item {\small\bfnew{输出}}\index{输出}。该部分使用输出门计算最终的输出信息$\mathbi{h}_t$,其结构如图\ref{fig:10-11}(d)红色线部分所示。在输出门中,首先将$\mathbi{x}_t$$\mathbi{h}_{t-1}$通过$\sigma$函数变换得到$\mathbi{o}_t$,如公式\eqref{eq:10-10}。其次,将上一步得到的新记忆信息$\mathbi{c}_t$通过Tanh函数进行变换,得到值在[-1,1]范围的向量。最后将这两部分进行点乘,具体如公式\eqref{eq:10-11}
\begin{eqnarray}
\mathbi{o}_t & = & \sigma (\mathbi{W}_o [\mathbi{h}_{t-1},\mathbi{x}_{t}] + \mathbi{b}_o ) \label{eq:10-10} \\
\mathbi{h}_t & = & \mathbi{o}_t \cdot \textrm{Tanh} (\mathbi{c}_t) \label{eq:10-11}
......@@ -725,7 +726,7 @@ $\funp{P}({y_j | \mathbi{s}_{j-1} ,y_{j-1},\mathbi{C}})$由Softmax实现,Softm
\label{eq:10-16}
\end{eqnarray}
\noindent 其中,$\alpha_{i,j}${\small\sffamily\bfseries{注意力权重}}\index{注意力权重}(Attention Weight)\index{Attention Weight},它表示目标语言第$j$个位置与源语言第$i$个位置之间的相关性大小。这里,将每个时间步编码器的输出$\mathbi{h}_i$ 看作源语言位置$i$的表示结果。进行翻译时,解码器可以根据当前的位置$j$,通过控制不同$\mathbi{h}_i$的权重得到$\mathbi{C}_j$,使得对目标语言位置$j$贡献大的$\mathbi{h}_i$$\mathbi{C}_j$的影响增大。也就是说,$\mathbi{C}_j$实际上就是\{${\mathbi{h}_1,...,\mathbi{h}_m}$\}的一种组合,只不过不同的$\mathbi{h}_i$会根据对目标端的贡献给予不同的权重。图\ref{fig:10-19}展示了上下文向量$\mathbi{C}_j$的计算过程。
\noindent 其中,$\alpha_{i,j}${\small\bfnew{注意力权重}}\index{注意力权重}(Attention Weight)\index{Attention Weight},它表示目标语言第$j$个位置与源语言第$i$个位置之间的相关性大小。这里,将每个时间步编码器的输出$\mathbi{h}_i$ 看作源语言位置$i$的表示结果。进行翻译时,解码器可以根据当前的位置$j$,通过控制不同$\mathbi{h}_i$的权重得到$\mathbi{C}_j$,使得对目标语言位置$j$贡献大的$\mathbi{h}_i$$\mathbi{C}_j$的影响增大。也就是说,$\mathbi{C}_j$实际上就是\{${\mathbi{h}_1,...,\mathbi{h}_m}$\}的一种组合,只不过不同的$\mathbi{h}_i$会根据对目标端的贡献给予不同的权重。图\ref{fig:10-19}展示了上下文向量$\mathbi{C}_j$的计算过程。
%----------------------------------------------
\begin{figure}[htp]
......@@ -1093,18 +1094,17 @@ L(\widehat{\mathbi{Y}},\mathbi{Y}) &=& \sum_{j=1}^n L_{\textrm{ce}}(\hat{\mathbi
\item {\small\bfnew{模型并行}}\index{模型并行}。另一种思路是,把较大的模型分成若干小模型,之后在不同设备上训练小模型。对于循环神经网络,不同层的网络天然就是一个相对独立的模型,因此非常适合使用这种方法。比如,对于$l$层的循环神经网络,把每层都看做一个小模型,然后分发到$l$个设备上并行计算。在序列较长的时候,该方法使其运算时间变为原来的${1}/{l}$。图\ref{fig:10-28}以三层循环网络为例展示了对句子“你\ \ 不错\ 。”进行模型并行的过程。其中,每一层网络都被放到了一个设备上。当模型根据已经生成的第一个词“你”,并预测下一个词时(图\ref{fig:10-28}(a)),同层的下一个时刻的计算和对“你”的第二层的计算就可以同时开展(图\ref{fig:10-28}(b))。以此类推,就完成了模型的并行计算。
\vspace{0.5em}
\end{itemize}
%-------------------------------------------
\begin{figure}[htp]
\centering
\begin{tabular}{l l}
%\begin{figure}[htp]
%\centering
%\begin{tabular}{l l}
%\subfigure[]{\input{./Chapter10/Figures/figure-process01}} &\subfigure[]{\input{./Chapter10/Figures/figure-process02}} \\
%\subfigure[]{\input{./Chapter10/Figures/figure-process03}} &\subfigure[]{\input{./Chapter10/Figures/figure-process04}} \\
%\subfigure[]{\input{./Chapter10/Figures/figure-process05}} &\subfigure[]{\input{./Chapter10/Figures/figure-process06}}\\
\end{tabular}
%\end{tabular}
%\caption{一个三层循环神经网络的模型并行过程}
%\label{fig:10-28}
\end{figure}
%\end{figure}
%----------------------------------------------
%-------------------------------------------
\begin{figure}[htp]
......@@ -1169,8 +1169,6 @@ L(\widehat{\mathbi{Y}},\mathbi{Y}) &=& \sum_{j=1}^n L_{\textrm{ce}}(\hat{\mathbi
\vspace{0.2em}
\parinterval 解码器的每一步Softmax层会输出所有单词的概率,由于是基于贪心的方法,这里会选择概率最大(top-1)的单词作为输出。这个过程可以参考图\ref{fig:10-30}的内容。选择分布中概率最大的单词“Have”作为得到的第一个单词,并再次送入解码器,作为第二步的输入同时预测下一个单词。以此类推,直到生成句子的终止符为止,就得到了完整的译文。
\parinterval 贪婪搜索的优点在于速度快。在对翻译速度有较高要求的场景中,贪婪搜索是一种十分有效的系统加速方法。而且贪婪搜索的原理非常简单,易于快速实现。不过,由于每一步只保留一个最好的局部结果,贪婪搜索往往会带来翻译品质上的损失。
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -1180,6 +1178,8 @@ L(\widehat{\mathbi{Y}},\mathbi{Y}) &=& \sum_{j=1}^n L_{\textrm{ce}}(\hat{\mathbi
\end{figure}
%----------------------------------------------
\parinterval 贪婪搜索的优点在于速度快。在对翻译速度有较高要求的场景中,贪婪搜索是一种十分有效的系统加速方法。而且贪婪搜索的原理非常简单,易于快速实现。不过,由于每一步只保留一个最好的局部结果,贪婪搜索往往会带来翻译品质上的损失。
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
......
......@@ -2,32 +2,32 @@
\begin{tikzpicture}[node distance = 0cm]
\node(num1)[num,fill=red!10]{1};
\node(num2)[num,below of = num1,yshift= -0.6cm,fill=red!10]{5};
\node(num3)[num,right of = num1,xshift= 0.6cm,fill=red!10]{0};
\node(num4)[num,below of = num3,yshift= -0.6cm,fill=red!10]{6};
\node(num5)[num,right of = num3,xshift= 0.6cm,fill=green!10]{4};
\node(num6)[num,below of = num5,yshift= -0.6cm,fill=green!10]{7};
\node(num7)[num,right of = num5,xshift= 0.6cm,fill=green!10]{5};
\node(num8)[num,below of = num7,yshift= -0.6cm,fill=green!10]{8};
\node(num9)[num,below of = num2,yshift= -0.6cm,fill=yellow!10]{3};
\node(num10)[num,below of = num9,yshift= -0.6cm,fill=yellow!10]{1};
\node(num11)[num,right of = num9,xshift= 0.6cm,fill=yellow!10]{2};
\node(num12)[num,below of = num11,yshift= -0.6cm,fill=yellow!10]{2};
\node(num13)[num,right of = num11,xshift= 0.6cm,fill=blue!10]{1};
\node(num14)[num,below of = num13,yshift= -0.6cm,fill=blue!10]{3};
\node(num10)[num,right of = num13,xshift= 0.6cm,fill=blue!10]{0};
\node(num16)[num,below of = num10,yshift= -0.6cm,fill=blue!10]{4};
\node(num1)[num,fill=red!20]{1};
\node(num2)[num,below of = num1,yshift= -0.6cm,fill=red!20]{5};
\node(num3)[num,right of = num1,xshift= 0.6cm,fill=red!20]{0};
\node(num4)[num,below of = num3,yshift= -0.6cm,fill=red!20]{6};
\node(num5)[num,right of = num3,xshift= 0.6cm,fill=green!20]{4};
\node(num6)[num,below of = num5,yshift= -0.6cm,fill=green!20]{7};
\node(num7)[num,right of = num5,xshift= 0.6cm,fill=green!20]{5};
\node(num8)[num,below of = num7,yshift= -0.6cm,fill=green!20]{8};
\node(num9)[num,below of = num2,yshift= -0.6cm,fill=yellow!20]{3};
\node(num20)[num,below of = num9,yshift= -0.6cm,fill=yellow!20]{1};
\node(num11)[num,right of = num9,xshift= 0.6cm,fill=yellow!20]{2};
\node(num12)[num,below of = num11,yshift= -0.6cm,fill=yellow!20]{2};
\node(num13)[num,right of = num11,xshift= 0.6cm,fill=blue!20]{1};
\node(num14)[num,below of = num13,yshift= -0.6cm,fill=blue!20]{3};
\node(num20)[num,right of = num13,xshift= 0.6cm,fill=blue!20]{0};
\node(num16)[num,below of = num20,yshift= -0.6cm,fill=blue!20]{4};
\draw[->,thick]([xshift=0.4cm,yshift=-0.4cm]num8.east)--([xshift=1.5cm,yshift=-0.4cm]num8.east);
\node(num17)[num,right of = num8,xshift= 2.5cm,fill=red!10]{3};
\node(num18)[num,right of = num17,xshift= 0.6cm,fill=green!10]{6};
\node(num19)[num,below of = num17,yshift=-0.6cm,fill=yellow!10]{2};
\node(num20)[num,below of = num18,yshift= -0.6cm,fill=blue!10]{2};
\node(num17)[num,right of = num8,xshift= 2.5cm,fill=red!20]{3};
\node(num18)[num,right of = num17,xshift= 0.6cm,fill=green!20]{6};
\node(num19)[num,below of = num17,yshift=-0.6cm,fill=yellow!20]{2};
\node(num20)[num,below of = num18,yshift= -0.6cm,fill=blue!20]{2};
\node [right of = num2,xshift= -0.7cm]{};
......
\usetikzlibrary{decorations.pathreplacing}
\tikzstyle{num} = [rectangle, minimum width = 0.8cm, minimum height = 0.8cm, text centered,align=center,thick,draw = black,fill=green!5]
\tikzstyle{num} = [rectangle, minimum width = 0.8cm, minimum height = 0.8cm, text centered,align=center,thick,draw = black,fill=green!15]
\tikzstyle{att} = [rectangle, minimum width = 0.8cm, minimum height = 0.8cm, text centered,align=center]
......
......@@ -30,7 +30,7 @@
\node [anchor=north,font=\tiny,scale=1.1] at ([yshift=-0.2em]t2.south) {深度卷积};
\end{scope}
\begin{scope}[xshift=4cm]
\begin{scope}[xshift=5cm]
\foreach \point in {0,1,2,3}{
\node[unit] (l1_\point) at (0, 0+\point*1.5em){};
\node[unit] (l2_\point) at (0, 8em+\point*1.5em){};
......
\tikzstyle{num} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=green!10]
\tikzstyle{num} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=green!20]
\begin{tikzpicture}[node distance = 0]
\node[num] at (0,0){1};
......@@ -43,7 +43,7 @@
\node[num] at (2.4,3){2};
\node[num] at (3,3){1};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=purple!40,line width=0.08cm,fill=purple!40,fill opacity=0.4] at (0.6,2.4) {};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=purple!40,line width=0.08cm,fill=purple!50,fill opacity=0.4] at (0.6,2.4) {};
%\fill (4,1.5) circle (2pt);
\node [] at (4,1.5) {*};
......
\begin{tikzpicture}[node distance = 0]
\foreach \x in {0,1,...,6}{
\draw[color=ublue,fill=blue!15,thick]( \x,0 )circle( 0.3);
\draw[color=ublue,fill=blue!25,thick]( \x,0 )circle( 0.3);
}
\foreach \x in {1,2,...,5}{
\draw[color=ublue,thick]( \x,2)circle( 0.3);
......
\begin{tikzpicture}[node distance = 0]
\foreach \x in {0,1,...,6}{
\draw[color=ublue,fill=blue!15,thick]( \x,0 )circle( 0.3);
\draw[color=ublue,fill=blue!25,thick]( \x,0 )circle( 0.3);
}
\foreach \x in {1,2,...,5}{
\draw[color=ublue,thick]( \x,2)circle( 0.3);
......
......@@ -2,32 +2,32 @@
\begin{tikzpicture}[node distance = 0cm]
\node(num1)[num,fill=red!10]{1};
\node(num2)[num,below of = num1,yshift= -0.6cm,fill=red!10]{5};
\node(num3)[num,right of = num1,xshift= 0.6cm,fill=red!10]{0};
\node(num4)[num,below of = num3,yshift= -0.6cm,fill=red!10]{6};
\node(num5)[num,right of = num3,xshift= 0.6cm,fill=green!10]{4};
\node(num6)[num,below of = num5,yshift= -0.6cm,fill=green!10]{7};
\node(num7)[num,right of = num5,xshift= 0.6cm,fill=green!10]{5};
\node(num8)[num,below of = num7,yshift= -0.6cm,fill=green!10]{8};
\node(num9)[num,below of = num2,yshift= -0.6cm,fill=yellow!10]{3};
\node(num10)[num,below of = num9,yshift= -0.6cm,fill=yellow!10]{1};
\node(num11)[num,right of = num9,xshift= 0.6cm,fill=yellow!10]{2};
\node(num12)[num,below of = num11,yshift= -0.6cm,fill=yellow!10]{2};
\node(num13)[num,right of = num11,xshift= 0.6cm,fill=blue!10]{1};
\node(num14)[num,below of = num13,yshift= -0.6cm,fill=blue!10]{3};
\node(num10)[num,right of = num13,xshift= 0.6cm,fill=blue!10]{0};
\node(num16)[num,below of = num10,yshift= -0.6cm,fill=blue!10]{4};
\node(num1)[num,fill=red!20]{1};
\node(num2)[num,below of = num1,yshift= -0.6cm,fill=red!20]{5};
\node(num3)[num,right of = num1,xshift= 0.6cm,fill=red!20]{0};
\node(num4)[num,below of = num3,yshift= -0.6cm,fill=red!20]{6};
\node(num5)[num,right of = num3,xshift= 0.6cm,fill=green!20]{4};
\node(num6)[num,below of = num5,yshift= -0.6cm,fill=green!20]{7};
\node(num7)[num,right of = num5,xshift= 0.6cm,fill=green!20]{5};
\node(num8)[num,below of = num7,yshift= -0.6cm,fill=green!20]{8};
\node(num9)[num,below of = num2,yshift= -0.6cm,fill=yellow!20]{3};
\node(num10)[num,below of = num9,yshift= -0.6cm,fill=yellow!20]{1};
\node(num11)[num,right of = num9,xshift= 0.6cm,fill=yellow!20]{2};
\node(num12)[num,below of = num11,yshift= -0.6cm,fill=yellow!20]{2};
\node(num13)[num,right of = num11,xshift= 0.6cm,fill=blue!20]{1};
\node(num14)[num,below of = num13,yshift= -0.6cm,fill=blue!20]{3};
\node(num10)[num,right of = num13,xshift= 0.6cm,fill=blue!20]{0};
\node(num16)[num,below of = num10,yshift= -0.6cm,fill=blue!20]{4};
\draw[->,thick]([xshift=0.4cm,yshift=-0.4cm]num8.east)--([xshift=1.5cm,yshift=-0.4cm]num8.east);
\node(num17)[num,right of = num8,xshift= 2.5cm,fill=red!10]{6};
\node(num18)[num,right of = num17,xshift= 0.6cm,fill=green!10]{8};
\node(num19)[num,below of = num17,yshift=-0.6cm,fill=yellow!10]{3};
\node(num20)[num,below of = num18,yshift= -0.6cm,fill=blue!10]{4};
\node(num17)[num,right of = num8,xshift= 2.5cm,fill=red!20]{6};
\node(num18)[num,right of = num17,xshift= 0.6cm,fill=green!20]{8};
\node(num19)[num,below of = num17,yshift=-0.6cm,fill=yellow!20]{3};
\node(num20)[num,below of = num18,yshift= -0.6cm,fill=blue!20]{4};
\node [right of = num20,xshift= 0.7cm]{};
......
\tikzstyle{num} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=green!10]
\tikzstyle{pad} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=blue!10]
\tikzstyle{num} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=green!20]
\tikzstyle{pad} = [minimum width = 0.6cm,minimum height = 0.6cm,draw,fill=blue!20]
\begin{tikzpicture}[node distance = 0]
\node[pad] at (-0.6,-0.6){0};
......@@ -74,8 +74,8 @@
\node[pad] at (3,3.6){0};
\node[pad] at (3.6,3.6){0};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=purple!40,line width=0.08cm,fill=purple!40,fill opacity=0.4] at (0,3) {};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=orange!40,line width=0.08cm,fill=orange!40,fill opacity=0.4] at (0.6,2.4) {};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=purple!40,line width=0.08cm,fill=purple!50,fill opacity=0.4] at (0,3) {};
\node[minimum width = 1.8cm,minimum height = 1.8cm,draw=orange!40,line width=0.08cm,fill=orange!50,fill opacity=0.4] at (0.6,2.4) {};
%\fill (4.55,1.5) circle (2pt);
\node [] at (4.55,1.5) {*};
......
......@@ -6,16 +6,16 @@
\tikzstyle{word} = [inner sep=0pt, font=\scriptsize,minimum height=\bd]
%\draw[fill=blue!8,xshift=0.3cm,yshift=0.5cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+6*\bd,0cm+9*\bd);
\draw[fill=cyan!10] (0cm,0cm) -- (4*\bd,0cm) -- (4*\bd,8*\bd) -- (0cm, 8*\bd) -- (0cm,0cm);
\draw[fill=cyan!20] (0cm,0cm) -- (4*\bd,0cm) -- (4*\bd,8*\bd) -- (0cm, 8*\bd) -- (0cm,0cm);
\draw[cyan,step=\bd,line width=0.8pt] (0cm,0cm) grid (0cm+4*\bd,0cm+8*\bd);
\draw[fill=red!5] (0cm,1*\bd) -- (4*\bd,1*\bd) -- (4*\bd,7*\bd) -- (0cm, 7*\bd) -- (0cm,1*\bd);
\draw[red!50,step=\bd,xshift=0cm,yshift=\bd, line width=0.8pt] (0cm, 0cm) grid (0cm+4*\bd,0cm+6*\bd);
\draw[fill=ugreen!5] (7*\bd,\bd) -- (11*\bd,\bd) -- (11*\bd,7*\bd) -- (7*\bd, 7*\bd) -- (7*\bd,\bd);
\draw[ugreen!60,step=\bd,xshift=7*\bd,yshift=\bd, line width=0.8pt] (0cm, 0cm) grid (0cm+4*\bd,0cm+6*\bd);
\draw[fill=ugreen!5] (11.5*\bd,\bd) -- (15.5*\bd,\bd) -- (15.5*\bd,7*\bd) -- (11.5*\bd, 7*\bd) -- (11.5*\bd,\bd);
\draw[ugreen!60,step=\bd,xshift=11.5*\bd,yshift=\bd, line width=0.8pt] (0cm, 0cm) grid (0cm+4*\bd,0cm+6*\bd);
\draw[fill=blue!5] (18.5*\bd,\bd) -- (22.5*\bd,\bd) -- (22.5*\bd,7*\bd) -- (18.5*\bd, 7*\bd) -- (18.5*\bd,\bd);
\draw[blue!50,step=\bd,xshift=18.5*\bd,yshift=\bd, line width=0.8pt] (0cm, 0cm) grid (0cm+4*\bd,0cm+6*\bd);
\draw[fill=red!10] (0cm,1*\bd) -- (4*\bd,1*\bd) -- (4*\bd,7*\bd) -- (0cm, 7*\bd) -- (0cm,1*\bd);
\draw[red!60,step=\bd,xshift=0cm,yshift=\bd, line width=0.8pt] (0cm, 0cm) grid (0cm+4*\bd,0cm+6*\bd);
\draw[fill=ugreen!10] (7*\bd,\bd) -- (11*\bd,\bd) -- (11*\bd,7*\bd) -- (7*\bd, 7*\bd) -- (7*\bd,\bd);
\draw[ugreen!70,step=\bd,xshift=7*\bd,yshift=\bd, line width=0.8pt] (0cm, 0cm) grid (0cm+4*\bd,0cm+6*\bd);
\draw[fill=ugreen!10] (11.5*\bd,\bd) -- (15.5*\bd,\bd) -- (15.5*\bd,7*\bd) -- (11.5*\bd, 7*\bd) -- (11.5*\bd,\bd);
\draw[ugreen!70,step=\bd,xshift=11.5*\bd,yshift=\bd, line width=0.8pt] (0cm, 0cm) grid (0cm+4*\bd,0cm+6*\bd);
\draw[fill=blue!10] (18.5*\bd,\bd) -- (22.5*\bd,\bd) -- (22.5*\bd,7*\bd) -- (18.5*\bd, 7*\bd) -- (18.5*\bd,\bd);
\draw[blue!60,step=\bd,xshift=18.5*\bd,yshift=\bd, line width=0.8pt] (0cm, 0cm) grid (0cm+4*\bd,0cm+6*\bd);
\draw[-,line width=0.6pt] (4*\bd, 0.5*\bd) -- (5.5*\bd, 1.5*\bd) -- (4*\bd, 2.5*\bd);
......@@ -44,7 +44,7 @@
\node[word] (w7) at ([yshift=-\bd]w6) {};
\node[word] (w8) at ([yshift=-\bd]w7) {$<$p$>$};
\node[inner sep=2pt,word,fill=green!5,draw=ugreen,dotted,very thick,minimum width=6em,align=center,minimum height=4em] (sub) at ([xshift=11cm,yshift=-0.1cm]w1.east) {
\node[inner sep=2pt,word,fill=green!10,draw=ugreen,dotted,very thick,minimum width=6em,align=center,minimum height=4em] (sub) at ([xshift=11cm,yshift=-0.1cm]w1.east) {
\begin{tabular}{r l}
$<$p$>$& 填充 \\
$\sigma$& sigmoid函数 \\
......
\tikzstyle{input} = [rectangle, minimum width = 1cm, minimum height = 3cm, text centered]
\tikzstyle{output} = [rectangle, minimum width = 1cm, minimum height = 3cm, text centered]
\tikzstyle{convolution} = [rectangle, minimum width = 0.7cm, minimum height = 2cm, text centered, fill = red!10, draw = black, thick]
\tikzstyle{activation} = [rectangle, minimum width = 0.7cm, minimum height = 2cm, text centered, fill = blue!10, draw = black, thick]
\tikzstyle{pooling} = [rectangle, thick, minimum width = 0.7cm, minimum height = 2cm, text centered, draw = black, fill = ugreen!10]
\tikzstyle{input} = [rectangle, minimum width = 1.5cm, minimum height = 3cm, text centered]
\tikzstyle{output} = [rectangle, minimum width = 1.5cm, minimum height = 3cm, text centered]
\tikzstyle{convolution} = [rectangle, minimum width = 1cm, minimum height = 2cm, text centered, fill = red!20, draw = black, thick]
\tikzstyle{activation} = [rectangle, minimum width = 1cm, minimum height = 2cm, text centered, fill = blue!20, draw = black, thick]
\tikzstyle{pooling} = [rectangle, thick, minimum width = 1cm, minimum height = 2cm, text centered, draw = black, fill = ugreen!20]
\tikzstyle{arrow} = [thick, ->, >=stealth]
\begin{tikzpicture}[node distance = 0cm]
\node(input)[input, align=center]{\\};
\node(convolution)[convolution,right of = input,xshift = 2cm, align=center]{\\\\};
\node(activation)[activation,right of = convolution,xshift = 2cm, align=center]{\\\\\\};
\node(pooling)[pooling,right of = activation,xshift = 2cm, align=center]{\\\\};
\node(output)[output,right of = pooling,xshift= 2cm, align=center]{\\};
\node(convolution)[convolution,right of = input,xshift = 2.5cm, align=center]{\\\\};
\node(activation)[activation,right of = convolution,xshift = 2.5cm, align=center]{\\\\\\};
\node(pooling)[pooling,right of = activation,xshift = 2.5cm, align=center]{\\\\};
\node(output)[output,right of = pooling,xshift= 2.5cm, align=center]{\\};
\draw [arrow] (input) -- ([xshift=-0.15cm]convolution.180);
\draw [arrow] ([xshift=0.15cm]convolution.0) -- ([xshift=-0.15cm]activation.180);
......
......@@ -43,7 +43,7 @@
\node [anchor=north,font=\tiny,scale=1.1] at ([yshift=-0.2em]t1.south) {(a) 标准卷积};
\end{scope}
\begin{scope}[xshift=4cm]
\begin{scope}[xshift=5cm]
\foreach \point in {0,1,2}{
\node[unit] (l1_\point) at (0, 0+\point*1.5em){};
\node[unit] (l2_\point) at (0, 6em+\point*1.5em){};
......@@ -77,7 +77,7 @@
\node [anchor=north,font=\tiny,scale=1.1] at ([yshift=-0.2em]t2.south) {(b) 深度卷积};
\end{scope}
\begin{scope}[xshift=8cm]
\begin{scope}[xshift=10cm]
\foreach \point in {0,1,2}{
\node[unit] (l1_\point) at (0, 0+\point*1.5em){};
\node[unit] (l2_\point) at (0, 6em+\point*1.5em){};
......
......@@ -4,13 +4,13 @@
\begin{tikzpicture}[node distance = 0cm]
\node(num1)[num]{$\mathbi{e}_1$};
\node(num2)[num,right of = num1,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_2$}};
\node(num3)[num,right of = num2,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_3$}};
\node(num4)[num,right of = num3,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_4$}};
\node(num5)[num,right of = num4,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_5$}};
\node(num6)[num,right of = num5,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_6$}};
\node(num7)[num,right of = num6,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_7$}};
\node(num8)[num,right of = num7,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_8$}};
\node(num2)[num,right of = num1,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_2$}};
\node(num3)[num,right of = num2,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_3$}};
\node(num4)[num,right of = num3,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_4$}};
\node(num5)[num,right of = num4,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_5$}};
\node(num6)[num,right of = num5,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_6$}};
\node(num7)[num,right of = num6,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_7$}};
\node(num8)[num,right of = num7,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_8$}};
\node(num9)[num,right of = num8,xshift = 1.2cm]{$\mathbi{e}_9$};
%\node(A)[below of = num2,yshift = -0.6cm]{A};
%\node(B)[below of = num8,yshift = -0.6cm]{B};
......@@ -23,8 +23,8 @@
\draw [->, thick, color = blue!80](num6.east)--(num7.west);
\draw [->, thick, color = blue!80](num7.east)--(num8.west);
\draw [->,thick,color = black!70] (num1) -- (num2);
\draw [->,thick,color =black!70] (num8) -- (num9);
\draw [->,thick,color = black!85] (num1) -- (num2);
\draw [->,thick,color =black!85] (num8) -- (num9);
\end{tikzpicture}
\ No newline at end of file
......@@ -4,13 +4,13 @@
\begin{tikzpicture}[node distance = 0cm]
\node(num1_0)[num, fill = blue!40]{\textcolor{white}{$\mathbi{0}$}};
\node(num1_1)[num,right of = num1_0,xshift = 1.2cm]{$\mathbi{e}_1$};
\node(num1_2)[num,right of = num1_1,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_2$}};
\node(num1_3)[num,right of = num1_2,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_3$}};
\node(num1_4)[num,right of = num1_3,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_4$}};
\node(num1_5)[num,right of = num1_4,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_5$}};
\node(num1_6)[num,right of = num1_5,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_6$}};
\node(num1_7)[num,right of = num1_6,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_7$}};
\node(num1_8)[num,right of = num1_7,xshift = 1.2cm]{\textcolor{blue!70}{$\mathbi{e}_8$}};
\node(num1_2)[num,right of = num1_1,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_2$}};
\node(num1_3)[num,right of = num1_2,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_3$}};
\node(num1_4)[num,right of = num1_3,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_4$}};
\node(num1_5)[num,right of = num1_4,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_5$}};
\node(num1_6)[num,right of = num1_5,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_6$}};
\node(num1_7)[num,right of = num1_6,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_7$}};
\node(num1_8)[num,right of = num1_7,xshift = 1.2cm]{\textcolor{blue!85}{$\mathbi{e}_8$}};
\node(num1_9)[num,right of = num1_8,xshift = 1.2cm]{$\mathbi{e}_9$};
\node(num1_10)[num,right of = num1_9,xshift = 1.2cm, fill = blue!40]{$\mathbi{0}$};
%\node(A)[below of = num2,yshift = -0.6cm]{A};
......@@ -19,11 +19,11 @@
\node(num2_0)[num,above of = num1_0,yshift = 1.2cm, fill = blue!40]{\textcolor{white}{$\mathbi{0}$}};
\node(num2_1)[num,right of = num2_0,xshift = 1.2cm]{\textbf2};
\node(num2_2)[num,right of = num2_1,xshift = 1.2cm]{\textbf2};
\node(num2_3)[num,right of = num2_2,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
\node(num2_4)[num,right of = num2_3,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
\node(num2_5)[num,right of = num2_4,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
\node(num2_6)[num,right of = num2_5,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
\node(num2_7)[num,right of = num2_6,xshift = 1.2cm]{\textbf{\textcolor{blue!70}2}};
\node(num2_3)[num,right of = num2_2,xshift = 1.2cm]{\textbf{\textcolor{blue!85}2}};
\node(num2_4)[num,right of = num2_3,xshift = 1.2cm]{\textbf{\textcolor{blue!85}2}};
\node(num2_5)[num,right of = num2_4,xshift = 1.2cm]{\textbf{\textcolor{blue!85}2}};
\node(num2_6)[num,right of = num2_5,xshift = 1.2cm]{\textbf{\textcolor{blue!85}2}};
\node(num2_7)[num,right of = num2_6,xshift = 1.2cm]{\textbf{\textcolor{blue!85}2}};
\node(num2_8)[num,right of = num2_7,xshift = 1.2cm]{\textbf2};
\node(num2_9)[num,right of = num2_8,xshift = 1.2cm]{\textbf2};
\node(num2_10)[num,right of = num2_9,xshift = 1.2cm, fill = blue!40]{$\mathbi{0}$};
......@@ -32,9 +32,9 @@
\node(num3_1)[num,right of = num3_0,xshift = 1.2cm]{\textbf3};
\node(num3_2)[num,right of = num3_1,xshift = 1.2cm]{\textbf3};
\node(num3_3)[num,right of = num3_2,xshift = 1.2cm]{\textbf3};
\node(num3_4)[num,right of = num3_3,xshift = 1.2cm]{\textbf{\textcolor{blue!70}3}};
\node(num3_5)[num,right of = num3_4,xshift = 1.2cm]{\textbf{\textcolor{blue!70}3}};
\node(num3_6)[num,right of = num3_5,xshift = 1.2cm]{\textbf{\textcolor{blue!70}3}};
\node(num3_4)[num,right of = num3_3,xshift = 1.2cm]{\textbf{\textcolor{blue!85}3}};
\node(num3_5)[num,right of = num3_4,xshift = 1.2cm]{\textbf{\textcolor{blue!85}3}};
\node(num3_6)[num,right of = num3_5,xshift = 1.2cm]{\textbf{\textcolor{blue!85}3}};
\node(num3_7)[num,right of = num3_6,xshift = 1.2cm]{\textbf3};
\node(num3_8)[num,right of = num3_7,xshift = 1.2cm]{\textbf3};
\node(num3_9)[num,right of = num3_8,xshift = 1.2cm]{\textbf3};
......@@ -45,7 +45,7 @@
\node(num4_2)[num,right of = num4_1,xshift = 1.2cm]{\textbf4};
\node(num4_3)[num,right of = num4_2,xshift = 1.2cm]{\textbf4};
\node(num4_4)[num,right of = num4_3,xshift = 1.2cm]{\textbf4};
\node(num4_5)[num,right of = num4_4,xshift = 1.2cm]{\textbf{\textcolor{blue!60}4}};
\node(num4_5)[num,right of = num4_4,xshift = 1.2cm]{\textbf{\textcolor{blue!80}4}};
\node(num4_6)[num,right of = num4_5,xshift = 1.2cm]{\textbf4};
\node(num4_7)[num,right of = num4_6,xshift = 1.2cm]{\textbf4};
\node(num4_8)[num,right of = num4_7,xshift = 1.2cm]{\textbf4};
......@@ -58,19 +58,19 @@
\draw [->, thick](num1_1.north)--([xshift=-0.1em,yshift=-0.1em]num2_2.south);
\draw [->, thick](num2_1.north)--([xshift=-0.1em,yshift=-0.1em]num3_2.south);
\draw [->, thick](num3_1.north)--([xshift=-0.1em,yshift=-0.1em]num4_2.south);
\draw [->, thick, color = blue!60](num1_2.north)--([xshift=-0.1em,yshift=-0.1em]num2_3.south);
\draw [->, thick, color = blue!80](num1_2.north)--([xshift=-0.1em,yshift=-0.1em]num2_3.south);
\draw [->, thick](num2_2.north)--([xshift=-0.1em,yshift=-0.1em]num3_3.south);
\draw [->, thick](num3_2.north)--([xshift=-0.1em,yshift=-0.1em]num4_3.south);
\draw [->, thick, color = blue!60](num1_3.north)--([xshift=-0.1em,yshift=-0.1em]num2_4.south);
\draw [->, thick, color = blue!60](num2_3.north)--([xshift=-0.1em,yshift=-0.1em]num3_4.south);
\draw [->, thick, color = blue!80](num1_3.north)--([xshift=-0.1em,yshift=-0.1em]num2_4.south);
\draw [->, thick, color = blue!80](num2_3.north)--([xshift=-0.1em,yshift=-0.1em]num3_4.south);
\draw [->, thick](num3_3.north)--([xshift=-0.1em,yshift=-0.1em]num4_4.south);
\draw [->, thick, color = blue!60](num1_4.north)--([xshift=-0.1em,yshift=-0.1em]num2_5.south);
\draw [->, thick, color = blue!60](num2_4.north)--([xshift=-0.1em,yshift=-0.1em]num3_5.south);
\draw [->, thick, color = blue!60](num3_4.north)--([xshift=-0.1em,yshift=-0.1em]num4_5.south);
\draw [->, thick, color = blue!60](num1_5.north)--([xshift=-0.1em,yshift=-0.1em]num2_6.south);
\draw [->, thick, color = blue!60](num2_5.north)--([xshift=-0.1em,yshift=-0.1em]num3_6.south);
\draw [->, thick, color = blue!80](num1_4.north)--([xshift=-0.1em,yshift=-0.1em]num2_5.south);
\draw [->, thick, color = blue!80](num2_4.north)--([xshift=-0.1em,yshift=-0.1em]num3_5.south);
\draw [->, thick, color = blue!80](num3_4.north)--([xshift=-0.1em,yshift=-0.1em]num4_5.south);
\draw [->, thick, color = blue!80](num1_5.north)--([xshift=-0.1em,yshift=-0.1em]num2_6.south);
\draw [->, thick, color = blue!80](num2_5.north)--([xshift=-0.1em,yshift=-0.1em]num3_6.south);
\draw [->, thick](num3_5.north)--([xshift=-0.1em,yshift=-0.1em]num4_6.south);
\draw [->, thick, color = blue!60](num1_6.north)--([xshift=-0.1em,yshift=-0.1em]num2_7.south);
\draw [->, thick, color = blue!80](num1_6.north)--([xshift=-0.1em,yshift=-0.1em]num2_7.south);
\draw [->, thick](num2_6.north)--([xshift=-0.1em,yshift=-0.1em]num3_7.south);
\draw [->, thick](num3_6.north)--([xshift=-0.1em,yshift=-0.1em]num4_7.south);
\draw [->, thick](num1_7.north)--([xshift=-0.1em,yshift=-0.1em]num2_8.south);
......@@ -86,19 +86,19 @@
\draw [->, thick](num1_3.north)--([xshift=0.1em,yshift=-0.1em]num2_2.south);
\draw [->, thick](num2_3.north)--([xshift=0.1em,yshift=-0.1em]num3_2.south);
\draw [->, thick](num3_3.north)--([xshift=0.1em,yshift=-0.1em]num4_2.south);
\draw [->, thick, color = blue!60](num1_4.north)--([xshift=0.1em,yshift=-0.1em]num2_3.south);
\draw [->, thick, color = blue!80](num1_4.north)--([xshift=0.1em,yshift=-0.1em]num2_3.south);
\draw [->, thick](num2_4.north)--([xshift=0.1em,yshift=-0.1em]num3_3.south);
\draw [->, thick](num3_4.north)--([xshift=0.1em,yshift=-0.1em]num4_3.south);
\draw [->, thick, color = blue!60](num1_5.north)--([xshift=0.1em,yshift=-0.1em]num2_4.south);
\draw [->, thick, color = blue!60](num2_5.north)--([xshift=0.1em,yshift=-0.1em]num3_4.south);
\draw [->, thick, color = blue!80](num1_5.north)--([xshift=0.1em,yshift=-0.1em]num2_4.south);
\draw [->, thick, color = blue!80](num2_5.north)--([xshift=0.1em,yshift=-0.1em]num3_4.south);
\draw [->, thick](num3_5.north)--([xshift=0.1em,yshift=-0.1em]num4_4.south);
\draw [->, thick, color = blue!60](num1_6.north)--([xshift=0.1em,yshift=-0.1em]num2_5.south);
\draw [->, thick, color = blue!60](num2_6.north)--([xshift=0.1em,yshift=-0.1em]num3_5.south);
\draw [->, thick, color = blue!60](num3_6.north)--([xshift=0.1em,yshift=-0.1em]num4_5.south);
\draw [->, thick, color = blue!60](num1_7.north)--([xshift=0.1em,yshift=-0.1em]num2_6.south);
\draw [->, thick, color = blue!60](num2_7.north)--([xshift=0.1em,yshift=-0.1em]num3_6.south);
\draw [->, thick, color = blue!80](num1_6.north)--([xshift=0.1em,yshift=-0.1em]num2_5.south);
\draw [->, thick, color = blue!80](num2_6.north)--([xshift=0.1em,yshift=-0.1em]num3_5.south);
\draw [->, thick, color = blue!80](num3_6.north)--([xshift=0.1em,yshift=-0.1em]num4_5.south);
\draw [->, thick, color = blue!80](num1_7.north)--([xshift=0.1em,yshift=-0.1em]num2_6.south);
\draw [->, thick, color = blue!80](num2_7.north)--([xshift=0.1em,yshift=-0.1em]num3_6.south);
\draw [->, thick](num3_7.north)--([xshift=0.1em,yshift=-0.1em]num4_6.south);
\draw [->, thick, color = blue!60](num1_8.north)--([xshift=0.1em,yshift=-0.1em]num2_7.south);
\draw [->, thick, color = blue!80](num1_8.north)--([xshift=0.1em,yshift=-0.1em]num2_7.south);
\draw [->, thick](num2_8.north)--([xshift=0.1em,yshift=-0.1em]num3_7.south);
\draw [->, thick](num3_8.north)--([xshift=0.1em,yshift=-0.1em]num4_7.south);
\draw [->, thick](num1_9.north)--([xshift=0.1em,yshift=-0.1em]num2_8.south);
......
......@@ -5,41 +5,41 @@
\begin{scope}
%\tikzstyle{every node}=[scale=0.8]
\tikzstyle{line} = [dash pattern=on 2pt off 1pt,line width=0.5pt]
\tikzstyle{cir} = [thin,fill=blue!8,draw,circle,minimum size =0.5em,drop shadow={shadow xshift=0.15em, shadow yshift=-0.1em}]
\tikzstyle{cir} = [thin,fill=blue!15,draw,circle,minimum size =0.5em,drop shadow={shadow xshift=0.15em, shadow yshift=-0.1em}]
\tikzstyle{word} = [inner sep=0pt, font=\scriptsize,minimum height=\bcc]
\draw[fill=red!8,line width=0.2pt] (0cm,0cm+1*\bcc) rectangle (0cm+4*\bcc,0cm+7*\bcc);
\draw[fill=cyan!14,line width=0.2pt] (0cm,0cm) rectangle (0cm+4*\bcc,0cm+1*\bcc);
\draw[fill=cyan!14,line width=0.2pt] (0cm,0cm+7*\bcc) rectangle (0cm+4*\bcc,0cm+8*\bcc);
\draw[fill=red!15,line width=0.2pt] (0cm,0cm+1*\bcc) rectangle (0cm+4*\bcc,0cm+7*\bcc);
\draw[fill=cyan!20,line width=0.2pt] (0cm,0cm) rectangle (0cm+4*\bcc,0cm+1*\bcc);
\draw[fill=cyan!20,line width=0.2pt] (0cm,0cm+7*\bcc) rectangle (0cm+4*\bcc,0cm+8*\bcc);
\draw[step=\bcc] (0cm,0cm) grid (0cm+4*\bcc,0cm+8*\bcc);
%\draw[line width=0.7pt] (0cm,0cm) rectangle (0cm+4*\bcc,0cm+8*\bcc);
\draw[red!50,line width=1.8pt] (0cm,0cm+5*\bcc) rectangle (0cm+4*\bcc,0cm+8*\bcc);
\draw[ugreen!50,line width=1.8pt] (0cm,0cm+1*\bcc) rectangle (0cm+4*\bcc,0cm+4*\bcc);
\draw[red!70,line width=1.8pt] (0cm,0cm+5*\bcc) rectangle (0cm+4*\bcc,0cm+8*\bcc);
\draw[ugreen!70,line width=1.8pt] (0cm,0cm+1*\bcc) rectangle (0cm+4*\bcc,0cm+4*\bcc);
\draw[fill=blue!8,xshift=5.0cm,yshift=1.0cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!15,xshift=5.0cm,yshift=1.0cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,xshift=5.0cm,yshift=1.0cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.0cm,yshift=1.0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!8,xshift=5.2cm,yshift=0.8cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!15,xshift=5.2cm,yshift=0.8cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,xshift=5.2cm,yshift=0.8cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.2cm,yshift=0.8cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[ugreen!50,line width=2pt,xshift=5.2cm,yshift=0.8cm] (0cm,0cm+1*\bcc) rectangle (0cm+1*\bcc,0cm+2*\bcc);
\draw[ugreen!70,line width=2pt,xshift=5.2cm,yshift=0.8cm] (0cm,0cm+1*\bcc) rectangle (0cm+1*\bcc,0cm+2*\bcc);
\draw[fill=blue!8,xshift=5.4cm,yshift=0.6cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!15,xshift=5.4cm,yshift=0.6cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,xshift=5.4cm,yshift=0.6cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.4cm,yshift=0.6cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!8,xshift=5.6cm,yshift=0.4cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!15,xshift=5.6cm,yshift=0.4cm,line width=0.2pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,xshift=5.6cm,yshift=0.4cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+6*\bcc);
%\draw[line width=0.7pt,xshift=5.6cm,yshift=0.4cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[red!50,line width=2pt,xshift=5.6cm,yshift=0.4cm] (0cm,0cm+5*\bcc) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[red!70,line width=2pt,xshift=5.6cm,yshift=0.4cm] (0cm,0cm+5*\bcc) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[red!50,line width=0.5pt] (0cm+4*\bcc,0cm+8*\bcc) -- ([xshift=5.6cm,yshift=0.4cm]0cm,0cm+6*\bcc);
\draw[red!50,line width=0.5pt] (0cm+4*\bcc,0cm+5*\bcc) -- ([xshift=5.6cm,yshift=0.4cm]0cm,0cm+5*\bcc);
\draw[red!70,line width=0.5pt] (0cm+4*\bcc,0cm+8*\bcc) -- ([xshift=5.6cm,yshift=0.4cm]0cm,0cm+6*\bcc);
\draw[red!70,line width=0.5pt] (0cm+4*\bcc,0cm+5*\bcc) -- ([xshift=5.6cm,yshift=0.4cm]0cm,0cm+5*\bcc);
\draw[ugreen!50,line width=0.5pt] (0cm+4*\bcc,0cm+4*\bcc) -- ([xshift=5.2cm,yshift=0.8cm]0cm,0cm+2*\bcc);
\draw[ugreen!50,line width=0.5pt] (0cm+4*\bcc,0cm+1*\bcc) -- ([xshift=5.2cm,yshift=0.8cm]0cm,0cm+1*\bcc);
\draw[ugreen!70,line width=0.5pt] (0cm+4*\bcc,0cm+4*\bcc) -- ([xshift=5.2cm,yshift=0.8cm]0cm,0cm+2*\bcc);
\draw[ugreen!70,line width=0.5pt] (0cm+4*\bcc,0cm+1*\bcc) -- ([xshift=5.2cm,yshift=0.8cm]0cm,0cm+1*\bcc);
\node[word] (w1) at (-0.5cm, 3.0cm) {$<$p$>$};
\node[word] (w2) at ([yshift=-\bcc]w1) {今天};
......
......@@ -5,7 +5,7 @@
\begin{scope}
%\tikzstyle{every node}=[scale=0.8]
\tikzstyle{line} = [dash pattern=on 2pt off 1pt,line width=0.6pt]
\tikzstyle{cir} = [thin,fill=blue!8,draw,circle,minimum size =0.5em,drop shadow={shadow xshift=0.15em, shadow yshift=-0.1em}]
\tikzstyle{cir} = [thin,fill=blue!15,draw,circle,minimum size =0.5em,drop shadow={shadow xshift=0.15em, shadow yshift=-0.1em}]
\tikzstyle{word} = [inner sep=0pt, font=\footnotesize,minimum height=\bcc]
%\draw[fill=blue!8,xshift=0.3cm,yshift=0.5cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
......@@ -13,30 +13,30 @@
%\draw[red!60,line width=2pt,xshift=0.3cm,yshift=0.5cm] (0cm,0cm+2*\bcc) rectangle (0cm+6*\bcc,0cm+4*\bcc);
% 输入矩阵
\draw[thick,fill=blue!8,line width=0.6pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[thick,fill=blue!15,line width=0.6pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[step=\bcc,gray] (0cm,0cm) grid (0cm+6*\bcc,0cm+9*\bcc);
\draw[red!60,line width=2pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+2*\bcc);
\draw[ugreen!60,line width=2pt] (0cm,0cm+3*\bcc) rectangle (0cm+6*\bcc,0cm+6*\bcc);
\draw[red!60,line width=2pt] (0cm,0cm+7*\bcc) rectangle (0cm+6*\bcc,0cm+9*\bcc);
\draw[red!80,line width=2pt] (0cm,0cm) rectangle (0cm+6*\bcc,0cm+2*\bcc);
\draw[ugreen!80,line width=2pt] (0cm,0cm+3*\bcc) rectangle (0cm+6*\bcc,0cm+6*\bcc);
\draw[red!80,line width=2pt] (0cm,0cm+7*\bcc) rectangle (0cm+6*\bcc,0cm+9*\bcc);
% 特征图
\draw[fill=blue!8,xshift=5.0cm,yshift=1.3cm,line width=0.6pt] (0cm,0cm-1*\bcc) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[fill=blue!15,xshift=5.0cm,yshift=1.3cm,line width=0.6pt] (0cm,0cm-1*\bcc) rectangle (0cm+1*\bcc,0cm+6*\bcc);
\draw[step=\bcc,gray,xshift=5.0cm,yshift=1.3cm] (0cm,0cm-1*\bcc) grid (0cm+1*\bcc,0cm+6*\bcc);
\draw[ugreen!60,line width=2pt,xshift=5.0cm,yshift=1.3cm] (0cm,0cm+2*\bcc) rectangle (0cm+1*\bcc,0cm+3*\bcc);
\draw[ugreen!80,line width=2pt,xshift=5.0cm,yshift=1.3cm] (0cm,0cm+2*\bcc) rectangle (0cm+1*\bcc,0cm+3*\bcc);
%最大池化
\draw [gray,fill=blue!8,line width=0.6pt](8cm,2.2cm) -- (8.4cm, 2.2cm) -- (8.7cm,1.4cm) -- (8.3cm, 1.4cm) -- (8cm,2.2cm);
\draw [gray,fill=blue!15,line width=0.6pt](8cm,2.2cm) -- (8.4cm, 2.2cm) -- (8.7cm,1.4cm) -- (8.3cm, 1.4cm) -- (8cm,2.2cm);
\draw [gray](8.15cm,1.8cm) -- (8.55cm,1.8cm);
%\draw [gray](8.3cm,1.8cm) -- (8.7cm,1.8cm);
%\draw [gray](8.45cm,1.4cm) -- (8.85cm,1.4cm);
%全连接层
\draw [gray,fill=blue!8,line width=0.6pt](11cm,2.2cm) -- (11.4cm, 2.2cm) -- (11.7cm,1.8cm) -- (11.3cm, 1.8cm) -- (11cm,2.2cm);
\draw [gray,fill=blue!15,line width=0.6pt](11cm,2.2cm) -- (11.4cm, 2.2cm) -- (11.7cm,1.8cm) -- (11.3cm, 1.8cm) -- (11cm,2.2cm);
%\draw [gray](11.15cm,1.8cm) -- (11.55cm,1.8cm);
%最大池化
\draw[ugreen!60,line] ([xshift=5.0cm,yshift=1.3cm]0cm+1*\bcc,0cm+6*\bcc) -- (8cm,2.2cm);
\draw[ugreen!60,line] ([xshift=5.0cm,yshift=1.3cm]0cm+1*\bcc,0cm-1*\bcc) -- (8.15cm,1.8cm);
\draw[ugreen!80,line] ([xshift=5.0cm,yshift=1.3cm]0cm+1*\bcc,0cm+6*\bcc) -- (8cm,2.2cm);
\draw[ugreen!80,line] ([xshift=5.0cm,yshift=1.3cm]0cm+1*\bcc,0cm-1*\bcc) -- (8.15cm,1.8cm);
%特征图
%\draw[fill=blue!8,xshift=5.2cm,yshift=1.0cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+6*\bcc);
......@@ -45,25 +45,25 @@
%\draw[fill=blue!8,xshift=5.4cm,yshift=0.3cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+7*\bcc);
%\draw[step=\bcc,gray,xshift=5.4cm,yshift=0.3cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+7*\bcc);
\draw[fill=blue!8,xshift=5.6cm,yshift=0cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+8*\bcc);
\draw[fill=blue!15,xshift=5.6cm,yshift=0cm,line width=0.6pt] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+8*\bcc);
\draw[step=\bcc,gray,xshift=5.6cm,yshift=0cm] (0cm,0cm) grid (0cm+1*\bcc,0cm+8*\bcc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+1*\bcc);
\draw[red!60,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm+7*\bcc) rectangle (0cm+1*\bcc,0cm+8*\bcc);
\draw[red!80,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm) rectangle (0cm+1*\bcc,0cm+1*\bcc);
\draw[red!80,line width=2pt,xshift=5.6cm,yshift=0cm] (0cm,0cm+7*\bcc) rectangle (0cm+1*\bcc,0cm+8*\bcc);
% 全连接线
\draw[line] (8.4cm, 2.2cm) -- (11.2cm,2.2cm);
\draw[line] (8.7cm,1.4cm) -- (11.3cm, 1.8cm);
%全连接上面的红虚线
\draw[red!60,line] ([xshift=5.6cm,yshift=0cm]0cm+1*\bcc,0cm+7*\bcc) -- (8.15cm,1.8cm);
\draw[red!60,line] ([xshift=5.6cm,yshift=0cm]0cm+1*\bcc,0cm) -- (8.3cm, 1.4cm);
\draw[red!80,line] ([xshift=5.6cm,yshift=0cm]0cm+1*\bcc,0cm+7*\bcc) -- (8.15cm,1.8cm);
\draw[red!80,line] ([xshift=5.6cm,yshift=0cm]0cm+1*\bcc,0cm) -- (8.3cm, 1.4cm);
% 特征图红色虚线
\draw[red!60,line] (0cm+6*\bcc,0cm+9*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+8*\bcc);
\draw[red!60,line] (0cm+6*\bcc,0cm+7*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+7*\bcc);
\draw[red!60,line] (0cm+6*\bcc,0cm+2*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+1*\bcc);
\draw[red!60,line] (0cm+6*\bcc,0cm) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm);
\draw[ugreen!60,line] (0cm+6*\bcc,0cm+6*\bcc) -- ([xshift=5.0cm,yshift=1.3cm]0cm,0cm+3*\bcc);
\draw[ugreen!60,line] (0cm+6*\bcc,0cm+3*\bcc) -- ([xshift=5.0cm,yshift=1.3cm]0cm,0cm+2*\bcc);
\draw[red!80,line] (0cm+6*\bcc,0cm+9*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+8*\bcc);
\draw[red!80,line] (0cm+6*\bcc,0cm+7*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+7*\bcc);
\draw[red!80,line] (0cm+6*\bcc,0cm+2*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+1*\bcc);
\draw[red!80,line] (0cm+6*\bcc,0cm) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm);
\draw[ugreen!80,line] (0cm+6*\bcc,0cm+6*\bcc) -- ([xshift=5.0cm,yshift=1.3cm]0cm,0cm+3*\bcc);
\draw[ugreen!80,line] (0cm+6*\bcc,0cm+3*\bcc) -- ([xshift=5.0cm,yshift=1.3cm]0cm,0cm+2*\bcc);
%\draw[red!60,line] ([xshift=0.3cm,yshift=0.5cm]0cm+6*\bcc,0cm+4*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+3*\bcc);
%\draw[red!60,line] ([xshift=0.3cm,yshift=0.5cm]0cm+6*\bcc,0cm+2*\bcc) -- ([xshift=5.6cm,yshift=0cm]0cm,0cm+2*\bcc);
......
......@@ -131,7 +131,7 @@
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\vspace{-2em}
\subsection{步长与填充}
\parinterval 在卷积操作中,步长是指卷积核每次滑动的距离,和卷积核的大小共同决定了卷积输出的大小,如图\ref{fig:11-6}所示。步长越大,对输入数据的压缩程度越高,其输出的维度越小;反之步长越小,对输入数据的压缩程度越低,同时输出的尺寸和输入越接近。比如使用一个$3 \times 3 \times 1$的卷积核在$6 \times 6 \times 1$的图像上进行卷积,如设置步长为1,其对应的输出大小就为$4 \times 4 \times 1$。这种做法最为简单,但是会导致两个问题;一是在输入数据中,由于边缘区域的像素只会被计算一次,相比于中心区域来说,这些像素被考虑的次数会更少一些,导致图像边缘信息的丢失;二是在经历多次卷积之后,其输出特征的维度会不断减小,影响模型的泛化能力。
......@@ -161,7 +161,7 @@
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\vspace{-2em}
\subsection{池化}
\parinterval 在图\ref{fig:11-2}所示的网络结构中,卷积层输出会通过一个非线性的激活函数,之后会通过{\small\bfnew{池化层}}\index{池化层}(也称为汇聚层)。池化过程和卷积类似,都是根据设定的窗口进行滑动选取局部信息进行计算,不同的是,池化层的计算是无参数化的,不需要额外的权重矩阵。常见的池化操作有{\small\bfnew{最大池化}}\index{最大池化}(Max Pooling)\index{Max Pooling}{\small\bfnew{平均池化}}\index{平均池化}(Average Pooling)\index{Average Pooling}。前者获取窗口内最大的值,后者则获取窗口内矩阵的平均值。图\ref{fig:11-8}展示了窗口大小为$2 \times 2$、步长为2的两种池化方法的计算过程。
......@@ -200,7 +200,7 @@
\label{fig:11-9}
\end{figure}
%----------------------------------------------
\vspace{-1em}
\parinterval 针对不定长序列,一种可行的方法是使用之前介绍过的循环神经网络进行信息提取,其本质也是基于权重共享的想法,在不同的时间步复用相同的循环神经网络单元进行处理。但是,循环神经网络最大的弊端在于每一时刻的计算都依赖于上一时刻的结果,因此只能对序列进行串行处理,无法充分利用硬件设备进行并行计算,导致效率相对较低。此外,在处理较长的序列时,这种串行的方式很难捕捉长距离的依赖关系。相比之下,卷积神经网络采用共享参数的方式处理固定大小窗口内的信息,且不同位置的卷积操作之间没有相互依赖,因此可以对序列进行高效地并行处理。同时,针对序列中距离较长的依赖关系,可以通过堆叠多层卷积层来扩大{\small\bfnew{感受野}}\index{感受野} (Receptive Field)\index{Receptive Field} ,这里感受野指能够影响神经元输出的原始输入数据区域的大小。图\ref{fig:11-9}对比了这两种结构,可以看出,为了捕捉$\mathbi{e}_2$$\mathbi{e}_8$ 之间的联系,串行结构需要顺序地进行6次操作,和序列长度相关。而该卷积神经网络中,卷积操作每次对三个词进行计算,仅需要4层卷积计算就能得到$\mathbi{e}_2$$\mathbi{e}_8$之间的联系,其操作数和卷积核的大小相关,相比于串行的方式具有更短的路径和更少的非线性计算,更容易进行训练。因此,也有许多研究人员在许多自然语言处理任务上尝试使用卷积神经网络进行序列建模\upcite{Kim2014ConvolutionalNN,Santos2014DeepCN,Kalchbrenner2014ACN,DBLP:conf/naacl/Johnson015,DBLP:conf/naacl/NguyenG15}
\parinterval 区别于传统图像上的卷积操作,在面向序列的卷积操作中,卷积核只在序列这一维度进行移动,用来捕捉连续的多个词之间的特征。需要注意的是,由于单词通常由一个实数向量表示(词嵌入),因此可以将词嵌入的维度看作是卷积操作中的通道数。图\ref{fig:11-10}就是一个基于序列卷积的文本分类模型,模型的输入是维度大小为$m\times O $的句子表示,$m$表示句子长度,$O$表示卷积核通道数,其值等于词嵌入维度,模型使用多个不同(对应图中不同的颜色)的卷积核来对序列进行特征提取,得到了多个不同的特征序列。然后使用池化层降低表示维度,得到了一组和序列长度无关的特征表示。最后模型基于这组压缩过的特征表示,使用全连接网络和Softmax函数进行类别预测。在这过程中卷积层和池化层分别起到了特征提取和特征压缩的作用,将一个不定长的序列转化为一组固定大小的特征表示。
......@@ -214,7 +214,7 @@
\label{fig:11-10}
\end{figure}
%----------------------------------------------
\vspace{-1em}
\parinterval 和其它自然语言处理任务不同的是,机器翻译中需要对序列进行全局表示,换句话说,模型需要捕捉序列中各个位置之间的关系。因此,基于卷积神经网络的神经机器翻译模型需要堆叠多个卷积层进行远距离的依赖关系的建模。同时,为了在多层网络中维持序列的原有长度,需要在卷积操作前对输入序列进行填充。图\ref{fig:11-11}是一个简单的示例,针对一个长度$m=6$的句子,其隐层表示维度即卷积操作的输入通道数是$O=4$,卷积核大小为$K=3$。首先对序列进行填充,得到一个长度为8的序列,然后使用这些卷积核在这之上进行特征提取。一共使用了$N=4$个卷积核,整体的参数量为$K \times O \times N$,最后的卷积结果为$m \times N$的序列表示。
%----------------------------------------------
......@@ -235,6 +235,16 @@
\parinterval 正如之前所讲,卷积神经网络可以用于序列建模,同时具有并行性高和易于学习的特点,一个很自然的想法就是将其用作神经机器翻译模型中的特征提取器。因此,在神经机器翻译被提出之初,研究人员就已经开始利用卷积神经网络对句子进行特征提取。比较经典的模型是使用卷积神经网络作为源语言句子的编码器,使用循环神经网络作为目标语言译文生成的解码器\upcite{kalchbrenner-blunsom-2013-recurrent,Gehring2017ACE}。之后也有研究人员提出完全基于卷积神经网络的翻译模型(ConvS2S)\upcite{DBLP:journals/corr/GehringAGYD17},或者针对卷积层进行改进,提出效率更高、性能更好的模型\upcite{Kaiser2018DepthwiseSC,Wu2019PayLA}。本节将基于ConvS2S模型,阐述如何使用卷积神经网络搭建端到端神经机器翻译模型。
%----------------------------------------------
% 图12.
\begin{figure}[htp]
\centering
\input{./Chapter11/Figures/figure-fairseq-0}
\caption{ConvS2S模型结构}
\label{fig:11-12}
\end{figure}
%----------------------------------------------
\parinterval ConvS2S模型是一种高并行的序列到序列的神经计算模型。该模型利用卷积神经网络分别对源语言端与目标语言端的序列进行特征提取,并使用注意力机制来捕获两个序列之间映射关系。相比于基于多层循环神经网络的GNMT模型\upcite{Wu2016GooglesNM},其主要优势在于每一层的网络计算是完全并行化的,避免了循环神经网络中计算顺序对时序的依赖。同时,利用多层卷积神经网络的层级结构可以有效地捕捉序列不同位置之间的依赖。即使是远距离依赖,也可以通过若干层卷积单元进行有效的捕捉,而且其信息传递的路径相比循环神经网络更短。除此之外,模型同时使用门控线性单元、残差网络和位置编码等技术来进一步提升模型性能,达到了和GNMT模型相媲美的翻译性能,同时大大缩短了训练时间。
\parinterval\ref{fig:11-12}为ConvS2S模型的结构示意图,其内部由若干不同的模块组成,包括:
......@@ -249,16 +259,6 @@
\item {\small\bfnew{多跳注意力机制}}\index{多跳注意力机制}(Multi-step Attention/Multi-hop Attention)\index{Multi-step Attention}\index{Multi-hop Attention}:蓝色框内部展示了基于多跳结构的注意力机制模块\upcite{Sukhbaatar2015EndToEndMN}。ConvS2S模型同样使用注意力机制来捕捉两个序列之间不同位置的对应关系。区别于之前的做法,多跳注意力在解码器端每一个层都会执行注意力操作。下面将以此模型为例对基于卷积神经网络的机器翻译模型进行介绍。
\end{itemize}
%----------------------------------------------
% 图12.
\begin{figure}[htp]
\centering
\input{./Chapter11/Figures/figure-fairseq-0}
\caption{ConvS2S模型结构}
\label{fig:11-12}
\end{figure}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
......@@ -353,7 +353,6 @@
\label{eq:11-7}
\end{eqnarray}
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
......@@ -362,6 +361,15 @@
\parinterval ConvS2S模型也采用了注意力机制来获取每个目标语言位置相应的源语言上下文信息。其仍然沿用传统的点乘注意力机制\upcite{DBLP:journals/corr/LuongPM15},其中图\ref{fig:11-16}蓝色框代表了多跳自注意力机制在模型中的位置。
\parinterval 在基于循环神经网络的翻译模型中,注意力机制已经被广泛使用\upcite{bahdanau2014neural},并用于避免循环神经网络将源语言序列压缩成一个固定维度的向量表示带来的信息损失。另一方面,注意力同样能够帮助解码器区分源语言中不同位置对当前目标语言位置的贡献度,其具体的计算过程如公式\eqref{eq:11-8}\eqref{eq:11-9}所示:
\begin{eqnarray}
\mathbi{C}_j &=& \sum_i \alpha_{i,j} \mathbi{h}_i \label{eq:11-8} \\
\alpha_{i,j} &=& \frac{ \textrm{exp}(\funp{a} (\mathbi{s}_{j-1},\mathbi{h}_i)) }{\sum_{i'} \textrm{exp}( \funp{a} (\mathbi{s}_{j-1},\mathbi{h}_{i'}))} \label{eq:11-9}
\end{eqnarray}
\noindent 其中,$\mathbi{h}_i$表示源语言端第$i$个位置的隐层状态,即编码器在第$i$个位置的输出。$\mathbi{s}_j$表示目标端第$j$个位置的隐层状态。给定$\mathbi{s}_j$$\mathbi{h}_i$,注意力机制通过函数$\funp{a}(\cdot)$计算目标语言表示$\mathbi{s}_j$与源语言表示$\mathbi{h}_i$之间的注意力权重$\alpha_{i,j}$,通过加权平均得到当前目标语言端位置所需的上下文表示$\mathbi{C}_j$。其中$\funp{a}(\cdot)$的具体计算方式在{\chapterten}已经详细讨论。
%----------------------------------------------
% 图16.
\begin{figure}[htp]
......@@ -372,15 +380,6 @@
\end{figure}
%----------------------------------------------
\parinterval 在基于循环神经网络的翻译模型中,注意力机制已经被广泛使用\upcite{bahdanau2014neural},并用于避免循环神经网络将源语言序列压缩成一个固定维度的向量表示带来的信息损失。另一方面,注意力同样能够帮助解码器区分源语言中不同位置对当前目标语言位置的贡献度,其具体的计算过程如公式\eqref{eq:11-8}\eqref{eq:11-9}所示:
\begin{eqnarray}
\mathbi{C}_j &=& \sum_i \alpha_{i,j} \mathbi{h}_i \label{eq:11-8} \\
\alpha_{i,j} &=& \frac{ \textrm{exp}(\funp{a} (\mathbi{s}_{j-1},\mathbi{h}_i)) }{\sum_{i'} \textrm{exp}( \funp{a} (\mathbi{s}_{j-1},\mathbi{h}_{i'}))} \label{eq:11-9}
\end{eqnarray}
\noindent 其中,$\mathbi{h}_i$表示源语言端第$i$个位置的隐层状态,即编码器在第$i$个位置的输出。$\mathbi{s}_j$表示目标端第$j$个位置的隐层状态。给定$\mathbi{s}_j$$\mathbi{h}_i$,注意力机制通过函数$\funp{a}(\cdot)$计算目标语言表示$\mathbi{s}_j$与源语言表示$\mathbi{h}_i$之间的注意力权重$\alpha_{i,j}$,通过加权平均得到当前目标语言端位置所需的上下文表示$\mathbi{C}_j$。其中$\funp{a}(\cdot)$的具体计算方式在{\chapterten}已经详细讨论。
\parinterval 在ConvS2S模型中,解码器同样采用堆叠的多层门控卷积网络来对目标语言进行序列建模。区别于编码器,解码器在每一层卷积网络之后引入了注意力机制,用来参考源语言信息。ConvS2S选用了点乘注意力,并且通过类似残差连接的方式将注意力操作的输入与输出同时作用于下一层计算,称为多跳注意力。其具体计算方式如下:
\begin{eqnarray}
\alpha_{ij}^l &=& \frac{ \textrm{exp} (\mathbi{d}_{j}^l\mathbi{h}_i) }{\sum_{i^{'}=1}^m \textrm{exp} (\mathbi{d}_{j}^l\mathbi{h}_{i^{'}})}
......
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\tikzstyle{rnode} = [draw,minimum width=4em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=red!20!white] (e1) at (0,0) {\scriptsize{$e(\textrm{沈阳})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e2) at ([xshift=1em]e1.south east) {\scriptsize{$e(\textrm{})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e3) at ([xshift=1em]e2.south east) {\scriptsize{$e(\textrm{广州})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e4) at ([xshift=1em]e3.south east) {\scriptsize{$e(\textrm{})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e5) at ([xshift=1em]e4.south east) {\scriptsize{$e(\textrm{机票})$}};
\node [rnode,anchor=south west,fill=red!30!white] (e1) at (0,0) {\scriptsize{$e(\textrm{沈阳})$}};
\node [rnode,anchor=south west,fill=red!30!white] (e2) at ([xshift=1.5em]e1.south east) {\scriptsize{$e(\textrm{})$}};
\node [rnode,anchor=south west,fill=red!30!white] (e3) at ([xshift=1.5em]e2.south east) {\scriptsize{$e(\textrm{广州})$}};
\node [rnode,anchor=south west,fill=red!30!white] (e4) at ([xshift=1.5em]e3.south east) {\scriptsize{$e(\textrm{})$}};
\node [rnode,anchor=south west,fill=red!30!white] (e5) at ([xshift=1.5em]e4.south east) {\scriptsize{$e(\textrm{机票})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h1) at ([yshift=1.5em]e1.north west) {\scriptsize{$h(\textrm{沈阳})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h2) at ([yshift=1.5em]e2.north west) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h3) at ([yshift=1.5em]e3.north west) {\scriptsize{$h(\textrm{广州})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h4) at ([yshift=1.5em]e4.north west) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h5) at ([yshift=1.5em]e5.north west) {\scriptsize{$h(\textrm{机票})$}};
\node [rnode,anchor=south west,fill=green!30!white] (h1) at ([yshift=1.5em]e1.north west) {\scriptsize{$h(\textrm{沈阳})$}};
\node [rnode,anchor=south west,fill=green!30!white] (h2) at ([yshift=1.5em]e2.north west) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!30!white] (h3) at ([yshift=1.5em]e3.north west) {\scriptsize{$h(\textrm{广州})$}};
\node [rnode,anchor=south west,fill=green!30!white] (h4) at ([yshift=1.5em]e4.north west) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!30!white] (h5) at ([yshift=1.5em]e5.north west) {\scriptsize{$h(\textrm{机票})$}};
\foreach \x in {1,2,3,4,5}{
\node [anchor=north] (plus\x) at ([yshift=-0em]e\x.south) {\scriptsize{$\mathbf{\oplus}$}};
}
\node [rnode,anchor=north,fill=yellow!20!white] (pos1) at ([yshift=-1.1em]e1.south) {\scriptsize{$\textrm{PE}(1)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos2) at ([yshift=-1.1em]e2.south) {\scriptsize{$\textrm{PE}(2)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos3) at ([yshift=-1.1em]e3.south) {\scriptsize{$\textrm{PE}(3)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos4) at ([yshift=-1.1em]e4.south) {\scriptsize{$\textrm{PE}(4)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos5) at ([yshift=-1.1em]e5.south) {\scriptsize{$\textrm{PE}(5)$}};
\node [rnode,anchor=north,fill=yellow!30!white] (pos1) at ([yshift=-1.1em]e1.south) {\scriptsize{$\textrm{PE}(1)$}};
\node [rnode,anchor=north,fill=yellow!30!white] (pos2) at ([yshift=-1.1em]e2.south) {\scriptsize{$\textrm{PE}(2)$}};
\node [rnode,anchor=north,fill=yellow!30!white] (pos3) at ([yshift=-1.1em]e3.south) {\scriptsize{$\textrm{PE}(3)$}};
\node [rnode,anchor=north,fill=yellow!30!white] (pos4) at ([yshift=-1.1em]e4.south) {\scriptsize{$\textrm{PE}(4)$}};
\node [rnode,anchor=north,fill=yellow!30!white] (pos5) at ([yshift=-1.1em]e5.south) {\scriptsize{$\textrm{PE}(5)$}};
\foreach \x in {1,2,3,4,5}{
......
......@@ -2,13 +2,13 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\tikzstyle{rnode} = [draw,minimum width=4em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=green!20!white] (key1) at (0,0) {\scriptsize{$h(\textrm{沈阳})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([xshift=1em]key1.south east) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([xshift=1em]key2.south east) {\scriptsize{$h(\textrm{广州})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([xshift=2em]key3.south east) {\scriptsize{$h(\textrm{机票})$}};
\node [rnode,anchor=south west] (key5) at ([xshift=1em]key4.south east) {\scriptsize{$h(\textrm{机票})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key1) at (0,0) {\scriptsize{$h(\textrm{沈阳})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key2) at ([xshift=1.5em]key1.south east) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key3) at ([xshift=1.5em]key2.south east) {\scriptsize{$h(\textrm{广州})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key4) at ([xshift=3em]key3.south east) {\scriptsize{$h(\textrm{机票})$}};
\node [rnode,anchor=south west] (key5) at ([xshift=1.5em]key4.south east) {\scriptsize{$h(\textrm{机票})$}};
\node [anchor=west] (sep1) at ([xshift=0.3em]key3.east) {\scriptsize{$\textbf{...}$}};
......@@ -17,17 +17,17 @@
\draw [->] ([yshift=1pt,xshift=3pt]key5.north) .. controls +(90:1.8em) and +(90:1.8em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt,xshift=6pt]key5.north) .. controls +(90:2.2em) and +(90:2.2em) .. ([yshift=1pt]key1.north);
\node [anchor=south west] (alpha1) at ([xshift=-1em]key1.north west) {\scriptsize{$\alpha_1=.2$}};
\node [anchor=south west] (alpha2) at ([xshift=-1em]key2.north west) {\scriptsize{$\alpha_2=.3$}};
\node [anchor=south west] (alpha3) at ([xshift=-1em]key3.north west) {\scriptsize{$\alpha_3=.1$}};
\node [anchor=south west] (alpha4) at ([xshift=-1em]key4.north west) {\scriptsize{$\alpha_i=.3$}};
\node [anchor=south west] (alpha1) at ([xshift=-1.2em]key1.north west) {\scriptsize{$\alpha_1=0.2$}};
\node [anchor=south west] (alpha2) at ([xshift=-1.2em]key2.north west) {\scriptsize{$\alpha_2=0.3$}};
\node [anchor=south west] (alpha3) at ([xshift=-1.2em]key3.north west) {\scriptsize{$\alpha_3=0.1$}};
\node [anchor=south west] (alpha4) at ([xshift=-1.2em]key4.north west) {\scriptsize{$\alpha_i=0.3$}};
\vspace{0.5em}
\node [rnode,anchor=south west,fill=green!20!white] (key6) at ([yshift=2em]key1.north west) {\scriptsize{$h(\textrm{广州})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key7) at ([yshift=2em]key2.north west) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key8) at ([yshift=2em]key3.north west) {\scriptsize{$h(\textrm{沈阳})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key9) at ([yshift=2em]key4.north west) {\scriptsize{$h(\textrm{机票})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key6) at ([yshift=2em]key1.north west) {\scriptsize{$h(\textrm{广州})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key7) at ([yshift=2em]key2.north west) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key8) at ([yshift=2em]key3.north west) {\scriptsize{$h(\textrm{沈阳})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key9) at ([yshift=2em]key4.north west) {\scriptsize{$h(\textrm{机票})$}};
\node [rnode,anchor=south west] (key10) at ([yshift=2em]key5.north west) {\scriptsize{$h(\textrm{机票})$}};
\node [anchor=west] (sep1) at ([xshift=0.3em]key8.east) {\scriptsize{$\textbf{...}$}};
......@@ -37,10 +37,10 @@
\draw [->] ([yshift=1pt,xshift=3pt]key10.north) .. controls +(90:1.8em) and +(90:1.8em) .. ([yshift=1pt]key7.north);
\draw [->] ([yshift=1pt,xshift=6pt]key10.north) .. controls +(90:2.2em) and +(90:2.2em) .. ([yshift=1pt]key6.north);
\node [anchor=south west] (alpha5) at ([xshift=-1em]key6.north west) {\scriptsize{$\alpha_1=.1$}};
\node [anchor=south west] (alpha6) at ([xshift=-1em]key7.north west) {\scriptsize{$\alpha_2=.3$}};
\node [anchor=south west] (alpha7) at ([xshift=-1em]key8.north west) {\scriptsize{$\alpha_3=.2$}};
\node [anchor=south west] (alpha8) at ([xshift=-1em]key9.north west) {\scriptsize{$\alpha_i=.3$}};
\node [anchor=south west] (alpha5) at ([xshift=-1.2em]key6.north west) {\scriptsize{$\alpha_1=0.1$}};
\node [anchor=south west] (alpha6) at ([xshift=-1.2em]key7.north west) {\scriptsize{$\alpha_2=0.3$}};
\node [anchor=south west] (alpha7) at ([xshift=-1.2em]key8.north west) {\scriptsize{$\alpha_3=0.2$}};
\node [anchor=south west] (alpha8) at ([xshift=-1.2em]key9.north west) {\scriptsize{$\alpha_i=0.3$}};
\end{scope}
\end{tikzpicture}
......
......@@ -4,7 +4,7 @@
\begin{scope}[scale=1.5]
{\Large
\tikzstyle{snode} = [draw,inner sep=1pt,minimum width=3em,minimum height=0.5em,rounded corners=1pt,fill=green!30!white]
\tikzstyle{snode} = [draw,inner sep=1pt,minimum width=3em,minimum height=0.5em,rounded corners=1pt,fill=green!40!white]
\tikzstyle{pnode} = [draw,inner sep=1pt,minimum width=1em,minimum height=0.5em,rounded corners=1pt]
\node [anchor=west,snode] (s1) at (0,0) {};
\node [anchor=north west,snode,minimum width=6.5em] (s2) at ([yshift=-0.3em]s1.south west) {};
......@@ -15,7 +15,7 @@
\node [anchor=west,pnode,minimum width=3em] (p1) at ([xshift=0.3em]s1.east) {};
\node [anchor=west,pnode,minimum width=4em] (p3) at ([xshift=0.3em]s3.east) {};
\node [anchor=west,snode,minimum width=5em] (s4) at ([xshift=4em]p1.east) {};
\node [anchor=west,snode,minimum width=5em] (s4) at ([xshift=5em]p1.east) {};
\node [anchor=north west,snode,minimum width=5em] (s5) at ([yshift=-0.3em]s4.south west) {};
\node [anchor=north west,snode,minimum width=6.5em] (s6) at ([yshift=-0.3em]s5.south west) {};
......
......@@ -3,14 +3,14 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.7em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\mathbi{h}_1$}};
\node [rnnnode,anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{$\mathbi{h}_2$}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\mathbi{h}_3$}};
\node [rnnnode,anchor=west] (h2) at ([xshift=1.5em]h1.east) {\tiny{$\mathbi{h}_2$}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1.5em]h2.east) {\tiny{$\mathbi{h}_3$}};
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e2) at ([xshift=1em]e1.east) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e2) at ([xshift=1.5em]e1.east) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1.5em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
......@@ -33,14 +33,14 @@
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};
{
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]e3.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3.5em]e3.east) {\tiny{$e_y()$}};
}
{
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.5em]t1.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.8em]t1.east) {\tiny{$e_y()$}};
}
{
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.5em]t2.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.5em]t3.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.8em]t2.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.8em]t3.east) {\tiny{$e_y()$}};
%\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
}
{
......
......@@ -3,11 +3,11 @@
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\node [anchor=west] (w1) at ([xshift=1em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=1em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=1em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=1em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!30!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w1.north).. controls +(130:0.5) and +(50:0.5) .. (w0.north);
\draw [->,thick,red] (w2.north).. controls +(130:0.5) and +(50:0.5) .. (w1.north);
\draw [->,thick,red] ([yshift=0.2em]w3.north).. controls +(130:0.5) and +(50:0.5) .. (w2.north);
......
......@@ -3,11 +3,11 @@
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,-2) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\node [anchor=west] (w1) at ([xshift=1em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=1em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=1em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=1em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!30!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w5.north).. controls +(100:0.85) and +(50:0.85) .. (w0.north);
\draw [->,thick,red] (w5.north).. controls +(110:0.75) and +(50:0.75) .. (w1.north);
\draw [->,thick,red] (w5.north).. controls +(110:0.75) and +(50:0.75) .. (w2.north);
......
......@@ -3,12 +3,12 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{lnode} = [minimum height=1.5em,minimum width=3em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{lnode} = [minimum height=1.5em,minimum width=3em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!30];
\tikzstyle{standard} = [rounded corners=3pt]
\node [lnode,anchor=west] (l1) at (0,0) {\scriptsize{子层}};
\node [lnode,anchor=west] (l2) at ([xshift=3em]l1.east) {\scriptsize{层标准化}};
\node [lnode,anchor=west] (l3) at ([xshift=4em]l2.east) {\scriptsize{层标准化}};
\node [lnode,anchor=west] (l3) at ([xshift=6em]l2.east) {\scriptsize{层标准化}};
\node [lnode,anchor=west] (l4) at ([xshift=1.5em]l3.east) {\scriptsize{子层}};
\node [anchor=west] (plus1) at ([xshift=0.9em]l1.east) {\scriptsize{$\mathbf{\oplus}$}};
......
......@@ -3,15 +3,15 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=2.8em,minimum height=1.2em]
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=green!20!white] (key11) at (0,0) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key12) at ([xshift=0.8em]key11.south east) {\scriptsize{$h(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key13) at ([xshift=0.8em]key12.south east) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key14) at ([xshift=0.8em]key13.south east) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key15) at ([xshift=0.8em]key14.south east) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key11) at (0,0) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key12) at ([xshift=1.5em]key11.south east) {\scriptsize{$h(\textrm{什么})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key13) at ([xshift=1.5em]key12.south east) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key14) at ([xshift=1.5em]key13.south east) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=south west,fill=green!30!white] (key15) at ([xshift=1.5em]key14.south east) {\scriptsize{$h(\textrm{})$}};
\node [rnode,anchor=east] (query1) at ([xshift=-1em]key11.west) {\scriptsize{$h(\textrm{})$}};
......
......@@ -4,7 +4,7 @@
\begin{tikzpicture}
\footnotesize{
\begin{axis}[
width=.60\textwidth,
width=.8\textwidth,
height=.40\textwidth,
legend style={at={(0.60,0.08)}, anchor=south west},
xlabel={\footnotesize{更新步数 (10k)}},
......
......@@ -2,29 +2,31 @@
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear0) at (0,0) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\footnotesize{Linear}};
%
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!30!white,text=ugreen!30!white,minimum width=4em] (Linear0) at (0,0) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!30!white,draw,inner sep=4pt,text=ugreen!30!white,minimum width=4em] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!30!white,draw,inner sep=4pt,minimum width=4em] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\footnotesize{Linear}};
\node [anchor=north] (Q) at ([xshift=0em,yshift=-1em]Linear02.south) {\footnotesize{$\mathbi{Q}$}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear1) at ([xshift=1.5em]Linear0.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\footnotesize{Linear}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!30!white,text=ugreen!30!white,minimum width=4em] (Linear1) at ([xshift=1.5em]Linear0.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!30!white,draw,inner sep=4pt,text=ugreen!30!white,minimum width=4em] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!30!white,draw,inner sep=4pt,minimum width=4em] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\footnotesize{Linear}};
\node [anchor=north] (K) at ([xshift=0em,yshift=-1em]Linear12.south) {\footnotesize{$\mathbi{K}$}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear2) at ([xshift=1.5em]Linear1.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\footnotesize{Linear}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!30!white,text=ugreen!30!white,minimum width=4em] (Linear2) at ([xshift=1.5em]Linear1.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!30!white,draw,inner sep=4pt,text=ugreen!30!white,minimum width=4em] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!30!white,draw,inner sep=4pt,minimum width=4em] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\footnotesize{Linear}};
\node [anchor=north] (V) at ([xshift=0em,yshift=-1em]Linear22.south) {\footnotesize{$\mathbi{V}$}};
\node [anchor=south,draw=black!30,minimum width=12em,minimum height=2em,inner sep=4pt,fill=blue!20!white] (Scale) at ([yshift=1em]Linear1.north) {\footnotesize{}};
\node [anchor=south west,draw=black!50,minimum width=12em,minimum height=2em,fill=blue!20!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\footnotesize{}};
\node [anchor=south west,fill=blue!20!white,draw,minimum width=12em,minimum height=2em,inner sep=4pt] (Scale2) at ([shift={(-0.2em,-0.2em)}]Scale1.south west) {\footnotesize{Scaled Dot-Product Attention}};
% scaled dot-product attention
\node [anchor=south,draw=black!30,minimum width=16em,minimum height=2em,inner sep=4pt,fill=blue!30!white] (Scale) at ([yshift=1em]Linear1.north) {\footnotesize{}};
\node [anchor=south west,draw=black!50,minimum width=16em,minimum height=2em,fill=blue!30!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\footnotesize{}};
\node [anchor=south west,fill=blue!30!white,draw,minimum width=16em,minimum height=2em,inner sep=4pt] (Scale2) at ([shift={(-0.2em,-0.2em)}]Scale1.south west) {\footnotesize{Scaled Dot-Product Attention}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=yellow!30] (Concat) at ([yshift=1em]Scale2.north) {\footnotesize{Concat}};
%
\node [anchor=south,draw,minimum width=6em,inner sep=4pt,fill=yellow!30] (Concat) at ([yshift=1em]Scale2.north) {\footnotesize{Concat}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=ugreen!20!white] (Linear) at ([yshift=1em]Concat.north) {\footnotesize{Linear}};
\node [anchor=south,draw,minimum width=6em,inner sep=4pt,fill=ugreen!30!white] (Linear) at ([yshift=1em]Concat.north) {\footnotesize{Linear}};
\draw [->] ([yshift=0.1em]Q.north) -- ([yshift=-0.1em]Linear02.south);
......
......@@ -6,13 +6,13 @@
\begin{tikzpicture}
\begin{scope}
\node [anchor=south west,fill=white,draw,inner sep=4pt,minimum width=4em,fill=blue!20!white] (MatMul) at (0,0) {\tiny{MatMul}};
\node [anchor=south west,fill=white,draw,inner sep=4pt,minimum width=4em,fill=blue!25!white] (MatMul) at (0,0) {\tiny{MatMul}};
\node [anchor=north] (Q1) at ([xshift=-1.4em,yshift=-1em]MatMul.south) {\footnotesize{$\mathbi{Q}$}};
\node [anchor=north] (K1) at ([xshift=1.4em,yshift=-1em]MatMul.south) {\footnotesize{$\mathbi{K}$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2.5em] (Scale3) at ([yshift=1em]MatMul.north) {\tiny{Scale}};
\node [anchor=south,draw,inner sep=4pt,fill=purple!20,minimum width=3.5em] (Mask) at ([yshift=0.8em]Scale3.north) {\tiny{Mask(opt.)}};
\node [anchor=south,draw,inner sep=4pt,fill=ugreen!20!white] (SoftMax) at ([yshift=1em]Mask.north) {\tiny{SoftMax}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=blue!20!white] (MatMul1) at ([xshift=1.7em,yshift=1em]SoftMax.north) {\tiny{MatMul}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!25,minimum width=2.5em] (Scale3) at ([yshift=1em]MatMul.north) {\tiny{Scale}};
\node [anchor=south,draw,inner sep=4pt,fill=purple!25,minimum width=3.5em] (Mask) at ([yshift=0.8em]Scale3.north) {\tiny{Mask(opt.)}};
\node [anchor=south,draw,inner sep=4pt,fill=ugreen!25!white] (SoftMax) at ([yshift=1em]Mask.north) {\tiny{SoftMax}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=blue!25!white] (MatMul1) at ([xshift=1.7em,yshift=1em]SoftMax.north) {\tiny{MatMul}};
\node [anchor=north] (V1) at ([xshift=2em]K1.north) {\footnotesize{$\mathbi{V}$}};
\node [anchor=north] (null) at ([yshift=0.8em]MatMul1.north) {};
......@@ -26,13 +26,13 @@
\draw [->] ([yshift=0.1em]MatMul1.north) -- ([yshift=0.8em]MatMul1.north);
{
\node [anchor=east] (line1) at ([xshift=-4em,yshift=1em]MatMul.west) {\scriptsize{自注意力机制的Query}};
\node [anchor=east] (line1) at ([xshift=-5em,yshift=1em]MatMul.west) {\scriptsize{自注意力机制的Query}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{Key和Value均来自同一句}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{子,编码-解码注意力机制}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{与前面讲的一样}};
}
{
\node [anchor=west] (line11) at ([xshift=3em,yshift=0em]MatMul.east) {\scriptsize{Query和Key的转置进}};
\node [anchor=west] (line11) at ([xshift=5em,yshift=0em]MatMul.east) {\scriptsize{Query和Key的转置进}};
\node [anchor=north west] (line12) at ([yshift=0.3em]line11.south west) {\scriptsize{行点积,得到句子内部}};
\node [anchor=north west] (line13) at ([yshift=0.3em]line12.south west) {\scriptsize{各个位置的相关性}};
}
......@@ -57,28 +57,28 @@
\begin{pgfonlayer}{background}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen,minimum width=10em] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!20,drop shadow,draw=ugreen,minimum width=10em] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (Q1) (K1) (V1)] (box0) {};
\draw [->,dotted,very thick,ugreen] ([yshift=-1.5em,xshift=1.8em]box1.east) -- ([yshift=-1.5em,xshift=0.1em]box1.east);
\draw [->,dotted,very thick,ugreen] ([yshift=-1.5em,xshift=2.8em]box1.east) -- ([yshift=-1.5em,xshift=0.1em]box1.east);
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!20!white,drop shadow,draw=blue] [fit = (line11) (line12) (line13)] (box2) {};
\draw [->,dotted,very thick,blue] ([yshift=1em,xshift=-2.8em]box2.west) -- ([yshift=1em,xshift=-0.1em]box2.west);
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!25!white,drop shadow,draw=blue] [fit = (line11) (line12) (line13)] (box2) {};
\draw [->,dotted,very thick,blue] ([yshift=1em,xshift=-4.8em]box2.west) -- ([yshift=1em,xshift=-0.1em]box2.west);
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=yellow!20,drop shadow,draw=black] [fit = (line21) (line22) (line23)] (box3) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=yellow!25,drop shadow,draw=black] [fit = (line21) (line22) (line23)] (box3) {};
\draw [->,dotted,very thick,black] ([xshift=0.1em]Scale3.east) .. controls +(east:1) and +(west:1) .. ([yshift=1.0em]box3.west) ;
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line31) (line32) (line33) (line34)] (box4) {};
\draw [->,dotted,very thick,red] ([yshift=-1.2em,xshift=2.2em]box4.east) -- ([yshift=-1.2em,xshift=0.1em]box4.east);
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!20,drop shadow,draw=red] [fit = (line31) (line32) (line33) (line34)] (box4) {};
\draw [->,dotted,very thick,red] ([yshift=-1.2em,xshift=3.2em]box4.east) -- ([yshift=-1.2em,xshift=0.1em]box4.east);
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!20!white,drop shadow,draw=blue] [fit = (line41) (line42)] (box5) {};
\draw [->,dotted,very thick,blue] ([yshift=-0.3em,xshift=-1em]box5.west) -- ([yshift=-0.3em,xshift=-0.1em]box5.west);
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!25!white,drop shadow,draw=blue] [fit = (line41) (line42)] (box5) {};
\draw [->,dotted,very thick,blue] ([yshift=-0.3em,xshift=-3em]box5.west) -- ([yshift=-0.3em,xshift=-0.1em]box5.west);
}
\end{pgfonlayer}
......
......@@ -4,12 +4,12 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20];
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!30];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!30];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!20];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!10!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
......
......@@ -2,12 +2,12 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!20];
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!30];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!30];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!30];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!20];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!10!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
......
......@@ -3,12 +3,12 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!30];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!20];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!10!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
......
......@@ -7,9 +7,9 @@
\node(tbq) at ([xshift=0.5em,yshift=0]atten.east){
\begin{tabular}{|c|}
\hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!30} \\ \hline
\rowcolor{yellow!30} \\ \hline
\rowcolor{yellow!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbq.north){$\mathbi{Q}$};
......@@ -20,9 +20,9 @@
\node(tbk) at ([xshift=1em,yshift=0]tbq.east){
\begin{tabular}{|c|}
\hline
\rowcolor{blue!20} \\ \hline
\rowcolor{blue!20} \\ \hline
\rowcolor{blue!20} \\ \hline
\rowcolor{blue!30} \\ \hline
\rowcolor{blue!30} \\ \hline
\rowcolor{blue!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbk.north){$\mathbi{K}$};
......@@ -33,9 +33,9 @@
\node(tbv) at ([xshift=1em,yshift=0]tbk.east){
\begin{tabular}{|c|}
\hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!30} \\ \hline
\rowcolor{orange!30} \\ \hline
\rowcolor{orange!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv.north){$\mathbi{V}$};
......@@ -51,9 +51,9 @@
\node(tbq2) at ([xshift=0.5em,yshift=2em]sof1.east){
\begin{tabular}{|c|}
\hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!30} \\ \hline
\rowcolor{yellow!30} \\ \hline
\rowcolor{yellow!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbq2.north){$\mathbi{Q}$};
......@@ -66,7 +66,7 @@
\node(tbk2) at ([xshift=2em,yshift=0em]times.east){
\begin{tabular}{|l|l|l|}
\hline
\cellcolor{blue!20} & \cellcolor{blue!20} &\cellcolor{blue!20} \\ \hline
\cellcolor{blue!30} & \cellcolor{blue!30} &\cellcolor{blue!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbk2.north){$\mathbi{K}^{\mathrm{T}}$};
......@@ -78,9 +78,9 @@
\node(mask) at ([xshift=3em,yshift=-2em]tbk2.east){
\begin{tabular}{|l|l|l|}
\hline
\cellcolor{green!20} &\cellcolor{green!20} &\cellcolor{green!20} \\ \hline
\cellcolor{green!20} &\cellcolor{green!20} &\cellcolor{green!20} \\ \hline
\cellcolor{green!20} &\cellcolor{green!20} &\cellcolor{green!20} \\ \hline
\cellcolor{green!30} &\cellcolor{green!30} &\cellcolor{green!30} \\ \hline
\cellcolor{green!30} &\cellcolor{green!30} &\cellcolor{green!30} \\ \hline
\cellcolor{green!30} &\cellcolor{green!30} &\cellcolor{green!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]mask.north){$\mathbi{Mask}$};
......@@ -93,9 +93,9 @@
\node(tbv2) at ([xshift=1.2em,yshift=0]mask.east){
\begin{tabular}{|c|}
\hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!30} \\ \hline
\rowcolor{orange!30} \\ \hline
\rowcolor{orange!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv2.north){$\mathbi{V}$};
......@@ -108,9 +108,9 @@
\node(mid) at ([xshift=1.5em,yshift=0em]sof2.east){
\begin{tabular}{|l|l|l|}
\hline
\cellcolor{pink!30} &\cellcolor{pink!30} &\cellcolor{pink!30} \\ \hline
\cellcolor{pink!30} &\cellcolor{pink!30} &\cellcolor{pink!30} \\ \hline
\cellcolor{pink!30} &\cellcolor{pink!30} &\cellcolor{pink!30} \\ \hline
\cellcolor{pink!40} &\cellcolor{pink!40} &\cellcolor{pink!40} \\ \hline
\cellcolor{pink!40} &\cellcolor{pink!40} &\cellcolor{pink!40} \\ \hline
\cellcolor{pink!40} &\cellcolor{pink!40} &\cellcolor{pink!40} \\ \hline
\end{tabular}
};
% )
......@@ -126,9 +126,9 @@
\node(tbv3) at ([xshift=0.5em,yshift=0]bra2.east){
\begin{tabular}{|c|}
\hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!30} \\ \hline
\rowcolor{orange!30} \\ \hline
\rowcolor{orange!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv3.north){$\mathbi{V}$};
......@@ -140,9 +140,9 @@
\node(result) at ([xshift=2em,yshift=0]eq3.east){
\begin{tabular}{|l|l|l|}
\hline
\cellcolor{red!20} &\cellcolor{red!20} &\cellcolor{red!20} \\ \hline
\cellcolor{red!20}&\cellcolor{red!20} &\cellcolor{red!20} \\ \hline
\cellcolor{red!20} &\cellcolor{red!20} &\cellcolor{red!20} \\ \hline
\cellcolor{red!30} &\cellcolor{red!30} &\cellcolor{red!30} \\ \hline
\cellcolor{red!30}&\cellcolor{red!30} &\cellcolor{red!30} \\ \hline
\cellcolor{red!30} &\cellcolor{red!30} &\cellcolor{red!30} \\ \hline
\end{tabular}
};
% x
......@@ -151,9 +151,9 @@
\node(tbv4) at ([xshift=0.5em,yshift=0]times.east){
\begin{tabular}{|c|}
\hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!30} \\ \hline
\rowcolor{orange!30} \\ \hline
\rowcolor{orange!30} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv4.north){$\mathbi{V}$};
......
\begin{tikzpicture}
\node[rounded corners=1pt,minimum width=11.0em,minimum height=2.0em,fill=pink!30,draw=black](p1) at (0,0) {\small{Self-Attention}};
\node[rounded corners=1pt,minimum width=11.0em,minimum height=2.0em,fill=pink!60,draw=black](p1) at (0,0) {\small{Self-Attention}};
\node[anchor=north](word1) at ([xshift=0.0em,yshift=-2.0em]p1.south) {\small \mathbi{K}};
\node[anchor=west](word2) at ([xshift=2.2em]word1.east) {\small \mathbi{V}};
......@@ -19,7 +19,7 @@
\node[anchor=north](caption1) at ([xshift=0.0em,yshift=-9.5em]p1.south){\small{(a) Self-Attention的输入}};
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\node[anchor=west,rounded corners=1pt,minimum width=14.0em,minimum height=2.0em,fill=pink!30,draw=black](p2) at ([xshift=5.0em]p1.east){\small{Encoder-Decoder Attention}};
\node[anchor=west,rounded corners=1pt,minimum width=14.0em,minimum height=2.0em,fill=pink!50,draw=black](p2) at ([xshift=5.0em]p1.east){\small{Encoder-Decoder Attention}};
\node[anchor=north](word1-2) at ([xshift=0.0em,yshift=-2.0em]p2.south) {\small \mathbi{K}};
\node[anchor=west](word2-2) at ([xshift=2.2em]word1-2.east) {\small \mathbi{V}};
......
......@@ -6,8 +6,8 @@
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!20];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!10!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
......
......@@ -2,12 +2,12 @@
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!10];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!30];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!30];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!30];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!20];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!40];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!20];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!10!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
......
......@@ -132,7 +132,7 @@
\multicolumn{1}{l|}{ConvS2S} & 25.16 & 40.46 & 1.5$\times 10^{20}$ \\
\multicolumn{1}{l|}{MoE} & 26.03 & 40.56 & 1.2$\times 10^{20}$ \\
\multicolumn{1}{l|}{Transformer (Base Model) } & 27.3 &38.1 & 3.3$\times 10^{18}$ \\
\multicolumn{1}{l|}{Transformer (Big Model)} & {\small\sffamily\bfseries{28.4}} & {\small\sffamily\bfseries{41.8}} & 2.3$\times 10^{19}$ \\
\multicolumn{1}{l|}{Transformer (Big Model)} & {\small\bfnew{28.4}} & {\small\bfnew{41.8}} & 2.3$\times 10^{19}$ \\
\end{tabular}
\end{table}
%----------------------------------------------
......@@ -158,19 +158,19 @@
\begin{itemize}
\vspace{0.5em}
\item {\small\sffamily\bfseries{自注意力子层}}\index{自注意力子层}(Self-Attention Sub-layer)\index{Self-Attention Sub-layer}:使用自注意力机制对输入的序列进行新的表示;
\item {\small\bfnew{自注意力子层}}\index{自注意力子层}(Self-Attention Sub-layer)\index{Self-Attention Sub-layer}:使用自注意力机制对输入的序列进行新的表示;
\vspace{0.5em}
\item {\small\sffamily\bfseries{前馈神经网络子层}}\index{前馈神经网络子层}(Feed-Forward Sub-layer)\index{Feed-Forward Sub-layer}:使用全连接的前馈神经网络对输入向量序列进行进一步变换;
\item {\small\bfnew{前馈神经网络子层}}\index{前馈神经网络子层}(Feed-Forward Sub-layer)\index{Feed-Forward Sub-layer}:使用全连接的前馈神经网络对输入向量序列进行进一步变换;
\vspace{0.5em}
\item {\small\sffamily\bfseries{残差连接}}(标记为“Add”):对于自注意力子层和前馈神经网络子层,都有一个从输入直接到输出的额外连接,也就是一个跨子层的直连。残差连接可以使深层网络的信息传递更为有效;
\item {\small\bfnew{残差连接}}(标记为“Add”):对于自注意力子层和前馈神经网络子层,都有一个从输入直接到输出的额外连接,也就是一个跨子层的直连。残差连接可以使深层网络的信息传递更为有效;
\vspace{0.5em}
\item {\small\sffamily\bfseries{层标准化}}(Layer Normalization):自注意力子层和前馈神经网络子层进行最终输出之前,会对输出的向量进行层标准化,规范结果向量取值范围,这样易于后面进一步的处理。
\item {\small\bfnew{层标准化}}(Layer Normalization):自注意力子层和前馈神经网络子层进行最终输出之前,会对输出的向量进行层标准化,规范结果向量取值范围,这样易于后面进一步的处理。
\vspace{0.5em}
\end{itemize}
\parinterval 以上操作就构成了Transformer的一层,各个模块执行的顺序可以简单描述为:Self-Attention $\to$ Residual Connection $\to$ Layer Normalization $\to$ Feed Forward Network $\to$ Residual Connection $\to$ Layer Normalization。编码器可以包含多个这样的层,比如,可以构建一个六层编码器,每层都执行上面的操作。最上层的结果作为整个编码的结果,会被传入解码器。
\parinterval 解码器的结构与编码器十分类似。它也是由若干层组成,每一层包含编码器中的所有结构,即:自注意力子层、前馈神经网络子层、残差连接和层标准化模块。此外,为了捕捉源语言的信息,解码器又引入了一个额外的{\small\sffamily\bfseries{编码-解码注意力子层}}\index{编码-解码注意力子层}(Encoder-Decoder Attention Sub-layer)\index{Encoder-Decoder Attention Sub-layer}。这个新的子层,可以帮助模型使用源语言句子的表示信息生成目标语言不同位置的表示。编码-解码注意力子层仍然基于自注意力机制,因此它和自注意力子层的结构是相同的,只是$\mathrm{query}$$\mathrm{key}$$\mathrm{value}$的定义不同。比如,在解码器端,自注意力子层的$\mathrm{query}$$\mathrm{key}$$\mathrm{value}$是相同的,它们都等于解码器每个位置的表示。而在编码-解码注意力子层中,$\mathrm{query}$是解码器每个位置的表示,此时$\mathrm{key}$$\mathrm{value}$是相同的,等于编码器每个位置的表示。图\ref{fig:12-5}给出了这两种不同注意力子层输入的区别。
\parinterval 解码器的结构与编码器十分类似。它也是由若干层组成,每一层包含编码器中的所有结构,即:自注意力子层、前馈神经网络子层、残差连接和层标准化模块。此外,为了捕捉源语言的信息,解码器又引入了一个额外的{\small\bfnew{编码-解码注意力子层}}\index{编码-解码注意力子层}(Encoder-Decoder Attention Sub-layer)\index{Encoder-Decoder Attention Sub-layer}。这个新的子层,可以帮助模型使用源语言句子的表示信息生成目标语言不同位置的表示。编码-解码注意力子层仍然基于自注意力机制,因此它和自注意力子层的结构是相同的,只是$\mathrm{query}$$\mathrm{key}$$\mathrm{value}$的定义不同。比如,在解码器端,自注意力子层的$\mathrm{query}$$\mathrm{key}$$\mathrm{value}$是相同的,它们都等于解码器每个位置的表示。而在编码-解码注意力子层中,$\mathrm{query}$是解码器每个位置的表示,此时$\mathrm{key}$$\mathrm{value}$是相同的,等于编码器每个位置的表示。图\ref{fig:12-5}给出了这两种不同注意力子层输入的区别。
%----------------------------------------------
\begin{figure}[htp]
......@@ -319,7 +319,7 @@
\subsection{多头注意力机制}
\parinterval Transformer中使用的另一项重要技术是{\small\sffamily\bfseries{多头注意力机制}}\index{多头注意力机制}(Multi-head Attention)\index{Multi-head Attention}。“多头”可以理解成将原来的$\mathbi{Q}$$\mathbi{K}$$\mathbi{V}$按照隐层维度平均切分成多份。假设切分$h$份,那么最终会得到$\mathbi{Q} = \{ \mathbi{Q}_1,...,\mathbi{Q}_h \}$$\mathbi{K}=\{ \mathbi{K}_1,...,\mathbi{K}_h \}$$\mathbi{V}=\{ \mathbi{V}_1,...,\mathbi{V}_h \}$。多头注意力就是用每一个切分得到的$\mathbi{Q}$$\mathbi{K}$$\mathbi{V}$独立的进行注意力计算,即第$i$个头的注意力计算结果$\mathbi{head}_i = \textrm{Attention}(\mathbi{Q}_i,\mathbi{K}_i, \mathbi{V}_i)$
\parinterval Transformer中使用的另一项重要技术是{\small\bfnew{多头注意力机制}}\index{多头注意力机制}(Multi-head Attention)\index{Multi-head Attention}。“多头”可以理解成将原来的$\mathbi{Q}$$\mathbi{K}$$\mathbi{V}$按照隐层维度平均切分成多份。假设切分$h$份,那么最终会得到$\mathbi{Q} = \{ \mathbi{Q}_1,...,\mathbi{Q}_h \}$$\mathbi{K}=\{ \mathbi{K}_1,...,\mathbi{K}_h \}$$\mathbi{V}=\{ \mathbi{V}_1,...,\mathbi{V}_h \}$。多头注意力就是用每一个切分得到的$\mathbi{Q}$$\mathbi{K}$$\mathbi{V}$独立的进行注意力计算,即第$i$个头的注意力计算结果$\mathbi{head}_i = \textrm{Attention}(\mathbi{Q}_i,\mathbi{K}_i, \mathbi{V}_i)$
\parinterval 下面根据图\ref{fig:12-12}详细介绍多头注意力的计算过程:
......
......@@ -38,7 +38,7 @@
\draw [->,thick]([yshift=-1.2em]eos.south) to (eos.south);
\end{scope}
\begin{scope}[yshift=-1.55in]
\begin{scope}[yshift=-1.45in]
\node (encoder) at (0,0) {来自编码器的信息};
\node (aa)[decoder,anchor=east] at ([xshift=5.5cm]encoder.east) {};
\node (y1y2b)[rectangle,anchor=south,inner sep=0.25em,densely dashed,draw] at ([yshift=-2.6em]aa.south) {$y_1\;y_2$};
......@@ -66,13 +66,13 @@
\draw [->,thick]([yshift=-1.4em]eos.south) to ([yshift=-0.05em]eos.south);
\end{scope}
\begin{scope}[yshift=-3.1in]
\begin{scope}[yshift=-2.9in]
\node (encoder) at (0,0) {来自编码器的信息};
\node (aa)[decoder,anchor=east] at ([xshift=5.5cm]encoder.east) {非自回归解码器};
%\node (encoder)[decoder,anchor=west,fill=red!20] at ([xshift=-2cm]aa.west) {编码器};
\draw [->,thick](encoder.east) to (aa.west);
\node (label)[anchor=south] at ([xshift=-4em,yshift=-4.3em]aa.south) {\small{(c) 非自回归解码}};
\node (label)[anchor=south] at ([xshift=-4em,yshift=-2.3em]aa.south) {\small{(c) 非自回归解码}};
\node (y2a)[anchor=north] at ([xshift=-1.5em,yshift=2.5em]aa.north) {$y_2$};
\node (y1a)[anchor=east] at ([xshift=-3em]y2a.east) {$y_1$};
......
......@@ -36,7 +36,6 @@
\item 多模型集成推断。
\vspace{0.5em}
\end{itemize}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
......@@ -116,13 +115,11 @@
\parinterval 机器翻译有两种常用的推断方式\ \dash \ 自左向右推断和自右向左推断。自左向右推断符合现实世界中人类的语言使用规律,因为人在翻译一个句子时,总是习惯从句子开始的部分向后生成\footnote{有些语言中,文字是自右向左书写,这时自右向左推断更符合人类使用这种语言的习惯。}。不过,有时候人也会使用当前单词后面的译文信息。也就是说,翻译也需要“未来” 的文字信息。于是很容易想到使用自右向左的方式对译文进行生成。
\parinterval 以上两种推断方式在神经机器翻译中都有应用,对于源语言句子$\seq{x}=\{x_1,\dots,x_m\}$和目标语言句子$\seq{y}=\{y_1,\dots,y_n\}$,自左向右推断可以被描述为:
\begin{eqnarray}
\funp{P}(\seq{y}\vert\seq{x}) &=& \prod_{j=1}^n \funp{P}(y_j\vert\seq{y}_{<j},\seq{x})
\label{eq:14-1}
\end{eqnarray}
\parinterval 自右向左推断可以被描述为:
\begin{eqnarray}
\funp{P}(\seq{y}\vert\seq{x}) &=&\prod_{j=1}^n \funp{P}(y_{n+1-j}\vert\seq{y}_{>n+1-j},\seq{x})
\label{eq:14-2}
......@@ -155,7 +152,6 @@
\begin{itemize}
\vspace{0.5em}
\item {\small\sffamily\bfseries{长度惩罚因子}}。用译文长度来归一化翻译概率是最常用的方法:对于源语言句子$\seq{x}$和译文句子$\seq{y}$,模型得分$\textrm{score}(\seq{x},\seq{y})$的值会随着译文$\seq{y}$ 的长度增大而减小。为了避免此现象,可以引入一个长度惩罚函数$\textrm{lp}(\seq{y})$,并定义模型得分如公式\eqref{eq:14-12}所示:
\begin{eqnarray}
\textrm{score}(\seq{x},\seq{y}) &=& \frac{\log \funp{P}(\seq{y}\vert\seq{x})}{\textrm{lp}(\seq{y})}
\label{eq:14-12}
......@@ -180,7 +176,6 @@
%----------------------------------------------------------------------------------------------------
\vspace{0.5em}
\item {\small\sffamily\bfseries{译文长度范围约束}}。为了让译文的长度落在合理的范围内,神经机器翻译的推断也会设置一个译文长度约束\upcite{Vaswani2018Tensor2TensorFN,KleinOpenNMT}。令$[a,b]$表示一个长度范围,可以定义:
\begin{eqnarray}
a &=& \omega_{\textrm{low}}\cdot |\seq{x}| \label{eq:14-3}\\
b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
......@@ -194,12 +189,13 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
\textrm{cp}(\seq{x},\seq{y}) &=& \beta \cdot \sum_{i=1}^{|\seq{x}|} \log(\textrm{min} (\sum_{j}^{|\seq{y}|} a_{ij} , 1))
\label{eq:14-6}
\end{eqnarray}
\noindent 其中,$\textrm{cp}(\seq{x},\seq{y}) $表示覆盖度模型,它度量了译文对源语言每个单词的覆盖程度。$\textrm{cp}(\seq{x},\seq{y}) $的定义中,$\beta$是一需要自行设置的超参数,$a_{ij}$表示源语言第$i$个位置与译文 第$j$个位置的注意力权重,这样$\sum \limits_{j}^{|\seq{y}|} a_{ij}$就可以用来衡量源语言第$i$个单词被翻译了“多少”,如果它大于1,表明翻译多了;如果小于1,表明翻译少了。公式\eqref{eq:14-6}会惩罚那些欠翻译的翻译假设。对覆盖度模型的一种改进形式是\upcite{li-etal-2018-simple}
\noindent 其中,$\textrm{cp}(\seq{x},\seq{y}) $表示覆盖度模型,它度量了译文对源语言每个单词的覆盖程度。$\textrm{cp}(\seq{x},\seq{y}) $的定义中,$\beta$是一需要自行设置的超参数,$a_{ij}$表示源语言第$i$个位置与译文 第$j$个位置的注意力权重,这样$\sum \limits_{j}^{|\seq{y}|} a_{ij}$就可以用来衡量源语言第$i$个单词被翻译了“多少”,如果它大于1,表明翻译多了;如果小于1,表明翻译少了。公式\eqref{eq:14-6}会惩罚那些欠翻译的翻译假设。对覆盖度模型的一种改进形式是\upcite{li-etal-2018-simple}
\begin{eqnarray}
\textrm{cp}(\seq{x},\seq{y}) &=& \sum_{i=1}^{|\seq{x}|} \log( \textrm{max} ( \sum_{j}^{|\seq{y}|} a_{ij},\beta))
\label{eq:14-7}
\end{eqnarray}
\noindent 公式\eqref{eq:14-7}将公式\eqref{eq:14-6}中的向下截断方式改为了向上截断。这样,模型可以对过翻译(或重复翻译)有更好的建模能力。不过,这个模型需要在开发集上细致地调整$\beta$,也带来了一定的额外工作量。此外,也可以将这种覆盖度单独建模并进行参数化,与翻译模型一同训练\upcite{Mi2016CoverageEM,TuModeling,Kazimi2017CoverageFC}。这样可以得到更加精细的覆盖度模型。
\vspace{0.5em}
\end{itemize}
......@@ -425,7 +421,6 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
\parinterval 目前主流的神经机器翻译的推断是一种{\small\sffamily\bfseries{自回归翻译}}\index{自回归翻译}(Autoregressive Translation)\index{Autoregressive Translation}过程。所谓自回归是一种描述时间序列生成的方式:对于目标序列$\seq{y}=\{y_1,\dots,y_n\}$,如果$j$时刻状态$y_j$的生成依赖于之前的状态$\{y_1,\dots,y_{j-1}\}$,而且$y_j$$\{y_1,\dots,y_{j-1}\}$构成线性关系,那么称目标序列$\seq{y}$的生成过程是自回归的。神经机器翻译借用了这个概念,但是并不要求$y_j$$\{y_1,\dots,y_{j-1}\}$构成线性关系,\ref{sec:14-2-1}节提到的自左向右翻译模型和自右向左翻译模型都属于自回归翻译模型。自回归模型在机器翻译任务上也有很好的表现,特别是配合束搜索往往能够有效地寻找近似最优译文。但是,由于解码器的每个步骤必须顺序地而不是并行地运行,自回归翻译模型会阻碍不同译文单词生成的并行化。特别是在GPU 上,翻译的自回归性会大大降低计算的并行度和设备利用率。
\parinterval 对于这个问题,研究人员也考虑移除翻译的自回归性,进行{\small\sffamily\bfseries{非自回归翻译}}\index{非自回归翻译}(Non-Autoregressive Translation,NAT)\index{Non-Autoregressive Translation}\upcite{Gu2017NonAutoregressiveNM}。一个简单的非自回归翻译模型将问题建模为公式\eqref{eq:14-9}
\begin{eqnarray}
\funp{P}(\seq{y}|\seq{x}) &=& \prod_{j=1}^n {\funp{P}(y_j|\seq{x})}
\label{eq:14-9}
......@@ -441,8 +436,6 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
\parinterval 在介绍非自回归模型的具体结构之前,先来看看如何实现一个简单的非自回归翻译模型。这里用标准的Transformer来举例。首先为了一次性生成所有的词,需要丢弃解码器对未来信息屏蔽的矩阵,从而去掉模型的自回归性。此外,还要考虑生成译文的长度。在自回归模型中,每步的输入是上一步解码出的结果,当预测到终止符<eos>时,序列的生成就自动停止了,然而非自回归模型却没有这样的特性,因此还需要一个长度预测器来预测出其长度,之后再用这个长度得到每个位置的表示,将其作为解码器的输入,进而完成整个序列的生成。
\parinterval\ref{fig:14-12}对比了自回归翻译模型和简单的非自回归翻译模型。可以看到这种自回归翻译模型可以一次性生成完整的译文。不过,高并行性也带来了翻译品质的下降。比如,在IWSLT 英德等数据上的BLEU[\%] 值只有个位数,而现在最好的自回归模型已经能够达到30左右的BLEU得分。这是因为每个位置词的预测只依赖于源语言句子$\seq{x}$,使得预测不准确。需要注意的是,图\ref{fig:14-12}(b)中将位置编码作为非自回归模型解码器的输入只是一个最简单的例子,在真实的系统中,非自回归解码器的输入一般是拷贝的源语言句子词嵌入与位置编码的融合。
%----------------------------------------------------------------------
\begin{figure}[htp]
\centering
......@@ -452,6 +445,10 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
\end{figure}
%----------------------------------------------------------------------
\parinterval\ref{fig:14-12}对比了自回归翻译模型和简单的非自回归翻译模型。可以看到这种自回归翻译模型可以一次性生成完整的译文。不过,高并行性也带来了翻译品质的下降。比如,在IWSLT 英德等数据上的BLEU[\%] 值只有个位数,而现在最好的自回归模型已经能够达到30左右的BLEU得分。这是因为每个位置词的预测只依赖于源语言句子$\seq{x}$,使得预测不准确。需要注意的是,图\ref{fig:14-12}(b)中将位置编码作为非自回归模型解码器的输入只是一个最简单的例子,在真实的系统中,非自回归解码器的输入一般是拷贝的源语言句子词嵌入与位置编码的融合。
\parinterval 完全独立地对每个词建模,会出现什么问题呢?来看一个例子,将汉语句子“干/得/好/!”翻译成英文,可以翻译成“Good job !”或者“Well done !”。假设生成这两种翻译的概率是相等的,即一半的概率是“Good job !”,另一半的概率是“Well done !”。由于非自回归模型的条件独立性假设,推断时第一个词“Good”和“Well”的概率是差不多大的,如果第二个词“job”和“done”的概率也差不多大,会使得模型生成出“Good done !”或者“Well job !”这样错误的翻译,如图\ref{fig:14-13}所示。这便是影响句子质量的关键问题,称之为{\small\sffamily\bfseries{多峰问题}}\index{多峰问题}(Multimodality Problem)\index{Multimodality Problem}\upcite{Gu2017NonAutoregressiveNM}。如何有效处理非自回归模型中的多峰问题 是提升非自回归模型质量的关键。
%----------------------------------------------------------------------
......@@ -630,7 +627,6 @@ b &=& \omega_{\textrm{high}}\cdot |\seq{x}| \label{eq:14-4}
\subsection{局部预测融合}
\parinterval 神经机器翻译模型对每个目标语言位置$j$的单词的概率分布进行预测\footnote{即对于目标语言词汇表中的每个单词$w_r$,计算$\funp{P}(y_j=w_r | \seq{y}_{<j},\seq{x})$},假设有$K$个神经机器翻译系统,那么每个系统$k$都可以独立计算这个概率分布,记为$\funp{P}_{k} (y_j | \seq{y}_{<j},\seq{x})$。于是,可以用如下方式融合这$K$个系统的预测:
\begin{eqnarray}
\funp{P}(y_{j} | \seq{y}_{<j},\seq{x}) &=& \sum_{k=1}^K \gamma_{k} \cdot \funp{P}_{k} (y_j | \seq{y}_{<j},\seq{x})
\label{eq:14-11}
......
......@@ -6,7 +6,7 @@
\tikzstyle{every node}=[scale=0.7]
\node(encoder_c)[coder]{\large{编码器}};
\node(encoder_s)[coder, right of = encoder_c, xshift=3.5cm, fill=red!30]{\large{编码器}};
\node(h_pre)[above of = encoder_c, yshift=1.3cm,scale=1.3]{${\mathbi{h}}_{\rm pre}$};
\node(h_pre)[above of = encoder_c, yshift=1.3cm,scale=1.3]{${\mathbi{h}}^{\rm pre}$};
\node(h)[above of = encoder_s, yshift=1.3cm,scale=1.3]{$\mathbi{h}$};
\node(cir)[circle,very thick, right of = h, draw=black!90,minimum width=0.5cm,xshift=1.1cm]{};
\draw[-,very thick,draw=black!90]([xshift=0.04cm]cir.west)--([xshift=-0.04cm]cir.east);
......
......@@ -546,7 +546,7 @@
\subsubsection{2. 多编码器结构}
\parinterval 另一种思路是对传统的编码器-解码器框架进行更改,引入额外的编码器来对上下文句子进行编码,该结构被称为多编码器结构\upcite{DBLP:conf/acl/LiLWJXZLL20,DBLP:conf/discomt/SugiyamaY19}。这种结构最早被应用在基于循环神经网络的篇章级翻译模型中\upcite{DBLP:journals/corr/JeanLFC17,DBLP:conf/coling/KuangX18,DBLP:conf/naacl/BawdenSBH18,DBLP:conf/pacling/YamagishiK19},后期证明在Transformer模型上同样适用\upcite{DBLP:journals/corr/abs-1805-10163,DBLP:conf/emnlp/ZhangLSZXZL18}。图\ref{fig:17-18}展示了一个基于Transformer模型的多编码器结构,基于源语言当前待翻译句子的编码表示$\mathbi{h}$和上下文句子的编码表示$\mathbi{h}^{\textrm pre}$,模型首先通过注意力机制提取上下文信息$\mathbi{d}$
\parinterval 另一种思路是对传统的编码器-解码器框架进行更改,引入额外的编码器来对上下文句子进行编码,该结构被称为多编码器结构\upcite{DBLP:conf/acl/LiLWJXZLL20,DBLP:conf/discomt/SugiyamaY19}。这种结构最早被应用在基于循环神经网络的篇章级翻译模型中\upcite{DBLP:journals/corr/JeanLFC17,DBLP:conf/coling/KuangX18,DBLP:conf/naacl/BawdenSBH18,DBLP:conf/pacling/YamagishiK19},后期证明在Transformer模型上同样适用\upcite{DBLP:journals/corr/abs-1805-10163,DBLP:conf/emnlp/ZhangLSZXZL18}。图\ref{fig:17-18}展示了一个基于Transformer模型的多编码器结构,基于源语言当前待翻译句子的编码表示$\mathbi{h}$和上下文句子的编码表示$\mathbi{h}^{\textrm pre}$,模型首先通过注意力机制提取句子间上下文信息$\mathbi{d}$
\begin{eqnarray}
\mathbi{d}&=&\textrm{Attention}(\mathbi{h},\mathbi{h}^{\textrm pre},\mathbi{h}^{\textrm pre})
\label{eq:17-3-3}
......@@ -604,7 +604,7 @@
\noindent 之后,分别计算词级和句子级注意力模型。需要注意的是句子级注意力添加了一个前馈全连接网络子层FFN。其具体计算方式如下:
\begin{eqnarray}
\mathbi{s}^k&=&\textrm{WordAttention}(\mathbi{q}_{w},\mathbi{h}^{k},\mathbi{h}^{k})
\mathbi{s}^k&=&\textrm{WordAttention}(\mathbi{q}_{w},\mathbi{h}^{\textrm {pre}k},\mathbi{h}^{\textrm{pre}k})
\label{eq:17-3-7}\\
\mathbi{d}_t&=&\textrm{FFN}(\textrm{SentAttention}(\mathbi{q}_{s},\mathbi{s},\mathbi{s}))
\label{eq:17-3-9}
......
......@@ -211,7 +211,6 @@ $计算这种切分的概率值。
%-------------------------------------------
\parinterval 以“确实现在数据很多”这个实例来说,如果把这句话按照“确实/现在/数据/很/多”这样的方式进行切分,这个句子切分的概率$\funp{P}$(确实/现在/数据/很/多) 可以通过每个词出现概率相乘的方式进行计算。
\begin{eqnarray}
&\funp&{P}\textrm{(确实/现在/数据/很/多)} \nonumber \\
& = &\funp{P}\textrm{(确实)} \cdot \funp{P}\textrm{(现在)} \cdot \funp{P}\textrm{(数据)} \cdot \funp{P}\textrm{(很)} \cdot \funp{P}\textrm{(多)}
......@@ -250,6 +249,8 @@ $计算这种切分的概率值。
\vspace{0.5em}
\end{itemize}
\parinterval\ref{fig:3.3-1}给出了不同标注格式所对应的标注结果。可以看出文本序列中的非命名实体直接被标注为“O”,而命名实体的标注则被分为了两部分:位置和命名实体类别,图中的“B”、“I”、“E”等标注出了位置信息,而“CIT”和“CNT”则标注出了命名实体类别(“CIT”表示城市,“CNT”表示国家)。可以看到,命名实体的识别结果可以通过BIO、BIOES这类序列标注结果归纳出来:例如在BIOES格式中,标签“B-CNT”后面的标签只会是“I-CNT”或“E-CNT”,而不会是其他的标签。同时,在命名实体识别任务中涉及到实体边界的确定,而“BIO”或“BIOES”的标注格式本身就暗含着边界问题:在“BIO”格式下,实体左边界只能在“B”的左侧,右边界只能在“B”或“I”的右侧;在“BIOES”格式下,实体左边界只能在“B”或“S”的左侧,右边界只能在“E”和“S”的右侧。
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -259,8 +260,6 @@ $计算这种切分的概率值。
\label{fig:3.3-1}
\end{figure}
%-------------------------------------------
%
\parinterval\ref{fig:3.3-1}给出了不同标注格式所对应的标注结果。可以看出文本序列中的非命名实体直接被标注为“O”,而命名实体的标注则被分为了两部分:位置和命名实体类别,图中的“B”、“I”、“E”等标注出了位置信息,而“CIT”和“CNT”则标注出了命名实体类别(“CIT”表示城市,“CNT”表示国家)。可以看到,命名实体的识别结果可以通过BIO、BIOES这类序列标注结果归纳出来:例如在BIOES格式中,标签“B-CNT”后面的标签只会是“I-CNT”或“E-CNT”,而不会是其他的标签。同时,在命名实体识别任务中涉及到实体边界的确定,而“BIO”或“BIOES”的标注格式本身就暗含着边界问题:在“BIO”格式下,实体左边界只能在“B”的左侧,右边界只能在“B”或“I”的右侧;在“BIOES”格式下,实体左边界只能在“B”或“S”的左侧,右边界只能在“E”和“S”的右侧。
\parinterval 需要注意的是,虽然图\ref{fig:3.3-1}中的命名实体识别以单词为基本单位进行标注,但真实系统中也可以在字序列上进行命名实体识别,其方法与基于词序列的命名实体识别是一样的。因此,这里仍然以基于词序列的方法为例进行介绍。
......@@ -345,6 +344,15 @@ $计算这种切分的概率值。
\parinterval 由于隐含状态序列之间存在转移概率,并且隐马尔可夫模型中隐含状态和可见状态之间存在着发射概率,因此根据可见状态的转移猜测隐含状态序列并非无迹可循。图\ref{fig:3.3-3}描述了如何使用隐马尔可夫模型来根据“抛硬币”结果推测挑选的硬币序列。可见,通过隐含状态之间的联系(绿色方框及它们之间的连线)可以对有序的状态进行描述,进而得到隐含状态序列所对应的可见状态序列(红色圆圈)。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter3/Figures/figure-example-of-hmm-in-coin-toss}
\caption{抛硬币的隐马尔可夫模型实例}
\label{fig:3.3-3}
\end{figure}
%-------------------------------------------
\parinterval 从统计建模的角度看,上述过程本质上是在描述隐含状态和可见状态出现的联合概率。这里,用$\seq{x}=(x_1,...,x_m)$表示可见状态序列,用$\seq{y}=(y_1,...,y_m)$表示隐含状态序列。(一阶)隐马尔可夫模型假设:
\begin{itemize}
......@@ -366,14 +374,6 @@ $计算这种切分的概率值。
\noindent 这里,$y_{0}$表示一个虚拟的隐含状态。这样,可以定义$\funp{P}(y_1|y_{0}) \equiv \funp{P}(y_1)$,它表示起始隐含状态出现的概率。隐马尔可夫模型的假设也大大化简了问题,因此可以通过式\eqref{eq:joint-prob-xy}很容易地计算隐含状态序列和可见状态序列出现的概率。值得注意的是,发射概率和转移概率都可以被看作是描述序列生成过程的“特征”。但是,这些“特征”并不是随意定义的,而是符合问题的概率解释。而这种基于事件发生的逻辑所定义的概率生成模型,通常可以被看作是一种{\small\sffamily\bfseries{生成式模型}}\index{生成式模型}(Generative Model)\index{Generative Model}
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter3/Figures/figure-example-of-hmm-in-coin-toss}
\caption{抛硬币的隐马尔可夫模型实例}
\label{fig:3.3-3}
\end{figure}
%-------------------------------------------
\parinterval 一般来说,隐马尔可夫模型中包含下面三个问题:
\begin{itemize}
......@@ -399,10 +399,7 @@ $计算这种切分的概率值。
\parinterval 一种简单的办法是使用相对频次估计得到转移概率和发射概率估计值。令$x_i$表示第$i$个位置的可见状态,$y_i$表示第$i$个位置的隐含状态,$\funp{P}(y_i|y_{i-1})$表示第$i-1$个位置到第$i$个位置的状态转移概率,$\funp{P}(x_i|y_{i}) $表示第$i$个位置的发射概率,于是有:
\begin{eqnarray}
\funp{P}(y_i|y_{i-1}) &=& \frac{{c}(y_{i-1},y_i)}{{c}(y_{i-1})}
\label{eq:3.3-1}
\end{eqnarray}
\begin{eqnarray}
\label{eq:3.3-1}\\
\funp{P}(x_i|y_{i}) &=& \frac{{c}(x_i,y_i)}{{c}(y_i)}
\label{eq:3.3-2}
\end{eqnarray}
......@@ -422,7 +419,6 @@ $计算这种切分的概率值。
\end{eqnarray}
\parinterval 将式\eqref{eq:joint-prob-xy}带入式\eqref{eq:markov-sequence-argmax}可以得到最终计算公式,如下:
\begin{eqnarray}
\hat{\seq{y}} &=& \arg\max_{\seq{y}}\prod_{i=1}^{m}\funp{P}(x_i|y_i)\funp{P}(y_i|y_{i-1})
\label{eq:3.3-5}
......@@ -446,7 +442,6 @@ $计算这种切分的概率值。
\subsubsection{2. 条件随机场}
\parinterval 隐马尔可夫模型有一个很强的假设:一个隐含状态出现的概率仅由上一个隐含状态决定。这个假设也会带来一些问题,举个例子:在某个隐马尔可夫模型中,隐含状态集合为\{$A, B, C, D$\},可见状态集合为\{$T, F$\},其中隐含状态$A$可能的后继隐含状态集合为\{$A, B$\},隐含状态$B$可能的后继隐含状态集合为\{$A, B, C, D$\},于是有:
\begin{eqnarray}
\funp{P}(A|A)+\funp{P}(A|B) & = & 1 \label{eq:3.3-6} \\
\funp{P}(A|B)+\funp{P}(B|B)+\funp{P}(C|B)+\funp{P}(D|B) & = & 1 \label{eq:3.3-7}
......@@ -474,14 +469,12 @@ F(y_{i-1},y_i,\seq{x},i) & = & t(y_{i-1},y_i,\seq{x},i)+s(y_i,\seq{x},i)
\end{eqnarray}
\parinterval 实际上,基于特征函数的方法更像是对隐含状态序列的一种打分:根据人为设计的模板(特征函数),测试隐含状态之间的转换以及隐含状态与可见状态之间的对应关系是否符合这种模板。在处理序列问题时,假设可见状态序列$\seq{x}$的长度和待预测隐含状态序列$\seq{y}$的长度均为$m$,且共设计了$k$个特征函数,则有:
\begin{eqnarray}
\funp{P}(\seq{y}|\seq{x}) & = & \frac{1}{Z(\seq{x})}\exp(\sum_{i=1}^m\sum_{j=1}^{k}\lambda_{j}F_{j}(y_{i-1},y_i,\seq{x},i))
\label{eq:3.3-9}
\end{eqnarray}
\parinterval 公式\eqref{eq:3.3-9}中的$Z(x)$即为上面提到的实现全局统计归一化的归一化因子,其计算方式为:
\begin{eqnarray}
Z(\seq{x})&=&\sum_{\seq{y}}\exp(\sum_{i=1}^m\sum_{j=1}^k\lambda_{j}F_{j}(y_{i-1},y_i,\seq{x},i))
\label{eq:3.3-10}
......@@ -510,6 +503,8 @@ Z(\seq{x})&=&\sum_{\seq{y}}\exp(\sum_{i=1}^m\sum_{j=1}^k\lambda_{j}F_{j}(y_{i-1}
\parinterval 基于概率图的模型将序列表示为有向图或无向图,如图\ref{fig:3.3-7}(a)、(b)所示。这种方法增加了建模的复杂度。既然要得到每个位置的类别输出,另一种更加直接的方法是使用分类器对每个位置进行独立预测。分类器是机器学习中广泛使用的方法,它可以根据输入自动地对类别进行预测。如图\ref{fig:3.3-7}(c)所示,对于序列标注任务,分类器把每一个位置所对应的所有特征看作是输入,而把这个位置对应的标签看作输出。从这个角度说,隐马尔可夫模型等方法实际上也是在进行一种“分类”操作,只不过这些方法考虑了不同位置输出(或隐含状态)之间的依赖。
\parinterval 值得注意的是分类模型可以被应用于序列标注之外的很多任务,在后面的章节中还会看到,机器翻译中的很多模块也借鉴了统计分类的思想。其中使用到的基础数学模型和特征定义形式,与这里提到的分类器本质上是一样的。
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -521,8 +516,6 @@ Z(\seq{x})&=&\sum_{\seq{y}}\exp(\sum_{i=1}^m\sum_{j=1}^k\lambda_{j}F_{j}(y_{i-1}
\end{figure}
%-------------------------------------------
\parinterval 值得注意的是分类模型可以被应用于序列标注之外的很多任务,在后面的章节中还会看到,机器翻译中的很多模块也借鉴了统计分类的思想。其中使用到的基础数学模型和特征定义形式,与这里提到的分类器本质上是一样的。
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
......@@ -575,6 +568,8 @@ Z(\seq{x})&=&\sum_{\seq{y}}\exp(\sum_{i=1}^m\sum_{j=1}^k\lambda_{j}F_{j}(y_{i-1}
\parinterval {\small\sffamily\bfseries{句法}}\index{句法}(Syntax)\index{Syntax}是研究句子的每个组成部分和它们之间的组合方式。一般来说,句法和语言是相关的,比如,英文是主谓宾结构,而日语是主宾谓结构,因此不同的语言也会有不同的句法描述方式。自然语言处理领域最常用的两种句法分析形式是{\small\sffamily\bfseries{短语结构分析}}\index{短语结构分析}(Phrase Structure Parsing)\index{Phrase Structure Parsing}{\small\sffamily\bfseries{依存分析}}\index{依存分析}(Dependency Parsing)\index{Dependency Parsing}。图\ref{fig:3.4-1}展示了这两种的句法表示形式的实例。其中,左侧是短语结构树,它描述的是短语的结构功能,比如“吃”是动词(记为VV),“鱼”是名词(记为NN),“吃/鱼”组成动词短语,这个短语再与“喜欢”这一动词组成新的动词短语。短语结构树的每个子树都是一个句法功能单元,比如,子树VP(VV(吃) NN(鱼))就表示了“吃/鱼”这个动词短语的结构,其中子树根节点VP是句法功能标记。短语结构树利用嵌套的方式描述了语言学的功能,短语结构树中,每个词都有词性(或词类),不同的词或者短语可以组成名动结构、动宾结构等语言学短语结构,短语结构分析一般也被称为{\small\sffamily\bfseries{成分分析}}\index{成分分析}(Constituency Parsing)或{\small\sffamily\bfseries{完全分析}}\index{完全分析}(Full Parsing)\index{Full Parsing}
\parinterval\ref{fig:3.4-1}右侧展示的是另一种句法结构,被称作依存句法树。依存句法树表示了句子中单词和单词之间的依存关系。比如,从这个例子可以了解,“猫”依赖“喜欢”,“吃”依赖“喜欢”,“鱼”依赖“吃”。
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -584,8 +579,6 @@ Z(\seq{x})&=&\sum_{\seq{y}}\exp(\sum_{i=1}^m\sum_{j=1}^k\lambda_{j}F_{j}(y_{i-1}
\end{figure}
%---------------------------
\parinterval\ref{fig:3.4-1}右侧展示的是另一种句法结构,被称作依存句法树。依存句法树表示了句子中单词和单词之间的依存关系。比如,从这个例子可以了解,“猫”依赖“喜欢”,“吃”依赖“喜欢”,“鱼”依赖“吃”。
\parinterval 短语结构树和依存句法树的结构和功能有很大不同。短语结构树的叶子节点是单词,中间节点是词性或者短语句法标记。在短语结构分析中,通常把单词称作{\small\sffamily\bfseries{终结符}}\index{终结符}(Terminal)\index{Terminal},把词性称为{\small\sffamily\bfseries{预终结符}}\index{预终结符}(Pre-terminal)\index{Pre-terminal},而把其他句法标记称为{\small\sffamily\bfseries{非终结符}}\index{非终结符}(Non-terminal)\index{Non-terminal}。依存句法树没有预终结符和非终结符,所有的节点都是句子里的单词,通过不同节点间的连线表示句子中各个单词之间的依存关系。每个依存关系实际上都是有方向的,头和尾分别指向“接受”和“发出”依存关系的词。依存关系也可以进行分类,例如,图\ref{fig:3.4-1}中的对每个依存关系的类型都有一个标记,这也被称作是有标记的依存分析。如果不生成这些标记,这样的句法分析被称作无标记的依存分析。
\parinterval 虽然短语结构树和依存树的句法表现形式有很大不同,但是它们在某些条件下能相互转化。比如,可以使用启发性规则将短语结构树自动转化为依存树。从应用的角度,依存分析由于形式更加简单,而且直接建模词语之间的依赖,因此在自然语言处理领域中受到很多关注。在机器翻译中,无论是哪种句法树结构,都已经被证明会对机器翻译系统产生帮助。特别是短语结构树,在机器翻译中的应用历史更长,研究更为深入,因此本节将会以短语结构分析为例介绍句法分析的相关概念。
......@@ -722,6 +715,8 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
\parinterval 比如,使用前面的示例文法,可以对“猫/喜欢/吃/鱼”进行分析,并形成句法分析树(图\ref{fig:3.4-3})。从起始非终结符IP开始,使用唯一拥有IP作为左部的规则$r_8$推导出NP和VP,之后依次使用规则$r_5$$r_1$$r_7$$r_2$$r_6$$r_3$$r_4$,得到了完整的句法树。
\parinterval 通常,可以把推导简记为$d=r_1 \circ r_2 \circ ... \circ r_n$,其中$ \circ $表示规则的组合。显然,$d$也对应了树形结构,也就是句法分析结果。从这个角度看,推导就是描述句法分析树的一种方式。此外,规则的推导也把规则的使用过程与生成的字符串对应起来。一个推导所生成的字符串,也被称作文法所产生的一个{\small\sffamily\bfseries{句子}}\index{句子}(Sentence)\index{Sentence}。而一个文法所能生成的所有句子的集合是这个文法所对应的{\small\sffamily\bfseries{语言}}\index{语言}(Language)\index{Language}
%-------------------------------------------
\begin{figure}[htp]
\centering
......@@ -731,8 +726,6 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
\end{figure}
%-------------------------------------------
\parinterval 通常,可以把推导简记为$d=r_1 \circ r_2 \circ ... \circ r_n$,其中$ \circ $表示规则的组合。显然,$d$也对应了树形结构,也就是句法分析结果。从这个角度看,推导就是描述句法分析树的一种方式。此外,规则的推导也把规则的使用过程与生成的字符串对应起来。一个推导所生成的字符串,也被称作文法所产生的一个{\small\sffamily\bfseries{句子}}\index{句子}(Sentence)\index{Sentence}。而一个文法所能生成的所有句子的集合是这个文法所对应的{\small\sffamily\bfseries{语言}}\index{语言}(Language)\index{Language}
\parinterval 但是,句子和规则的推导并不是一一对应的。同一个句子,往往有很多推导的方式,这种现象被称为{\small\sffamily\bfseries{歧义}}\index{歧义}(Ambiguity)\index{Ambiguity}。甚至同一棵句法树,也可以对应不同的推导,图\ref{fig:3.4-4} 给出同一棵句法树所对应的两种不同的规则推导。
%-------------------------------------------
......@@ -747,6 +740,10 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
\parinterval 显然,规则顺序的不同会导致句法树的推导这一确定的过程变得不确定,因此,需要进行{\small\sffamily\bfseries{消歧}}\index{消歧}(Disambiguation)\index{Disambiguation}。这里,可以使用启发式方法:要求规则使用都服从最左优先原则,这样得到的推导被称为{\small\sffamily\bfseries{最左优先推导}}\index{最左优先推导}(Left-most Derivation)\index{Left-most Derivation}。图\ref{fig:3.4-4}中的推导1 就是符合最左优先原则的推导。
\parinterval 这样,对于一个上下文无关文法,每一棵句法树都有唯一的最左推导与之对应。于是,句法分析可以被描述为:对于一个句子找到能够生成它的最佳推导,这个推导所对应的句法树就是这个句子的句法分析结果。
\parinterval 不过问题又回来了,怎样才能知道什么样的推导或者句法树是“最佳”的呢?如图\ref{fig:3.4-5}所示,对于语言学专家,他们可以很确定地分辨出哪些句法树是正确的,哪些句法树是错误。甚至普通人也可以通过一些课本中学到的知识产生一些模糊的判断。而计算机如何进行判别呢?沿着前面介绍的统计建模的思想,计算机可以得出不同句法树出现的概率,进而选择概率最高的句法树作为输出,而这正是统计句法分析所做的事情。
%-------------------------------------------
\begin{figure}[htp]
\centering
......@@ -756,10 +753,6 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
\end{figure}
%-------------------------------------------
\parinterval 这样,对于一个上下文无关文法,每一棵句法树都有唯一的最左推导与之对应。于是,句法分析可以被描述为:对于一个句子找到能够生成它的最佳推导,这个推导所对应的句法树就是这个句子的句法分析结果。
\parinterval 不过问题又回来了,怎样才能知道什么样的推导或者句法树是“最佳”的呢?如图\ref{fig:3.4-5}所示,对于语言学专家,他们可以很确定地分辨出哪些句法树是正确的,哪些句法树是错误。甚至普通人也可以通过一些课本中学到的知识产生一些模糊的判断。而计算机如何进行判别呢?沿着前面介绍的统计建模的思想,计算机可以得出不同句法树出现的概率,进而选择概率最高的句法树作为输出,而这正是统计句法分析所做的事情。
\parinterval 在统计句法分析中,需要对每个推导进行统计建模,于是定义一个模型$\funp{P}( \cdot )$,对于任意的推导$d$,都可以用$\funp{P}(d)$计算出推导$d$的概率。这样,给定一个输入句子,我们可以对所有可能的推导用$\funp{P}(d)$计算其概率值,并选择概率最大的结果作为句法分析的结果输出(图\ref{fig:3.4-6})。
%-------------------------------------------
......@@ -829,7 +822,6 @@ r_6: & & \textrm{VP} \to \textrm{VV}\ \textrm{NN} \nonumber
\parinterval 这也对应了词串“吃/鱼”的生成过程。首先,从起始非终结符VP开始,使用规则$r_6$生成两个非终结符VV和NN;进一步,分别使用规则$r_3$$r_4$从VV和NN进一步生成单词“吃”和“鱼”。整个过程的概率等于三条规则概率的乘积。
\parinterval 新的问题又来了,如何得到规则的概率呢?这里仍然可以从数据中学习文法规则的概率。假设有人工标注的数据,它包括很多人工标注句法树的句法,称之为{\small\sffamily\bfseries{树库}}\index{树库}(Treebank)\index{Treebank}。然后,对于规则$\textrm{r}:\alpha \to \beta$可以使用基于频次的方法:
\begin{eqnarray}
\funp{P}(r) &=& \frac{\text{规则$r$在树库中出现的次数}}{\alpha \text{在树库中出现的次数}}
\label{eq:3.4-8}
......
......@@ -147,7 +147,6 @@
\begin{itemize}
\item {\small\sffamily\bfseries{根据系统胜出的次数进行排序}}\upcite{DBLP:conf/wmt/Callison-BurchK12}。以系统${S}_j$和系统${S}_k$为例,两个系统都被比较了$\textrm{C}_n^5 \times 4 \times 3$ 次,其中系统${S}_j$获胜20次,系统${S}_k$获胜30次,总体排名中系统${S}_k$优于系统${S}_j$
\item {\small\sffamily\bfseries{根据冲突次数进行排序}}\upcite{DBLP:conf/wmt/Lopez12}。第一种排序策略中存在冲突现象:例如在每次两两比较中,系统${S}_j$胜过系统${S}_k$ 的次数比系统${S}_j$不敌系统${S}_k$的次数多,若待评价系统仅有系统${S}_j$${S}_k$,显然系统${S}_j$的排名高于系统${S}_k$。但当待评价系统很多时,可能系统${S}_j$在所有比较中获胜的次数低于系统${S}_k$,此时就出现了总体排序与局部排序不一致的冲突。因此,有研究者提出,能够与局部排序冲突最少的总体排序才是最合理的。令$O$表示一个对若干个系统的排序,该排序所对应的冲突定义为:
\begin{eqnarray}
\textrm{conflict}(O) =\sum\limits_{{{S}_j},{{S}_k} \in O,j \ne k} {{\textrm{max}}(0,\textrm{count}_{\textrm{win}}({{S}_j},{{S}_k}) - \textrm{count}_{\textrm{loss}}({{S}_j},{{S}_k}))}
\label{eq:4-1}
......@@ -155,7 +154,6 @@
其中,${S}_j$${S}_k$是成对比较的两个系统,$\textrm{count}_{\textrm{win}}({S}_j,{S}_k)$$\textrm{count}_{\textrm{loss}}({S}_j,{S}_k)$分别是${S}_j$${S}_k$进行成对比较时系统${S}_j$ 胜利和失败的次数。而使得$\textrm{conflict}(O)$最低的$O$就是最终的系统排序结果。
\item {\small\sffamily\bfseries{根据某系统最终获胜的期望进行排序}}\upcite{DBLP:conf/iwslt/Koehn12}。以系统${S}_j$为例,若共有$n$个待评价的系统,则进行总体排序时系统 ${S}_j$ 的得分为其最终获胜的期望,即:
\begin{eqnarray}
\textrm{score}({{S}_j}) &=& \frac{1}{n}\sum\limits_{k,k \ne j} {\frac{\textrm{count}_{\textrm{win}}({{S}_j},{{S}_k})}{{\textrm{count}_{\textrm{win}}({{S}_j},{{S}_k}) + \textrm{count}_{\textrm{loss}}({{S}_j},{{S}_k})}}}
\label{eq:4-2}
......@@ -334,7 +332,6 @@
\label{fig:4-6}
\end{figure}
%----------------------------------------------
\vspace{0.5em}
\item 在得到机器译文与参考答案的对齐关系后,需要基于对齐关系计算准确率和召回率。
准确率:机器译文中命中单词数与机器译文单词总数的比值。即:
......@@ -358,7 +355,7 @@
\vspace{0.5em}
\end{itemize}
\parinterval 在上文提到的评价指标中,无论是准确率、召回率还是$\textrm F_{mean}$,都是基于单个词汇信息衡量译文质量,而忽略了语序问题。为了将语序问题考虑进来,Meteor会考虑更长的匹配:将机器译文按照最长匹配长度分块,并对“块数”较多的机器译文给予惩罚。例如图\ref{fig:4-6}显示的最终词对齐结果中,机器译文被分为了三个“块”——“Can I have it”、“like he”、“?”在这种情况下,看起来上例中的准确率、召回率都还不错,但最终会受到很严重的惩罚。这种罚分机制能够识别出机器译文中的词序问题,因为当待测译文词序与参考答案相差较大时,机器译文将会被分割得比较零散,这种惩罚机制的计算公式如式\eqref{eq:4-11},其中$\textrm {count}_{\textrm{chunks}}$表示匹配的块数。
\parinterval 在上文提到的评价指标中,无论是准确率、召回率还是$\textrm F_{mean}$,都是基于单个词汇信息衡量译文质量,而忽略了语序问题。为了将语序问题考虑进来,Meteor会考虑更长的匹配:将机器译文按照最长匹配长度分块,由于“块数”较多的机器译文与参考答案的对齐更加散乱,意味着其语序问题更多,因此Meteor会对这样的译文给予惩罚。例如图\ref{fig:4-6}显示的最终词对齐结果中,机器译文被分为了三个“块”——“Can I have it”、“like he”、“?”在这种情况下,看起来上例中的准确率、召回率都还不错,但最终会受到很严重的惩罚。这种罚分机制能够识别出机器译文中的词序问题,因为当待测译文词序与参考答案相差较大时,机器译文将会被分割得比较零散,这种惩罚机制的计算公式如式\eqref{eq:4-11},其中$\textrm {count}_{\textrm{chunks}}$表示匹配的块数。
\begin{eqnarray}
\textrm {Penalty} &=& 0.5 \cdot {\left({\frac{{\textrm {count}}_{\textrm {chunks}}}{\textrm {count}_{\textrm{hit}}}} \right)^3}
\label{eq:4-11}
......@@ -523,6 +520,11 @@ His house is on the south bank of the river .
\parinterval 在DREEM中,分布式表示的选取是一个十分关键的问题,理想的情况下,分布式表示应该涵盖句子在词汇、句法、语法、语义、依存关系等各个方面的信息。目前常见的分布式表示方式如表\ref{tab:4-2}所示。除此之外,还可以通过词袋模型、循环神经网络等将词向量表示转换为句子向量表示。
\parinterval DREEM方法中选取了能够反映句子中使用的特定词汇的One-hot向量、能够反映词汇信息的词嵌入向量\upcite{bengio2003a}、能够反映句子的合成语义信息的{\small\sffamily\bfseries{递归自动编码}}\index{递归自动编码}(Recursive Auto-encoder Embedding,RAE)\index{Recursive Auto-encoder Embedding},这三种表示级联在一起,最终形成句子的向量表示。在得到机器译文和参考答案的上述分布式表示后,利用余弦相似度和长度惩罚对机器译文质量进行评价。机器译文$o$和参考答案$g$之间的相似度如公式\eqref{eq:4-16}所示,其中${v_i}(o)$${v_i}(g)$分别是机器译文和参考答案的向量表示中的第$i$ 个元素,$N$是向量表示的维度大小。
\begin{eqnarray}
\textrm {cos}(t,r) &=& \frac{{\sum\limits_{i = 1}^N {{v_i}(o) \cdot {v_i}(g)} }}{{\sqrt {\sum\limits_{i = 1}^N {v_i^2(o)} } \sqrt {\sum\limits_{i = 1}^N {v_i^2(g)} } }}
\label{eq:4-16}
\end{eqnarray}
\begin{table}[htp]{
\begin{center}
\caption{常见的单词及句子分布表示}
......@@ -542,12 +544,6 @@ His house is on the south bank of the river .
\end{center}
}\end{table}
\parinterval DREEM方法中选取了能够反映句子中使用的特定词汇的One-hot向量、能够反映词汇信息的词嵌入向量\upcite{bengio2003a}、能够反映句子的合成语义信息的{\small\sffamily\bfseries{递归自动编码}}\index{递归自动编码}(Recursive Auto-encoder Embedding,RAE)\index{Recursive Auto-encoder Embedding},这三种表示级联在一起,最终形成句子的向量表示。在得到机器译文和参考答案的上述分布式表示后,利用余弦相似度和长度惩罚对机器译文质量进行评价。机器译文$o$和参考答案$g$之间的相似度如公式\eqref{eq:4-16}所示,其中${v_i}(o)$${v_i}(g)$分别是机器译文和参考答案的向量表示中的第$i$ 个元素,$N$是向量表示的维度大小。
\begin{eqnarray}
\textrm {cos}(t,r) &=& \frac{{\sum\limits_{i = 1}^N {{v_i}(o) \cdot {v_i}(g)} }}{{\sqrt {\sum\limits_{i = 1}^N {v_i^2(o)} } \sqrt {\sum\limits_{i = 1}^N {v_i^2(g)} } }}
\label{eq:4-16}
\end{eqnarray}
\parinterval 在此基础上,DREEM方法还引入了长度惩罚项,对与参考答案长度相差太多的机器译文进行惩罚,长度惩罚项如公式\eqref{eq:4-17}所示,其中${l_o}$${l_g}$分别是机器译文和参考答案长度:
\begin{eqnarray}
\textrm{BP} &=& \left\{ \begin{array}{l}
......@@ -569,15 +565,12 @@ His house is on the south bank of the river .
\parinterval 本质上,分布式表示是一种对句子语义的一种统计表示。因此,它可以帮助评价系统捕捉一些从简单的词或者句子片段中不易发现的现象,进而进行更深层的句子匹配。
\parinterval 在DREEM方法取得成功后,基于词嵌入的词对齐自动评价方法被提出\upcite{DBLP:journals/corr/MatsuoKS17},该方法中先得到机器译文与参考答案的词对齐关系后,通过对齐关系中两者的词嵌入相似度来计算机器译文与参考答案的相似度,公式如下:
\parinterval 在DREEM方法取得成功后,基于词嵌入的词对齐自动评价方法被提出\upcite{DBLP:journals/corr/MatsuoKS17},该方法中先得到机器译文与参考答案的词对齐关系后,通过对齐关系中两者的词嵌入相似度来计算机器译文与参考答案的相似度,公式如\eqref{eq:4-19}。其中,$o$是机器译文,$g$是参考答案,$m$表示译文$o$的长度,$l$表示参考答案$g$的长度,函数$\varphi(o,g,i,j)$用来计算$o$中第$i$个词和$g$中第$j$个词之间对齐关系的相似度。:
\begin{eqnarray}
\textrm{ASS}(o,g) &=& \frac{1}{{m \cdot l}}\sum\limits_{i = 1}^{m} {\sum\limits_{j = 1}^{l} {\varphi (o,g,i,j)} }
\label{eq:4-19}
\end{eqnarray}
\noindent 其中,$o$是机器译文,$g$是参考答案,$m$表示译文$o$的长度,$l$表示参考答案$g$的长度,函数$\varphi(o,g,i,j)$用来计算$o$中第$i$个词和$g$中第$j$个词之间对齐关系的相似度。
\parinterval 此外,将分布式表示与相对排序融合也是一个很有趣的想法\upcite{DBLP:journals/csl/GuzmanJMN17},在这个尝试中,研究人员利用分布式表示提取参考答案和多个机器译文中的句法信息和语义信息,利用神经网络模型对多个机器译文进行排序。
\parinterval 在基于分布式表示的这类译文质量评价方法中,译文和参考答案的所有词汇信息和句法语义信息都被包含在句子的分布式表示中,克服了单一参考答案的限制。但是同时也带来了新的问题,一方面将句子转化成分布式表示使评价过程变得不那么具有可解释性,另一方面分布式表示的质量也会对评价结果有较大的影响。
......@@ -647,7 +640,6 @@ His house is on the south bank of the river .
%----------------------------------------------
\parinterval 回到机器翻译的问题中来。一个更加基础的问题是:一个系统评价结果的变化在多大范围内是不显著的。利用假设检验的原理,这个问题可以被描述为:评价结果落在$[x-d,x+d]$区间的置信度是$1-\alpha$。换句话说,当系统性能落在$[x-d, x+d]$外,就可以说这个结果与原始的结果有显著性差异。这里$x$通常是系统译文的BLEU计算结果,$[x-d,x+d]$是其对应的置信区间。而$d$$\alpha$有很多计算方法,比如,如果假设评价结果服从正态分布,可以简单的计算$d$
\begin{eqnarray}
d&=&t \frac{s}{\sqrt{n}}
\label{eq:4-21}
......@@ -798,7 +790,7 @@ d&=&t \frac{s}{\sqrt{n}}
\label{eq:4-20}
\end{eqnarray}
\parinterval 这种质量评估方式往往以单词级质量评估为基础,在其结果的基础上进行计算。以实例\ref{eg:4-7}中词级质量评估结果为例,与编辑后结果相比较,机器翻译译文中有四处漏译(“Mit”、“können”、“Sie”、“einzelne”)、三处误译(“dem”、\\“Scharfzeichner”、“scharfzeichnen”分别被误译为“Der”、“Schärfen-Werkezug”、“Schärfer”)、一处多译(“erscheint”),因而需要进行4次插入操作、3次替换操作和1次删除操作,而最终译文长度为12,则有$\textrm HTER=(4+3+1)/12=0.667$。需要注意的是,即便这种评估方式以单词级质量评估为基础,也不意味这句子级质量评估只是在单词级质量评估的结果上通过简单的计算来获得其得分,在实际研究中,常将其视为一个回归问题,利用大量数据学习其评分规则。
\parinterval 这种质量评估方式往往以单词级质量评估为基础,在其结果的基础上进行计算。以实例\ref{eg:4-7}中词级质量评估结果为例,与编辑后结果相比较,机器翻译译文中有四处漏译(“Mit”、“können”、“Sie”、“einzelne”)、三处误译(“dem”、\\“Scharfzeichner”、“scharfzeichnen”分别被误译为“Der”、“Schärfen-Werkezug”、\\“Schärfer”)、一处多译(“erscheint”),因而需要进行4次插入操作、3次替换操作和1次删除操作,而最终译文长度为12,则有${\textrm {HTER}}=(4+3+1)/12=0.667$。需要注意的是,即便这种评估方式以单词级质量评估为基础,也不意味这句子级质量评估只是在单词级质量评估的结果上通过简单的计算来获得其得分,在实际研究中,常将其视为一个回归问题,利用大量数据学习其评分规则。
\vspace{0.5em}
\end{itemize}
......@@ -873,7 +865,6 @@ d&=&t \frac{s}{\sqrt{n}}
\item {\small\sffamily\bfseries{充分度特征}}:反映了源文和机器译文在不同语言层次上的密切程度或关联程度。
\vspace{0.5em}
\end{itemize}
\parinterval 随着深度学习技术的发展,另一种思路是使用表示学习技术生成句子的分布式表示,并在此基础上利用神经网络自动提取高度抽象的句子特征\upcite{DBLP:conf/wmt/KreutzerSR15,DBLP:conf/wmt/MartinsAHK16,DBLP:conf/wmt/ChenTZXZLW17},这样就避免了人工设计特征所带来的时间以及人工代价,同时表示学习所得到的分布式表示可以涵盖更多人工设计难以捕获到的特征,更加全面地反映句子的特点,因此在质量评估任务上也取得了很好的效果\upcite{kreutzer2015quality,DBLP:conf/wmt/ShahLPBBBS15,DBLP:conf/wmt/ScartonBSSS16,DBLP:conf/wmt/AbdelsalamBE16,DBLP:conf/wmt/BasuPN18}。比如,最近的一些工作中大量使用了神经机器翻译模型来获得双语句子的表示结果,并用于质量评估\upcite{DBLP:conf/wmt/Qi19,DBLP:conf/wmt/ZhouZH19,DBLP:conf/wmt/Hokamp17,wang2019niutrans}。这样做的好处在于,质量评估可以直接复用机器翻译的模型,从某种意义上降低了质量评估系统开发的代价。此外,随着近几年各种预训练模型的出现,使用预训练模型来获取用于质量评估的句子表示也成为一大流行趋势,这种方法大大减少了质量评估模型自身的训练时间,在该领域内的表现也十分亮眼\upcite{kepler2019unbabel,DBLP:conf/wmt/YankovskayaTF19,DBLP:conf/wmt/KimLKN19}。关于表示学习、神经机器翻译、预训练模型的内容在第九章和第十章会有进一步介绍。
\parinterval 在得到句子表示之后,可以使用质量评估模块对译文质量进行预测。质量评估模型通常由回归算法或分类算法实现:
......
%%%------------------------------------------------------------------------------------------------------------
\begin{tikzpicture}
\tikzstyle{every node}=[scale=1.2]
\begin{scope}
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.2)--(0,1.4)node[right,font=\scriptsize]{$y$};
\foreach \x in {-1.0,-0.5,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize,,scale=0.8]at(\x,0.1){\x};}
\node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,0.1){0};
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize,,scale=0.8]at(0.1,\y){\y};}
\draw[color=red ,domain=-1.4:1.2, line width=1pt]plot(\x,{max(\x,0)});
\node[black,anchor=south,scale=0.8] at (0,1.6) {\small $y =\max (0, x)$};
\node [anchor=south east,inner sep=1pt,scale=0.8] (labeld) at (0.8,-2) {\small{(a) ReLU}};
\end{scope}
%%%------------------------------------------------------------------------------------------------------------
\begin{scope}[xshift=1.7in]
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.2)--(0,1.4)node[right,font=\scriptsize]{$y$};
\foreach \x in {-1.0,-0.5,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(\x,0.1){\x};}
\node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,0.1){0};
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,\y){};}
\node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,0.5){0.5};
\node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,1.1){1.0};
\draw[color=red ,domain=-1.4:1.4, line width=1pt]plot(\x,{exp(-1*((\x)^2))});
\node[black,anchor=south,scale=0.8] at (0,1.6) {\small $y ={\textrm e}^{-x^2}$};
\node [anchor=south east,inner sep=1pt,scale=0.8] (labele) at (0.8,-2) {\small{(b) Gaussian}};
\end{scope}
%%%------------------------------------------------------------------------------------------------------------
\begin{scope}[xshift=3.4in]
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.2)--(0,1.4)node[right,font=\scriptsize]{$y$};
\foreach \x in {-1.0,-0.5,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(\x,0.1){\x};}
\node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,0.1){0};
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,\y){\y};}
\draw[color=red ,domain=-1:1, line width=1pt]plot(\x,\x);
\node[black,anchor=south,scale=0.8] at (0,1.6) {\small $y =x$};
\node [anchor=south east,inner sep=1pt,scale=0.8] (labelf) at (0.8,-2) {\small{(c) Identity}};
\end{scope}
\end{tikzpicture}
%%%------------------------------------------------------------------------------------------------------------
\ No newline at end of file
%%%------------------------------------------------------------------------------------------------------------
\begin{tikzpicture}
\tikzstyle{every node}=[scale=1.2]
\begin{scope}
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.4)--(0,1.4)node[right,font=\scriptsize]{$y$};
\foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize]at(\x,0){\x};}
\foreach \y in {1.0,0.5}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize]at(0,\y){\y};}
\draw[color=red ,domain=-1.4:1, line width=1pt]plot(\x,{ln(1+(exp(\x))});
\node[black,anchor=south] at (0,1.6) {\small $y = \ln(1+{\textrm e}^x)$};
\node [anchor=south east,inner sep=1pt] (labela) at (0.8,-2) {\small{(a) Softplus}};
\draw[->, line width=1pt](0,-1.2)--(0,1.4)node[right,font=\scriptsize]{$y$};
\foreach \x in {-1.0,-0.5,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(\x,0.1){\x};}
\node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,0.1){0};
\foreach \y in {1.0,0.5}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,\y){};}
\node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,0.4){0.5};
\node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,1.0){1.0};
\draw[color=red ,domain=-1.4:1.0, line width=1pt]plot(\x,{ln(1+(exp(\x))});
\node[black,anchor=south,scale=0.8] at (0,1.5) {\small $y = \ln(1+{\textrm e}^x)$};
\node [anchor=south east,inner sep=1pt,scale=0.8] (labela) at (0.8,-2) {\small{(a) Softplus}};
\end{scope}
%%%------------------------------------------------------------------------------------------------------------
\begin{scope}[xshift=1.7in]
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.4)--(0,1.4)node[right,font=\scriptsize]{$y$};
\draw[->, line width=1pt](0,-1.2)--(0,1.4)node[right,font=\scriptsize,scale=0.8]{$y$};
\draw[dashed](0,1)--(1.4,1);
\foreach \x in {-1,-0.5,0,0.5,1}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize]at(\x,0){
\foreach \x in {-1,-0.5,0.5,1}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(\x,0.1){
\pgfmathparse{(\x)*5}
\pgfmathresult};}
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize]at(-0.15,\y){\y};}
\node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,0.1){0};
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,\y){\y};}
\draw[color=red,domain=-1.4:1.4, line width=1pt]plot(\x,{1/(1+(exp(-5*\x)))});
\node[black,anchor=south] at (0,1.6) {\small $y = \frac{1}{1+{\textrm e}^{-x}}$};
\node [anchor=south east,inner sep=1pt] (labelb) at (0.8,-2) {\small{(b) Sigmoid}};
\node[black,anchor=south,scale=0.8] at (0,1.5) {\small $y = \frac{1}{1+{\textrm {e}}^{-x}}$};
\node [anchor=south east,inner sep=1pt,scale=0.8] (labelb) at (0.8,-2) {\small{(b) Sigmoid}};
\end{scope}
%%%------------------------------------------------------------------------------------------------------------
\begin{scope}[xshift=3.4in]
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.4)--(0,1.4)node[right,font=\scriptsize]{$y$};
\draw[->, line width=1pt](0,-1.4)--(0,1.2)node[right,font=\scriptsize]{$y$};
\draw[dashed](0,1)--(1.4,1);
\draw[dashed](-1.4,-1)--(0,-1);
\foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize]at(\x,0){\x};}
\foreach \y in {,-1.0-0.5,0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize]at(0,\y){\y};}
\foreach \x in {-1.0,-0.5,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(\x,0.1){\x};}
\node[below,outer sep=2pt,font=\scriptsize,scale=0.8]at(0.1,0.1){0};
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize,scale=0.8]at(0,\y){\y};}
\draw[color=red ,domain=-1.4:1.4, line width=1pt]plot(\x,{tanh(\x)});
\node[black,anchor=south] at (0,1.6) {\small $y = \frac{{\textrm e}^{x}-{\textrm e}^{-x}}{{e}^{x}+e^{-x}}$};
\node [anchor=south east,inner sep=1pt] (labelc) at (0.8,-2) {\small{(c) Tanh}};
\end{scope}
%%%------------------------------------------------------------------------------------------------------------
\begin{scope}[yshift=-1.8in]
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.4)--(0,1.4)node[right,font=\scriptsize]{$y$};
\foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize]at(\x,0){\x};}
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize]at(0,\y){\y};}
\draw[color=red ,domain=-1.4:1.4, line width=1pt]plot(\x,{max(\x,0)});
\node[black,anchor=south] at (0,1.6) {\small $y =\max (0, x)$};
\node [anchor=south east,inner sep=1pt] (labeld) at (0.8,-2) {\small{(d) ReLU}};
\end{scope}
%%%------------------------------------------------------------------------------------------------------------
\begin{scope}[yshift=-1.8in,xshift=1.7in]
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.4)--(0,1.4)node[right,font=\scriptsize]{$y$};
\foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize]at(\x,0){\x};}
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize]at(-0.15,\y){\y};}
\draw[color=red ,domain=-1.4:1.4, line width=1pt]plot(\x,{exp(-1*((\x)^2))});
\node[black,anchor=south] at (0,1.6) {\small $y =e^{-x^2}$};
\node [anchor=south east,inner sep=1pt] (labele) at (0.8,-2) {\small{(e) Gaussian}};
\end{scope}
%%%------------------------------------------------------------------------------------------------------------
\begin{scope}[yshift=-1.8in,xshift=3.4in]
\draw[->, line width=1pt](-1.4,0)--(1.4,0)node[left,below,font=\scriptsize]{$x$};
\draw[->, line width=1pt](0,-1.4)--(0,1.4)node[right,font=\scriptsize]{$y$};
\foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\scriptsize]at(\x,0){\x};}
\foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\scriptsize]at(0,\y){\y};}
\draw[color=red ,domain=-1:1, line width=1pt]plot(\x,\x);
\node[black,anchor=south] at (0,1.6) {\small $y =x$};
\node [anchor=south east,inner sep=1pt] (labelf) at (0.8,-2) {\small{(f) Identity}};
\node[black,anchor=south,scale=0.8] at (0,1.5) {\small $y = \frac{{\textrm e}^{x}-{\textrm e}^{-x}}{\textrm{e}^{x}+{\textrm e}^{-x}}$};
\node [anchor=south east,inner sep=1pt,scale=0.8] (labelc) at (0.8,-2) {\small{(c) Tanh}};
\end{scope}
\end{tikzpicture}
%%%------------------------------------------------------------------------------------------------------------
\ No newline at end of file
......@@ -410,10 +410,7 @@ f(c{\mathbi{v}})&=&cf({\mathbi{v}})
\parinterval 利用矩阵$ {\mathbi{A}}\in {\mathbb R}^{m\times n} $,可以实现两个有限维欧氏空间的映射函数$f:{\mathbb R}^n\rightarrow {\mathbb R}^m$。例如$ n $维列向量$ {\mathbi{x}} ^{\textrm T}$$ m\times n $的矩阵$ {\mathbi{A}} $,向量$ {\mathbi{x}} ^{\textrm T}$左乘矩阵$ {\mathbi{A}} $,可将向量$ {\mathbi{x}} ^{\textrm T}$映射为$ m $列向量。公式\eqref{eq:9-11}\eqref{eq:9-12}\eqref{eq:9-13}展示了一个具体的例子,
\begin{eqnarray}
{\mathbi{x}}^{\textrm{T}} & = & {\begin{pmatrix} x_1\\ x_2 \\ \dots \\ x_n \end{pmatrix}}
\label{eq:9-11}
\end{eqnarray}
\begin{eqnarray}
\label{eq:9-11}\\ \\
{\mathbi{A}}&=&
\begin{pmatrix}
a_{11} & a_{12} & \dots & a_{1n}\\
......@@ -673,7 +670,6 @@ x_1\cdot w_1+x_2\cdot w_2+x_3\cdot w_3 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\parinterval 举个简单的例子,预报天气时,往往需要预测温度、湿度和风力,这就意味着如果使用单层神经网络进行预测,需要设置3个神经元。如图\ref{fig:9-10}所示,此时权重矩阵如下:
\begin{eqnarray}
{\mathbi{W}}&=&\begin{pmatrix} w_{11} & w_{12} & w_{13}\\ w_{21} & w_{22} & w_{23}\end{pmatrix}
\label{eq:9-105}
......@@ -701,22 +697,20 @@ x_1\cdot w_1+x_2\cdot w_2+x_3\cdot w_3 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\parinterval 在神经网络中,对于输入向量$ {\mathbi{x}}\in {\mathbb R}^m $,一层神经网络首先将其经过线性变换映射到$ {\mathbb R}^n $,再经过激活函数变成${\mathbi{y}}\in {\mathbb R}^n $。还是上面天气预测的例子,每个神经元获得相同的输入,权重矩阵$ {\mathbi{W}} $是一个$ 2\times 3 $矩阵,矩阵中每个元素$ w_{ij} $代表第$ j $个神经元中$ x_{i} $对应的权重值,假设编号为1的神经元负责预测温度,则$ w_{i1} $的含义为预测温度时输入$ x_{i} $对其影响程度。此外所有神经元的偏置$ b_{1} $$ b_{2} $$ b_{3} $组成了最终的偏置向量$ {\mathbi{b}}$。在该例中则有,权重矩阵$ {\mathbi{W}}=\begin{pmatrix} w_{11} & w_{12} & w_{13}\\ w_{21} & w_{22} & w_{23}\end{pmatrix} $,偏置向量$ {\mathbi{b}}=(b_1,b_2,b_3) $
\parinterval 那么,线性变换的本质是什么?
\parinterval 那么,线性变换的本质是什么?图\ref{fig:9-13}正是线性变换的简单示意。
\begin{itemize}
\vspace{0.5em}
\item 从代数角度看,对于线性空间$ \textrm V $,任意$ {\mathbi{a}}$${\mathbi{a}}\in {\textrm V} $和数域中的任意$ \alpha $,线性变换$ T(\cdot) $需满足:$ T({\mathbi{a}}+{\mathbi{b}})=T({\mathbi{a}})+T({\mathbi{b}}) $,且$ T(\alpha {\mathbi{a}})=\alpha T({\mathbi{a}}) $
\vspace{0.5em}
\item 从几何角度看,公式中的${\mathbi{x}}\cdot {\mathbi{W}}+{\mathbi{b}}$${\mathbi{x}}$右乘${\mathbi{W}}$相当于对$ {\mathbi{x}} $进行旋转变换。例如,对三个点$ (0,0) $$ (0,1) $$ (1,0) $及其围成的矩形区域右乘如下矩阵:
\begin{eqnarray}
{\mathbi{W}}&=&\begin{pmatrix} 1 & 0 & 0\\ 0 & -1 & 0\\ 0 & 0 & 1\end{pmatrix}
\label{eq:9-106}
\end{eqnarray}
这样,矩形区域由第一象限旋转90度到了第四象限,如图\ref{fig:9-13}第一步所示。公式$ {\mathbi{x}}\cdot {\mathbi{W}}+{\mathbi{b}}$中的公式中的${\mathbi{b}}$相当于对其进行平移变换。其过程如图\ref{fig:9-13} 第二步所示,偏置矩阵$ {\mathbi{b}}=\begin{pmatrix} 0.5 & 0 & 0\\ 0 & 0 & 0\\ 0 & 0 & 0\end{pmatrix} $将矩形区域沿$x$轴向右平移了一段距离。
\vspace{0.5em}
\end{itemize}
%----------------------------------------------
\begin{figure}[htp]
......@@ -726,10 +720,12 @@ x_1\cdot w_1+x_2\cdot w_2+x_3\cdot w_3 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\label{fig:9-13}
\end{figure}
%-------------------------------------------
\vspace{0.5em}
\end{itemize}
\parinterval 也就是说,线性变换提供了对输入数据进行空间中旋转、平移的能力。当然,线性变换也适用于更加复杂的情况,这也为神经网络提供了拟合不同函数的能力。比如,可以利用线性变换将三维图形投影到二维平面上,或者将二维平面上的图形映射到三维空间。如图\ref{fig:9-14}所示,通过一个简单的线性变换,可以将三维图形投影到二维平面上。
\vspace{-0.5em}
\parinterval 线性变换提供了对输入数据进行空间中旋转、平移的能力。线性变换也适用于更加复杂的情况,这也为神经网络提供了拟合不同函数的能力。比如可以利用线性变换将三维图形投影到二维平面上,或者将二维平面上的图形映射到三维空间。如图\ref{fig:9-14}所示,通过一个简单的线性变换,可以将三维图形投影到二维平面上。
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -739,8 +735,7 @@ x_1\cdot w_1+x_2\cdot w_2+x_3\cdot w_3 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\end{figure}
%-------------------------------------------
\vspace{-0.5em}
\parinterval 那激活函数又是什么?一个神经元在接收到经过线性变换的结果后,通过激活函数的处理,得到最终的输出$ y $。激活函数的目的是解决实际问题中的非线性变换,线性变换只能拟合直线,而激活函数的加入,使神经网络具有了拟合曲线的能力。 特别是在实际问题中,很多现象都无法用简单的线性关系描述,这时可以使用非线性激活函数来描述更加复杂的问题。常见的非线性激活函数有Sigmoid、ReLU、Tanh等。图\ref{fig:9-15}中列举了几种激活函数的形式。
\parinterval 那激活函数又是什么?一个神经元在接收到经过线性变换的结果后,通过激活函数的处理,得到最终的输出$ y $。激活函数的目的是解决实际问题中的非线性变换,线性变换只能拟合直线,而激活函数的加入,使神经网络具有了拟合曲线的能力。 特别是在实际问题中,很多现象都无法用简单的线性关系描述,这时可以使用非线性激活函数来描述更加复杂的问题。常见的非线性激活函数有Sigmoid、ReLU、Tanh等。图\ref{fig:9-15}\ref{fig:9-15-2}中列举了几种激活函数的形式。
%----------------------------------------------
\begin{figure}[htp]
......@@ -750,6 +745,14 @@ x_1\cdot w_1+x_2\cdot w_2+x_3\cdot w_3 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\label{fig:9-15}
\end{figure}
%-------------------------------------------
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter9/Figures/figure-activate-2}
\caption{几种常见的激活函数(补)}
\label{fig:9-15-2}
\end{figure}
%-------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
......@@ -760,6 +763,8 @@ x_1\cdot w_1+x_2\cdot w_2+x_3\cdot w_3 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\parinterval 单层神经网络由线性变换和激活函数两部分构成,但在实际问题中,单层网络并不能很好地拟合复杂函数。因此很自然地想到将单层网络扩展到多层神经网络,即深层神经网络。将一层神经网络的最终输出向量作为另一层神经网络的输入向量,通过这种方式可以将多个单层神经网络连接在一起。
\parinterval 在多层神经网络中,通常包括输入层、输出层和至少一个隐藏层。图\ref{fig:9-17}展示了一个三层神经网络,包括输入层\footnote{由于输入层不存在神经元,因此在计算神经网络层数时不将其包括在内。}、输出层和两个隐藏层。
%----------------------------------------------
\begin{figure}[htp]
\centering
......@@ -769,8 +774,6 @@ x_1\cdot w_1+x_2\cdot w_2+x_3\cdot w_3 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\end{figure}
%-------------------------------------------
\parinterval 在多层神经网络中,通常包括输入层、输出层和至少一个隐藏层。图\ref{fig:9-17}展示了一个三层神经网络,包括输入层\footnote{由于输入层不存在神经元,因此在计算神经网络层数时不将其包括在内。}、输出层和两个隐藏层。\\
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
......@@ -959,7 +962,6 @@ x_1\cdot w_1+x_2\cdot w_2+x_3\cdot w_3 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\label{fig:9-27}
\end {figure}
%-------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
......@@ -1019,7 +1021,7 @@ f(x)&=&\begin{cases} 0 & x\le 0 \\x & x>0\end{cases}
\end{figure}
%-------------------------------------------
\parinterval 高阶张量的物理存储方式与多维数组在C++、Python中的物理存储方式相同。
\parinterval 实际上,高阶张量的物理存储方式也与多维数组在C++、Python中的物理存储方式相同。\\
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
......@@ -1107,9 +1109,8 @@ y&=&{\textrm{Sigmoid}}({\textrm{Tanh}}({\mathbi{x}}\cdot {\mathbi{W}}^{[1]}+{\ma
\sectionnewpage
\section{神经网络的参数训练}
\parinterval 简单来说,神经网络可以被看作是由变量和函数组成的表达式,例如:$ {\mathbi{y}}={\mathbi{x}}+{\mathbi{b}} $$ {\mathbi{y}}={\textrm{ReLU}}({\mathbi{x}}\cdot {\mathbi{W}}+{\mathbi{b}}) $$ {\mathbi{y}}={\textrm{Sigmoid}}({\textrm{ReLU}}({\mathbi{x}}\cdot {\mathbi{W}}^{[1]}+{\mathbi{b}}^{[1]})\cdot {\mathbi{W}}^{[2]}+{\mathbi{b}}^{[2]}) $等等,其中的$ {\mathbi{x}} $$ {\mathbi{y}} $作为输入和输出向量, ${\mathbi{W}}$${\mathbi{b}}$等其他变量作为{\small\sffamily\bfseries{模型参数}}\index{模型参数}(Model Parameters)\index{Model Parameters}。确定了函数表达式和模型参数,也就确定了神经网络模型。通常,表达式的形式需要系统开发者设计,而模型参数的数量有时会非常巨大,因此需要自动学习,这个过程也被称为模型学习或训练。为了实现这个目标,通常会准备一定量的带有标准答案的数据,称之为有标注数据。这些数据会用于对模型参数的学习,这也对应了统计模型中的参数估计过程。在机器学习中,一般把这种使用有标注数据进行统计模型参数训练的过程称为{\small\sffamily\bfseries{有指导的训练}}\index{有指导的训练}{\small\sffamily\bfseries{有监督的训练}}\index{有监督的训练}(Supervised Training)\index{Supervised Training}。在本章中,如果没有特殊说明,模型训练都是指有监督的训练。那么神经网络内部是怎样利用有标注数据对参数进行训练的呢?\\ \\
\parinterval 简单来说,神经网络可以被看作是由变量和函数组成的表达式,例如:$ {\mathbi{y}}={\mathbi{x}}+{\mathbi{b}} $$ {\mathbi{y}}={\textrm{ReLU}}({\mathbi{x}}\cdot {\mathbi{W}}+{\mathbi{b}}) $$ {\mathbi{y}}={\textrm{Sigmoid}}({\textrm{ReLU}}({\mathbi{x}}\cdot {\mathbi{W}}^{[1]}+{\mathbi{b}}^{[1]})\cdot {\mathbi{W}}^{[2]}+{\mathbi{b}}^{[2]}) $等等,其中的$ {\mathbi{x}} $$ {\mathbi{y}} $作为输入和输出向量, ${\mathbi{W}}$${\mathbi{b}}$等其他变量作为{\small\sffamily\bfseries{模型参数}}\index{模型参数}(Model Parameters)\index{Model Parameters}。确定了函数表达式和模型参数,也就确定了神经网络模型。通常,表达式的形式需要系统开发者设计,而模型参数的数量有时会非常巨大,因此需要自动学习,这个过程也被称为模型学习或训练。为了实现这个目标,通常会准备一定量的带有标准答案的数据,称之为有标注数据。这些数据会用于对模型参数的学习,这也对应了统计模型中的参数估计过程。在机器学习中,一般把这种使用有标注数据进行统计模型参数训练的过程称为{\small\sffamily\bfseries{有指导的训练}}\index{有指导的训练}{\small\sffamily\bfseries{有监督的训练}}\index{有监督的训练}(Supervised Training)\index{Supervised Training}。在本章中,如果没有特殊说明,模型训练都是指有监督的训练。那么神经网络内部是怎样利用有标注数据对参数进行训练的呢?
\vspace{-2.5em}
\parinterval 为了回答这个问题,可以把模型参数的学习过程看作是一个优化问题,即找到一组参数,使得模型达到某种最优的状态。这个问题又可以被转化为两个新的问题:
\begin{itemize}
......@@ -1214,7 +1215,6 @@ y&=&{\textrm{Sigmoid}}({\textrm{Tanh}}({\mathbi{x}}\cdot {\mathbi{W}}^{[1]}+{\ma
%
%----------------------------------------------------------------------------------------
\vspace{0.5em}
\noindent {\small\sffamily\bfseries{1)批量梯度下降\index{批量梯度下降}(Batch Gradient Descent)\index{Batch Gradient Descent}}}
\vspace{0.5em}
......@@ -1292,7 +1292,7 @@ J({\bm \theta})&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L({\mathbi{x}}^{[i]},{\mathbi{y}
\parinterval 截断误差和舍入误差是如何造成的呢?数值微分方法求梯度时,需用极限或无穷过程来求得。然而计算机需要将求解过程化为一系列有限的算术运算和逻辑运算。这样就要对某种无穷过程进行“截断”,即仅保留无穷过程的前段有限序列而舍弃它的后段。这就带来截断误差;舍入误差,是指运算得到的近似值和精确值之间的差异。由于数值微分方法计算复杂函数的梯度问题时,经过无数次的近似,每一次近似都产生了舍入误差,在这样的情况下,误差会随着运算次数增加而积累得很大,最终得出没有意义的运算结果。实际上,截断误差和舍入误差在训练复杂神经网络中,特别是使用低精度计算时,也会出现,因此是实际系统研发中需要注意的问题。
\parinterval 尽管数值微分不适用于大模型中的梯度求解,但是由于其非常简单,因此经常被用于检验其他梯度计算方法的正确性。比如在实现反向传播的时候(详见\ref{sec:9.4.6}节),可以检验求导是否正确(Gradient Check),这个过程就是利用数值微分实现的。\\ \\
\parinterval 尽管数值微分不适用于大模型中的梯度求解,但是由于其非常简单,因此经常被用于检验其他梯度计算方法的正确性。比如在实现反向传播的时候(详见\ref{sec:9.4.6}节),可以检验求导是否正确(Gradient Check),这个过程就是利用数值微分实现的。
%----------------------------------------------------------------------------------------
%
......@@ -1342,7 +1342,6 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
\parinterval 由于它只对基本函数或常数运用符号微分法则,所以它非常适合嵌入编程语言的循环条件等结构中,形成一种程序化的微分过程。在具体实现时,自动微分往往被当做是一种基于图的计算,相关的理论和技术方法相对成熟,因此是深度学习中使用最广泛的一种方法。不同于一般的编程模式,图计算先生成计算图,然后按照计算图执行计算过程。
\parinterval 自动微分可以用一种{\small\sffamily\bfseries{反向模式}}\index{反向模式}(Reverse Mode\index{Reverse Mode}/Backward Mode\index{Backward Mode})即反向传播思想进行描述\upcite{baydin2017automatic}。令${\mathbi{h}}_i$是神经网络的计算图中第$i$个节点的输出。反向模式的自动微分是要计算:
\begin{eqnarray}
\bar{{\mathbi{h}}_i} &=& \frac{\partial L}{\partial {\mathbi{h}}_i} \label{eq:reverse-mode-v}
\end{eqnarray}
......@@ -1501,6 +1500,15 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\parinterval 当神经网络较为复杂时,模型训练还是需要几天甚至几周的时间。如果希望尽可能缩短一次学习所需的时间,最直接的想法就是把不同的训练样本分配给多个GPU 或CPU,然后在这些设备上同时进行训练,即实现并行化训练。这种方法也被称作{\small\sffamily\bfseries{数据并行}}\index{数据并行}。具体实现时,有两种常用的并行化策略:(参数)同步更新和(参数)异步更新。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter9/Figures/figure-parallel}
\caption{同步更新与异步更新对比}
\label{fig:9-47}
\end {figure}
%-------------------------------------------
\begin{itemize}
\vspace{0.5em}
\item {\small\sffamily\bfseries{同步更新}}\index{同步更新}(Synchronous Update)\index{Synchronous Update}是指所有计算设备完成计算后,统一汇总并更新参数。当所有设备的反向传播算法完成之后同步更新参数,不会出现单个设备单独对参数进行更新的情况。这种方法效果稳定,但是效率比较低,在同步更新时,每一次参数更新都需要所有设备统一开始、统一结束,如果设备的运行速度不一致,那么每一次参数更新都需要等待最慢的设备结束才能开始。
......@@ -1513,16 +1521,6 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\parinterval 此外,在使用多个设备进行并行训练的时候,由于设备间带宽的限制,大量的数据传输会有较高的延时。对于复杂神经网络来说,设备间参数和梯度传递的时间消耗也会成为一个不得不考虑的因素。有时候,设备间数据传输的时间甚至比模型计算的时间都长,大大降低了并行度\upcite{xiao2017fast}。对于这种问题,可以考虑对数据进行压缩或者减少传输的次数来缓解问题。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter9/Figures/figure-parallel}
\caption{同步更新与异步更新对比}
\label{fig:9-47}
\end {figure}
%-------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
......@@ -1569,12 +1567,19 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\item {\small\bfnew{层标准化}}\index{层标准化}(Layer Normalization)\index{Layer Normalization}。类似的,层标准化更多是针对自然语言处理这种序列处理任务\upcite{Ba2016LayerN},它和批量标准化的原理是一样的,只是标准化操作是在序列上同一层网络的输出结果上进行的,也就是标准化操作沿着序列方向进行。这种方法可以很好的避免序列上不同位置神经网络输出结果的不可比性。同时由于标准化后所有的结果都转化到一个可比的范围,使得隐层状态可以在不同层之间进行自由组合。
\item {\small\bfnew{残差网络}}\index{残差网络}(Residual Networks)\index{Residual Networks}。最初,残差网络是为了解决神经网络持续加深时的模型退化问题\upcite{DBLP:journals/corr/HeZRS15},但是残差结构对解决梯度消失和梯度爆炸问题也有所帮助。有了残差结构,可以很轻松的构建几十甚至上百层的神经网络,而不用担心层数过深造成的梯度消失问题。残差网络的结构如图\ref{fig:9-51}所示。图\ref{fig:9-51}中右侧的曲线叫做{\small\bfnew{跳接}}\index{跳接}(Skip Connection)\index{Skip Connection},通过跳接在激活函数前,将上一层(或几层)之前的输出与本层计算的输出相加,将求和的结果输入到激活函数中作为本层的输出。假设残差结构的输入为$ {\mathbi{x}}_l $,输出为$ {\mathbi{x}}_{l+1} $,则有
\begin{eqnarray}
{\mathbi{x}}_{l+1}&=&F({\mathbi{x}}_l)+{\mathbi{x}}_l
\label{eq:9-44}
\end{eqnarray}
相比较于简单的多层堆叠的结构,残差网络提供了跨层连接结构。这种结构在反向传播中有很大的好处,比如,对于一个训练样本,损失函数为$L$$ \mathbi x_l $处的梯度的计算方式如公式\eqref{eq:9-45}所示。残差网络可以将后一层的梯度$ \frac{\partial L}{\partial {\mathbi{x}}_{l+1}} $不经过任何乘法项直接传递到$ \frac{\partial L}{\partial {\mathbi{x}}_l} $,从而缓解了梯度经过每一层后多次累乘造成的梯度消失问题。在{\chaptertwelve}中还会看到,在机器翻译中残差结构可以和层标准化一起使用,而且这种组合可以取得很好的效果。
\begin{eqnarray}
\frac{\partial L}{\partial {\mathbi{x}}_l}&=&\frac{\partial L}{\partial {\mathbi{x}}_{l+1}} \cdot \frac{\partial {\mathbi{x}}_{l+1}}{\partial {\mathbi{x}}_l}\nonumber\\
&=&\frac{\partial L}{\partial {\mathbi{x}}_{l+1}} \cdot \left(1+\frac{\partial F({\mathbi{x}}_l)}{\partial {\mathbi{x}}_l}\right)\nonumber\\
&=&\frac{\partial L}{\partial {\mathbi{x}}_{l+1}}+\frac{\partial L}{\partial {\mathbi{x}}_{l+1}} \cdot \frac{\partial F({\mathbi{x}}_l)}{\partial {\mathbi{x}}_l}
\label{eq:9-45}
\end{eqnarray}
%----------------------------------------------
\begin{figure}[htp]
......@@ -1584,17 +1589,6 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\label{fig:9-51}
\end{figure}
%-------------------------------------------
相比较于简单的多层堆叠的结构,残差网络提供了跨层连接结构。这种结构在反向传播中有很大的好处,比如,对于一个训练样本,损失函数为$L$$ \mathbi x_l $处的梯度可以进行如下计算:
\begin{eqnarray}
\frac{\partial L}{\partial {\mathbi{x}}_l}&=&\frac{\partial L}{\partial {\mathbi{x}}_{l+1}} \cdot \frac{\partial {\mathbi{x}}_{l+1}}{\partial {\mathbi{x}}_l}\nonumber\\
&=&\frac{\partial L}{\partial {\mathbi{x}}_{l+1}} \cdot \left(1+\frac{\partial F({\mathbi{x}}_l)}{\partial {\mathbi{x}}_l}\right)\nonumber\\
&=&\frac{\partial L}{\partial {\mathbi{x}}_{l+1}}+\frac{\partial L}{\partial {\mathbi{x}}_{l+1}} \cdot \frac{\partial F({\mathbi{x}}_l)}{\partial {\mathbi{x}}_l}
\label{eq:9-45}
\end{eqnarray}
由上式可知,残差网络可以将后一层的梯度$ \frac{\partial L}{\partial {\mathbi{x}}_{l+1}} $不经过任何乘法项直接传递到$ \frac{\partial L}{\partial {\mathbi{x}}_l} $,从而缓解了梯度经过每一层后多次累乘造成的梯度消失问题。在{\chaptertwelve}中还会看到,在机器翻译中残差结构可以和层标准化一起使用,而且这种组合可以取得很好的效果。
\end{itemize}
%----------------------------------------------------------------------------------------
......@@ -1884,7 +1878,8 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\subsubsection{1. 模型结构}
\parinterval 最具代表性的神经语言模型是{\small\sffamily\bfseries{前馈神经网络语言模型}}\index{前馈神经网络语言模型}(Feed-forward Neural Network Language Model,FNNLM\index{FNNLM})。这种语言模型的目标是用神经网络计算$ \funp{P}(w_m|w_{m-n+1}\dots w_{m-1}) $,之后将多个$n$-gram的概率相乘得到整个序列的概率\upcite{bengio2003a}
\parinterval 最具代表性的神经语言模型是{\small\sffamily\bfseries{前馈神经网络语言模型}}\index{前馈神经网络语言模型}(Feed-forward Neural Network Language Model,FNNLM\index{FNNLM})。这种语言模型的目标是用神经网络计算$ \funp{P}(w_m|w_{m-n+1}
\\\dots w_{m-1}) $,之后将多个$n$-gram的概率相乘得到整个序列的概率\upcite{bengio2003a}
%----------------------------------------------
\begin{figure}[htp]
......@@ -1930,7 +1925,6 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\subsubsection{3. 隐藏层和输出层}
\parinterval 把得到的$ {\mathbi{e}}_0 $$ {\mathbi{e}}_1 $$ {\mathbi{e}}_2 $三个向量级联在一起,经过两层网络,最后通过Softmax函数(橙色方框)得到输出,具体过程为:
\begin{eqnarray}
{\mathbi{y}}&=&{\textrm{Softmax}}({\mathbi{h}}_0{\mathbi{U}})\label{eq:9-61}\\
{\mathbi{h}}_0&=&{\textrm{Tanh}}([{\mathbi{e}}_{i-3},{\mathbi{e}}_{i-2},{\mathbi{e}}_{i-1}]{\mathbi{H}}+{\mathbi{d}})
......@@ -1940,7 +1934,6 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\noindent 这里,输出$ {\mathbi{y}}$是词表$V$上的一个分布,来表示$\funp{P}(w_i|w_{i-1},w_{i-2},w_{i-3}) $$ {\mathbi{U}}$${\mathbi{H}}$${\mathbi{d}}$是模型的参数。这样,对于给定的单词$w_i$可以用$y_i$得到其概率,其中$y_i$表示向量${\mathbi{y}}$的第$i$维。
\parinterval Softmax($\cdot$)的作用是根据输入的$|V|$维向量(即${\mathbi{h}}_0{\mathbi{U}}$),得到一个$|V|$维的分布。令${\bm \tau}$表示Softmax($\cdot$)的输入向量,Softmax函数可以被定义为:
\begin{eqnarray}
\textrm{Softmax}(\tau_i)&=&\frac{\textrm{exp}(\tau_i)} {\sum_{i'=1}^{|V|} \textrm{exp}(\tau_{i'})}
\label{eq:9-120}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论