Commit 8724dcde by 孟霞

合并分支 'caorunzhe' 到 'mengxia'

Caorunzhe

查看合并请求 !863
parents 7129dadb cb935bd2
...@@ -939,15 +939,15 @@ a (\mathbi{s},\mathbi{h}) &=& \left\{ \begin{array}{ll} ...@@ -939,15 +939,15 @@ a (\mathbi{s},\mathbi{h}) &=& \left\{ \begin{array}{ll}
\subsubsection{1. 损失函数} \subsubsection{1. 损失函数}
\parinterval 神经机器翻译在目标端的每个位置都会输出一个概率分布,表示这个位置上不同单词出现的可能性。设计损失函数时,需要知道当前位置输出的分布相比于标准答案的“差异”。对于这个问题,常用的是交叉熵损失函数。令$\mathbi{y}$表示机器翻译模型输出的分布,$\hat{\mathbi{y}}$ 表示标准答案,则交叉熵损失可以被定义为: \parinterval 神经机器翻译在目标端的每个位置都会输出一个概率分布,表示这个位置上不同单词出现的可能性。设计损失函数时,需要知道当前位置输出的分布相比于标准答案的“差异”。对于这个问题,常用的是交叉熵损失函数。令$\hat{\mathbi{y}}$ 表示机器翻译模型输出的分布, $\mathbi{y}$ 表示标准答案,则交叉熵损失可以被定义为:
\begin{eqnarray} \begin{eqnarray}
L_{\textrm{ce}}(\mathbi{y},\hat{\mathbi{y}}) &=& - \sum_{k=1}^{|V|} \mathbi{y}[k] \textrm{log} (\hat{\mathbi{y}}[k]) L_{\textrm{ce}}(\hat{\mathbi{y}},\mathbi{y}) &=& - \sum_{k=1}^{|V|} \hat{\mathbi{y}}[k] \textrm{log} (\mathbi{y}[k])
\label{eq:10-25} \label{eq:10-25}
\end{eqnarray} \end{eqnarray}
\noindent 其中$\mathbi{y}[k]$$\hat{\mathbi{y}}[k]$分别表示向量$\mathbi{y}$$\hat{\mathbi{y}}$的第$k$维,$|V|$表示输出向量的维度(等于词表大小)。假设有$n$个训练样本,模型输出的概率分布为$\mathbi{Y} = \{ \mathbi{y}_1,..., \mathbi{y}_n \}$,标准答案的分布$\widehat{\mathbi{Y}}=\{ \hat{\mathbi{y}}_1,...,\hat{\mathbi{y}}_n \}$。这个训练样本集合上的损失函数可以被定义为: \noindent 其中$\mathbi{y}[k]$$\hat{\mathbi{y}}[k]$分别表示向量$\mathbi{y}$$\hat{\mathbi{y}}$的第$k$维,$|V|$表示输出向量的维度(等于词表大小)。假设有$n$个训练样本,模型输出的概率分布为$\widehat{\mathbi{Y}}=\{ \hat{\mathbi{y}}_1,...,\hat{\mathbi{y}}_n \}$,标准答案的分布$\mathbi{Y} = \{ \mathbi{y}_1,..., \mathbi{y}_n \}$。这个训练样本集合上的损失函数可以被定义为:
\begin{eqnarray} \begin{eqnarray}
L(\mathbi{Y},\widehat{\mathbi{Y}}) &=& \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\hat{\mathbi{y}}_j) L(\widehat{\mathbi{Y}},\mathbi{Y}) &=& \sum_{j=1}^n L_{\textrm{ce}}(\hat{\mathbi{y}}_j,\mathbi{y}_j)
\label{eq:10-26} \label{eq:10-26}
\end{eqnarray} \end{eqnarray}
......
...@@ -131,8 +131,8 @@ ...@@ -131,8 +131,8 @@
\multicolumn{1}{l|}{GNMT+RL} & 24.6 & 39.92 & 1.4$\times 10^{20}$ \\ \multicolumn{1}{l|}{GNMT+RL} & 24.6 & 39.92 & 1.4$\times 10^{20}$ \\
\multicolumn{1}{l|}{ConvS2S} & 25.16 & 40.46 & 1.5$\times 10^{20}$ \\ \multicolumn{1}{l|}{ConvS2S} & 25.16 & 40.46 & 1.5$\times 10^{20}$ \\
\multicolumn{1}{l|}{MoE} & 26.03 & 40.56 & 1.2$\times 10^{20}$ \\ \multicolumn{1}{l|}{MoE} & 26.03 & 40.56 & 1.2$\times 10^{20}$ \\
\multicolumn{1}{l|}{Transformer(Base Model)} & 27.3 &38.1 & 3.3$\times 10^{18}$ \\ \multicolumn{1}{l|}{Transformer (Base Model) } & 27.3 &38.1 & 3.3$\times 10^{18}$ \\
\multicolumn{1}{l|}{Transformer (Big)} & {\small\sffamily\bfseries{28.4}} & {\small\sffamily\bfseries{41.8}} & 2.3$\times 10^{19}$ \\ \multicolumn{1}{l|}{Transformer (Big Model)} & {\small\sffamily\bfseries{28.4}} & {\small\sffamily\bfseries{41.8}} & 2.3$\times 10^{19}$ \\
\end{tabular} \end{tabular}
\end{table} \end{table}
%---------------------------------------------- %----------------------------------------------
......
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{tnode} = [rectangle,inner sep=0em,minimum width=8em,minimum height=6.6em,rounded corners=5pt,fill=green!20] \tikzstyle{tnode} = [rectangle,inner sep=0em,minimum width=8em,minimum height=6.6em,rounded corners=5pt,fill=green!5,drop shadow]
\tikzstyle{pnode} = [rectangle,inner sep=0em,minimum width=8em,minimum height=6.6em,rounded corners=5pt,fill=yellow!30] \tikzstyle{pnode} = [rectangle,inner sep=0em,minimum width=8em,minimum height=6.6em,rounded corners=5pt,fill=yellow!5,drop shadow]
\tikzstyle{mnode} = [rectangle,inner sep=0em,minimum width=8em,minimum height=6.6em,rounded corners=5pt,fill=red!20] \tikzstyle{mnode} = [rectangle,inner sep=0em,minimum width=8em,minimum height=6.6em,rounded corners=5pt,fill=red!5,drop shadow]
\tikzstyle{wnode} = [inner sep=0em,minimum height=1.5em] \tikzstyle{wnode} = [inner sep=0em,minimum height=1.5em]
%第一排 %第一排
......
...@@ -4,10 +4,10 @@ ...@@ -4,10 +4,10 @@
\tikzstyle{node}=[inner sep=0mm,minimum height=3em,minimum width=6em,rounded corners=5pt] \tikzstyle{node}=[inner sep=0mm,minimum height=3em,minimum width=6em,rounded corners=5pt]
\node[anchor=west,node,fill=ugreen!30] (n1) at (0,0) {训练集}; \node[anchor=west,node,fill=ugreen!15] (n1) at (0,0) {训练集};
\node[anchor=west,node,fill=yellow!30] (n2) at ([xshift=2em,yshift=0em]n1.east) {难度评估器}; \node[anchor=west,node,fill=yellow!15] (n2) at ([xshift=2em,yshift=0em]n1.east) {难度评估器};
\node[anchor=west,node,fill=red!30] (n3) at ([xshift=4em,yshift=0em]n2.east) {训练调度器}; \node[anchor=west,node,fill=red!15] (n3) at ([xshift=4em,yshift=0em]n2.east) {训练调度器};
\node[anchor=west,node,fill=blue!30] (n4) at ([xshift=4em,yshift=0em]n3.east) {模型训练器}; \node[anchor=west,node,fill=blue!15] (n4) at ([xshift=4em,yshift=0em]n3.east) {模型训练器};
\draw [->,very thick] ([xshift=0em,yshift=0em]n1.east) -- ([xshift=0em,yshift=0em]n2.west); \draw [->,very thick] ([xshift=0em,yshift=0em]n1.east) -- ([xshift=0em,yshift=0em]n2.west);
\draw [->,very thick] ([xshift=0em,yshift=0em]n2.east) -- ([xshift=0em,yshift=0em]n3.west); \draw [->,very thick] ([xshift=0em,yshift=0em]n2.east) -- ([xshift=0em,yshift=0em]n3.west);
...@@ -23,8 +23,8 @@ ...@@ -23,8 +23,8 @@
\draw [->,dotted,very thick] ([xshift=0em,yshift=0em]n4.north) -- ([xshift=0em,yshift=1em]n4.north) -- ([xshift=0em,yshift=1em]n3.north) -- (n3.north); \draw [->,dotted,very thick] ([xshift=0em,yshift=0em]n4.north) -- ([xshift=0em,yshift=1em]n4.north) -- ([xshift=0em,yshift=1em]n3.north) -- (n3.north);
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
\node[rectangle,inner sep=5pt,rounded corners=5pt,fill=gray!30] [fit = (n3) (n4) (n6) (n8) ] (g2) {}; \node[rectangle,inner sep=5pt,rounded corners=5pt,fill=gray!15] [fit = (n3) (n4) (n6) (n8) ] (g2) {};
\node[rectangle,inner sep=5pt,rounded corners=5pt,fill=orange!30] [fit = (n2) (n3) (n9) ] (g1) {}; \node[rectangle,inner sep=5pt,rounded corners=5pt,fill=orange!15] [fit = (n2) (n3) (n9) ] (g1) {};
\end{pgfonlayer} \end{pgfonlayer}
......
%------------------------------------------------------------ %------------------------------------------------------------
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{rnnnode} = [draw,inner sep=4pt,minimum width=2em,minimum height=2em,rounded corners=1pt,fill=green!20] \tikzstyle{rnnnode} = [draw,inner sep=4pt,minimum width=2em,minimum height=2em,rounded corners=1pt,fill=green!15]
\tikzstyle{snode} = [draw,inner sep=4pt,minimum width=2em,minimum height=2em,rounded corners=1pt,fill=red!20] \tikzstyle{snode} = [draw,inner sep=4pt,minimum width=2em,minimum height=2em,rounded corners=1pt,fill=red!15]
\tikzstyle{wode} = [inner sep=0pt,minimum width=2em,minimum height=2em,rounded corners=0pt] \tikzstyle{wode} = [inner sep=0pt,minimum width=2em,minimum height=2em,rounded corners=0pt]
\node [anchor=west,wode] (n1) at (0,0) {$y$}; \node [anchor=west,wode] (n1) at (0,0) {$y$};
\node [anchor=north west,wode] (n2) at ([xshift=3em,yshift=-2.5em]n1.south east) {$x$}; \node [anchor=north west,wode] (n2) at ([xshift=3em,yshift=-2.5em]n1.south east) {$x$};
\node [anchor=south west,rnnnode] (n3) at ([xshift=8em,yshift=0.5em]n2.north east) {生成模型$G$}; \node [anchor=south west,rnnnode] (n3) at ([xshift=8em,yshift=0.5em]n2.north east) {生成模型$G$};
\node [anchor=south east,wode] (n4) at ([xshift=-2em,yshift=0em]n3.north west) {$\tilde{y}$}; \node [anchor=south east,wode] (n4) at ([xshift=-2em,yshift=0em]n3.north west) {$\hat{y}$};
\node [anchor=south,snode] (n5) at ([xshift=0em,yshift=6em]n2.north) {判别网络$D$}; \node [anchor=south,snode] (n5) at ([xshift=0em,yshift=6em]n2.north) {判别网络$D$};
\node [anchor=west,align=left,font=\small] (n6) at ([xshift=15em,yshift=-3em]n5.east) {根据$(\seq{x},\seq{\tilde{y}})$\\成奖励信号}; \node [anchor=west,align=left,font=\small] (n6) at ([xshift=15em,yshift=-3em]n5.east) {根据$(\seq{x},\seq{\hat{y}})$\\成奖励信号};
\draw [->,thick] ([xshift=0em,yshift=-0.3em]n1.north)--([xshift=-0.3em,yshift=-0.1em]n5.south); \draw [->,thick] ([xshift=0em,yshift=-0.3em]n1.north)--([xshift=-0.3em,yshift=-0.1em]n5.south);
......
...@@ -38,20 +38,20 @@ ...@@ -38,20 +38,20 @@
\node [anchor=south,font=\scriptsize] (one_hot_w7) at (one_hot_label7.north) {$0$}; \node [anchor=south,font=\scriptsize] (one_hot_w7) at (one_hot_label7.north) {$0$};
%label smoothing %label smoothing
\node [anchor=west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label1) at ([xshift=1.5em,yshift=-4.4em]model.east) {}; \node [anchor=west,minimum width=1.2em,minimum height=0.2em,fill=orange!50,inner sep=0pt] (label1) at ([xshift=1.5em,yshift=-4.4em]model.east) {};
\node [anchor=south,font=\scriptsize] (w1) at (label1.north) {$0.1$}; \node [anchor=south,font=\scriptsize] (w1) at (label1.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label2) at (label1.south east) {}; \node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=orange!50,inner sep=0pt] (label2) at (label1.south east) {};
\node [anchor=south,font=\scriptsize] (w2) at (label2.north) {$0.1$}; \node [anchor=south,font=\scriptsize] (w2) at (label2.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.8em,fill=red!50,inner sep=0pt] (label3) at (label2.south east) {}; \node [anchor=south west,minimum width=1.2em,minimum height=0.8em,fill=orange!50,inner sep=0pt] (label3) at (label2.south east) {};
\node [anchor=south,font=\scriptsize] (w3) at (label3.north) {{\color{red} $0.4$}}; \node [anchor=south,font=\scriptsize] (w3) at (label3.north) {{\color{red} $0.4$}};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label4) at (label3.south east) {}; \node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=orange!50,inner sep=0pt] (label4) at (label3.south east) {};
\node [anchor=south,font=\scriptsize] (w5) at (label4.north) {$0.1$}; \node [anchor=south,font=\scriptsize] (w5) at (label4.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label5) at (label4.south east) {}; \node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=orange!50,inner sep=0pt] (label5) at (label4.south east) {};
\node [anchor=south,font=\scriptsize] (w6) at (label5.north) {$0.1$}; \node [anchor=south,font=\scriptsize] (w6) at (label5.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label6) at (label5.south east) {}; \node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=orange!50,inner sep=0pt] (label6) at (label5.south east) {};
\node [anchor=south,font=\scriptsize] (w7) at (label6.north) {$0.1$}; \node [anchor=south,font=\scriptsize] (w7) at (label6.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label7) at (label6.south east) {}; \node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=orange!50,inner sep=0pt] (label7) at (label6.south east) {};
\node [anchor=south,font=\scriptsize] (w8) at (label7.north) {$0.1$}; \node [anchor=south,font=\scriptsize] (w8) at (label7.north) {$0.1$};
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{rnnnode} = [draw,inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt,fill=red!20] \tikzstyle{rnnnode} = [draw,inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt,fill=red!15]
\tikzstyle{snode} = [draw,inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt,fill=blue!20] \tikzstyle{snode} = [draw,inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt,fill=blue!15]
\tikzstyle{ynode} = [inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt] \tikzstyle{ynode} = [inner sep=2pt,minimum width=4em,minimum height=2em,rounded corners=1pt]
...@@ -14,18 +14,18 @@ ...@@ -14,18 +14,18 @@
\node [anchor=west,rnnnode] (n3) at ([xshift=3em,yshift=0em]n2.east) {$\mathbi{h}_{j-1}$}; \node [anchor=west,rnnnode] (n3) at ([xshift=3em,yshift=0em]n2.east) {$\mathbi{h}_{j-1}$};
\node [anchor=west,rnnnode] (n4) at ([xshift=3em,yshift=0em]n3.east) {$\mathbi{h}_{j}$}; \node [anchor=west,rnnnode] (n4) at ([xshift=3em,yshift=0em]n3.east) {$\mathbi{h}_{j}$};
\node [anchor=south,snode] (n5) at ([xshift=0em,yshift=1em]n3.north) {Softmax}; \node [anchor=south,snode] (n5) at ([xshift=0em,yshift=1em]n3.north) {Softmax};
\node [anchor=south,ynode] (n6) at ([xshift=0em,yshift=1em]n5.north) {$\tilde{{y}}_{j-1}$}; \node [anchor=south,ynode] (n6) at ([xshift=0em,yshift=1em]n5.north) {$\hat{{y}}_{j-1}$};
\node [anchor=south,snode] (n7) at ([xshift=0em,yshift=1em]n4.north) {Softmax}; \node [anchor=south,snode] (n7) at ([xshift=0em,yshift=1em]n4.north) {Softmax};
\node [anchor=south,ynode] (n8) at ([xshift=0em,yshift=1em]n7.north) {$\tilde{{y}}_{j}$}; \node [anchor=south,ynode] (n8) at ([xshift=0em,yshift=1em]n7.north) {$\hat{{y}}_{j}$};
\node [anchor=south,snode,font=\footnotesize] (n13) at ([xshift=0em,yshift=1em]n1.north) {Softmax}; \node [anchor=south,snode,font=\footnotesize] (n13) at ([xshift=0em,yshift=1em]n1.north) {Softmax};
\node [anchor=south,ynode] (n14) at ([xshift=0em,yshift=1em]n13.north) {$\tilde{{y}}_{1}$}; \node [anchor=south,ynode] (n14) at ([xshift=0em,yshift=1em]n13.north) {$\hat{{y}}_{1}$};
\node [anchor=north] (x1) at ([xshift=0em,yshift=-1em]n1.south) {$\langle$sos$\rangle$}; \node [anchor=north] (x1) at ([xshift=0em,yshift=-1em]n1.south) {$\langle$sos$\rangle$};
\node [anchor=north,font=\small] (x2) at ([xshift=-1.3em,yshift=-2.3em]n3.south) {$\tilde{{y}}_{j-2}$}; \node [anchor=north,font=\small] (x2) at ([xshift=-1.3em,yshift=-2.3em]n3.south) {$\hat{{y}}_{j-2}$};
\node [anchor=north,font=\small] (x3) at ([xshift=1.3em,yshift=-2.5em]n3.south) {${y}_{j-2}$}; \node [anchor=north,font=\small] (x3) at ([xshift=1.3em,yshift=-2.5em]n3.south) {${y}_{j-2}$};
\node [anchor=north,font=\small] (x4) at ([xshift=1.3em,yshift=-2.5em]n4.south) {${y}_{j-1}$}; \node [anchor=north,font=\small] (x4) at ([xshift=1.3em,yshift=-2.5em]n4.south) {${y}_{j-1}$};
\node [anchor=north,font=\small] (x5) at ([xshift=-1.3em,yshift=-2.3em]n4.south) {$\tilde{{y}}_{j-1}$}; \node [anchor=north,font=\small] (x5) at ([xshift=-1.3em,yshift=-2.3em]n4.south) {$\hat{{y}}_{j-1}$};
\node [anchor=south,inner sep=2pt] (st1) at (n6.north) {\scriptsize{\textbf{[step $j-1$]}}}; \node [anchor=south,inner sep=2pt] (st1) at (n6.north) {\scriptsize{\textbf{[step $j-1$]}}};
\node [anchor=south,inner sep=2pt] (st2) at (n8.north) {\scriptsize{\textbf{[step $j$]}}}; \node [anchor=south,inner sep=2pt] (st2) at (n8.north) {\scriptsize{\textbf{[step $j$]}}};
......
\begin{tikzpicture} \begin{tikzpicture}
\node[anchor=west,inner sep=0mm,minimum height=4em,minimum width=5.5em,rounded corners=15pt,align=left,draw,fill=red!20] (n1) at (0,0) {Decoder\\Encoder}; \node[anchor=west,inner sep=0mm,minimum height=4em,minimum width=5.5em,rounded corners=15pt,align=left,draw,fill=red!15] (n1) at (0,0) {Decoder\\Encoder};
\node[anchor=west,inner sep=0mm,minimum height=4em,minimum width=5.5em,rounded corners=15pt,align=left,draw,fill=green!20] (n2) at ([xshift=10em,yshift=0em]n1.east) {Decoder\\Encoder}; \node[anchor=west,inner sep=0mm,minimum height=4em,minimum width=5.5em,rounded corners=15pt,align=left,draw,fill=green!15] (n2) at ([xshift=10em,yshift=0em]n1.east) {Decoder\\Encoder};
\node[anchor=south,inner sep=0mm,font=\small] (a1) at ([xshift=0em,yshift=1em]n1.north) {演员$p$}; \node[anchor=south,inner sep=0mm,font=\small] (a1) at ([xshift=0em,yshift=1em]n1.north) {演员$p$};
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -39,42 +39,49 @@ ...@@ -39,42 +39,49 @@
\end{scope} \end{scope}
%right %right
\begin{scope}[xshift=14em] \begin{scope}[xshift=13em]
\foreach \x/\d in {1/2em, 2/8em, 3/14em}
\node[unit,fill=yellow!20] at (0,\d) (ln_\x) {层正则化};
\foreach \x/\d in {1/6em, 2/12em, 3/22em}
\node[draw,circle,minimum size=1em,inner sep=1pt] at (0,\d) (add_\x) {\scriptsize\bfnew{+}};
\node[unit,fill=red!20] at (0,16em) (conv_4) {卷积$1 \times 1$:2048}; \foreach \x/\d in {1/2em, 2/8em, 3/16em}
\node[unit,fill=red!20] at (0,20em) (conv_5) {卷积$1 \times 1$:512}; \node[unit,fill=yellow!20] at (0,\d) (ln_\x) {层正则化};
\node[unit,fill=blue!20] at (0,18em) (relu_3) {RELU}; \foreach \x/\d in {1/6em, 2/14em, 3/20em}
\node[unit,fill=cyan!20] at (0,4em) (conv_3) {Sep卷积$9 \times 1$:256}; \node[draw,circle,minimum size=1em,inner sep=1pt] at (0,\d) (add_\x) {\scriptsize\bfnew{+}};
\node[unit,fill=green!20] at (0,10em) (sa_1) {8头自注意力:512};
\node[unit,fill=red!20] at (0,4em) (glu_1) {门控线性单元:512};
\node[unit,fill=red!20] at (-3em,10em) (conv_1) {卷积$1 \times 1$:2048};
\node[unit,fill=cyan!20] at (3em,10em) (conv_2) {卷积$3 \times 1$:256};
\node[unit,fill=blue!20] at (-3em,12em) (relu_1) {RELU};
\node[unit,fill=blue!20] at (3em,12em) (relu_2) {RELU};
\node[unit,fill=cyan!20] at (0em,18em) (conv_3) {Sep卷积$9 \times 1$:256};
\draw[->,thick] ([yshift=-1.4em]ln_1.-90) -- ([yshift=-0.1em]ln_1.-90); \draw[->,thick] ([yshift=-1.4em]ln_1.-90) -- ([yshift=-0.1em]ln_1.-90);
\draw[->,thick] ([yshift=0.1em]ln_1.90) -- ([yshift=-0.1em]conv_3.-90); \draw[->,thick] ([yshift=0.1em]ln_1.90) -- ([yshift=-0.1em]glu_1.-90);
\draw[->,thick] ([yshift=0.1em]conv_3.90) -- ([yshift=-0.1em]add_1.-90); \draw[->,thick] ([yshift=0.1em]glu_1.90) -- ([yshift=-0.1em]add_1.-90);
\draw[->,thick] ([yshift=0.1em]add_1.90) -- ([yshift=-0.1em]ln_2.-90); \draw[->,thick] ([yshift=0.1em]add_1.90) -- ([yshift=-0.1em]ln_2.-90);
\draw[->,thick] ([,yshift=0.1em]ln_2.90) -- ([yshift=-0.1em]sa_1.-90); \draw[->,thick] ([,yshift=0.1em]ln_2.135) -- ([yshift=-0.1em]conv_1.-90);
\draw[->,thick] ([yshift=0.1em]sa_1.90) -- ([yshift=-0.1em]add_2.-90); \draw[->,thick] ([yshift=0.1em]ln_2.45) -- ([yshift=-0.1em]conv_2.-90);
\draw[->,thick] ([yshift=0.1em]conv_1.90) -- ([yshift=-0.1em]relu_1.-90);
\draw[->,thick] ([yshift=0.1em]conv_2.90) -- ([yshift=-0.1em]relu_2.-90);
\draw[->,thick] ([yshift=0.1em]relu_1.90) -- ([yshift=-0.1em]add_2.-135);
\draw[->,thick] ([yshift=0.1em]relu_2.90) -- ([yshift=-0.1em]add_2.-45);
\draw[->,thick] ([yshift=0.1em]add_2.90) -- ([yshift=-0.1em]ln_3.-90); \draw[->,thick] ([yshift=0.1em]add_2.90) -- ([yshift=-0.1em]ln_3.-90);
\draw[->,thick] ([yshift=0.1em]ln_3.90) -- ([yshift=-0.1em]conv_4.-90); \draw[->,thick] ([yshift=0.1em]ln_3.90) -- ([yshift=-0.1em]conv_3.-90);
\draw[->,thick] ([yshift=0.1em]conv_4.90) -- ([yshift=-0.1em]relu_3.-90); \draw[->,thick] ([yshift=0.1em]conv_3.90) -- ([yshift=-0.1em]add_3.-90);
\draw[->,thick] ([yshift=0.1em]relu_3.90) -- ([yshift=-0.1em]conv_5.-90);
\draw[->,thick] ([yshift=0.1em]conv_5.90) -- ([yshift=-0.1em]add_3.-90);
\draw[->,thick] ([yshift=0.1em]add_3.90) -- ([yshift=1em]add_3.90); \draw[->,thick] ([yshift=0.1em]add_3.90) -- ([yshift=1em]add_3.90);
\draw[->,thick] ([yshift=-0.8em]ln_1.-90) .. controls ([xshift=5em,yshift=-0.8em]ln_1.-90) and ([xshift=5em]add_1.0) .. (add_1.0); \draw[->,thick] ([yshift=-0.8em]ln_1.-90) .. controls ([xshift=5em,yshift=-0.8em]ln_1.-90) and ([xshift=5em]add_1.0) .. (add_1.0);
\draw[->,thick] (add_1.0) .. controls ([xshift=5em]add_1.0) and ([xshift=5em]add_2.0) .. (add_2.0); \draw[->,thick] (add_1.0) .. controls ([xshift=8em]add_1.0) and ([xshift=8em]add_3.0) .. (add_3.0);
\draw[->,thick] (add_2.0) .. controls ([xshift=5em]add_2.0) and ([xshift=5em]add_3.0) .. (add_3.0);
\node[font=\scriptsize,align=center] at (0em, -1.5em){(b) 使用结构搜索方法优化后的 \\ Transformer编码器中若干块的结构}; \node[font=\scriptsize,align=center] at (0em, -1.5em){(b) 使用结构搜索方法优化后的 \\ Transformer编码器中若干块的结构};
\node[minimum size=0.8em,inner sep=0pt,rounded corners=1pt,draw,fill=blue!20] (act) at (5.5em, 20em){}; \node[minimum size=0.8em,inner sep=0pt,rounded corners=1pt,draw,fill=blue!20] (act) at (8em, 20em){};
\node[anchor=west,font=\footnotesize] at ([xshift=0.1em]act.east){激活函数}; \node[anchor=west,font=\footnotesize] at ([xshift=0.1em]act.east){激活函数};
\node[anchor=north,minimum size=0.8em,inner sep=0pt,rounded corners=1pt,draw,fill=yellow!20] (nor) at ([yshift=-0.6em]act.south){}; \node[anchor=north,minimum size=0.8em,inner sep=0pt,rounded corners=1pt,draw,fill=yellow!20] (nor) at ([yshift=-0.6em]act.south){};
\node[anchor=west,font=\footnotesize] at ([xshift=0.1em]nor.east){层正则化}; \node[anchor=west,font=\footnotesize] at ([xshift=0.1em]nor.east){层正则化};
......
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
\tikzstyle{vlnode}=[rectangle,inner sep=0mm,minimum height=1em,minimum width=5em,rounded corners=2pt,draw] \tikzstyle{vlnode}=[rectangle,inner sep=0mm,minimum height=1em,minimum width=5em,rounded corners=2pt,draw]
\node [anchor=west,lnode] (n1) at (0, 0) {$\mathbi{g}^3$}; \node [anchor=west,lnode] (n1) at (0, 0) {$\mathbi{h}_3$};
\node [anchor=north west,lnode] (n2) at ([xshift=0em,yshift=-0.5em]n1.south west) {$\mathbi{g}^2$}; \node [anchor=north west,lnode] (n2) at ([xshift=0em,yshift=-0.5em]n1.south west) {$\mathbi{h}_2$};
\node [anchor=north west,lnode] (n3) at ([xshift=0em,yshift=-0.5em]n2.south west) {$\mathbi{g}^1$}; \node [anchor=north west,lnode] (n3) at ([xshift=0em,yshift=-0.5em]n2.south west) {$\mathbi{h}_1$};
\node [anchor=south] (d1) at ([xshift=0em,yshift=0.2em]n1.north) {1D}; \node [anchor=south] (d1) at ([xshift=0em,yshift=0.2em]n1.north) {1D};
...@@ -97,14 +97,14 @@ ...@@ -97,14 +97,14 @@
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n3.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d$} ([xshift=0em]n3.south east); \draw [decorate,decoration={brace,mirror}] ([xshift=0em]n3.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d$} ([xshift=0em]n3.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n6.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d_a$} ([xshift=0em]n6.south east); \draw [decorate,decoration={brace,mirror}] ([xshift=0em]n6.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d_{\rm a}$} ([xshift=0em]n6.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n7.north west) to node [midway,font=\small,align=center,xshift=-0.7em,yshift=-0em] {$d$} ([xshift=0em]n7.south west); \draw [decorate,decoration={brace,mirror}] ([xshift=0em]n7.north west) to node [midway,font=\small,align=center,xshift=-0.7em,yshift=-0em] {$d$} ([xshift=0em]n7.south west);
\draw [decorate,decoration={brace}] ([xshift=0em]n7.north west) to node [midway,font=\small,align=center,xshift=0em,yshift=0.7em] {$d$} ([xshift=0em]n7.north east); \draw [decorate,decoration={brace}] ([xshift=0em]n7.north west) to node [midway,font=\small,align=center,xshift=0em,yshift=0.7em] {$d$} ([xshift=0em]n7.north east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n8.north west) to node [midway,font=\small,align=center,xshift=-0.8em,yshift=-0em] {$d_a$} ([xshift=0em]n8.south west); \draw [decorate,decoration={brace,mirror}] ([xshift=0em]n8.north west) to node [midway,font=\small,align=center,xshift=-0.8em,yshift=-0em] {$d_{\rm a}$} ([xshift=0em]n8.south west);
\draw [decorate,decoration={brace}] ([xshift=0em]n8.north west) to node [midway,font=\small,align=center,xshift=0em,yshift=0.8em] {$n_{hop}$} ([xshift=0em]n8.north east); \draw [decorate,decoration={brace}] ([xshift=0em]n8.north west) to node [midway,font=\small,align=center,xshift=0em,yshift=0.8em] {$n_{\rm hop}$} ([xshift=0em]n8.north east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]nc31.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$n_{hop}$} ([xshift=0em]nc35.south east); \draw [decorate,decoration={brace,mirror}] ([xshift=0em]nc31.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$n_{\rm hop}$} ([xshift=0em]nc35.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]ln5.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d$} ([xshift=0em]ln5.south east); \draw [decorate,decoration={brace,mirror}] ([xshift=0em]ln5.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d$} ([xshift=0em]ln5.south east);
\draw [decorate] ([xshift=0em]ln5.south east) to node [midway,font=\footnotesize,align=center,xshift=1em,yshift=-0.5em] {$n_{hop}$} ([xshift=0em]ln1.south east); \draw [decorate] ([xshift=0em]ln5.south east) to node [midway,font=\footnotesize,align=center,xshift=1em,yshift=-0.5em] {$n_{\rm hop}$} ([xshift=0em]ln1.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]fn.south east) to node [midway,font=\small,align=center,xshift=0.7em,yshift=-0em] {$d$} ([xshift=0em]fn.north east); \draw [decorate,decoration={brace,mirror}] ([xshift=0em]fn.south east) to node [midway,font=\small,align=center,xshift=0.7em,yshift=-0em] {$d$} ([xshift=0em]fn.north east);
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
\node [anchor=west,encnode,draw=red!60!black!80,fill=red!20] (n7) at ([xshift=1em,yshift=0em]n6.east) {$\mathbi{h}_{L-1}$}; \node [anchor=west,encnode,draw=red!60!black!80,fill=red!20] (n7) at ([xshift=1em,yshift=0em]n6.east) {$\mathbi{h}_{L-1}$};
\node [anchor=north,rectangle,draw=teal!80, inner sep=0mm,minimum height=2em,minimum width=8em,fill=teal!17,rounded corners=5pt,thick] (n8) at ([xshift=3em,yshift=-1.2em]n4.south) {权重聚合$\mathbi{g}$}; \node [anchor=north,rectangle,draw=teal!80, inner sep=0mm,minimum height=2em,minimum width=8em,fill=teal!17,rounded corners=5pt,thick] (n8) at ([xshift=3em,yshift=-1.5em]n4.south) {权重聚合$\mathbi{g}$};
...@@ -61,11 +61,11 @@ ...@@ -61,11 +61,11 @@
\draw [->,thick] ([xshift=0em,yshift=0em]n5.east) -- ([xshift=0em,yshift=0em]n6.west); \draw [->,thick] ([xshift=0em,yshift=0em]n5.east) -- ([xshift=0em,yshift=0em]n6.west);
\draw [->,thick] ([xshift=0em,yshift=0em]n6.east) -- ([xshift=0em,yshift=0em]n7.west); \draw [->,thick] ([xshift=0em,yshift=0em]n6.east) -- ([xshift=0em,yshift=0em]n7.west);
\draw [->,thick] ([xshift=0em,yshift=0em]n2.south) -- ([xshift=0em,yshift=0em]n8.north); \draw [->,thick] ([xshift=0em,yshift=0em]n2.south)..controls +(south:1.5em) and +(north:1.5em)..([xshift=0em,yshift=0.1em]n8.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n3.south) -- ([xshift=0em,yshift=0em]n8.north); \draw [->,thick] ([xshift=0em,yshift=0em]n3.south)..controls +(south:0.9em) and +(north:1.6em)..([xshift=0em,yshift=0.1em]n8.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n4.south) -- ([xshift=0em,yshift=0em]n8.north); \draw [->,thick] ([xshift=0em,yshift=0em]n4.south)..controls +(south:0.8em) and +(north:1.4em)..([xshift=0em,yshift=0.1em]n8.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n5.south) -- ([xshift=0em,yshift=0em]n8.north); \draw [->,thick] ([xshift=0em,yshift=0em]n5.south)..controls +(south:0.8em) and +(north:1.4em)..([xshift=0em,yshift=0.1em]n8.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n7.south) -- ([xshift=0em,yshift=0em]n8.north); \draw [->,thick] ([xshift=0em,yshift=0em]n7.south)..controls +(south:1.5em) and +(north:1.5em)..([xshift=0em,yshift=0.1em]n8.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n10.east) -- ([xshift=0em,yshift=0em]n11.west); \draw [->,thick] ([xshift=0em,yshift=0em]n10.east) -- ([xshift=0em,yshift=0em]n11.west);
\draw [->,thick] ([xshift=0em,yshift=0em]n11.east) -- ([xshift=0em,yshift=0em]n12.west); \draw [->,thick] ([xshift=0em,yshift=0em]n11.east) -- ([xshift=0em,yshift=0em]n12.west);
...@@ -74,10 +74,10 @@ ...@@ -74,10 +74,10 @@
\draw [->,thick] ([xshift=0em,yshift=0em]n14.east) -- ([xshift=0em,yshift=0em]n15.west); \draw [->,thick] ([xshift=0em,yshift=0em]n14.east) -- ([xshift=0em,yshift=0em]n15.west);
\draw [->,thick] ([xshift=0em,yshift=0em]n15.east) -- ([xshift=0em,yshift=0em]n16.west); \draw [->,thick] ([xshift=0em,yshift=0em]n15.east) -- ([xshift=0em,yshift=0em]n16.west);
\draw [->,thick] ([xshift=0em,yshift=0em]n8.south) -- ([xshift=0em,yshift=0em]n11.north); \draw [->,thick] ([xshift=0em,yshift=0em]n8.south)..controls +(south:1.2em) and +(north:1.5em)..([xshift=0em,yshift=0em]n11.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n8.south) -- ([xshift=0em,yshift=0em]n12.north); \draw [->,thick] ([xshift=0em,yshift=0em]n8.south)..controls +(south:1.3em) and +(north:1.2em)..([xshift=0em,yshift=0em]n12.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n8.south) -- ([xshift=0em,yshift=0em]n13.north); \draw [->,thick] ([xshift=0em,yshift=0em]n8.south)..controls +(south:1.3em) and +(north:1.2em)..([xshift=0em,yshift=0em]n13.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n8.south) -- ([xshift=0em,yshift=0em]n15.north); \draw [->,thick] ([xshift=0em,yshift=0em]n8.south)..controls +(south:1.5em) and +(north:1.5em)..([xshift=0em,yshift=0em]n15.north);
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
\begin{axis}[ \begin{axis}[
width=.50\textwidth, width=.50\textwidth,
height=.40\textwidth, height=.40\textwidth,
legend style={at={(0.60,0.08)}, anchor=south west}, legend style={at={(0.45,0.08)}, anchor=south west},
xlabel={\scriptsize{更新次数(10k)}}, xlabel={\scriptsize{更新次数(10k)}},
ylabel={\scriptsize{学习率 ($10^{-3}$}}, ylabel={\scriptsize{学习率 ($10^{-3}$}},
ylabel style={yshift=-1em},xlabel style={yshift=0.0em}, ylabel style={yshift=-1em},xlabel style={yshift=0.0em},
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
] ]
\addplot[red,line width=1.25pt] coordinates {(0,0) (1.6,2) (1.8,1.888) (2,1.787) (2.5,1.606) (3,1.462) (3.5,1.3549) (4,1.266) (4.5,1.193) (5,1.131)}; \addplot[red,line width=1.25pt] coordinates {(0,0) (1.6,2) (1.8,1.888) (2,1.787) (2.5,1.606) (3,1.462) (3.5,1.3549) (4,1.266) (4.5,1.193) (5,1.131)};
\addlegendentry{\scriptsize Base48} \addlegendentry{\scriptsize 原始学习率}
%\addplot[red,line width=1.25pt] coordinates {(0,0) (8000,0.002) (10000,0.00179) (12000,0.00163) (12950,0.001572)}; %\addplot[red,line width=1.25pt] coordinates {(0,0) (8000,0.002) (10000,0.00179) (12000,0.00163) (12950,0.001572)};
\addplot[blue,line width=1.25pt] coordinates {(0,0) (0.8,2) (0.9906,1.7983)}; \addplot[blue,line width=1.25pt] coordinates {(0,0) (0.8,2) (0.9906,1.7983)};
%\addplot[red,line width=1.25pt] coordinates {(0,0) (8000,0.002) (9906,0.0017983)}; %\addplot[red,line width=1.25pt] coordinates {(0,0) (8000,0.002) (9906,0.0017983)};
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
\addplot[blue,line width=1.25pt] coordinates {(2.9706,2) (3.1706,1.79) (3.3706,1.63) (3.4656,1.572) (3.6706,1.4602) (3.7136,1.44)}; \addplot[blue,line width=1.25pt] coordinates {(2.9706,2) (3.1706,1.79) (3.3706,1.63) (3.4656,1.572) (3.6706,1.4602) (3.7136,1.44)};
\addplot[blue,dashed,line width=1.25pt] coordinates {(3.7136,1.44) (3.7136,2)}; \addplot[blue,dashed,line width=1.25pt] coordinates {(3.7136,1.44) (3.7136,2)};
\addplot[blue,line width=1.25pt] coordinates {(3.7136,2) (3.9136,1.79) (4.1136,1.63) (4.2086,1.572) (4.4136,1.4602) (4.4566,1.44) (4.7000,1.3574) (5.0000,1.2531)}; \addplot[blue,line width=1.25pt] coordinates {(3.7136,2) (3.9136,1.79) (4.1136,1.63) (4.2086,1.572) (4.4136,1.4602) (4.4566,1.44) (4.7000,1.3574) (5.0000,1.2531)};
\addlegendentry{\scriptsize SDT48} \addlegendentry{\scriptsize 调整后的学习率}
\end{axis} \end{axis}
} }
......
...@@ -3,28 +3,28 @@ ...@@ -3,28 +3,28 @@
\begin{center} \begin{center}
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{manode}=[rectangle,inner sep=0mm,minimum height=4em,minimum width=4em,rounded corners=5pt,thick,draw,fill=blue!20] \tikzstyle{manode}=[rectangle,inner sep=0mm,minimum height=4em,minimum width=4em,rounded corners=5pt,thick,draw,fill=blue!15]
\tikzstyle{ffnnode}=[rectangle,inner sep=0mm,minimum height=1.8em,minimum width=6em,rounded corners=5pt,thick,fill=red!20,draw] \tikzstyle{ffnnode}=[rectangle,inner sep=0mm,minimum height=1.8em,minimum width=6em,rounded corners=5pt,thick,fill=red!15,draw]
\tikzstyle{ebnode}=[rectangle,inner sep=0mm,minimum height=1.8em,minimum width=10em,rounded corners=5pt,thick,fill=green!20,draw] \tikzstyle{ebnode}=[rectangle,inner sep=0mm,minimum height=1.8em,minimum width=10em,rounded corners=5pt,thick,fill=ugreen!15,draw]
\begin{scope}[] \begin{scope}[]
\node [anchor=west,ffnnode] (f1) at (0, 0){FFN}; \node [anchor=west,inner sep=0mm,minimum height=1.8em] (f1) at (0, 0){input};
\node [anchor=south,ebnode] (e1) at ([xshift=0em,yshift=1em]f1.north){Embedding}; \node [anchor=south,ebnode] (e1) at ([xshift=0em,yshift=1em]f1.north){Embedding};
\node [anchor=south west,manode] (a1) at ([xshift=0em,yshift=1em]e1.north west){Attention}; \node [anchor=south west,manode] (a1) at ([xshift=0em,yshift=1em]e1.north west){Attention};
\node [anchor=south east,manode] (c1) at ([xshift=0em,yshift=1em]e1.north east){Conv}; \node [anchor=south east,manode] (c1) at ([xshift=0em,yshift=1em]e1.north east){Conv};
\node [anchor=south west,ebnode] (e2) at ([xshift=0em,yshift=1em]a1.north west){Embedding}; \node [anchor=south west,ebnode] (e2) at ([xshift=0em,yshift=1em]a1.north west){Embedding};
\node [anchor=south,draw,circle,inner sep=4pt] (add1) at ([xshift=0em,yshift=0.5em]e2.north){}; \node [anchor=south,draw,circle,inner sep=4pt] (add1) at ([xshift=0em,yshift=0.5em]e2.north){};
\node [anchor=south,ffnnode] (f2) at ([xshift=0em,yshift=0.5em]add1.north){FFN}; \node [anchor=south,inner sep=0mm,minimum height=1.8em] (f2) at ([xshift=0em,yshift=0.5em]add1.north){output};
\draw[->,thick] ([xshift=0em,yshift=0em]f1.north)--([xshift=0em,yshift=0em]e1.south); \draw[->,thick] ([xshift=0em,yshift=-0.3em]f1.north)--([xshift=0em,yshift=0em]e1.south);
\draw[->,thick] ([xshift=0em,yshift=-1em]a1.south)--([xshift=0em,yshift=0em]a1.south); \draw[->,thick] ([xshift=0em,yshift=-1em]a1.south)--([xshift=0em,yshift=0em]a1.south);
\draw[->,thick] ([xshift=0em,yshift=-1em]c1.south)--([xshift=0em,yshift=0em]c1.south); \draw[->,thick] ([xshift=0em,yshift=-1em]c1.south)--([xshift=0em,yshift=0em]c1.south);
\draw[->,thick] ([xshift=0em,yshift=0em]a1.north)--([xshift=0em,yshift=1em]a1.north); \draw[->,thick] ([xshift=0em,yshift=0em]a1.north)--([xshift=0em,yshift=1em]a1.north);
\draw[->,thick] ([xshift=0em,yshift=0em]c1.north)--([xshift=0em,yshift=1em]c1.north); \draw[->,thick] ([xshift=0em,yshift=0em]c1.north)--([xshift=0em,yshift=1em]c1.north);
\draw[-,thick] ([xshift=0em,yshift=0em]e2.north)--([xshift=0em,yshift=0em]add1.south); \draw[-,thick] ([xshift=0em,yshift=0em]e2.north)--([xshift=0em,yshift=0em]add1.south);
\draw[->,thick] ([xshift=0em,yshift=0em]add1.north)--([xshift=0em,yshift=0em]f2.south); \draw[->,thick] ([xshift=0em,yshift=0em]add1.north)--([xshift=0em,yshift=0.3em]f2.south);
\draw[-] ([xshift=0em,yshift=0em]add1.west)--([xshift=-0em,yshift=0em]add1.east); \draw[-] ([xshift=0em,yshift=0em]add1.west)--([xshift=-0em,yshift=0em]add1.east);
\draw[-] ([xshift=0em,yshift=0em]add1.south)--([xshift=-0em,yshift=-0em]add1.north); \draw[-] ([xshift=0em,yshift=0em]add1.south)--([xshift=-0em,yshift=-0em]add1.north);
......
\begin{tikzpicture} \begin{tikzpicture}
\begin{scope} \begin{scope}
\tikzstyle{cirnode}=[circle,minimum size=3.7em,draw] \tikzstyle{cirnode}=[circle,minimum size=3em,font=\footnotesize,draw]
\tikzstyle{recnode}=[rectangle,rounded corners=2pt,inner sep=0mm,minimum height=1.5em,minimum width=4em,draw] \tikzstyle{recnode}=[rectangle,rounded corners=2pt,inner sep=0mm,minimum height=1.8em,minimum width=6em]
\node [anchor=west,cirnode] (n1) at (0, 0) {$\mathbi{h}_{i-2}^l$}; \node [anchor=west,cirnode] (n1) at (0, 0) {$\mathbi{h}_{i-2}^l$};
\node [anchor=west,cirnode] (n2) at ([xshift=1em,yshift=0em]n1.east) {$\mathbi{h}_{i-1}^l$}; \node [anchor=west,cirnode] (n2) at ([xshift=1.2em,yshift=0em]n1.east) {$\mathbi{h}_{i-1}^l$};
\node [anchor=west,cirnode] (n3) at ([xshift=1em,yshift=0em]n2.east) {$\mathbi{h}_{i}^l$}; \node [anchor=west,cirnode] (n3) at ([xshift=1.2em,yshift=0em]n2.east) {$\mathbi{h}_{i}^l$};
\node [anchor=west,cirnode] (n4) at ([xshift=1em,yshift=0em]n3.east) {$\mathbi{h}_{i+1}^l$}; \node [anchor=west,cirnode] (n4) at ([xshift=1.2em,yshift=0em]n3.east) {$\mathbi{h}_{i+1}^l$};
\node [anchor=west,cirnode] (n5) at ([xshift=1em,yshift=0em]n4.east) {$\mathbi{h}_{i+2}^l$}; \node [anchor=west,cirnode] (n5) at ([xshift=1.2em,yshift=0em]n4.east) {$\mathbi{h}_{i+2}^l$};
\node [anchor=center,blue!30,minimum height=4.2em,minimum width=4.5em,very thick,draw] (c1) at ([xshift=0em,yshift=0em]n3.center) {}; \begin{pgfonlayer}{background}
\node [anchor=center,ugreen!30,minimum height=4.9em,minimum width=14.5em,very thick,draw] (c2) at ([xshift=0em,yshift=0em]n3.center) {}; \node [anchor=center,red!30,minimum height=4.5em,minimum width=21em,very thick,draw] (c3) at ([xshift=0em,yshift=0em]n3.center) {};
\node [anchor=center,red!30,minimum height=5.6em,minimum width=24.5em,very thick,draw] (c3) at ([xshift=0em,yshift=0em]n3.center) {}; \node [anchor=center,ugreen!30,minimum height=4em,minimum width=12.5em,very thick,draw] (c2) at ([xshift=0em,yshift=0em]n3.center) {};
\node [anchor=center,orange!30,minimum height=3.5em,minimum width=3.6em,very thick,draw] (c1) at ([xshift=0em,yshift=0em]n3.center) {};
\end{pgfonlayer}
\node [anchor=south,recnode] (r1) at ([xshift=0em,yshift=2.5em]n2.north) {$\textrm{head}_1$}; \node [anchor=south,recnode,fill=red!20] (r1) at ([xshift=-3.5em,yshift=2.5em]n2.north) {$\textrm{head}_1$};
\node [anchor=south,recnode] (r2) at ([xshift=0em,yshift=2.5em]n3.north) {$\textrm{head}_2$}; \node [anchor=south,recnode,fill=orange!20] (r2) at ([xshift=0em,yshift=2.5em]n3.north) {$\textrm{head}_2$};
\node [anchor=south,recnode] (r3) at ([xshift=0em,yshift=2.5em]n4.north) {$\textrm{head}_3$}; \node [anchor=south,recnode,fill=ugreen!20] (r3) at ([xshift=3.5em,yshift=2.5em]n4.north) {$\textrm{head}_3$};
\node [anchor=south,cirnode] (n6) at ([xshift=0em,yshift=1em]r2.north) {$\mathbi{h}_{i}^{l+1}$}; \node [anchor=south,cirnode] (n6) at ([xshift=0em,yshift=1em]r2.north) {$\mathbi{h}_{i}^{l+1}$};
\draw [->,very thick,blue!30] ([xshift=0em,yshift=0em]c1.north) -- ([xshift=0em,yshift=0em]r2.south); \draw [->,very thick,orange!30] ([xshift=0em,yshift=0em]c1.north) -- ([xshift=0em,yshift=0em]r2.south);
\draw [->,very thick,ugreen!30] ([xshift=4.73em,yshift=0em]c2.north) -- ([xshift=0em,yshift=0em]r3.south); \draw [->,very thick,ugreen!30] ([xshift=3em,yshift=0em]c2.north)..controls +(north:1.5em) and +(south:1.5em)..([xshift=0em,yshift=0em]r3.south);
\draw [->,very thick,red!30] ([xshift=-4.73em,yshift=0em]c3.north) -- ([xshift=0em,yshift=0em]r1.south); \draw [->,very thick,red!30] ([xshift=-3em,yshift=0em]c3.north)..controls +(north:1.5em) and +(south:1.5em)..([xshift=0em,yshift=0em]r1.south);
\draw [->] ([xshift=0em,yshift=0em]r1.north) -- ([xshift=0em,yshift=0em]n6.south west); \draw [->] ([xshift=0em,yshift=0em]r1.north) -- ([xshift=0em,yshift=0em]n6.south west);
\draw [->] ([xshift=0em,yshift=0em]r2.north) -- ([xshift=0em,yshift=0em]n6.south); \draw [->] ([xshift=0em,yshift=0em]r2.north) -- ([xshift=0em,yshift=0em]n6.south);
......
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
\node [anchor=west,enode] (e1) at ([xshift=1.5em,yshift=0em]w1.east) {编码器}; \node [anchor=west,enode] (e1) at ([xshift=1.5em,yshift=0em]w1.east) {编码器};
\node [anchor=west,dnode] (d1) at ([xshift=3em,yshift=6em]e1.east) {翻译任务}; \node [anchor=west,dnode] (d1) at ([xshift=3em,yshift=6em]e1.east) {翻译任务};
\node [anchor=north,dnode] (d2) at ([xshift=0em,yshift=-2em]d1.south) {文本处理任务}; \node [anchor=north,dnode] (d2) at ([xshift=0em,yshift=-2em]d1.south) {句法分析任务};
\node [anchor=north,dnode] (d3) at ([xshift=0em,yshift=-2em]d2.south) {语言理解}; \node [anchor=north,dnode] (d3) at ([xshift=0em,yshift=-2em]d2.south) {语言理解任务};
\node [anchor=north,dnode] (d4) at ([xshift=0em,yshift=-2em]d3.south) {其他任务}; \node [anchor=north,dnode] (d4) at ([xshift=0em,yshift=-2em]d3.south) {其他任务};
\node [anchor=west] (w2) at ([xshift=2em,yshift=0em]d1.east) {英语(目标语言)}; \node [anchor=west] (w2) at ([xshift=2em,yshift=0em]d1.east) {英语(目标语言)};
......
...@@ -3,13 +3,13 @@ ...@@ -3,13 +3,13 @@
\begin{center} \begin{center}
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{wrnode}=[rectangle,inner sep=0mm,minimum height=1.8em,minimum width=3em,rounded corners=5pt,fill=blue!30] \tikzstyle{wrnode}=[rectangle,inner sep=0mm,minimum height=1.6em,minimum width=3em,rounded corners=5pt,fill=blue!30]
\tikzstyle{srnode}=[rectangle,inner sep=0mm,minimum height=1.8em,minimum width=3em,rounded corners=5pt,fill=orange!30] \tikzstyle{srnode}=[rectangle,inner sep=0mm,minimum height=1.6em,minimum width=3em,rounded corners=5pt,fill=orange!30]
\tikzstyle{dotnode}=[inner sep=0mm,minimum height=0.5em,minimum width=1.5em] \tikzstyle{dotnode}=[inner sep=0mm,minimum height=0.5em,minimum width=1.5em]
\tikzstyle{wnode}=[inner sep=0mm,minimum height=1.8em] \tikzstyle{wnode}=[inner sep=0mm,minimum height=1.6em]
{\small {\small
\begin{scope}[] \begin{scope}[scale=1]
\tikzstyle{every node}=[scale=1]
\node [anchor=west,wrnode] (wr1) at (0,0) {$\mathbi{h}_{w_1}$}; \node [anchor=west,wrnode] (wr1) at (0,0) {$\mathbi{h}_{w_1}$};
\node [anchor=west,wrnode] (wr2) at ([xshift=1em,yshift=0em]wr1.east) {$\mathbi{h}_{w_2}$}; \node [anchor=west,wrnode] (wr2) at ([xshift=1em,yshift=0em]wr1.east) {$\mathbi{h}_{w_2}$};
\node [anchor=west,wrnode] (wr3) at ([xshift=1em,yshift=0em]wr2.east) {$\mathbi{h}_{w_3}$}; \node [anchor=west,wrnode] (wr3) at ([xshift=1em,yshift=0em]wr2.east) {$\mathbi{h}_{w_3}$};
...@@ -22,27 +22,27 @@ ...@@ -22,27 +22,27 @@
\node [anchor=west,dotnode] (dot3) at ([xshift=0.8em,yshift=0em]sr3.east) {$\cdots$}; \node [anchor=west,dotnode] (dot3) at ([xshift=0.8em,yshift=0em]sr3.east) {$\cdots$};
\node [anchor=west,srnode] (sr4) at ([xshift=0.8em,yshift=0em]dot3.east) {$\mathbi{h}_{l_7}$}; \node [anchor=west,srnode] (sr4) at ([xshift=0.8em,yshift=0em]dot3.east) {$\mathbi{h}_{l_7}$};
\node [anchor=north,wnode,font=\footnotesize] (w1) at ([xshift=0em,yshift=-1em]wr1.south) {$w_1$\ :\ I}; \node [anchor=north,wnode,font=\footnotesize] (w1) at ([xshift=0em,yshift=-0.7em]wr1.south) {$w_1$\ :\ I};
\node [anchor=north,wnode,font=\footnotesize] (w2) at ([xshift=0em,yshift=-1em]wr2.south) {$w_2$\ :\ love}; \node [anchor=north,wnode,font=\footnotesize] (w2) at ([xshift=0em,yshift=-0.7em]wr2.south) {$w_2$\ :\ love};
\node [anchor=north,wnode,font=\footnotesize] (w3) at ([xshift=0em,yshift=-1em]wr3.south) {$w_3$\ :\ dogs}; \node [anchor=north,wnode,font=\footnotesize] (w3) at ([xshift=0em,yshift=-0.7em]wr3.south) {$w_3$\ :\ dogs};
\node [anchor=north,wnode,font=\footnotesize] (w4) at ([xshift=0em,yshift=-1em]sr1.south) {$l_1$\ :\ S}; \node [anchor=north,wnode,font=\footnotesize] (w4) at ([xshift=0em,yshift=-0.7em]sr1.south) {$l_1$\ :\ S};
\node [anchor=north,dotnode] (dot4) at ([xshift=0em,yshift=-2.4em]dot1.south) {$\cdots$}; \node [anchor=north,dotnode] (dot4) at ([xshift=0em,yshift=-2em]dot1.south) {$\cdots$};
\node [anchor=north,wnode,font=\footnotesize] (w5) at ([xshift=0em,yshift=-1em]sr2.south) {$l_3$\ :\ PRN}; \node [anchor=north,wnode,font=\footnotesize] (w5) at ([xshift=0em,yshift=-0.7em]sr2.south) {$l_3$\ :\ PRN};
\node [anchor=north,dotnode] (dot5) at ([xshift=0em,yshift=-2.2em]dot2.south) {$\cdots$}; \node [anchor=north,dotnode] (dot5) at ([xshift=0em,yshift=-2em]dot2.south) {$\cdots$};
\node [anchor=north,wnode,font=\footnotesize] (w6) at ([xshift=0em,yshift=-1em]sr3.south) {$l_5$\ :\ VBP}; \node [anchor=north,wnode,font=\footnotesize] (w6) at ([xshift=0em,yshift=-0.7em]sr3.south) {$l_5$\ :\ VBP};
\node [anchor=north,dotnode] (dot6) at ([xshift=0em,yshift=-2.3em]dot3.south) {$\cdots$}; \node [anchor=north,dotnode] (dot6) at ([xshift=0em,yshift=-2em]dot3.south) {$\cdots$};
\node [anchor=north,wnode,font=\footnotesize] (w7) at ([xshift=0em,yshift=-1em]sr4.south) {$l_7$\ :\ NNS}; \node [anchor=north,wnode,font=\footnotesize] (w7) at ([xshift=0em,yshift=-0.7em]sr4.south) {$l_7$\ :\ NNS};
\node [anchor=south,circle,draw,minimum size=1.2em] (c1) at ([xshift=2.5em,yshift=2em]wr2.north){}; \node [anchor=south,circle,draw,minimum size=1.2em] (c1) at ([xshift=2.5em,yshift=1.5em]wr2.north){};
\node [anchor=west,circle,draw,minimum size=1.2em] (c2) at ([xshift=8em,yshift=0em]c1.east){}; \node [anchor=west,circle,draw,minimum size=1.2em] (c2) at ([xshift=8em,yshift=0em]c1.east){};
\node [anchor=west,circle,draw,minimum size=1.2em] (c3) at ([xshift=8em,yshift=0em]c2.east){}; \node [anchor=west,circle,draw,minimum size=1.2em] (c3) at ([xshift=8em,yshift=0em]c2.east){};
\node [anchor=south,srnode] (m1) at ([xshift=0em,yshift=2em]c1.north) {$\mathbi{h}_{l_1}$}; \node [anchor=south,srnode] (m1) at ([xshift=0em,yshift=1em]c1.north) {$\mathbi{h}_{l_1}$};
\node [anchor=south,wrnode] (m2) at ([xshift=0em,yshift=0em]m1.north) {$\mathbi{h}_{w_1}$}; \node [anchor=south,wrnode] (m2) at ([xshift=0em,yshift=0em]m1.north) {$\mathbi{h}_{w_1}$};
\node [anchor=south,srnode] (m3) at ([xshift=0em,yshift=2em]c2.north) {$\mathbi{h}_{l_5}$}; \node [anchor=south,srnode] (m3) at ([xshift=0em,yshift=1em]c2.north) {$\mathbi{h}_{l_5}$};
\node [anchor=south,wrnode] (m4) at ([xshift=0em,yshift=0em]m3.north) {$\mathbi{h}_{w_2}$}; \node [anchor=south,wrnode] (m4) at ([xshift=0em,yshift=0em]m3.north) {$\mathbi{h}_{w_2}$};
\node [anchor=south,srnode] (m5) at ([xshift=0em,yshift=2em]c3.north) {$\mathbi{h}_{l_7}$}; \node [anchor=south,srnode] (m5) at ([xshift=0em,yshift=1em]c3.north) {$\mathbi{h}_{l_7}$};
\node [anchor=south,wrnode] (m6) at ([xshift=0em,yshift=0em]m5.north) {$\mathbi{h}_{w_3}$}; \node [anchor=south,wrnode] (m6) at ([xshift=0em,yshift=0em]m5.north) {$\mathbi{h}_{w_3}$};
...@@ -56,9 +56,9 @@ ...@@ -56,9 +56,9 @@
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.5em,draw=blue!80,dashed,very thick,rounded corners=10pt] [fit = (wr1) (wr3) (w1) (w3)] (box1) {}; \node [rectangle,inner sep=0.5em,draw=blue!80,dashed,very thick,rounded corners=10pt] [fit = (wr1) (wr3) (w1) (w3)] (box1) {};
\node [rectangle,inner sep=0.5em,draw=orange!80,dashed,very thick,rounded corners=10pt] [fit = (sr1) (sr4) (w4) (w7)] (box2) {}; \node [rectangle,inner sep=0.5em,draw=orange!80,dashed,very thick,rounded corners=10pt] [fit = (sr1) (sr4) (w4) (w7)] (box2) {};
\node [rectangle,minimum height=5em,inner sep=0.6em,fill=gray!20,draw=black,dashed,very thick,rounded corners=8pt] [fit = (m1) (m2)] (box3) {}; \node [rectangle,inner sep=0.5em,fill=gray!20,draw=black,dashed,very thick,rounded corners=8pt] [fit = (m1) (m2)] (box3) {};
\node [rectangle,minimum height=5em,inner sep=0.6em,fill=gray!20,draw=black,dashed,very thick,rounded corners=8pt] [fit = (m3) (m4)] (box4) {}; \node [rectangle,inner sep=0.5em,fill=gray!20,draw=black,dashed,very thick,rounded corners=8pt] [fit = (m3) (m4)] (box4) {};
\node [rectangle,minimum height=5em,inner sep=0.6em,fill=gray!20,draw=black,dashed,very thick,rounded corners=8pt] [fit = (m5) (m6)] (box5) {}; \node [rectangle,inner sep=0.5em,fill=gray!20,draw=black,dashed,very thick,rounded corners=8pt] [fit = (m5) (m6)] (box5) {};
\end{pgfonlayer} \end{pgfonlayer}
\node [anchor=south,wnode] (h1) at ([xshift=0em,yshift=0.1em]box3.north) {${\mathbi{h}'}_1$\ :\ }; \node [anchor=south,wnode] (h1) at ([xshift=0em,yshift=0.1em]box3.north) {${\mathbi{h}'}_1$\ :\ };
...@@ -73,9 +73,9 @@ ...@@ -73,9 +73,9 @@
\draw [->,thick] ([xshift=0em,yshift=0em]w6.north) -- ([xshift=0em,yshift=0em]sr3.south); \draw [->,thick] ([xshift=0em,yshift=0em]w6.north) -- ([xshift=0em,yshift=0em]sr3.south);
\draw [->,thick] ([xshift=0em,yshift=0em]w7.north) -- ([xshift=0em,yshift=0em]sr4.south); \draw [->,thick] ([xshift=0em,yshift=0em]w7.north) -- ([xshift=0em,yshift=0em]sr4.south);
\draw [->,thick] ([xshift=0em,yshift=0.7em]dot4.north) -- ([xshift=0em,yshift=-0.7em]dot1.south); \draw [->,thick] ([xshift=0em,yshift=0.6em]dot4.north) -- ([xshift=0em,yshift=-0.7em]dot1.south);
\draw [->,thick] ([xshift=0em,yshift=0.7em]dot5.north) -- ([xshift=0em,yshift=-0.7em]dot2.south); \draw [->,thick] ([xshift=0em,yshift=0.6em]dot5.north) -- ([xshift=0em,yshift=-0.7em]dot2.south);
\draw [->,thick] ([xshift=0em,yshift=0.7em]dot6.north) -- ([xshift=0em,yshift=-0.7em]dot3.south); \draw [->,thick] ([xshift=0em,yshift=0.6em]dot6.north) -- ([xshift=0em,yshift=-0.7em]dot3.south);
\draw [<->,thick] ([xshift=0em,yshift=0em]wr1.east) -- ([xshift=0em,yshift=0em]wr2.west); \draw [<->,thick] ([xshift=0em,yshift=0em]wr1.east) -- ([xshift=0em,yshift=0em]wr2.west);
\draw [<->,thick] ([xshift=0em,yshift=0em]wr2.east) -- ([xshift=0em,yshift=0em]wr3.west); \draw [<->,thick] ([xshift=0em,yshift=0em]wr2.east) -- ([xshift=0em,yshift=0em]wr3.west);
...@@ -96,13 +96,13 @@ ...@@ -96,13 +96,13 @@
\draw[->,thick] ([xshift=0em,yshift=-0em]wr3.north)..controls +(north:2em) and +(south:1em)..([xshift=-0em,yshift=-0em]c3.south west) ; \draw[->,thick] ([xshift=0em,yshift=-0em]wr3.north)..controls +(north:2em) and +(south:1em)..([xshift=-0em,yshift=-0em]c3.south west) ;
\draw[->,thick] ([xshift=0em,yshift=-0em]sr4.north)..controls +(north:2em) and +(east:0em)..([xshift=-0em,yshift=-0em]c3.east) ; \draw[->,thick] ([xshift=0em,yshift=-0em]sr4.north)..controls +(north:2em) and +(east:0em)..([xshift=-0em,yshift=-0em]c3.east) ;
\draw [->,thick] ([xshift=0em,yshift=0em]c1.north) -- ([xshift=0em,yshift=0em]box3.south); \draw [->,thick] ([xshift=0em,yshift=0em]c1.north) -- ([xshift=0em,yshift=0em]m1.south);
\draw [->,thick] ([xshift=0em,yshift=0em]c2.north) -- ([xshift=0em,yshift=0em]box4.south); \draw [->,thick] ([xshift=0em,yshift=0em]c2.north) -- ([xshift=0em,yshift=0em]m3.south);
\draw [->,thick] ([xshift=0em,yshift=0em]c3.north) -- ([xshift=0em,yshift=0em]box5.south); \draw [->,thick] ([xshift=0em,yshift=0em]c3.north) -- ([xshift=0em,yshift=0em]m5.south);
\node [anchor=north] (r1) at ([xshift=0em,yshift=-1em]w2.south) {词语RNN}; \node [anchor=north,font=\small] (r1) at ([xshift=0em,yshift=-1em]w2.south) {词语RNN};
\node [anchor=north] (r2) at ([xshift=3em,yshift=-1em]w5.south) {句法RNN}; \node [anchor=north,font=\small] (r2) at ([xshift=3em,yshift=-1em]w5.south) {句法RNN};
\node [anchor=north] (label1) at ([xshift=0em,yshift=-4em]dot4.south) {(a)平行结构}; \node [anchor=north,font=\small] (label1) at ([xshift=0em,yshift=-3em]dot4.south) {(a)平行结构};
\end{scope} \end{scope}
} }
......
...@@ -3,12 +3,12 @@ ...@@ -3,12 +3,12 @@
\begin{center} \begin{center}
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{hnode}=[rectangle,inner sep=0mm,minimum height=1.8em,minimum width=3em,rounded corners=5pt,fill=red!30] \tikzstyle{hnode}=[rectangle,inner sep=0mm,minimum height=1.6em,minimum width=3em,rounded corners=5pt,fill=red!30]
\tikzstyle{dotnode}=[inner sep=0mm,minimum height=0.5em,minimum width=1.5em] \tikzstyle{dotnode}=[inner sep=0mm,minimum height=0.5em,minimum width=1.5em]
\tikzstyle{wnode}=[inner sep=0mm,minimum height=1.8em] \tikzstyle{wnode}=[inner sep=0mm,minimum height=1.6em]
{\small {\small
\begin{scope}[] \begin{scope}[scale=1]
\tikzstyle{every node}=[scale=1]
\node [anchor=west,hnode] (n1) at (0,0) {$\mathbi{h}_{1}$}; \node [anchor=west,hnode] (n1) at (0,0) {$\mathbi{h}_{1}$};
\node [anchor=west,hnode] (n2) at ([xshift=1em,yshift=0em]n1.east) {$\mathbi{h}_{2}$}; \node [anchor=west,hnode] (n2) at ([xshift=1em,yshift=0em]n1.east) {$\mathbi{h}_{2}$};
\node [anchor=west,dotnode] (dot1) at ([xshift=1em,yshift=0em]n2.east) {$\cdots$}; \node [anchor=west,dotnode] (dot1) at ([xshift=1em,yshift=0em]n2.east) {$\cdots$};
...@@ -18,14 +18,14 @@ ...@@ -18,14 +18,14 @@
\node [anchor=west,dotnode] (dot3) at ([xshift=1em,yshift=0em]n4.east) {$\cdots$}; \node [anchor=west,dotnode] (dot3) at ([xshift=1em,yshift=0em]n4.east) {$\cdots$};
\node [anchor=west,hnode] (n5) at ([xshift=1em,yshift=0em]dot3.east) {$\mathbi{h}_{10}$}; \node [anchor=west,hnode] (n5) at ([xshift=1em,yshift=0em]dot3.east) {$\mathbi{h}_{10}$};
\node [anchor=north,wnode,font=\footnotesize] (w1) at ([xshift=0em,yshift=-1em]n1.south) {$l_1$\ :\ S}; \node [anchor=north,wnode,font=\footnotesize] (w1) at ([xshift=0em,yshift=-0.7em]n1.south) {$l_1$\ :\ S};
\node [anchor=north,wnode,font=\footnotesize] (w2) at ([xshift=0em,yshift=-1em]n2.south) {$l_3$\ :\ NP}; \node [anchor=north,wnode,font=\footnotesize] (w2) at ([xshift=0em,yshift=-0.7em]n2.south) {$l_3$\ :\ NP};
\node [anchor=north,dotnode] (dot4) at ([xshift=0em,yshift=-2.4em]dot1.south) {$\cdots$}; \node [anchor=north,dotnode] (dot4) at ([xshift=0em,yshift=-2em]dot1.south) {$\cdots$};
\node [anchor=north,wnode,font=\footnotesize] (w3) at ([xshift=0em,yshift=-1em]n3.south) {$w_1$\ :\ I}; \node [anchor=north,wnode,font=\footnotesize] (w3) at ([xshift=0em,yshift=-0.7em]n3.south) {$w_1$\ :\ I};
\node [anchor=north,dotnode] (dot5) at ([xshift=0em,yshift=-2.2em]dot2.south) {$\cdots$}; \node [anchor=north,dotnode] (dot5) at ([xshift=0em,yshift=-2em]dot2.south) {$\cdots$};
\node [anchor=north,wnode,font=\footnotesize] (w4) at ([xshift=0em,yshift=-1em]n4.south) {$w_2$\ :\ love}; \node [anchor=north,wnode,font=\footnotesize] (w4) at ([xshift=0em,yshift=-0.7em]n4.south) {$w_2$\ :\ love};
\node [anchor=north,dotnode] (dot6) at ([xshift=0em,yshift=-2.3em]dot3.south) {$\cdots$}; \node [anchor=north,dotnode] (dot6) at ([xshift=0em,yshift=-2em]dot3.south) {$\cdots$};
\node [anchor=north,wnode,font=\footnotesize] (w5) at ([xshift=0em,yshift=-1em]n5.south) {$w_3$\ :\ dogs}; \node [anchor=north,wnode,font=\footnotesize] (w5) at ([xshift=0em,yshift=-0.7em]n5.south) {$w_3$\ :\ dogs};
\node [anchor=south,wnode] (h1) at ([xshift=0em,yshift=0.3em]n3.north) {${\mathbi{h}'}_1$\ :\ }; \node [anchor=south,wnode] (h1) at ([xshift=0em,yshift=0.3em]n3.north) {${\mathbi{h}'}_1$\ :\ };
...@@ -41,10 +41,10 @@ ...@@ -41,10 +41,10 @@
\end{pgfonlayer} \end{pgfonlayer}
\node [anchor=east] (r1) at ([xshift=-2em,yshift=0em]box1.west) {词语RNN}; \node [anchor=east,font=\small] (r1) at ([xshift=-2em,yshift=0em]box1.west) {混合RNN};
\node [anchor=south west,wnode] (l1) at ([xshift=1em,yshift=6em]r1.north west) {先序遍历句法树,得到序列:}; {\small
\node [anchor=south west,wnode] (l1) at ([xshift=1em,yshift=5em]r1.north west) {先序遍历句法树,得到序列:};
\node [anchor=north west,wnode,align=center] (l2) at ([xshift=0.5em,yshift=-0.6em]l1.north east) {S\\[0.5em]$l_1$}; \node [anchor=north west,wnode,align=center] (l2) at ([xshift=0.5em,yshift=-0.6em]l1.north east) {S\\[0.5em]$l_1$};
\node [anchor=north west,wnode,align=center] (l3) at ([xshift=0.5em,yshift=0em]l2.north east) {NP\\[0.5em]$l_2$}; \node [anchor=north west,wnode,align=center] (l3) at ([xshift=0.5em,yshift=0em]l2.north east) {NP\\[0.5em]$l_2$};
\node [anchor=north west,wnode,align=center] (l4) at ([xshift=0.5em,yshift=0em]l3.north east) {PRN\\[0.5em]$l_3$}; \node [anchor=north west,wnode,align=center] (l4) at ([xshift=0.5em,yshift=0em]l3.north east) {PRN\\[0.5em]$l_3$};
...@@ -55,7 +55,7 @@ ...@@ -55,7 +55,7 @@
\node [anchor=north west,wnode,align=center] (l9) at ([xshift=0.5em,yshift=0em]l8.north east) {NP\\[0.5em]$l_6$}; \node [anchor=north west,wnode,align=center] (l9) at ([xshift=0.5em,yshift=0em]l8.north east) {NP\\[0.5em]$l_6$};
\node [anchor=north west,wnode,align=center] (l10) at ([xshift=0.5em,yshift=0em]l9.north east) {NNS\\[0.5em]$l_7$}; \node [anchor=north west,wnode,align=center] (l10) at ([xshift=0.5em,yshift=0em]l9.north east) {NNS\\[0.5em]$l_7$};
\node [anchor=north west,wnode,align=center] (l11) at ([xshift=0.5em,yshift=0em]l10.north east) {dogs\\[0.5em]$w_3$}; \node [anchor=north west,wnode,align=center] (l11) at ([xshift=0.5em,yshift=0em]l10.north east) {dogs\\[0.5em]$w_3$};
}
\draw [->,thick] ([xshift=0em,yshift=0em]w1.north) -- ([xshift=0em,yshift=0em]n1.south); \draw [->,thick] ([xshift=0em,yshift=0em]w1.north) -- ([xshift=0em,yshift=0em]n1.south);
...@@ -65,9 +65,9 @@ ...@@ -65,9 +65,9 @@
\draw [->,thick] ([xshift=0em,yshift=0em]w5.north) -- ([xshift=0em,yshift=0em]n5.south); \draw [->,thick] ([xshift=0em,yshift=0em]w5.north) -- ([xshift=0em,yshift=0em]n5.south);
\draw [->,thick] ([xshift=0em,yshift=0.7em]dot4.north) -- ([xshift=0em,yshift=-0.7em]dot1.south); \draw [->,thick] ([xshift=0em,yshift=0.6em]dot4.north) -- ([xshift=0em,yshift=-0.7em]dot1.south);
\draw [->,thick] ([xshift=0em,yshift=0.7em]dot5.north) -- ([xshift=0em,yshift=-0.7em]dot2.south); \draw [->,thick] ([xshift=0em,yshift=0.6em]dot5.north) -- ([xshift=0em,yshift=-0.7em]dot2.south);
\draw [->,thick] ([xshift=0em,yshift=0.7em]dot6.north) -- ([xshift=0em,yshift=-0.7em]dot3.south); \draw [->,thick] ([xshift=0em,yshift=0.6em]dot6.north) -- ([xshift=0em,yshift=-0.7em]dot3.south);
\draw [<->,thick] ([xshift=0em,yshift=0em]n1.east) -- ([xshift=0em,yshift=0em]n2.west); \draw [<->,thick] ([xshift=0em,yshift=0em]n1.east) -- ([xshift=0em,yshift=0em]n2.west);
...@@ -79,7 +79,7 @@ ...@@ -79,7 +79,7 @@
\draw [<->,thick] ([xshift=0em,yshift=0em]dot3.east) -- ([xshift=0em,yshift=0em]n5.west); \draw [<->,thick] ([xshift=0em,yshift=0em]dot3.east) -- ([xshift=0em,yshift=0em]n5.west);
\node [anchor=north] (label2) at ([xshift=-2em,yshift=-2em]w3.south) {(c)混合结构}; \node [anchor=north,font=\small] (label2) at ([xshift=-2em,yshift=-1em]w3.south) {(c)混合结构};
\end{scope} \end{scope}
} }
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
\node [anchor=west] (eq2) at (eq1.east) {$=$\ }; \node [anchor=west] (eq2) at (eq1.east) {$=$\ };
\draw [-] ([xshift=0.3em]eq2.east) -- ([xshift=11.6em]eq2.east); \draw [-] ([xshift=0.3em]eq2.east) -- ([xshift=11.6em]eq2.east);
\node [anchor=south west] (eq3) at ([xshift=1em]eq2.east) {$\sum_{k=1}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})$}; \node [anchor=south west] (eq3) at ([xshift=1em]eq2.east) {$\sum_{k=1}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})$};
\node [anchor=north west] (eq4) at (eq2.east) {$\sum_{s_u} \sum_{k=1}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})$}; \node [anchor=north west] (eq4) at (eq2.east) {$\sum_{s'_u} \sum_{k=1}^{K} c_{\mathbb{E}}(s'_u|t_v;s^{[k]},t^{[k]})$};
{ {
\node [anchor=south] (label1) at ([yshift=-6em,xshift=3em]eq1.north west) {利用这个公式计算}; \node [anchor=south] (label1) at ([yshift=-6em,xshift=3em]eq1.north west) {利用这个公式计算};
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
\node [anchor=north west] (line7) at ([yshift=-0.1em]line6.south west) {4: \quad \quad \textbf{foreach} $k = 1$ to $K$ \textbf{do}}; \node [anchor=north west] (line7) at ([yshift=-0.1em]line6.south west) {4: \quad \quad \textbf{foreach} $k = 1$ to $K$ \textbf{do}};
\node [anchor=north west] (line8) at ([yshift=-0.1em]line7.south west) {5: \quad \quad \quad \footnotesize{$c_{\mathbb{E}}(\seq{s}_u|\seq{t}_v;\seq{s}^{[k]},\seq{t}^{[k]}) = \sum\limits_{j=1}^{|\seq{s}^{[k]}|} \delta(s_j,s_u) \sum\limits_{i=0}^{|\seq{t}^{[k]}|} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}$}\normalsize{}}; \node [anchor=north west] (line8) at ([yshift=-0.1em]line7.south west) {5: \quad \quad \quad \footnotesize{$c_{\mathbb{E}}(\seq{s}_u|\seq{t}_v;\seq{s}^{[k]},\seq{t}^{[k]}) = \sum\limits_{j=1}^{|\seq{s}^{[k]}|} \delta(s_j,s_u) \sum\limits_{i=0}^{|\seq{t}^{[k]}|} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}$}\normalsize{}};
\node [anchor=north west] (line9) at ([yshift=-0.1em]line8.south west) {6: \quad \quad \textbf{foreach} $t_v$ appears at least one of $\{\seq{t}^{[1]},...,\seq{t}^{[K]}\}$ \textbf{do}}; \node [anchor=north west] (line9) at ([yshift=-0.1em]line8.south west) {6: \quad \quad \textbf{foreach} $t_v$ appears at least one of $\{\seq{t}^{[1]},...,\seq{t}^{[K]}\}$ \textbf{do}};
\node [anchor=north west] (line10) at ([yshift=-0.1em]line9.south west) {7: \quad \quad \quad $\lambda_{t_v}^{'} = \sum_{s_u} \sum_{k=1}^{K} c_{\mathbb{E}}(s_u|t_v;\seq{s}^{[k]},\seq{t}^{[k]})$}; \node [anchor=north west] (line10) at ([yshift=-0.1em]line9.south west) {7: \quad \quad \quad $\lambda_{t_v}^{'} = \sum_{s'_u} \sum_{k=1}^{K} c_{\mathbb{E}}(s'_u|t_v;\seq{s}^{[k]},\seq{t}^{[k]})$};
\node [anchor=north west] (line11) at ([yshift=-0.1em]line10.south west) {8: \quad \quad \quad \textbf{foreach} $s_u$ appears at least one of $\{\seq{s}^{[1]},...,\seq{s}^{[K]}\}$ \textbf{do}}; \node [anchor=north west] (line11) at ([yshift=-0.1em]line10.south west) {8: \quad \quad \quad \textbf{foreach} $s_u$ appears at least one of $\{\seq{s}^{[1]},...,\seq{s}^{[K]}\}$ \textbf{do}};
\node [anchor=north west] (line12) at ([yshift=-0.1em]line11.south west) {9: \quad \quad \quad \quad $f(s_u|t_v) = \sum_{k=1}^{K} c_{\mathbb{E}}(s_u|t_v;\seq{s}^{[k]},\seq{t}^{[k]}) \cdot (\lambda_{t_v}^{'})^{-1}$}; \node [anchor=north west] (line12) at ([yshift=-0.1em]line11.south west) {9: \quad \quad \quad \quad $f(s_u|t_v) = \sum_{k=1}^{K} c_{\mathbb{E}}(s_u|t_v;\seq{s}^{[k]},\seq{t}^{[k]}) \cdot (\lambda_{t_v}^{'})^{-1}$};
\node [anchor=north west] (line13) at ([yshift=-0.1em]line12.south west) {10: \textbf{return} $f(\cdot|\cdot)$}; \node [anchor=north west] (line13) at ([yshift=-0.1em]line12.south west) {10: \textbf{return} $f(\cdot|\cdot)$};
......
...@@ -330,7 +330,7 @@ $\seq{t}^{[2]}$ = So\; ,\; what\; is\; human\; \underline{translation}\; ? ...@@ -330,7 +330,7 @@ $\seq{t}^{[2]}$ = So\; ,\; what\; is\; human\; \underline{translation}\; ?
\label{eq:5-7} \label{eq:5-7}
\end{eqnarray} \end{eqnarray}
\parinterval 公式\eqref{eq:5-7}相当于在函数$g(\cdot)$上做了归一化,这样等式右端的结果具有一些概率的属性,比如,$0 \le \frac{g(\seq{s},\seq{t})}{\sum_{\seq{t'}}g(\seq{s},\seq{t'})} \le 1$。具体来说,对于源语言句子$\seq{s}$,枚举其所有的翻译结果,并把所对应的函数$g(\cdot)$相加作为分母,而分子是某个翻译结果$\seq{t}$所对应的$g(\cdot)$的值。 \parinterval 公式\eqref{eq:5-7}相当于在函数$g(\cdot)$上做了归一化,这样等式右端的结果具有一些概率的属性,比如,$0 \le \frac{g(\seq{s},\seq{t})}{\sum_{\seq{t'}}g(\seq{s},\seq{t'})} \le 1$ 具体来说,对于源语言句子$\seq{s}$,枚举其所有的翻译结果,并把所对应的函数$g(\cdot)$相加作为分母,而分子是某个翻译结果$\seq{t}$所对应的$g(\cdot)$的值。
\parinterval 上述过程初步建立了句子级翻译模型,并没有直接求$\funp{P}(\seq{t}|\seq{s})$,而是把问题转化为对$g(\cdot)$的设计和计算上。但是,面临着两个新的问题: \parinterval 上述过程初步建立了句子级翻译模型,并没有直接求$\funp{P}(\seq{t}|\seq{s})$,而是把问题转化为对$g(\cdot)$的设计和计算上。但是,面临着两个新的问题:
...@@ -1024,13 +1024,13 @@ f(s_u|t_v) &= &\lambda_{t_v}^{-1} \cdot \funp{P}(\seq{s}| \seq{t}) \cdot c_{\mat ...@@ -1024,13 +1024,13 @@ f(s_u|t_v) &= &\lambda_{t_v}^{-1} \cdot \funp{P}(\seq{s}| \seq{t}) \cdot c_{\mat
\parinterval 为了满足$f(\cdot|\cdot)$的概率归一化约束,易得$\lambda_{t_v}^{'}$为: \parinterval 为了满足$f(\cdot|\cdot)$的概率归一化约束,易得$\lambda_{t_v}^{'}$为:
\begin{eqnarray} \begin{eqnarray}
\lambda_{t_v}^{'}&=&\sum\limits_{s_u} c_{\mathbb{E}}(s_u|t_v;\seq{s},\seq{t}) \lambda_{t_v}^{'}&=&\sum\limits_{s'_u} c_{\mathbb{E}}(s'_u|t_v;\seq{s},\seq{t})
\label{eq:5-43} \label{eq:5-43}
\end{eqnarray} \end{eqnarray}
\parinterval 因此,$f(s_u|t_v)$的计算式可再一步变换成下式: \parinterval 因此,$f(s_u|t_v)$的计算式可再一步变换成下式:
\begin{eqnarray} \begin{eqnarray}
f(s_u|t_v)&=&\frac{c_{\mathbb{E}}(s_u|t_v;\seq{s},\seq{t})} { \sum\limits_{s_u} c_{\mathbb{E}}(s_u|t_v;\seq{s},\seq{t}) } f(s_u|t_v)&=&\frac{c_{\mathbb{E}}(s_u|t_v;\seq{s},\seq{t})} { \sum\limits_{s'_u} c_{\mathbb{E}}(s'_u|t_v;\seq{s},\seq{t}) }
\label{eq:5-44} \label{eq:5-44}
\end{eqnarray} \end{eqnarray}
......
...@@ -335,13 +335,13 @@ p_0+p_1 & = & 1 \label{eq:6-21} ...@@ -335,13 +335,13 @@ p_0+p_1 & = & 1 \label{eq:6-21}
\parinterval 另外,可以用$\odot_{i}$表示位置为$[i]$的目标语言单词对应的那些源语言单词位置的平均值,如果这个平均值不是整数则对它向上取整。比如在本例中,目标语句中第4个cept. (“.”)对应在源语言句子中的第5个单词。可表示为${\odot}_{4}=5$ \parinterval 另外,可以用$\odot_{i}$表示位置为$[i]$的目标语言单词对应的那些源语言单词位置的平均值,如果这个平均值不是整数则对它向上取整。比如在本例中,目标语句中第4个cept. (“.”)对应在源语言句子中的第5个单词。可表示为${\odot}_{4}=5$
\parinterval 利用这些新引进的概念,模型4对模型3的扭曲度进行了修改。主要是把扭曲度分解为两类参数。对于$[i]$对应的源语言单词列表($\tau_{[i]}$)中的第一个单词($\tau_{[i]1}$),它的扭曲度用如下公式计算: \parinterval 利用这些新引进的概念,模型4对模型3的扭曲度进行了修改。主要是把扭曲度分解为两类参数。对于$[i]$对应的源语言单词列表($\tau_{[i]}$)中的第一个单词($\tau_{[i]1}$),$[i]>0$它的扭曲度用如下公式计算:
\begin{eqnarray} \begin{eqnarray}
\funp{P}(\pi_{[i]1}=j|{\pi}_1^{[i]-1},{\tau}_0^l,{\varphi}_0^l,\seq{t}) & = & d_{1}(j-{\odot}_{i-1}|A(t_{[i-1]}),B(s_j)) \funp{P}(\pi_{[i]1}=j|{\pi}_1^{[i]-1},{\tau}_0^l,{\varphi}_0^l,\seq{t}) & = & d_{1}(j-{\odot}_{i-1}|A(t_{[i-1]}),B(s_j))
\label{eq:6-22} \label{eq:6-22}
\end{eqnarray} \end{eqnarray}
\noindent 其中,第$i$个目标语言单词生成的第$k$个源语言单词的位置用变量$\pi_{ik}$表示。而对于列表($\tau_{[i]}$)中的其他的单词($\tau_{[i]k},1 < k \le \varphi_{[i]}$)的扭曲度,用如下公式计算: \noindent 其中,第$i$个目标语言单词生成的第$k$个源语言单词的位置用变量$\pi_{ik}$表示。而对于列表($\tau_{[i]}$)中的其他的单词($\tau_{[i]k},1 < k \le \varphi_{[i]}$)的扭曲度,$[i]>0$用如下公式计算:
\begin{eqnarray} \begin{eqnarray}
\funp{P}(\pi_{[i]k}=j|{\pi}_{[i]1}^{k-1},\pi_1^{[i]-1},\tau_0^l,\varphi_0^l,\seq{t}) & = & d_{>1}(j-\pi_{[i]k-1}|B(s_j)) \funp{P}(\pi_{[i]k}=j|{\pi}_{[i]1}^{k-1},\pi_1^{[i]-1},\tau_0^l,\varphi_0^l,\seq{t}) & = & d_{>1}(j-\pi_{[i]k-1}|B(s_j))
......
...@@ -652,14 +652,14 @@ dr & = & {\rm{start}}_i-{\rm{end}}_{i-1}-1 ...@@ -652,14 +652,14 @@ dr & = & {\rm{start}}_i-{\rm{end}}_{i-1}-1
\parinterval 想要得到最优的特征权重,最简单的方法是枚举所有特征权重可能的取值,然后评价每组权重所对应的翻译性能,最后选择最优的特征权重作为调优的结果。但是特征权重是一个实数值,因此可以考虑把实数权重进行量化,即把权重看作是在固定间隔上的取值,比如,每隔0.01取值。即使是这样,同时枚举多个特征的权重也是非常耗时的工作,当特征数量增多时这种方法的效率仍然很低。 \parinterval 想要得到最优的特征权重,最简单的方法是枚举所有特征权重可能的取值,然后评价每组权重所对应的翻译性能,最后选择最优的特征权重作为调优的结果。但是特征权重是一个实数值,因此可以考虑把实数权重进行量化,即把权重看作是在固定间隔上的取值,比如,每隔0.01取值。即使是这样,同时枚举多个特征的权重也是非常耗时的工作,当特征数量增多时这种方法的效率仍然很低。
\parinterval 这里介绍一种更加高效的特征权重调优方法$\ \dash \ ${\small\bfnew{最小错误率训练}}\index{最小错误率训练}(Minimum Error Rate Training\index{Minimum Error Rate Training},MERT)。最小错误率训练是统计机器翻译发展中代表性工作,也是机器翻译领域原创的重要技术方法之一\upcite{DBLP:conf/acl/Och03}。最小错误率训练假设:翻译结果相对于标准答案的错误是可度量的,进而可以通过降低错误数量的方式来找到最优的特征权重。假设有样本集合$S = \{(s_1,\seq{r}_1),...,(s_N,\seq{r}_N)\}$$s_i$为样本中第$i$个源语言句子,$\seq{r}_i$为相应的参考译文。注意,$\seq{r}_i$ 可以包含多个参考译文。$S$通常被称为{\small\bfnew{调优集合}}\index{调优集合}(Tuning Set)\index{Tuning Set}。对于$S$中的每个源语句子$s_i$,机器翻译模型会解码出$n$-best推导$\hat{\seq{d}}_{i} = \{\hat{d}_{ij}\}$,其中$\hat{d}_{ij}$表示对于源语言句子$s_i$得到的第$j$个最好的推导。$\{\hat{d}_{ij}\}$可以被定义如下: \parinterval 这里介绍一种更加高效的特征权重调优方法$\ \dash \ ${\small\bfnew{最小错误率训练}}\index{最小错误率训练}(Minimum Error Rate Training\index{Minimum Error Rate Training},MERT)。最小错误率训练是统计机器翻译发展中代表性工作,也是机器翻译领域原创的重要技术方法之一\upcite{DBLP:conf/acl/Och03}。最小错误率训练假设:翻译结果相对于标准答案的错误是可度量的,进而可以通过降低错误数量的方式来找到最优的特征权重。假设有样本集合$S = \{(s^{[1]},\seq{r}^{[1]}),...,(s^{[N]},\seq{r}^{[N]})\}$$s^{[i]}$为样本中第$i$个源语言句子,$\seq{r}^{[i]}$为相应的参考译文。注意,$\seq{r}^{[i]}$ 可以包含多个参考译文。$S$通常被称为{\small\bfnew{调优集合}}\index{调优集合}(Tuning Set)\index{Tuning Set}。对于$S$中的每个源语句子$s^{[i]}$,机器翻译模型会解码出$n$-best推导$\hat{\seq{d}}^{[i]} = \{\hat{d}_{j}^{[i]}\}$,其中$\hat{d}_{j}^{[i]}$表示对于源语言句子$s^{[i]}$得到的第$j$个最好的推导。$\{\hat{d}_{j}^{[i]}\}$可以被定义如下:
\begin{eqnarray} \begin{eqnarray}
\{\hat{d}_{ij}\} & = & \arg\max_{\{d_{ij}\}} \sum_{i=1}^{M} \lambda_i \cdot h_i (d,\seq{t},\seq{s}) \{\hat{d}_{j}^{[i]}\} & = & \arg\max_{\{d_{j}^{[i]}\}} \sum_{i=1}^{M} \lambda_i \cdot h_i (d,\seq{t}^{[i]},\seq{s}^{[i]})
\label{eq:7-17} \label{eq:7-17}
\end{eqnarray} \end{eqnarray}
\parinterval 对于每个样本都可以得到$n$-best推导集合,整个数据集上的推导集合被记为$\hat{\seq{D}} = \{\hat{\seq{d}}_{1},...,\hat{\seq{d}}_{s}\}$。进一步,令所有样本的参考译文集合为$\seq{R} = \{\seq{r}_1,...,\seq{r}_N\}$。最小错误率训练的目标就是降低$\hat{\seq{D}}$相对于$\seq{R}$的错误。也就是,通过调整不同特征的权重$\lambda = \{ \lambda_i \}$,让错误率最小,形式化描述为: \parinterval 对于每个样本都可以得到$n$-best推导集合,整个数据集上的推导集合被记为$\hat{\seq{D}} = \{\hat{\seq{d}}^{[1]},...,\hat{\seq{d}}^{[N]}\}$。进一步,令所有样本的参考译文集合为$\seq{R} = \{\seq{r}^{[1]},...,\seq{r}^{[N]}\}$。最小错误率训练的目标就是降低$\hat{\seq{D}}$相对于$\seq{R}$的错误。也就是,通过调整不同特征的权重$\lambda = \{ \lambda_i \}$,让错误率最小,形式化描述为:
\begin{eqnarray} \begin{eqnarray}
\hat{\lambda} & = & \arg\min_{\lambda} \textrm{Error}(\hat{\seq{D}},\seq{R}) \hat{\lambda} & = & \arg\min_{\lambda} \textrm{Error}(\hat{\seq{D}},\seq{R})
\label{eq:7-18} \label{eq:7-18}
......
...@@ -23,8 +23,8 @@ ...@@ -23,8 +23,8 @@
\node [anchor=west] (t4) at ([xshift=0.5em,]t3.east) {ball}; \node [anchor=west] (t4) at ([xshift=0.5em,]t3.east) {ball};
\draw [->] ([xshift=0em]t3.north) .. controls +(north:1em) and +(north:1em) .. ([xshift=-0.2em]t4.north); \draw [->] ([xshift=0em]t3.north) .. controls +(north:1em) and +(north:1em) .. ([xshift=-0.2em]t4.north);
\draw [->] ([xshift=0.2em]t4.north) .. controls +(north:2.5em) and +(north:2.5em) .. ([xshift=0.2em]t2.north); \draw [<-] ([xshift=0.2em]t4.north) .. controls +(north:2.5em) and +(north:2.5em) .. ([xshift=0.2em]t2.north);
\draw [->] ([xshift=0.0em]t1.north) .. controls +(north:2.5em) and +(north:2.5em) .. ([xshift=-0.2em]t2.north); \draw [<-] ([xshift=0.0em]t1.north) .. controls +(north:2.5em) and +(north:2.5em) .. ([xshift=-0.2em]t2.north);
\node [anchor=north west] (cap2) at ([yshift=-0.2em,xshift=-0.5em]t2.south west) {\small{(b) 依存树}}; \node [anchor=north west] (cap2) at ([yshift=-0.2em,xshift=-0.5em]t2.south west) {\small{(b) 依存树}};
\end{scope} \end{scope}
......
...@@ -9243,6 +9243,94 @@ author = {Zhuang Liu and ...@@ -9243,6 +9243,94 @@ author = {Zhuang Liu and
publisher = {Asian Conference on Machine Learning}, publisher = {Asian Conference on Machine Learning},
year = {2018} year = {2018}
} }
@inproceedings{DBLP:journals/corr/abs-1810-02525,
author = {Peter Henderson and
Joshua Romoff and
Joelle Pineau},
title = {Where Did My Optimum Go?: An Empirical Analysis of Gradient Descent
Optimization in Policy Gradient Methods},
publisher = {CoRR},
volume = {abs/1810.02525},
year = {2018}
}
@inproceedings{DBLP:conf/nips/Kakade01,
author = {Sham M. Kakade},
title = {A Natural Policy Gradient},
pages = {1531--1538},
publisher = {Advances in Neural Information Processing Systems},
year = {2001}
}
@inproceedings{DBLP:conf/icml/KoolHW19,
author = {Wouter Kool and
Herke van Hoof and
Max Welling},
title = {Stochastic Beams and Where To Find Them: The Gumbel-Top-k Trick for
Sampling Sequences Without Replacement},
series = {Proceedings of Machine Learning Research},
volume = {97},
pages = {3499--3508},
publisher = {International Conference on Machine Learning},
year = {2019}
}
@inproceedings{DBLP:journals/corr/abs-2012-13866,
author = {Bei Li and
Ziyang Wang and
Hui Liu and
Quan Du and
Tong Xiao and
Chunliang Zhang and
Jingbo Zhu},
title = {Learning Light-Weight Translation Models from Deep Transformer},
publisher = {CoRR},
volume = {abs/2012.13866},
year = {2020}
}
@inproceedings{DBLP:conf/aclnmt/HuLLLLWXZ20,
author = {Chi Hu and
Bei Li and
Yinqiao Li and
Ye Lin and
Yanyang Li and
Chenglong Wang and
Tong Xiao and
Jingbo Zhu},
title = {The NiuTrans System for {WNGT} 2020 Efficiency Task},
pages = {204--210},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2020}
}
@inproceedings{DBLP:conf/acl/WeiYHZWL20,
author = {Xiangpeng Wei and
Heng Yu and
Yue Hu and
Yue Zhang and
Rongxiang Weng and
Weihua Luo},
title = {Multiscale Collaborative Deep Models for Neural Machine Translation},
pages = {414--426},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2020}
}
@inproceedings{DBLP:conf/emnlp/DeNeefeKWM07,
author = {Steve DeNeefe and
Kevin Knight and
Wei Wang and
Daniel Marcu},
title = {What Can Syntax-Based {MT} Learn from Phrase-Based MT?},
pages = {755--763},
publisher = {Conference on Empirical Methods in Natural Language Processing},
year = {2007}
}
@inproceedings{DBLP:conf/emnlp/ShiPK16,
author = {Xing Shi and
Inkit Padhi and
Kevin Knight},
title = {Does String-Based Neural {MT} Learn Source Syntax?},
pages = {1526--1534},
publisher = {Conference on Empirical Methods in Natural Language Processing},
year = {2016}
}
%%%%% chapter 15------------------------------------------------------ %%%%% chapter 15------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论