Commit 66fdb624 by 曹润柘

合并分支 'caorunzhe' 到 'master'

Caorunzhe

查看合并请求 !471
parents 6f3750df 2b74625f
......@@ -34,7 +34,7 @@
\node[anchor=north,word] (tgt_1) at ([yshift=-0.4em]i_0.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$sos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_4.south){to};
\node[anchor=north,word] (tgt_2) at ([yshift=-0.4em]i_5.south){school};
......@@ -103,7 +103,7 @@
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){school};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$/s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$eos$>$};
\foreach \point in {0,1,2,3}{
\node[cir,font=\fontsize{6}{6}\selectfont,inner sep=0.8pt](c_\point) at (8.2cm+\point*2em,7.5cm-1em*\point){\bm{$\sum$}};
......@@ -140,7 +140,7 @@
\node[anchor=south,word] (src_1) at ([xshift=-2em,yshift=0.4em]r_0.north){$<$p$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_0.north){};
\node[anchor=south,word] at ([yshift=0.4em]r_1.north){上学};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$s$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$sos$>$};
\node[anchor=south,word] (src_2) at ([xshift=2em,yshift=0.4em]r_2.north){$<$p$>$};
......
......@@ -34,7 +34,7 @@
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$sos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_4.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_5.south){school};
......@@ -98,7 +98,7 @@
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){school};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$/s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$eos$>$};
\foreach \point in {0,1,2,3}{
\node[cir,font=\fontsize{6}{6}\selectfont,inner sep=0.8pt](c_\point) at (8.2cm+\point*2em,7.5cm-1em*\point){\bm{$\sum$}};
......@@ -135,7 +135,7 @@
\node[anchor=south,word] (src_1) at ([xshift=-2em,yshift=0.4em]r_0.north){$<$p$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_0.north){};
\node[anchor=south,word] at ([yshift=0.4em]r_1.north){上学};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$s$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$sos$>$};
\node[anchor=south,word] (src_2) at ([xshift=2em,yshift=0.4em]r_2.north){$<$p$>$};
......
......@@ -34,7 +34,7 @@
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){$<$p$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){$<$sos$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_4.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_5.south){school};
......@@ -99,7 +99,7 @@
\node[anchor=north,word] at ([yshift=-0.4em]i_0.south){go};
\node[anchor=north,word] at ([yshift=-0.4em]i_1.south){to};
\node[anchor=north,word] at ([yshift=-0.4em]i_2.south){school};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$/s$>$};
\node[anchor=north,word] at ([yshift=-0.4em]i_3.south){$<$eos$>$};
\foreach \point in {0,1,2,3}{
\node[cir,font=\fontsize{6}{6}\selectfont,inner sep=0.8pt](c_\point) at (8.2cm+\point*2em,7.5cm-1em*\point){\bm{$\sum$}};
......@@ -136,7 +136,7 @@
\node[anchor=south,word] (src_1) at ([xshift=-2em,yshift=0.4em]r_0.north){$<$p$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_0.north){};
\node[anchor=south,word] at ([yshift=0.4em]r_1.north){上学};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$s$>$};
\node[anchor=south,word] at ([yshift=0.4em]r_2.north){$<$sos$>$};
\node[anchor=south,word] (src_2) at ([xshift=2em,yshift=0.4em]r_2.north){$<$p$>$};
......
......@@ -22,8 +22,8 @@
\draw[->,thick]([xshift=0.4cm,yshift=-0.4cm]num8.east)--([xshift=1.5cm,yshift=-0.4cm]num8.east);
\node(num17)[num,right of = num8,xshift= 2.5cm,fill=red!10]{6};
\node(num18)[num,right of = num17,xshift= 0.6cm,fill=green!10]{3};
\node(num19)[num,below of = num17,yshift=-0.6cm,fill=yellow!10]{8};
\node(num18)[num,right of = num17,xshift= 0.6cm,fill=green!10]{8};
\node(num19)[num,below of = num17,yshift=-0.6cm,fill=yellow!10]{3};
\node(num20)[num,below of = num18,yshift= -0.6cm,fill=blue!10]{4};
\node [right of = num20,xshift= 0.7cm]{};
......
......@@ -63,9 +63,9 @@ $\otimes$: & 按位乘运算 \\
\draw[-latex,thick] (b.east) -- (c2.west);
\draw[-latex,thick] (c2.east) -- ([xshift=0.4cm]c2.east);
\node[inner sep=0pt, font=\tiny] at (0.75cm, -0.4cm) {$\mathbi{X}$};
\node[inner sep=0pt, font=\tiny] at ([yshift=-0.8cm]a.south) {$\mathbi{B}=\mathbi{X} * \mathbi{V} + \mathbi{b}_{\mathbi{W}}$};
\node[inner sep=0pt, font=\tiny] at ([yshift=-0.8cm]b.south) {$\mathbi{A}=\mathbi{X} * \mathbi{W} + \mathbi{b}_{\mathbi{V}}$};
\node[inner sep=0pt, font=\tiny] at (8.2cm, -0.4cm) {$\mathbi{Y}=\mathbi{A} \otimes \sigma(\mathbi{B})$};
\node[inner sep=0pt, font=\tiny] at (0.75cm, -0.4cm) {$\mathbi{x}$};
\node[inner sep=0pt, font=\tiny] at ([yshift=-0.8cm]a.south) {$\mathbi{B}=\mathbi{x} * \mathbi{V} + \mathbi{b}_{\mathbi{W}}$};
\node[inner sep=0pt, font=\tiny] at ([yshift=-0.8cm]b.south) {$\mathbi{A}=\mathbi{x} * \mathbi{W} + \mathbi{b}_{\mathbi{V}}$};
\node[inner sep=0pt, font=\tiny] at (8.2cm, -0.4cm) {$\mathbi{y}=\mathbi{A} \otimes \sigma(\mathbi{B})$};
\end{tikzpicture}
\ No newline at end of file
......@@ -40,7 +40,7 @@
\node[vuale] at ([xshift=0.9em]r3_1.east) {$\mathbi{z}_1$};
\node (t1) at (2.5em, -1em) {\large{$\cdots$}};
\node [anchor=north,font=\tiny] at ([yshift=-0.2em]t1.south) {(a) 传统卷积};
\node [anchor=north,font=\tiny] at ([yshift=-0.2em]t1.south) {(a) 标准卷积};
\end{scope}
\begin{scope}[xshift=4cm]
......
......@@ -85,10 +85,10 @@
%\draw [thick] (3.6cm, -0.3cm) -- (3.6cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{Convolutional layer with \\ multiple filter widths and \\ feature maps} (6cm,-0.5cm) -- (6cm, -0.3cm);
%\draw [thick] (7.2cm, -0.3cm) -- (7.2cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{Max-over-time\\ pooling} (9cm,-0.5cm) -- (9cm, -0.3cm);
%\draw [thick] (10cm, -0.3cm) -- (10cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{Fully connected layer \\ with dropout and \\ softmax output} (11.7cm,-0.5cm) -- (11.7cm, -0.3cm);
\draw [thick] (0cm, -0.3cm) -- (0cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{维度大小为 $m \times k$ \\ 的静态与非静态通道\\的句子表示} (2.4cm,-0.5cm) -- (2.4cm, -0.3cm);
\draw [thick] (0cm, -0.3cm) -- (0cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{维度大小为 $m \times K$ \\ 的静态与非静态通道\\的句子表示} (2.4cm,-0.5cm) -- (2.4cm, -0.3cm);
\draw [thick] (3.6cm, -0.3cm) -- (3.6cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{具有多个不同大小\\的卷积核和特征图\\的卷积层} (6cm,-0.5cm) -- (6cm, -0.3cm);
\draw [thick] (7.2cm, -0.3cm) -- (7.2cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{最大池化} (9cm,-0.5cm) -- (9cm, -0.3cm);
\draw [thick] (10cm, -0.3cm) -- (10cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{带有dropout\\和softmax输出\\的全连接层} (11.7cm,-0.5cm) -- (11.7cm, -0.3cm);
\draw [thick] (10cm, -0.3cm) -- (10cm, -0.5cm) -- node[font=\tiny, align=center,yshift=-0.5cm]{带有Dropout\\和Softmax输出\\的全连接层} (11.7cm,-0.5cm) -- (11.7cm, -0.3cm);
%\node [font=\Large] at (5.2cm,-2cm){$h_i = dot(F,x_{i:i+l-1})+b$};
......
......@@ -102,7 +102,7 @@
\parinterval 首先再来回顾一下{\chapterten}介绍的循环神经网络,虽然它很强大,但是也存在一些弊端。其中比较突出的问题是,循环神经网络每个循环单元都有向前依赖性,也就是当前时间步的处理依赖前一时间步处理的结果。这个性质可以使序列的“历史”信息不断被传递,但是也造成模型运行效率的下降。特别是对于自然语言处理任务,序列往往较长,无论是传统的RNN结构,还是更为复杂的LSTM结构,都需要很多次循环单元的处理才能够捕捉到单词之间的长距离依赖。由于需要多个循环单元的处理,距离较远的两个单词之间的信息传递变得很复杂。
\parinterval 针对这些问题,研究人员提出了一种全新的模型$\ \dash\ $Transformer\index{Transformer}\upcite{vaswani2017attention}。与循环神经网络等传统模型不同,Transformer模型仅仅使用自注意力机制和标准的前馈神经网络,完全不依赖任何循环单元或者卷积操作。自注意力机制的优点在于可以直接对序列中任意两个单元之间的关系进行建模,这使得长距离依赖等问题可以更好地被求解。此外,自注意力机制非常适合在GPU 上进行并行化,因此模型训练的速度更快。表\ref{tab:12-11}对比了RNN、CNN和Transformer层类型的复杂度\footnote{顺序操作数指序列中的位置按顺序操作的次数,由于Transformer和CNN都可以并行计算,所以是1;路径长度指序列中的一个位置和另外任意一个位置在网络中的距离。}
\parinterval 针对这些问题,研究人员提出了一种全新的模型$\ \dash\ $Transformer\index{Transformer}\upcite{vaswani2017attention}。与循环神经网络等传统模型不同,Transformer模型仅仅使用自注意力机制和标准的前馈神经网络,完全不依赖任何循环单元或者卷积操作。自注意力机制的优点在于可以直接对序列中任意两个单元之间的关系进行建模,这使得长距离依赖等问题可以更好地被求解。此外,自注意力机制非常适合在GPU 上进行并行化,因此模型训练的速度更快。表\ref{tab:12-11}对比了RNN、CNN和Transformer层类型的复杂度\footnote{顺序操作数指模型处理一个序列所需要的操作数,由于Transformer和CNN都可以并行计算,所以是1;路径长度指序列中任意两个单词在网络中的距离。}
%----------------------------------------------
\begin{table}[htp]
......
\begin{tikzpicture}
\tikzstyle{bignode} = [line width=0.6pt,draw=black,minimum width=6.3em,minimum height=2.2em,fill=blue!20,rounded corners=2pt]
\tikzstyle{middlenode} = [line width=0.6pt,draw=black,minimum width=5.6em,minimum height=2.2em,fill=blue!20,rounded corners=2pt]
\tikzstyle{bignode} = [line width=0.6pt,draw=black,minimum width=6.3em,minimum height=2.2em,fill=white]
\tikzstyle{middlenode} = [line width=0.6pt,draw=black,minimum width=5.6em,minimum height=2.2em,fill=white]
\node [anchor=center] (node1-1) at (0,0) {\scriptsize{汉语}};
\node [anchor=west] (node1-2) at ([xshift=1.0em]node1-1.east) {\scriptsize{英语}};
\node [anchor=north] (node1-3) at ([xshift=1.65em]node1-1.south) {\scriptsize{反向翻译模型}};
\node [anchor=west] (node1-2) at ([xshift=0.8em]node1-1.east) {\scriptsize{英语}};
\node [anchor=north] (node1-3) at ([xshift=1.45em]node1-1.south) {\scriptsize{反向翻译模型}};
\draw [->,line width=0.6pt](node1-1.east)--(node1-2.west);
\begin{pgfonlayer}{background}
{
\node[fill=red!20,rounded corners=2pt,inner sep=0.2em,draw=black,line width=0.6pt,minimum width=6.0em] [fit =(node1-1)(node1-2)(node1-3)] (remark1) {};
\node[fill=blue!20,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=6.0em,drop shadow,rounded corners=2pt] [fit =(node1-1)(node1-2)(node1-3)] (remark1) {};
}
\end{pgfonlayer}
\node [anchor=north](node2-1) at ([xshift=-1.93em,yshift=-1.95em]remark1.south){\scriptsize{汉语}};
\node [anchor=north](node2-1-2) at (node2-1.south){\scriptsize{真实数据}};
\begin{pgfonlayer}{background}
{
\node[fill=blue!20,rounded corners=2pt,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=3.85em] [fit =(node2-1)(node2-1-2)] (remark2-1) {};
}
\end{pgfonlayer}
\node [anchor=west](node2-2) at ([xshift=0.82em,yshift=0.68em]remark2-1.east){\scriptsize{英语}};
\node [anchor=north](node2-2-2) at (node2-2.south){\scriptsize{真实数据}};
\begin{pgfonlayer}{background}
{
\node[fill=green!20,rounded corners=2pt,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=3.85em] [fit =(node2-2)(node2-2-2)] (remark2-2) {};
}
\end{pgfonlayer}
\node [anchor=north,fill=green!20,inner sep=0.1em,minimum width=3em,draw=black,line width=0.6pt,rounded corners=2pt](node2-1) at ([xshift=-1.5em,yshift=-1.95em]remark1.south){\scriptsize{汉语}};
\node [anchor=west,fill=green!20,inner sep=0.1em,minimum width=3em,draw=black,line width=0.6pt,rounded corners=2pt](node2-2) at (node2-1.east){\scriptsize{英语}};
\draw [->,line width=0.6pt]([yshift=-2.0em]remark1.south)--(remark1.south) node [pos=0.5,right] (pos1) {\scriptsize{训练}};
\node [anchor=west](node3-1) at ([xshift=5.0em,yshift=0.1em]node1-2.east){\scriptsize{汉语}};
\node [anchor=north](node3-1-2) at (node3-1.south){\scriptsize{真实数据}};
\begin{pgfonlayer}{background}
{
\node[fill=blue!20,rounded corners=2pt,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=3.85em] [fit =(node3-1)(node3-1-2)] (remark3-1) {};
}
\end{pgfonlayer}
\node [anchor=north](node3-2) at ([yshift=-2.15em]remark3-1.south){\scriptsize{英语}};
\node [anchor=north](node3-2-2) at (node3-2.south){\scriptsize{伪数据}};
\begin{pgfonlayer}{background}
{
\node[fill=yellow!20,rounded corners=2pt,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=3.85em] [fit =(node3-2)(node3-2-2)] (remark3-2) {};
}
\end{pgfonlayer}
\node [anchor=west,fill=yellow!20,inner sep=0.1em,minimum width=3em,draw=black,line width=0.6pt,rounded corners=2pt](node3-1) at ([xshift=5.0em,yshift=0.0em]node1-2.east){\scriptsize{汉语}};
\node [anchor=north,fill=red!20,inner sep=0.1em,minimum width=3em,draw=black,line width=0.6pt,rounded corners=2pt](node3-2) at ([yshift=-2.15em]node3-1.south){\scriptsize{英语}};
\draw [->,line width=0.6pt](remark3-1.south)--(remark3-2.north) node [pos=0.5,right] (pos2) {\scriptsize{翻译}};
\draw [->,line width=0.6pt](node3-1.south)--(node3-2.north) node [pos=0.5,right] (pos2) {\scriptsize{翻译}};
\begin{pgfonlayer}{background}
{
\node[rounded corners=2pt,inner sep=0.3em,draw=black,line width=0.6pt,dotted] [fit =(remark3-1)(remark3-2)] (remark2) {};
\node[rounded corners=2pt,inner sep=0.3em,draw=black,line width=0.6pt,dotted] [fit =(node3-1)(node3-2)] (remark2) {};
}
\end{pgfonlayer}
\draw [->,line width=0.6pt](remark1.east)--([yshift=2.40em]remark2.west) node [pos=0.5,above] (pos2) {\scriptsize{模型翻译}};
\draw [->,line width=0.6pt](remark1.east)--([yshift=0.85em]remark2.west) node [pos=0.5,above] (pos2) {\scriptsize{模型翻译}};
\node [anchor=south](pos2-2) at ([yshift=-0.5em]pos2.north){\scriptsize{使用反向}};
\draw[decorate,thick,decoration={brace,amplitude=5pt}] ([yshift=1.3em,xshift=1.5em]node3-1.east) -- ([yshift=-7.7em,xshift=1.5em]node3-1.east) node [pos=0.1,right,xshift=0.0em,yshift=0.0em] (label1) {\scriptsize{{混合}}};
\draw[decorate,thick,decoration={brace,amplitude=5pt}] ([yshift=1.3em,xshift=1.0em]node3-1.east) -- ([yshift=-5.2em,xshift=1.0em]node3-1.east) node [pos=0.1,right,xshift=0.0em,yshift=0.0em] (label1) {\scriptsize{{混合}}};
\node [anchor=west](node4-1) at ([xshift=3.5em,yshift=3.94em]node3-2.east){\scriptsize{英语}};
\node [anchor=north](node4-1-2) at (node4-1.south){\scriptsize{伪数据}};
\begin{pgfonlayer}{background}
{
\node[fill=yellow!20,rounded corners=2pt,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=3.85em] [fit =(node4-1)(node4-1-2)] (remark4-1) {};
}
\end{pgfonlayer}
\node [anchor=north](node4-2) at ([yshift=-1.59em]node4-1.south){\scriptsize{英语}};
\node [anchor=north](node4-2-2) at (node4-2.south){\scriptsize{真实数据}};
\begin{pgfonlayer}{background}
{
\node[fill=green!20,rounded corners=2pt,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=3.85em] [fit =(node4-2)(node4-2-2)] (remark4-2) {};
}
\end{pgfonlayer}
\node [anchor=west](node4-3) at ([xshift=1.7em]node4-2.east){\scriptsize{汉语}};
\node [anchor=north](node4-3-2) at (node4-3.south){\scriptsize{真实数据}};
\begin{pgfonlayer}{background}
{
\node[fill=blue!20,rounded corners=2pt,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=3.85em] [fit =(node4-3)(node4-3-2)] (remark4-3) {};
}
\end{pgfonlayer}
\node [anchor=west](node4-4) at ([xshift=1.7em]node4-1.east){\scriptsize{汉语}};
\node [anchor=north](node4-4-2) at (node4-4.south){\scriptsize{真实数据}};
\node [anchor=west,fill=red!20,inner sep=0.1em,minimum width=3em,draw=black,line width=0.6pt,rounded corners=2pt](node4-1) at ([xshift=2.0em,yshift=1.6em]node3-2.east){\scriptsize{英语}};
\node [anchor=north,fill=green!20,inner sep=0.1em,minimum width=3em,draw=black,line width=0.6pt,rounded corners=2pt](node4-2) at (node4-1.south){\scriptsize{英语}};
\node [anchor=west,fill=green!20,inner sep=0.1em,minimum width=3em,draw=black,line width=0.6pt,rounded corners=2pt](node4-3) at (node4-1.east){\scriptsize{汉语}};
\node [anchor=north,fill=green!20,inner sep=0.1em,minimum width=3em,draw=black,line width=0.6pt,rounded corners=2pt](node4-4) at (node4-3.south){\scriptsize{汉语}};
\begin{pgfonlayer}{background}
{
\node[fill=blue!20,rounded corners=2pt,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=3.85em] [fit =(node4-4)(node4-4-2)] (remark4-3) {};
}
\end{pgfonlayer}
\node [anchor=center] (node5-1) at ([xshift=4.3em,yshift=-1.48em]node4-4.east) {\scriptsize{英语}};
\node [anchor=west] (node5-2) at ([xshift=1.0em]node5-1.east) {\scriptsize{汉语}};
\node [anchor=center] (node5-1) at ([xshift=3.4em,yshift=0.25em]node4-3.east) {\scriptsize{英语}};
\node [anchor=west] (node5-2) at ([xshift=0.8em]node5-1.east) {\scriptsize{汉语}};
\node [anchor=north] (node5-3) at ([xshift=1.65em]node5-1.south) {\scriptsize{正向翻译模型}};
\draw [->,line width=0.6pt](node5-1.east)--(node5-2.west);
\begin{pgfonlayer}{background}
{
\node[fill=red!20,rounded corners=2pt,inner sep=0.2em,draw=black,line width=0.6pt,minimum width=6.0em] [fit =(node5-1)(node5-2)(node5-3)] (remark3) {};
\node[fill=blue!20,inner sep=0.1em,draw=black,line width=0.6pt,minimum width=6.0em,drop shadow,rounded corners=2pt] [fit =(node5-1)(node5-2)(node5-3)] (remark3) {};
}
\end{pgfonlayer}
\draw [->,line width=0.6pt]([xshift=-2em]remark3.west)--(remark3.west) node [pos=0.5,above] (pos3) {\scriptsize{训练}};
\node [anchor=south](d1) at ([xshift=0.0em,yshift=2em]remark3.north){\scriptsize{真实数据:}};
\node [anchor=north](d2) at ([xshift=0.35em]d1.south){\scriptsize{伪数据:}};
\node [anchor=south](d3) at ([xshift=0.0em,yshift=0em]d1.north){\scriptsize{额外数据:}};
\node [anchor=west,fill=green!20,minimum width=1em](d1-1) at ([xshift=-0.0em]d1.east){};
\node [anchor=west,fill=red!20,minimum width=1em](d2-1) at ([xshift=-0.0em]d2.east){};
\node [anchor=west,fill=yellow!20,minimum width=1em](d3-1) at ([xshift=-0.0em]d3.east){};
\end{tikzpicture}
\ No newline at end of file
......@@ -35,9 +35,9 @@
\begin{pgfonlayer}{background}
{
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w0) (index0)] (wordbox0) {};
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w1) (index1)] (wordbox1) {};
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w2) (index2)] (wordbox2) {};
\node [rectangle,draw,inner sep=0.1em,fill=ugreen!20!white] [fit = (w0) (index0)] (wordbox0) {};
\node [rectangle,draw,inner sep=0.1em,fill=ugreen!20!white] [fit = (w1) (index1)] (wordbox1) {};
\node [rectangle,draw,inner sep=0.1em,fill=ugreen!20!white] [fit = (w2) (index2)] (wordbox2) {};
}
\end{pgfonlayer}
......
......@@ -8,7 +8,7 @@
\node [anchor=south] (w1) at (o1.north) {\footnotesize{桌子}};
\node [anchor=south] (w2) at (o2.north) {\footnotesize{椅子}};
{
\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$\textrm{cosine}(\textrm{‘桌子’},\textrm{‘椅子’})=0.5$}};
\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$\textrm{cos}(\textrm{‘桌子’},\textrm{‘椅子’})=0.5$}};
}
\end{scope}
}
......
......@@ -7,7 +7,7 @@
\node [anchor=south] (w1) at (o1.north) {\footnotesize{桌子}};
\node [anchor=south] (w2) at (o2.north) {\footnotesize{椅子}};
{
\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$\textrm{cosine}(\textrm{‘桌子’},\textrm{‘椅子’})=0$}};
\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$\textrm{cos}(\textrm{‘桌子’},\textrm{‘椅子’})=0$}};
}
\end{scope}
......
......@@ -1564,9 +1564,9 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\parinterval 为了使神经网络模型训练更加稳定,通常还会考虑其他策略。
\begin{itemize}
\item {\small\bfnew{批量归一化}}\index{批量归一化}(Batch Normalization)\index{Batch Normalization}。批量归一化,顾名思义,是以进行学习时的小批量样本为单位进行归一化\upcite{ioffe2015batch}。具体而言,就是对神经网络隐层输出的每一个维度,沿着批次的方向进行均值为0、方差为1的归一化。在深层神经网络中,每一层网络都可以使用批量归一化操作。这样使神经网络任意一层的输入不至于过大或过小,从而防止隐层中异常值导致模型状态的巨大改变。
\item {\small\bfnew{批量标准化}}\index{批量标准化}(Batch Normalization)\index{Batch Normalization}。批量标准化,顾名思义,是以进行学习时的小批量样本为单位进行标准化\upcite{ioffe2015batch}。具体而言,就是对神经网络隐层输出的每一个维度,沿着批次的方向进行均值为0、方差为1的标准化。在深层神经网络中,每一层网络都可以使用批量标准化操作。这样使神经网络任意一层的输入不至于过大或过小,从而防止隐层中异常值导致模型状态的巨大改变。
\item {\small\bfnew{归一化}}\index{层归一化}(Layer Normalization)\index{Layer Normalization}。类似的,层归一化更多是针对自然语言处理这种序列处理任务\upcite{Ba2016LayerN},它和批量归一化的原理是一样的,只是归一化操作是在序列上同一层网络的输出结果上进行的,也就是归一化操作沿着序列方向进行。这种方法可以很好的避免序列上不同位置神经网络输出结果的不可比性。同时由于归一化后所有的结果都转化到一个可比的范围,使得隐层状态可以在不同层之间进行自由组合。
\item {\small\bfnew{标准化}}\index{层标准化}(Layer Normalization)\index{Layer Normalization}。类似的,层标准化更多是针对自然语言处理这种序列处理任务\upcite{Ba2016LayerN},它和批量标准化的原理是一样的,只是标准化操作是在序列上同一层网络的输出结果上进行的,也就是标准化操作沿着序列方向进行。这种方法可以很好的避免序列上不同位置神经网络输出结果的不可比性。同时由于标准化后所有的结果都转化到一个可比的范围,使得隐层状态可以在不同层之间进行自由组合。
\item {\small\bfnew{残差网络}}\index{残差网络}(Residual Networks)\index{Residual Networks}。最初,残差网络是为了解决神经网络持续加深时的模型退化问题\upcite{DBLP:journals/corr/HeZRS15},但是残差结构对解决梯度消失和梯度爆炸问题也有所帮助。有了残差结构,可以很轻松的构建几十甚至上百层的神经网络,而不用担心层数过深造成的梯度消失问题。残差网络的结构如图\ref{fig:9-51}所示。图\ref{fig:9-51}中右侧的曲线叫做{\small\bfnew{跳接}}\index{跳接}(Skip Connection)\index{Skip Connection},通过跳接在激活函数前,将上一层(或几层)之前的输出与本层计算的输出相加,将求和的结果输入到激活函数中作为本层的输出。假设残差结构的输入为$ {\mathbi{x}}_l $,输出为$ {\mathbi{x}}_{l+1} $,则有
......@@ -1593,7 +1593,7 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\label{eq:9-45}
\end{eqnarray}
由上式可知,残差网络可以将后一层的梯度$ \frac{\partial L}{\partial {\mathbi{x}}_{l+1}} $不经过任何乘法项直接传递到$ \frac{\partial L}{\partial {\mathbi{x}}_l} $,从而缓解了梯度经过每一层后多次累乘造成的梯度消失问题。在{\chaptertwelve}中还会看到,在机器翻译中残差结构可以和层归一化一起使用,而且这种组合可以取得很好的效果。
由上式可知,残差网络可以将后一层的梯度$ \frac{\partial L}{\partial {\mathbi{x}}_{l+1}} $不经过任何乘法项直接传递到$ \frac{\partial L}{\partial {\mathbi{x}}_l} $,从而缓解了梯度经过每一层后多次累乘造成的梯度消失问题。在{\chaptertwelve}中还会看到,在机器翻译中残差结构可以和层标准化一起使用,而且这种组合可以取得很好的效果。
\end{itemize}
......@@ -1946,7 +1946,7 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\label{eq:9-120}
\end{eqnarray}
\noindent 这里,exp($\cdot$)表示指数函数。Softmax函数是一个典型的归一化函数,它可以将输入的向量的每一维都转化为0-1之间的数,同时保证所有维的和等于1。Softmax的另一个优点是,它本身(对于输出的每一维)都是可微的(如图\ref{fig:softmax}所示),因此可以直接使用基于梯度的方法进行优化。实际上,Softmax经常被用于分类任务。也可以把机器翻译中目标语单词的生成看作一个分类问题,它的类别数是|$V$|。
\noindent 这里,exp($\cdot$)表示指数函数。Softmax函数是一个典型的标准化函数,它可以将输入的向量的每一维都转化为0-1之间的数,同时保证所有维的和等于1。Softmax的另一个优点是,它本身(对于输出的每一维)都是可微的(如图\ref{fig:softmax}所示),因此可以直接使用基于梯度的方法进行优化。实际上,Softmax经常被用于分类任务。也可以把机器翻译中目标语单词的生成看作一个分类问题,它的类别数是|$V$|。
%----------------------------------------------
\begin{figure}[htp]
......@@ -2040,7 +2040,7 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\subsubsection{1. One-hot编码}
\parinterval {\small\sffamily\bfseries{One-hot编码}}\index{One-hot编码}(也称{\small\sffamily\bfseries{独热编码}}\index{独热编码})是传统的单词表示方法。One-hot编码把单词表示为词汇表大小的0-1向量,其中只有该词所对应的那一项是1,而其余所有项都是。举个简单的例子,假如有一个词典,里面包含10k个单词,并进行编号。那么每个单词都可以表示为一个10k维的One-hot向量,它仅在对应编号那个维度为1,其他维度都为0,如图\ref{fig:9-64}所示。
\parinterval {\small\sffamily\bfseries{One-hot编码}}\index{One-hot编码}(也称{\small\sffamily\bfseries{独热编码}}\index{独热编码})是传统的单词表示方法。One-hot编码把单词表示为词汇表大小的0-1向量,其中只有该词所对应的那一项是1,而其余所有项都是0。举个简单的例子,假如有一个词典,里面包含10k个单词,并进行编号。那么每个单词都可以表示为一个10k维的One-hot向量,它仅在对应编号那个维度为1,其他维度都为0,如图\ref{fig:9-64}所示。
%----------------------------------------------
\begin{figure}[htp]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论