reorganization

a1e891a5 · xiaotong · e7420cbc · a1e891a5
Commit a1e891a5 authored Dec 21, 2019 by xiaotong
--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex
@@ -1588,6 +1588,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \begin{frame}{另一种循环单元 - 门循环单元(GRU)}
 \begin{itemize}
 \item GRU是LSTM的一个变种，它把隐藏状态$h$和记忆$c$合并成一个隐藏状态$h$，同时使用了更少的``门''单元，大大提升了计算效率
+    \begin{itemize}
+    \item 在NMT中GRU会带来20-25\%的速度提升
+    \end{itemize}
 \end{itemize}
 %%% 图
 \begin{center}
@@ -1736,7 +1739,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
    \end{tikzpicture}
 \end{center}

-{\scriptsize\begin{tabular}{l}
+{\footnotesize\begin{tabular}{l}
    *$x_t$: 上一层的输出\\
    *$h_t$: 同一层上一时刻的隐藏状态
 \end{tabular}}
@@ -3491,7 +3494,10 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 第一部分
+\section{Transformer}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% Transformer模型部分
 \begin{frame}{Transformer}

 \vspace{5.0em}
@@ -3567,6 +3573,9 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
+\subsection{自注意力机制}
+
+%%%------------------------------------------------------------------------------------------------------------
 %%% 自注意力机制
 \begin{frame}{自注意力机制}
 \begin{itemize}
@@ -3701,7 +3710,9 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{Transformer}
+\subsection{Transformer模型}
+
+%%%------------------------------------------------------------------------------------------------------------
 \begin{frame}{Transformer 介绍}
 \begin{itemize}

@@ -3742,7 +3753,6 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\section{Transformer}
 \begin{frame}{Transformer}
 %\begin{tcolorbox}
 %[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
@@ -3824,7 +3834,6 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\section{Transformer}
 \begin{frame}{Transformer}
 %\begin{tcolorbox}
 %[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
@@ -3910,10 +3919,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 %\end{tcolorbox}
 \end{frame}

-
-
 %%%------------------------------------------------------------------------------------------------------------
-\subsection{输入}
 \begin{frame}{位置编码}
 \begin{itemize}
 \item 自注意力机制与前面的循环神经网络相比，忽略了词之间的顺序关系，例如下面两个语义不同的句子，通过自注意力得到的表示却是相同的
@@ -3975,7 +3981,6 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{输入}
 \begin{frame}{位置编码(续)}
 \begin{itemize}
 \item 位置编码的计算方式有很多种，这里使用正余弦函数来编码。式中\textit{pos}代表第几个词，\textit{i}代表词嵌入中的第几维
@@ -4060,7 +4065,6 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})


 %%%------------------------------------------------------------------------------------------------------------
-\section{Transformer}
 \begin{frame}{Transformer}
 %\begin{tcolorbox}
 %[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
@@ -4148,7 +4152,6 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{自注意力模型}
 \begin{frame}{基于点乘的注意力机制}
 \begin{itemize}
 \item Transformer使用点乘的自注意力方法来捕获句子内部各个位置之间的相似性：
@@ -4246,9 +4249,9 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})

 \end{itemize}
 \end{frame}
-%
+
+
 %%%------------------------------------------------------------------------------------------------------------
-\subsection{多头自注意力模型}
 \begin{frame}{多头自注意力模型}
 \begin{itemize}
 \item Transformer首次提出了多头注意力机制，将输入的Query、Key、Value沿着隐层维度切分为$h$个子集，分别进行注意力操作，取得了很好的效果
@@ -4352,7 +4355,6 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})


 %%%------------------------------------------------------------------------------------------------------------
-\section{Transformer}
 \begin{frame}{Transformer}
 %\begin{tcolorbox}
 %[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
@@ -4439,8 +4441,8 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})

 %\end{tcolorbox}
 \end{frame}
+
 %%%------------------------------------------------------------------------------------------------------------
-\subsection{残差和层正则化}
 \begin{frame}{残差\&层正则化}
 \begin{itemize}
 \item 在Transformer中，编码器、解码器分别由6层网络组成，每层网络又包含多个子层（自注意力网络、前馈神经网络）。Transformer实际上是一个很深的网络结构，在训练过程中容易出现梯度消失的情况
@@ -4523,7 +4525,6 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{残差和层正则化}
 \begin{frame}{残差\&层正则化(续)}
 \begin{itemize}
 \item 在Transformer的训练过程中，由于引入了残差操作，将前面所有层的输出加到一起。这样会导致高层的参数分布不断变大，造成训练过程不稳定、训练时间较长。
@@ -4579,7 +4580,6 @@ x_{l+1} = x_l+\mathcal{F}(x_l)


 %%%------------------------------------------------------------------------------------------------------------
-\section{Transformer}
 \begin{frame}{Transformer}
 %\begin{tcolorbox}
 %[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
@@ -4666,7 +4666,6 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \end{frame}

 %%------------------------------------------------------------------------------------------------------------
-\subsection{自注意力模型}
 \begin{frame}{前馈全连接网络}
 \begin{itemize}
 \item 在每层中，除了注意力操作，还包含了一个全连接的前馈神经网络，网络中包含两次线性变换和一次非线性变换(ReLU激活函数)，每层的前馈神经网络参数不共享
@@ -4721,22 +4720,16 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
    }
 }

-\visible<2->{
 \node [anchor=east] (line1) at ([xshift=-3.5em,yshift=2.5em]neuron14.west) {\scriptsize{全连接网络的作用主要体现在}};
 \node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{将经过注意力操作之后的表示}};
 \node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{映射到更大的网络空间中}};
 \node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{提升了网络模型的表示能力}};
 \node [anchor=north west] (line5) at ([yshift=0.3em]line4.south west) {\scriptsize{实验证明，去掉全连接网络}};
 \node [anchor=north west] (line6) at ([yshift=0.3em]line5.south west) {\scriptsize{会对模型的性能造成影响}};
-}

 \begin{pgfonlayer}{background}
 \node [rectangle,inner sep=0.2em,fill=blue!20] [fit = (neuron01) (neuron14) (neuron13) (neuron22)] (ffn) {};
-
-\visible<2->{
 \node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line1) (line2) (line3) (line6)] (box1) {};
-}
-
 \end{pgfonlayer}

 \end{scope}
@@ -4866,6 +4859,97 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
+%%% Transformer所使用的优化器
+\begin{frame}{优化器}
+% Adam
+% 学习率调整
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+\begin{frame}{训练配置}
+\begin{itemize}
+\item \textbf{优化器}：使用Adam优化器，$\beta_1=0.9$，$\beta_2=0.98$，$\epsilon=10^{-9}$。关于学习率的设置，引入了warmup策略，在训练初期，学习率从一个较小的初始值逐渐增大，当到达一定的步数，学习率再逐渐减小
+    \begin{displaymath}
+    lrate=d_{\mathrm{model}}^{-0.5}\cdot \min(step^{-0.5},step\cdot \mathrm{warmup\_steps}^{-1.5})
+    \end{displaymath}
+    这样做可以减缓在训练初期的不稳定现象，保持分布平稳，warmup\_steps通常设置为4000
+
+
+
+\vspace{0.5em}
+
+\only<1>{
+\begin{figure}
+  \centering
+  \begin{tikzpicture}
+    \footnotesize{
+      \begin{axis}[
+      width=.60\textwidth,
+      height=.40\textwidth,
+      legend style={at={(0.60,0.08)}, anchor=south west},
+      xlabel={\footnotesize{num update (10k)}},
+      ylabel={\footnotesize{Learning rate  (\scriptsize{$10^{-3}$)}}},
+      ylabel style={yshift=-1em},xlabel style={yshift=0.0em},
+      yticklabel style={/pgf/number format/precision=2,/pgf/number format/fixed zerofill},
+      ymin=0,ymax=0.9, ytick={0.2, 0.4, 0.6, 0.8},
+      xmin=0,xmax=12,xtick={2,4,6,8,10},
+      legend style={yshift=-6pt, legend plot pos=right,font=\scriptsize,cells={anchor=west}}
+      ]
+      \addplot[orange,line width=1.25pt] coordinates {(0,0) (4,0.7) (5,0.63) (6,0.57) (7,0.525) (8,0.49) (9,0.465) (10,0.44) (11,0.42) (12,0.4)};
+      \end{axis}
+     }
+  \end{tikzpicture}
+  \caption{}\label{}
+\end{figure}
+}
+
+
+\item<2-> \textbf{Dropout}：为了防止网络训练过拟合，加入了Dropout操作。在四个地方用到了Dropout：词嵌入和位置编码、残差连接、注意力操作和前馈神经网络。Dropout率通常设置为0.1
+
+\item<3-> \textbf{标签平滑}：学习一个较平滑的目标，可以提升泛化能力，防止过拟合 :)
+\end{itemize}
+\vspace{-0.8em}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+\begin{frame}{训练配置(续)}
+\begin{itemize}
+\item \textbf{Transformer Base}：标准的Transformer结构，编码器和解码器均包含6层，隐层维度为512，前馈神经网络维度为2048，多头注意力机制为8头，Dropout设为0.1
+
+\item \textbf{Transformer Big}：为了提升网络的表示能力，在Base的基础上增大隐层维度至1024，前馈神经网络的维度变为4096，多头注意力机制为16头，Dropout设为0.3
+
+\item \textbf{Transformer Deep}：加深编码器网络层数可以进一步提升网络的性能，但简单堆叠网络层数会出现梯度消失问题，导致训练无法收敛。需要使用DLCL、正则化前置（Pre-Norm）等方法来训练更深的网络。
+\end{itemize}
+\vspace{-0.8em}
+
+
+{
+    \footnotesize
+    \begin{center}
+        \setlength{\tabcolsep}{3pt}
+        \renewcommand\arraystretch{1}
+        \begin{tabular}{l}
+            \begin{tabular}{lcccl}
+                \specialrule{1pt}{1pt}{1pt}
+                \multirow{2}{*}{\#} & \multicolumn{2}{c}{BLEU} & \multicolumn{2}{c}{ \multirow{2}{*}{params}}\\
+                \cline{2-3}
+                & EN-DE & EN-FR & \\
+                \specialrule{0.6pt}{1pt}{1pt}
+                Transformer Base & 27.3 & 38.1 & \multicolumn{2}{c}{ 65$\times10^{6}$} \\
+                Transformer Big & 28.4 & 41.8 & \multicolumn{2}{c}{ 213$\times10^{6}$} \\
+                Transformer Deep(48层) & 30.2 & 43.1 & \multicolumn{2}{c}{ 194$\times10^{6}$} \\
+                \specialrule{1pt}{1pt}{1pt}
+            \end{tabular}\\
+            \addlinespace[-0.3ex]
+            %\footnote \\
+        \end{tabular}
+    \end{center}
+}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
 \subsection{推断}
 \begin{frame}{推断}
 \begin{itemize}
@@ -5051,90 +5135,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{训练配置}
-\begin{frame}{训练配置}
-\begin{itemize}
-\item \textbf{优化器}：使用Adam优化器，$\beta_1$=0.9，$\beta_2$=0.98，$\epsilon=10^{-9}$ 关于学习率的设置，引入了warmup策略，在训练初期，学习率从一个较小的初始值逐渐增大，当到达一定的步数，学习率再逐渐减小
-    \begin{displaymath}
-    lrate=d_{\mathrm{model}}^{-0.5}\cdot \min(step^{-0.5},step\cdot \mathrm{warmup\_steps}^{-1.5})
-    \end{displaymath}
-    这样做可以减缓在训练初期的不稳定现象，保持分布平稳，通常warmup\_steps通常设置为4000
-
-
-
-\vspace{0.5em}
-
-\only<1>{
-\begin{figure}
-  \centering
-  \begin{tikzpicture}
-    \footnotesize{
-      \begin{axis}[
-      width=.60\textwidth,
-      height=.40\textwidth,
-      legend style={at={(0.60,0.08)}, anchor=south west},
-      xlabel={\footnotesize{num update (10k)}},
-      ylabel={\footnotesize{Learn rate  (\scriptsize{$10^{-3}$)}}},
-      ylabel style={yshift=-1em},xlabel style={yshift=0.0em},
-      yticklabel style={/pgf/number format/precision=2,/pgf/number format/fixed zerofill},
-      ymin=0,ymax=0.9, ytick={0.2, 0.4, 0.6, 0.8},
-      xmin=0,xmax=12,xtick={2,4,6,8,10},
-      legend style={yshift=-6pt, legend plot pos=right,font=\scriptsize,cells={anchor=west}}
-      ]
-      \addplot[orange,line width=1.25pt] coordinates {(0,0) (4,0.7) (5,0.63) (6,0.57) (7,0.525) (8,0.49) (9,0.465) (10,0.44) (11,0.42) (12,0.4)};
-      \end{axis}
-     }
-  \end{tikzpicture}
-  \caption{}\label{}
-\end{figure}
-}
-
-
-\item<2-> \textbf{Dropout }：为了防止网络训练过拟合，加入了Dropout操作。在四个地方用到了Dropout，词嵌入和位置编码、残差连接、注意力操作和前馈神经网络。Drop率通常设置为0.1
-
-\item<3-> \textbf{标签平滑}：学习一个较平滑的的目标，可以提升泛化能力，防止过拟合 :)
-\end{itemize}
-\vspace{-0.8em}
-
-\end{frame}
-
-%%%------------------------------------------------------------------------------------------------------------
-\subsection{训练配置}
-\begin{frame}{训练配置(续)}
-\begin{itemize}
-\item \textbf{Transformer Base}：标准的Transformer结构，解码器编码器均包含6层，隐层维度为512，前馈神经网络维度为2048，多头注意力机制为8头，Dropout设为0.1
-
-\item \textbf{Transformer Big}：为了提升网络的表示能力，在Base的基础上增大隐层维度至1024，前馈神经网络的维度变为4096，多头注意力机制为16头，Dropout设为0.3
-
-\item \textbf{Transformer Deep}：加深编码器网络层数可以进一步提升网络的性能，但简单堆叠网络层数会出现梯度消失问题，导致训练无法收敛。需要使用DLCL、正则化前作等方法来训练更深的网络。
-\end{itemize}
-\vspace{-0.8em}
-
-
-{
-    \footnotesize
-    \begin{center}
-        \setlength{\tabcolsep}{3pt}
-        \renewcommand\arraystretch{1}
-        \begin{tabular}{l}
-            \begin{tabular}{lcccl}
-                \specialrule{1pt}{1pt}{1pt}
-                \multirow{2}{*}{\#} & \multicolumn{2}{c}{BLEU} & \multicolumn{2}{c}{ \multirow{2}{*}{params}}\\
-                \cline{2-3}
-                & EN-DE & EN-FR & \\
-                \specialrule{0.6pt}{1pt}{1pt}
-                Transformer Base & 27.3 & 38.1 & \multicolumn{2}{c}{ 65$\times10^{6}$} \\
-                Transformer Big & 28.4 & 41.8 & \multicolumn{2}{c}{ 213$\times10^{6}$} \\
-                Transformer Deep(48层) & 30.2 & 43.1 & \multicolumn{2}{c}{ 194$\times10^{6}$} \\
-                \specialrule{1pt}{1pt}{1pt}
-            \end{tabular}\\
-            \addlinespace[-0.3ex]
-            %\footnote \\
-        \end{tabular}
-    \end{center}
-}
-
-\end{frame}
+\section{应用}

 %%%------------------------------------------------------------------------------------------------------------
 %%% 应用
@@ -5209,7 +5210,6 @@ x_{l+1} = x_l+\mathcal{F}(x_l)


 %%%------------------------------------------------------------------------------------------------------------
-\subsection{应用}
 \begin{frame}{NMT应用}
 \begin{itemize}
 \item 神经机器翻译翻译系统除了满足日常翻译需求，还有很多其他有意思的应用！
@@ -5252,7 +5252,6 @@ x_{l+1} = x_l+\mathcal{F}(x_l)


 %%%------------------------------------------------------------------------------------------------------------
-\subsection{应用}
 \begin{frame}{NMT应用}
 \begin{itemize}
 \item 古文翻译实例
@@ -5292,7 +5291,6 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{应用}
 \begin{frame}{NMT应用}
 \begin{itemize}
 \item 神经机器翻译翻译系统除了满足日常翻译需求，还有很多其他有意思的应用！
@@ -5335,7 +5333,6 @@ x_{l+1} = x_l+\mathcal{F}(x_l)


 %%%------------------------------------------------------------------------------------------------------------
-\subsection{应用}
 \begin{frame}{NMT应用}
 \begin{itemize}
 \item 对联实例
@@ -5388,7 +5385,6 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{应用}
 \begin{frame}{NMT应用}
 \begin{itemize}
 \item 神经机器翻译翻译系统除了满足日常翻译需求，还有很多其他有意思的应用！
@@ -5456,6 +5452,10 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
+%%% 小结
+\section{小结}
+
+%%%------------------------------------------------------------------------------------------------------------
 %%% open source NMT
 \begin{frame}{一些开源NMT系统}
 \end{frame}