Update RNN training

cef82db3 · Lee · fb4867d8 · cef82db3
Commit cef82db3 authored Dec 24, 2019 by Lee
--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex
@@ -77,6 +77,8 @@
 \newcommand*{\ExtractCoordinate}[1]{\path (#1); \pgfgetlastxy{\XCoord}{\YCoord};}%
 \newcommand*{\ExtractX}[1]{\path (#1); \pgfgetlastxy{\XCoord}{\TMP};}%
 \newcommand*{\ExtractY}[1]{\path (#1); \pgfgetlastxy{\TMP}{\YCoord};}%
+\newcommand{\specialcell}[3][c]{% \specialcell[pos]{colspec}{text}: wraps text in a nested tabular so a single table cell can contain manual \\ line breaks; pos (default c) is forwarded as the tabular's vertical-position argument
+  \begin{tabular}[#1]{@{}#2@{}}#3\end{tabular}}% the @{} on both sides of the column spec suppress the inner tabular's outer column padding

 \newcounter{mycount1}
 \newcounter{mycount2}
@@ -2665,18 +2667,35 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 

 %%%------------------------------------------------------------------------------------------------------------
 %%% 训练
-\begin{frame}{训练}
+\begin{frame}{训练 - 整体流程}
    \begin{itemize}
        \item 有了一个NMT模型，我们应该怎么使用梯度下降算法来训练一个``聪明''的翻译模型呢？
+        \begin{enumerate}[1]
+            \item 参数初始化
+            \item 优化器选择
+            \item 学习率调度
+            \item 多设备加速
+        \end{enumerate}
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{训练 - 初始化}
    \begin{itemize}
-            \item<1|only@1>{参数初始化：模型结构是确定了，但是我们初始化参数还有很多需要注意的地方，否则训练不了一个优秀的模型
+        \item 模型结构是确定了，但是我们初始化参数还有很多需要注意的地方，否则训练不了一个优秀的模型
        \begin{itemize}
            \item LSTM遗忘门偏置初始为1，使遗忘门在训练初期接近打开状态，也就是倾向于保留记忆$c$，可以有效缓解训练初期的梯度消失问题，便于模型学习长距离依赖
-                    \item 其他参数一般使用Xavier参数初始化方法，可以有效稳定训练过程，特别是对于比较``深''的网络$$W \sim \mathcal{U}(-\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}},\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}})$$
-                    \item $W$是参数，$d_{\mathrm{in}}$和$d_{\mathrm{out}}$分别是$W$的输入和输出的维度大小
+            \item 网络的其他偏置一般都初始化成0，可以有效防止加入过大或过小的偏置后使得激活函数的输出跑到``饱和区''，也就是梯度接近0的区域，使得训练一开始就无法跳出局部极小
+            \item 网络的权重矩阵$W$一般使用Xavier参数初始化方法，可以有效稳定训练过程，特别是对于比较``深''的网络$$W \sim \mathcal{U}(-\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}},\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}})$$
+            \item $d_{\mathrm{in}}$和$d_{\mathrm{out}}$分别是$W$的输入和输出的维度大小，详见经典论文\\
+            \textbf{Understanding the difficulty of training deep feedforward neural networks}\\
+            \textbf{Glorot, X., \& Bengio, Y., 2010, In Proc of AISTATS}
        \end{itemize}
-            }
-            \item<2-4|only@2-4>{优化器选择：训练RNN我们通常会使用Adam或者SGD两种优化器，它们各有优劣
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{训练 - 优化器}
+    \begin{itemize}
+        \item 训练RNN我们通常会使用Adam或者SGD两种优化器，它们各有优劣
        \begin{center}
            \footnotesize
            \begin{tabular}{c|c|c}
@@ -2686,33 +2705,123 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                SGD & 换一个任务就得调 & 效果杠杠的 \\
            \end{tabular}
        \end{center}
-            }
-            \item<3-4|only@3-4>{学习率调度
-                \only<3>{
+        \item 因此，若需要快速得到模型看一下初步效果，选择Adam
+        \item 若是需要在一个任务上得到最优的结果，选择SGD
+        \begin{itemize}
+            \item 需要注意的是，训练RNN的时候，我们通常会遇到梯度爆炸的问题，也就是梯度突然变得很大，这种情况下需要使用``梯度裁剪''来防止梯度$\pi$超过阈值$$\pi'=\pi \cdot \frac{\mathrm{threshold}}{\max(\mathrm{threshold},\| \pi \|_2)}$$
+            \item 其中$\mathrm{threshold}$是手工设定的梯度大小阈值，$\| \cdot \|_2$是L2范数
+            \item 这个公式的含义在于只要梯度大小超过阈值，就按照阈值与当前梯度大小的比例进行放缩
+        \end{itemize}
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{训练 - 学习率}
    \begin{itemize}
        \item 不同优化器需要的学习率不同，比如Adam一般使用$0.001$或$0.0001$，而SGD则在$0.1\sim 1$之间挑选
-                        \item 但是无论使用哪个优化器，为了达到最好效果，我们通常都需要根据当前的更新次数来调整学习率的大小
+        \item 但是无论使用哪个优化器，为了保证训练又快又好，我们通常都需要根据当前的更新次数来调整学习率的大小
+        \begin{itemize}
+            \item 学习率预热：模型训练初期，梯度通常很大，直接使用很大的学习率很容易让模型跑偏，因此需要学习率有一个从小到大的过程
+            \item 学习率衰减：模型训练接近收敛的时候，使用较大的学习率会很容易让模型错过局部极小，因此需要学习率逐渐变小来逼近局部极小
        \end{itemize}
+        \begin{center}
+            \begin{tikzpicture}
+            \footnotesize{
+                \begin{axis}[
+                    width=.60\textwidth,
+                    height=.40\textwidth,
+                    legend style={at={(0.60,0.08)}, anchor=south west},
+                    xlabel={\scriptsize{更新次数}},
+                    ylabel={\scriptsize{学习率}},
+                    xtick=\empty,
+                    ytick=\empty,
+                    ylabel style={yshift=-2.5em},xlabel style={yshift=1.5em},
+                    legend style={yshift=-6pt, legend plot pos=right,font=\scriptsize,cells={anchor=west}}
+                ]
+                \addplot[orange,line width=1.25pt] coordinates {(0,0) (4,0.7) (5,0.63) (6,0.57) (7,0.525) (8,0.49) (9,0.465) (10,0.44) (11,0.42) (12,0.4)};
+                \end{axis}
            }
-                \only<4>{
+            \end{tikzpicture}
+        \end{center}
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{训练 - 加速}
    \begin{itemize}
-                        \item 学习率预热
-                        \item 学习率衰减
+        \item 万事俱备，只是为什么训练这么慢？\visible<2>{\alert{- RNN需要等前面所有时刻都完成计算以后才能开始计算当前时刻的输出}}
+        \item 我有钱，是不是多买几台设备会更快？\visible<2>{\alert{- 可以，但是需要技巧，而且也不是无限增长的}}
+        \item<2> 使用多个设备并行计算进行加速的两种方法
+        \begin{itemize}
+            \item 数据并行：把``输入''分到不同设备上并行计算
+            \item 模型并行：把``模型''分到不同设备上并行计算
+        \end{itemize}
+        \begin{center}
+            \small
+            \begin{tabular}{c|cc}
+                & 优点 & 缺点 \\
+                \hline
+                数据并行 & \specialcell{l}{并行度高，理论上多大\\的batch就可以有多少\\个设备并行计算} & \specialcell{l}{模型不能大于单个设\\备的极限} \\
+                模型并行 & \specialcell{l}{可以对很大的模型进行\\运算} & \specialcell{l}{只能有限并行，比如\\多少层就多少个设备} \\
+            \end{tabular}
+        \end{center}
+        \item<2> 这两种方法可以一起使用！！！
    \end{itemize}
+\end{frame}
+
+\begin{frame}{训练 - 数据并行}
+    \begin{itemize}
+        \item 如果一台设备能完整放下一个RNN模型，那么数据并行可以把一个大batch均匀切分成$n$个小batch，然后分发到$n$个设备上并行计算，最后把结果汇总，相当于把运算时间变为原来的$1/n$
+        \vspace{-0.5em}
+        \begin{center}
+            \hspace*{-0.5cm}
+            \begin{tikzpicture}
+                \setlength{\base}{1.5em}
+                \tikzstyle{samplenode} = [rounded corners=1pt,minimum size=1\base,draw,inner sep=3pt,outer sep=0pt,fill=green!30!white]
+                \tikzstyle{rnnnode} = [rounded corners=1pt,minimum size=1\base,draw,inner sep=0pt,outer sep=0pt,fill=blue!30!white]
+                \tikzstyle{wordnode} = [font=\footnotesize,align=center]
+
+                \begin{scope}
+                    \coordinate (batch0) at (0,0);
+
+                    \foreach \i [count=\j from 0,evaluate=\i as \k using int(4-\i)] in {1,2,3}
+                        \node [samplenode,anchor=south west] (batch\i) at ([shift={(-1em,-0.5em)}]batch\j.south west) {句子\k};
+                    \draw [decorate,decoration={brace}] (batch1.south east) to node [auto,rotate=30,anchor=north,font=\scriptsize] {batch大小} (batch3.south east);
+                    
+                    \node [samplenode,anchor=west] (sample2) at ([xshift=4em]batch2.east) {句子2};
+                    \node [samplenode,anchor=south] (sample3) at ([yshift=3em]sample2.north) {句子3};
+                    \node [samplenode,anchor=north] (sample1) at ([yshift=-3em]sample2.south) {句子1};
+
+                    \foreach \i in {1,2,3}
+                        \draw [->,thick] ([xshift=1.5em]batch2.east) -- ([xshift=-3pt]sample\i.west);
+
+                    \foreach \i in {1,2,3}
+                    {
+                        \coordinate (start) at ([xshift=2em]sample\i.east);
+                        \node [wordnode,anchor=west] (rnn0) at (start) {$0$};
+                        \foreach \j [count=\k from 0] in {1,2,3}
+                        {
+                            \node [rnnnode,anchor=west] (rnn\j) at ([xshift=1em]rnn\k.east) {};
+                            \draw [-latex'] (rnn\k) to (rnn\j);
+                            \coordinate (in\j) at ([yshift=-1em]rnn\j.south);
+                            \draw [-latex'] (in\j) to (rnn\j.south);
+                            \coordinate (out\j) at ([yshift=1em]rnn\j.north);
+                            \draw [-latex'] (rnn\j.north) to (out\j);
                        }
+                        \node [wordnode,anchor=west] (rnn4) at ([xshift=1em]rnn3.east) {$\cdots$};
+                        \draw [-latex'] (rnn3) to (rnn4);
+                        \node [draw,densely dashed,thick,rounded corners=0.3em,fit=(start) (in3) (out3) (rnn4),label={[font=\footnotesize,rotate=90,anchor=north]0:设备\i}] (rnn) {};
+                        \draw [->,double] ([xshift=3pt]sample\i.east) -- ([xshift=-3pt]rnn.west);
                    }
-            \item<5-13|only@5-13>{多设备并行
-                \only<5-6>{
-                    \begin{itemize}
-                        \item 万事俱备，只是为什么训练这么慢？\only<6>{\alert{- RNN需要等前面所有时刻都完成计算以后才能开始计算当前时刻的输出}}
-                        \item 我有钱，是不是多买几台设备会更快？\only<6>{\alert{- 可以，但是需要技巧，而且也不是无限增长的}}
+                \end{scope}
+            \end{tikzpicture}
+        \end{center}
    \end{itemize}
-                }
-                \only<7-13>{
+\end{frame}
+
+\begin{frame}{训练 - 模型并行}
    \begin{itemize}
-                        \only<7>{\item 数据并行：如果一台设备能完整放下一个RNN模型，那么数据并行可以把一个大batch均匀切分成$n$个小batch，然后分发到$n$个设备上并行计算，最后把结果汇总，相当于把运算时间变为原来的$1/n$}
-                        \only<8-13>{\item 模型并行：做完了数据并行，仍然太慢了，因为RNN模型太大了，算一个样本也很慢，那么可以把RNN模型按层均匀切分成$l$个小模型，然后分发到$l$个设备上并行计算，相当于把运算时间变为原来的$1/l$
+        \item 做完了数据并行，仍然太慢了，因为RNN模型太大了，算一个样本也很慢，那么可以把RNN模型按层均匀切分成$l$个小模型，然后分发到$l$个设备上并行计算，相当于把运算时间变为原来的$1/l$
+        \begin{center}
        \hspace*{-0.5cm}
            \begin{tikzpicture}
                \setlength{\base}{1.5em}
                \tikzstyle{rnnnode} = [rounded corners=1pt,minimum size=1\base,draw,inner sep=0pt,outer sep=0pt,fill=blue!30!white]
@@ -2727,19 +2836,19 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                        \coordinate (rnn0\i) at ([xshift=2\base]rnn0\j);
    
                    % step 1
-                                \visible<8->{
+                    \visible<1->{
                        \node[rnnnode] (rnn11) at ([xshift=2\base]rnn10) {};
                        \draw[-latex'] ([yshift=0.5\base]rnn01) to (rnn11);
                        \draw[-latex'] ([xshift=0.5\base]rnn10) to (rnn11);
                    }
-                                \visible<8>{
+                    \visible<1>{
                        % frontier
                        \node[rnnnode,fill=purple] () at (rnn11) {};
                        \node[draw=red,thick,inner sep=7pt,rounded corners=0.3em,rotate fit=-45,label={[font=\footnotesize,align=center]90:正在运算的\\{\color{red} 循环单元}},fit=(rnn11)] () {};
                    }
    
                    % step 2
-                                \visible<9->{
+                    \visible<2->{
                        \node[rnnnode] (rnn12) at ([xshift=2\base]rnn11) {};
                        \node[rnnnode] (rnn21) at ([yshift=2\base]rnn11) {};
                        \draw[-latex'] ([yshift=0.5\base]rnn02) to (rnn12);
@@ -2747,7 +2856,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                        \draw[-latex'] (rnn11) to (rnn12);
                        \draw[-latex'] (rnn11) to (rnn21);
                    }
-                                \visible<9>{
+                    \visible<2>{
                        % frontier
                        \node[rnnnode,fill=purple] () at (rnn12) {};
                        \node[rnnnode,fill=purple] () at (rnn21) {};
@@ -2755,7 +2864,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                    }
    
                    % step 3
-                                \visible<10->{
+                    \visible<3->{
                        \node[rnnnode] (rnn13) at ([xshift=2\base]rnn12) {};
                        \node[rnnnode] (rnn31) at ([yshift=2\base]rnn21) {};
                        \node[rnnnode] (rnn22) at ([xshift=2\base]rnn21) {};
@@ -2768,7 +2877,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                        \draw[-latex'] (rnn21) to (rnn22);
                        \draw[-latex'] (rnn31) to (o1);
                    }
-                                \visible<10>{
+                    \visible<3>{
                        % frontier
                        \node[rnnnode,fill=purple] () at (rnn13) {};
                        \node[rnnnode,fill=purple] () at (rnn31) {};
@@ -2777,7 +2886,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                    }
    
                    % step 4
-                                \visible<11->{
+                    \visible<4->{
                        \node[rnnnode] (rnn14) at ([xshift=2\base]rnn13) {};
                        \node[rnnnode] (rnn23) at ([xshift=2\base]rnn22) {};
                        \node[rnnnode] (rnn32) at ([xshift=2\base]rnn31) {};
@@ -2790,7 +2899,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                        \draw[-latex'] (rnn31) to (rnn32);
                        \draw[-latex'] (rnn32) to (o2);
                    }
-                                \visible<11>{
+                    \visible<4>{
                        % frontier
                        \node[rnnnode,fill=purple] () at (rnn14) {};
                        \node[rnnnode,fill=purple] () at (rnn23) {};
@@ -2799,7 +2908,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                    }
    
                    % step 5
-                                \visible<12->{
+                    \visible<5->{
                        \node[rnnnode] (rnn24) at ([xshift=2\base]rnn23) {};
                        \node[rnnnode] (rnn33) at ([xshift=2\base]rnn32) {};
                        \node[wordnode,anchor=south] (o3) at ([yshift=\base]rnn33.north) {。};
@@ -2809,7 +2918,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                        \draw[-latex'] (rnn32) to (rnn33);
                        \draw[-latex'] (rnn33) to (o3);
                    }
-                                \visible<12>{
+                    \visible<5>{
                        % frontier
                        \node[rnnnode,fill=purple] () at (rnn24) {};
                        \node[rnnnode,fill=purple] () at (rnn33) {};
@@ -2817,31 +2926,31 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                    }
    
                    % step 6
-                                \visible<13->{
+                    \visible<6->{
                        \node[rnnnode] (rnn34) at ([xshift=2\base]rnn33) {};
                        \node[wordnode,anchor=south] (o4) at ([yshift=\base]rnn34.north) {EOS};
                        \draw[-latex'] (rnn33) to (rnn34);
                        \draw[-latex'] (rnn24) to (rnn34);
                        \draw[-latex'] (rnn34) to (o4);
                    }
-                                \visible<13>{
+                    \visible<6>{
                        % frontier
                        \node[rnnnode,fill=purple] () at (rnn34) {};
                        \node[draw=red,thick,inner sep=7pt,rounded corners=0.3em,rotate fit=-45,label={[font=\footnotesize,align=center]90:正在运算的\\{\color{red} 循环单元}},fit=(rnn34)] () {};
                    }
    
                    % labels
-                                \alt<8-11>{
+                    \alt<1-4>{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn10.west) to node[wordnode,align=right,left,text=red] {正在使用的\\设备1} ([yshift=\base]rnn10.west);
                    }{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn10.west) to node[wordnode,align=right,left] {空闲的\\设备1} ([yshift=\base]rnn10.west);
                    }
-                                \alt<9-12>{
+                    \alt<2-5>{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn20.west) to node[wordnode,align=right,left,text=red] {正在使用的\\设备2} ([yshift=\base]rnn20.west);
                    }{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn20.west) to node[wordnode,align=right,left] {空闲的\\设备2} ([yshift=\base]rnn20.west);
                    }
-                                \alt<10-13>{
+                    \alt<3-6>{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn30.west) to node[wordnode,align=right,left,text=red] {正在使用的\\设备3} ([yshift=\base]rnn30.west);
                    }{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn30.west) to node[wordnode,align=right,left] {空闲的\\设备3} ([yshift=\base]rnn30.west);
@@ -2856,17 +2965,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                    \node[wordnode] () at (rnn04) {。};
                \end{scope}
            \end{tikzpicture}
-                        }
-                    \end{itemize}
-                }
-            }
-            \item<14|only@14>{其他
-                \begin{itemize}
-                    \item 训练RNN的时候，我们通常会遇到梯度爆炸的问题，也就是梯度突然变得很大，这种情况下需要使用``梯度裁剪''来防止梯度$\pi$超过阈值$$\pi'=\pi \cdot \frac{\mathrm{threshold}}{\max(\mathrm{threshold},\parallel \pi \parallel_2)}$$
-                    \item 其中$\mathrm{threshold}$是手工设定的梯度大小阈值，$\parallel \cdot \parallel_2$是L2范数
-                \end{itemize}
-            }
-        \end{itemize}
+        \end{center}
    \end{itemize}
 \end{frame}