new pages

521ddae4 · xiaotong · 2e92218f · 521ddae4 · 521ddae4
Commit 521ddae4 authored Oct 07, 2019 by xiaotong
--- a/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
@@ -116,75 +116,55 @@
 \subsection{参数学习 - 反向传播}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 梯度下降
-\begin{frame}{梯度下降（Gradient Descent）}
+%%% 梯度下降的变种
+\begin{frame}{梯度下降的不同实现方式}

 \begin{itemize}
-\item 如果把目标函数看做是参数$\textbf{w}$的函数，记为$J(\textbf{w})$。优化目标是：找到使$J(\textbf{w})$达到最小的$\textbf{w}$
-\item 注意，$\textbf{w}$可能包含几亿个实数，不能是SMT中MERT之类的调参方法。这里可以考虑一种更加适合大量实数参数的优化方法，其核心思想是\alert{梯度下降}：
-    \begin{itemize}
-    \item<2-> 如果$J(\textbf{w})$对于$\textbf{w}$可微分，$\frac{\partial J(\textbf{w})}{\partial \textbf{w}}$表示$J$在$\textbf{w}$处变化最大的方向
-    \item<2-> $\textbf{w}$沿着梯度方向更新，新的$\textbf{w}$可以使函数更接近极值
-    \end{itemize}
+\item \textbf{梯度下降}：我们可以沿着梯度的反方向更新$\textbf{w}$一小步，得到更好的$\textbf{w}$，之后重新计算梯度，不断重复上述过程
+
+\begin{displaymath}
+\textbf{w}_{t+1} = \textbf{w}_t - \alpha \cdot \frac{\partial J(\textbf{w}_t)}{\partial \textbf{w}_t}
+\end{displaymath}
+
+其中$t$表示更新的步数，$\alpha$是一个参数，表示更新步幅的大小。$\alpha$的设置需要根据任务进行调整。而$J(\textbf{w}_t)$的形式决定了具体算法的实现。
+
+\item<2-> \textbf{批量梯度下降(Batch Gradient Descent)}：
+
+\begin{displaymath}
+J(\textbf{w}_t) = \frac{1}{n} \sum_{i=1}^{n} L(\textbf{x}_i,\hat{\textbf{y}}_i;\textbf{w}_t)
+\end{displaymath}
+
+这种方法训练稳定，但是由于每次更新需要对所有训练样本进行遍历，效率低（比如$n$很大），大规模数据上很少使用
+
 \end{itemize}

-\pgfplotsset{%
-  colormap={whitered}{color(-1cm)=(orange!75!red);color(1cm)=(white)}
-}
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 梯度下降的变种
+\begin{frame}{梯度下降的不同实现方式(续)}
+
+\begin{itemize}
+\item \textbf{随机梯度下降(Stochastic Gradient Descent)}：
+
+\begin{displaymath}
+J(\textbf{w}_t) = L(\textbf{x}_i,\hat{\textbf{y}}_i;\textbf{w}_t)
+\end{displaymath}
+
+大名鼎鼎的SGD，所有机器学习的课程里几乎都有介绍。每次随机选取一个样本进行梯度计算和参数更新，更新的计算代价低，而且适用于利用少量样本进行在线学习(online learning)，不过方法收敛慢


-\begin{center}
-\begin{tikzpicture}[
-  declare function = {mu1=1;},
-  declare function = {mu2=2;},
-  declare function = {sigma1=0.5;},
-  declare function = {sigma2=1;},
-  declare function = {normal(\m,\s)=1/(2*\s*sqrt(pi))*exp(-(x-\m)^2/(2*\s^2));},
-  declare function = {bivar(\ma,\sa,\mb,\sb)=1/(2*pi*\sa*\sb) * exp(-((x-\ma)^2/\sa^2 + (y-\mb)^2/\sb^2))/2;}]
-  \footnotesize{
-  \visible<2->{
-  \begin{scope}
-  \begin{axis}[
-    colormap name  = whitered,
-    width          = 8cm,
-    height         = 5cm,
-    view           = {20}{45},
-    enlargelimits  = false,
-    grid           = major,
-    domain         = -1:3,
-    y domain       = 0:4,
-    samples        = 30,
-    xlabel         = $\textbf{w}^{[1]}$,
-    ylabel         = $\textbf{w}^{[2]}$,
-    xlabel style   = {xshift=0em,yshift=0.8em},
-    ylabel style   = {xshift=0.2em,yshift=0.8em},
-    zlabel         = {$f(\cdot)$},
-    ztick          = {-0.1},
-    colorbar,
-    colorbar style = {
-      at     = {(1.2,0.5)},
-      anchor = north west,
-      ytick  = {0,-0.1},
-      height = 0.25*\pgfkeysvalueof{/pgfplots/parent axis height},
-      title  = {}
-    }
-  ]
-    \addplot3 [surf] {-bivar(mu1,sigma1,mu2,sigma2)};
-
-    \node [circle,fill=red,minimum size=3pt,inner sep=1.5pt] () at (axis cs:0.5,2,-0.01) {};
-    
-    \draw [->,very thick,ublue] (axis cs:0.5,2,-0.01) -- (axis cs:0.8,1.6,-0.03) node [pos=1,right,inner sep=2pt] {\tiny{-$\frac{\partial J(\textbf{w})}{\partial \textbf{w}}$}};
-    \draw [->,very thick,dotted] (axis cs:0.5,2,-0.01) -- (axis cs:0.2,1.5,-0.03);
-    \draw [->,very thick,dotted] (axis cs:0.5,2,-0.01) -- (axis cs:0.5,3,-0.03);
-    %\draw [black!50] (axis cs:0,-1,0) -- (axis cs:0,4,0);
-
-  \end{axis}
-  \end{scope}
-  }
-  }
-\end{tikzpicture}
-\end{center}
+\vspace{0.3em}

+\item<2-> \textbf{小批量梯度下降(Mini-batch Gradient Descent)}：
+
+\begin{displaymath}
+J(\textbf{w}_t) = \frac{1}{m} \sum_{i=j}^{j+m-1} L(\textbf{x}_i,\hat{\textbf{y}}_i;\textbf{w}_t)
+\end{displaymath}
+
+每次随机使用若干样本进行参数更新(数量不会特别大)，算是一种折中方案，当今最常用的方法之一
+
+\end{itemize}

 \end{frame}


--- a/Section05-Neural-Networks-and-Language-Modeling/section05.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05.tex
@@ -2881,5 +2881,131 @@ $\textbf{w}^*$表示在训练集上使得损失的平均值达到最小的参数

 \end{frame}

+%%%------------------------------------------------------------------------------------------------------------
+%%% 梯度下降
+\begin{frame}{梯度下降（Gradient Descent）}
+
+\begin{itemize}
+\item 如果把目标函数看做是参数$\textbf{w}$的函数，记为$J(\textbf{w})$。优化目标是：找到使$J(\textbf{w})$达到最小的$\textbf{w}$
+\item 注意，$\textbf{w}$可能包含几亿个实数，不能使用SMT中MERT之类的调参方法。这里可以考虑一种更加适合大量实数参数的优化方法，其核心思想是\alert{梯度下降}：
+    \begin{itemize}
+    \item<2-> 如果$J(\textbf{w})$对于$\textbf{w}$可微分，$\frac{\partial J(\textbf{w})}{\partial \textbf{w}}$表示$J$在$\textbf{w}$处变化最大的方向
+    \item<2-> $\textbf{w}$沿着梯度的反方向更新，新的$\textbf{w}$可以使函数更接近极值
+    \end{itemize}
+\end{itemize}
+
+\pgfplotsset{%
+  colormap={whitered}{color(-1cm)=(orange!75!red);color(1cm)=(white)}
+}
+
+
+\begin{center}
+\begin{tikzpicture}[
+  declare function = {mu1=1;},
+  declare function = {mu2=2;},
+  declare function = {sigma1=0.5;},
+  declare function = {sigma2=1;},
+  declare function = {normal(\m,\s)=1/(2*\s*sqrt(pi))*exp(-(x-\m)^2/(2*\s^2));},
+  declare function = {bivar(\ma,\sa,\mb,\sb)=1/(2*pi*\sa*\sb) * exp(-((x-\ma)^2/\sa^2 + (y-\mb)^2/\sb^2))/2;}]
+  \footnotesize{
+  \visible<2->{
+  \begin{scope}
+  \begin{axis}[
+    colormap name  = whitered,
+    width          = 8cm,
+    height         = 5cm,
+    view           = {20}{45},
+    enlargelimits  = false,
+    grid           = major,
+    domain         = -1:3,
+    y domain       = 0:4,
+    samples        = 30,
+    xlabel         = $\textbf{w}^{[1]}$,
+    ylabel         = $\textbf{w}^{[2]}$,
+    xlabel style   = {xshift=0em,yshift=0.8em},
+    ylabel style   = {xshift=0.2em,yshift=0.8em},
+    zlabel         = {$J(\textbf{w})$},
+    ztick          = {-0.1},
+    colorbar,
+    colorbar style = {
+      at     = {(1.2,0.5)},
+      anchor = north west,
+      ytick  = {0,-0.1},
+      height = 0.25*\pgfkeysvalueof{/pgfplots/parent axis height},
+      title  = {}
+    }
+  ]
+
+    \addplot3 [surf] {-bivar(mu1,sigma1,mu2,sigma2)};
+
+    \node [circle,fill=red,minimum size=3pt,inner sep=1.5pt] () at (axis cs:0.5,2,-0.01) {};
+    
+    \draw [->,very thick,ublue] (axis cs:0.5,2,-0.01) -- (axis cs:0.8,1.6,-0.03) node [pos=1,right,inner sep=2pt] {\tiny{$-\frac{\partial J(\textbf{w})}{\partial \textbf{w}}$}};
+    \draw [->,very thick,dotted] (axis cs:0.5,2,-0.01) -- (axis cs:0.2,1.5,-0.03);
+    \draw [->,very thick,dotted] (axis cs:0.5,2,-0.01) -- (axis cs:0.2,3.5,-0.03);
+    %\draw [black!50] (axis cs:0,-1,0) -- (axis cs:0,4,0);
+
+  \end{axis}
+  \end{scope}
+  }
+  }
+\end{tikzpicture}
+\end{center}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 梯度下降的变种
+\begin{frame}{梯度下降的不同实现方式}
+
+\begin{itemize}
+\item \textbf{梯度下降}：我们可以沿着梯度的反方向更新$\textbf{w}$一小步，得到更好的$\textbf{w}$，之后重新计算梯度，不断重复上述过程
+
+\begin{displaymath}
+\textbf{w}_{t+1} = \textbf{w}_t - \alpha \cdot \frac{\partial J(\textbf{w}_t)}{\partial \textbf{w}_t}
+\end{displaymath}
+
+其中$t$表示更新的步数，$\alpha$是一个参数，表示更新步幅的大小。$\alpha$的设置需要根据任务进行调整。而$J(\textbf{w}_t)$的形式决定了具体算法的实现。
+
+\item<2-> \textbf{批量梯度下降(Batch Gradient Descent)}：
+
+\begin{displaymath}
+J(\textbf{w}_t) = \frac{1}{n} \sum_{i=1}^{n} L(\textbf{x}_i,\hat{\textbf{y}}_i;\textbf{w}_t)
+\end{displaymath}
+
+这种方法训练稳定，但是由于每次更新需要对所有训练样本进行遍历，效率低（比如$n$很大），大规模数据上很少使用
+
+\end{itemize}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 梯度下降的变种
+\begin{frame}{梯度下降的不同实现方式(续)}
+
+\begin{itemize}
+\item \textbf{随机梯度下降(Stochastic Gradient Descent)}：
+
+\begin{displaymath}
+J(\textbf{w}_t) = L(\textbf{x}_i,\hat{\textbf{y}}_i;\textbf{w}_t)
+\end{displaymath}
+
+大名鼎鼎的SGD，所有机器学习的课程里几乎都有介绍。每次随机选取一个样本进行梯度计算和参数更新，更新的计算代价低，而且适用于利用少量样本进行在线学习(online learning)，不过方法收敛慢
+
+
+\vspace{0.3em}
+
+\item<2-> \textbf{小批量梯度下降(Mini-batch Gradient Descent)}：
+
+\begin{displaymath}
+J(\textbf{w}_t) = \frac{1}{m} \sum_{i=j}^{j+m-1} L(\textbf{x}_i,\hat{\textbf{y}}_i;\textbf{w}_t)
+\end{displaymath}
+
+每次随机使用若干样本进行参数更新(数量不会特别大)，算是一种折中方案，当今最常用的方法之一
+
+\end{itemize}
+
+\end{frame}
+
 \end{CJK}
 \end{document}