Commit 80802792 by xiaotong

new pages

parent d613a736
@@ -116,71 +116,22 @@
\subsection{参数学习 - 反向传播}
%%%------------------------------------------------------------------------------------------------------------
%%% Backpropagation for hidden layers
\begin{frame}{Backpropagation - Hidden Layers}
%%% Forward pass and other issues
\begin{frame}{Forward Computation and More}
\begin{itemize}
\item For any hidden layer $k$, $\textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1}\textbf{w}^k)$. Given the gradient at the layer output, $\frac{\partial L}{\partial \textbf{h}^{k}}$, we need to
\begin{enumerate}
\item compute the gradient of the loss $L$ with respect to the parameter matrix $\textbf{w}^k$ of layer $k$, i.e., $\frac{\partial L}{\partial \textbf{w}^k}$
\item compute the gradient of the loss $L$ with respect to the layer input $\textbf{h}^{k-1}$, i.e., $\frac{\partial L}{\partial \textbf{h}^{k-1}}$
\end{enumerate}
\item<2-> Directly reusing the method from the previous slide, $\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}} \frac{\partial f^k(\textbf{s}^k)}{\partial \textbf{s}^{k}}$ can be propagated backward (see the element-wise derivation after this list):
\vspace{-0.0em}
\begin{eqnarray}
\frac{\partial L}{\partial \textbf{w}^k} & = & [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
\frac{\partial L}{\partial \textbf{h}^{k-1}} & = & \pi^k [\textbf{w}^k]^T\nonumber
\end{eqnarray}
\item \alert{Forward computation} is in fact the process of building the network; two approaches are common
\begin{itemize}
\item \textbf{Dynamic graphs} (e.g., PyTorch, NiuTensor): once the expression is written, the forward computation is done; easy to debug
\item \textbf{Static graphs} (e.g., TensorFlow): writing the expression does not yet yield the forward result; an explicit Forward call is required, but the computation graph can be heavily optimized, so execution is more efficient
\end{itemize}
\item<2-> Several other implementation issues in deep learning systems are worth knowing about, although they are beyond the scope of this course
\begin{itemize}
\item \textbf{Distributed training}: training complex models on massive data requires multiple devices (multiple machines and GPUs) working in parallel
\item \textbf{Low-precision computation}: half-precision or fixed-point arithmetic can be used to improve efficiency
\item \textbf{Model compression}: removing redundancy shrinks the model, making it easier to store and faster to run
\item \textbf{Training recipes and hyperparameter choices}: different tasks often call for different training strategies and hyperparameter settings; there are many pitfalls, and experience matters
\end{itemize}
\end{itemize}
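Spelled out element-wise, the two formulas above follow directly from $\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$ and $\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$:
\begin{eqnarray}
\frac{\partial L}{\partial w^k_{ij}} & = & \frac{\partial L}{\partial s^k_j} \frac{\partial s^k_j}{\partial w^k_{ij}} = \pi^k_j h^{k-1}_i \ \ \Rightarrow \ \ \frac{\partial L}{\partial \textbf{w}^k} = [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
\frac{\partial L}{\partial h^{k-1}_i} & = & \sum_j \frac{\partial L}{\partial s^k_j} \frac{\partial s^k_j}{\partial h^{k-1}_i} = \sum_j \pi^k_j w^k_{ij} \ \ \Rightarrow \ \ \frac{\partial L}{\partial \textbf{h}^{k-1}} = \pi^k [\textbf{w}^k]^T \nonumber
\end{eqnarray}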
\visible<3->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,draw,fill=red!20,minimum height=1.8em,minimum width=2.5em] (h) at (0,0) {$\textbf{h}^{k-1}$};
\node [anchor=west,draw,fill=blue!20,minimum height=1.8em,minimum width=2.5em] (s) at ([xshift=6em]h.east) {$\textbf{s}^{k}$};
\node [anchor=west,draw,fill=green!20,minimum height=1.8em,minimum width=2.5em] (h2) at ([xshift=6em]s.east) {$\textbf{h}^{k}$};
\node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
\node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
\draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f(\textbf{s}^{k})$}};
\draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);
\visible<4->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{back-propagation}};
}
\visible<5->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{back-propagation}};
}
\visible<6->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{back-propagation}};
}
\visible<7->{
\draw [->,thick,red,dashed] ([yshift=-0.1em]h.south) -- ([yshift=-1em]h.south) -- ([yshift=-1em]h2.south) -- ([yshift=-0.1em]h2.south);
\node [anchor=north,red] (recur) at ([yshift=-1em]s.south) {\scriptsize{set $k=k-1$ and repeat the above process}};
}
\visible<4->{
\node [anchor=south] (h2label) at (h2.north) {$\frac{\partial L}{\partial \textbf{h}^{k}}$};
}
\visible<5->{
\node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
}
\visible<6->{
\node [anchor=south] (hlabel) at (h.north) {$\frac{\partial L}{\partial \textbf{h}^{k-1}}$, $\frac{\partial L}{\partial \textbf{w}^{k}}$};
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
\end{CJK}
@@ -3584,5 +3584,167 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Backpropagation example
\begin{frame}{Implementing Backpropagation}
\begin{itemize}
\item For a multi-layer neural network, backpropagation is straightforward to implement by hand
\end{itemize}
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.5cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
{\scriptsize
\begin{tabbing}
\texttt{XTensor x, y, gold, h[5], w[5], s[5];} \\
\texttt{XTensor dh[5], dw[5], ds[5];} \\
\texttt{...} // forward pass \\
\texttt{h[0] = x;} \\
\texttt{y = h[4];} \\
\visible<2->{
\texttt{} \\
\texttt{CrossEntropyBackward(dh[4], y, gold);} // loss gradient at the output \\
\texttt{SoftmaxBackward(y, s[4], dh[4], ds[4]);}\\
\texttt{MMul(h[3], {\tiny X\_TRANS}, ds[4], {\tiny X\_NOTRANS}, dw[4]);}\\
\texttt{MMul(ds[4], {\tiny X\_NOTRANS}, w[4], {\tiny X\_TRANS}, dh[3]);} // propagate to dh[3]\\
}
\visible<3->{
\texttt{} \\
\texttt{dh[2] = dh[3];} // h3 = h2 + h1, so the gradient\\
\texttt{dh[1] = dh[3];} // flows unchanged into both branches\\
}
\visible<4->{
\texttt{} \\
\texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);}\\
\texttt{MMul(h[1], {\tiny X\_TRANS}, ds[2], {\tiny X\_NOTRANS}, dw[2]);}\\
\texttt{} \\
\texttt{ReluBackward(h[1], s[1], dh[1], ds[1]);}\\
\texttt{MMul(h[0], {\tiny X\_TRANS}, ds[1], {\tiny X\_NOTRANS}, dw[1]);}\\
}
\visible<5->{
\texttt{} \\
\texttt{for(unsigned i = 0; i < 5; i++)\{} \\
\texttt{} \ \ \ \ ... // access each parameter's gradient via \alert{\texttt{dw[i]}}\\
\texttt{\}}
}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\tiny{h1 = Relu(x * w1)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\tiny{h2 = Relu(h1 * w2)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\tiny{h3 = h2 + h1}};
\visible<1-3>{\draw [->,thick] (h1.north) -- (h2.south);}
\visible<1-3>{\draw [->,thick] (h2.north) -- (h3.south);}
\visible<1-2>{\draw [->,thick] (h3.north) -- (h4.south);}
\visible<1-2>{\draw [->,thick,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
\visible<4->{\draw [<-,very thick,red] (h1.north) -- (h2.south);}
\visible<4->{\draw [<-,very thick,red] (h2.north) -- (h3.south);}
\visible<3->{\draw [<-,very thick,red] (h3.north) -- (h4.south);}
\visible<3->{\draw [<-,very thick,red,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.5em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
\node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
\visible<1>{\draw [->,thick] (h4.north) -- (slayer.south);}
\visible<2->{\draw [<-,very thick,red] (h4.north) -- (slayer.south);}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
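For reference, the quantity computed by the first two backward calls, \texttt{ds[4]}, has a well-known closed form when the loss is cross entropy over a Softmax output and \texttt{gold} is a probability (e.g., one-hot) vector:
\begin{eqnarray}
\frac{\partial L}{\partial \textbf{s}^{4}} & = & \textbf{h}^{4} - \textbf{gold} \nonumber
\end{eqnarray}
that is, the model output minus the gold-standard answer.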
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Automatic differentiation
\begin{frame}{A Simpler Implementation}
\begin{itemize}
\item Fortunately, nearly every mainstream deep learning framework now implements automatic differentiation, so one function call does the job
\end{itemize}
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.5cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
{\scriptsize
\begin{tabbing}
\texttt{XTensor x, loss, gold, h[5], w[5], b[5];} \\
\texttt{...} \\
\texttt{} \\
\texttt{h[1] = Relu(MMul(x, w[1]) + b[1]);} \\
\texttt{h[2] = Relu(MMul(h[1], w[2]) + b[2]);} \\
\texttt{h[3] = HardTanH(h[2]);} \\
\texttt{h[4] = Softmax(MMul(h[3], w[4]));} \\
\texttt{loss = CrossEntropy(h[4], gold);} \\
\texttt{} \\
\texttt{XNet net;}\\
\alert{\texttt{net.Backward(loss);} // automatic differentiation in one line of code}\\
\texttt{} \\
\texttt{for(unsigned i = 0; i < 5; i++)\{} \\
\texttt{} \ \ \ \ ... // access each parameter's gradient via \alert{\texttt{w[i].grad}}\\
\texttt{\}}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.0em]h1.north) {\tiny{h1 = Relu(x * w1 + b1)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.0em]h2.north) {\tiny{h2 = Relu(h1 * w2 + b2)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.0em]h3.north) {\tiny{h3 = HardTanh(h2)}};
\draw [->,thick] (h1.north) -- (h2.south);
\draw [->,thick] (h2.north) -- (h3.south);
\draw [->,thick] (h3.north) -- (h4.south);
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.0em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
\node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
\draw [->,thick] (h4.north) -- (slayer.south);
\end{tikzpicture}
\end{center}
\end{tcolorbox}
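Once the gradients are available through \texttt{w[i].grad}, a plain gradient-descent step with learning rate $\alpha$ updates each parameter matrix as
\begin{eqnarray}
\textbf{w}^{k} & \leftarrow & \textbf{w}^{k} - \alpha \frac{\partial L}{\partial \textbf{w}^{k}} \nonumber
\end{eqnarray}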
\begin{itemize}
\item Other excellent implementations of automatic differentiation can be found in tools such as TensorFlow and PyTorch
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Forward computation and other noteworthy issues
\begin{frame}{Forward Computation and Other Issues}
\begin{itemize}
\item \alert{Forward computation} is in fact the process of building the network; two approaches are common (sketched below)
\begin{itemize}
\item \textbf{Dynamic graphs} (e.g., PyTorch, NiuTensor): once the expression is written, the forward computation is done; easy to debug
\item \textbf{Static graphs} (e.g., TensorFlow): writing the expression does not yet yield the forward result; an explicit Forward call is required, but the computation graph can be heavily optimized, so execution is more efficient
\end{itemize}
\item<2-> Several other implementation issues in deep learning systems are worth knowing about, although they are beyond the scope of this course
\begin{itemize}
\item \textbf{Distributed training}: training complex models on massive data requires multiple devices (multiple machines and GPUs) working in parallel
\item \textbf{Low-precision computation}: half-precision or fixed-point arithmetic can be used to improve efficiency
\item \textbf{Model compression}: removing redundancy shrinks the model, making it easier to store and faster to run
\item \textbf{Training recipes and hyperparameter choices}: different tasks often call for different training strategies and hyperparameter settings; there are many pitfalls, and experience matters
\end{itemize}
\end{itemize}
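A minimal pseudo-code sketch of the contrast, reusing the \texttt{MMul}/\texttt{Relu} style from the previous slides; the \texttt{graph} object and its \texttt{Forward} call are hypothetical placeholders rather than the real API of any toolkit named above:
{\scriptsize
\begin{tabbing}
\texttt{h = Relu(MMul(x, w));} // dynamic graph: h already holds the forward result \\
\texttt{} \\
\texttt{h = Relu(MMul(x, w));} // static graph: this line only builds the graph ... \\
\texttt{graph.Forward();} // ... values appear after an explicit (hypothetical) forward call
\end{tabbing}
}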
\end{frame}
\end{CJK}
\end{document}