NiuTrans / Toy-MT-Introduction · Commit 80802792
Authored Oct 19, 2019 by xiaotong
Commit message: new pages
Parent: d613a736
Showing 2 changed files with 176 additions and 63 deletions (+176 −63)
Section05-Neural-Networks-and-Language-Modeling/section05-test.tex  +14 −63
Section05-Neural-Networks-and-Language-Modeling/section05.tex  +162 −0
Section05-Neural-Networks-and-Language-Modeling/section05-test.tex (view file @ 80802792)
...
...
@@ -116,71 +116,22 @@
\subsection{Parameter Learning - Back-propagation}
%%%------------------------------------------------------------------------------------------------------------
%%% back-propagation through hidden layers
\begin{frame}{Back-propagation - Hidden Layers}
%%% forward computation and other issues
\begin{frame}{Forward Computation and Other Issues}
\begin{itemize}
\item For any hidden layer $k$, $\textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1} \textbf{w}^k)$. Given the gradient at the layer output, $\pi^k = \frac{\partial L}{\partial \textbf{h}^{k}}$, we need to
\begin{enumerate}
\item compute the gradient of the loss $L$ with respect to the parameter matrix $\textbf{w}^k$ of layer $k$, i.e. $\frac{\partial L}{\partial \textbf{w}^k}$
\item compute the gradient of the loss $L$ with respect to the layer input $\textbf{h}^{k-1}$, i.e. $\frac{\partial L}{\partial \textbf{h}^{k-1}}$
\end{enumerate}
\item<2-> Applying the method from the previous slide directly, $\pi^k = \frac{\partial L}{\partial \textbf{h}^{k}} \frac{\partial f^k(\textbf{s}^k)}{\partial \textbf{s}^{k}}$ can be propagated backward
\vspace{-0.0em}
\begin{eqnarray}
\frac{\partial L}{\partial \textbf{w}^k} & = & [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
\frac{\partial L}{\partial \textbf{h}^{k-1}} & = & \pi^k [\textbf{w}^k]^T \nonumber
\end{eqnarray}
% (a small from-scratch numerical sketch of these two products is given after this file excerpt)
\item \alert{Forward computation} is essentially the process of building the network; there are two common approaches
\begin{itemize}
\item \textbf{Dynamic graphs} (e.g., PyTorch, NiuTensor): once the expression is written, the forward computation is done; easy to debug
\item \textbf{Static graphs} (e.g., TensorFlow): writing the expression alone does not produce the forward result; an explicit Forward function must be called, but the computation graph can be deeply optimized for higher execution efficiency
\end{itemize}
\item<2-> Several other issues in implementing deep learning systems are worth attention, although they are beyond the scope of this course
\begin{itemize}
\item \textbf{Distributed training}: training complex models on massive data requires using multiple devices (multiple machines, multiple GPUs) at the same time
\item \textbf{Low-precision computation}: half-precision or fixed-point arithmetic can be used to improve efficiency
\item \textbf{Model compression}: removing redundancy compresses the model, making it easier to store while improving runtime efficiency
\item \textbf{Training recipes and hyperparameter choices}: different tasks often require different training strategies, including hyperparameter settings; there are many pitfalls, and experience matters
\end{itemize}
\end{itemize}
\visible<3->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,draw,fill=red!20,minimum height=1.8em,minimum width=2.5em] (h) at (0,0) {$\textbf{h}^{k-1}$};
\node [anchor=west,draw,fill=blue!20,minimum height=1.8em,minimum width=2.5em] (s) at ([xshift=6em]h.east) {$\textbf{s}^{k}$};
\node [anchor=west,draw,fill=green!20,minimum height=1.8em,minimum width=2.5em] (h2) at ([xshift=6em]s.east) {$\textbf{h}^{k}$};
\node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
\node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
\draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1} \textbf{w}^k$}};
\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f(\textbf{s}^{k})$}};
\draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);
\visible<4->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{back-propagation}};
}
\visible<5->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{back-propagation}};
}
\visible<6->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{back-propagation}};
}
\visible<7->{
\draw [->,thick,red,dashed] ([yshift=-0.1em]h.south) -- ([yshift=-1em]h.south) -- ([yshift=-1em]h2.south) -- ([yshift=-0.1em]h2.south);
\node [anchor=north,red] (recur) at ([yshift=-1em]s.south) {\scriptsize{$k = k - 1$, repeat the procedure above}};
}
\visible<4->{
\node [anchor=south] (h2label) at (h2.north) {$\frac{\partial L}{\partial \textbf{h}^{k}}$};
}
\visible<5->{
\node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
}
\visible<6->{
\node [anchor=south] (hlabel) at (h.north) {$\frac{\partial L}{\partial \textbf{h}^{k-1}}$, $\frac{\partial L}{\partial \textbf{w}^{k}}$};
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
\end{CJK}
...
...
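The two products in the eqnarray above (the gradient with respect to the layer's weight matrix and with respect to its input) can be checked with a few lines of plain C++. This is a minimal from-scratch sketch, not the NiuTensor API; the shapes (a 1 x 2 input row vector h^{k-1}, a 2 x 3 weight matrix w^k, and a 1 x 3 incoming gradient pi^k) are invented for illustration.

// A from-scratch numerical sketch of the two back-propagation products for one hidden layer:
//   dL/dw^k     = (h^{k-1})^T * pi^k        (an outer product here, batch size 1)
//   dL/dh^{k-1} = pi^k * (w^k)^T
#include <cstdio>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>;

// (h^{k-1})^T * pi^k : outer product of a 1 x m row vector and a 1 x n row vector -> m x n
Mat gradW(const Vec& hPrev, const Vec& pi) {
    Mat dw(hPrev.size(), Vec(pi.size()));
    for (size_t i = 0; i < hPrev.size(); ++i)
        for (size_t j = 0; j < pi.size(); ++j)
            dw[i][j] = hPrev[i] * pi[j];
    return dw;
}

// pi^k * (w^k)^T : 1 x n times n x m -> 1 x m
Vec gradHPrev(const Vec& pi, const Mat& w) {
    Vec dh(w.size(), 0.0);
    for (size_t i = 0; i < w.size(); ++i)
        for (size_t j = 0; j < pi.size(); ++j)
            dh[i] += pi[j] * w[i][j];
    return dh;
}

int main() {
    Vec hPrev = {1.0, 2.0};                            // h^{k-1}
    Mat w     = {{0.5, -1.0, 0.0}, {2.0, 1.0, -0.5}};  // w^k
    Vec pi    = {0.1, -0.2, 0.3};                      // pi^k, the gradient arriving at s^k

    Mat dw = gradW(hPrev, pi);                         // dL/dw^k
    Vec dh = gradHPrev(pi, w);                         // dL/dh^{k-1}, passed on to layer k-1

    printf("dL/dh^{k-1} = [%.2f %.2f]\n", dh[0], dh[1]);
    printf("dL/dw^k row 0 = [%.2f %.2f %.2f]\n", dw[0][0], dw[0][1], dw[0][2]);
    return 0;
}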
Section05-Neural-Networks-and-Language-Modeling/section05.tex (view file @ 80802792)
...
...
@@ -3584,5 +3584,167 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% back-propagation example
\begin{frame}{Implementing Back-propagation}
\begin{itemize}
\item For a multi-layer neural network, back-propagation is easy to implement
\end{itemize}
%%% (a from-scratch sketch of the first backward step, Softmax + CrossEntropy, is given after this frame)
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.5cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
{\scriptsize
\begin{tabbing}
\texttt{XTensor x, y, gold, h[5], w[5], s[5];} \\
\texttt{XTensor dh[5], dw[5], ds[5];} \\
\texttt{...} // forward pass \\
\texttt{h[0] = x;} \\
\texttt{y = h[4];} \\
\visible<2->{
\texttt{} \\
\texttt{CrossEntropyBackward(dldh[4], y, gold);} \\
\texttt{SoftmaxBackward(y, s[4], dh[4], ds[4]);} \\
\texttt{MMul(h[3], {\tiny X\_TRANS}, ds[4], {\tiny X\_NOTRANS}, dw[4]);} \\
}
\visible<3->{
\texttt{} \\
\texttt{dh[2] = dh[3];} \\
\texttt{dh[1] = dh[3];} \\
}
\visible<4->{
\texttt{} \\
\texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);} \\
\texttt{MMul(h[1], {\tiny X\_TRANS}, ds[2], {\tiny X\_NOTRANS}, dw[2]);} \\
\texttt{} \\
\texttt{ReluBackward(h[1], s[1], dh[1], ds[1]);} \\
\texttt{MMul(h[0], {\tiny X\_TRANS}, ds[1], {\tiny X\_NOTRANS}, dw[1]);} \\
}
\visible<5->{
\texttt{} \\
\texttt{for(unsigned i = 0; i < 5; i++) \{} \\
\texttt{} \ \ \ \ ... // access the parameter gradients via \alert{\texttt{dw[i]}} \\
\texttt{\}}
}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\tiny{h1 = Relu(x * w1)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\tiny{h2 = Relu(h1 * w2)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\tiny{h3 = h2 + h1}};
\visible<1-3>{\draw [->,thick] (h1.north) -- (h2.south);}
\visible<1-3>{\draw [->,thick] (h2.north) -- (h3.south);}
\visible<1-2>{\draw [->,thick] (h3.north) -- (h4.south);}
\visible<1-2>{\draw [->,thick,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
\visible<4->{\draw [<-,very thick,red] (h1.north) -- (h2.south);}
\visible<4->{\draw [<-,very thick,red] (h2.north) -- (h3.south);}
\visible<3->{\draw [<-,very thick,red] (h3.north) -- (h4.south);}
\visible<3->{\draw [<-,very thick,red,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.5em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
\node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
\visible<1>{\draw [->,thick] (h4.north) -- (slayer.south);}
\visible<2->{\draw [<-,very thick,red] (h4.north) -- (slayer.south);}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
\end{frame}
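As a cross-check of the first backward step on this slide (the CrossEntropyBackward / SoftmaxBackward pair), here is a minimal plain-C++ sketch rather than the NiuTensor API: for a Softmax output y and a one-hot gold vector, the combined gradient of the cross-entropy loss with respect to the pre-softmax scores s is simply y - gold. The three scores below are invented for illustration.

// Minimal from-scratch sketch: gradient of CrossEntropy(Softmax(s), gold) with respect to s.
// For a one-hot gold vector this combined gradient is y - gold, which is what the
// CrossEntropyBackward + SoftmaxBackward pair on the slide produces as ds[4].
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<double> softmax(const std::vector<double>& s) {
    double mx = *std::max_element(s.begin(), s.end());   // subtract the max for numerical stability
    std::vector<double> y(s.size());
    double sum = 0.0;
    for (size_t i = 0; i < s.size(); ++i) { y[i] = std::exp(s[i] - mx); sum += y[i]; }
    for (double& v : y) v /= sum;
    return y;
}

int main() {
    std::vector<double> s    = {1.0, 2.0, 0.5};   // pre-softmax scores (s[4] in the slide's notation)
    std::vector<double> gold = {0.0, 1.0, 0.0};   // one-hot reference distribution
    std::vector<double> y = softmax(s);           // y = h[4], the network output
    for (size_t i = 0; i < s.size(); ++i)
        printf("ds[%zu] = %+.4f\n", i, y[i] - gold[i]);   // dL/ds = y - gold
    return 0;
}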
%%%------------------------------------------------------------------------------------------------------------
%%% implementing automatic differentiation
\begin{frame}{A Simpler Implementation}
\begin{itemize}
\item Fortunately, nearly all mainstream deep learning frameworks now implement automatic differentiation, so a single function call does the job
\end{itemize}
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.5cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
{\scriptsize
\begin{tabbing}
\texttt{XTensor x, loss, gold, h[5], w[5], b[5];} \\
\texttt{...} \\
\texttt{} \\
\texttt{h[1] = Relu(MMul(x, w[1]) + b[1]);} \\
\texttt{h[2] = Relu(MMul(h[1], w[2]) + b[2]);} \\
\texttt{h[3] = HardTanH(h[2]);} \\
\texttt{h[4] = Softmax(MMul(h[3], w[3]));} \\
\texttt{loss = CrossEntropy(h[4], gold);} \\
\texttt{} \\
\texttt{XNet net;} \\
\alert{\texttt{net.Backward(loss);} // one line of code for automatic differentiation} \\
\texttt{} \\
\texttt{for(unsigned i = 0; i < 5; i++) \{} \\
\texttt{} \ \ \ \ ... // access the parameter gradients via \alert{\texttt{w[i].grad}} \\
\texttt{\}}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.0em]h1.north) {\tiny{h1 = Relu(x * w1 + b1)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.0em]h2.north) {\tiny{h2 = Relu(h1 * w2 + b2)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.0em]h3.north) {\tiny{h3 = HardTanh(h2)}};
\draw [->,thick] (h1.north) -- (h2.south);
\draw [->,thick] (h2.north) -- (h3.south);
\draw [->,thick] (h3.north) -- (h4.south);
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.0em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
\node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
\draw [->,thick] (h4.north) -- (slayer.south);
\end{tikzpicture}
\end{center}
\end{tcolorbox}
\begin{itemize}
\item For other good implementations of automatic differentiation, see tools such as TensorFlow and PyTorch
\end{itemize}
%%% (a minimal tape-based reverse-mode sketch is given after this frame)
\end{frame}
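To make the idea behind a call like net.Backward(loss) concrete, here is a minimal scalar reverse-mode automatic differentiation tape in plain C++. It is only a sketch of the general technique, not NiuTensor's (or any framework's) actual implementation, and the tiny expression at the bottom is invented.

// Minimal scalar reverse-mode autodiff: the tape records, for every node, its parents and the
// local partial derivatives; one backward sweep then accumulates dLoss/dNode for every node.
#include <cstdio>
#include <vector>

struct Tape {
    struct Node { int p0, p1; double d0, d1; };   // parent indices and local partials
    std::vector<Node> nodes;
    std::vector<double> val;

    int leaf(double v) { nodes.push_back({-1, -1, 0, 0}); val.push_back(v); return (int)val.size() - 1; }
    int record(int a, int b, double v, double da, double db) {
        nodes.push_back({a, b, da, db}); val.push_back(v); return (int)val.size() - 1;
    }
    int add(int a, int b) { return record(a, b, val[a] + val[b], 1.0, 1.0); }
    int mul(int a, int b) { return record(a, b, val[a] * val[b], val[b], val[a]); }
    int relu(int a)       { bool on = val[a] > 0; return record(a, -1, on ? val[a] : 0.0, on ? 1.0 : 0.0, 0.0); }

    // Reverse sweep from the loss node: nodes were recorded in forward (topological) order,
    // so visiting them from last to first pushes gradients back to all parents.
    std::vector<double> backward(int loss) {
        std::vector<double> grad(val.size(), 0.0);
        grad[loss] = 1.0;
        for (int i = loss; i >= 0; --i) {
            if (nodes[i].p0 >= 0) grad[nodes[i].p0] += grad[i] * nodes[i].d0;
            if (nodes[i].p1 >= 0) grad[nodes[i].p1] += grad[i] * nodes[i].d1;
        }
        return grad;
    }
};

int main() {
    Tape t;
    int x = t.leaf(2.0), w = t.leaf(0.5), b = t.leaf(-0.25);
    int h = t.relu(t.add(t.mul(x, w), b));   // h = Relu(x*w + b) = Relu(0.75) = 0.75
    int loss = t.mul(h, h);                  // toy loss = h^2
    std::vector<double> g = t.backward(loss);
    printf("dloss/dw = %f (expected 2*h*x = 3.0)\n", g[w]);
    printf("dloss/db = %f (expected 2*h   = 1.5)\n", g[b]);
    return 0;
}

A framework-level implementation follows the same pattern at tensor granularity: each operation records its inputs and how to push gradients back through itself, and a single backward call performs the reverse sweep.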
%%%------------------------------------------------------------------------------------------------------------
%%% forward computation and other issues worth attention
\begin{frame}{Forward Computation and Other Issues}
\begin{itemize}
\item \alert{Forward computation} is essentially the process of building the network; there are two common approaches
\begin{itemize}
\item \textbf{Dynamic graphs} (e.g., PyTorch, NiuTensor): once the expression is written, the forward computation is done; easy to debug
\item \textbf{Static graphs} (e.g., TensorFlow): writing the expression alone does not produce the forward result; an explicit Forward function must be called, but the computation graph can be deeply optimized for higher execution efficiency
\end{itemize}
%%% (a minimal eager-vs-deferred sketch of the two styles is given after this file excerpt)
\item<2-> Several other issues in implementing deep learning systems are worth attention, although they are beyond the scope of this course
\begin{itemize}
\item \textbf{Distributed training}: training complex models on massive data requires using multiple devices (multiple machines, multiple GPUs) at the same time
\item \textbf{Low-precision computation}: half-precision or fixed-point arithmetic can be used to improve efficiency
\item \textbf{Model compression}: removing redundancy compresses the model, making it easier to store while improving runtime efficiency
\item \textbf{Training recipes and hyperparameter choices}: different tasks often require different training strategies, including hyperparameter settings; there are many pitfalls, and experience matters
\end{itemize}
\end{itemize}
\end{frame}
\end{CJK}
\end{document}
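The dynamic-vs-static distinction on the last frame can be illustrated with a few lines of plain C++. This is only a conceptual sketch under invented names (Graph, addMul, addRelu, Forward), not the API of PyTorch, TensorFlow, or NiuTensor: the "dynamic" function computes values the moment each expression is written, while the "static" version only records deferred steps and computes nothing until Forward() is called.

// Conceptual sketch of the two graph-building styles named on the slide (invented names).
#include <cstdio>
#include <functional>
#include <vector>

// Dynamic style: every expression is evaluated as soon as it is written.
double dynForward(double x, double w) {
    double s = x * w;          // computed immediately
    return s > 0 ? s : 0.0;    // Relu, also computed immediately
}

// Static style: expressions only append deferred steps; nothing runs until Forward().
struct Graph {
    std::vector<std::function<void()>> steps;
    double s = 0.0, h = 0.0;
    void addMul(const double* x, const double* w) {
        steps.push_back([this, x, w] { s = (*x) * (*w); });
    }
    void addRelu() {
        steps.push_back([this] { h = s > 0 ? s : 0.0; });
    }
    double Forward() {                 // the recorded graph could be optimized before this point
        for (auto& step : steps) step();
        return h;
    }
};

int main() {
    double x = 2.0, w = 0.5;
    printf("dynamic: %f\n", dynForward(x, w));   // result available right away
    Graph g;
    g.addMul(&x, &w);                            // builds the graph, computes nothing yet
    g.addRelu();
    printf("static : %f\n", g.Forward());        // explicit forward pass
    return 0;
}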