new pages

0c5005e4 · xiaotong · ae1736d0 · 0c5005e4 · 0c5005e4
Commit 0c5005e4 authored Oct 24, 2019 by xiaotong
--- a/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
@@ -118,64 +118,30 @@
 %%%------------------------------------------------------------------------------------------------------------
 \subsection{前馈、循环、自注意力神经网络}

-
 %%%------------------------------------------------------------------------------------------------------------
-%%% 循环神经网络的结构
-\begin{frame}{循环单元}
-
+%%% 循环单元的设计、梯度消失、训练等问题
+\begin{frame}{进一步的问题}
 \begin{itemize}
-\item 有输入序列$(\textbf{x}_0,\textbf{x}_1,...,\textbf{x}_t,...)$，其中$\textbf{x}_t$表示序列中第$t$个元素，也被称作\alert{时刻$t$}输入。它所对应的输出序列是$(\textbf{y}_0,\textbf{y}_1,...,\textbf{y}_t,...)$。 在循环神经网络中，每个时刻的输出都可以用同一个\alert{循环单元}来描述。\visible<2->{对于语言模型，一种简单的结构：}
-
-\visible<2->{
-{\small
-\begin{tcolorbox}
-[bicolor,sidebyside,righthand width=4.3cm,size=title,frame engine=empty,
- colback=blue!10!white,colbacklower=black!5!white]
- 
-\begin{eqnarray}
-\textbf{y}_t & = & \textrm{Softmax}(\textbf{h}_t \textbf{V}) \nonumber \\
-\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W}) \nonumber
-\end{eqnarray}
-\footnotesize{$\textbf{h}_t$: $t$时刻的隐层状态\\
-$\textbf{h}_{t-1}$: $t-1$时刻的隐层状态\\
-$\textbf{V}, \textbf{U}, \textbf{W}$: 参数
-}
-\tcblower
-\begin{center}
-\begin{tikzpicture}
-\begin{scope}
-\node [anchor=west,inner sep=3pt,minimum width=8em] (h) at (0,0) {\tiny{$\textbf{h}_t  =  \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W})$}};
-\node [anchor=south west,inner sep=3pt] (r) at ([yshift=-0.2em]h.north west) {\tiny{循环单元:}};
-\begin{pgfonlayer}{background}
-\node [rectangle,draw,inner sep=0em,fill=green!20!white] [fit = (r) (h)] (rbox) {};
-\end{pgfonlayer}
-\node [anchor=south,draw,minimum width=8em,fill=green!20!white] (y) at ([yshift=1.5em]rbox.north) {\tiny{$\textbf{y}_t = \textrm{Softmax}(\textbf{h}_t \textbf{V})$}};
-\node [anchor=south,inner sep=2pt] (output) at ([yshift=1em]y.north) {\scriptsize{$\textbf{y}_t$}};
-\node [anchor=north,inner sep=2pt] (input) at ([yshift=-1em]h.south) {\scriptsize{$\textbf{x}_t$}};
-\draw [->,thick] (input.north) -- ([yshift=-0.1em]rbox.south);
-\draw [->,thick] ([yshift=0.1em]rbox.north) -- ([yshift=-0.1em]y.south) node [pos=0.5,left] {\tiny{$\textbf{h}_t$}};
-\draw [->,thick] ([yshift=0.1em]y.north) -- (output.south);
-\draw [->,thick] ([xshift=0.1em]rbox.east) -- ([xshift=1em]rbox.east) node [pos=1,above] {\tiny{$\textbf{h}_t$}};
-\draw [->,thick] ([xshift=-1em]rbox.west) -- ([xshift=-0.1em]rbox.west) node [pos=0,above] {\tiny{$\textbf{h}_{t-1}$}};
-
-\end{scope}
-\end{tikzpicture}
-\end{center}
-\end{tcolorbox}
-}
-}
-
-\item<3-> \textbf{如何体现循环？}$t$时刻的状态是$t-1$时刻状态的函数，这个过程可以不断被执行
+\item \textbf{循环单元设计}：循环单元就是一个函数，读入当前时刻的输入和上一时刻的状态，生成当前时刻的状态
+    \begin{displaymath}
+    \textbf{h}_t = g(\textbf{x}_t, \textbf{h}_{t-1}; \theta)
+    \end{displaymath}
+    有很多种方式设计$g(\cdot)$，如著名的LSTM、GRU等
+\item<2-> \textbf{梯度消失/爆炸}：随着序列变长，在反向传播时循环神经网络会产生更多的局部梯度相乘计算，这会导致\alert{梯度消失/爆炸问题}
+    \begin{displaymath}
+    \underbrace{0.2 \times 0.3 \times \cdots \times 0.2 \times 0.1}_{\text{100项}} \approx 0
+    \end{displaymath}
+    \vspace{-0.8em}
+    \begin{itemize}
+    \item 可以考虑梯度裁剪，限制梯度的大小
+    \item 也可以引入short-cut connection，如残差网络
+    \end{itemize}
+\item<2-> \textbf{训练}：有了自动微分，这不是个大问题 :)
 \end{itemize}

 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 循环神经网络的“记忆”
-\begin{frame}{循环神经网络的``记忆''}
-\end{frame}
-
-%%%------------------------------------------------------------------------------------------------------------
 %%% 深度学习带来的问题及思考 - 并不是无所不能



--- a/Section05-Neural-Networks-and-Language-Modeling/section05.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05.tex