Commit 0c5005e4 by xiaotong

new pages

parent ae1736d0
......@@ -118,64 +118,30 @@
%%%------------------------------------------------------------------------------------------------------------
\subsection{前馈、循环、自注意力神经网络}
%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的结构
\begin{frame}{循环单元}
%%% 循环单元的设计、梯度消失、训练等问题
\begin{frame}{进一步的问题}
\begin{itemize}
\item 有输入序列$(\textbf{x}_0,\textbf{x}_1,...,\textbf{x}_t,...)$,其中$\textbf{x}_t$表示序列中第$t$个元素,也被称作\alert{时刻$t$}输入。它所对应的输出序列是$(\textbf{y}_0,\textbf{y}_1,...,\textbf{y}_t,...)$。 在循环神经网络中,每个时刻的输出都可以用同一个\alert{循环单元}来描述。\visible<2->{对于语言模型,一种简单的结构:}
\visible<2->{
{\small
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4.3cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
\begin{eqnarray}
\textbf{y}_t & = & \textrm{Softmax}(\textbf{h}_t \textbf{V}) \nonumber \\
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W}) \nonumber
\end{eqnarray}
\footnotesize{$\textbf{h}_t$: $t$时刻的隐层状态\\
$\textbf{h}_{t-1}$: $t-1$时刻的隐层状态\\
$\textbf{V}, \textbf{U}, \textbf{W}$: 参数
}
\tcblower
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,inner sep=3pt,minimum width=8em] (h) at (0,0) {\tiny{$\textbf{h}_t = \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W})$}};
\node [anchor=south west,inner sep=3pt] (r) at ([yshift=-0.2em]h.north west) {\tiny{循环单元:}};
\begin{pgfonlayer}{background}
\node [rectangle,draw,inner sep=0em,fill=green!20!white] [fit = (r) (h)] (rbox) {};
\end{pgfonlayer}
\node [anchor=south,draw,minimum width=8em,fill=green!20!white] (y) at ([yshift=1.5em]rbox.north) {\tiny{$\textbf{y}_t = \textrm{Softmax}(\textbf{h}_t \textbf{V})$}};
\node [anchor=south,inner sep=2pt] (output) at ([yshift=1em]y.north) {\scriptsize{$\textbf{y}_t$}};
\node [anchor=north,inner sep=2pt] (input) at ([yshift=-1em]h.south) {\scriptsize{$\textbf{x}_t$}};
\draw [->,thick] (input.north) -- ([yshift=-0.1em]rbox.south);
\draw [->,thick] ([yshift=0.1em]rbox.north) -- ([yshift=-0.1em]y.south) node [pos=0.5,left] {\tiny{$\textbf{h}_t$}};
\draw [->,thick] ([yshift=0.1em]y.north) -- (output.south);
\draw [->,thick] ([xshift=0.1em]rbox.east) -- ([xshift=1em]rbox.east) node [pos=1,above] {\tiny{$\textbf{h}_t$}};
\draw [->,thick] ([xshift=-1em]rbox.west) -- ([xshift=-0.1em]rbox.west) node [pos=0,above] {\tiny{$\textbf{h}_{t-1}$}};
\end{scope}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
}
}
\item<3-> \textbf{如何体现循环?}$t$时刻的状态是$t-1$时刻状态的函数,这个过程可以不断被执行
\item \textbf{循环单元设计}:循环单元就是一个函数,读入当前时刻的输入和上一时刻的状态,生成当前时刻的状态
\begin{displaymath}
\textbf{h}_t = g(\textbf{x}_t, \textbf{h}_{t-1}; \theta)
\end{displaymath}
有很多种方式设计$g(\cdot)$,如著名的LSTM、GRU等
\item<2-> \textbf{梯度消失/爆炸}:随着序列变长,在反向传播时循环神经网络会产生更多的局部梯度相乘计算,这会导致\alert{梯度消失/爆炸问题}
\begin{displaymath}
\underbrace{0.2 \times 0.3 \times ... \times 0.2 \times 0.1}_{\text{100项}} \approx 0
\end{displaymath}
\vspace{-0.8em}
\begin{itemize}
\item 可以考虑梯度裁剪,限制梯度的大小
\item 也可以引入short-cut connection,如残差网络
\end{itemize}
\item<2-> \textbf{训练}:有了自动微分,这不是个大问题 :)
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的“记忆”
\begin{frame}{循环神经网络的``记忆''}
% NOTE(review): empty placeholder frame -- a fully populated frame with the
% same title appears later in this file; confirm this stub is intentional
% (or removed) before the final build.
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 深度学习带来的问题及思考 - 并不是无所不能
......
......@@ -3369,7 +3369,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
}
\visible<4->{
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^k} = ?$}}};
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^K} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
}
......@@ -4122,7 +4122,7 @@ NLP问题的\alert{隐含结构}假设 & 无隐含结构假设,\alert{端到
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4.3cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
\begin{eqnarray}
\textbf{y}_t & = & \textrm{Softmax}(\textbf{h}_t \textbf{V}) \nonumber \\
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W}) \nonumber
......@@ -4161,6 +4161,211 @@ $\textbf{V}, \textbf{U}, \textbf{W}$: 参数
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的“记忆”
\begin{frame}{循环神经网络的``记忆''}
% Frame: shows how an RNN "remembers" history. The hidden state h_{t-1}
% (highlighted with \alert) is substituted through the unrolled equations for
% h_t, h_{t+1}, h_{t+2}, and the TikZ diagram below mirrors the same unrolling.
% Overlays: slide 1 shows one cell, slide 2 adds the second, slide 3 the third.
\begin{itemize}
\item 循环神经网络可以记忆任意长度的历史,因此可以非常适合处理不定长的序列,比如自然语言句子
\begin{itemize}
\item 注意:$\textbf{h}_{t-1}$可以被传递到后续状态
\end{itemize}
\end{itemize}
\vspace{-1em}
% Unrolled recurrence: each later state is rewritten by substituting the
% previous state's definition, so h_{t-1} (alerted) appears inside every term.
\begin{eqnarray}
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \nonumber \\
\visible<2->{
\textbf{h}_{t+1} & = & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textbf{h}_{t} \textbf{W}) \nonumber \\
& = & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \textbf{W}) \nonumber \\
}
\visible<3->{
\textbf{h}_{t+2} & = & \textrm{TanH}(\textbf{x}_{t+2} \textbf{U} + \textbf{h}_{t+1} \textbf{W}) \nonumber \\
& = & \textrm{TanH}(\textbf{x}_{t+2} \textbf{U} + \nonumber \\
& & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \textbf{W}) \textbf{W}) \nonumber
}
\end{eqnarray}
\vspace{-1em}
\begin{center}
% Diagram: three RNN cells chained left-to-right; inputs x_* enter from below,
% states h_* exit above, and the horizontal arrows carry the state forward,
% labeled to show h_{t-1} nested inside each transported state.
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
% The three cells; cells 2 and 3 appear on overlay slides 2 and 3.
\node [anchor=west,rnnnode] (node1) at (0,0) {\scriptsize{RNN Cell}};
\visible<2->{
\node [anchor=west,rnnnode] (node2) at ([xshift=4.5em]node1.east) {\scriptsize{RNN Cell}};
}
\visible<3->{
\node [anchor=west,rnnnode] (node3) at ([xshift=4.5em]node2.east) {\scriptsize{RNN Cell}};
}
% Input labels below each cell.
\node [anchor=north] (x1) at ([yshift=-1em]node1.south) {\footnotesize{$\textbf{x}_{t}$}};
\visible<2->{
\node [anchor=north] (x2) at ([yshift=-1em]node2.south) {\footnotesize{$\textbf{x}_{t+1}$}};
}
\visible<3->{
\node [anchor=north] (x3) at ([yshift=-1em]node3.south) {\footnotesize{$\textbf{x}_{t+2}$}};
}
% Output (state) labels above each cell.
\node [anchor=south] (h1) at ([yshift=1em]node1.north) {\footnotesize{$\textbf{h}_{t}$}};
\visible<2->{
\node [anchor=south] (h2) at ([yshift=1em]node2.north) {\footnotesize{$\textbf{h}_{t+1}$}};
}
\visible<3->{
\node [anchor=south] (h3) at ([yshift=1em]node3.north) {\footnotesize{$\textbf{h}_{t+2}$}};
}
% Incoming previous state (alerted) and outgoing final state.
\draw [->,thick] ([xshift=-1.0em]node1.west)--([xshift=-0.1em]node1.west) node [pos=0,left] {\scriptsize{$\alert{\textbf{h}_{t-1}}$}};
\visible<3->{
\draw [->,thick] ([xshift=0.1em]node3.east)--([xshift=1.0em]node3.east) node [pos=1,right] {\scriptsize{$\textbf{h}_{t+2}$}};
}
% Cell-to-cell links; edge labels spell out the functional nesting of h_{t-1}.
\draw [->,thick] ([xshift=0.1em]node1.east)--([xshift=-0.1em]node2.west) node [pos=0.5,above] {\tiny{$\textbf{h}_{t}(\alert{\textbf{h}_{t-1}})$}};
\visible<2->{
\draw [->,thick] ([xshift=0.1em]node2.east)--([xshift=-0.1em]node3.west) node [pos=0.5,above] {\tiny{$\textbf{h}_{t+1}(\textbf{h}_{t}(\alert{\textbf{h}_{t-1}}))$}};
}
% Vertical arrows: inputs into cells, states out of cells.
\draw [->,thick] (x1.north)--([yshift=-0.1em]node1.south);
\visible<2->{
\draw [->,thick] (x2.north)--([yshift=-0.1em]node2.south);
}
\visible<3->{
\draw [->,thick] (x3.north)--([yshift=-0.1em]node3.south);
}
\draw [->,thick] ([yshift=0.1em]node1.north)--(h1.south);
\visible<2->{
\draw [->,thick] ([yshift=0.1em]node2.north)--(h2.south);
}
\visible<3->{
\draw [->,thick] ([yshift=0.1em]node3.north)--(h3.south);
}
\end{scope}
\end{tikzpicture}
\end{center}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 基于循环神经网络的语言模型
\begin{frame}{基于循环神经网络的语言模型(RNN LM)}
% Frame: RNN language model architecture, built up over four overlay slides:
%   <2>  embedding layer only (one-hot -> distributed representation)
%   <3>  single RNN layer + Softmax outputs predicting P(w_{i+1} | history)
%   <4-> a second stacked RNN layer, with Softmax moved on top of it
\begin{itemize}
\item 循环神经网络可以被直接用于语言模型
\begin{itemize}
\item<2-> 与FNN LM类似,首先把词从one-hot表示转换成分布式表示
\item<3-> $t$时刻预测$\textrm{P}(x_{t+1}|x_1...x_{t})$
\item<4-> 可以叠加更多的层
\end{itemize}
\end{itemize}
\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
% First RNN layer (four time steps), shown from slide 3 onward.
\visible<3->{
\node [anchor=west,rnnnode] (node11) at (0,0) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\scriptsize{RNN Cell}};
}
% Embedding layer: each word w_i is mapped to e_i = w_i C (red boxes).
\node [anchor=north,rnnnode,fill=red!30!white] (e1) at ([yshift=-1.2em]node11.south) {\tiny{$e_1=w_1\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e2) at ([yshift=-1.2em]node12.south) {\tiny{$e_2=w_2\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e3) at ([yshift=-1.2em]node13.south) {\tiny{$e_3=w_3\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e4) at ([yshift=-1.2em]node14.south) {\tiny{$e_4=w_4\textbf{C}$}};
% Input word labels.
\node [anchor=north] (w1) at ([yshift=-1em]e1.south) {\footnotesize{$w_1$}};
\node [anchor=north] (w2) at ([yshift=-1em]e2.south) {\footnotesize{$w_2$}};
\node [anchor=north] (w3) at ([yshift=-1em]e3.south) {\footnotesize{$w_3$}};
\node [anchor=north] (w4) at ([yshift=-1em]e4.south) {\footnotesize{$w_4$}};
% Words feed the embedding layer, embeddings feed the first RNN layer.
\draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
\draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
\draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
\draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);
\draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
\draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
\draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
\draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);
% From slide 4 onward: second stacked RNN layer (node2x) with Softmax (node3x)
% on top of it.
\visible<4->{
\node [anchor=south,rnnnode] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node31) at ([yshift=1.5em]node21.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node32) at ([yshift=1.5em]node22.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node33) at ([yshift=1.5em]node23.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node34) at ([yshift=1.5em]node24.north) {\scriptsize{Softmax($\cdot$)}};
}
% Slide 3 only: single-layer variant where node2x ARE the Softmax boxes and
% the probability labels sit directly above them.
% NOTE(review): node names node21..node24 are written by both this branch and
% the <4-> branch above -- presumably intentional so the surrounding arrows can
% target one name on every slide; confirm TikZ accepts the duplicate names here.
\visible<3>{
\node [anchor=south,rnnnode,fill=blue!30!white] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{Softmax($\cdot$)}};
\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_5|w_2 w_3 w_4)$}};
}
% Slide 4+: probabilities emitted above the Softmax row; second RNN layer is
% chained horizontally (recurrent links) and fed from the first layer.
\visible<4->{
\draw [->,thick] ([yshift=0.1em]node31.north)--([yshift=1em]node31.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
\draw [->,thick] ([yshift=0.1em]node32.north)--([yshift=1em]node32.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
\draw [->,thick] ([yshift=0.1em]node33.north)--([yshift=1em]node33.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
\draw [->,thick] ([yshift=0.1em]node34.north)--([yshift=1em]node34.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_5|w_2 w_3 w_4)$}};
\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south);
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south);
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south);
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south);
\draw [->,thick] ([xshift=-1em]node21.west)--([xshift=-0.1em]node21.west);
\draw [->,thick] ([xshift=0.1em]node21.east)--([xshift=-0.1em]node22.west);
\draw [->,thick] ([xshift=0.1em]node22.east)--([xshift=-0.1em]node23.west);
\draw [->,thick] ([xshift=0.1em]node23.east)--([xshift=-0.1em]node24.west);
\draw [->,thick] ([xshift=0.1em]node24.east)--([xshift=1em]node24.east);
}
% Slide 3+: first layer feeds whatever sits above it, and is itself chained
% horizontally (recurrent links of layer 1).
\visible<3->{
\draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
\draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
\draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
\draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);
\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 循环单元的设计、梯度消失、训练等问题
\begin{frame}{进一步的问题}
% Frame: three practical issues of RNNs -- cell design, vanishing/exploding
% gradients, and training. Text fixes: "入读" corrected to "读入" (reads in),
% and "很多种方式" given its missing leading "有".
\begin{itemize}
\item \textbf{循环单元设计}:循环单元就是一个函数,读入当前时刻的输入和上一时刻的状态,生成当前时刻的状态
\begin{displaymath}
\textbf{h}_t = g(\textbf{x}_t, \textbf{h}_{t-1}; \theta)
\end{displaymath}
有很多种方式设计$g(\cdot)$,如著名的LSTM、GRU等
\item<2-> \textbf{梯度消失/爆炸}:随着序列变长,在反向传播时循环神经网络会产生更多的局部梯度相乘计算,这会导致\alert{梯度消失/爆炸问题}
% Example: a product of 100 small local gradients underflows toward zero.
\begin{displaymath}
\underbrace{0.2 \times 0.3 \times ... \times 0.2 \times 0.1}_{\text{100项}} \approx 0
\end{displaymath}
\vspace{-0.8em}
\begin{itemize}
\item 可以考虑梯度裁剪,限制梯度的大小
\item 也可以引入short-cut connection,如残差网络
\end{itemize}
\item<2-> \textbf{训练}:有了自动微分,这不是个大问题 :)
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\subsection{词嵌入}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论