Commit 0c5005e4 by xiaotong

new pages

parent ae1736d0
......@@ -118,64 +118,30 @@
%%%------------------------------------------------------------------------------------------------------------
\subsection{前馈、循环、自注意力神经网络}
%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的结构
\begin{frame}{循环单元}
%%% 循环单元的设计、梯度消失、训练等问题
\begin{frame}{进一步的问题}
\begin{itemize}
\item 有输入序列$(\textbf{x}_0,\textbf{x}_1,...,\textbf{x}_t,...)$,其中$\textbf{x}_t$表示序列中第$t$个元素,也被称作\alert{时刻$t$}输入。它所对应的输出序列是$(\textbf{y}_0,\textbf{y}_1,...,\textbf{y}_t,...)$。 在循环神经网络中,每个时刻的输出都可以用同一个\alert{循环单元}来描述。\visible<2->{对于语言模型,一种简单的结构:}
\visible<2->{
{\small
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4.3cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
\begin{eqnarray}
\textbf{y}_t & = & \textrm{Softmax}(\textbf{h}_t \textbf{V}) \nonumber \\
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W}) \nonumber
\end{eqnarray}
\footnotesize{$\textbf{h}_t$: $t$时刻的隐层状态\\
$\textbf{h}_{t-1}$: $t-1$时刻的隐层状态\\
$\textbf{V}, \textbf{U}, \textbf{W}$: 参数
}
\tcblower
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,inner sep=3pt,minimum width=8em] (h) at (0,0) {\tiny{$\textbf{h}_t = \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W})$}};
\node [anchor=south west,inner sep=3pt] (r) at ([yshift=-0.2em]h.north west) {\tiny{循环单元:}};
\begin{pgfonlayer}{background}
\node [rectangle,draw,inner sep=0em,fill=green!20!white] [fit = (r) (h)] (rbox) {};
\end{pgfonlayer}
\node [anchor=south,draw,minimum width=8em,fill=green!20!white] (y) at ([yshift=1.5em]rbox.north) {\tiny{$\textbf{y}_t = \textrm{Softmax}(\textbf{h}_t \textbf{V})$}};
\node [anchor=south,inner sep=2pt] (output) at ([yshift=1em]y.north) {\scriptsize{$\textbf{y}_t$}};
\node [anchor=north,inner sep=2pt] (input) at ([yshift=-1em]h.south) {\scriptsize{$\textbf{x}_t$}};
\draw [->,thick] (input.north) -- ([yshift=-0.1em]rbox.south);
\draw [->,thick] ([yshift=0.1em]rbox.north) -- ([yshift=-0.1em]y.south) node [pos=0.5,left] {\tiny{$\textbf{h}_t$}};
\draw [->,thick] ([yshift=0.1em]y.north) -- (output.south);
\draw [->,thick] ([xshift=0.1em]rbox.east) -- ([xshift=1em]rbox.east) node [pos=1,above] {\tiny{$\textbf{h}_t$}};
\draw [->,thick] ([xshift=-1em]rbox.west) -- ([xshift=-0.1em]rbox.west) node [pos=0,above] {\tiny{$\textbf{h}_{t-1}$}};
\end{scope}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
}
}
\item<3-> \textbf{如何体现循环?}$t$时刻的状态是$t-1$时刻状态的函数,这个过程可以不断被执行
\item \textbf{循环单元设计}:循环单元就是一个函数,读入当前时刻的输入和上一时刻的状态,生成当前时刻的状态
\begin{displaymath}
\textbf{h}_t = g(\textbf{x}_t, \textbf{h}_{t-1}; \theta)
\end{displaymath}
有很多种方式设计$g(\cdot)$,如著名的LSTM、GRU等
\item<2-> \textbf{梯度消失/爆炸}:随着序列变长,在反向传播时循环神经网络会产生更多的局部梯度相乘计算,这会导致\alert{梯度消失/爆炸问题}
\begin{displaymath}
\underbrace{0.2 \times 0.3 \times ... \times 0.2 \times 0.1}_{\text{100项}} \approx 0
\end{displaymath}
\vspace{-0.8em}
\begin{itemize}
\item 可以考虑梯度裁剪,限制梯度的大小
\item 也可以引入short-cut connection,如残差网络
\end{itemize}
\item<2-> \textbf{训练}:有了自动微分,这不是个大问题 :)
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的“记忆”
\begin{frame}{循环神经网络的``记忆''}
% NOTE(review): empty placeholder frame -- a fully populated frame with the
% same title appears later in this file; confirm this stub is intentional
% (or removed) before the final build.
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 深度学习带来的问题及思考 - 并不是无所不能
......
......@@ -3369,7 +3369,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
}
\visible<4->{
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^k} = ?$}}};
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^K} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
}
......@@ -4122,7 +4122,7 @@ NLP问题的\alert{隐含结构}假设 & 无隐含结构假设,\alert{端到
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4.3cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
\begin{eqnarray}
\textbf{y}_t & = & \textrm{Softmax}(\textbf{h}_t \textbf{V}) \nonumber \\
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W}) \nonumber
......@@ -4161,6 +4161,211 @@ $\textbf{V}, \textbf{U}, \textbf{W}$: 参数
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的“记忆”
\begin{frame}{循环神经网络的``记忆''}
% Frame: shows how an RNN "remembers" history. The hidden state h_{t-1}
% (highlighted with \alert) is substituted through the unrolled equations for
% h_t, h_{t+1}, h_{t+2}, and the TikZ diagram below mirrors the same unrolling.
% Overlays: slide 1 shows one cell, slide 2 adds the second, slide 3 the third.
\begin{itemize}
\item 循环神经网络可以记忆任意长度的历史,因此可以非常适合处理不定长的序列,比如自然语言句子
\begin{itemize}
\item 注意:$\textbf{h}_{t-1}$可以被传递到后续状态
\end{itemize}
\end{itemize}
\vspace{-1em}
% Unrolled recurrence: each later state is rewritten by substituting the
% previous state's definition, so h_{t-1} (alerted) appears inside every term.
\begin{eqnarray}
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \nonumber \\
\visible<2->{
\textbf{h}_{t+1} & = & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textbf{h}_{t} \textbf{W}) \nonumber \\
& = & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \textbf{W}) \nonumber \\
}
\visible<3->{
\textbf{h}_{t+2} & = & \textrm{TanH}(\textbf{x}_{t+2} \textbf{U} + \textbf{h}_{t+1} \textbf{W}) \nonumber \\
& = & \textrm{TanH}(\textbf{x}_{t+2} \textbf{U} + \nonumber \\
& & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \textbf{W}) \textbf{W}) \nonumber
}
\end{eqnarray}
\vspace{-1em}
\begin{center}
% Diagram: three RNN cells chained left-to-right; inputs x_* enter from below,
% states h_* exit above, and the horizontal arrows carry the state forward,
% labeled to show h_{t-1} nested inside each transported state.
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
% The three cells; cells 2 and 3 appear on overlay slides 2 and 3.
\node [anchor=west,rnnnode] (node1) at (0,0) {\scriptsize{RNN Cell}};
\visible<2->{
\node [anchor=west,rnnnode] (node2) at ([xshift=4.5em]node1.east) {\scriptsize{RNN Cell}};
}
\visible<3->{
\node [anchor=west,rnnnode] (node3) at ([xshift=4.5em]node2.east) {\scriptsize{RNN Cell}};
}
% Input labels below each cell.
\node [anchor=north] (x1) at ([yshift=-1em]node1.south) {\footnotesize{$\textbf{x}_{t}$}};
\visible<2->{
\node [anchor=north] (x2) at ([yshift=-1em]node2.south) {\footnotesize{$\textbf{x}_{t+1}$}};
}
\visible<3->{
\node [anchor=north] (x3) at ([yshift=-1em]node3.south) {\footnotesize{$\textbf{x}_{t+2}$}};
}
% Output (state) labels above each cell.
\node [anchor=south] (h1) at ([yshift=1em]node1.north) {\footnotesize{$\textbf{h}_{t}$}};
\visible<2->{
\node [anchor=south] (h2) at ([yshift=1em]node2.north) {\footnotesize{$\textbf{h}_{t+1}$}};
}
\visible<3->{
\node [anchor=south] (h3) at ([yshift=1em]node3.north) {\footnotesize{$\textbf{h}_{t+2}$}};
}
% Incoming previous state (alerted) and outgoing final state.
\draw [->,thick] ([xshift=-1.0em]node1.west)--([xshift=-0.1em]node1.west) node [pos=0,left] {\scriptsize{$\alert{\textbf{h}_{t-1}}$}};
\visible<3->{
\draw [->,thick] ([xshift=0.1em]node3.east)--([xshift=1.0em]node3.east) node [pos=1,right] {\scriptsize{$\textbf{h}_{t+2}$}};
}
% Cell-to-cell links; edge labels spell out the functional nesting of h_{t-1}.
\draw [->,thick] ([xshift=0.1em]node1.east)--([xshift=-0.1em]node2.west) node [pos=0.5,above] {\tiny{$\textbf{h}_{t}(\alert{\textbf{h}_{t-1}})$}};
\visible<2->{
\draw [->,thick] ([xshift=0.1em]node2.east)--([xshift=-0.1em]node3.west) node [pos=0.5,above] {\tiny{$\textbf{h}_{t+1}(\textbf{h}_{t}(\alert{\textbf{h}_{t-1}}))$}};
}
% Vertical arrows: inputs into cells, states out of cells.
\draw [->,thick] (x1.north)--([yshift=-0.1em]node1.south);
\visible<2->{
\draw [->,thick] (x2.north)--([yshift=-0.1em]node2.south);
}
\visible<3->{
\draw [->,thick] (x3.north)--([yshift=-0.1em]node3.south);
}
\draw [->,thick] ([yshift=0.1em]node1.north)--(h1.south);
\visible<2->{
\draw [->,thick] ([yshift=0.1em]node2.north)--(h2.south);
}
\visible<3->{
\draw [->,thick] ([yshift=0.1em]node3.north)--(h3.south);
}
\end{scope}
\end{tikzpicture}
\end{center}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 基于循环神经网络的语言模型
\begin{frame}{基于循环神经网络的语言模型(RNN LM)}
% Frame: RNN language model architecture, built up over four overlay slides:
%   <2>  embedding layer only (one-hot -> distributed representation)
%   <3>  single RNN layer + Softmax outputs predicting P(w_{i+1} | history)
%   <4-> a second stacked RNN layer, with Softmax moved on top of it
\begin{itemize}
\item 循环神经网络可以被直接用于语言模型
\begin{itemize}
\item<2-> 与FNN LM类似,首先把词从one-hot表示转换成分布式表示
\item<3-> $t$时刻预测$\textrm{P}(x_{t+1}|x_1...x_{t})$
\item<4-> 可以叠加更多的层
\end{itemize}
\end{itemize}
\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
% First RNN layer (four time steps), shown from slide 3 onward.
\visible<3->{
\node [anchor=west,rnnnode] (node11) at (0,0) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\scriptsize{RNN Cell}};
}
% Embedding layer: each word w_i is mapped to e_i = w_i C (red boxes).
\node [anchor=north,rnnnode,fill=red!30!white] (e1) at ([yshift=-1.2em]node11.south) {\tiny{$e_1=w_1\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e2) at ([yshift=-1.2em]node12.south) {\tiny{$e_2=w_2\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e3) at ([yshift=-1.2em]node13.south) {\tiny{$e_3=w_3\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e4) at ([yshift=-1.2em]node14.south) {\tiny{$e_4=w_4\textbf{C}$}};
% Input word labels.
\node [anchor=north] (w1) at ([yshift=-1em]e1.south) {\footnotesize{$w_1$}};
\node [anchor=north] (w2) at ([yshift=-1em]e2.south) {\footnotesize{$w_2$}};
\node [anchor=north] (w3) at ([yshift=-1em]e3.south) {\footnotesize{$w_3$}};
\node [anchor=north] (w4) at ([yshift=-1em]e4.south) {\footnotesize{$w_4$}};
% Words feed the embedding layer, embeddings feed the first RNN layer.
\draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
\draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
\draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
\draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);
\draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
\draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
\draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
\draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);
% From slide 4 onward: second stacked RNN layer (node2x) with Softmax (node3x)
% on top of it.
\visible<4->{
\node [anchor=south,rnnnode] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node31) at ([yshift=1.5em]node21.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node32) at ([yshift=1.5em]node22.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node33) at ([yshift=1.5em]node23.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node34) at ([yshift=1.5em]node24.north) {\scriptsize{Softmax($\cdot$)}};
}
% Slide 3 only: single-layer variant where node2x ARE the Softmax boxes and
% the probability labels sit directly above them.
% NOTE(review): node names node21..node24 are written by both this branch and
% the <4-> branch above -- presumably intentional so the surrounding arrows can
% target one name on every slide; confirm TikZ accepts the duplicate names here.
\visible<3>{
\node [anchor=south,rnnnode,fill=blue!30!white] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{Softmax($\cdot$)}};
\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_5|w_2 w_3 w_4)$}};
}
% Slide 4+: probabilities emitted above the Softmax row; second RNN layer is
% chained horizontally (recurrent links) and fed from the first layer.
\visible<4->{
\draw [->,thick] ([yshift=0.1em]node31.north)--([yshift=1em]node31.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
\draw [->,thick] ([yshift=0.1em]node32.north)--([yshift=1em]node32.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
\draw [->,thick] ([yshift=0.1em]node33.north)--([yshift=1em]node33.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
\draw [->,thick] ([yshift=0.1em]node34.north)--([yshift=1em]node34.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_5|w_2 w_3 w_4)$}};
\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south);
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south);
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south);
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south);
\draw [->,thick] ([xshift=-1em]node21.west)--([xshift=-0.1em]node21.west);
\draw [->,thick] ([xshift=0.1em]node21.east)--([xshift=-0.1em]node22.west);
\draw [->,thick] ([xshift=0.1em]node22.east)--([xshift=-0.1em]node23.west);
\draw [->,thick] ([xshift=0.1em]node23.east)--([xshift=-0.1em]node24.west);
\draw [->,thick] ([xshift=0.1em]node24.east)--([xshift=1em]node24.east);
}
% Slide 3+: first layer feeds whatever sits above it, and is itself chained
% horizontally (recurrent links of layer 1).
\visible<3->{
\draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
\draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
\draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
\draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);
\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 循环单元的设计、梯度消失、训练等问题
\begin{frame}{进一步的问题}
% Frame: three practical issues of RNNs -- cell design, vanishing/exploding
% gradients, and training. Text fixes: "入读" corrected to "读入" (reads in),
% and "很多种方式" given its missing leading "有".
\begin{itemize}
\item \textbf{循环单元设计}:循环单元就是一个函数,读入当前时刻的输入和上一时刻的状态,生成当前时刻的状态
\begin{displaymath}
\textbf{h}_t = g(\textbf{x}_t, \textbf{h}_{t-1}; \theta)
\end{displaymath}
有很多种方式设计$g(\cdot)$,如著名的LSTM、GRU等
\item<2-> \textbf{梯度消失/爆炸}:随着序列变长,在反向传播时循环神经网络会产生更多的局部梯度相乘计算,这会导致\alert{梯度消失/爆炸问题}
% Example: a product of 100 small local gradients underflows toward zero.
\begin{displaymath}
\underbrace{0.2 \times 0.3 \times ... \times 0.2 \times 0.1}_{\text{100项}} \approx 0
\end{displaymath}
\vspace{-0.8em}
\begin{itemize}
\item 可以考虑梯度裁剪,限制梯度的大小
\item 也可以引入short-cut connection,如残差网络
\end{itemize}
\item<2-> \textbf{训练}:有了自动微分,这不是个大问题 :)
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\subsection{词嵌入}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论