Commit d613a736 by xiaotong

new pages

parent 44d2f25a
......@@ -116,165 +116,70 @@
\subsection{参数学习 - 反向传播}
%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播
\begin{frame}{反向传播 - 输出层}
%%% 隐层的反向传播
\begin{frame}{反向传播 - 隐层}
\begin{itemize}
\item 输出层(两个阶段)
\item 对于任意隐层$k$$\textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1}\textbf{w}^k)$。给定:隐层输出处的梯度$\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}}$,需要
\begin{enumerate}
\item 计算损失$L$对于第$k$层参数矩阵$\textbf{w}^k$的梯度,$\frac{\partial L}{\partial \textbf{w}^k}$
\item 计算损失$L$对于第$k$层输入$\textbf{h}^{k-1}$的梯度,$\frac{\partial L}{\partial \textbf{h}^{k-1}}$
\end{enumerate}
\item<2-> 直接套用上一页的方法,可以将$\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}} \frac{\partial f^k(\textbf{s}^k)}{\partial \textbf{s}^{k}}$反向传播
\vspace{-0.0em}
\begin{eqnarray}
\frac{\partial L}{\partial \textbf{w}^k} & = & [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
\frac{\partial L}{\partial \textbf{h}^{k-1}} & = & \pi^k [\textbf{w}^k]^T\nonumber
\end{eqnarray}
\end{itemize}
\vspace{-0.5em}
\visible<3->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (h.east) -- (s.west);
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
\node [anchor=south west,inner sep=2pt] (step101) at (step100.north west) {\tiny{线性变换}};
\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
\node [anchor=south west,inner sep=2pt] (step201) at (step200.north west) {\tiny{激活函数}};
\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
\visible<2->{
\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] (h.south west) -- (s.south west) node [pos=0.5,below,yshift=-1em] {\scriptsize{\textbf{第一阶段:线性变换}}};
}
\visible<3->{
\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] ([xshift=0.2em]s.south west) -- (l.south east) node [pos=0.5,below,yshift=-1em] (step2) {\scriptsize{\textbf{第二阶段:激活函数+损失函数}}};
}
\node [anchor=center,draw,fill=red!20,minimum height=1.8em,minimum width=2.5em] (h) at (0,0) {$\textbf{h}^{k-1}$};
\node [anchor=west,draw,fill=blue!20,minimum height=1.8em,minimum width=2.5em] (s) at ([xshift=6em]h.east) {$\textbf{s}^{k}$};
\node [anchor=west,draw,fill=green!20,minimum height=1.8em,minimum width=2.5em] (h2) at ([xshift=6em]s.east) {$\textbf{h}^{k}$};
\node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
\node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
\draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f(\textbf{s}^{k})$}};
\draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);
\visible<4->{
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^k} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{反向传播}};
}
\end{scope}
\end{tikzpicture}
\end{center}
\begin{itemize}
\item<4-> 反向传播从输出向输入传播梯度,因此我们先考虑阶段二\visible<5->{。令$\pi^k = \frac{\partial L}{\partial \textbf{s}^k}$表示损失$L$在第$k$层激活函数输入处的梯度,利用链式法有}
\vspace{-1.5em}
\visible<5->{
\begin{eqnarray}
\pi^K & = & \frac{\partial L}{\partial \textbf{s}^K} \nonumber \\
& = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{s}^K} \nonumber \\
& = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} \nonumber
%\frac{\partial L}{\partial \textbf{w}^K} & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{w}^K} \nonumber \\
% & \visible<4->{=} & \visible<4->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{h}^{K-1} \textbf{w}^K)}{\partial \textbf{w}^K} \ \ \ \ (\textrm{因为}\textbf{h}^K=f^K(\textbf{h}^{K-1} \textbf{w}^K))} \nonumber \\
% & \visible<5->{=} & \visible<5->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} \cdot \frac{\partial \textbf{s}^K}{\partial \textbf{w}^K} \ \ \ (\textrm{因为}\textbf{s}^K=\textbf{h}^{K-1} \textbf{w}^K)} \nonumber
\end{eqnarray}
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{反向传播}};
}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播 - 各个因子的意义
\begin{frame}{反向传播 - 输出层(续)}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center] (factor00) at (0,0) {${\displaystyle \pi^K \ = }$};
\node [anchor=west] (factor01) at (factor00.east) {${\displaystyle \frac{\partial L}{\partial \textbf{h}^K}}$};
\node [anchor=west,inner sep=1pt] (factor02) at (factor01.east) {${\displaystyle \cdot}$};
\node [anchor=west] (factor03) at (factor02.east) {${\displaystyle \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}}$};
\begin{pgfonlayer}{background}
\visible<2-4>{
\node [rectangle,inner sep=0em,fill=red!20] [fit = (factor01)] (p1) {};
\visible<6->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{反向传播}};
}
\visible<3-4>{
\node [rectangle,inner sep=0em,fill=blue!20] [fit = (factor03)] (p2) {};
}
\visible<5->{
\node [circle,inner sep=0em,fill=green!20] [fit = (factor02)] (p3) {};
}
\end{pgfonlayer}
\end{scope}
\end{tikzpicture}
\end{center}
\begin{itemize}
\item<2-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=red!20] (factor01) at (factor00.east) {$\frac{\partial L}{\partial \textbf{h}^K}$};}} 表示损失$L$相对网络输出的变化率,比如,对于$L = \frac{1}{2} ||\hat{\textbf{y}} - \textbf{h}^K||^2$,有$\frac{\partial L}{\partial \textbf{h}^K} = \hat{\textbf{y}} - \textbf{h}^K$
\item<3-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=blue!20] (factor01) at (factor00.east) {$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$};}} 表示激活函数相对于它自己的输入的变化率,比如,对于$f(\textbf{s}) = \frac{1}{1+\exp(-\textbf{s})}$,有$\frac{\partial f(\textbf{s})}{\partial \textbf{s}} = f(\textbf{s})(1-f(\textbf{s}))$
\item<4-> 这个结果符合直觉,在$s^K$出的梯度相当于在损失函数微分($\frac{\partial L}{\partial \textbf{h}^K}$)和激活函数微分($\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$) 的乘积\visible<5->{,注意这里所有操作都是单元级,比如张量按单元乘法}
\end{itemize}
\visible<7->{
\draw [->,thick,red,dashed] ([yshift=-0.1em]h.south) -- ([yshift=-1em]h.south) -- ([yshift=-1em]h2.south) -- ([yshift=-0.1em]h2.south);
\node [anchor=north,red] (recur) at ([yshift=-1em]s.south) {\scriptsize{$k=k-1$重复上述过程}};
}
\visible<4->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at (0,0) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west);
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial L}{\partial \textbf{h}^k} = ?$}}};
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
\draw [-,very thick,red] ([yshift=0.5em]h2.north) -- ([yshift=1.5em]h2.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
\end{scope}
\end{tikzpicture}
\end{center}
\node [anchor=south] (h2label) at (h2.north) {$\frac{\partial L}{\partial \textbf{h}^{k}}$};
}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播 - 求 dL/dw
\begin{frame}{反向传播 - 输出层}
\begin{itemize}
\item 输出层(两个阶段)
\end{itemize}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (h.east) -- (s.west);
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
\node [anchor=south west] (slabel) at ([yshift=1em,xshift=0.3em]s.north) {\scriptsize{\textbf{\alert{已经得到:$\pi^K$}}}};
\visible<5->{
\node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
}
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\tiny{\alert{$\frac{\partial L}{\partial \textbf{w}^k} = ?$, $\frac{\partial L}{\partial \textbf{h}^k} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]h.north) -- ([yshift=1.5em]h.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
\visible<6->{
\node [anchor=south] (hlabel) at (h.north) {$\frac{\partial L}{\partial \textbf{h}^{k-1}}$, $\frac{\partial L}{\partial \textbf{w}^{k}}$};
}
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}
......
......@@ -3333,5 +3333,256 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Back-propagation at the output layer: the two stages and the gradient at s^K
\begin{frame}{反向传播 - 输出层}
  \begin{itemize}
    \item 输出层(两个阶段)
  \end{itemize}
  \vspace{-1.0em}
  \begin{center}
    \begin{tikzpicture}
      \begin{scope}
        % Forward chain of the output layer: h^{K-1} -> s^K -> h^K -> L
        \node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
        \node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
        \node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
        \node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
        \draw [->] (h.east) -- (s.west);
        \draw [->] (s.east) -- (h2.west);
        \draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
        % Captions stacked above each forward edge (formula, then its name)
        \node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
        \node [anchor=south west,inner sep=2pt] (step101) at (step100.north west) {\tiny{线性变换}};
        \node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
        \node [anchor=south west,inner sep=2pt] (step201) at (step200.north west) {\tiny{激活函数}};
        \node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
        % Overlay 2: brace marking stage one (linear transformation)
        \visible<2->{
          \draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] (h.south west) -- (s.south west) node [pos=0.5,below,yshift=-1em] {\scriptsize{\textbf{第一阶段:线性变换}}};
        }
        % Overlay 3: brace marking stage two (activation + loss)
        \visible<3->{
          \draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] ([xshift=0.2em]s.south west) -- (l.south east) node [pos=0.5,below,yshift=-1em] (step2) {\scriptsize{\textbf{第二阶段:激活函数+损失函数}}};
        }
        % Overlay 4: backward arrow from L to s^K. The label uses the
        % output-layer index K so that it matches the node s^K it points at.
        \visible<4->{
          \draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^K} = ?$}}};
          \draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
          \draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
        }
      \end{scope}
    \end{tikzpicture}
  \end{center}
  \begin{itemize}
    % pi^k is introduced for a generic layer k, then instantiated for k = K below.
    \item<4-> 反向传播从输出向输入传播梯度,因此我们先考虑阶段二\visible<5->{。令$\pi^k = \frac{\partial L}{\partial \textbf{s}^k}$表示损失$L$在第$k$层激活函数输入处的梯度,利用链式法则有}
    \vspace{-1.5em}
    \visible<5->{
      \begin{eqnarray}
        \pi^K & = & \frac{\partial L}{\partial \textbf{s}^K} \nonumber \\
              & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{s}^K} \nonumber \\
              & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} \nonumber
        %\frac{\partial L}{\partial \textbf{w}^K} & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{w}^K} \nonumber \\
        %                                         & \visible<4->{=} & \visible<4->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{h}^{K-1} \textbf{w}^K)}{\partial \textbf{w}^K} \ \ \ \ (\textrm{因为}\textbf{h}^K=f^K(\textbf{h}^{K-1} \textbf{w}^K))} \nonumber \\
        %                                         & \visible<5->{=} & \visible<5->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} \cdot \frac{\partial \textbf{s}^K}{\partial \textbf{w}^K} \ \ \ (\textrm{因为}\textbf{s}^K=\textbf{h}^{K-1} \textbf{w}^K)} \nonumber
      \end{eqnarray}
    }
  \end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Back-propagation at the output layer: meaning of each factor of pi^K
\begin{frame}{反向传播 - 输出层($\textbf{s}^K$处的梯度)}
  \begin{center}
    \begin{tikzpicture}
      \begin{scope}
        % pi^K = dL/dh^K . df^K(s^K)/ds^K, one node per factor so that each
        % factor can be highlighted on its own overlay.
        \node [anchor=center] (factor00) at (0,0) {${\displaystyle \pi^K \ = }$};
        \node [anchor=west] (factor01) at (factor00.east) {${\displaystyle \frac{\partial L}{\partial \textbf{h}^K}}$};
        \node [anchor=west,inner sep=1pt] (factor02) at (factor01.east) {${\displaystyle \cdot}$};
        \node [anchor=west] (factor03) at (factor02.east) {${\displaystyle \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}}$};
        \begin{pgfonlayer}{background}
          % Overlays 2-4: highlight the loss derivative
          \visible<2-4>{
            \node [rectangle,inner sep=0em,fill=red!20] [fit = (factor01)] (p1) {};
          }
          % Overlays 3-4: highlight the activation derivative
          \visible<3-4>{
            \node [rectangle,inner sep=0em,fill=blue!20] [fit = (factor03)] (p2) {};
          }
          % Overlay 5 onwards: highlight the (element-wise) product sign
          \visible<5->{
            \node [circle,inner sep=0em,fill=green!20] [fit = (factor02)] (p3) {};
          }
        \end{pgfonlayer}
      \end{scope}
    \end{tikzpicture}
  \end{center}
  \begin{itemize}
    % The inline colour swatches are stand-alone pictures; they no longer reuse
    % the node name factor01 or anchor to factor00 from the picture above.
    % Sign: d/dh [ 1/2 ||y - h||^2 ] = h - y.
    \item<2-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=red!20] {$\frac{\partial L}{\partial \textbf{h}^K}$};}} 表示损失$L$相对网络输出的变化率,比如,对于$L = \frac{1}{2} \|\hat{\textbf{y}} - \textbf{h}^K\|^2$,有$\frac{\partial L}{\partial \textbf{h}^K} = \textbf{h}^K - \hat{\textbf{y}}$
    \item<3-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=blue!20] {$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$};}} 表示激活函数相对于它自己的输入的变化率,比如,对于$f(\textbf{s}) = \frac{1}{1+\exp(-\textbf{s})}$,有$\frac{\partial f(\textbf{s})}{\partial \textbf{s}} = f(\textbf{s})(1-f(\textbf{s}))$
    \item<4-> 这个结果符合直觉,在$\textbf{s}^K$处的梯度相当于损失函数微分($\frac{\partial L}{\partial \textbf{h}^K}$)和激活函数微分($\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$)的乘积\visible<5->{,注意这里所有操作都是单元级,比如张量按单元乘法}
  \end{itemize}
  % Overlay 4 onwards: small diagram of the two gradients being multiplied
  \visible<4->{
    \vspace{-0.5em}
    \begin{center}
      \begin{tikzpicture}
        \begin{scope}
          \node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at (0,0) {$\textbf{s}^{K}$};
          \node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
          \node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
          \draw [->] (s.east) -- (h2.west);
          \draw [->] (h2.east) -- (l.west);
          \draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial L}{\partial \textbf{h}^K} = ?$}}};
          \draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
          % Vertical ticks delimiting the two backward segments
          \draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
          \draw [-,very thick,red] ([yshift=0.5em]h2.north) -- ([yshift=1.5em]h2.north);
          \draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
        \end{scope}
      \end{tikzpicture}
    \end{center}
  }
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Back-propagation at the output layer: computing dL/dw^K and dL/dh^{K-1}
\begin{frame}{反向传播 - 输出层($\textbf{h}^{K-1}$处的梯度)}
  % The two remaining unknowns, revealed on overlay 2
  \begin{itemize}
    \item 已经得到$\textbf{s}^K$处的梯度\visible<2->{,下面求解两个问题}
      \begin{enumerate}
        \item<2-> 计算损失$L$对于第$K$层参数矩阵$\textbf{w}^K$的梯度,$\frac{\partial L}{\partial \textbf{w}^K}$
        \item<2-> 计算损失$L$对于第$K$层输入$\textbf{h}^{K-1}$的梯度,$\frac{\partial L}{\partial \textbf{h}^{K-1}}$
      \end{enumerate}
  \end{itemize}
  \vspace{-0.8em}
  \begin{center}
    \begin{tikzpicture}
      % "layer box" collects the options shared by the four boxed nodes
      \begin{scope}[layer box/.style={draw,minimum height=1.7em}]
        % Forward chain: h^{K-1} -> s^K -> h^K -> L
        \node [layer box,anchor=center,fill=yellow!20] (h) at (0,0) {$\textbf{h}^{K-1}$};
        \node [layer box,anchor=west,fill=blue!20] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
        \node [layer box,anchor=west,fill=green!20] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
        \node [layer box,anchor=west,fill=orange!20] (l) at ([xshift=5.5em]h2.east) {$L$};
        \draw [->] (h.east) -- (s.west);
        \draw [->] (s.east) -- (h2.west);
        \draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
        % Formula captions above the two forward edges
        \node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
        \node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
        \node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
        % Reminder that pi^K is already known, with a curved pointer to s^K
        \node [anchor=south west] (slabel) at ([yshift=1em,xshift=0.3em]s.north) {\scriptsize{\textbf{\alert{已经得到:$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$}}}};
        \draw [->,red] ([yshift=0.3em]slabel.south) .. controls +(south:0.5) and +(north:0.5) .. ([xshift=0.5em]s.north);
        % Overlay 2: backward arrow from s^K to h^{K-1} with the two unknowns
        \visible<2->{
          \draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\tiny{\alert{$\frac{\partial L}{\partial \textbf{w}^K} = ?$, $\frac{\partial L}{\partial \textbf{h}^{K-1}} = ?$}}};
          \draw [-,very thick,red] ([yshift=0.5em]h.north) -- ([yshift=1.5em]h.north);
          \draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
        }
      \end{scope}
    \end{tikzpicture}
  \end{center}
  % Overlay 3: the closed-form results
  \begin{itemize}
    \item<3-> 由于$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$,而且$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$已经求解,可以得到(需要一些数学分析和线性代数的知识,推导一下!):
      \vspace{-1.2em}
      \begin{eqnarray}
        \frac{\partial L}{\partial \textbf{w}^K} & = & [\textbf{h}^{K-1}]^T \pi^K \nonumber \\
        \frac{\partial L}{\partial \textbf{h}^{K-1}} & = & \pi^K [\textbf{w}^K]^T\nonumber
      \end{eqnarray}
      这里,$[\textbf{A}]^T$表示$\textbf{A}$的转置,$\pi^K [\textbf{w}^K]^T$表示张量$\pi^K$\alert{矩阵乘}$\textbf{w}^K$的转置
  \end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Back-propagation through a hidden layer
\begin{frame}{反向传播 - 隐层}
  \begin{itemize}
    % The quantity given here is dL/dh^k. The symbol pi^k is reserved for
    % dL/ds^k, as in the second item and in the figure below.
    \item 对于任意隐层$k$,有$\textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1}\textbf{w}^k)$。给定:隐层输出处的梯度$\frac{\partial L}{\partial \textbf{h}^{k}}$,需要
      \begin{enumerate}
        \item 计算损失$L$对于第$k$层参数矩阵$\textbf{w}^k$的梯度,$\frac{\partial L}{\partial \textbf{w}^k}$
        \item 计算损失$L$对于第$k$层输入$\textbf{h}^{k-1}$的梯度,$\frac{\partial L}{\partial \textbf{h}^{k-1}}$
      \end{enumerate}
    \item<2-> 直接套用上一页的方法,可以将$\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}} \cdot \frac{\partial f^k(\textbf{s}^k)}{\partial \textbf{s}^{k}}$反向传播
      \vspace{-0.0em}
      \begin{eqnarray}
        \frac{\partial L}{\partial \textbf{w}^k} & = & [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
        \frac{\partial L}{\partial \textbf{h}^{k-1}} & = & \pi^k [\textbf{w}^k]^T\nonumber
      \end{eqnarray}
  \end{itemize}
  % Overlay 3 onwards: diagram of one hidden layer inside the network
  \visible<3->{
    \begin{center}
      \begin{tikzpicture}
        \begin{scope}
          % Forward chain: ... -> h^{k-1} -> s^k -> h^k -> ...
          \node [anchor=center,draw,fill=red!20,minimum height=1.8em,minimum width=2.5em] (h) at (0,0) {$\textbf{h}^{k-1}$};
          \node [anchor=west,draw,fill=blue!20,minimum height=1.8em,minimum width=2.5em] (s) at ([xshift=6em]h.east) {$\textbf{s}^{k}$};
          \node [anchor=west,draw,fill=green!20,minimum height=1.8em,minimum width=2.5em] (h2) at ([xshift=6em]s.east) {$\textbf{h}^{k}$};
          \node [anchor=east] (prev) at ([xshift=-2em]h.west) {\dots};
          \node [anchor=west] (next) at ([xshift=2em]h2.east) {\dots};
          \draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
          \draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
          \draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
          \draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);
          % Overlays 4-6: backward arrows, revealed one edge at a time
          \visible<4->{
            \draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{反向传播}};
          }
          \visible<5->{
            \draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{反向传播}};
          }
          \visible<6->{
            \draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{反向传播}};
          }
          % Overlay 7: dashed loop indicating the recursion over layers
          \visible<7->{
            \draw [->,thick,red,dashed] ([yshift=-0.1em]h.south) -- ([yshift=-1em]h.south) -- ([yshift=-1em]h2.south) -- ([yshift=-0.1em]h2.south);
            \node [anchor=north,red] (recur) at ([yshift=-1em]s.south) {\scriptsize{$k=k-1$重复上述过程}};
          }
          % Gradient labels above the boxes, in step with the backward arrows
          \visible<4->{
            \node [anchor=south] (h2label) at (h2.north) {$\frac{\partial L}{\partial \textbf{h}^{k}}$};
          }
          \visible<5->{
            \node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
          }
          \visible<6->{
            \node [anchor=south] (hlabel) at (h.north) {$\frac{\partial L}{\partial \textbf{h}^{k-1}}$, $\frac{\partial L}{\partial \textbf{w}^{k}}$};
          }
        \end{scope}
      \end{tikzpicture}
    \end{center}
  }
\end{frame}
\end{CJK}
\end{document}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论