new pages

d613a736 · xiaotong · 44d2f25a · d613a736 · d613a736
Commit d613a736 authored Oct 18, 2019 by xiaotong
--- a/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
@@ -116,165 +116,70 @@
 \subsection{参数学习 - 反向传播}
 %%%------------------------------------------------------------------------------------------------------------
-%%% 输出层的反向传播
+%%% 隐层的反向传播
-\begin{frame}{反向传播 - 输出层}
+\begin{frame}{反向传播 - 隐层}
 \begin{itemize}
-\item 输出层(两个阶段)
+\item 对于任意隐层$k$，$\textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1}\textbf{w}^k)$。给定：隐层输出处的梯度$\frac{\partial L}{\partial \textbf{h}^{k}}$，需要
+	\begin{enumerate}
+	\item 计算损失$L$对于第$k$层参数矩阵$\textbf{w}^k$的梯度，$\frac{\partial L}{\partial \textbf{w}^k}$
+	\item 计算损失$L$对于第$k$层输入$\textbf{h}^{k-1}$的梯度，$\frac{\partial L}{\partial \textbf{h}^{k-1}}$
+	\end{enumerate}
+\item<2-> 直接套用上一页的方法，可以将$\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}} \frac{\partial f^k(\textbf{s}^k)}{\partial \textbf{s}^{k}}$反向传播
+\vspace{-0.0em}
+\begin{eqnarray}
+\frac{\partial L}{\partial \textbf{w}^k}      & = & [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
+\frac{\partial L}{\partial \textbf{h}^{k-1}} & = & \pi^k  [\textbf{w}^k]^T\nonumber
+\end{eqnarray}
 \end{itemize}
-\vspace{-0.5em}
+\visible<3->{
 \begin{center}
 \begin{tikzpicture}
 \begin{scope}
-\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
+\node [anchor=center,draw,fill=red!20,minimum height=1.8em,minimum width=2.5em] (h) at (0,0) {$\textbf{h}^{k-1}$};
-\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
+\node [anchor=west,draw,fill=blue!20,minimum height=1.8em,minimum width=2.5em] (s) at ([xshift=6em]h.east) {$\textbf{s}^{k}$};
-\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
+\node [anchor=west,draw,fill=green!20,minimum height=1.8em,minimum width=2.5em] (h2) at ([xshift=6em]s.east) {$\textbf{h}^{k}$};
-\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
+\node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
-\draw [->] (h.east) -- (s.west);
+\node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
-\draw [->] (s.east) -- (h2.west);
+\draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
-\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
+\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
+\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
-\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
+\draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);
-\node [anchor=south west,inner sep=2pt] (step101) at (step100.north west) {\tiny{线性变换}};
-\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
-\node [anchor=south west,inner sep=2pt] (step201) at (step200.north west) {\tiny{激活函数}};
-\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
-\visible<2->{
-\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] (h.south west) -- (s.south west) node [pos=0.5,below,yshift=-1em] {\scriptsize{\textbf{第一阶段：线性变换}}};
-}
-\visible<3->{
-\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] ([xshift=0.2em]s.south west) -- (l.south east) node [pos=0.5,below,yshift=-1em] (step2) {\scriptsize{\textbf{第二阶段：激活函数+损失函数}}};
-}
 \visible<4->{
-\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^k} = ?$}}};
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{反向传播}};
-\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
-\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
 }
-\end{scope}
-\end{tikzpicture}
-\end{center}
-\begin{itemize}
-\item<4-> 反向传播从输出向输入传播梯度，因此我们先考虑阶段二\visible<5->{。令$\pi^k = \frac{\partial L}{\partial \textbf{s}^k}$表示损失$L$在第$k$层激活函数输入处的梯度，利用链式法有}
-\vspace{-1.5em}
 \visible<5->{
-\begin{eqnarray}
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{反向传播}};
-\pi^K & = & \frac{\partial L}{\partial \textbf{s}^K} \nonumber \\
-         & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{s}^K}  \nonumber \\
-         & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}  \nonumber
-%\frac{\partial L}{\partial \textbf{w}^K} & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{w}^K} \nonumber \\
-%                                                         & \visible<4->{=} & \visible<4->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{h}^{K-1} \textbf{w}^K)}{\partial \textbf{w}^K} \ \ \ \ (\textrm{因为}\textbf{h}^K=f^K(\textbf{h}^{K-1} \textbf{w}^K))} \nonumber \\
-%                                                         & \visible<5->{=} & \visible<5->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} \cdot \frac{\partial \textbf{s}^K}{\partial \textbf{w}^K}  \ \ \ (\textrm{因为}\textbf{s}^K=\textbf{h}^{K-1} \textbf{w}^K)} \nonumber
-\end{eqnarray}
 }
-\end{itemize}
+\visible<6->{
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{反向传播}};
-\end{frame}
-%%%------------------------------------------------------------------------------------------------------------
-%%% 输出层的反向传播 - 各个因子的意义
-\begin{frame}{反向传播 - 输出层(续)}
-\begin{center}
-\begin{tikzpicture}
-\begin{scope}
-\node [anchor=center] (factor00) at (0,0) {${\displaystyle \pi^K \ = }$};
-\node [anchor=west] (factor01) at (factor00.east) {${\displaystyle \frac{\partial L}{\partial \textbf{h}^K}}$};
-\node [anchor=west,inner sep=1pt] (factor02) at (factor01.east) {${\displaystyle \cdot}$};
-\node [anchor=west] (factor03) at (factor02.east) {${\displaystyle \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}}$};
-\begin{pgfonlayer}{background}
-\visible<2-4>{
-\node [rectangle,inner sep=0em,fill=red!20] [fit = (factor01)] (p1) {};
 }
-\visible<3-4>{
-\node [rectangle,inner sep=0em,fill=blue!20] [fit = (factor03)] (p2) {};
-}
-\visible<5->{
-\node [circle,inner sep=0em,fill=green!20] [fit = (factor02)] (p3) {};
-}
-\end{pgfonlayer}
-\end{scope}
+\visible<7->{
-\end{tikzpicture}
+\draw [->,thick,red,dashed] ([yshift=-0.1em]h.south) -- ([yshift=-1em]h.south) -- ([yshift=-1em]h2.south) -- ([yshift=-0.1em]h2.south);
-\end{center}
+\node [anchor=north,red] (recur) at ([yshift=-1em]s.south) {\scriptsize{$k=k-1$重复上述过程}};
+}
-\begin{itemize}
-\item<2-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=red!20] (factor01) at (factor00.east) {$\frac{\partial L}{\partial \textbf{h}^K}$};}} 表示损失$L$相对网络输出的变化率，比如，对于$L = \frac{1}{2} ||\hat{\textbf{y}} - \textbf{h}^K||^2$，有$\frac{\partial L}{\partial \textbf{h}^K} = \hat{\textbf{y}} - \textbf{h}^K$
-\item<3-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=blue!20] (factor01) at (factor00.east) {$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$};}} 表示激活函数相对于它自己的输入的变化率，比如，对于$f(\textbf{s}) = \frac{1}{1+\exp(-\textbf{s})}$，有$\frac{\partial f(\textbf{s})}{\partial \textbf{s}} = f(\textbf{s})(1-f(\textbf{s}))$
-\item<4-> 这个结果符合直觉，在$s^K$出的梯度相当于在损失函数微分($\frac{\partial L}{\partial \textbf{h}^K}$)和激活函数微分($\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$) 的乘积\visible<5->{，注意这里所有操作都是单元级，比如张量按单元乘法}
-\end{itemize}
 \visible<4->{
-\vspace{-0.5em}
+\node [anchor=south] (h2label) at (h2.north) {$\frac{\partial L}{\partial \textbf{h}^{k}}$};
-\begin{center}
-\begin{tikzpicture}
-\begin{scope}
-\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at (0,0) {$\textbf{s}^{K}$};
-\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
-\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
-\draw [->] (s.east) -- (h2.west);
-\draw [->] (h2.east) -- (l.west);
-\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial L}{\partial \textbf{h}^k} = ?$}}};
-\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
-\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
-\draw [-,very thick,red] ([yshift=0.5em]h2.north) -- ([yshift=1.5em]h2.north);
-\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
-\end{scope}
-\end{tikzpicture}
-\end{center}
 }
-\end{frame}
+\visible<5->{
+\node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
-%%%------------------------------------------------------------------------------------------------------------
+}
-%%% 输出层的反向传播 - 求 dL/dw
-\begin{frame}{反向传播 - 输出层}
-\begin{itemize}
-\item 输出层(两个阶段)
-\end{itemize}
-\vspace{-0.5em}
-\begin{center}
-\begin{tikzpicture}
-\begin{scope}
-\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
-\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
-\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
-\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
-\draw [->] (h.east) -- (s.west);
-\draw [->] (s.east) -- (h2.west);
-\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
-\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
-\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
-\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
-\node [anchor=south west] (slabel) at ([yshift=1em,xshift=0.3em]s.north) {\scriptsize{\textbf{\alert{已经得到：$\pi^K$}}}};
-\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\tiny{\alert{$\frac{\partial L}{\partial \textbf{w}^k} = ?$, $\frac{\partial L}{\partial \textbf{h}^k} = ?$}}};
+\visible<6->{
-\draw [-,very thick,red] ([yshift=0.5em]h.north) -- ([yshift=1.5em]h.north);
+\node [anchor=south] (hlabel) at (h.north) {$\frac{\partial L}{\partial \textbf{h}^{k-1}}$, $\frac{\partial L}{\partial \textbf{w}^{k}}$};
-\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
+}
 \end{scope}
 \end{tikzpicture}
 \end{center}
+}
 \end{frame}

--- a/Section05-Neural-Networks-and-Language-Modeling/section05.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05.tex
@@ -3333,5 +3333,256 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 \end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+%%% 输出层的反向传播
+\begin{frame}{反向传播 - 输出层}
+\begin{itemize}
+\item 输出层(两个阶段)
+\end{itemize}
+\vspace{-1.0em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
+\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
+\draw [->] (h.east) -- (s.west);
+\draw [->] (s.east) -- (h2.west);
+\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
+\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
+\node [anchor=south west,inner sep=2pt] (step101) at (step100.north west) {\tiny{线性变换}};
+\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
+\node [anchor=south west,inner sep=2pt] (step201) at (step200.north west) {\tiny{激活函数}};
+\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
+\visible<2->{
+\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] (h.south west) -- (s.south west) node [pos=0.5,below,yshift=-1em] {\scriptsize{\textbf{第一阶段：线性变换}}};
+}
+\visible<3->{
+\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] ([xshift=0.2em]s.south west) -- (l.south east) node [pos=0.5,below,yshift=-1em] (step2) {\scriptsize{\textbf{第二阶段：激活函数+损失函数}}};
+}
+\visible<4->{
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^K} = ?$}}};
+\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
+\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
+}
+\end{scope}
+\end{tikzpicture}
+\end{center}
+\begin{itemize}
+\item<4-> 反向传播从输出向输入传播梯度，因此我们先考虑阶段二\visible<5->{。令$\pi^k = \frac{\partial L}{\partial \textbf{s}^k}$表示损失$L$在第$k$层激活函数输入处的梯度，利用链式法则有}
+\vspace{-1.5em}
+\visible<5->{
+\begin{eqnarray}
+\pi^K & = & \frac{\partial L}{\partial \textbf{s}^K} \nonumber \\
+         & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{s}^K}  \nonumber \\
+         & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}  \nonumber
+%\frac{\partial L}{\partial \textbf{w}^K} & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{w}^K} \nonumber \\
+%                                                         & \visible<4->{=} & \visible<4->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{h}^{K-1} \textbf{w}^K)}{\partial \textbf{w}^K} \ \ \ \ (\textrm{因为}\textbf{h}^K=f^K(\textbf{h}^{K-1} \textbf{w}^K))} \nonumber \\
+%                                                         & \visible<5->{=} & \visible<5->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} \cdot \frac{\partial \textbf{s}^K}{\partial \textbf{w}^K}  \ \ \ (\textrm{因为}\textbf{s}^K=\textbf{h}^{K-1} \textbf{w}^K)} \nonumber
+\end{eqnarray}
+}
+\end{itemize}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+%%% 输出层的反向传播 - 各个因子的意义
+\begin{frame}{反向传播 - 输出层($\textbf{s}^K$处的梯度)}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=center] (factor00) at (0,0) {${\displaystyle \pi^K \ = }$};
+\node [anchor=west] (factor01) at (factor00.east) {${\displaystyle \frac{\partial L}{\partial \textbf{h}^K}}$};
+\node [anchor=west,inner sep=1pt] (factor02) at (factor01.east) {${\displaystyle \cdot}$};
+\node [anchor=west] (factor03) at (factor02.east) {${\displaystyle \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}}$};
+\begin{pgfonlayer}{background}
+\visible<2-4>{
+\node [rectangle,inner sep=0em,fill=red!20] [fit = (factor01)] (p1) {};
+}
+\visible<3-4>{
+\node [rectangle,inner sep=0em,fill=blue!20] [fit = (factor03)] (p2) {};
+}
+\visible<5->{
+\node [circle,inner sep=0em,fill=green!20] [fit = (factor02)] (p3) {};
+}
+\end{pgfonlayer}
+\end{scope}
+\end{tikzpicture}
+\end{center}
+\begin{itemize}
+\item<2-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=red!20] (factor01) at (factor00.east) {$\frac{\partial L}{\partial \textbf{h}^K}$};}} 表示损失$L$相对网络输出的变化率，比如，对于$L = \frac{1}{2} ||\hat{\textbf{y}} - \textbf{h}^K||^2$，有$\frac{\partial L}{\partial \textbf{h}^K} = \textbf{h}^K - \hat{\textbf{y}}$
+\item<3-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=blue!20] (factor01) at (factor00.east) {$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$};}} 表示激活函数相对于它自己的输入的变化率，比如，对于$f(\textbf{s}) = \frac{1}{1+\exp(-\textbf{s})}$，有$\frac{\partial f(\textbf{s})}{\partial \textbf{s}} = f(\textbf{s})(1-f(\textbf{s}))$
+\item<4-> 这个结果符合直觉，在$\textbf{s}^K$处的梯度相当于损失函数微分($\frac{\partial L}{\partial \textbf{h}^K}$)和激活函数微分($\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$) 的乘积\visible<5->{，注意这里所有操作都是单元级，比如张量按单元乘法}
+\end{itemize}
+\visible<4->{
+\vspace{-0.5em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at (0,0) {$\textbf{s}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
+\draw [->] (s.east) -- (h2.west);
+\draw [->] (h2.east) -- (l.west);
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial L}{\partial \textbf{h}^K} = ?$}}};
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
+\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
+\draw [-,very thick,red] ([yshift=0.5em]h2.north) -- ([yshift=1.5em]h2.north);
+\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
+\end{scope}
+\end{tikzpicture}
+\end{center}
+}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+%%% 输出层的反向传播 - 求 dL/dw
+\begin{frame}{反向传播 - 输出层($\textbf{h}^{K-1}$处的梯度)}
+\begin{itemize}
+\item 已经得到$\textbf{s}^K$处的梯度\visible<2->{，下面求解两个问题}
+	\begin{enumerate}
+	\item<2-> 计算损失$L$对于第$K$层参数矩阵$\textbf{w}^K$的梯度，$\frac{\partial L}{\partial \textbf{w}^K}$
+	\item<2-> 计算损失$L$对于第$K$层输入$\textbf{h}^{K-1}$的梯度，$\frac{\partial L}{\partial \textbf{h}^{K-1}}$
+	\end{enumerate}
+\end{itemize}
+\vspace{-0.8em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
+\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
+\draw [->] (h.east) -- (s.west);
+\draw [->] (s.east) -- (h2.west);
+\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
+\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
+\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
+\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
+\node [anchor=south west] (slabel) at ([yshift=1em,xshift=0.3em]s.north) {\scriptsize{\textbf{\alert{已经得到：$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$}}}};
+\draw [->,red] ([yshift=0.3em]slabel.south) .. controls +(south:0.5) and +(north:0.5) .. ([xshift=0.5em]s.north);
+\visible<2->{
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\tiny{\alert{$\frac{\partial L}{\partial \textbf{w}^K} = ?$, $\frac{\partial L}{\partial \textbf{h}^{K-1}} = ?$}}};
+\draw [-,very thick,red] ([yshift=0.5em]h.north) -- ([yshift=1.5em]h.north);
+\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
+}
+\end{scope}
+\end{tikzpicture}
+\end{center}
+\begin{itemize}
+\item<3-> 由于$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$，而且$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$已经求解，可以得到(需要一些数学分析和线性代数的知识，推导一下!)：
+\vspace{-1.2em}
+\begin{eqnarray}
+\frac{\partial L}{\partial \textbf{w}^K}      & = & [\textbf{h}^{K-1}]^T \pi^K \nonumber \\
+\frac{\partial L}{\partial \textbf{h}^{K-1}} & = & \pi^K  [\textbf{w}^K]^T\nonumber
+\end{eqnarray}
+这里，$[\textbf{A}]^T$表示$\textbf{A}$的转置，$\pi^K  [\textbf{w}^K]^T$表示张量$\pi^K$\alert{矩阵乘}$\textbf{w}^K$的转置
+\end{itemize}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+%%% 隐层的反向传播
+\begin{frame}{反向传播 - 隐层}
+\begin{itemize}
+\item 对于任意隐层$k$，$\textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1}\textbf{w}^k)$。给定：隐层输出处的梯度$\frac{\partial L}{\partial \textbf{h}^{k}}$，需要
+	\begin{enumerate}
+	\item 计算损失$L$对于第$k$层参数矩阵$\textbf{w}^k$的梯度，$\frac{\partial L}{\partial \textbf{w}^k}$
+	\item 计算损失$L$对于第$k$层输入$\textbf{h}^{k-1}$的梯度，$\frac{\partial L}{\partial \textbf{h}^{k-1}}$
+	\end{enumerate}
+\item<2-> 直接套用上一页的方法，可以将$\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}} \frac{\partial f^k(\textbf{s}^k)}{\partial \textbf{s}^{k}}$反向传播
+\vspace{-0.0em}
+\begin{eqnarray}
+\frac{\partial L}{\partial \textbf{w}^k}      & = & [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
+\frac{\partial L}{\partial \textbf{h}^{k-1}} & = & \pi^k  [\textbf{w}^k]^T\nonumber
+\end{eqnarray}
+\end{itemize}
+\visible<3->{
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=center,draw,fill=red!20,minimum height=1.8em,minimum width=2.5em] (h) at (0,0) {$\textbf{h}^{k-1}$};
+\node [anchor=west,draw,fill=blue!20,minimum height=1.8em,minimum width=2.5em] (s) at ([xshift=6em]h.east) {$\textbf{s}^{k}$};
+\node [anchor=west,draw,fill=green!20,minimum height=1.8em,minimum width=2.5em] (h2) at ([xshift=6em]s.east) {$\textbf{h}^{k}$};
+\node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
+\node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
+\draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
+\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
+\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
+\draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);
+\visible<4->{
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{反向传播}};
+}
+\visible<5->{
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{反向传播}};
+}
+\visible<6->{
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{反向传播}};
+}
+\visible<7->{
+\draw [->,thick,red,dashed] ([yshift=-0.1em]h.south) -- ([yshift=-1em]h.south) -- ([yshift=-1em]h2.south) -- ([yshift=-0.1em]h2.south);
+\node [anchor=north,red] (recur) at ([yshift=-1em]s.south) {\scriptsize{$k=k-1$重复上述过程}};
+}
+\visible<4->{
+\node [anchor=south] (h2label) at (h2.north) {$\frac{\partial L}{\partial \textbf{h}^{k}}$};
+}
+\visible<5->{
+\node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
+}
+\visible<6->{
+\node [anchor=south] (hlabel) at (h.north) {$\frac{\partial L}{\partial \textbf{h}^{k-1}}$, $\frac{\partial L}{\partial \textbf{w}^{k}}$};
+}
+\end{scope}
+\end{tikzpicture}
+\end{center}
+}
+\end{frame}
 \end{CJK}
 \end{document}