new update

44d2f25a · xiaotong · a204dca2 · 44d2f25a
Commit 44d2f25a authored Oct 18, 2019 by xiaotong
--- a/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
@@ -128,15 +128,20 @@
 \begin{tikzpicture}
 \begin{scope}
 \node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
-\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=6.5em]h.east) {$\textbf{s}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
-\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=6.5em]s.east) {$\textbf{h}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
-\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=4em]h2.east) {$L$};
+\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
-\draw [->] (h.east) -- (s.west) node [pos=0.5,above] {\tiny{线性变换$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
+\draw [->] (h.east) -- (s.west);
-\draw [->] (s.east) -- (h2.west) node [pos=0.5,above] {\tiny{激活函数$\textbf{h}^K = f^K(\textbf{s}^K)$}};
+\draw [->] (s.east) -- (h2.west);
-\end{scope}
 \draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
-\node [anchor=south] (outputlabel) at ([yshift=0.3em]h2.north) {\scriptsize{\textbf{网络输出层}}};
+\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
+\node [anchor=south west,inner sep=2pt] (step101) at (step100.north west) {\tiny{线性变换}};
+\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
+\node [anchor=south west,inner sep=2pt] (step201) at (step200.north west) {\tiny{激活函数}};
+\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
 \visible<2->{
 \draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] (h.south west) -- (s.south west) node [pos=0.5,below,yshift=-1em] {\scriptsize{\textbf{第一阶段：线性变换}}};
@@ -145,19 +150,21 @@
 \draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] ([xshift=0.2em]s.south west) -- (l.south east) node [pos=0.5,below,yshift=-1em] (step2) {\scriptsize{\textbf{第二阶段：激活函数+损失函数}}};
 }
-\begin{pgfonlayer}{background}
 \visible<4->{
-\node [rectangle,inner sep=0em,fill=red!20] [fit = (step2)] (step2label) {};
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^K} = ?$}}};
+\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
+\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
 }
-\end{pgfonlayer}
+\end{scope}
 \end{tikzpicture}
 \end{center}
 \begin{itemize}
-\item<4-> 反向传播从输出向输入传播梯度，因此我们先考虑阶段二。令$\pi^k = \frac{\partial L}{\partial \textbf{s}^k}$表示损失$L$在第$k$层激活函数输入处的梯度\visible<5->{，利用链式法有}
+\item<4-> 反向传播从输出向输入传播梯度，因此我们先考虑阶段二\visible<5->{。令$\pi^k = \frac{\partial L}{\partial \textbf{s}^k}$表示损失$L$在第$k$层激活函数输入处的梯度，利用链式法则有}
-\vspace{-1em}
+\vspace{-1.5em}
 \visible<5->{
 \begin{eqnarray}
 \pi^K & = & \frac{\partial L}{\partial \textbf{s}^K} \nonumber \\
@@ -180,19 +187,21 @@
 \begin{tikzpicture}
 \begin{scope}
-\node [anchor=center] (factor00) at (0,0) {${\displaystyle \frac{\partial L}{\partial \textbf{w}^K} \ = }$};
+\node [anchor=center] (factor00) at (0,0) {${\displaystyle \pi^K \ = }$};
 \node [anchor=west] (factor01) at (factor00.east) {${\displaystyle \frac{\partial L}{\partial \textbf{h}^K}}$};
 \node [anchor=west,inner sep=1pt] (factor02) at (factor01.east) {${\displaystyle \cdot}$};
 \node [anchor=west] (factor03) at (factor02.east) {${\displaystyle \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}}$};
-\node [anchor=west,inner sep=1pt] (factor04) at (factor03.east) {${\displaystyle \cdot}$};
-\node [anchor=west] (factor05) at (factor04.east) {${\displaystyle  \frac{\partial \textbf{s}^K}{\partial \textbf{w}^K}}$};
 \begin{pgfonlayer}{background}
+\visible<2-4>{
 \node [rectangle,inner sep=0em,fill=red!20] [fit = (factor01)] (p1) {};
+}
+\visible<3-4>{
 \node [rectangle,inner sep=0em,fill=blue!20] [fit = (factor03)] (p2) {};
-\node [rectangle,inner sep=0em,fill=green!20] [fit = (factor05)] (p3) {};
+}
-\node [circle,inner sep=0em,fill=purple!20] [fit = (factor02)] (p4) {};
+\visible<5->{
-\node [circle,inner sep=0em,fill=purple!20] [fit = (factor04)] (p5) {};
+\node [circle,inner sep=0em,fill=green!20] [fit = (factor02)] (p3) {};
+}
 \end{pgfonlayer}
 \end{scope}
@@ -200,12 +209,73 @@
 \end{center}
 \begin{itemize}
-\item \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=red!20] (factor01) at (factor00.east) {$\frac{\partial L}{\partial \textbf{h}^K}$};}} 表示损失$L$相对网络输出的变化率，比如，对于$L = \frac{1}{2} (\hat{\textbf{y}} - \textbf{h}^K)^2$，有$\frac{\partial L}{\partial \textbf{h}^K} = \hat{\textbf{y}} - \textbf{h}^K$
+\item<2-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=red!20] (factor01) at (factor00.east) {$\frac{\partial L}{\partial \textbf{h}^K}$};}} 表示损失$L$相对网络输出的变化率，比如，对于$L = \frac{1}{2} ||\hat{\textbf{y}} - \textbf{h}^K||^2$，有$\frac{\partial L}{\partial \textbf{h}^K} = \textbf{h}^K - \hat{\textbf{y}}$
-\item \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=blue!20] (factor01) at (factor00.east) {$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$};}} 表示激活函数相对于它自己的输入的变化率，比如，对于$f(\textbf{s}) = \frac{1}{1+\exp(-\textbf{s})}$，有$\frac{\partial f(\textbf{s})}{\partial \textbf{s}} = f(\textbf{s})(1-f(\textbf{s}))$
+\item<3-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=blue!20] (factor01) at (factor00.east) {$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$};}} 表示激活函数相对于它自己的输入的变化率，比如，对于$f(\textbf{s}) = \frac{1}{1+\exp(-\textbf{s})}$，有$\frac{\partial f(\textbf{s})}{\partial \textbf{s}} = f(\textbf{s})(1-f(\textbf{s}))$
-\item \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=green!20] (factor01) at (factor00.east) {$\frac{\partial \textbf{s}^K}{\partial \textbf{w}^K}$};}} 表示激活函数的输入相对于参数矩阵$\textbf{w}^K$的变化率，根据简单的数学，可以得到$\frac{\partial \textbf{s}^K}{\partial \textbf{w}^K} = \frac{\partial \textbf{h}^{K-1}\textbf{w}^K}{\partial \textbf{w}^K}$
+\item<4-> 这个结果符合直觉，在$\textbf{s}^K$处的梯度相当于损失函数微分($\frac{\partial L}{\partial \textbf{h}^K}$)和激活函数微分($\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$)的乘积\visible<5->{，注意这里所有操作都是单元级，比如张量按单元乘法}
+\end{itemize}
+\visible<4->{
+\vspace{-0.5em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at (0,0) {$\textbf{s}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
+\draw [->] (s.east) -- (h2.west);
+\draw [->] (h2.east) -- (l.west);
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial L}{\partial \textbf{h}^K} = ?$}}};
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
+\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
+\draw [-,very thick,red] ([yshift=0.5em]h2.north) -- ([yshift=1.5em]h2.north);
+\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
+\end{scope}
+\end{tikzpicture}
+\end{center}
+}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+%%% 输出层的反向传播 - 求 dL/dw
+\begin{frame}{反向传播 - 输出层}
+\begin{itemize}
+\item 输出层(两个阶段)
 \end{itemize}
+\vspace{-0.5em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
+\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
+\draw [->] (h.east) -- (s.west);
+\draw [->] (s.east) -- (h2.west);
+\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};
+\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
+\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
+\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
+\node [anchor=south west] (slabel) at ([yshift=1em,xshift=0.3em]s.north) {\scriptsize{\textbf{\alert{已经得到：$\pi^K$}}}};
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\tiny{\alert{$\frac{\partial L}{\partial \textbf{w}^K} = ?$, $\frac{\partial L}{\partial \textbf{h}^{K-1}} = ?$}}};
+\draw [-,very thick,red] ([yshift=0.5em]h.north) -- ([yshift=1.5em]h.north);
+\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
+\end{scope}
+\end{tikzpicture}
+\end{center}
 \end{frame}
 \end{CJK}