minor updates (sec 9)

757a00c9 · xiaotong · f619e741 · 757a00c9 · 757a00c9
Commit 757a00c9 authored Nov 07, 2020 by xiaotong
--- a/Chapter9/Figures/figure-residual-structure.tex
+++ b/Chapter9/Figures/figure-residual-structure.tex
@@ -8,13 +8,13 @@
 \node [anchor=north](node3)at ([yshift=-1.2em]node6.south){$\bigoplus$};
 \draw[->,thick]([yshift=-0.32em]node3.north)--(node6.south);

-\node [anchor=north,draw,thick](node2)at ([yshift=-1.2em]node3.south){\small{weight layer}};
+\node [anchor=north,draw,thick](node2)at ([yshift=-1.2em]node3.south){\small{函数变换}};
 \draw[->,thick](node2.north)--([yshift=0.35em]node3.south);
-\node[anchor=west](node2-1) at ([xshift=2.1em,yshift=1.2em]node2.east) {$\mathbi{x}$};
-\node[anchor=north](node2-2) at ([xshift=0.2em,yshift=-0.3em]node2-1.south) {\footnotesize{$\rm{identity}$}};
+\node[anchor=west](node2-1) at ([xshift=3.1em,yshift=1.2em]node2.east) {$\mathbi{x}$};
+\node[anchor=north](node2-2) at ([xshift=0.2em,yshift=-0.0em]node2-1.south) {\footnotesize{等值传递}};

-\node [anchor=east](node4) at ([xshift=-0.2em]node2.west) {$\textrm{F}(\mathbi{x})$};
-\node [anchor=east](node5) at ([xshift=-0.3em]node3.west) {$\textrm{F}(\mathbi{x})+\mathbi{x}$};
+\node [anchor=east](node4) at ([xshift=-0.2em]node2.west) {$F(\mathbi{x})$};
+\node [anchor=east](node5) at ([xshift=-0.3em]node3.west) {$F(\mathbi{x})+\mathbi{x}$};

 \node [anchor=north](node1) at ([yshift=-1.8em]node2.south) {};
 \draw[->,thick]([yshift=0.0em]node1.north)--(node2.south);

--- a/Chapter9/chapter9.tex
+++ b/Chapter9/chapter9.tex
@@ -1158,8 +1158,8 @@ y&=&{\textrm{Sigmoid}}({\textrm{Tanh}}({\mathbi{x}}\cdot {\mathbi{W}}^{[1]}+{\ma
 \rule{0pt}{15pt}     Logistic损失 & $ L={\textrm{log}}(1+\widetilde{\mathbi{y}}_i\cdot {\mathbi{y}}_i) $ & 回归  \\
 \rule{0pt}{15pt}     平方损失 & $ L={(\widetilde{\mathbi{y}}_i-{\mathbi{y}}_i)}^2 $ & 回归  \\
 \rule{0pt}{15pt}     指数损失 & $ L={\textrm{exp}}(-\widetilde{\mathbi{y}}_i\cdot {\mathbi{y}}_i) $ & AdaBoost  \\
-\rule{0pt}{15pt}     交叉熵损失 & $ L=-\sum_{k}{{\mathbi{y}}_{ik}}{\textrm {log}} {\widetilde{\mathbi{y}}_{ik}} $ & 多分类  \\
-\rule{0pt}{15pt}     & 其中，${\mathbi{y}}_{ik}$ 表示 ${\mathbi{y}}_i$的第$k$维
+\rule{0pt}{15pt}     交叉熵损失 & $ L=-\sum_{k}{{\mathbi{y}}_{i}[k]}{\textrm {log}} {\widetilde{\mathbi{y}}_{i}[k]} $ & 多分类  \\
+\rule{0pt}{15pt}     & 其中，${\mathbi{y}}_{i}[k]$ 表示 ${\mathbi{y}}_i$的第$k$维
 \end{tabular}
 \end{table}
 %--------------------------------------------------------------------
@@ -1547,7 +1547,7 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot  \f

 \parinterval  网络训练过程中，如果参数的初始值过大，而且每层网络的梯度都大于1，反向传播过程中，各层梯度的偏导数都会比较大，会导致梯度指数级地增长直至超出浮点数表示的范围，这就产生了梯度爆炸现象。如果发生这种情况，模型中离输入近的部分比离输入远的部分参数更新得更快，使网络变得非常不稳定。在极端情况下，模型的参数值变得非常大，甚至于溢出。针对梯度爆炸的问题，常用的解决办法为{\small\sffamily\bfseries{梯度裁剪}}\index{梯度裁剪}（Gradient Clipping）\index{Gradient Clipping}。

-\parinterval    梯度裁剪的思想是设置一个梯度剪切阈值。在更新梯度的时候，如果梯度超过这个阈值，就将其强制限制在这个范围之内。假设梯度为${\mathbi{g}}$，梯度剪切阈值为$\sigma $，梯度裁剪的公式为\eqref{eq:9-43}：
+\parinterval    梯度裁剪的思想是设置一个梯度剪切阈值。在更新梯度的时候，如果梯度的范数超过这个阈值，就将其强制限制在这个范围之内。假设梯度为${\mathbi{g}}$，梯度剪切阈值为$\sigma $，梯度裁剪的公式为：
 \begin{eqnarray}
 {\mathbi{g}}&=&{\textrm{min}}(\frac{\sigma}{\Vert {\mathbi{g}}\Vert},1){\mathbi{g}}
 \label{eq:9-43}