Merge remote-tracking branch 'origin/master' into caorunzhe

0ba97d85 · 曹润柘 · a4b4d34d · 286aaa92 · 0ba97d85 · 0ba97d85
Commit 0ba97d85 authored Mar 11, 2020 by 曹润柘
--- a/Book/Chapter6/Chapter6.tex
+++ b/Book/Chapter6/Chapter6.tex
@@ -130,7 +130,6 @@ SPB                     & 28.3          & 23.2           & -18.0       \\
 NMT                     & $ 21.7^{\ast}$          & $18.7^{\ast}$           & -13.7      \\
 \end{tabular}
 \end{table}
-%--------------------------------------

 \parinterval 更振奋人心的是，神经机器翻译在某些任务上的结果已经相当惊艳，比如在汉英新闻翻译任务中，神经机器翻译就取得了至少和专业翻译人员相媲美的效果\cite{Hassan2018AchievingHP}。在该任务中，神经机器翻译系统（Combo-4、Combo-5 和 Combo-6）的人工评价得分与Reference-HT（专业翻译人员翻译）得分无显著差别，且远超Reference-WMT（WMT的参考译文，也是由人类翻译）得分（表\ref{tab:Human assessment}）。

@@ -891,7 +890,7 @@ $\textrm{a}(\cdot)$可以被看作是目标语表示和源语言表示的一种`
 \begin{figure}[htp]
 \centering
 \input{./Chapter6/Figures/figure-Query-model-corresponding-to-traditional-query-model-vs-attention-mechanism}
-\caption{传统查询模型（a） vs 注意力机制（b）所对应的查询模型}
+\caption{传统查询模型(a)和注意力机制所对应的查询模型(b)}
 \label{fig:6-25}
 \end{figure}
 %----------------------------------------------
@@ -938,9 +937,10 @@ $\textrm{a}(\cdot)$可以被看作是目标语表示和源语言表示的一种`
 \parinterval 将公式\ref{eqC6.29}应用于神经机器翻译有几个基本问题需要考虑：1）损失函数的选择；2）参数初始化的策略，也就是如何设置$\mathbf{w}_0$；3）优化策略和学习率调整策略；4）训练加速。下面我们对这些问题进行讨论。
 %%%%%%%%%%%%%%%%%%
 \subsubsection{损失函数}\index{Chapter6.3.5.1}
-\parinterval 因为神经机器翻译在每个目标语位置都会输出一个概率分布，表示这个位置上不同单词出现的可能性，因此我们需要知道当前位置输出的分布相比于标准答案的``损失''。对于这个问题，常用的是交叉熵损失函数\footnote{\ \ 百度百科：\url{https://baike.baidu.com/item/\%E4\%BA\%A4\%E5\%8F\%89\%E7\%86\%B5/8983241?fr=aladdin}}。令$\mathbf{y}$表示机器翻译模型输出的分布，$\hat{\mathbf{y}}$表示标准答案，则交叉熵损失可以被定义为$L_{ce}(\mathbf{y},\hat{\mathbf{y}}) = - \sum_{k=1}^{|V|} \mathbf{y}[k] \textrm{log} (\hat{\mathbf{y}}[k])$，其中$\mathbf{y}[k]$和$\hat{\mathbf{y}}[k]$分别表示向量$\mathbf{y}$和$\hat{\mathbf{y}}$的第$k$维，$|V|$表示输出向量得维度（等于词表大小）。对于一个模型输出的概率分布$\mathbf{Y} = \{ \mathbf{y}_1,\mathbf{y}_2,…, \mathbf{y}_n \}$和标准答案分布$\hat{\mathbf{Y}}=\{ \hat{\mathbf{y}}_1, \hat{\mathbf{y}}_2,…,\hat{\mathbf{y}}_n \}$，损失函数可以被定义为
+\parinterval 因为神经机器翻译在每个目标语位置都会输出一个概率分布，表示这个位置上不同单词出现的可能性，因此我们需要知道当前位置输出的分布相比于标准答案的``损失''。对于这个问题，常用的是交叉熵损失函数\footnote{\ \ 百度百科：\url{https://baike.baidu.com/item/\%E4\%BA\%A4\%E5\%8F\%89\%E7\%86\%B5/8983241?fr=aladdin}}。令$\mathbf{y}$表示机器翻译模型输出的分布，$\hat{\mathbf{y}}$表示标准答案，则交叉熵损失可以被定义为$L_{\textrm{ce}}(\mathbf{y},\hat{\mathbf{y}}) = - \sum_{k=1}^{|V|} \hat{\mathbf{y}}[k] \textrm{log} (\mathbf{y}[k])$，其中$\mathbf{y}[k]$ 和$\hat{\mathbf{y}}[k]$分别表示向量$\mathbf{y}$和$\hat{\mathbf{y}}$的第$k$维，$|V|$表示输出向量的维度（等于词表大小）。对于一个模型输出的概率分布$\mathbf{Y} = \{ \mathbf{y}_1,\mathbf{y}_2,…, \mathbf{y}_n \}$和标准答案分布$\hat{\mathbf{Y}}=\{ \hat{\mathbf{y}}_1, \hat{\mathbf{y}}_2,…,\hat{\mathbf{y}}_n \}$，损失函数可以被定义为
+%-------------
 \begin{eqnarray}
-L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y}}_j)
+L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\hat{\mathbf{y}}_j)
 \label{eqC6.30}
 \end{eqnarray}

@@ -1006,7 +1006,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 \centering
 %\includegraphics[scale=0.7]{./Chapter6/Figures/Big learning rate vs Small learning rate.png}
 \input{./Chapter6/Figures/figure-convergence&lr}
-\caption{学习率过小函数收敛过程（左） vs 学习率过大函数收敛过程（右） }
+\caption{学习率过小（左） vs 学习率过大（右） }
 \label{fig:6-27}
 \end{figure}
 %----------------------------------------------
@@ -1089,8 +1089,8 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 \parinterval 神经机器翻译的推断是指：利用已经训练好的模型对新的源语言句子进行翻译的过程。具体来说，首先利用编码器生成源语言句子的表示，之后利用解码器预测目标语译文。也就是，对于源语言句子$\mathbf{x}$，生成一个使翻译概率$\textrm{P}(\mathbf{y} | \mathbf{x})$最大的目标语译文$\hat{\mathbf{y}}$，如下（详细过程见\ref{sec:6.3.1}节）：
 %-----------------------
 \begin{eqnarray}
-\hat{\mathbf{y}} & = &  \argmax_y \textrm{P}(\mathbf{y} | \mathbf{x})  \nonumber \\
- & = &  \argmax_y \prod_{j=1}^n \textrm{P}(y_j | \mathbf{y}_{<j},\mathbf{x})
+\hat{\mathbf{y}} & = & \argmax_y \textrm{P}(\mathbf{y} | \mathbf{x}) \nonumber \\
+                 & = & \argmax_y \prod_{j=1}^n \textrm{P}(y_j | \mathbf{y}_{<j},\mathbf{x})
 \label{eqC6.33}
 \end{eqnarray}

@@ -1143,8 +1143,8 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 \parinterval 束搜索是一种启发式图搜索算法。相比于全搜索，它可以减少搜索所占用的空间和时间，在每一步扩展的时候，剪掉一些质量比较差的结点，保留下一些质量较高的结点。具体到机器翻译任务，对于每一个目标语位置，束搜索选择了概率最大的前$K$个单词进行扩展（其中$K$叫做束宽度，或简称为束宽）。如图\ref{fig:6-33}所示，当$K=3$时，若令\{$y_1, y_2,…, y_n$\}表示生成的目标语序列，则束搜索的具体过程为：在预测第一个位置时，我们通过模型得到$y_1$的概率分布，选取概率最大的前3个单词作为候选结果（假设分别为``have'', ``has'', ``it''）。在预测第二个位置的单词时，模型针对已经得到的三个候选结果（``have'', ``has'', ``it''）计算第二个单词的概率分布。例如，我们可以将``have''作为第二步的输入，计算$y_2$的概率分布。此时，译文序列的概率为
 %--------------------------------------------
 \begin{eqnarray}
-\textrm{P} (y_2,y_1 | \mathbf{x}) &=& \textrm{P} (y_2, \textrm{``have''} | \mathbf{x}) \nonumber \\
-&=& \textrm{P}(y_2 | \textrm{``have''} , \mathbf{x}) \textrm{P} (\textrm{``have''} | \mathbf{x})		
+\textrm{P} (y_2,y_1 | \mathbf{x}) & = & \textrm{P} (y_2, \textrm{``have''} | \mathbf{x}) \nonumber \\
+								  & = & \textrm{P}(y_2 | \textrm{``have''} , \mathbf{x}) \textrm{P} (\textrm{``have''} | \mathbf{x})								
 \label{eqC6.36}
 \end{eqnarray}

@@ -1197,6 +1197,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 \end{eqnarray}

 \noindent 显然，当目标语$\mathbf{y}$越短时，$\textrm{lp}(\mathbf{y})$的值越小，因为$\textrm{log P}(\mathbf{y} | \mathbf{x})$是负数，所以句子得分$\textrm{score} ( \mathbf{y} , \mathbf{x})$越小。也就是说，模型会惩罚译文过短的结果。当覆盖度较高时，同样会使得分变低。通过这样的惩罚机制，使模型得分更为合理，从而帮助我们选择出质量更高的译文。
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{实例-GNMT}\index{Chapter6.3.7}
 \parinterval 循环神经网络在机器翻译中有很多成功的应用，比如RNNSearch\cite{bahdanau2014neural}、Nematus\\ \cite{DBLP:journals/corr/SennrichFCBHHJL17}等系统就被很多研究者作为实验系统。在众多基于循环神经网络的系统中，GNMT系统是最成功的一个\cite{Wu2016GooglesNM}。GNMT是谷歌2016年发布的神经机器翻译系统。在GNMT之前，神经机器翻译有三个弱点：训练和推理速度较慢、在翻译稀有单词上缺乏鲁棒性和有时无法完整翻译源语言句子中的所有单词。GNMT的提出有效地解决了上述问题。
@@ -1219,7 +1220,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 表
 \begin{table}[htp]
 \centering
-\caption{GNMT与当时最优秀的模型}
+\caption{GNMT与其它翻译模型对比\cite{Wu2016GooglesNM}}
 \label{tab:gnmt vs state-of-the-art models}
 \begin{tabular}{l l l l}
 \multicolumn{1}{l|}{\multirow{2}{*}{\#}} & \multicolumn{2}{c}{\textbf{BLEU}} & \multirow{2}{*}{\textbf{CPU decoding time}} \\
@@ -1302,7 +1303,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 \begin{figure}[htp]
 \centering
 \input{./Chapter6/Figures/figure-Dependencies-between-words-of-Attention}
-\caption{注意力机制中单词之间的依赖关系}
+\caption{自注意力机制中单词之间的依赖关系}
 \label{fig:6-35}
 \end{figure}
 %----------------------------------------------
@@ -1544,10 +1545,8 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 \parinterval 多头机制具体的计算公式如下：
 %-------------------------------------------------------
 \begin{eqnarray}
-\textrm{MultiHead}(\mathbf{Q}, \mathbf{K} , \mathbf{V})& =&
-\textrm{Concat} (\mathbf{head}_1, ... , \mathbf{head}_h ) \mathbf{W}^o \nonumber \\
-\textrm{where} \mathbf{head}_i & = &\textrm{Attention} (\mathbf{Q}\mathbf{W}_i^Q ,
- \mathbf{K}\mathbf{W}_i^K  , \mathbf{V}\mathbf{W}_i^V )
+\textrm{MultiHead}(\mathbf{Q}, \mathbf{K} , \mathbf{V})& = & \textrm{Concat} (\mathbf{head}_1, ... , \mathbf{head}_h ) \mathbf{W}^o \\
+\textrm{where}\ \mathbf{head}_i & = &\textrm{Attention} (\mathbf{Q}\mathbf{W}_i^Q , \mathbf{K}\mathbf{W}_i^K  , \mathbf{V}\mathbf{W}_i^V )
 \label{eqC6.46}
 \end{eqnarray}

@@ -1663,7 +1662,7 @@ lrate = d_{model}^{-0.5} \cdot \textrm{min} (step^{-0.5} , step \cdot warmup\_st
 \begin{figure}[htp]
 \centering
 \input{./Chapter6/Figures/figure-lrate-of-transformer}
-\caption{Transformer模型的学习率调整曲线}
+\caption{Transformer模型的学习率曲线}
 \label{fig:6-52}
 \end{figure}
 %----------------------------------------------
@@ -1704,12 +1703,12 @@ lrate = d_{model}^{-0.5} \cdot \textrm{min} (step^{-0.5} , step \cdot warmup\_st
 % 表
 \begin{table}[htp]
 \centering
-\caption{三种Transformer的实验对比}
+\caption{三种Transformer模型的对比}
 \label{tab:word-translation-examples}
 \begin{tabular}{l | l l l}

-\multirow{2}{*}{\#}   & \multicolumn{2}{c}{\textbf{BLEU}} & \multirow{2}{*}{\textbf{params}} \\
-                      & \textbf{EN-DE}  & \textbf{EN-FR}  &                                  \\ \hline
+\multirow{2}{*}{\#}   & \multicolumn{2}{c}{BLEU} & \multirow{2}{*}{params} \\
+                      & EN-DE  & EN-FR  &                                  \\ \hline
 Transformer Base      & 27.3            & 38.1            & 65$\times 10^{6}$                \\
 Transformer Big       & 28.4            & 41.8            & 213$\times 10^{6}$               \\
 Transformer Deep(48层) & 30.2            & 43.1            & 194$\times 10^{6}$              \\

--- a/Book/Chapter6/Figures/figure-Automatic-generation-of-ancient-poems-based-on-encoder-decoder-framework.tex
+++ b/Book/Chapter6/Figures/figure-Automatic-generation-of-ancient-poems-based-on-encoder-decoder-framework.tex
@@ -8,8 +8,8 @@
 \node [lnode,anchor=west] (l1) at (0,0) {编码器};
 \node [lnode,anchor=west,fill=blue!20] (l2) at ([xshift=3em]l1.east) {解码器};
 \node [anchor=north] (inputs) at ([xshift=-1.5em,yshift=-1em]l1.south) {Inputs: 五 星 红 旗};
-\node [anchor=south] (outputs) at ([xshift=-3.5em,yshift=2em]l2.north) {Outputs: {\color{red}五}云深处小蓬莱\ \  {\color{red}星}斗阑干次第开};
-\node [anchor=south] (outputs1) at ([xshift=-1.5em,yshift=1em]l2.north) {{\color{red}红}旆壁幢春色里\ \  {\color{red}旗}亭鼓吹乐声来};
+\node [anchor=south] (outputs) at ([xshift=-3.5em,yshift=2em]l2.north) {Outputs: {\color{red}五}云深处小蓬莱\ \ {\color{red}星}斗阑干次第开};
+\node [anchor=south] (outputs1) at ([xshift=-1.5em,yshift=1em]l2.north) {{\color{red}红}旆壁幢春色里\ \ {\color{red}旗}亭鼓吹乐声来};
 \draw [->,very thick] ([yshift=-1em]l1.south) -- ([yshift=-0.1em]l1.south);
 \draw [->,very thick] ([yshift=0.1em]l2.north) -- ([yshift=1em]l2.north);
 \draw [->,very thick] ([xshift=0.1em]l1.east) -- ([xshift=-0.1em]l2.west);

--- a/Book/Chapter6/Figures/figure-Automatically-generate-instances-of-couplets.tex
+++ b/Book/Chapter6/Figures/figure-Automatically-generate-instances-of-couplets.tex
@@ -2,8 +2,8 @@

 \begin{tikzpicture}
 \begin{scope}
-\tikzstyle{lnode} = [minimum height=2.5em,minimum width=12em,inner sep=3pt,very thick,rounded corners=2pt,draw=red!75!black,fill=red!5];
-\tikzstyle{rnode} = [minimum height=2.5em,minimum width=12em,inner sep=3pt,very thick,rounded corners=2pt,draw=blue!75!black,fill=blue!5];
+\tikzstyle{lnode} = [minimum height=2.5em,minimum width=12em,inner sep=3pt,rounded corners=2pt,draw=red!75!black,fill=red!5];
+\tikzstyle{rnode} = [minimum height=2.5em,minimum width=12em,inner sep=3pt,rounded corners=2pt,draw=blue!75!black,fill=blue!5];
 \tikzstyle{standard} = [rounded corners=3pt]

 \node [lnode,anchor=west] (l1) at (0,0) {上联：翠竹千支歌盛世};

--- a/Book/Chapter6/Figures/figure-Comparison-of-the-number-of-padding-in-batch.tex
+++ b/Book/Chapter6/Figures/figure-Comparison-of-the-number-of-padding-in-batch.tex
@@ -25,4 +25,5 @@

 \node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (s1) (s3) (p1) (p3)] (box0) {};
 \node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (s4) (s6) (p4) (p5)] (box0) {};
+
 \end{tikzpicture}
\ No newline at end of file
--- a/Book/Chapter6/Figures/figure-Example-of-automatic-translation-of-classical-Chinese.tex
+++ b/Book/Chapter6/Figures/figure-Example-of-automatic-translation-of-classical-Chinese.tex
@@ -3,13 +3,13 @@

 \begin{frame}{}

- \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black]
+ \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,boxrule=1pt]
 {
 \small{古文：侍卫步军都指挥使、彰信节度使李继勋营于寿州城南，唐刘仁赡伺继勋无备，出兵击之，杀士卒数百人，焚其攻具。}
 }
 \end{tcolorbox}
 \vspace{-0.4em}
- \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black]
+ \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black,boxrule=1pt]
 {
 \small{现代文：侍卫步军都指挥使、彰信节度使李继勋在寿州城南扎营，唐刘仁赡窥伺李继勋没有防备，出兵攻打他，杀死士兵几百人，烧毁李继勋的攻城器}
 }
@@ -17,13 +17,13 @@

 \vspace{0.2em}

-\begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black]
+\begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,boxrule=1pt]
 {
 \small{古文：其后人稍稍识之，多延至其家，使为弟子论学。}
 }
 \end{tcolorbox}
 \vspace{-0.4em}
- \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black]
+ \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black,boxrule=1pt]
 {
 \small{现代文：后来的人渐渐认识他，多把他请到家里，让他为弟子讲授学问。}
 }

--- a/Book/Chapter6/Figures/figure-GRU01.tex
+++ b/Book/Chapter6/Figures/figure-GRU01.tex
@@ -74,7 +74,7 @@
                \draw[emph] (aux71) -| (aux32) -| (aux44);
                \node[opnode,circle,draw=red,thick] () at (aux44) {$\sigma$};
            }
-            
+             
        \end{scope}

        \begin{scope}
@@ -86,7 +86,7 @@
       \node[] (tanh) at (aux46){};

        \begin{pgfonlayer}{background}
-            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=6pt,fit=(aux22) (aux76) (z76) (tanh)] (GRU) {};
+            \node[draw,very thick,rectangle,fill=blue!10!white,rounded corners=5pt,inner sep=6pt,fit=(aux22) (aux76) (z76) (tanh)] (GRU) {};
        \end{pgfonlayer}



--- a/Book/Chapter6/Figures/figure-GRU02.tex
+++ b/Book/Chapter6/Figures/figure-GRU02.tex
@@ -99,7 +99,7 @@
        \node[] (tanh) at (aux46){};

        \begin{pgfonlayer}{background}
-            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=6pt,fit=(aux22) (aux76) (z76) (tanh)] (GRU) {};
+            \node[draw,very thick,rectangle,fill=blue!10!white,rounded corners=5pt,inner sep=6pt,fit=(aux22) (aux76) (z76) (tanh)] (GRU) {};
        \end{pgfonlayer}



--- a/Book/Chapter6/Figures/figure-GRU03.tex
+++ b/Book/Chapter6/Figures/figure-GRU03.tex
@@ -118,7 +118,7 @@
        \end{scope}

        \begin{pgfonlayer}{background}
-            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=6pt,fit=(aux22) (aux76) (z76) (tanh)] (GRU) {};
+            \node[draw,very thick,rectangle,fill=blue!10!white,rounded corners=5pt,inner sep=6pt,fit=(aux22) (aux76) (z76) (tanh)] (GRU) {};
        \end{pgfonlayer}



--- a/Book/Chapter6/Figures/figure-LSTM01.tex
+++ b/Book/Chapter6/Figures/figure-LSTM01.tex
@@ -93,7 +93,7 @@
           \node[ ] (o27) at (aux27) { };

        \begin{pgfonlayer}{background}
-            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
+            \node[draw,very thick,rectangle,fill=blue!10!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
        \end{pgfonlayer}

   

--- a/Book/Chapter6/Figures/figure-LSTM02.tex
+++ b/Book/Chapter6/Figures/figure-LSTM02.tex
@@ -108,7 +108,7 @@
         \node[ ] (o27) at (aux27) { };

        \begin{pgfonlayer}{background}
-            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
+            \node[draw,very thick,rectangle,fill=blue!10!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
        \end{pgfonlayer}
        


--- a/Book/Chapter6/Figures/figure-LSTM03.tex
+++ b/Book/Chapter6/Figures/figure-LSTM03.tex
@@ -125,7 +125,7 @@
        \node[ ] (o27) at (aux27) { };

        \begin{pgfonlayer}{background}
-            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
+            \node[draw,very thick,rectangle,fill=blue!10!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
        \end{pgfonlayer}



--- a/Book/Chapter6/Figures/figure-LSTM04.tex
+++ b/Book/Chapter6/Figures/figure-LSTM04.tex
@@ -144,7 +144,7 @@
        \end{scope}

        \begin{pgfonlayer}{background}
-            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
+            \node[draw,very thick,rectangle,fill=blue!10!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
        \end{pgfonlayer}



--- a/Book/Chapter6/Figures/figure-convergence&lr.tex
+++ b/Book/Chapter6/Figures/figure-convergence&lr.tex
@@ -7,8 +7,8 @@
  ytick={0,1,...,4},
  xticklabel style={opacity=0},
  yticklabel style={opacity=0},
-  xlabel={\textbf{$\textrm{W}_t$}},
-  ylabel={\textbf{L($\textrm{W}_t$)}},
+  xlabel={$w$},
+  ylabel={$L(w)$},
  axis line style={->},
  xlabel style={xshift=2.2cm,yshift=1.2cm},
  ylabel style={rotate=-90,xshift=1.5cm,yshift=1.6cm},
@@ -34,8 +34,8 @@
  ytick={0,1,...,4},
  xticklabel style={opacity=0},
  yticklabel style={opacity=0},
-  xlabel={\textbf{$\textrm{W}_t$}},
-  ylabel={\textbf{L($\textrm{W}_t$)}},
+  xlabel={$w$},
+  ylabel={$L(w)$},
  axis line style={->},
  xlabel style={xshift=2.2cm,yshift=1.2cm},
  ylabel style={rotate=-90,xshift=1.5cm,yshift=1.6cm},

--- a/Book/Chapter6/Figures/figure-lrate-of-transformer.tex
+++ b/Book/Chapter6/Figures/figure-lrate-of-transformer.tex
@@ -7,8 +7,8 @@
      width=.60\textwidth,
      height=.40\textwidth,
      legend style={at={(0.60,0.08)}, anchor=south west},
-      xlabel={\footnotesize{num update (10k)}},
-      ylabel={\footnotesize{Learn rate  (\scriptsize{$10^{-3}$)}}},
+      xlabel={\footnotesize{更新步数  (10k)}},
+      ylabel={\footnotesize{学习率  (\scriptsize{$10^{-3}$)}}},
      ylabel style={yshift=-1em},xlabel style={yshift=0.0em},
      yticklabel style={/pgf/number format/precision=2,/pgf/number format/fixed zerofill},
      ymin=0,ymax=0.9, ytick={0.2, 0.4, 0.6, 0.8},

--- a/Book/Chapter6/Figures/figure-self-att-vs-enco-deco-att.tex
+++ b/Book/Chapter6/Figures/figure-self-att-vs-enco-deco-att.tex
@@ -13,11 +13,11 @@

 \node[anchor=north,rounded corners=1pt,minimum width=11.0em,minimum height=3.5em,draw=ugreen!70,very thick,dotted](p1-1) at ([yshift=-5.2em]p1.south) {\small{解码端每个位置的表示}};

-\draw [->,thick,dashed] (word3.south) .. controls +(south:1em) and +(north:1em) .. (p1-1.north);
+\draw [->,thick,dashed] (word3.south) .. controls +(south:1.5em) and +(north:1.5em) .. ([xshift=-0.4em]p1-1.north);
 \draw [->,thick,dashed](word1.south) --(p1-1.north);
-\draw [->,thick,dashed] (word2.south) .. controls +(south:1em) and +(north:1em) .. (p1-1.north);
+\draw [->,thick,dashed] (word2.south) .. controls +(south:1.0em) and +(north:1.5em) .. ([xshift=0.4em]p1-1.north);

-\node[anchor=north](caption1) at ([xshift=0.0em,yshift=-9.5em]p1.south){\small{(a)Self-Attention的输入}};
+\node[anchor=north](caption1) at ([xshift=0.0em,yshift=-9.5em]p1.south){\small{(a) Self-Attention的输入}};
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \node[anchor=west,rounded corners=1pt,minimum width=14.0em,minimum height=2.0em,fill=pink!30,draw=black](p2) at ([xshift=5.0em]p1.east){\small{Encoder-Decoder Attention}};

@@ -48,10 +48,10 @@

 \draw[<-,thick,dashed]([xshift=-3.6em,yshift=-3.2em]word1-2.north)--([xshift=-3.6em,yshift=-3.2em]p2.south);
 \draw[<-,thick,dashed]([xshift=3.6em,yshift=-3.2em]word1-2.north)--([xshift=3.6em,yshift=-3.2em]p2.south);
-\draw [->,thick,dashed] (word1-2.south) .. controls +(south:1em) and +(north:1em) .. ([yshift=0.3em]p2-3.north);
+\draw [->,thick,dashed] (word1-2.south) .. controls +(south:1em) and +(north:1.5em) .. ([yshift=0.3em,xshift=-0.4em]p2-3.north);


-\node[anchor=north](caption2) at ([xshift=0.0em,yshift=-9.5em]p2.south){\small{(b)Encoder-Decoder Attention的输入}};
+\node[anchor=north](caption2) at ([xshift=0.0em,yshift=-9.5em]p2.south){\small{(b) Encoder-Decoder Attention的输入}};




--- a/Book/Chapter6/Figures/figure-softmax.tex
+++ b/Book/Chapter6/Figures/figure-softmax.tex
@@ -4,8 +4,8 @@
  width=8cm, height=5cm, 
  xtick={-6,-4,...,6},
  ytick={0,0.5,1},
-  xlabel={\small{\textbf{x}}},
-  ylabel={\small{\textbf{Softmax(x)}}},
+  xlabel={\small{$x$}},
+  ylabel={\small{Softmax($x$)}},
  xlabel style={xshift=3.0cm,yshift=1cm},
  axis y line=middle,
  ylabel style={xshift=-2.4cm,yshift=-0.2cm},

--- a/Book/Chapter6/Figures/figure-the-whole-of-LSTM.tex
+++ b/Book/Chapter6/Figures/figure-the-whole-of-LSTM.tex
@@ -154,25 +154,25 @@
 \end{scope}

 \begin{pgfonlayer}{background}
-\node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
+\node[draw,very thick,rectangle,fill=blue!10!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
 \end{pgfonlayer}

 \begin{scope}
 {
 % forget gate formula
-\node[formulanode,anchor=south east,text width=3.4cm] () at ([shift={(4\base,1.5\base)}]aux51) {遗忘门\\$\mathbf{f}_t=\sigma(\mathbf{W}_f[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_f)$};
+\node[formulanode,anchor=south east,text width=10em] () at ([shift={(4\base,1.5\base)}]aux51) {遗忘门\\$\mathbf{f}_t=\sigma(\mathbf{W}_f[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_f)$};
 }
 {
 % input gate formula
-\node[formulanode,anchor=north east] () at ([shift={(4\base,-1.5\base)}]aux21) {输入门\\$\mathbf{i}_t=\sigma(\mathbf{W}_i[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_i)$\\$\hat{\mathbf{c}}_t=\mathrm{tanh}(\mathbf{W}_c[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_c)$};
+\node[formulanode,anchor=north east,text width=10em] () at ([shift={(4\base,-1.5\base)}]aux21) {输入门\\$\mathbf{i}_t=\sigma(\mathbf{W}_i[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_i)$\\$\hat{\mathbf{c}}_t=\mathrm{tanh}(\mathbf{W}_c[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_c)$};
 }
 {
 % cell update formula
-\node[formulanode,anchor=south west,text width=3.02cm] () at ([shift={(-4\base,1.5\base)}]aux59) {记忆更新\\$\mathbf{c}_{t}=\mathbf{f}_t\cdot \mathbf{c}_{t-1}+\mathbf{i}_t\cdot \hat{\mathbf{c}}_t$};
+\node[formulanode,anchor=south west,text width=10em] () at ([shift={(-4\base,1.5\base)}]aux59) {记忆更新\\$\mathbf{c}_{t}=\mathbf{f}_t\cdot \mathbf{c}_{t-1}+\mathbf{i}_t\cdot \hat{\mathbf{c}}_t$};
 }
 {
 % output gate formula
-\node[formulanode,anchor=north west] () at ([shift={(-4\base,-1.5\base)}]aux29) {输出门\\$\mathbf{o}_t=\sigma(\mathbf{W}_o[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_o)$\\$\mathbf{h}_{t}=\mathbf{o}_t\cdot \mathrm{tanh}(\mathbf{c}_{t})$};
+\node[formulanode,anchor=north west,text width=10em] () at ([shift={(-4\base,-1.5\base)}]aux29) {输出门\\$\mathbf{o}_t=\sigma(\mathbf{W}_o[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_o)$\\$\mathbf{h}_{t}=\mathbf{o}_t\cdot \mathrm{tanh}(\mathbf{c}_{t})$};
 }
 \end{scope}
 \end{tikzpicture}

--- a/Book/mt-book.bbl
+++ b/Book/mt-book.bbl