Commit 7bca970d by zengxin

chapter10 12

parent 3ae1ebe0
......@@ -15,9 +15,9 @@
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$\vectorn{h}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\vectorn{h}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\vectorn{h}_{m}$}};
\node[] (enclabel1) at (enc1) {\tiny{$\vectorn{\emph{h}}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\vectorn{\emph{h}}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\vectorn{\emph{h}}_{m}$}};
\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
......@@ -29,7 +29,7 @@
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\vectorn{s}_\x$}}};
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\vectorn{\emph{s}}_\x$}}};
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
......@@ -73,10 +73,10 @@
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\vectorn{s}_i$}};
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\vectorn{\emph{s}}_i$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\funp{P}(y_i|...)$}};
}
{
......@@ -90,7 +90,7 @@
\node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east) {\scriptsize{源语编码器最后一个}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\vectorn{C}$}};
\node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\vectorn{\emph{C}}$}};
}
\begin{pgfonlayer}{background}
......
......@@ -20,13 +20,13 @@
\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\scriptsize{...}};
}
{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\scriptsize{$\textbf{s}_1$}};
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\scriptsize{$\vectorn{\emph{s}}_1$}};
}
{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\scriptsize{$\textbf{s}_2$ ($\times 3$)}};
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\scriptsize{$\vectorn{\emph{s}}_2$ ($\times 3$)}};
}
{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\scriptsize{$\textbf{s}_3$ ($\times 3$)}};
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\scriptsize{$\vectorn{\emph{s}}_3$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\scriptsize{...}};
}
{
......@@ -121,17 +121,17 @@
}
{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20,text=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\scriptsize{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\scriptsize{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\scriptsize{$\textbf{C}_2$}};
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20,text=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\scriptsize{$\vectorn{\emph{C}}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\scriptsize{$\vectorn{\emph{C}}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\scriptsize{$\vectorn{\emph{C}}_2$}};
\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
}
{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20,text=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\scriptsize{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20,text=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\scriptsize{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\scriptsize{$\textbf{C}_3$}};
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20,text=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\scriptsize{$\vectorn{\emph{C}}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20,text=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\scriptsize{$\vectorn{\emph{C}}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\scriptsize{$\vectorn{\emph{C}}_3$}};
\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}
......
......@@ -13,7 +13,7 @@
}
{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\scriptsize{$\textbf{s}_1$}};
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\scriptsize{$\vectorn{\emph{s}}_1$}};
}
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\scriptsize{$\langle$sos$\rangle$}};
......
......@@ -2,9 +2,9 @@
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\textbf{h}_1$}};
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\vectorn{\emph{h}}_1$}};
\node [anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{...}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\textbf{h}_m$}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\vectorn{\emph{h}}_m$}};
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [anchor=west] (e2) at ([xshift=1em]e1.east) {\tiny{...}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
......@@ -19,7 +19,7 @@
\draw [->] ([xshift=0.1em]h1.east) -- ([xshift=-0.1em]h2.west);
\draw [->] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]h3.west);
\draw [->] ([xshift=-0.8em]h1.west) -- ([xshift=-0.1em]h1.west) node [pos=0,left,inner sep=2pt] {\tiny{0}};
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};
{
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]h3.east) {\tiny{$e_y()$}};
......@@ -33,14 +33,14 @@
\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
}
{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\vectorn{\emph{s}}_1$}};
}
{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$}};
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\vectorn{\emph{s}}_2$}};
}
{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\textbf{s}_4$}};
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\vectorn{\emph{s}}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\vectorn{\emph{s}}_4$}};
\node [anchor=west,inner sep=2pt] (s5) at ([xshift=0.3em]s4.east) {\tiny{...}};
}
{
......@@ -131,7 +131,7 @@
}
{
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c2) at ([yshift=2em]h2.north) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c2) at ([yshift=2em]h2.north) {\tiny{$\vectorn{\emph{C}}_2$}};
\node [anchor=south] (c2label) at (c2.north) {\tiny{\textbf{注意力机制:上下文}}};
\node [anchor=south] (c2more) at ([yshift=-1.5em]c2.south) {\tiny{...}};
\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c2.250);
......@@ -143,12 +143,12 @@
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\vectorn{\emph{C}}_3$}};
\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\textbf{C}_4$}};
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\vectorn{\emph{C}}_4$}};
\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s4.west);
}
......
......@@ -78,8 +78,8 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{x}_t$};
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\end{scope}
......
......@@ -91,8 +91,8 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{x}_t$};
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\end{scope}
......
......@@ -109,11 +109,11 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{x}_t$};
\node[wordnode,anchor=south] () at (aux71) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
{
\node[wordnode,anchor=east] () at (aux87) {$\vectorn{h}_{t}$};
\node[wordnode,anchor=south] () at (aux78) {$\vectorn{h}_{t}$};
\node[wordnode,anchor=east] () at (aux87) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=south] () at (aux78) {$\vectorn{\emph{h}}_{t}$};
}
\end{scope}
......
......@@ -84,9 +84,9 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{c}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
\end{scope}
......
......@@ -99,9 +99,9 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{c}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
\end{scope}
......
......@@ -113,11 +113,11 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{c}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
{
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{c}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{\emph{c}}_{t}$};
}
\end{scope}
......
......@@ -131,15 +131,15 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{c}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
{
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{c}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{\emph{c}}_{t}$};
}
{
\node[wordnode,anchor=east] () at (aux68) {$\vectorn{h}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\vectorn{h}_{t}$};
\node[wordnode,anchor=east] () at (aux68) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\vectorn{\emph{h}}_{t}$};
}
\end{scope}
......
......@@ -17,9 +17,9 @@
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
\node[] (enclabel1) at (enc1) {\tiny{$\vectorn{\emph{h}}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\vectorn{\emph{h}}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\vectorn{\emph{h}}_{m}$}};
\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
......@@ -31,7 +31,7 @@
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\vectorn{\emph{s}}_\x$}}};
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
......@@ -123,13 +123,13 @@
\draw [->,thick] ([xshift=0.2em,yshift=0.1em]hidden.north west) -- (target.south west);
\draw [->,thick] ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- (target.south east);
\node [anchor=south] () at ([yshift=0.3em]hidden.north) {\scriptsize{$\hat{\vectorn{s}}=\vectorn{Ws}$}};
\node [anchor=south] () at ([yshift=0.3em]hidden.north) {\scriptsize{$\hat{\vectorn{\emph{s}}}_j=\vectorn{\emph{s}}_j \vectorn{\emph{W}}_o$}};
}
{
\node [rounded corners=0.3em] (softmax) at ([yshift=1.25em]target.north) {\scriptsize{$p(\hat{s}_i)=\frac{e^{\hat{s}_i}}{\sum_j e^{\hat{s}_j}}$}};
\filldraw [fill=blue!20,draw=white] ([yshift=0.1em]cell11.north west) {[rounded corners=0.3em] -- (softmax.west)} -- (label1.south west) -- (label8.south east) {[rounded corners=0.3em] -- (softmax.east)} -- ([yshift=0.1em]cell18.north east) -- ([yshift=0.1em]cell11.north west);
\node [rounded corners=0.3em] (softmax) at ([yshift=1.25em]target.north) {\scriptsize{$p(\hat{s}_i)=\frac{e^{\hat{s}_i}}{\sum_j e^{\hat{s}_j}}$}};
\node [rounded corners=0.3em] (softmax) at ([yshift=1.25em]target.north) {\scriptsize{$p(\hat{s}_{jk})=\frac{e^{\hat{s}_{jk}}}{\sum_n e^{\hat{s}_{jn}}}$}};
}
\draw [-latex'] ([yshift=-0.3cm]hidden.south) to (hidden.south);
{
......
......@@ -141,15 +141,15 @@
\end{scope}
\begin{scope}
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{h}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{x}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{c}_{t-1}$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\vectorn{\emph{h}}_{t-1}$};
\node[wordnode,anchor=west] () at (aux12) {$\vectorn{\emph{x}}_t$};
\node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\vectorn{\emph{c}}_{t-1}$};
{
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{c}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\vectorn{\emph{c}}_{t}$};
}
{
\node[wordnode,anchor=east] () at (aux68) {$\vectorn{h}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\vectorn{h}_{t}$};
\node[wordnode,anchor=east] () at (aux68) {$\vectorn{\emph{h}}_{t}$};
\node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\vectorn{\emph{h}}_{t}$};
}
\end{scope}
......@@ -170,19 +170,19 @@
\begin{scope}
{
% forget gate formula
\node[formulanode,anchor=south east,text width=10em] () at ([shift={(4\base,1.5\base)}]aux51) {遗忘门\\$\vectorn{f}_t=\sigma(\vectorn{W}_f[\vectorn{h}_{t-1},\vectorn{x}_t]+\vectorn{b}_f)$};
\node[formulanode,anchor=south east,text width=10em] () at ([shift={(4\base,1.5\base)}]aux51) {遗忘门\\$\vectorn{\emph{f}}_t=\sigma(\vectorn{\emph{W}}_f[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]+\vectorn{\emph{b}}_f)$};
}
{
% input gate formula
\node[formulanode,anchor=north east,text width=10em] () at ([shift={(4\base,-1.5\base)}]aux21) {输入门\\$\vectorn{i}_t=\sigma(\vectorn{W}_i[\vectorn{h}_{t-1},\vectorn{x}_t]+\vectorn{b}_i)$\\$\hat{\vectorn{c}}_t=\mathrm{tanh}(\vectorn{W}_c[\vectorn{h}_{t-1},\vectorn{x}_t]+\vectorn{b}_c)$};
\node[formulanode,anchor=north east,text width=10em] () at ([shift={(4\base,-1.5\base)}]aux21) {输入门\\$\vectorn{\emph{i}}_t=\sigma(\vectorn{\emph{W}}_i[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]+\vectorn{\emph{b}}_i)$\\$\hat{\vectorn{\emph{c}}}_t=\mathrm{tanh}(\vectorn{\emph{W}}_c[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]+\vectorn{\emph{b}}_c)$};
}
{
% cell update formula
\node[formulanode,anchor=south west,text width=10em] () at ([shift={(-4\base,1.5\base)}]aux59) {记忆更新\\$\vectorn{c}_{t}=\vectorn{f}_t\cdot \vectorn{c}_{t-1}+\vectorn{i}_t\cdot \hat{\vectorn{c}}_t$};
\node[formulanode,anchor=south west,text width=10em] () at ([shift={(-4\base,1.5\base)}]aux59) {记忆更新\\$\vectorn{\emph{c}}_{t}=\vectorn{\emph{f}}_t\cdot \vectorn{\emph{c}}_{t-1}+\vectorn{\emph{i}}_t\cdot \hat{\vectorn{\emph{c}}}_t$};
}
{
% output gate formula
\node[formulanode,anchor=north west,text width=10em] () at ([shift={(-4\base,-1.5\base)}]aux29) {输出门\\$\vectorn{o}_t=\sigma(\vectorn{W}_o[\vectorn{h}_{t-1},\vectorn{x}_t]+\vectorn{b}_o)$\\$\vectorn{h}_{t}=\vectorn{o}_t\cdot \mathrm{tanh}(\vectorn{c}_{t})$};
\node[formulanode,anchor=north west,text width=10em] () at ([shift={(-4\base,-1.5\base)}]aux29) {输出门\\$\vectorn{\emph{o}}_t=\sigma(\vectorn{\emph{W}}_o[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]+\vectorn{\emph{b}}_o)$\\$\vectorn{\emph{h}}_{t}=\vectorn{\emph{o}}_t\cdot \mathrm{tanh}(\vectorn{\emph{c}}_{t})$};
}
\end{scope}
\end{tikzpicture}
......
......@@ -14,9 +14,9 @@
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
\node[] (enclabel1) at (enc1) {\tiny{$\vectorn{h}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\vectorn{h}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\vectorn{h}_{m}$}};
\node[] (enclabel1) at (enc1) {\tiny{$\vectorn{\emph{h}}_{m-2}$}};
\node[] (enclabel2) at (enc2) {\tiny{$\vectorn{\emph{h}}_{m-1}$}};
\node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\vectorn{\emph{h}}_{m}$}};
\node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
\node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
......@@ -28,7 +28,7 @@
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
\foreach \x in {1,2,...,3}
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\vectorn{s}_\x$}}};
\node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\vectorn{\emph{s}}_\x$}}};
\foreach \x in {1,2,...,3}
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
\node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
......
......@@ -290,7 +290,7 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
\end{figure}
%----------------------------------------------
\parinterval 实际上,编码器-解码器模型也并不是表示学习实现的唯一途径。比如,在第章提到的神经语言模型实际上也是一种有效的学习句子表示的方法,它所衍生出的预训练模型可以从大规模单语数据上学习句子的表示形式。这种学习会比使用少量的双语数据进行编码端和解码端的学习更加充分。相比机器翻译任务,语言模型相当于一个编码器的学习 \footnote{相比神经机器翻译的编码器,神经语言模型会多出一个输出层,这时可以直接把神经语言模型的中间层的输出作为编码器的输出。},可以无缝嵌入到神经机器翻译模型中。不过,值得注意的是,机器翻译的目的是解决双语字符串之间的映射问题,因此它所使用的句子表示是为了更好地进行翻译。从这个角度说,机器翻译中的表示学习又和语言模型中的表示学习有不同。不过,这里不会深入讨论神经语言模型和预训练与神经机器翻译之间的异同,感兴趣的读者可以参看第九章的相关内容。
\parinterval 实际上,编码器-解码器模型也并不是表示学习实现的唯一途径。比如,在第九章提到的神经语言模型实际上也是一种有效的学习句子表示的方法,它所衍生出的预训练模型可以从大规模单语数据上学习句子的表示形式。这种学习会比使用少量的双语数据进行编码端和解码端的学习更加充分。相比机器翻译任务,语言模型相当于一个编码器的学习 \footnote{相比神经机器翻译的编码器,神经语言模型会多出一个输出层,这时可以直接把神经语言模型的中间层的输出作为编码器的输出。},可以无缝嵌入到神经机器翻译模型中。不过,值得注意的是,机器翻译的目的是解决双语字符串之间的映射问题,因此它所使用的句子表示是为了更好地进行翻译。从这个角度说,机器翻译中的表示学习又和语言模型中的表示学习有不同。不过,这里不会深入讨论神经语言模型和预训练与神经机器翻译之间的异同,感兴趣的读者可以参看第九章的相关内容。
\parinterval 还有一点,在神经机器翻译中,句子的表示形式可以有很多选择。使用单个向量表示一个句子是一种最简单的方法。当然,也可以用矩阵、高阶张量完成表示。甚至,在解码时动态地生成源语言的表示结果。这部分技术也会在随后的内容中进行介绍。
......@@ -402,7 +402,7 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
\parinterval 显然,根据上下文中提到的“没吃饭”、“很饿”,最佳的答案是“吃 饭”或者“吃 东西”。也就是,对序列中某个位置的答案进行预测时需要记忆当前时刻之前的序列信息,因此,{\small\bfnew{循环神经网络}}\index{循环神经网络}(Recurrent Neural Network, RNN)\index{Recurrent Neural Network, RNN}应运而生。实际上循环神经网络有着极为广泛的应用,例如语音识别、语言建模以及即将要介绍的神经机器翻译。
\parinterval 第九章已经对循环神经网络的基本知识进行过介绍。这里再回顾一下。简单来说,循环神经网络由循环单元组成。对于序列中的任意时刻,都有一个循环单元与之对应,它会融合当前时刻的输入和上一时刻循环单元的输出,生成当前时刻的输出。这样每个时刻的信息都会被传递到下一时刻,这也间接达到了记录历史信息的目的。比如,对于序列$\seq{x}=\{x_1, x_2,..., x_m\}$,循环神经网络会按顺序输出一个序列$\vectorn{h}=\{ \vectorn{h}_1, \vectorn{h}_2,..., \vectorn{h}_m \}$,其中$\vectorn{h}_i$表示$i$时刻循环神经网络的输出(通常为一个向量)。
\parinterval 第九章已经对循环神经网络的基本知识进行过介绍。这里再回顾一下。简单来说,循环神经网络由循环单元组成。对于序列中的任意时刻,都有一个循环单元与之对应,它会融合当前时刻的输入和上一时刻循环单元的输出,生成当前时刻的输出。这样每个时刻的信息都会被传递到下一时刻,这也间接达到了记录历史信息的目的。比如,对于序列$\vectorn{\emph{x}}=\{x_1, x_2,..., x_m\}$,循环神经网络会按顺序输出一个序列$\vectorn{\emph{h}}=\{ \vectorn{\emph{h}}_1, \vectorn{\emph{h}}_2,..., \vectorn{\emph{h}}_m \}$,其中$\vectorn{\emph{h}}_i$表示$i$时刻循环神经网络的输出(通常为一个向量)。
\parinterval\ref{fig:10-9}展示了一个循环神经网络处理序列问题的实例。当前时刻循环单元的输入由上一个时刻的输入和当前时刻的输入组成,因此也可以理解为,网络当前时刻计算得到的输出是由之前的序列共同决定的,即网络在不断地传递信息的过程中记忆了历史信息。以最后一个时刻的循环单元为例,它在对“开始”这个单词的信息进行处理时,参考了之前所有词(“<sos>\ \ 我们”)的信息。
......@@ -426,59 +426,60 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
\end{figure}
%----------------------------------------------
\parinterval 从数学模型上看,神经机器翻译模型与统计机器翻译的目标是一样的:在给定源语言句子$\seq{x}$的情况下,找出翻译概率最大的目标语译文$\hat{\seq{y}}$:
\parinterval 从数学模型上看,神经机器翻译模型与统计机器翻译的目标是一样的:在给定源语言句子$\vectorn{\emph{x}}$的情况下,找出翻译概率最大的目标语译文$\hat{\vectorn{\emph{y}}}$:
\begin{eqnarray}
\hat{\seq{y}} = \argmax_{\seq{y}} \funp{P} (\seq{y} | \seq{x})
\hat{\vectorn{\emph{y}}} = \argmax_{\vectorn{\emph{y}}} \funp{P} (\vectorn{\emph{y}} | \vectorn{\emph{x}})
\label{eq:10-1}
\end{eqnarray}
\noindent 这里,用$\seq{x}=\{ x_1,x_2,..., x_m \}$表示输入的源语言单词序列,$\seq{y}=\{ y_1,y_2,..., y_n \}$ 表示生成的目标语单词序列。由于神经机器翻译在生成译文时采用的是自左向右逐词生成的方式,并在翻译每个单词时考虑已经生成的翻译结果,因此对$ \funp{P} (\seq{y} | \seq{x})$的求解可以转换为:
\noindent 这里,用$\vectorn{\emph{x}}=\{ x_1,x_2,..., x_m \}$表示输入的源语言单词序列,$\vectorn{\emph{y}}=\{ y_1,y_2,..., y_n \}$ 表示生成的目标语单词序列。由于神经机器翻译在生成译文时采用的是自左向右逐词生成的方式,并在翻译每个单词时考虑已经生成的翻译结果,因此对$ \funp{P} (\vectorn{\emph{y}} | \vectorn{\emph{x}})$的求解可以转换为:
\begin{eqnarray}
\funp{P} (\seq{y} | \seq{x}) = \prod_{j=1}^{n} \funp{P} ( y_j | \seq{y}_{<j }, \seq{x} )
\funp{P} (\vectorn{\emph{y}} | \vectorn{\emph{x}}) = \prod_{j=1}^{n} \funp{P} ( y_j | \vectorn{\emph{y}}_{<j }, \vectorn{\emph{x}} )
\label{eq:10-2}
\end{eqnarray}
\vspace{-0.5em}
\noindent 其中,$ \seq{y}_{<j }$表示目标语第$j$个位置之前已经生成的译文单词序列。$ \funp{P} ( y_j | \seq{y}_{<j }, \seq{x})$可以被解释为:根据源语句子$\seq{x} $和已生成的目标语言译文片段$\seq{y}_{<j }=\{ y_1, y_2,..., y_{j-1} \}$,生成第$j$个目标语言单词$y_j$的概率。举个简单的例子,已知源文为$\seq{x} =$\{\textrm{“我”, “很好”}\},则译文$\seq{y}=$\{“I’m”, “fine”\}的概率为:
\noindent 其中,$ \vectorn{\emph{y}}_{<j }$表示目标语第$j$个位置之前已经生成的译文单词序列。$ \funp{P} ( y_j | \vectorn{\emph{y}}_{<j }, \vectorn{\emph{x}})$可以被解释为:根据源语句子$\vectorn{\emph{x}} $和已生成的目标语译文片段$\vectorn{\emph{y}}_{<j }=\{ y_1, y_2,..., y_{j-1} \}$,生成第$j$个目标语言单词$y_j$的概率。举个简单的例子,已知源文为$\vectorn{\emph{x}} =$\{\textrm{“我”, “很好”}\},则译文$\vectorn{\emph{y}}=$\{“I’m”, “fine”\}的概率为:
\begin{eqnarray}
\funp{P} ( \{{\textrm{“I'm”,“fine”}}\}|\{\textrm{“我”, “很好”}\}) & = & \funp{P} (\textrm{“I'm”}| \{\textrm{“我”, “很好”}\} ) \cdot \nonumber \\
& & \funp{P} (\textrm{“fine”}|\textrm{“I'm”},\{\textrm{“我”, “很好”}\})
\label{eq:10-3}
\end{eqnarray}
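
\parinterval 为了更直观地理解上式中条件概率的连乘,下面给出一段基于Python的简化示意代码。其中各位置的条件概率直接用假设的数值代替真实模型的输出,实际系统通常在对数域进行累加以避免数值下溢:
\begin{verbatim}
import math

# 假设已经得到各位置的条件概率 P(y_j | y_<j, x),数值仅为示意
step_probs = [0.6, 0.8]

sentence_prob = 1.0      # 概率连乘
sentence_logprob = 0.0   # 对数域累加
for p in step_probs:
    sentence_prob *= p
    sentence_logprob += math.log(p)

print(sentence_prob)               # 约为0.48
print(math.exp(sentence_logprob))  # 与连乘结果一致
\end{verbatim}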
\parinterval 求解$\funp{P}(y_j | \seq{y}_{<j},\seq{x})$有三个关键问题(图\ref{fig:10-11}):
\parinterval 求解$\funp{P}(y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}})$有三个关键问题(图\ref{fig:10-11}):
\vspace{-0.5em}
\begin{itemize}
\vspace{0.5em}
\item 如何对$\seq{x}$$\seq{y}_{<j }$进行分布式表示,即{\small\sffamily\bfseries{词嵌入}}\index{词嵌入}(Word Embedding)\index{Word Embedding}。首先,将由one-hot向量表示的源语言单词,即由0和1构成的离散化向量表示,转化为实数向量。可以把这个过程记为$\textrm{e}_x (\cdot)$。类似的,可以把目标语序列$\seq{y}_{<j }$中的每个单词用同样的方式进行表示,记为$\textrm{e}_y (\cdot)$
\item 如何对$\vectorn{\emph{x}}$$\vectorn{\emph{y}}_{<j }$进行分布式表示,即{\small\sffamily\bfseries{词嵌入}}\index{词嵌入}(Word Embedding)\index{Word Embedding}。首先,将由one-hot向量表示的源语言单词,即由0和1构成的离散化向量表示,转化为实数向量。可以把这个过程记为$\textrm{e}_x (\cdot)$。类似的,可以把目标语序列$\vectorn{\emph{y}}_{<j }$中的每个单词用同样的方式进行表示,记为$\textrm{e}_y (\cdot)$
\vspace{0.5em}
\item 如何在词嵌入的基础上获取整个序列的表示,即句子的{\small\sffamily\bfseries{表示学习}}\index{表示学习}(Representation Learning)\index{Representation Learning}。可以把词嵌入的序列作为循环神经网络的输入,循环神经网络最后一个时刻的输出向量便是整个句子的表示结果。如图\ref{fig:10-11}中,编码器最后一个循环单元的输出$\vectorn{h}_m$被看作是一种包含了源语句子信息的表示结果,记为$\vectorn{C}$
\item 如何在词嵌入的基础上获取整个序列的表示,即句子的{\small\sffamily\bfseries{表示学习}}\index{表示学习}(Representation Learning)\index{Representation Learning}。可以把词嵌入的序列作为循环神经网络的输入,循环神经网络最后一个时刻的输出向量便是整个句子的表示结果。如图\ref{fig:10-11}中,编码器最后一个循环单元的输出$\vectorn{\emph{h}}_m$被看作是一种包含了源语句子信息的表示结果,记为$\vectorn{\emph{C}}$
\vspace{0.5em}
\item 如何得到每个目标语单词的概率,即译文单词的{\small\sffamily\bfseries{生成}}\index{生成}(Generation)\index{Generation}。与神经语言模型一样,可以用一个Softmax输出层来获取当前时刻所有单词的分布,即利用Softmax 函数计算目标语词表中每个单词的概率。令目标语序列$j$时刻的循环神经网络的输出向量(或状态)为$\vectorn{s}_j$。根据循环神经网络的性质,$ y_j$的生成只依赖前一个状态$\vectorn{s}_{j-1}$和当前时刻的输入(即词嵌入$\textrm{e}_y (y_{j-1})$)。同时考虑源语言信息$\vectorn{C}$$\funp{P}(y_j | \seq{y}_{<j},\seq{x})$可以被重新定义为:
\item 如何得到每个目标语单词的概率,即译文单词的{\small\sffamily\bfseries{生成}}\index{生成}(Generation)\index{Generation}。与神经语言模型一样,可以用一个Softmax输出层来获取当前时刻所有单词的分布,即利用Softmax 函数计算目标语词表中每个单词的概率。令目标语序列$j$时刻的循环神经网络的输出向量(或状态)为$\vectorn{\emph{s}}_j$。根据循环神经网络的性质,$ y_j$的生成只依赖前一个状态$\vectorn{\emph{s}}_{j-1}$和当前时刻的输入(即词嵌入$\textrm{e}_y (y_{j-1})$)。同时考虑源语言信息$\vectorn{\emph{C}}$$\funp{P}(y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}})$可以被重新定义为:
\begin{eqnarray}
\funp{P} (y_j | \seq{y}_{<j},\seq{x}) \equiv \funp{P} ( {y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}} )
\funp{P} (y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}}) \equiv \funp{P} ( {y_j | \vectorn{\emph{s}}_{j-1} ,y_{j-1},\vectorn{\emph{C}}} )
\label{eq:10-4}
\end{eqnarray}
$\funp{P}({y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}})$由Softmax实现,Softmax的输入是循环神经网络$j$时刻的输出。在具体实现时,$\vectorn{C}$可以被简单的作为第一个时刻循环单元的输入,即,当$j=1$ 时,解码器的循环神经网络会读入编码器最后一个隐层状态$ \vectorn{h}_m$(也就是$\vectorn{C}$),而其他时刻的隐层状态不直接与$\vectorn{C}$相关。最终,$\funp{P} (y_j | \seq{y}_{<j},\seq{x})$ 被表示为:
$\funp{P}({y_j | \vectorn{\emph{s}}_{j-1} ,y_{j-1},\vectorn{\emph{C}}})$由Softmax实现,Softmax的输入是循环神经网络$j$时刻的输出。在具体实现时,$\vectorn{\emph{C}}$可以被简单的作为第一个时刻循环单元的输入,即,当$j=1$ 时,解码器的循环神经网络会读入编码器最后一个隐层状态$ \vectorn{\emph{h}}_m$(也就是$\vectorn{\emph{C}}$),而其他时刻的隐层状态不直接与$\vectorn{\emph{C}}$相关。最终,$\funp{P} (y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}})$ 被表示为:
\begin{eqnarray}
\funp{P} (y_j | \seq{y}_{<j},\seq{x}) \equiv
\funp{P} (y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}}) \equiv
\left \{ \begin{array}{ll}
\funp{P} (y_j |\vectorn{C} ,y_{j-1}) &j=1 \\
\funp{P} (y_j|\vectorn{s}_{j-1},y_{j-1}) \quad &j>1
\funp{P} (y_j |\vectorn{\emph{C}} ,y_{j-1}) &j=1 \\
\funp{P} (y_j|\vectorn{\emph{s}}_{j-1},y_{j-1}) \quad &j>1
\end{array} \right .
\label{eq:10-5}
\end{eqnarray}
\vspace{0.5em}
\end{itemize}
\parinterval 上面提到的问题中,第九章已经介绍过输入层(词嵌入)和输出层(Softmax)的内容,\ref{sec:10.3.2}节将介绍常用的循环神经网络结构(表示学习模型结构)。
\parinterval 下面会对以上问题分别展开讨论。其中,\ref{sec:10.3.2}节会介绍输入层(词嵌入)和输出层(Softmax)的内容,\ref{sec:10.3.3}节会介绍常用的循环神经网络结构(表示学习模型结构)。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter10/Figures/figure-3-base-problom-of-p}
\caption{求解$\funp{P} (y_j | \seq{y}_{<j},\seq{x})$的三个基本问题}
\caption{求解$\funp{P} (y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}})$的三个基本问题}
\label{fig:10-11}
\end{figure}
%----------------------------------------------
......@@ -486,24 +487,114 @@ $\funp{P}({y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}})$由Softmax实现,Sof
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 10.3.2
%----------------------------------------------------------------------------------------
\subsection{循环神经网络结构}
\subsection{输入(词嵌入)及输出(Softmax)}
\label{sec:10.3.2}
\parinterval\ref{sec:10.3.1}节可知,模型第一个需要解决的问题是词嵌入。词嵌入的概念已经在第九章神经语言模型的部分进行过详细介绍,本小节将侧重介绍在循环神经网络中词嵌入的具体计算。
\parinterval 若假设输入的单词$y_j$已经被表示为One-hot形式(行向量),那么词嵌入的工作就是把One-hot向量右乘一个实数矩阵$\vectorn{\emph{E}}$,得到的结果(行向量)就是这个单词所对应的词嵌入结果,该过程可被形式化为:
\begin{eqnarray}
\textrm{e}_y (y_j) = y_j \vectorn{\emph{E}}
\label{eq:6-6}
\end{eqnarray}
\noindent 这里,$\vectorn{\emph{E}}$也被称作词嵌入矩阵,它可以作为模型的一部分参数共同参与机器翻译系统的训练,也可以由外部其他模块训练得到(如预训练模型)。$\vectorn{\emph{E}}$的大小为$|V| \times d$,这里$|V|$表示词表$V$的大小,$d$表示循环神经网络输入和输出向量的维度。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter10/Figures/figure-word-embedding-structure}
\caption{词嵌入的生成过程}
\label{fig:10-12}
\end{figure}
%----------------------------------------------
\parinterval 更具体的,图\ref{fig:10-12}展示了以单词“you”为例的词嵌入生成过程。词嵌入层(图\ref{fig:10-12}左侧绿色方框部分)首先将输入的单词“you”转化成One-hot表示,对应虚线框中的0-1向量,即除了“you”在词表中的索引位置为1,其余位置均为0。然后词嵌入层将这个0-1向量乘以$\vectorn{\emph{E}}$就得到了词嵌入的结果(绿色圆角框框起来部分),这里用$\textrm{e}_y (\cdot)$表示这个过程,即you的词嵌入表示$\textrm{e}_y (\textrm{“you”})$。最后,将单词的词嵌入表示作为当前时间步循环单元(蓝色方框)的输入。
\parinterval 需要注意的是,在上面这个过程中One-hot表示和词嵌入矩阵并不必须调用矩阵乘法才得到词嵌入结果。只需要获得One-hot向量中1对应的索引,从词嵌入矩阵中取出对应的行即可。这种利用索引“取”结果的方式避免了计算代价较高的矩阵乘法,因此在实际系统中很常用。
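
\parinterval 下面用一段基于NumPy的示意代码来说明上述两种等价的词嵌入计算方式(矩阵乘法与按索引取行)。其中词表大小、向量维度和单词索引均为假设的数值,并非真实系统中的设置:
\begin{verbatim}
import numpy as np

V, d = 5, 4                  # 假设:词表大小为5,词嵌入维度为4
E = np.random.rand(V, d)     # 词嵌入矩阵(这里随机初始化,仅为示意)

idx = 2                      # 假设单词"you"在词表中的索引为2
one_hot = np.zeros(V)
one_hot[idx] = 1.0

emb1 = one_hot @ E           # One-hot行向量右乘E
emb2 = E[idx]                # 直接按索引取出E的对应行

print(np.allclose(emb1, emb2))   # True,两种方式结果相同
\end{verbatim}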
\parinterval 介绍完了模型输入,再来看看模型的输出,由图\ref{fig:10-10}可见,在解码端模型每个位置都会预测且输出单词。而在循环神经网络中,每一时刻循环单元的输出为向量$\vectorn{\emph{s}}_j$,我们无法根据这个向量得出要生成的目标语单词的概率。进而需要借助输出层利用$\vectorn{\emph{s}}_j$计算词表中每个单词的生成概率,选取概率最高的单词作为当前时刻的输出。图\ref{fig:10-13}展示了一个输出层进行单词预测的实例。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter10/Figures/figure-output-layer-structur}
\caption{输出层的预测过程}
\label{fig:10-13}
\end{figure}
%----------------------------------------------
\parinterval 输出层的构造很简单,首先对输入的向量$\vectorn{\emph{s}}_j$进行一个线性变换得到$\hat{\vectorn{\emph{s}}}_j$,然后将$\hat{\vectorn{\emph{s}}}_j$送入Softmax函数,即可得到词表的概率分布,具体描述如下:
\begin{eqnarray}
\vectorn{\emph{o}}_j=\textrm{Softmax}( \vectorn{\emph{s}}_j \vectorn{\emph{W}}_o)
\label{eq:10-7}
\end{eqnarray}
\noindent 其中,$\vectorn{\emph{W}}_o $是线性变换的参数矩阵,矩阵的大小为$d \times |V|$,也就是它会把$d$维的向量变为$|V|$维的向量;$\vectorn{\emph{o}}_j$表示输出的结果向量,$\vectorn{\emph{o}}_j$的每一维$\vectorn{\emph{o}}_{jk}$表示在时刻$j$词表$V$中第$k$个单词出现的概率。这里把$\vectorn{\emph{o}}_j(y_j)$记作目标语单词$y_j$的生成概率,显然有
\begin{eqnarray}
\funp{P} (y_j| \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}})=\vectorn{\emph{o}}_j(y_j)
\label{eq:10-8}
\end{eqnarray}
\parinterval Softmax($\cdot$)的作用是根据输入的$|V|$维向量(即$\vectorn{\emph{s}}_j \vectorn{\emph{W}}_o$),得到一个$|V|$维的分布。令$\mathbf{\tau}$表示Softmax($\cdot$)的输入向量,$\tau_k$表示向量的第$k$维。Softmax函数可以被定义为
\begin{eqnarray}
\textrm{Softmax}(\tau_k)=\frac{\textrm{exp}(\tau_k)} {\sum_{k'=1}^{|V|} \textrm{exp}(\tau_{k'})}
\label{eq:10-9}
\end{eqnarray}
\noindent 这里,exp($\cdot$)表示指数函数。Softmax函数是一个典型的归一化函数,它可以将输入的向量的每一维都转化为0-1之间的数,同时保证所有维的和等于1。Softmax的另一个优点是,它本身(对于输出的每一维)都是可微的(如图\ref{fig:10-14}所示),因此可以直接使用基于梯度的方法进行优化。实际上,Softmax经常被用于分类任务。也可以把机器翻译中目标语单词的生成看作一个分类问题,它的类别数是$|V|$。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter10/Figures/figure-softmax}
\caption{ Softmax函数(一维)所对应的曲线}
\label{fig:10-14}
\end{figure}
%----------------------------------------------
\parinterval 为了进一步理解Softmax的计算过程,来看一个简单的例子。假设词表为(“吃饭”,\ “睡觉”,\ “学习”),当预测下一个译文单词时,可以将循环神经网络的输出通过矩阵$\vectorn{\emph{W}}_o$映射到词表大小的向量,得到$\mathbf{\tau}=(-3,1.5,2.7)$,此时再使用Softmax激活函数将其进行归一化:
\begin{eqnarray}
\textrm{Softmax}(\mathbf{\tau})=
\left( \begin{array}{c}
\frac{0.05}{0.05+4.48+14.88} \\
\frac{4.48}{0.05+4.48+14.88} \\
\frac{14.88}{0.05+4.48+14.88}
\end{array} \right)
=
\left( \begin{array}{c}
0.0026 \\
0.2308 \\
0.7666
\end{array} \right)
\label{eq:10-10}
\end{eqnarray}
\parinterval 最终得到在整个词表上的概率分布$(0.0026,0.2308,0.7666)$,其中概率最大的单词“学习”,便是最终的译文单词。
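
\parinterval 这个例子可以用几行Python代码直接验证。下面的实现是Softmax的一种常见写法(先减去最大值以保证数值稳定),其中的得分向量即为上文假设的$(-3,1.5,2.7)$:
\begin{verbatim}
import numpy as np

def softmax(tau):
    tau = tau - np.max(tau)      # 减去最大值,避免指数溢出,不改变结果
    e = np.exp(tau)
    return e / e.sum()

tau = np.array([-3.0, 1.5, 2.7]) # 对应("吃饭", "睡觉", "学习")的得分
p = softmax(tau)
print(np.round(p, 4))            # 约为 [0.0026 0.2309 0.7666]
print(int(p.argmax()))           # 2,即概率最大的单词"学习"
\end{verbatim}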
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 10.3.3
%----------------------------------------------------------------------------------------
\subsection{循环神经网络结构}
\label{sec:10.3.3}
\parinterval 循环神经网络的核心是设计循环单元的结构。至今,研究人员已经提出了很多优秀的循环单元结构,这里将介绍其中三种基本结构:RNN,LSTM和GRU。LSTM\\和GRU是RNN的变体,在自然语言处理任务中得到了广泛的应用。
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{循环神经单元(RNN)}
\subsubsection{1. 循环神经单元(RNN)}
\parinterval RNN(Recurrent Neural Network)是最原始的循环神经网络结构。在RNN中,对于序列$\seq{x}=\{ \vectorn{x}_1, \vectorn{x}_2,...,\vectorn{x}_m \}$,每个时刻$t$都对应一个循环单元,它的输出是一个向量$\vectorn{h}_t$,可以被描述为:
\parinterval RNN(Recurrent Neural Network)是最原始的循环神经网络结构。在RNN中,对于序列$\vectorn{\emph{x}}=\{ \vectorn{\emph{x}}_1, \vectorn{\emph{x}}_2,...,\vectorn{\emph{x}}_m \}$,每个时刻$t$都对应一个循环单元,它的输出是一个向量$\vectorn{\emph{h}}_t$,可以被描述为:
\begin{eqnarray}
\vectorn{h}_t=f(\vectorn{x}_t \vectorn{U}+\vectorn{h}_{t-1} \vectorn{W}+\vectorn{b})
\vectorn{\emph{h}}_t=f(\vectorn{\emph{x}}_t \vectorn{\emph{U}}+\vectorn{\emph{h}}_{t-1} \vectorn{\emph{W}}+\vectorn{\emph{b}})
\label{eq:10-11}
\end{eqnarray}
\noindent 其中$\vectorn{x}_t$是当前时刻的输入,$\vectorn{h}_{t-1}$是上一时刻循环单元的输出,$f(\cdot)$是激活函数,$\vectorn{U}$$\vectorn{W}$是参数矩阵,$\vectorn{b}$是偏置。
\noindent 其中$\vectorn{\emph{x}}_t$是当前时刻的输入,$\vectorn{\emph{h}}_{t-1}$是上一时刻循环单元的输出,$f(\cdot)$是激活函数,$\vectorn{\emph{U}}$$\vectorn{\emph{W}}$是参数矩阵,$\vectorn{\emph{b}}$是偏置。
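
\parinterval 公式\ref{eq:10-11}对应的计算可以用如下基于NumPy的示意代码实现。这里激活函数取Tanh,输入维度、隐层维度以及输入序列均为假设,仅用于说明状态在时刻之间的传递方式:
\begin{verbatim}
import numpy as np

def rnn_step(x_t, h_prev, U, W, b):
    # h_t = f(x_t U + h_{t-1} W + b),这里f取Tanh
    return np.tanh(x_t @ U + h_prev @ W + b)

d_in, d_h = 3, 4                 # 假设输入维度为3,隐层维度为4
U = np.random.rand(d_in, d_h)
W = np.random.rand(d_h, d_h)
b = np.zeros(d_h)

h = np.zeros(d_h)                # 初始状态,一般设为零向量
for t in range(5):               # 依次读入长度为5的随机输入序列(示意)
    x_t = np.random.rand(d_in)
    h = rnn_step(x_t, h, U, W, b)   # h不断被更新,间接记录了历史信息
\end{verbatim}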
\parinterval 虽然RNN的结构很简单,但是已经具有了对序列信息进行记忆的能力。实际上,基于RNN结构的神经语言模型已经能够取得比传统$n$-gram语言模型更优异的性能。在机器翻译中,RNN也可以做为入门或者快速原型所使用的神经网络结构。
......@@ -511,12 +602,12 @@ $\funp{P}({y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}})$由Softmax实现,Sof
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{长短时记忆网络(LSTM)}
\subsubsection{2. 长短时记忆网络(LSTM)}
\label{sec:10.3.3.2}
\parinterval RNN结构使得当前时刻循环单元的状态包含了之前时间步的状态信息。但是这种对历史信息的记忆并不是无损的,随着序列变长,RNN的记忆信息的损失越来越严重。在很多长序列处理任务中(如长文本生成)都观测到了类似现象。对于这个问题,Hochreiter和Schmidhuber提出了{\small\bfnew{长短时记忆}}\index{长短时记忆}(Long Short-term Memory)\index{Long Short-Term Memory}模型,也就是常说的LSTM模型\upcite{HochreiterLong}
\parinterval LSTM模型是RNN模型的一种改进。相比RNN仅传递前一时刻的状态$\vectorn{h}_{t-1}$,LSTM会同时传递两部分信息:状态信息$\vectorn{h}_{t-1}$和记忆信息$\vectorn{c}_{t-1}$。这里,$\vectorn{c}_{t-1}$是新引入的变量,它也是循环单元的一部分,用于显性的记录需要记录的历史内容,$\vectorn{h}_{t-1}$$\vectorn{c}_{t-1}$在循环单元中会相互作用。LSTM通过“门”单元来动态地选择遗忘多少以前的信息和记忆多少当前的信息。LSTM中所使用的门结构如图\ref{fig:10-15}所示,包括遗忘门,输入门和输出门。图中$\sigma$代表Sigmoid函数,它将函数输入映射为0-1范围内的实数,用来充当门控信号。
\parinterval LSTM模型是RNN模型的一种改进。相比RNN仅传递前一时刻的状态$\vectorn{\emph{h}}_{t-1}$,LSTM会同时传递两部分信息:状态信息$\vectorn{\emph{h}}_{t-1}$和记忆信息$\vectorn{\emph{c}}_{t-1}$。这里,$\vectorn{\emph{c}}_{t-1}$是新引入的变量,它也是循环单元的一部分,用于显性的记录需要记录的历史内容,$\vectorn{\emph{h}}_{t-1}$$\vectorn{\emph{c}}_{t-1}$在循环单元中会相互作用。LSTM通过“门”单元来动态地选择遗忘多少以前的信息和记忆多少当前的信息。LSTM中所使用的门结构如图\ref{fig:10-15}所示,包括遗忘门,输入门和输出门。图中$\sigma$代表Sigmoid函数,它将函数输入映射为0-1范围内的实数,用来充当门控信号。
%----------------------------------------------
\begin{figure}[htp]
......@@ -534,34 +625,34 @@ $\funp{P}({y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}})$由Softmax实现,Sof
\begin{itemize}
\vspace{0.5em}
\item {\small\sffamily\bfseries{遗忘}}\index{遗忘}。顾名思义,遗忘的目的是忘记一些历史,在LSTM中通过遗忘门实现,其结构如图\ref{fig:10-15}(a)所示。$\vectorn{x}_{t}$表示时刻$t$的输入向量,$\vectorn{h}_{t-1}$是时刻$t-1$的循环单元的输出,$\vectorn{x}_{t}$$\vectorn{h}_{t-1}$都作为$t$时刻循环单元的输入。$\sigma$将对$\vectorn{x}_{t}$$\vectorn{h}_{t-1}$进行筛选,以决定遗忘的信息,其计算公式如下:
\item {\small\sffamily\bfseries{遗忘}}\index{遗忘}。顾名思义,遗忘的目的是忘记一些历史,在LSTM中通过遗忘门实现,其结构如图\ref{fig:10-15}(a)所示。$\vectorn{\emph{x}}_{t}$表示时刻$t$的输入向量,$\vectorn{\emph{h}}_{t-1}$是时刻$t-1$的循环单元的输出,$\vectorn{\emph{x}}_{t}$$\vectorn{\emph{h}}_{t-1}$都作为$t$时刻循环单元的输入。$\sigma$将对$\vectorn{\emph{x}}_{t}$$\vectorn{\emph{h}}_{t-1}$进行筛选,以决定遗忘的信息,其计算公式如下:
\begin{eqnarray}
\vectorn{f}_t=\sigma(\vectorn{W}_f [\vectorn{h}_{t-1},\vectorn{x}_{t}] + \vectorn{b}_f )
\vectorn{\emph{f}}_t=\sigma(\vectorn{\emph{W}}_f [\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}] + \vectorn{\emph{b}}_f )
\label{eq:10-12}
\end{eqnarray}
这里,$\vectorn{W}_f$是权值,$\vectorn{b}_f$是偏置,$[\vectorn{h}_{t-1},\vectorn{x}_{t}]$表示两个向量的拼接。该公式可以解释为,对$[\vectorn{h}_{t-1},\vectorn{x}_{t}]$进行变换,并得到一个实数向量$\vectorn{f}_t$$\vectorn{f}_t$的每一维都可以被理解为一个“门”,它决定可以有多少信息被留下(或遗忘)。
这里,$\vectorn{\emph{W}}_f$是权值,$\vectorn{\emph{b}}_f$是偏置,$[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}]$表示两个向量的拼接。该公式可以解释为,对$[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}]$进行变换,并得到一个实数向量$\vectorn{\emph{f}}_t$$\vectorn{\emph{f}}_t$的每一维都可以被理解为一个“门”,它决定可以有多少信息被留下(或遗忘)。
\vspace{0.5em}
\item {\small\sffamily\bfseries{记忆更新}}\index{记忆更新}。首先,要生成当前时刻需要新增加的信息,该部分由输入门完成,其结构如图\ref{fig:10-15}(b)红色线部分,图中“$\bigotimes$”表示进行点乘操作。输入门的计算分为两部分,首先利用$\sigma$决定门控参数$\vectorn{i}_t$,然后通过Tanh函数得到新的信息$\hat{\vectorn{c}}_t$,具体公式如下:
\item {\small\sffamily\bfseries{记忆更新}}\index{记忆更新}。首先,要生成当前时刻需要新增加的信息,该部分由输入门完成,其结构如图\ref{fig:10-15}(b)红色线部分,图中“$\bigotimes$”表示进行点乘操作。输入门的计算分为两部分,首先利用$\sigma$决定门控参数$\vectorn{\emph{i}}_t$,然后通过Tanh函数得到新的信息$\hat{\vectorn{\emph{c}}}_t$,具体公式如下:
\begin{eqnarray}
\vectorn{i}_t = \sigma (\vectorn{W}_i [\vectorn{h}_{t-1},\vectorn{x}_{t}] + \vectorn{b}_i )
\vectorn{\emph{i}}_t = \sigma (\vectorn{\emph{W}}_i [\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}] + \vectorn{\emph{b}}_i )
\label{eq:10-13}
\end{eqnarray}
\begin{eqnarray}
\hat{\vectorn{c}}_t = \textrm{Tanh} (\vectorn{W}_c [\vectorn{h}_{t-1},\vectorn{x}_{t}] + \vectorn{b}_c )
\hat{\vectorn{\emph{c}}}_t = \textrm{Tanh} (\vectorn{\emph{W}}_c [\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}] + \vectorn{\emph{b}}_c )
\label{eq:10-14}
\end{eqnarray}
之后,用$\vectorn{i}_t$点乘$\hat{\vectorn{c}}_t$,得到当前需要记忆的信息,记为$\vectorn{i}_t \cdot \hat{\vectorn{c}}_t$。接下来需要更新旧的信息$\vectorn{c}_{t-1}$,得到新的记忆信息$\vectorn{c}_t$,更新的操作如图\ref{fig:10-15}(c)红色线部分所示,“$\bigoplus$”表示相加。具体规则是通过遗忘门选择忘记一部分上文信息$\vectorn{f}_t$,通过输入门计算新增的信息$\vectorn{i}_t \cdot \hat{\vectorn{c}}_t$,然后根据“$\bigotimes$”门与“$\bigoplus$”门进行相应的乘法和加法计算:
之后,用$\vectorn{\emph{i}}_t$点乘$\hat{\vectorn{\emph{c}}}_t$,得到当前需要记忆的信息,记为$\vectorn{\emph{i}}_t \cdot \hat{\vectorn{\emph{c}}}_t$。接下来需要更新旧的信息$\vectorn{\emph{c}}_{t-1}$,得到新的记忆信息$\vectorn{\emph{c}}_t$,更新的操作如图\ref{fig:10-15}(c)红色线部分所示,“$\bigoplus$”表示相加。具体规则是通过遗忘门选择忘记一部分上文信息$\vectorn{\emph{f}}_t$,通过输入门计算新增的信息$\vectorn{\emph{i}}_t \cdot \hat{\vectorn{\emph{c}}}_t$,然后根据“$\bigotimes$”门与“$\bigoplus$”门进行相应的乘法和加法计算:
\begin{eqnarray}
\vectorn{c}_t = \vectorn{f}_t \cdot \vectorn{c}_{t-1} + \vectorn{i}_t \cdot \hat{\vectorn{c}_t}
\vectorn{\emph{c}}_t = \vectorn{\emph{f}}_t \cdot \vectorn{\emph{c}}_{t-1} + \vectorn{\emph{i}}_t \cdot \hat{\vectorn{\emph{c}}_t}
\label{eq:10-15}
\end{eqnarray}
\vspace{-1.0em}
\item {\small\sffamily\bfseries{输出}}\index{输出}。该部分使用输出门计算最终的输出信息$\vectorn{h}_t$,其结构如图\ref{fig:10-15}(d)红色线部分所示。在输出门中,首先将$\vectorn{x}_t$$\vectorn{h}_{t-1}$通过$\sigma$函数变换得到$\vectorn{o}_t$。其次,将上一步得到的新记忆信息$\vectorn{c}_t$通过Tanh函数进行变换,得到值在[-1,1]范围的向量。最后将这两部分进行点乘,具体公式如下:
\item {\small\sffamily\bfseries{输出}}\index{输出}。该部分使用输出门计算最终的输出信息$\vectorn{\emph{h}}_t$,其结构如图\ref{fig:10-15}(d)红色线部分所示。在输出门中,首先将$\vectorn{\emph{x}}_t$$\vectorn{\emph{h}}_{t-1}$通过$\sigma$函数变换得到$\vectorn{\emph{o}}_t$。其次,将上一步得到的新记忆信息$\vectorn{\emph{c}}_t$通过Tanh函数进行变换,得到值在[-1,1]范围的向量。最后将这两部分进行点乘,具体公式如下:
\begin{eqnarray}
\vectorn{o}_t & = & \sigma (\vectorn{W}_o [\vectorn{h}_{t-1},\vectorn{x}_{t}] + \vectorn{b}_o ) \label{eq:10-16} \\
\vectorn{h}_t & = & \vectorn{o}_t \cdot \textrm{Tanh} (\vectorn{c}_t) \label{eq:6-17}
\vectorn{\emph{o}}_t & = & \sigma (\vectorn{\emph{W}}_o [\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}] + \vectorn{\emph{b}}_o ) \label{eq:10-16} \\
\vectorn{\emph{h}}_t & = & \vectorn{\emph{o}}_t \cdot \textrm{Tanh} (\vectorn{\emph{c}}_t) \label{eq:6-17}
\end{eqnarray}
\vspace{0.5em}
\end{itemize}
......@@ -576,15 +667,15 @@ $\funp{P}({y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}})$由Softmax实现,Sof
\end{figure}
%----------------------------------------------
\parinterval LSTM的完整结构如图\ref{fig:10-16}所示,模型的参数包括:参数矩阵$\vectorn{W}_f$$\vectorn{W}_i$$\vectorn{W}_c$\\$\vectorn{W}_o$和偏置$\vectorn{b}_f$$\vectorn{b}_i$$\vectorn{b}_c$$\vectorn{b}_o$。可以看出,$\vectorn{h}_t$是由$\vectorn{c}_{t-1}$$\vectorn{h}_{t-1}$$\vectorn{x}_t$共同决定的。此外,上述公式中激活函数的选择是根据函数各自的特点决定的。
\parinterval LSTM的完整结构如图\ref{fig:10-16}所示,模型的参数包括:参数矩阵$\vectorn{\emph{W}}_f$$\vectorn{\emph{W}}_i$$\vectorn{\emph{W}}_c$\\$\vectorn{\emph{W}}_o$和偏置$\vectorn{\emph{b}}_f$$\vectorn{\emph{b}}_i$$\vectorn{\emph{b}}_c$$\vectorn{\emph{b}}_o$。可以看出,$\vectorn{\emph{h}}_t$是由$\vectorn{\emph{c}}_{t-1}$$\vectorn{\emph{h}}_{t-1}$$\vectorn{\emph{x}}_t$共同决定的。此外,上述公式中激活函数的选择是根据函数各自的特点决定的。
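
\parinterval 把上述各个门的公式组合起来,可以得到如下LSTM单元单步计算的示意代码(基于NumPy)。这里假设先对$[\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_t]$进行拼接再做线性变换,因此各权重矩阵的形状为“拼接后的维度$\times$隐层维度”,这只是众多实现方式中的一种:
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, Wf, Wi, Wc, Wo, bf, bi, bc, bo):
    z = np.concatenate([h_prev, x_t])    # [h_{t-1}, x_t]的拼接
    f_t = sigmoid(z @ Wf + bf)           # 遗忘门
    i_t = sigmoid(z @ Wi + bi)           # 输入门
    c_hat = np.tanh(z @ Wc + bc)         # 候选记忆
    c_t = f_t * c_prev + i_t * c_hat     # 记忆更新
    o_t = sigmoid(z @ Wo + bo)           # 输出门
    h_t = o_t * np.tanh(c_t)             # 当前时刻的输出
    return h_t, c_t
\end{verbatim}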
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{门控循环单元(GRU)}
\subsubsection{3. 门控循环单元(GRU)}
\parinterval LSTM 通过门控单元控制传递状态,忘记不重要的信息,记住必要的历史信息,在长序列上取得了很好的效果,但是其进行了许多门信号的计算,较为繁琐。{\small\bfnew{门循环单元}}\index{门循环单元}(Gated Recurrent Unit,GRU)\index{Gated Recurrent Unit,GRU}作为一个LSTM的变种,它继承了LSTM中利用门控单元控制信息传递的思想,并对LSTM进行了简化\upcite{Cho2014Learning}。它把循环单元状态$\vectorn{h}_t$和记忆$\vectorn{c}_t$合并成一个状态$\vectorn{h}_t$,同时使用了更少的门控单元,大大提升了计算效率。
\parinterval LSTM 通过门控单元控制传递状态,忘记不重要的信息,记住必要的历史信息,在长序列上取得了很好的效果,但是其进行了许多门信号的计算,较为繁琐。{\small\bfnew{门循环单元}}\index{门循环单元}(Gated Recurrent Unit,GRU)\index{Gated Recurrent Unit,GRU}作为一个LSTM的变种,它继承了LSTM中利用门控单元控制信息传递的思想,并对LSTM进行了简化\upcite{Cho2014Learning}。它把循环单元状态$\vectorn{\emph{h}}_t$和记忆$\vectorn{\emph{c}}_t$合并成一个状态$\vectorn{\emph{h}}_t$,同时使用了更少的门控单元,大大提升了计算效率。
%----------------------------------------------
\begin{figure}[htp]
......@@ -597,33 +688,33 @@ $\funp{P}({y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}})$由Softmax实现,Sof
\end{figure}
%----------------------------------------------
\parinterval GRU的输入和RNN是一样的,由输入$\vectorn{x}_t$$t-1$时刻的状态$\vectorn{h}_{t-1}$组成。GRU只有两个门信号,分别是重置门和更新门。重置门$\vectorn{r}_t$用来控制前一时刻隐藏状态的记忆程度,其结构如图\ref{fig:10-17}(a)。更新门用来更新记忆,使用一个门同时完成遗忘和记忆两种操作,其结构如图\ref{fig:10-17}(b)。重置门和更新门的计算公式如下:
\parinterval GRU的输入和RNN是一样的,由输入$\vectorn{\emph{x}}_t$$t-1$时刻的状态$\vectorn{\emph{h}}_{t-1}$组成。GRU只有两个门信号,分别是重置门和更新门。重置门$\vectorn{\emph{r}}_t$用来控制前一时刻隐藏状态的记忆程度,其结构如图\ref{fig:10-17}(a)。更新门用来更新记忆,使用一个门同时完成遗忘和记忆两种操作,其结构如图\ref{fig:10-17}(b)。重置门和更新门的计算公式如下:
\begin{eqnarray}
\vectorn{r}_t & = &\sigma (\vectorn{W}_r [\vectorn{h}_{t-1},\vectorn{x}_{t}] ) \label{eq:10-18} \\
\vectorn{u}_t & = & \sigma (\vectorn{W}_u [\vectorn{h}_{t-1},\vectorn{x}_{t}]) \label{eq:10-19}
\vectorn{\emph{r}}_t & = &\sigma (\vectorn{\emph{W}}_r [\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}] ) \label{eq:10-18} \\
\vectorn{\emph{u}}_t & = & \sigma (\vectorn{\emph{W}}_u [\vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}]) \label{eq:10-19}
\end{eqnarray}
\parinterval 当完成了重置门和更新门计算后,就需要更新当前隐藏状态,如图\ref{fig:10-17}(c)所示。在计算得到了重置门的权重$\vectorn{r}_t$后,使用其对前一时刻的状态$\vectorn{h}_{t-1}$进行重置($\vectorn{r}_t \cdot \vectorn{h}_{t-1}$),将重置后的结果与$\vectorn{x}_t$拼接,通过Tanh激活函数将数据变换到[-1,1]范围内:
\parinterval 当完成了重置门和更新门计算后,就需要更新当前隐藏状态,如图\ref{fig:10-17}(c)所示。在计算得到了重置门的权重$\vectorn{\emph{r}}_t$后,使用其对前一时刻的状态$\vectorn{\emph{h}}_{t-1}$进行重置($\vectorn{\emph{r}}_t \cdot \vectorn{\emph{h}}_{t-1}$),将重置后的结果与$\vectorn{\emph{x}}_t$拼接,通过Tanh激活函数将数据变换到[-1,1]范围内:
\begin{eqnarray}
\hat{\vectorn{h}}_t = \textrm{Tanh} (\vectorn{W}_h [\vectorn{r}_t \cdot \vectorn{h}_{t-1},\vectorn{x}_{t}])
\hat{\vectorn{\emph{h}}}_t = \textrm{Tanh} (\vectorn{\emph{W}}_h [\vectorn{\emph{r}}_t \cdot \vectorn{\emph{h}}_{t-1},\vectorn{\emph{x}}_{t}])
\label{eq:10-20}
\end{eqnarray}
\parinterval $\hat{\vectorn{h}}_t$在包含了输入信息$\vectorn{x}_t$的同时,引入了$\vectorn{h}_{t-1}$的信息,可以理解为,记忆了当前时刻的状态。下一步是计算更新后的隐藏状态也就是更新记忆,如下所示:
\parinterval $\hat{\vectorn{\emph{h}}}_t$在包含了输入信息$\vectorn{\emph{x}}_t$的同时,引入了$\vectorn{\emph{h}}_{t-1}$的信息,可以理解为,记忆了当前时刻的状态。下一步是计算更新后的隐藏状态也就是更新记忆,如下所示:
\begin{eqnarray}
\vectorn{h}_t = (1-\vectorn{u}_t) \cdot \vectorn{h}_{t-1} +\vectorn{u}_t \cdot \hat{\vectorn{h}}_t
\vectorn{\emph{h}}_t = (1-\vectorn{\emph{u}}_t) \cdot \vectorn{\emph{h}}_{t-1} +\vectorn{\emph{u}}_t \cdot \hat{\vectorn{\emph{h}}}_t
\label{eq:10-21}
\end{eqnarray}
\noindent 这里,$\vectorn{u}_t$是更新门中得到的权重,将$\vectorn{u}_t$作用于$\hat{\vectorn{h}}_t$表示对当前时刻的状态进行“遗忘”,舍弃一些不重要的信息,将$(1-\vectorn{u}_t)$作用于$\vectorn{h}_{t-1}$,用于对上一时刻隐藏状态进行选择性记忆。
\noindent 这里,$\vectorn{\emph{u}}_t$是更新门中得到的权重,将$\vectorn{\emph{u}}_t$作用于$\hat{\vectorn{\emph{h}}}_t$表示对当前时刻的状态进行“遗忘”,舍弃一些不重要的信息,将$(1-\vectorn{\emph{u}}_t)$作用于$\vectorn{\emph{h}}_{t-1}$,用于对上一时刻隐藏状态进行选择性记忆。
\parinterval GRU的输入输出和RNN类似,其采用与LSTM类似的门控思想,达到捕获长距离依赖信息的目的。此外,GRU比LSTM少了一个门结构,而且参数只有$\vectorn{W}_r$$\vectorn{W}_u$$\vectorn{W}_h$。因此,GRU具有比LSTM高的运算效率,在系统研发中也经常被使用。
\parinterval GRU的输入输出和RNN类似,其采用与LSTM类似的门控思想,达到捕获长距离依赖信息的目的。此外,GRU比LSTM少了一个门结构,而且参数只有$\vectorn{\emph{W}}_r$、$\vectorn{\emph{W}}_u$和$\vectorn{\emph{W}}_h$。因此,GRU具有比LSTM更高的运算效率,在系统研发中也经常被使用。
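
\parinterval 与LSTM的示意代码类似,GRU单元的单步计算可以写成如下形式(基于NumPy的简化示意,权重矩阵形状同样假设为拼接后的维度乘以隐层维度):
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_step(x_t, h_prev, Wr, Wu, Wh):
    z = np.concatenate([h_prev, x_t])
    r_t = sigmoid(z @ Wr)                # 重置门
    u_t = sigmoid(z @ Wu)                # 更新门
    h_hat = np.tanh(np.concatenate([r_t * h_prev, x_t]) @ Wh)  # 候选状态
    return (1.0 - u_t) * h_prev + u_t * h_hat                  # 状态更新
\end{verbatim}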
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{双向模型}
\subsubsection{4. 双向模型}
\parinterval 前面提到的循环神经网络都是自左向右运行的,也就是说在处理一个单词的时候只能访问它前面的序列信息。但是,只根据句子的前文来生成一个序列的表示是不全面的,因为从最后一个词来看,第一个词的信息可能已经很微弱了。为了同时考虑前文和后文的信息,一种解决办法是使用双向循环网络,其结构如图\ref{fig:10-18}所示。这里,编码器可以看作有两个循环神经网络,第一个网络,即红色虚线框里的网络,从句子的右边进行处理,第二个网络从句子左边开始处理,最终将正向和反向得到的结果都融合后传递给解码器。
......@@ -642,7 +733,7 @@ $\funp{P}({y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}})$由Softmax实现,Sof
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{多层循环神经网络}
\subsubsection{5. 多层循环神经网络}
\parinterval 实际上,对于单词序列所使用的循环神经网络是一种很“深”的网络,因为从第一个单词到最后一个单词需要经过至少句子长度相当层数的神经元。比如,一个包含几十个词的句子也会对应几十个神经元层。但是,在很多深度学习应用中,更习惯把对输入序列的同一种处理作为“一层”。比如,对于输入序列,构建一个RNN,那么这些循环单元就构成了网络的“一层”。当然,这里并不是要混淆概念。只是要明确,在随后的讨论中,“层”并不是指一组神经元的全连接,它一般指的是网络的拓扑结构。
......@@ -658,36 +749,36 @@ $\funp{P}({y_j | \vectorn{s}_{j-1} ,y_{j-1},\vectorn{C}})$由Softmax实现,Sof
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 10.3.3
% NEW SUB-SECTION 10.3.4
%----------------------------------------------------------------------------------------
\vspace{-1.5em}
\subsection{训练}
\parinterval 第九章已经介绍了神经网络的训练方法。其中最常用的是基于梯度的方法,即:使用一定量样本进行神经网络的前向计算,之后进行反向计算,并得到所有参数的梯度信息,再使用下面的规则进行参数更新:
\begin{eqnarray}
\vectorn{w}_{step+1} = \vectorn{w}_{step} - \alpha \cdot \frac{ \partial L(\vectorn{w}_{step})} {\partial \vectorn{w}_{step} }
\vectorn{\emph{w}}_{step+1} = \vectorn{\emph{w}}_{step} - \alpha \cdot \frac{ \partial L(\vectorn{\emph{w}}_{step})} {\partial \vectorn{\emph{w}}_{step} }
\label{eq:10-30}
\end{eqnarray}
\noindent 其中,$\vectorn{w}_{step}$表示更新前的模型参数,$\vectorn{w}_{step+1}$表示更新后的模型参数,$L(\vectorn{w}_{step})$表示模型相对于$\vectorn{w}_{step}$的损失,$\frac{\partial L(\vectorn{w}_{step})} {\partial \vectorn{w}_{step} }$表示损失函数的梯度,$\alpha$是更新的步进值。也就是说,给定一定量的训练数据,不断执行公式\ref{eq:10-30}的过程。反复使用训练数据,直至模型参数达到收敛或者损失函数不再变化。通常,把公式的一次执行称为“一步”更新/训练,把访问完所有样本的训练称为“一轮”训练。
\noindent 其中,$\vectorn{\emph{w}}_{step}$表示更新前的模型参数,$\vectorn{\emph{w}}_{step+1}$表示更新后的模型参数,$L(\vectorn{\emph{w}}_{step})$表示模型相对于$\vectorn{\emph{w}}_{step}$的损失,$\frac{\partial L(\vectorn{\emph{w}}_{step})} {\partial \vectorn{\emph{w}}_{step} }$表示损失函数的梯度,$\alpha$是更新的步进值。也就是说,给定一定量的训练数据,不断执行公式\ref{eq:10-30}的过程。反复使用训练数据,直至模型参数达到收敛或者损失函数不再变化。通常,把公式的一次执行称为“一步”更新/训练,把访问完所有样本的训练称为“一轮”训练。
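
\parinterval 公式\ref{eq:10-30}所描述的更新过程在实现中非常直接。下面给出一个Python示意,其中训练循环里的前向、反向计算等接口只是假设,用来说明“一步”更新和“一轮”训练的含义:
\begin{verbatim}
def sgd_update(w, grad, alpha):
    # w_{step+1} = w_{step} - alpha * dL/dw
    return w - alpha * grad

# 示意性的训练循环(forward_backward、batches等均为假设的接口):
# for epoch in range(num_epochs):            # “一轮”训练
#     for batch in batches:                  # 每处理一个batch即“一步”更新
#         loss, grads = forward_backward(model, batch)
#         for name in grads:
#             model[name] = sgd_update(model[name], grads[name], alpha)
\end{verbatim}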
\parinterval 将公式\ref{eq:10-30}应用于神经机器翻译有几个基本问题需要考虑:1)损失函数的选择;2)参数初始化的策略,也就是如何设置$\vectorn{w}_0$;3)优化策略和学习率调整策略;4)训练加速。下面对这些问题进行讨论。
\parinterval 将公式\ref{eq:10-30}应用于神经机器翻译有几个基本问题需要考虑:1)损失函数的选择;2)参数初始化的策略,也就是如何设置$\vectorn{\emph{w}}_0$;3)优化策略和学习率调整策略;4)训练加速。下面对这些问题进行讨论。
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{1.损失函数}
\subsubsection{1. 损失函数}
\parinterval 因为神经机器翻译在每个目标语位置都会输出一个概率分布,表示这个位置上不同单词出现的可能性,因此需要知道当前位置输出的分布相比于标准答案的“损失”。对于这个问题,常用的是交叉熵损失函数。令$\vectorn{y}$表示机器翻译模型输出的分布,$\hat{\vectorn{y}}$ 表示标准答案,则交叉熵损失可以被定义为:
\parinterval 因为神经机器翻译在每个目标语位置都会输出一个概率分布,表示这个位置上不同单词出现的可能性,因此需要知道当前位置输出的分布相比于标准答案的“损失”。对于这个问题,常用的是交叉熵损失函数。令$\vectorn{\emph{y}}$表示机器翻译模型输出的分布,$\hat{\vectorn{\emph{y}}}$ 表示标准答案,则交叉熵损失可以被定义为:
\begin{eqnarray}
L_{\textrm{ce}}(\vectorn{y},\hat{\vectorn{y}}) = - \sum_{k=1}^{|V|} \vectorn{y}[k] \textrm{log} (\hat{\vectorn{y}}[k])
L_{\textrm{ce}}(\vectorn{\emph{y}},\hat{\vectorn{\emph{y}}}) = - \sum_{k=1}^{|V|} \hat{\vectorn{\emph{y}}}[k] \textrm{log} (\vectorn{\emph{y}}[k])
\label{eq:10-3222}
\end{eqnarray}
\noindent 其中$\vectorn{y}[k]$$\hat{\vectorn{y}}[k]$分别表示向量$\vectorn{y}$$\hat{\vectorn{y}}$的第$k$维,$|V|$表示输出向量的维度(等于词表大小)。对于一个模型输出的概率分布$\vectorn{Y} = \{ \vectorn{y}_1,\vectorn{y}_2,..., \vectorn{y}_n \}$和标准答案分布$\widehat{\vectorn{Y}}=\{ \hat{\vectorn{y}}_1, \hat{\vectorn{y}}_2,...,\hat{\vectorn{y}}_n \}$,损失函数可以被定义为:
\noindent 其中$\vectorn{\emph{y}}[k]$$\hat{\vectorn{\emph{y}}}[k]$分别表示向量$\vectorn{\emph{y}}$$\hat{\vectorn{\emph{y}}}$的第$k$维,$|V|$表示输出向量的维度(等于词表大小)。对于一个模型输出的概率分布$\vectorn{\emph{Y}} = \{ \vectorn{\emph{y}}_1,\vectorn{\emph{y}}_2,..., \vectorn{\emph{y}}_n \}$和标准答案分布$\widehat{\vectorn{\emph{Y}}}=\{ \hat{\vectorn{\emph{y}}}_1, \hat{\vectorn{\emph{y}}}_2,...,\hat{\vectorn{\emph{y}}}_n \}$,损失函数可以被定义为:
\begin{eqnarray}
L(\vectorn{Y},\widehat{\vectorn{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{y}_j,\hat{\vectorn{y}}_j)
L(\vectorn{\emph{Y}},\widehat{\vectorn{\emph{Y}}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{\emph{y}}_j,\hat{\vectorn{\emph{y}}}_j)
\label{eq:10-31}
\end{eqnarray}
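
\parinterval 下面用一小段NumPy代码示意交叉熵损失的计算过程。其中模型输出分布和标准答案(One-hot)均为假设的数值,词表大小取3、句长取2,仅用于说明按位置求和的方式:
\begin{verbatim}
import numpy as np

def cross_entropy(y_pred, y_gold):
    # 单个位置的损失:-sum_k y_gold[k] * log(y_pred[k])
    return -np.sum(y_gold * np.log(y_pred + 1e-12))

Y_pred = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.8, 0.1]])   # 模型在两个位置输出的分布
Y_gold = np.array([[1.0, 0.0, 0.0],
                   [0.0, 1.0, 0.0]])   # 对应位置的标准答案(One-hot)

loss = sum(cross_entropy(p, g) for p, g in zip(Y_pred, Y_gold))
print(round(loss, 3))                  # 约为 0.58
\end{verbatim}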
......@@ -699,7 +790,7 @@ L(\vectorn{Y},\widehat{\vectorn{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{y}_
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{2.参数初始化}
\subsubsection{2. 参数初始化}
\parinterval 神经网络的参数主要是各层中的线性变换矩阵和偏置。在训练开始时,需要对参数进行初始化。但是,由于神经机器翻译的网络结构复杂,因此损失函数往往不是凸函数,不同初始化会导致不同的优化结果。而且在大量实践中已经发现,神经机器翻译模型对初始化方式非常敏感,性能优异的系统往往需要特定的初始化方式。
......@@ -707,13 +798,13 @@ L(\vectorn{Y},\widehat{\vectorn{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{y}_
\begin{itemize}
\vspace{0.5em}
\item LSTM遗忘门偏置初始化为1,也就是始终选择遗忘记忆$\vectorn{c}$,这样可以有效防止初始化时$\vectorn{c}$里包含的错误信号传播到后面的所有时刻。
\item LSTM遗忘门偏置初始化为1,也就是始终选择遗忘记忆$\vectorn{\emph{c}}$,这样可以有效防止初始化时$\vectorn{\emph{c}}$里包含的错误信号传播到后面的所有时刻。
\vspace{0.5em}
\item 网络中的其他偏置一般都初始化为0,可以有效防止加入过大或过小的偏置后使得激活函数的输出跑到“饱和区”,也就是梯度接近0的区域,防止训练一开始就无法跳出局部极小的区域。
\vspace{0.5em}
\item 网络的权重矩阵$\vectorn{w}$一般使用Xavier参数初始化方法\upcite{pmlr-v9-glorot10a},可以有效稳定训练过程,特别是对于比较“深”的网络。令$d_{in}$$d_{out}$分别表示$\vectorn{w}$的输入和输出的维度大小,则该方法的具体实现如下:
\item 网络的权重矩阵$\vectorn{\emph{w}}$一般使用Xavier参数初始化方法\upcite{pmlr-v9-glorot10a},可以有效稳定训练过程,特别是对于比较“深”的网络。令$d_{in}$$d_{out}$分别表示$\vectorn{\emph{w}}$的输入和输出的维度大小,则该方法的具体实现如下:
\begin{eqnarray}
\vectorn{w} \sim U(-\sqrt{ \frac{6} { d_{in} + d_{out} } } , \sqrt{ \frac{6} { d_{in} + d_{out} } })
\vectorn{\emph{w}} \sim U(-\sqrt{ \frac{6} { d_{in} + d_{out} } } , \sqrt{ \frac{6} { d_{in} + d_{out} } })
\label{eq:10-32}
\vspace{0.5em}
\end{eqnarray}
......@@ -725,7 +816,7 @@ L(\vectorn{Y},\widehat{\vectorn{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{y}_
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\vspace{-2.5em}
\subsubsection{3.优化策略}
\subsubsection{3. 优化策略}
%\vspace{0.5em}
\parinterval 公式\ref{eq:10-30}展示了最基本的优化策略,也被称为标准的SGD优化器。实际上,训练神经机器翻译模型时,还有非常多的优化器可以选择,在第九章也有详细介绍,这里考虑Adam优化器。 Adam 通过对梯度的{\small\bfnew{一阶矩估计}}\index{一阶矩估计}(First Moment Estimation)\index{First Moment Estimation}{\small\bfnew{二阶矩估计}}\index{二阶矩估计}(Second Moment Estimation)\index{Second Moment Estimation}进行综合考虑,计算出更新步长。
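
\parinterval 为了便于理解,这里给出Adam更新规则的一个简化实现示意(基于NumPy)。其中各超参数取常见的默认值,且省略了工程实现中的各种细节,并不代表实际系统中的完整实现:
\begin{verbatim}
import numpy as np

def adam_update(w, grad, m, v, t,
                alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * grad          # 一阶矩的滑动平均
    v = beta2 * v + (1 - beta2) * grad * grad   # 二阶矩的滑动平均
    m_hat = m / (1 - beta1 ** t)                # 偏差修正
    v_hat = v / (1 - beta2 ** t)
    w = w - alpha * m_hat / (np.sqrt(v_hat) + eps)
    return w, m, v
\end{verbatim}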
......@@ -750,24 +841,24 @@ L(\vectorn{Y},\widehat{\vectorn{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{y}_
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\vspace{-2.5em}
\subsubsection{4.梯度裁剪}
\subsubsection{4. 梯度裁剪}
%\vspace{0.5em}
\parinterval 需要注意的是,训练循环神经网络时,反向传播使得网络层之间的梯度重复相乘,在网络层数过深时,如果连乘因子小于1可能造成梯度指数级的减少,甚至趋近于0,导致网络无法优化,也就是梯度消失问题。当连乘因子大于1时,可能会导致梯度的乘积变得异常大,造成梯度爆炸的问题。在这种情况下需要使用“梯度裁剪”来防止梯度超过阈值。梯度裁剪在第九章已经介绍过,这里简单回顾一下。梯度裁剪的具体公式如下:
\vspace{-0.5em}
\begin{eqnarray}
\vectorn{w}' = \vectorn{w} \cdot \frac{\gamma} {\textrm{max}(\gamma,\| \vectorn{w} \|_2)}
\vectorn{\emph{w}}' = \vectorn{\emph{w}} \cdot \frac{\gamma} {\textrm{max}(\gamma,\| \vectorn{\emph{w}} \|_2)}
\label{eq:10-33}
\end{eqnarray}
%\vspace{0.5em}
\noindent 其中$\gamma$是手工设定的梯度大小阈值, $\| \cdot \|_2$是L2范数,$\vectorn{w}'$表示梯度裁剪后的参数。这个公式的含义在于只要梯度大小超过阈值,就按照阈值与当前梯度大小的比例进行放缩。
\noindent 其中$\gamma$是手工设定的梯度大小阈值, $\| \cdot \|_2$是L2范数,$\vectorn{\emph{w}}'$表示裁剪后的梯度。这个公式的含义在于只要梯度大小超过阈值,就按照阈值与当前梯度大小的比例进行放缩。
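
\parinterval 按照公式\ref{eq:10-33},梯度裁剪可以用如下几行代码实现(基于NumPy的示意,数值仅为假设):
\begin{verbatim}
import numpy as np

def clip_grad(grad, gamma):
    norm = np.linalg.norm(grad, 2)            # 梯度的L2范数
    return grad * (gamma / max(gamma, norm))

g = np.array([3.0, 4.0])                      # L2范数为5
print(clip_grad(g, 1.0))                      # [0.6 0.8],被缩放到范数为1
print(clip_grad(g, 10.0))                     # [3. 4.],未超过阈值,保持不变
\end{verbatim}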
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{5.学习率策略}
\subsubsection{5. 学习率策略}
\vspace{0.5em}
\parinterval 在公式\ref{eq:10-30}中, $\alpha$决定了每次参数更新时更新的步幅大小,称之为{\small\bfnew{学习率}}\index{学习率}(Learning Rate)\index{Learning Rate}。学习率作为基于梯度方法中的重要超参数,它决定目标函数能否收敛到较好的局部最优点以及收敛的速度。合理的学习率能够使模型快速、稳定地达到较好的状态。但是,如果学习率太小,收敛过程会很慢;而学习率太大,则模型的状态可能会出现震荡,很难达到稳定,甚至使模型无法收敛。图\ref{fig:10-28} 对比了不同学习率对优化过程的影响。
......@@ -807,7 +898,7 @@ L(\vectorn{Y},\widehat{\vectorn{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{y}_
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
%\vspace{0.5em}
\subsubsection{6.并行训练}
\subsubsection{6. 并行训练}
%\vspace{0.5em}
\parinterval 机器翻译是自然语言处理中很“重”的任务。因为数据量巨大而且模型较为复杂,模型训练的时间往往很长。比如,使用一千万句的训练数据,性能优异的系统往往需要几天甚至一周的时间。更大规模的数据会导致训练时间更长。特别是使用多层网络同时增加模型容量时(比如增加隐层宽度时),神经机器翻译的训练会更加缓慢。对于这个问题,一个思路是从模型训练算法上进行改进。比如前面提到的Adam就是一种高效的训练策略。另一种思路是利用多设备进行加速,也称作分布式训练。
......@@ -842,7 +933,7 @@ L(\vectorn{Y},\widehat{\vectorn{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{y}_
\end{figure}
%----------------------------------------------
\vspace{-0.5em}
\vspace{0.5em}
\item {\small\bfnew{模型并行}}\index{模型并行}。另一种思路是,把较大的模型分成若干小模型,之后在不同设备上训练小模型。对于循环神经网络,不同层的网络天然就是一个相对独立的模型,因此非常适合使用这种方法。比如,对于$l$层的循环神经网络,把每层都看做一个小模型,然后分发到$l$个设备上并行计算。在序列较长的时候,该方法使其运算时间变为原来的${1}/{l}$。图\ref{fig:10-31}以三层循环网络为例展示了对句子“你\ \ 不错\ 。”进行模型并行的过程。其中,每一层网络都被放到了一个设备上。当模型根据已经生成的第一个词“你”预测下一个词时(图\ref{fig:10-31}(a)),同层的下一个时刻的计算和对“你”的第二层的计算就可以同时开展(图\ref{fig:10-31}(b))。以此类推,就完成了模型的并行计算。
\vspace{0.5em}
\end{itemize}
......@@ -852,7 +943,7 @@ L(\vectorn{Y},\widehat{\vectorn{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\vectorn{y}_
\centering
\begin{tabular}{l l}
\subfigure[]{\input{./Chapter10/Figures/figure-process01}} &\subfigure[]{\input{./Chapter10/Figures/figure-process02}} \\
%\subfigure[]{\input{./Chapter10/Figures/figure-process03}} &\subfigure[]{\input{./Chapter10/Figures/figure-process04}} \\
\subfigure[]{\input{./Chapter10/Figures/figure-process03}} &\subfigure[]{\input{./Chapter10/Figures/figure-process04}} \\
%\subfigure[]{\input{./Chapter10/Figures/figure-process05}} &\subfigure[]{\input{./Chapter10/Figures/figure-process06}}\\
\end{tabular}
%\caption{一个三层循环神经网络的模型并行过程}
\centering
\begin{tabular}{l l}
%\subfigure[]{\input{./Chapter10/Figures/figure-process01}} &\subfigure[]{\input{./Chapter10/Figures/figure-process02}} \\
%\subfigure[]{\input{./Chapter10/Figures/figure-process03}} &\subfigure[]{\input{./Chapter10/Figures/figure-process04}} \\
\subfigure[]{\input{./Chapter10/Figures/figure-process05}} &\subfigure[]{\input{./Chapter10/Figures/figure-process06}}
\end{tabular}
\caption{一个三层循环神经网络的模型并行过程}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 10.3.5
%----------------------------------------------------------------------------------------
\subsection{推断}
\parinterval 神经机器翻译的推断是指:利用已经训练好的模型对新的源语言句子进行翻译的过程。具体来说,首先利用编码器生成源语言句子的表示,之后利用解码器预测目标语译文。也就是,对于源语言句子$\vectorn{\emph{x}}$,生成一个使翻译概率$\funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}})$最大的目标语译文$\hat{\vectorn{\emph{y}}}$,如下(详细过程见\ref{sec:10.3.1}节):
\begin{eqnarray}
\hat{\vectorn{\emph{y}}} & = & \argmax_{\vectorn{\emph{y}}} \funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}}) \nonumber \\
& = & \argmax_{\vectorn{\emph{y}}} \prod_{j=1}^n \funp{P}(y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}})
\label{eq:10-35}
\end{eqnarray}
\noindent 在具体实现时,由于当前目标语单词的生成需要依赖前面单词的生成,因此无法同时生成所有的目标语单词。理论上,可以枚举所有的$\vectorn{\emph{y}}$,之后利用$\funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}})$ 的定义对每个$\vectorn{\emph{y}}$进行评价,然后找出最好的$\vectorn{\emph{y}}$。这也被称作{\small\bfnew{全搜索}}\index{全搜索}(Full Search)\index{Full Search}。但是,枚举所有的译文单词序列显然是不现实的。因此,在具体实现时,并不会访问所有可能的译文单词序列,而是用某种策略进行有效的搜索。常用的做法是自左向右逐词生成。比如,对于每一个目标语位置$j$,可以执行
\begin{eqnarray}
\hat{y}_j = \argmax_{y_j} \funp{P}(y_j | \hat{\vectorn{\emph{y}}}_{<j} , \vectorn{\emph{x}})
\label{eq:10-36}
\end{eqnarray}
\noindent 其中,$\hat{y}_j$表示位置$j$概率最高的单词,$\hat{\vectorn{\emph{y}}}_{<j} = \{ \hat{y}_1,...,\hat{y}_{j-1} \}$表示已经生成的最优译文单词序列。也就是,把最优的译文看作是所有位置上最优单词的组合。显然,这是一种{\small\bfnew{贪婪搜索}}\index{贪婪搜索}(Greedy Search)\index{Greedy Search},因为无法保证$\{ \hat{y}_1,...,\hat{y}_{n} \}$是全局最优解。一种缓解这个问题的方法是,在每步中引入更多的候选。这里定义$\hat{y}_{jk} $ 表示在目标语第$j$个位置排名在第$k$位的单词。在每一个位置$j$,可以生成$K$个最可能的单词,而不是1个,这个过程可以被描述为
\begin{eqnarray}
\{ \hat{y}_{j1},...,\hat{y}_{jk} \} = \argmax_{ \{ \hat{y}_{j1},...,\hat{y}_{jk} \} }
\funp{P}(y_j | \{ \hat{\vectorn{\emph{y}}}_{<{j^{\textrm{*}}}} \},\vectorn{\emph{x}})
\label{eq:10-37}
\end{eqnarray}
\noindent 这里,$\{ \hat{y}_{j1},...,\hat{y}_{jk} \}$表示对于位置$j$翻译概率最大的前$K$个单词,$\{ \hat{\vectorn{\emph{y}}}_{<j^{\ast}} \}$表示前$j-1$步top-K单词组成的所有历史。${\hat{\vectorn{\emph{y}}}_{<j^{\ast}}}$可以被看作是一个集合,里面每一个元素都是一个目标语单词序列,这个序列是前面生成的一系列top-K单词的某种组成。$\funp{P}(y_j | \{ \hat{\vectorn{\emph{y}}}_{<{j^{\textrm{*}}}} \},\vectorn{\emph{x}})$表示基于\{$ \hat{\vectorn{\emph{y}}}_{<j^{\ast}} $\}的某一条路径生成$y_j$的概率\footnote{严格来说,$ \funp{P} (y_j | {\hat{\vectorn{\emph{y}}}_{<j^{\ast}} })$不是一个准确的数学表达,这里通过这种写法强调$y_j$是由\{$ \hat{\vectorn{\emph{y}}}_{<j^{\ast}} $\}中的某个译文单词序列作为条件生成的。} 。这种方法也被称为{\small\bfnew{束搜索}}\index{束搜索}(Beam Search)\index{Beam Search},意思是搜索时始终考虑一个集束内的候选。
\parinterval 不论是贪婪搜索还是束搜索都是一个自左向右的过程,也就是每个位置的处理需要等前面位置处理完才能执行。这是一种典型的{\small\bfnew{自回归模型}}\index{自回归模型}(Autoregressive Model)\index{Autoregressive Model},它通常用来描述时序上的随机过程,其中每一个时刻的结果对时序上其他部分的结果有依赖\upcite{NIPS2017_7181}。相对应的,也有{\small\bfnew{非自回归模型}}\index{非自回归模型}(Non-autoregressive Model)\index{Non-autoregressive Model},它消除了不同时刻结果之间的直接依赖\upcite{Gu2017NonAutoregressiveNM}。由于自回归模型是当今神经机器翻译主流的推断方法,这里仍以自回归的贪婪搜索和束搜索为基础进行讨论。
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\vspace{1.0em}
\subsubsection{1. 贪婪搜索}
\vspace{0.6em}
\parinterval\ref{fig:10-32}展示了一个基于贪婪方法的神经机器翻译解码过程。每一个时间步的单词预测都依赖于其前一步单词的生成。在解码第一个单词时,由于没有之前的单词信息,会用<sos>进行填充,作为起始的单词,且会用一个零向量(可以理解为没有之前时间步的信息)表示第0步的中间层状态。
\vspace{0.8em}
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{2. 束搜索}
\vspace{0.5em}
\parinterval 束搜索是一种启发式图搜索算法。相比于全搜索,它可以减少搜索所占用的空间和时间,在每一步扩展的时候,剪掉一些质量比较差的结点,保留下一些质量较高的结点。具体到机器翻译任务,对于每一个目标语位置,束搜索选择概率最大的前$K$个单词进行扩展(其中$K$叫做束宽度,或简称为束宽)。如图\ref{fig:10-34}所示,假设\{$y_1, y_2,..., y_n$\}表示生成的目标语序列,且$K=3$,则束搜索的具体过程为:在预测第一个位置时,可以通过模型得到$y_1$的概率分布,选取概率最大的前3个单词作为候选结果(假设分别为“have”, “has”, “it”)。在预测第二个位置的单词时,模型针对已经得到的三个候选结果(“have”, “has”, “it”)计算第二个单词的概率分布。例如,可以将“have”作为第二步的输入,计算$y_2$的概率分布。此时,译文序列的概率为:
\begin{eqnarray}
\funp{P} (y_2,y_1 | \vectorn{\emph{x}}) & = & \funp{P} (y_2, \textrm{“have”} | \vectorn{\emph{x}}) \nonumber \\
& = & \funp{P}(y_2 | \textrm{“have”} , \vectorn{\emph{x}}) \cdot \funp{P} (\textrm{“have”} | \vectorn{\emph{x}})
\label{eq:10-38}
\end{eqnarray}
\noindent 类似地,对“has”和“it”进行同样的操作,这样就分别得到了$ \funp{P} (y_2, \textrm{“have”} | \vectorn{\emph{x}})$、$ \funp{P} (y_2, \textrm{“has”} | \vectorn{\emph{x}})$和$ \funp{P} (y_2, \textrm{“it”} | \vectorn{\emph{x}})$。因为$y_2$对应$|V|$种可能,总共可以得到$3 \times |V|$种结果。然后从中选取使序列概率$\funp{P}(y_2,y_1| \vectorn{\emph{x}})$最大的前三个$y_2$作为新的输出结果,这样便得到了前两个位置的top-3译文。在预测其他位置时也是如此,不断重复此过程直到推断结束。可以看到,束搜索的搜索空间大小与束宽度有关,也就是:束宽度越大,搜索空间越大,更有可能搜索到质量更高的译文,但同时搜索也会更慢。束宽度等于3,意味着每次只考虑三个最有可能的结果;贪婪搜索实际上便是束宽度为1的特殊情况。在神经机器翻译系统实现中,一般束宽度设置在4$\sim$8之间。
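\parinterval 束搜索的核心步骤(扩展候选、累加对数概率、按得分剪枝)可以用如下简化代码来示意。与前面贪婪搜索的示例一样,这里的词表和predict\_next都是假设的“玩具”模型,仅用于说明流程:

{\small
\begin{verbatim}
import numpy as np

vocab = ["<eos>", "have", "has", "it", "you", "any"]   # 假设的小词表

def predict_next(prefix):
    # 玩具模型:以前缀长度为随机种子生成该位置的单词概率分布
    rng = np.random.default_rng(len(prefix))
    logits = rng.normal(size=len(vocab))
    return np.exp(logits) / np.exp(logits).sum()

def beam_search(K=3, max_len=6):
    beam = [([], 0.0)]          # 候选为(已生成的单词序列, 对数概率得分)
    finished = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beam:
            probs = predict_next(prefix)
            for wid, p in enumerate(probs):          # 扩展词表中的每个单词
                candidates.append((prefix + [vocab[wid]],
                                   score + float(np.log(p))))
        candidates.sort(key=lambda c: c[1], reverse=True)
        beam = []
        for prefix, score in candidates[:K]:         # 剪枝:只保留前K个候选
            if prefix[-1] == "<eos>":
                finished.append((prefix, score))
            else:
                beam.append((prefix, score))
        if not beam:
            break
    return finished if finished else beam

for prefix, score in beam_search():
    print(prefix, round(score, 3))
\end{verbatim}
}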
%----------------------------------------------
\begin{figure}[htp]
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{3. 长度惩罚}
\parinterval 这里用$ \funp{P} (\vectorn{\emph{y}} | \vectorn{\emph{x}}) = \prod_{j=1}^n \funp{P}(y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}}) $作为翻译模型。直接实现这个公式有一个明显的缺点:当句子过长时乘法运算容易产生数值下溢,也就是多个小于1的数相乘,结果可能小到浮点数无法表示。为了解决这个问题,可以利用对数操作将乘法转换为加法,得到新的计算方式:$\textrm{log } \funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}}) = \sum_{j=1}^n \textrm{log }\funp{P} (y_j | \vectorn{\emph{y}}_{<j}, \vectorn{\emph{x}}) $。由于对数函数不会改变函数的单调性,因此在具体实现时,通常用$\textrm{log }\funp{P} (\vectorn{\emph{y}} | \vectorn{\emph{x}})$表示句子的得分,而不用$\funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}})$。
\parinterval 不管是使用$\funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}})$还是$\textrm{log } \funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}})$计算句子得分,还面临两个问题:
\begin{itemize}
\vspace{0.5em}
\item $\funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}})$的范围是[0,1],如果句子过长,那么句子的得分就是很多个小于1的数相乘,或者说取log之后很多个小于0的数相加。这也就是说,句子的得分会随着长度的增加而变小,即模型倾向于生成短句子。
\vspace{0.5em}
\item 模型本身并没有考虑每个源语言单词被使用的程度,比如一个单词可能会被翻译很多“次”。这个问题在统计机器翻译中并不存在,因为所有词在翻译中必须被“覆盖”到。但是早期的神经机器翻译模型没有所谓覆盖度的概念,因此也无法保证每个单词被翻译的“程度”是合理的\upcite{li-etal-2018-simple,TuModeling}
\vspace{0.5em}
\end{itemize}
\parinterval 为了解决上面提到的问题,可以使用其他特征与$\textrm{log } \funp{P} (\vectorn{\emph{y}} | \vectorn{\emph{x}})$一起组成新的模型得分$\textrm{score} ( \vectorn{\emph{y}} , \vectorn{\emph{x}})$。针对模型倾向于生成短句子的问题,常用的做法是引入惩罚机制。比如,可以定义一个惩罚因子,形式如下:
\begin{eqnarray}
\textrm{lp}(\vectorn{\emph{y}}) = \frac {(5+ |\vectorn{\emph{y}}|)^{\alpha}} {(5+1)^{\alpha}}
\label{eq:10-39}
\end{eqnarray}
\noindent 其中,$|\vectorn{\emph{y}}|$代表已经得到的译文长度,$\alpha$是一个固定的常数,用于控制惩罚的强度。同时在计算句子得分时,额外引入表示覆盖度的因子,如下:
\begin{eqnarray}
\textrm{cp}(\vectorn{\emph{y}} , \vectorn{\emph{x}}) = \beta \cdot \sum_{i=1}^{|\vectorn{\emph{x}}|} \textrm{log} \big(\textrm{min}(\sum_j^{|\vectorn{\emph{y}}|} \alpha_{ij},1 ) \big)
\label{eq:10-40}
\end{eqnarray}
\noindent $\textrm{cp}(\cdot)$会惩罚把某些源语单词对应到很多目标语单词的情况(覆盖度),被覆盖的程度用$\sum_j^{|\vectorn{\emph{y}}|} \alpha_{ij}$度量。$\beta$也是需要经验性设置的超参数,用于对覆盖度惩罚的强度进行控制。
\parinterval 最终,模型得分定义如下:
\begin{eqnarray}
\textrm{score} ( \vectorn{\emph{y}} , \vectorn{\emph{x}}) = \frac{\textrm{log} \funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}})} {\textrm{lp}(\vectorn{\emph{y}})} + \textrm{cp}(\vectorn{\emph{y}} , \vectorn{\emph{x}})
\label{eq:10-41}
\end{eqnarray}
\noindent 显然,目标语译文$\vectorn{\emph{y}}$越短,$\textrm{lp}(\vectorn{\emph{y}})$的值就越小;由于$\textrm{log } \funp{P}(\vectorn{\emph{y}} | \vectorn{\emph{x}})$是负数,除以更小的$\textrm{lp}(\vectorn{\emph{y}})$会使句子得分$\textrm{score} ( \vectorn{\emph{y}} , \vectorn{\emph{x}})$变得更小。也就是说,模型会惩罚译文过短的结果。类似地,当某些源语言单词的覆盖度过低时,$\textrm{cp}(\vectorn{\emph{y}} , \vectorn{\emph{x}})$也会使得分变低。通过这样的惩罚机制,模型得分更为合理,从而帮助模型选择出质量更高的译文。
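\parinterval 把公式\ref{eq:10-39}至公式\ref{eq:10-41}结合起来,句子得分的计算可以用如下代码片段来示意。其中的注意力权重矩阵、log概率以及$\alpha$、$\beta$的取值都是随意假设的示例数值:

{\small
\begin{verbatim}
import numpy as np

def length_penalty(y_len, alpha=0.6):
    # 长度惩罚因子lp(y),alpha控制惩罚强度
    return (5.0 + y_len) ** alpha / (5.0 + 1.0) ** alpha

def coverage_penalty(attn, beta=0.2):
    # 覆盖度惩罚cp(y,x):attn[i][j]是目标语第j个位置
    # 对源语言第i个单词的注意力权重
    coverage = np.minimum(attn.sum(axis=1), 1.0)
    return beta * float(np.log(coverage).sum())

def sentence_score(log_prob, y_len, attn):
    # 模型得分score(y,x):长度归一化的log概率加上覆盖度惩罚
    return log_prob / length_penalty(y_len) + coverage_penalty(attn)

# 示例:译文长度为6,log P(y|x) = -4.2,注意力矩阵为|x|=5行、|y|=6列
attn = np.full((5, 6), 0.2)
print(sentence_score(-4.2, 6, attn))
\end{verbatim}
}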
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 10.3.6
%----------------------------------------------------------------------------------------
\subsection{实例-GNMT}
\vspace{0.5em}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SECTION 10.4
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{小结及深入阅读}
\vspace{0.5em}
\item 循环神经网络有很多变种结构。比如,除了RNN、LSTM、GRU,还有其他改进的循环单元结构,如LRN\upcite{DBLP:journals/corr/abs-1905-13324}、SRU\upcite{Lei2017TrainingRA}、ATR\upcite{Zhang2018SimplifyingNM}
\vspace{0.5em}
\item 一般来说,神经机器翻译的计算过程是没有人工干预的,翻译流程也无法用人类的知识直接进行解释,因此一个有趣的方向是在神经机器翻译中引入先验知识,使得机器翻译的行为更“像”人。比如,可以使用句法树来引入人类的语言学知识\upcite{Yang2017TowardsBH,Wang2019TreeTI},基于句法的神经机器翻译也包含大量的树结构的神经网络建模\upcite{DBLP:journals/corr/abs-1809-01854,DBLP:journals/corr/abs-1808-09374}。此外,也可以把用户定义的词典或者翻译记忆加入到翻译过程中来\upcite{DBLP:journals/corr/ZhangZ16c,Dai2019TransformerXLAL},使得用户的约束可以直接反映到机器翻译的结果上来。先验知识的种类还有很多,包括词对齐\upcite{li-etal-2019-word}、篇章信息\upcite{Werlen2018DocumentLevelNM,DBLP:journals/corr/abs-1805-10163}等等,都是神经机器翻译中能够使用的信息。
\vspace{0.5em}
\item 神经机器翻译依赖成本较高的GPU设备,因此对模型的裁剪和加速也是很多系统研发人员所感兴趣的方向。比如,从工程上,可以考虑减少运算强度,比如使用低精度浮点数\upcite{Ott2018ScalingNM}或者整数\upcite{DBLP:journals/corr/abs-1906-00532,Lin2020TowardsF8}进行计算,或者引入缓存机制来加速模型的推断;也可以通过对模型参数矩阵的剪枝\upcite{DBLP:journals/corr/SeeLM16},甚至对模块的剪枝\upcite{Zhang2018SpeedingUN},来减小整个模型的体积;另一种方法是知识精炼\upcite{kim-rush-2016-sequence},即利用大模型训练小模型,这样往往可以得到比单独训练小模型更好的效果\upcite{DBLP:journals/corr/ChenLCL17,Hinton2015Distilling}。
\vspace{0.5em}
\end{itemize}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=red!20!white] (e1) at (0,0) {\scriptsize{$\vectorn{\emph{e}}(\textrm{“沈阳”})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e2) at ([xshift=1em]e1.south east) {\scriptsize{$\vectorn{{\emph{e}}}(\textrm{“到”})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e3) at ([xshift=1em]e2.south east) {\scriptsize{$\vectorn{{\emph{e}}}(\textrm{“广州”})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e4) at ([xshift=1em]e3.south east) {\scriptsize{$\vectorn{{\emph{e}}}(\textrm{“的”})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e5) at ([xshift=1em]e4.south east) {\scriptsize{$\vectorn{\emph{e}}(\textrm{“机票”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h1) at ([yshift=1.5em]e1.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“沈阳”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h2) at ([yshift=1.5em]e2.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“到”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h3) at ([yshift=1.5em]e3.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“广州”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h4) at ([yshift=1.5em]e4.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“的”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h5) at ([yshift=1.5em]e5.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“机票”})$}};
\foreach \x in {1,2,3,4,5}{
\node [anchor=north] (plus\x) at ([yshift=-0em]e\x.south) {\scriptsize{$\mathbf{\oplus}$}};
}
\node [rnode,anchor=north,fill=yellow!20!white] (pos1) at ([yshift=-1.1em]e1.south) {\scriptsize{$\vectorn{{\emph{PE}}}(1)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos2) at ([yshift=-1.1em]e2.south) {\scriptsize{$ \vectorn{{\emph{PE}}}(2)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos3) at ([yshift=-1.1em]e3.south) {\scriptsize{$ \vectorn{{\emph{PE}}}(3)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos4) at ([yshift=-1.1em]e4.south) {\scriptsize{$ \vectorn{{\emph{PE}}}(4)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos5) at ([yshift=-1.1em]e5.south) {\scriptsize{$ \vectorn{{\emph{PE}}}(5)$}};
\foreach \x in {1,2,3,4,5}{
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (e\x) (pos\x)] (box\x) {};
}
\node [anchor=north] (inputs1) at ([yshift=-1em]pos1.south) {\scriptsize{沈阳}};
\node [anchor=north] (inputs2) at ([yshift=-1em]pos2.south) {\scriptsize{到}};
\node [anchor=north] (inputs3) at ([yshift=-1em]pos3.south) {\scriptsize{广州}};
\node [anchor=north] (inputs4) at ([yshift=-1em]pos4.south) {\scriptsize{的}};
\node [anchor=north] (inputs5) at ([yshift=-1em]pos5.south) {\scriptsize{机票}};
\draw [->] ([yshift=0.1em]e1.north) .. controls +(north:0.5) and +(south:0.5) .. ([xshift=-1em,yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e2.north) .. controls +(north:0.3) and +(south:0.6) .. ([xshift=-0.5em,yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e4.north) .. controls +(north:0.3) and +(south:0.6) .. ([xshift=0.5em,yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e5.north) .. controls +(north:0.5) and +(south:0.5) .. ([xshift=1em,yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]h2.south);
\draw [->] ([yshift=0.1em]e4.north) -- ([yshift=-0.1em]h4.south);
\draw [->] ([yshift=0.1em]e5.north) -- ([yshift=-0.1em]h5.south);
\foreach \x in {1,2,3,4,5}{
\draw [->] ([yshift=-0.1em]inputs\x.north) -- ([yshift=-0.2em]pos\x.south);
}
\node [anchor=north] (dot1) at ([xshift=0.4em,yshift=-0.2em]h1.south) {\tiny{...}};
\node [anchor=north] (dot2) at ([xshift=0.4em,yshift=-0.2em]h2.south) {\tiny{...}};
\node [anchor=north] (dot4) at ([xshift=-0.4em,yshift=-0.2em]h4.south) {\tiny{...}};
\node [anchor=north] (dot5) at ([xshift=-0.4em,yshift=-0.2em]h5.south) {\tiny{...}};
\end{scope}
\end{tikzpicture}
%
%---------------------------------------
\begin{tikzpicture}
\begin{scope}
%\newlength{\mystep}
%\setlength{\mystep}{1.6em}
\foreach \x in {1,2,...,6}
\node[] (s\x) at (\x * 1.6em,0) {};
\node [] (ws1) at (s1) {\scriptsize{这}};
\node [] (ws2) at (s2) {\scriptsize{是}};
\node [] (ws3) at (s3) {\scriptsize{一个}};
\node [] (ws4) at (s4) {\scriptsize{很长}};
\node [] (ws5) at (s5) {\scriptsize{的}};
\node [] (ws6) at (s6) {\scriptsize{句子}};
\foreach \x in {1,2,...,6}
\node[] (t\x) at (\x * 1.6em + 2.4in,0) {};
\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
\node [] (wt3) at ([yshift=-1pt]t3) {\scriptsize{a}};
\node [] (wt4) at ([yshift=-0.1em]t4) {\scriptsize{very}};
\node [] (wt5) at (t5) {\scriptsize{long}};
\node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};
\node [anchor=south west,fill=red!30,minimum width=1.6in,minimum height=1.5em] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
\node [anchor=west,fill=green!30,minimum height=1.5em] (representation) at ([xshift=1em]encoder.east) {\footnotesize{表示}};
\draw [->,thick] ([xshift=1pt]encoder.east)--([xshift=-1pt]representation.west);
\draw [->,thick] ([xshift=1pt]representation.east)--([xshift=-1pt]decoder.west);
\foreach \x in {1,2,...,6}
\draw[->] ([yshift=0.1em]s\x.north) -- ([yshift=1.2em]s\x.north);
\foreach \x in {1,2,...,5}
\draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
{
\draw [<->,ublue,thick] ([xshift=0.3em]ws4.south) .. controls +(-60:1) and +(south:1) .. (wt4.south);
\draw [<->,ublue,thick] (ws4.south) .. controls +(south:1.0) and +(south:1.5) .. (wt5.south);
}
{
\node [anchor=north,fill=green!30] (attentionlabel) at ([yshift=-3.4em]representation.south) {\footnotesize{词语的关注度}};
\draw [->,dotted,very thick,ublue] ([yshift=0.1em]attentionlabel.north)--([yshift=-0.1em]representation.south);
}
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=green!20!white] (key1) at (0,0) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``沈阳''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([xshift=1em]key1.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``到''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([xshift=1em]key2.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``广州''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([xshift=2em]key3.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``机票''})$}};
\node [rnode,anchor=south west] (key5) at ([xshift=1em]key4.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``机票''})$}};
\node [anchor=west] (sep1) at ([xshift=0.3em]key3.east) {\scriptsize{$\textbf{...}$}};
\draw [->] ([yshift=1pt,xshift=-3pt]key5.north) .. controls +(90:1em) and +(90:0.7em) .. ([yshift=1pt]key4.north);
\draw [->] ([yshift=1pt,xshift=0pt]key5.north) .. controls +(90:1.4em) and +(90:1.4em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=3pt]key5.north) .. controls +(90:1.8em) and +(90:1.8em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt,xshift=6pt]key5.north) .. controls +(90:2.2em) and +(90:2.2em) .. ([yshift=1pt]key1.north);
\node [anchor=south west] (alpha1) at ([xshift=-1em]key1.north west) {\scriptsize{$\alpha_1=.2$}};
\node [anchor=south west] (alpha2) at ([xshift=-1em]key2.north west) {\scriptsize{$\alpha_2=.3$}};
\node [anchor=south west] (alpha3) at ([xshift=-1em]key3.north west) {\scriptsize{$\alpha_3=.1$}};
\node [anchor=south west] (alpha4) at ([xshift=-1em]key4.north west) {\scriptsize{$\alpha_4=.3$}};
\vspace{0.5em}
\node [rnode,anchor=south west,fill=green!20!white] (key6) at ([yshift=2em]key1.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``广州''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key7) at ([yshift=2em]key2.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``到''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key8) at ([yshift=2em]key3.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``沈阳''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key9) at ([yshift=2em]key4.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``机票''})$}};
\node [rnode,anchor=south west] (key10) at ([yshift=2em]key5.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{``机票''})$}};
\node [anchor=west] (sep1) at ([xshift=0.3em]key8.east) {\scriptsize{$\textbf{...}$}};
\draw [->] ([yshift=1pt,xshift=-3pt]key10.north) .. controls +(90:1em) and +(90:0.7em) .. ([yshift=1pt]key9.north);
\draw [->] ([yshift=1pt,xshift=0pt]key10.north) .. controls +(90:1.4em) and +(90:1.4em) .. ([yshift=1pt]key8.north);
\draw [->] ([yshift=1pt,xshift=3pt]key10.north) .. controls +(90:1.8em) and +(90:1.8em) .. ([yshift=1pt]key7.north);
\draw [->] ([yshift=1pt,xshift=6pt]key10.north) .. controls +(90:2.2em) and +(90:2.2em) .. ([yshift=1pt]key6.north);
\node [anchor=south west] (alpha5) at ([xshift=-1em]key6.north west) {\scriptsize{$\alpha_1=.1$}};
\node [anchor=south west] (alpha6) at ([xshift=-1em]key7.north west) {\scriptsize{$\alpha_2=.3$}};
\node [anchor=south west] (alpha7) at ([xshift=-1em]key8.north west) {\scriptsize{$\alpha_3=.2$}};
\node [anchor=south west] (alpha8) at ([xshift=-1em]key9.north west) {\scriptsize{$\alpha_4=.3$}};
\end{scope}
\end{tikzpicture}
\vspace{-1.0em}
\footnotesize{
\begin{eqnarray}
\tilde{\vectorn{\emph{h}}} (\textrm{``机票''}) & = & 0.2 \times \vectorn{\emph{h}}(\textrm{``沈阳''}) + 0.3 \times \vectorn{\emph{h}}(\textrm{``到''}) + \nonumber \\
& & 0.1 \times \vectorn{\emph{h}}(\textrm{``广州''}) + ... + 0.3 \times \vectorn{\emph{h}}(\textrm{``机票''}) \nonumber
\end{eqnarray}
}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\vectorn{\emph{h}}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\vectorn{\emph{h}}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\vectorn{\emph{h}}_m$}};
\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east);
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\vectorn{\emph{s}}_{j-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\vectorn{\emph{s}}_{j}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.2,left] {\scriptsize{$\alpha_{1,j}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\scriptsize{$\alpha_{2,j}$}};
\draw [->] (h4.north) .. controls +(north:0.8) and +(east:1) .. (sum.-10) node [pos=0.1,left] (alphan) {\scriptsize{$\alpha_{m,j}$}};
\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\vectorn{\emph{C}}_{j}$}};
\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\scriptsize{输出层}};
\draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
\node [anchor=north] (enc1) at (h1.south west) {\scriptsize{编码器输出}};
\node [anchor=north] (enc12) at ([yshift=0.5em]enc1.south) {\scriptsize{(位置$1$)}};
\node [anchor=north] (enc2) at (h2.south) {\scriptsize{编码器输出}};
\node [anchor=north] (enc22) at ([yshift=0.5em]enc2.south) {\scriptsize{(位置$2$)}};
\node [anchor=north] (enc4) at (h4.south) {\scriptsize{编码器输出}};
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\scriptsize{(位置$4$)}};
{
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\vectorn{\emph{C}}_j = \sum_{i} \alpha_{i,j} \vectorn{\emph{h}}_i \ \ $};
}
{
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{i'} \exp(\beta_{i',j})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\vectorn{\emph{s}}_{j-1}, \vectorn{\emph{h}}_i)$};
}
\begin{pgfonlayer}{background}
{
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=blue!10,drop shadow] [fit = (math1)] (box1) {};
}
{
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=orange!10,drop shadow] [fit = (math2) (math3)] (box2) {};
}
\end{pgfonlayer}
{
\draw [->,dotted,thick,blue] (box1.west) .. controls +(west:1.2) and +(east:2.0) .. ([xshift=-0.3em]ci.east);
}
{
\draw [->,dotted,thick,orange] ([yshift=1em]box2.west) .. controls +(west:1.2) and +(east:1.0) .. ([xshift=-0.35em]alphan.east);
}
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}[scale=1.5]
{\Large
\tikzstyle{snode} = [draw,inner sep=1pt,minimum width=3em,minimum height=0.5em,rounded corners=1pt,fill=green!30!white]
\tikzstyle{pnode} = [draw,inner sep=1pt,minimum width=1em,minimum height=0.5em,rounded corners=1pt]
\node [anchor=west,snode] (s1) at (0,0) {};
\node [anchor=north west,snode,minimum width=6.5em] (s2) at ([yshift=-0.3em]s1.south west) {};
\node [anchor=north west,snode,minimum width=2em] (s3) at ([yshift=-0.3em]s2.south west) {};
\node [anchor=east] (label1) at ([xshift=-0.8em,yshift=0.6em]s1.west) {\scriptsize{Shuffled:}};
\node [anchor=west,pnode,minimum width=3em] (p1) at ([xshift=0.3em]s1.east) {};
\node [anchor=west,pnode,minimum width=4em] (p3) at ([xshift=0.3em]s3.east) {};
\node [anchor=west,snode,minimum width=5em] (s4) at ([xshift=4em]p1.east) {};
\node [anchor=north west,snode,minimum width=5em] (s5) at ([yshift=-0.3em]s4.south west) {};
\node [anchor=north west,snode,minimum width=6.5em] (s6) at ([yshift=-0.3em]s5.south west) {};
\node [anchor=east] (label2) at ([xshift=-0.8em,yshift=0.6em]s4.west) {\scriptsize{Sorted:}};
\node [anchor=west,pnode,minimum width=1em] (p4) at ([xshift=0.3em]s4.east) {};
\node [anchor=west,pnode,minimum width=1em] (p5) at ([xshift=0.3em]s5.east) {};
\node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (s1) (s3) (p1) (p3)] (box0) {};
\node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (s4) (s6) (p4) (p5)] (box0) {};
}
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\vectorn{\emph{h}}_1$}};
\node [rnnnode,anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{$\vectorn{\emph{h}}_2$}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\vectorn{\emph{h}}_3$}};
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e2) at ([xshift=1em]e1.east) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
%\node [anchor=south] (dot1) at ([xshift=0.4em,yshift=-0.7em]h1.south) {\tiny{...}};
%\node [anchor=south] (dot2) at ([xshift=-0.4em,yshift=-0.7em]h3.south) {\tiny{...}};
\draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] (w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]h2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
\draw [->] ([xshift=0.2em,yshift=0.1em]e1.north) .. controls +(north:0.3) and +(south:0.4) .. ([xshift=-0.3em,yshift=-0.1em]h2.south);
\draw [->] ([xshift=-0.2em,yshift=0.1em]e3.north) .. controls +(north:0.3) and +(south:0.4) .. ([xshift=0.3em,yshift=-0.1em]h2.south);
\draw [->] ([xshift=0.4em,yshift=-0.4em]h1.south) -- ([xshift=0.3em,yshift=-0.1em]h1.south);
\draw [->] ([xshift=0.8em,yshift=-0.4em]h1.south) -- ([xshift=0.6em,yshift=-0.1em]h1.south);
\draw [->] ([xshift=-0.4em,yshift=-0.4em]h3.south) -- ([xshift=-0.3em,yshift=-0.1em]h3.south);
\draw [->] ([xshift=-0.8em,yshift=-0.4em]h3.south) -- ([xshift=-0.6em,yshift=-0.1em]h3.south);
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};
{
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]e3.east) {\tiny{$e_y()$}};
}
{
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.5em]t1.east) {\tiny{$e_y()$}};
}
{
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.5em]t2.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.5em]t3.east) {\tiny{$e_y()$}};
%\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
}
{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\vectorn{\emph{s}}_1$}};
\node [rnnnode,anchor=south] (f1) at ([yshift=1em]s1.north) {\tiny{$\vectorn{\emph{f}}_1$}};
}
{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\vectorn{\emph{s}}_2$}};
\node [rnnnode,anchor=south] (f2) at ([yshift=1em]s2.north) {\tiny{$\vectorn{\emph{f}}_2$}};
}
{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\vectorn{\emph{s}}_3$}};
\node [rnnnode,anchor=south] (f3) at ([yshift=1em]s3.north) {\tiny{$\vectorn{\emph{f}}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\vectorn{\emph{s}}_4$}};
\node [rnnnode,anchor=south] (f4) at ([yshift=1em]s4.north) {\tiny{$\vectorn{\emph{f}}_4$}};
%\node [anchor=west,inner sep=2pt] (s5) at ([xshift=0.3em]s4.east) {\tiny{...}};
%\node [anchor=south] (dot3) at ([xshift=-0.4em,yshift=-0.7em]s3.south) {\tiny{...}};
\node [anchor=south] (dot4) at ([xshift=-0.4em,yshift=-0.7em]s4.south) {\tiny{...}};
}
{
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]f1.north) {\tiny{softmax}};
\node [anchor=east] (decoder) at ([xshift=-0.3em,yshift=0.5em]o1.north west) {\scriptsize{\textbf{解码器}}};
}
{
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]f2.north) {\tiny{softmax}};
}
{
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]f3.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o4) at ([yshift=1em]f4.north) {\tiny{softmax}};
%\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
}
{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{$\langle$eos$\rangle$}};
}
{
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{How}};
}
{
\node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{are}};
\node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.8em]t4.south) {\tiny{you}};
}
{
\node [anchor=center,inner sep=2pt] (wo1) at ([yshift=1.2em]o1.north) {\tiny{How}};
}
{
\node [anchor=south,inner sep=2pt] (wos1) at (wo1.north) {\tiny{\textbf{[step 1]}}};
}
{
\node [anchor=center,inner sep=2pt] (wo2) at ([yshift=1.2em]o2.north) {\tiny{are}};
}
{
\node [anchor=south,inner sep=2pt] (wos2) at (wo2.north) {\tiny{\textbf{[step 2]}}};
}
{
\node [anchor=center,inner sep=2pt] (wo3) at ([yshift=1.2em]o3.north) {\tiny{you}};
\node [anchor=south,inner sep=2pt] (wos3) at (wo3.north) {\tiny{\textbf{[step 3]}}};
\node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{$\langle$eos$\rangle$}};
\node [anchor=south,inner sep=2pt] (wos4) at (wo4.north) {\tiny{\textbf{[step 4]}}};
}
{
\foreach \x in {1}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]f\x.south);
\draw [->] ([yshift=0.1em]f\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
}
}
{
\foreach \x in {2}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]f\x.south);
\draw [->] ([yshift=0.1em]f\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
\draw [->] ([xshift=0.2em,yshift=0.1em]t1.north) .. controls +(north:0.3) and +(south:0.3) .. ([xshift=-0.3em,yshift=-0.1em]s2.south);
}
}
{
\foreach \x in {3,4}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]f\x.south);
\draw [->] ([yshift=0.1em]f\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
%\draw [->] ([xshift=0.4em,yshift=0.1em]t1.north) .. controls +(north:0.25) and +(south:0.3) .. ([xshift=-0.6em,yshift=-0.1em]s3.south);
%\draw [->] ([xshift=0.2em,yshift=0.1em]t2.north) .. controls +(north:0.2) and +(south:0.4) .. ([xshift=-0.3em,yshift=-0.1em]s3.south);
\draw [->] ([xshift=-0.6em,yshift=-0.5em]s3.south) .. controls +(north:0) and +(south:0.2) .. ([xshift=-0.3em,yshift=-0.1em]s3.south);
\draw [->] ([xshift=-1.5em,yshift=-0.5em]s3.south) .. controls +(north:0) and +(south:0.15) .. ([xshift=-0.6em,yshift=-0.1em]s3.south);
}
}
{
\draw [->,thick,dotted] (wo1.east) .. controls +(east:1.0) and +(west:1.0) ..(wt2.west);
}
{
\draw [->,thick,dotted] (wo2.east) .. controls +(east:1.3) and +(west:1.1) ..(wt3.west);
\draw [->,thick,dotted] (wo3.east) .. controls +(east:1.1) and +(west:0.9) ..(wt4.west);
}
{
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c1) at ([yshift=2em]h2.north) {\tiny{$\vectorn{\emph{C}}_1$}};
\node [anchor=south] (c1label) at (c1.north) {\tiny{\textbf{编码-解码注意力机制:上下文}}};
\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c1.250);
\draw [->] (h2.north) .. controls +(north:0.6) and +(270:0.9) .. (c1.270);
\draw [->] (h3.north) .. controls +(north:0.6) and +(290:0.9) .. (c1.290);
\draw [->] ([yshift=0.3em]s1.west) .. controls +(west:1) and +(east:1) .. (c1.-30);
\draw [->] (c1.0) .. controls +(east:1) and +(west:1) .. ([yshift=0em]f1.west);
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c2) at ([yshift=-2em]t1.south) {\tiny{$\vectorn{\emph{C}}_2$}};
\draw [->] ([xshift=-0.7em]c2.west) -- ([xshift=-0.1em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f2.west);
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\vectorn{\emph{C}}_3$}};
\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f3.west);
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\vectorn{\emph{C}}_4$}};
\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f4.west);
}
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w1.north).. controls +(130:0.5) and +(50:0.5) .. (w0.north);
\draw [->,thick,red] (w2.north).. controls +(130:0.5) and +(50:0.5) .. (w1.north);
\draw [->,thick,red] ([yshift=0.2em]w3.north).. controls +(130:0.5) and +(50:0.5) .. (w2.north);
\draw [->,thick,red] (w4.north).. controls +(130:0.5) and +(50:0.5) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,-2) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w5.north).. controls +(100:0.85) and +(50:0.85) .. (w0.north);
\draw [->,thick,red] (w5.north).. controls +(110:0.75) and +(50:0.75) .. (w1.north);
\draw [->,thick,red] (w5.north).. controls +(120:0.6) and +(50:0.6) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{lnode} = [minimum height=1.5em,minimum width=3em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{standard} = [rounded corners=3pt]
\node [lnode,anchor=west] (l1) at (0,0) {\scriptsize{子层$n$}};
\node [lnode,anchor=west] (l2) at ([xshift=3em]l1.east) {\scriptsize{层正则化}};
\node [lnode,anchor=west] (l3) at ([xshift=4em]l2.east) {\scriptsize{层正则化}};
\node [lnode,anchor=west] (l4) at ([xshift=1.5em]l3.east) {\scriptsize{子层$n$}};
\node [anchor=west] (plus1) at ([xshift=0.9em]l1.east) {\scriptsize{$\mathbf{\oplus}$}};
\node [anchor=west] (plus2) at ([xshift=0.9em]l4.east) {\scriptsize{$\mathbf{\oplus}$}};
\node [anchor=north] (label1) at ([xshift=3em,yshift=-0.5em]l1.south) {\scriptsize{(a)后正则化}};
\node [anchor=north] (label2) at ([xshift=3em,yshift=-0.5em]l3.south) {\scriptsize{(b)前正则化}};
\draw [->,thick] ([xshift=-1.5em]l1.west) -- ([xshift=-0.1em]l1.west);
\draw [->,thick] ([xshift=0.1em]l1.east) -- ([xshift=0.2em]plus1.west);
\draw [->,thick] ([xshift=-0.2em]plus1.east) -- ([xshift=-0.1em]l2.west);
\draw [->,thick] ([xshift=0.1em]l2.east) -- ([xshift=1em]l2.east);
\draw [->,thick] ([xshift=-1.5em]l3.west) -- ([xshift=-0.1em]l3.west);
\draw [->,thick] ([xshift=0.1em]l3.east) -- ([xshift=-0.1em]l4.west);
\draw [->,thick] ([xshift=0.1em]l4.east) -- ([xshift=0.2em]plus2.west);
\draw [->,thick] ([xshift=-0.2em]plus2.east) -- ([xshift=1em]plus2.east);
\draw[->,standard,thick] ([xshift=-0.8em]l1.west) -- ([xshift=-0.8em,yshift=2em]l1.west) -- ([yshift=2em]plus1.center) -- ([yshift=-0.2em]plus1.north);
\draw[->,standard,thick] ([xshift=-0.8em]l3.west) -- ([xshift=-0.8em,yshift=2em]l3.west) -- ([yshift=2em]plus2.center) -- ([yshift=-0.2em]plus2.north);
\end{scope}
\end{tikzpicture}
%---------------------------------------------------------
\begin{tikzpicture}
%\setlength{\mystep}{1.6em}
%%% a simple encoder-decoder model
\begin{scope}
\foreach \x in {1,2,...,6}
\node[] (s\x) at (\x * 1.6em,0) {};
\node [] (ws1) at (s1) {\scriptsize{这}};
\node [] (ws2) at (s2) {\scriptsize{是}};
\node [] (ws3) at (s3) {\scriptsize{一个}};
\node [] (ws4) at (s4) {\scriptsize{很长}};
\node [] (ws5) at (s5) {\scriptsize{的}};
\node [] (ws6) at (s6) {\scriptsize{句子}};
\foreach \x in {1,2,...,6}
\node[] (t\x) at (\x * 1.6em + 2.4in,0) {};
\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
\node [] (wt3) at ([yshift=-1pt]t3) {\scriptsize{a}};
\node [] (wt4) at ([yshift=-0.1em]t4) {\scriptsize{very}};
\node [] (wt5) at (t5) {\scriptsize{long}};
\node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};
\node [anchor=south west,fill=red!30,minimum width=1.6in,minimum height=1.5em] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
\node [anchor=west,fill=green!30,minimum height=1.5em] (representation) at ([xshift=1em]encoder.east) {\footnotesize{表示}};
\draw [->,thick] ([xshift=1pt]encoder.east)--([xshift=-1pt]representation.west);
\draw [->,thick] ([xshift=1pt]representation.east)--([xshift=-1pt]decoder.west);
\foreach \x in {1,2,...,6}
\draw[->] ([yshift=0.1em]s\x.north) -- ([yshift=1.2em]s\x.north);
\foreach \x in {1,2,...,5}
\draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\node [anchor=north] (cap) at ([xshift=2em,yshift=-2.5em]encoder.south east) {\small{(a) 简单的编码器-解码器框架}};
\end{scope}
%%% a encoder-decoder model with attention
\begin{scope}[yshift=-1.7in]
\foreach \x in {1,2,...,6}
\node[] (s\x) at (\x * 1.6em,0) {};
\node [] (ws1) at (s1) {\scriptsize{这}};
\node [] (ws2) at (s2) {\scriptsize{是}};
\node [] (ws3) at (s3) {\scriptsize{一个}};
\node [] (ws4) at (s4) {\scriptsize{很长}};
\node [] (ws5) at (s5) {\scriptsize{的}};
\node [] (ws6) at (s6) {\scriptsize{句子}};
\foreach \x in {1,2,...,6}
\node[] (t\x) at (\x * 1.6em + 2.4in,0) {};
\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
\node [] (wt3) at ([yshift=-1pt]t3) {\scriptsize{a}};
\node [] (wt4) at ([yshift=-0.1em]t4) {\scriptsize{very}};
\node [] (wt5) at (t5) {\scriptsize{long}};
\node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};
\node [anchor=south west,fill=red!30,minimum width=1.6in,minimum height=1.5em] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
\foreach \x in {1,2,...,6}
\draw[->] ([yshift=0.1em]s\x.north) -- ([yshift=1.2em]s\x.north);
\foreach \x in {1,2,...,5}
\draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$\vectorn{\emph{C}}_1$}} -- ([yshift=3em]t1.north) ;
\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$\vectorn{\emph{C}}_2$}} -- ([yshift=3em]t2.north) ;
\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$\vectorn{\emph{C}}_i$}} -- ([yshift=3.5em]t4.north) ;
\node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...};
\node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...};
\node [anchor=north] (cap) at ([xshift=2em,yshift=-2.5em]encoder.south east) {\small{(b) 引入注意力机制的编码器-解码器框架}};
\end{scope}
\end{tikzpicture}
%-------------------------------------------
\begin{tikzpicture}
%\newlength{\mystep}
%\newlength{\wseg}
%\newlength{\hseg}
%\newlength{\wnode}
%\newlength{\hnode}
\setlength{\wseg}{1.5cm}
\setlength{\hseg}{1.0cm}
\setlength{\wnode}{3.75cm}
\setlength{\hnode}{1.0cm}
\tikzstyle{elementnode} = [rectangle,text=white,anchor=center]
\tikzstyle{srcnode} = [rotate=45,font=\small,anchor=south west]
\tikzstyle{tgtnode} = [left,font=\small,anchor=north east]
\tikzstyle{alignmentnode} = [rectangle,draw,minimum height=3.6\hnode,minimum width=0.36\hnode]
\tikzstyle{probnode} = [fill=blue!30,minimum width=0.4\hnode]
\tikzstyle{labelnode} = [above]
% alignment matrix
\begin{scope}[scale=0.9,yshift=0.12in]
\foreach \i / \j / \c in
{0/7/0.2, 1/7/0.45, 2/7/0.15, 3/7/0.15, 4/7/0.15, 5/7/0.15,
0/6/0.35, 1/6/0.45, 2/6/0.15, 3/6/0.15, 4/6/0.15, 5/6/0.15,
0/5/0.25, 1/5/0.15, 2/5/0.15, 3/5/0.35, 4/5/0.15, 5/5/0.15,
0/4/0.15, 1/4/0.25, 2/4/0.2, 3/4/0.30, 4/4/0.15, 5/4/0.15,
0/3/0.15, 1/3/0.15, 2/3/0.8, 3/3/0.25, 4/3/0.15, 5/3/0.25,
0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.25, 5/2/0.3,
0/1/0.15, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.8, 5/1/0.15,
0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.60}
\node[elementnode,minimum size=0.6*\hnode*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*\hnode*\i-5.4*0.5*\hnode,0.5*\hnode*\j-1.05*\hnode) {};
%attention score labels
\node[align=center] (l17) at (a17) {\scriptsize{{\color{white} .4}}};
\node[align=center] (l26) at (a06) {\scriptsize{{\color{white} .3}}};
\node[align=center] (l26) at (a16) {\scriptsize{{\color{white} .4}}};
\node[align=center] (l17) at (a35) {\scriptsize{{\color{white} .3}}};
\node[align=center] (l17) at (a34) {\tiny{{\color{white} .3}}};
\node[align=center] (l17) at (a23) {\small{{\color{white} .8}}};
\node[align=center] (l17) at (a41) {\small{{\color{white} .8}}};
\node[align=center] (l17) at (a50) {\small{{\color{white} .7}}};
% source
\node[srcnode] (src1) at (-5.4*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{Have}};
\node[srcnode] (src2) at ([xshift=0.5\hnode]src1.south west) {\scriptsize{you}};
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{你}};
\node[tgtnode] (tgt2) at ([yshift=-0.5\hnode]tgt1.north east) {\scriptsize{什么}};
\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{都}};
\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{没}};
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{学}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{到}};
\node[tgtnode] (tgt7) at ([yshift=-0.5\hnode]tgt6.north east) {\scriptsize{?}};
\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{$\langle$eos$\rangle$}};
\end{scope}
%\visible<2->
{
% alignment rectangle 2
\node[alignmentnode, ugreen, anchor=north west] (alignment1) at ([xshift=-0.3em,yshift=0.4em]a07.north west) {};
}
%\visible<3->
{
% alignment rectangle 1
\node[alignmentnode, red, anchor=north west] (alignment2) at ([xshift=-0.1em,yshift=0.2em]a17.north west) {};
}
%\visible<3->
{
% alignment bars 2
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn21) at ([xshift=2.3\hnode,yshift=0.5\hnode]alignment2.east) {};
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn22) at ([xshift=1pt]attn21.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn23) at ([xshift=1pt]attn22.south east) {};
\node[probnode,anchor=south west,minimum height=0.1\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.1$}] (attn24) at ([xshift=1pt]attn23.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn25) at ([xshift=1pt]attn24.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$...$}] (attn26) at ([xshift=1pt]attn25.south east) {};
}
%\visible<2->
{
% alignment bars 1
\node[probnode,anchor=south,minimum height=0.2\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.2$}] (attn11) at ([xshift=2.5\hnode,yshift=-1em]alignment2.north east) {};
\node[probnode,anchor=south west,minimum height=0.3\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.3$}] (attn12) at ([xshift=1pt]attn11.south east) {};
\node[probnode,anchor=south west,minimum height=0.2\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.2$}] (attn13) at ([xshift=1pt]attn12.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0$}] (attn14) at ([xshift=1pt]attn13.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0$}] (attn15) at ([xshift=1pt]attn14.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$...$}] (attn16) at ([xshift=1pt]attn15.south east) {};
}
%\visible<3->
{
% coverage score formula node
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\vectorn{\emph{C}}_j$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\vectorn{\emph{C}}_2=0.4 \times \vectorn{\emph{h}}(\textrm{“你”}) + 0.4 \times \vectorn{\emph{h}}(\textrm{“什么”}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \vectorn{\emph{h}}(\textrm{“都”}) + 0.1 \times \vectorn{\emph{h}}(\textrm{“没”}) + ...$}};
}
%\visible<3->
{
% matrix -> attn2
\draw[->,red] ([xshift=0.1em,yshift=2.3em]alignment2.east).. controls +(east:1.9cm) and +(west:1.0cm) ..([xshift=-0.15\hnode,yshift=-1em]attn21.north west);
}
%\visible<2->
{
\draw[->,ugreen] ([xshift=0.1em,yshift=-1.2em]alignment1.north east)--([xshift=2.2\hnode,yshift=-1.2em]alignment2.north east);
}
%\visible<3->
{
% attn2 -> cov2
\draw[->] ([xshift=0.2\hnode,yshift=0.0\hnode]attn26.east)--([xshift=0.7\hnode,yshift=0]attn26.east) node[pos=0.5,above] (sum2) {\small{$\sum$}}; % 0.3 - 0.5 height of the
}
%\visible<2->
{
% attn1 -> cov1
\draw[->] ([xshift=0.2\hnode]attn16.east)--([xshift=0.7\hnode]attn16.east) node[pos=0.5,above] (sum1) {\small{$\sum$}};
}
% coverage score for each source word
%\visible<2->
{
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$\vectorn{\emph{C}}_1 = \sum_{i=1}^{8} \alpha_{i1} \vectorn{\emph{h}}_{i}$};
}
%\visible<3->
{
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$\vectorn{\emph{C}}_2 = \sum_{i=1}^{8} \alpha_{i2} \vectorn{\emph{h}}_{i}$};
}
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=2.8em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=green!20!white] (key11) at (0,0) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“你”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key12) at ([xshift=0.8em]key11.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“什么”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key13) at ([xshift=0.8em]key12.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“也”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key14) at ([xshift=0.8em]key13.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“没”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key15) at ([xshift=0.8em]key14.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“学”})$}};
\node [rnode,anchor=east] (query1) at ([xshift=-1em]key11.west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“你”})$}};
\draw [->] ([yshift=1pt,xshift=4pt]query1.north) .. controls +(90:0.6em) and +(90:0.6em) .. ([yshift=1pt]key11.north);
\draw [->] ([yshift=1pt,xshift=0pt]query1.north) .. controls +(90:1.0em) and +(90:1.0em) .. ([yshift=1pt]key12.north);
\draw [->] ([yshift=1pt,xshift=-4pt]query1.north) .. controls +(90:1.4em) and +(90:1.4em) .. ([yshift=1pt]key13.north);
\draw [->] ([yshift=1pt,xshift=-8pt]query1.north) .. controls +(90:1.8em) and +(90:1.8em) .. ([yshift=1pt]key14.north);
\draw [->] ([yshift=1pt,xshift=-12pt]query1.north) .. controls +(90:2.2em) and +(90:2.2em) .. ([yshift=1pt]key15.north);
\node [anchor=south west] (alpha11) at ([xshift=0.3em]key11.north) {\scriptsize{$\alpha_1$}};
\node [anchor=south west] (alpha12) at ([xshift=0.3em]key12.north) {\scriptsize{$\alpha_2$}};
\node [anchor=south west] (alpha13) at ([xshift=0.3em]key13.north) {\scriptsize{$\alpha_3$}};
\node [anchor=south west] (alpha14) at ([xshift=0.3em]key14.north) {\scriptsize{$\alpha_4$}};
\node [anchor=south west] (alpha15) at ([xshift=0.3em]key15.north) {\scriptsize{$\alpha_5$}};
\end{scope}
\end{tikzpicture}
\ No newline at end of file
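% The figure above scores the word “你” against every position of the same
% sentence; assuming the usual weighted sum over the scored positions (the
% left-hand symbol and w_i below are illustrative names, not labels from the
% figure), the resulting representation would be
\[ \widetilde{\vectorn{\emph{h}}}(\textrm{“你”}) = \sum_{i=1}^{5} \alpha_i \, \vectorn{\emph{h}}(w_i) \]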
\begin{tikzpicture}
\footnotesize{
\begin{axis}[
width=.60\textwidth,
height=.40\textwidth,
legend style={at={(0.60,0.08)}, anchor=south west},
xlabel={\footnotesize{更新步数 (10k)}},
ylabel={\footnotesize{学习率 ({\scriptsize $10^{-3}$})}},
ylabel style={yshift=-1em},xlabel style={yshift=0.0em},
yticklabel style={/pgf/number format/precision=2,/pgf/number format/fixed zerofill},
ymin=0,ymax=0.9, ytick={0.2, 0.4, 0.6, 0.8},
xmin=0,xmax=12,xtick={2,4,6,8,10},
legend style={yshift=-6pt, legend plot pos=right,font=\scriptsize,cells={anchor=west}}
]
\addplot[orange,line width=1.25pt] coordinates {(0,0) (4,0.7) (5,0.63) (6,0.57) (7,0.525) (8,0.49) (9,0.465) (10,0.44) (11,0.42) (12,0.4)};
\end{axis}
}
\end{tikzpicture}
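% The curve above rises linearly during a warmup phase (up to roughly 40k
% updates) and then decays with the inverse square root of the update number.
% A schedule with this shape that is commonly used for Transformer training is
% sketched below; the exact constants (d_model, warmup and any overall scaling)
% are assumptions and are not given by the plot:
\[ lrate = d_{\textrm{model}}^{-0.5} \cdot \min \big( step^{-0.5},\ step \cdot warmup^{-1.5} \big) \]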
\begin{tikzpicture}
\begin{scope}
\tikzstyle{attnode} = [minimum size=1.5em,inner sep=0pt,rounded corners=1pt,draw]
\tikzstyle{srcnode} = [rotate=45,font=\small,anchor=south west]
\tikzstyle{tgtnode} = [left,font=\small,anchor=north east]
\tikzstyle{masknode} = [minimum size=5.8em,inner sep=0pt,rounded corners=1pt,draw]
\tikzstyle{elementnode} = [rectangle,text=white,anchor=center]
%\setlength{\hnode}{1.0cm}
%\node [anchor=west,attnode] (node1) at (0,0) {\tiny{}};
%\node [anchor=west,attnode] (node2) at ([xshift=1em]node1.east) {\tiny{}};
{
\foreach \i / \j / \c in
{0/5/0.25, 1/5/0.15, 2/5/0.15, 3/5/0.35, 4/5/0.25, 5/5/0.15,
0/4/0.15, 1/4/0.25, 2/4/0.2, 3/4/0.30, 4/4/0.15, 5/4/0.15,
0/3/0.15, 1/3/0.15, 2/3/0.5, 3/3/0.25, 4/3/0.15, 5/3/0.25,
0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.25, 5/2/0.3,
0/1/0.25, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.5, 5/1/0.15,
0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.40}
\node[elementnode,minimum size=0.6*1.0cm*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*1.0cm*\i-5.4*0.5*1.0cm,0.5*1.0cm*\j-1.05*1.0cm) {};
% source
\node[srcnode] (src1) at (-5.4*0.5*1.0cm,-1.05*1.0cm+5.5*0.5*1.0cm) {\scriptsize{Have}};
\node[srcnode] (src2) at ([xshift=0.5cm]src1.south west) {\scriptsize{you}};
\node[srcnode] (src3) at ([xshift=0.5cm]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5cm]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5cm]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5cm]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (-6.0*0.5*1.0cm,-1.05*1.0cm+5.5*0.5*1.0cm) {\scriptsize{Have}};
\node[tgtnode] (tgt2) at ([yshift=-0.5cm]tgt1.north east) {\scriptsize{you}};
\node[tgtnode] (tgt3) at ([yshift=-0.5cm]tgt2.north east) {\scriptsize{learned}};
\node[tgtnode] (tgt4) at ([yshift=-0.5cm]tgt3.north east) {\scriptsize{nothing}};
\node[tgtnode] (tgt5) at ([yshift=-0.5cm]tgt4.north east) {\scriptsize{?}};
\node[tgtnode] (tgt6) at ([yshift=-0.5cm]tgt5.north east) {\scriptsize{$\langle$eos$\rangle$}};
{
\filldraw [fill=blue!20,draw,thick,fill opacity=0.85] ([xshift=-0.9em,yshift=0.5em]a15.north west) -- ([xshift=0.5em,yshift=-0.9em]a51.south east) -- ([xshift=0.5em,yshift=0.5em]a55.north east) -- ([xshift=-0.9em,yshift=0.5em]a15.north west);
\node[anchor=west] (labelmask) at ([xshift=0.3em,yshift=0.5em]a23.north east) {Masked};
}
{
\foreach \i / \j / \c in
{0/5/0.25,
0/4/0.15, 1/4/0.25,
0/3/0.15, 1/3/0.15, 2/3/0.5,
0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15,
0/1/0.25, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.5,
0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.40}
\node[elementnode,minimum size=0.6*1.0cm*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*1.0cm*\i+6*0.5*1.0cm,0.5*1.0cm*\j-1.05*1.0cm) {};
% source
\node[srcnode] (src1) at (6*0.5*1.0cm,-1.05*1.0cm+5.5*0.5*1.0cm) {\scriptsize{Have}};
\node[srcnode] (src2) at ([xshift=0.5cm]src1.south west) {\scriptsize{you}};
\node[srcnode] (src3) at ([xshift=0.5cm]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5cm]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5cm]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5cm]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (5.4*0.5*1.0cm,-1.05*1.0cm+5.5*0.5*1.0cm) {\scriptsize{Have}};
\node[tgtnode] (tgt2) at ([yshift=-0.5cm]tgt1.north east) {\scriptsize{you}};
\node[tgtnode] (tgt3) at ([yshift=-0.5cm]tgt2.north east) {\scriptsize{learned}};
\node[tgtnode] (tgt4) at ([yshift=-0.5cm]tgt3.north east) {\scriptsize{nothing}};
\node[tgtnode] (tgt5) at ([yshift=-0.5cm]tgt4.north east) {\scriptsize{?}};
\node[tgtnode] (tgt6) at ([yshift=-0.5cm]tgt5.north east) {\scriptsize{$\langle$eos$\rangle$}};
}
}
\end{scope}
\end{tikzpicture}
\ No newline at end of file
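% In the right-hand matrix above, the shaded triangle marks the masked region:
% each target position only attends to itself and to earlier positions. A
% common way to realize this (assumed here, consistent with the Mask term that
% appears later in this section) is to add, before the Softmax, a matrix with
\[ \textrm{Mask}_{ij} = 0 \ \ \textrm{if}\ j \le i , \qquad \textrm{Mask}_{ij} = -\infty \ \ \textrm{if}\ j > i \]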
%-------------------------------------------
\begin{tikzpicture}
%\setlength{\hnode}{1.2cm}
\tikzstyle{elementnode} = [rectangle,text=white,anchor=center]
\tikzstyle{srcnode} = [rotate=45,font=\small,anchor=south west]
\tikzstyle{tgtnode} = [left,font=\small,anchor=north east]
\tikzstyle{alignmentnode} = [rectangle,draw,minimum height=3.6cm,minimum width=0.36cm]
\tikzstyle{probnode} = [fill=blue!30,minimum width=0.4cm]
\tikzstyle{labelnode} = [above]
% alignment matrix
\begin{scope}[scale=0.9,yshift=0.12in]
\foreach \i / \j / \c in
{0/7/0.2, 1/7/0.45, 2/7/0.15, 3/7/0.15, 4/7/0.15, 5/7/0.15,
0/6/0.35, 1/6/0.45, 2/6/0.15, 3/6/0.15, 4/6/0.15, 5/6/0.15,
0/5/0.25, 1/5/0.15, 2/5/0.15, 3/5/0.35, 4/5/0.15, 5/5/0.15,
0/4/0.15, 1/4/0.25, 2/4/0.2, 3/4/0.30, 4/4/0.15, 5/4/0.15,
0/3/0.15, 1/3/0.15, 2/3/0.8, 3/3/0.25, 4/3/0.15, 5/3/0.25,
0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.25, 5/2/0.3,
0/1/0.15, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.8, 5/1/0.15,
0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.60}
\node[elementnode,minimum size=0.6*1.2cm*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*1.2cm*\i-5.4*0.5*1.2cm,0.5*1.2cm*\j-1.05*1.2cm) {};
%attention score labels
\node[align=center] (l17) at (a17) {\scriptsize{{\color{white} .4}}};
\node[align=center] (l06) at (a06) {\scriptsize{{\color{white} .3}}};
\node[align=center] (l16) at (a16) {\scriptsize{{\color{white} .4}}};
\node[align=center] (l35) at (a35) {\scriptsize{{\color{white} .3}}};
\node[align=center] (l34) at (a34) {\tiny{{\color{white} .3}}};
\node[align=center] (l23) at (a23) {\small{{\color{white} .8}}};
\node[align=center] (l41) at (a41) {\small{{\color{white} .8}}};
\node[align=center] (l50) at (a50) {\small{{\color{white} .7}}};
% source
\node[srcnode] (src1) at (-5.4*0.5*1.2cm,-1.05*1.2cm+7.5*0.5*1.2cm) {\scriptsize{Have}};
\node[srcnode] (src2) at ([xshift=0.6cm]src1.south west) {\scriptsize{you}};
\node[srcnode] (src3) at ([xshift=0.6cm]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.6cm]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.6cm]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.6cm]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (-6.0*0.5*1.2cm,-1.05*1.2cm+7.5*0.5*1.2cm) {\scriptsize{你}};
\node[tgtnode] (tgt2) at ([yshift=-0.6cm]tgt1.north east) {\scriptsize{什么}};
\node[tgtnode] (tgt3) at ([yshift=-0.6cm]tgt2.north east) {\scriptsize{也}};
\node[tgtnode] (tgt4) at ([yshift=-0.6cm]tgt3.north east) {\scriptsize{没}};
\node[tgtnode] (tgt5) at ([yshift=-0.6cm]tgt4.north east) {\scriptsize{学}};
\node[tgtnode] (tgt6) at ([yshift=-0.6cm]tgt5.north east) {\scriptsize{}};
\node[tgtnode] (tgt7) at ([yshift=-0.6cm]tgt6.north east) {\scriptsize{?}};
\node[tgtnode] (tgt8) at ([yshift=-0.6cm]tgt7.north east) {\scriptsize{$\langle$eos$\rangle$}};
\end{scope}
\end{tikzpicture}
%-------------------------------------------
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear0) at (0,0) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\footnotesize{Linear}};
\node [anchor=north] (Q) at ([xshift=0em,yshift=-1em]Linear02.south) {\footnotesize{$\vectorn{\emph{Q}}$}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear1) at ([xshift=1.5em]Linear0.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\footnotesize{Linear}};
\node [anchor=north] (K) at ([xshift=0em,yshift=-1em]Linear12.south) {\footnotesize{$\vectorn{\emph{K}}$}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear2) at ([xshift=1.5em]Linear1.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\footnotesize{Linear}};
\node [anchor=north] (V) at ([xshift=0em,yshift=-1em]Linear22.south) {\footnotesize{$\vectorn{\emph{V}}$}};
\node [anchor=south,draw=black!30,minimum width=12em,minimum height=2em,inner sep=4pt,fill=blue!20!white] (Scale) at ([yshift=1em]Linear1.north) {\footnotesize{}};
\node [anchor=south west,draw=black!50,minimum width=12em,minimum height=2em,fill=blue!20!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\footnotesize{}};
\node [anchor=south west,fill=blue!20!white,draw,minimum width=12em,minimum height=2em,inner sep=4pt] (Scale2) at ([shift={(-0.2em,-0.2em)}]Scale1.south west) {\footnotesize{Scaled Dot-Product Attention}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=yellow!30] (Concat) at ([yshift=1em]Scale2.north) {\footnotesize{Concat}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=ugreen!20!white] (Linear) at ([yshift=1em]Concat.north) {\footnotesize{Linear}};
\draw [->] ([yshift=0.1em]Q.north) -- ([yshift=-0.1em]Linear02.south);
\draw [-,draw=black!50] ([yshift=0.1em]Q.north) -- ([xshift=0.2em,yshift=-0.1em]Linear02.south);
\draw [-,draw=black!30] ([yshift=0.1em]Q.north) -- ([xshift=0.4em,yshift=-0.1em]Linear02.south);
\draw [->] ([yshift=0.1em]K.north) -- ([yshift=-0.1em]Linear12.south);
\draw [-,draw=black!50] ([yshift=0.1em]K.north) -- ([xshift=0.2em,yshift=-0.1em]Linear12.south);
\draw [-,draw=black!30] ([yshift=0.1em]K.north) -- ([xshift=0.4em,yshift=-0.1em]Linear12.south);
\draw [->] ([yshift=0.1em]V.north) -- ([yshift=-0.1em]Linear22.south);
\draw [-,draw=black!50] ([yshift=0.1em]V.north) -- ([xshift=0.2em,yshift=-0.1em]Linear22.south);
\draw [-,draw=black!30] ([yshift=0.1em]V.north) -- ([xshift=0.4em,yshift=-0.1em]Linear22.south);
\draw [->] ([yshift=0em]Linear02.north) -- ([yshift=1em]Linear02.north);
\draw [-,draw=black!50] ([yshift=0em]Linear01.north) -- ([yshift=0.8em]Linear01.north);
\draw [-,draw=black!30] ([yshift=0em]Linear0.north) -- ([yshift=0.6em]Linear0.north);
\draw [->] ([yshift=0em]Linear12.north) -- ([yshift=1em]Linear12.north);
\draw [-,draw=black!50] ([yshift=0em]Linear11.north) -- ([yshift=0.8em]Linear11.north);
\draw [-,draw=black!30] ([yshift=0em]Linear1.north) -- ([yshift=0.6em]Linear1.north);
\draw [->] ([yshift=0em]Linear22.north) -- ([yshift=1em]Linear22.north);
\draw [-,draw=black!50] ([yshift=0em]Linear21.north) -- ([yshift=0.8em]Linear21.north);
\draw [-,draw=black!30] ([yshift=0em]Linear2.north) -- ([yshift=0.6em]Linear2.north);
\draw [->] ([yshift=0em]Scale2.north) -- ([yshift=0em]Concat.south);
\draw [-,draw=black!50] ([yshift=0em]Scale1.north) -- ([yshift=0.8em]Scale1.north);
\draw [-,draw=black!30] ([yshift=0em]Scale.north) -- ([yshift=0.6em]Scale.north);
\draw [->] ([yshift=0em]Concat.north) -- ([yshift=0em]Linear.south);
\draw [->] ([yshift=0em]Linear.north) -- ([yshift=1em]Linear.north);
\end{scope}
\end{tikzpicture}
\ No newline at end of file
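% The structure above projects Q, K and V with separate linear layers, runs
% several scaled dot-product attentions in parallel, concatenates their outputs
% and applies a final linear layer. Written out as formulas (the projection
% matrices W below are assumed names, not labels taken from the figure):
\[ \textrm{MultiHead}(\vectorn{\emph{Q}},\vectorn{\emph{K}},\vectorn{\emph{V}}) = \textrm{Concat}(\vectorn{\emph{head}}_1,...,\vectorn{\emph{head}}_h) \, \vectorn{\emph{W}}^{o} \]
\[ \vectorn{\emph{head}}_i = \textrm{Attention}(\vectorn{\emph{Q}}\vectorn{\emph{W}}_i^{Q},\ \vectorn{\emph{K}}\vectorn{\emph{W}}_i^{K},\ \vectorn{\emph{V}}\vectorn{\emph{W}}_i^{V}) \]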
\begin{tikzpicture}
\begin{scope}
\node [anchor=south west,fill=white,draw,inner sep=4pt,minimum width=4em,fill=blue!20!white] (MatMul) at (0,0) {\tiny{MatMul}};
\node [anchor=north] (Q1) at ([xshift=-1.4em,yshift=-1em]MatMul.south) {\footnotesize{$\vectorn{\emph{Q}}$}};
\node [anchor=north] (K1) at ([xshift=1.4em,yshift=-1em]MatMul.south) {\footnotesize{$\vectorn{\emph{K}}$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2.5em] (Scale3) at ([yshift=1em]MatMul.north) {\tiny{Scale}};
\node [anchor=south,draw,inner sep=4pt,fill=purple!20,minimum width=3.5em] (Mask) at ([yshift=0.8em]Scale3.north) {\tiny{Mask(opt.)}};
\node [anchor=south,draw,inner sep=4pt,fill=ugreen!20!white] (SoftMax) at ([yshift=1em]Mask.north) {\tiny{SoftMax}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=blue!20!white] (MatMul1) at ([xshift=1.7em,yshift=1em]SoftMax.north) {\tiny{MatMul}};
\node [anchor=north] (V1) at ([xshift=2em]K1.north) {\footnotesize{$\vectorn{\emph{V}}$}};
\node [anchor=north] (null) at ([yshift=0.8em]MatMul1.north) {};
\draw [->] ([yshift=0.1em]Q1.north) -- ([xshift=-1.4em,yshift=-0.1em]MatMul.south);
\draw [->] ([yshift=0.1em]K1.north) -- ([xshift=1.4em,yshift=-0.1em]MatMul.south);
\draw [->] ([yshift=0.1em]MatMul.north) -- ([yshift=-0.1em]Scale3.south);
\draw [->] ([yshift=0.1em]Scale3.north) -- ([yshift=-0.1em]Mask.south);
\draw [->] ([yshift=0.1em]Mask.north) -- ([yshift=-0.1em]SoftMax.south);
\draw [->] ([yshift=0.1em]SoftMax.north) -- ([yshift=0.9em]SoftMax.north);
\draw [->] ([yshift=0.1em]V1.north) -- ([yshift=9.3em]V1.north);
\draw [->] ([yshift=0.1em]MatMul1.north) -- ([yshift=0.8em]MatMul1.north);
{
\node [anchor=east] (line1) at ([xshift=-4em,yshift=1em]MatMul.west) {\scriptsize{自注意力机制的Query}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{Key和Value均来自同一句子}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{编码-解码注意力机制}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{与前面讲的一样}};
}
{
\node [anchor=west] (line11) at ([xshift=3em,yshift=0em]MatMul.east) {\scriptsize{Query和Key的转置}};
\node [anchor=north west] (line12) at ([yshift=0.3em]line11.south west) {\scriptsize{进行点积,得到句子内部}};
\node [anchor=north west] (line13) at ([yshift=0.3em]line12.south west) {\scriptsize{各个位置的相关性}};
}
{
\node [anchor=west] (line21) at ([yshift=5em]line11.west) {\scriptsize{相关性矩阵在训练中}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{方差变大,不利于训练}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{所以对其进行缩放}};
}
{
\node [anchor=west] (line31) at ([yshift=6em]line1.west) {\scriptsize{在编码端,对句子补齐}};
\node [anchor=north west] (line32) at ([yshift=0.3em]line31.south west) {\scriptsize{填充的部分进行屏蔽}};
\node [anchor=north west] (line33) at ([yshift=0.3em]line32.south west) {\scriptsize{解码时看不到未来的信息}};
\node [anchor=north west] (line34) at ([yshift=0.3em]line33.south west) {\scriptsize{需要对未来的信息进行屏蔽}};
}
{
\node [anchor=west] (line41) at ([yshift=4em]line21.west) {\scriptsize{用归一化的相关性打分}};
\node [anchor=north west] (line42) at ([yshift=0.3em]line41.south west) {\scriptsize{对Value进行加权求和}};
}
\begin{pgfonlayer}{background}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (Q1) (K1) (V1)] (box0) {};
\draw [->,dotted,very thick,ugreen] ([yshift=-1.5em,xshift=1.2em]box1.east) -- ([yshift=-1.5em,xshift=0.1em]box1.east);
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!20!white,drop shadow,draw=blue] [fit = (line11) (line12) (line13)] (box2) {};
\draw [->,dotted,very thick,blue] ([yshift=1em,xshift=-2.8em]box2.west) -- ([yshift=1em,xshift=-0.1em]box2.west);
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=yellow!20,drop shadow,draw=black] [fit = (line21) (line22) (line23)] (box3) {};
\draw [->,dotted,very thick,black] ([xshift=0.1em]Scale3.east) .. controls +(east:1) and +(west:1) .. ([yshift=1.0em]box3.west) ;
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line31) (line32) (line33) (line34)] (box4) {};
\draw [->,dotted,very thick,red] ([yshift=-1.2em,xshift=2.2em]box4.east) -- ([yshift=-1.2em,xshift=0.1em]box4.east);
}
{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!20!white,drop shadow,draw=blue] [fit = (line41) (line42)] (box5) {};
\draw [->,dotted,very thick,blue] ([yshift=-0.3em,xshift=-1em]box5.west) -- ([yshift=-0.3em,xshift=-0.1em]box5.west);
}
\end{pgfonlayer}
\end{scope}
\end{tikzpicture}
\ No newline at end of file
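% The flow above (MatMul, Scale, optional Mask, SoftMax, MatMul) is scaled
% dot-product attention; the same computation is spelled out with matrices
% later in this section:
\[ \textrm{Attention}(\vectorn{\emph{Q}},\vectorn{\emph{K}},\vectorn{\emph{V}}) = \textrm{Softmax} \Big( \frac{\vectorn{\emph{Q}}\vectorn{\emph{K}}^{\mathrm{T}}}{\sqrt{d_k}} + \vectorn{\emph{Mask}} \Big) \vectorn{\emph{V}} \]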
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\scriptsize{$\textbf{编码器输入: 我\ \ 很\ \ 好}$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};
\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\scriptsize{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\scriptsize{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);
\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (res1)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (res2)] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (res3)] (box3) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (res4)] (box4) {};
\end{pgfonlayer}
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!20];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\scriptsize{$\textbf{编码器输入: 我\ \ 很\ \ 好}$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};
\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\scriptsize{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\scriptsize{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);
\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (ffn1)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (ffn2)] (box2) {};
\end{pgfonlayer}
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\scriptsize{$\textbf{编码器输入: 我\ \ 很\ \ 好}$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};
\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\scriptsize{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\scriptsize{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);
\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (sa1)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (sa2)] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (ed1)] (box3) {};
\end{pgfonlayer}
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\node(atten) at (0,0){Attention(};
%%%% Q
\node(tbq) at ([xshift=0.5em,yshift=0]atten.east){
\begin{tabular}{|c|}
\hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbq.north){$\vectorn{\emph{Q}}$};
\node(comma1) at ([xshift=0.15em,yshift=-2em]tbq.east){,};
%%%% k
\node(tbk) at ([xshift=1em,yshift=0]tbq.east){
\begin{tabular}{|c|}
\hline
\rowcolor{blue!20} \\ \hline
\rowcolor{blue!20} \\ \hline
\rowcolor{blue!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbk.north){$\vectorn{\emph{K}}$};
\node(comma2) at ([xshift=0.15em,yshift=-2em]tbk.east){,};
%%%% v
\node(tbv) at ([xshift=1em,yshift=0]tbk.east){
\begin{tabular}{|c|}
\hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv.north){$\vectorn{\emph{V}}$};
\node(bra) at ([xshift=0.3em,yshift=0]tbv.east){)};
\node(eq1) at ([xshift=0.5em,yshift=0]bra.east){=};
\node(sof1) at ([xshift=2em,yshift=0]eq1.east){Softmax(};
%-----------------------------------------------------------
%QK+MASK
\node(tbq2) at ([xshift=0.5em,yshift=2em]sof1.east){
\begin{tabular}{|c|}
\hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!20} \\ \hline
\rowcolor{yellow!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbq2.north){$\vectorn{\emph{Q}}$};
% x
\node (times) at ([xshift=1em,yshift=0em]tbq2.east){$\times$};
%k
\node(tbk2) at ([xshift=2em,yshift=0em]times.east){
\begin{tabular}{|l|l|l|}
\hline
\cellcolor{blue!20} & \cellcolor{blue!20} &\cellcolor{blue!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbk2.north){$\vectorn{\emph{K}}^{\mathrm{T}}$};
\draw [-] (5.6,-0.2) -- (8,-0.2);
\node at ([xshift=0em,yshift=-3em]times.south){$\sqrt{d_k}$};
% MASK
\node(mask) at ([xshift=3em,yshift=-2em]tbk2.east){
\begin{tabular}{|l|l|l|}
\hline
\cellcolor{green!20} &\cellcolor{green!20} &\cellcolor{green!20} \\ \hline
\cellcolor{green!20} &\cellcolor{green!20} &\cellcolor{green!20} \\ \hline
\cellcolor{green!20} &\cellcolor{green!20} &\cellcolor{green!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]mask.north){$\vectorn{\emph{Mask}}$};
%+
\node at ([xshift=-0.6em,yshift=0em]mask.west){$+$};
%)
\node at ([xshift=0.2em,yshift=0em]mask.east){)};
%%%% v
\node(tbv2) at ([xshift=1.2em,yshift=0]mask.east){
\begin{tabular}{|c|}
\hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv2.north){$\vectorn{\emph{V}}$};
%------------------------------
%第二行
\node(eq2) at ([xshift=0em,yshift=-6em]eq1.south){=};
\node(sof2) at ([xshift=2em,yshift=0]eq2.east){Softmax(};
%中间粉色矩阵
\node(mid) at ([xshift=1.5em,yshift=0em]sof2.east){
\begin{tabular}{|l|l|l|}
\hline
\cellcolor{pink!30} &\cellcolor{pink!30} &\cellcolor{pink!30} \\ \hline
\cellcolor{pink!30} &\cellcolor{pink!30} &\cellcolor{pink!30} \\ \hline
\cellcolor{pink!30} &\cellcolor{pink!30} &\cellcolor{pink!30} \\ \hline
\end{tabular}
};
% )
\node(bra2) at ([xshift=0.2em,yshift=0]mid.east){)};
%红色框
\node[rectangle,minimum width=4.0em,minimum height=1.5em,draw=red](p222) at([xshift=0em,yshift=-1.0em]mid.north) {};
%%%% v
\node(tbv3) at ([xshift=0.5em,yshift=0]bra2.east){
\begin{tabular}{|c|}
\hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv3.north){$\vectorn{\emph{V}}$};
%------------------------------------
%第三行
\node(eq3) at ([xshift=0em,yshift=-6em]eq2.south){=};
%%%% softmax结果 红色矩阵
\node(result) at ([xshift=2em,yshift=0]eq3.east){
\begin{tabular}{|l|l|l|}
\hline
\cellcolor{red!20} &\cellcolor{red!20} &\cellcolor{red!20} \\ \hline
\cellcolor{red!20}&\cellcolor{red!20} &\cellcolor{red!20} \\ \hline
\cellcolor{red!20} &\cellcolor{red!20} &\cellcolor{red!20} \\ \hline
\end{tabular}
};
% x
\node (times) at ([xshift=0.5em,yshift=0em]result.east){$\times$};
%%%% v
\node(tbv4) at ([xshift=0.5em,yshift=0]times.east){
\begin{tabular}{|c|}
\hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\rowcolor{orange!20} \\ \hline
\end{tabular}
};
\node at ([xshift=0em,yshift=0.5em]tbv4.north){$\vectorn{\emph{V}}$};
%=
\node(eq4) at ([xshift=0.5em,yshift=0em]tbv4.east){=};
%%%% 灰色矩阵
\node(gre) at ([xshift=0.5em,yshift=0]eq4.east){
\begin{tabular}{|c|}
\hline
\rowcolor{black!15} \\ \hline
\rowcolor{black!15} \\ \hline
\rowcolor{black!15} \\ \hline
\end{tabular}
};
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“你”})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“什么”})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“也”})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“没”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“你”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“什么”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“也”})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“没”})$}};
\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\vectorn{\emph{s}}(\textrm{“you”})$}};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
\node [anchor=south east] (alpha1) at ([xshift=1em]key1.north east) {\scriptsize{$\alpha_1=.4$}};
\node [anchor=south east] (alpha2) at ([xshift=1em]key2.north east) {\scriptsize{$\alpha_2=.4$}};
\node [anchor=south east] (alpha3) at ([xshift=1em]key3.north east) {\scriptsize{$\alpha_3=0$}};
\node [anchor=south east] (alpha4) at ([xshift=1em]key4.north east) {\scriptsize{$\alpha_4=.1$}};
\end{scope}
\end{tikzpicture}
\ No newline at end of file
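% With the weights shown above, the context computed for “you” is the weighted
% sum of the source-side representations, matching the C_2 example earlier in
% this section (the left-hand symbol is an illustrative name):
\[ \vectorn{\emph{C}}(\textrm{“you”}) = 0.4 \times \vectorn{\emph{h}}(\textrm{“你”}) + 0.4 \times \vectorn{\emph{h}}(\textrm{“什么”}) + 0 \times \vectorn{\emph{h}}(\textrm{“也”}) + 0.1 \times \vectorn{\emph{h}}(\textrm{“没”}) \]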
%-----------------------------------------------------
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=blue!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key1label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
\node [rnode,anchor=east,pattern=horizontal lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north) node [pos=0.5,below,yshift=0.2em] {\scriptsize{匹配}};
\node [anchor=north] (result) at (value3.south) {\scriptsize{ {\red 返回结果} }};
\end{scope}
\end{tikzpicture}
%-----------------------------------------------------
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]
\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};
\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key1label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key1label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key1label) at (key4) {\scriptsize{key$_4$}};
\node [rnode,anchor=east,pattern=vertical lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};
\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
\node [anchor=south east] (alpha1) at (key1.north east) {\scriptsize{$\alpha_1$}};
\node [anchor=south east] (alpha2) at (key2.north east) {\scriptsize{$\alpha_2$}};
\node [anchor=south east] (alpha3) at (key3.north east) {\scriptsize{$\alpha_3$}};
\node [anchor=south east] (alpha4) at (key4.north east) {\scriptsize{$\alpha_4$}};
\node [anchor=north] (result) at ([xshift=-1.5em]value2.south east) {\scriptsize{{\red 返回结果}=$\alpha_1 \cdot \textrm{value}_1 + \alpha_2 \cdot \textrm{value}_2 + \alpha_3 \cdot \textrm{value}_3 + \alpha_4 \cdot \textrm{value}_4$}};
\end{scope}
\end{tikzpicture}
\ No newline at end of file
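% The weights alpha_i above are typically obtained by scoring the query against
% each key and normalizing the scores with Softmax; a sketch, where s(.,.)
% stands for an unspecified scoring function (an assumption, not part of the
% figure):
\[ \alpha_i = \frac{\exp \big( s(\textrm{query}, \textrm{key}_i) \big)}{\sum_{j} \exp \big( s(\textrm{query}, \textrm{key}_j) \big)} \]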
\begin{tikzpicture}
\begin{scope}
\tikzstyle{lnode} = [minimum height=1.5em,minimum width=3em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{standard} = [rounded corners=3pt]
\node [lnode,anchor=west] (l1) at (0,0) {\scriptsize{子层1}};
\node [lnode,anchor=west] (l2) at ([xshift=3em]l1.east) {\scriptsize{子层2}};
\node [lnode,anchor=west] (l3) at ([xshift=3em]l2.east) {\scriptsize{子层3}};
\node [anchor=west,inner sep=2pt] (dot1) at ([xshift=1em]l3.east) {\scriptsize{$\textbf{...}$}};
\node [lnode,anchor=west] (l4) at ([xshift=1em]dot1.east) {\scriptsize{子层$n$}};
\node [anchor=west] (plus1) at ([xshift=0.9em]l1.east) {\scriptsize{$\mathbf{\oplus}$}};
\node [anchor=west] (plus2) at ([xshift=0.9em]l2.east) {\scriptsize{$\mathbf{\oplus}$}};
\draw [->,thick] ([xshift=-1.5em]l1.west) -- ([xshift=-0.1em]l1.west);
\draw [->,thick] ([xshift=0.1em]l1.east) -- ([xshift=0.2em]plus1.west);
\draw [->,thick] ([xshift=-0.2em]plus1.east) -- ([xshift=-0.1em]l2.west);
\draw [->,thick] ([xshift=0.1em]l2.east) -- ([xshift=0.2em]plus2.west);
\draw [->,thick] ([xshift=-0.2em]plus2.east) -- ([xshift=-0.1em]l3.west);
\draw [->,thick] ([xshift=0.1em]l3.east) -- ([xshift=-0.1em]dot1.west);
\draw [->,thick] ([xshift=0.1em]dot1.east) -- ([xshift=-0.1em]l4.west);
\draw [->,thick] ([xshift=0.1em]l4.east) -- ([xshift=1.5em]l4.east);
\draw[->,standard,thick] ([xshift=-0.8em]l1.west) -- ([xshift=-0.8em,yshift=2em]l1.west) -- ([yshift=2em]plus1.center) -- ([yshift=-0.2em]plus1.north);
\draw[->,standard,thick] ([xshift=-0.8em]l2.west) -- ([xshift=-0.8em,yshift=2em]l2.west) -- ([yshift=2em]plus2.center) -- ([yshift=-0.2em]plus2.north);
\draw [->,very thick,red] ([xshift=1.5em,yshift=-0.3em]l4.east) -- ([xshift=0.1em,,yshift=-0.3em]l4.east);
\draw [->,very thick,red] ([xshift=-0.1em,yshift=-0.3em]l4.west) -- ([xshift=0.1em,yshift=-0.3em]dot1.east);
\draw [->,very thick,red] ([xshift=-0.1em,yshift=-0.3em]dot1.west) -- ([xshift=0.1em,yshift=-0.3em]l3.east);
\draw[->,standard,very thick,red] ([xshift=-0.3em,yshift=-0.2em]plus2.north) -- ([xshift=-0.3em,yshift=1.8em]plus2.center) -- ([xshift=-0.5em,yshift=1.8em]l2.west) -- ([xshift=-0.5em,yshift=0.2em]l2.west);
\draw[->,standard,very thick,red] ([xshift=-0.3em,yshift=-0.2em]plus1.north) -- ([xshift=-0.3em,yshift=1.8em]plus1.center) -- ([xshift=-0.5em,yshift=1.8em]l1.west) -- ([xshift=-0.5em,yshift=0.2em]l1.west);
\node [anchor=west] (label1) at ([xshift=1em,yshift=1.5em]l3.north) {\tiny{前向计算}};
\draw [->,thick] ([xshift=-1.5em]label1.west) -- ([xshift=-0.1em]label1.west);
\node [anchor=west] (label2) at ([xshift=2.5em]label1.east) {\tiny{反向传播}};
\draw [->,thick,red] ([xshift=-1.5em]label2.west) -- ([xshift=-0.1em]label2.west);
\end{scope}
\end{tikzpicture}
\ No newline at end of file
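% The black path above is the forward pass and the red path the backward pass
% through a chain of residual connections. Writing the input of sublayer l as
% x_l, the sublayer function as F and the loss as L (assumed names), the figure
% corresponds to
\[ x_{l+1} = x_l + F(x_l) , \qquad \frac{\partial L}{\partial x_l} = \frac{\partial L}{\partial x_{l+1}} \Big( 1 + \frac{\partial F(x_l)}{\partial x_l} \Big) \]
% so the gradient reaches earlier sublayers through the identity term even when
% the sublayer derivative is small.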
\begin{tikzpicture}
\node[rounded corners=1pt,minimum width=11.0em,minimum height=2.0em,fill=pink!30,draw=black](p1) at (0,0) {\small{Self-Attention}};
\node[anchor=north](word1) at ([xshift=0.0em,yshift=-2.0em]p1.south) {\small $\vectorn{\emph{K}}$};
\node[anchor=west](word2) at ([xshift=2.2em]word1.east) {\small $\vectorn{\emph{V}}$};
\node[anchor=east](word3) at ([xshift=-2.2em]word1.west) {\small $\vectorn{\emph{Q}}$};
\draw[->,thick](word1.north)--(p1.south);
\draw[->,thick]([xshift=-3.6em]word1.north)--([xshift=-3.6em]p1.south);
\draw[->,thick]([xshift=3.6em]word1.north)--([xshift=3.6em]p1.south);
\node[anchor=north,rounded corners=1pt,minimum width=11.0em,minimum height=3.5em,draw=ugreen!70,very thick,dotted](p1-1) at ([yshift=-5.2em]p1.south) {\small{解码端每个位置的表示}};
\draw [->,thick,dashed] (word3.south) .. controls +(south:1.5em) and +(north:1.5em) .. ([xshift=-0.4em]p1-1.north);
\draw [->,thick,dashed](word1.south) --(p1-1.north);
\draw [->,thick,dashed] (word2.south) .. controls +(south:1.0em) and +(north:1.5em) .. ([xshift=0.4em]p1-1.north);
\node[anchor=north](caption1) at ([xshift=0.0em,yshift=-9.5em]p1.south){\small{(a) Self-Attention的输入}};
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\node[anchor=west,rounded corners=1pt,minimum width=14.0em,minimum height=2.0em,fill=pink!30,draw=black](p2) at ([xshift=5.0em]p1.east){\small{Encoder-Decoder Attention}};
\node[anchor=north](word1-2) at ([xshift=0.0em,yshift=-2.0em]p2.south) {\small $\vectorn{\emph{K}}$};
\node[anchor=west](word2-2) at ([xshift=2.2em]word1-2.east) {\small $\vectorn{\emph{V}}$};
\node[anchor=east](word3-2) at ([xshift=-2.2em]word1-2.west) {\small $\vectorn{\emph{Q}}$};
\draw[->,thick](word1-2.north)--(p2.south);
\draw[->,thick]([xshift=-3.6em]word1-2.north)--([xshift=-3.6em]p2.south);
\draw[->,thick]([xshift=3.6em]word1-2.north)--([xshift=3.6em]p2.south);
\node[anchor=north,rounded corners=1pt](p2-1) at ([xshift=-3.55em,yshift=-5.5em]p2.south) {\small{解码端每个}};
\node[anchor=north,rounded corners=1pt](p2-2) at ([xshift=-3.55em,yshift=-6.8em]p2.south) {\small{位置的表示}};
\begin{pgfonlayer}{background}
{
\node[rounded corners=1pt,draw=ugreen!70,very thick,dotted] [fit = (p2-1) (p2-2)] (p2-12) {};
}
\end{pgfonlayer}
\node[anchor=north,rounded corners=1pt](p2-3) at ([xshift=3.55em,yshift=-5.5em]p2.south) {\small{编码端每个}};
\node[anchor=north,rounded corners=1pt](p2-4) at ([xshift=3.55em,yshift=-6.8em]p2.south) {\small{位置的表示}};
\begin{pgfonlayer}{background}
{
\node[rounded corners=1pt,draw=ugreen!70,very thick,dotted] [fit = (p2-3) (p2-4)] (p2-34) {};
}
\end{pgfonlayer}
\draw[<-,thick,dashed]([xshift=-3.6em,yshift=-3.2em]word1-2.north)--([xshift=-3.6em,yshift=-3.2em]p2.south);
\draw[<-,thick,dashed]([xshift=3.6em,yshift=-3.2em]word1-2.north)--([xshift=3.6em,yshift=-3.2em]p2.south);
\draw [->,thick,dashed] (word1-2.south) .. controls +(south:1em) and +(north:1.5em) .. ([yshift=0.3em,xshift=-0.4em]p2-3.north);
\node[anchor=north](caption2) at ([xshift=0.0em,yshift=-9.5em]p2.south){\small{(b) Encoder-Decoder Attention的输入}};
\end{tikzpicture}
\ No newline at end of file
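% Summarizing the two panels above: in (a) the query, key and value all come
% from the decoder-side representations, while in (b) the query comes from the
% decoder side and the key and value from the encoder side. Using s for the
% decoder-side and h for the encoder-side representations (following the
% notation of the earlier figures):
\[ \textrm{(a)}\ \ \vectorn{\emph{Q}},\vectorn{\emph{K}},\vectorn{\emph{V}} \leftarrow \vectorn{\emph{s}} \qquad\quad \textrm{(b)}\ \ \vectorn{\emph{Q}} \leftarrow \vectorn{\emph{s}} ,\ \ \vectorn{\emph{K}},\vectorn{\emph{V}} \leftarrow \vectorn{\emph{h}} \]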
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{lossnode} = [minimum height=1.1em,minimum width=6em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\vectorn{\emph{h}}_1$}};
\node [rnnnode,anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{$\vectorn{\emph{h}}_2$}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\vectorn{\emph{h}}_3$}};
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e2) at ([xshift=1em]e1.east) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
\node [anchor=south] (dot1) at ([xshift=0.4em,yshift=-0.7em]h1.south) {\tiny{...}};
\node [anchor=south] (dot2) at ([xshift=-0.4em,yshift=-0.7em]h3.south) {\tiny{...}};
\draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] (w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]h2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
\draw [->] ([xshift=0.2em,yshift=0.1em]e1.north) .. controls +(north:0.3) and +(south:0.4) .. ([xshift=-0.3em,yshift=-0.1em]h2.south);
\draw [->] ([xshift=-0.2em,yshift=0.1em]e3.north) .. controls +(north:0.3) and +(south:0.4) .. ([xshift=0.3em,yshift=-0.1em]h2.south);
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};
{
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]e3.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.5em]t1.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.5em]t2.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.5em]t3.east) {\tiny{$e_y()$}};
}
{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\vectorn{\emph{s}}_1$}};
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\vectorn{\emph{s}}_2$}};
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\vectorn{\emph{s}}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\vectorn{\emph{s}}_4$}};
%\node [anchor=south] (dot3) at ([xshift=-0.4em,yshift=-0.7em]s3.south) {\tiny{...}};
\node [anchor=south] (dot4) at ([xshift=-0.4em,yshift=-0.7em]s4.south) {\tiny{...}};
\draw [->] ([xshift=-0.6em,yshift=-0.5em]s3.south) .. controls +(north:0) and +(south:0.2) .. ([xshift=-0.3em,yshift=-0.1em]s3.south);
\draw [->] ([xshift=-1.5em,yshift=-0.5em]s3.south) .. controls +(north:0) and +(south:0.15) .. ([xshift=-0.6em,yshift=-0.1em]s3.south);
}
{
\node [rnnnode,anchor=south] (f1) at ([yshift=1em]s1.north) {\tiny{$\vectorn{\emph{f}}_1$}};
\node [rnnnode,anchor=south] (f2) at ([yshift=1em]s2.north) {\tiny{$\vectorn{\emph{f}}_2$}};
\node [rnnnode,anchor=south] (f3) at ([yshift=1em]s3.north) {\tiny{$\vectorn{\emph{f}}_3$}};
\node [rnnnode,anchor=south] (f4) at ([yshift=1em]s4.north) {\tiny{$\vectorn{\emph{f}}_4$}};
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]f1.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]f2.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]f3.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o4) at ([yshift=1em]f4.north) {\tiny{softmax}};
\node [anchor=east] (decoder) at ([xshift=-0.3em,yshift=0.5em]o1.north west) {\scriptsize{\textbf{解码器}}};
\node [anchor=south,fill=black!5!white,minimum height=1.1em,minimum width=13em,inner sep=2pt,rounded corners=1pt,draw] (loss) at ([xshift=1.8em,yshift=1em]o2.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
}
{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{$\langle$eos$\rangle$}};
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{How}};
\node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{are}};
\node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.8em]t4.south) {\tiny{you}};
}
{
\foreach \x in {1,2,3,4}{
\draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
\draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
\draw [->] ([xshift=0.2em,yshift=0.1em]t1.north) .. controls +(north:0.3) and +(south:0.3) .. ([xshift=-0.3em,yshift=-0.1em]s2.south);
}
}
{
\foreach \x in {1,2,3,4}{
\draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]f\x.south);
\draw [->] ([yshift=0.1em]f\x.north) -- ([yshift=-0.1em]o\x.south);
\draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north);
}
}
{
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c1) at ([yshift=2em]h2.north) {\tiny{$\vectorn{\emph{C}}_1$}};
\node [anchor=south] (c1label) at (c1.north) {\tiny{\textbf{编码-解码注意力机制:上下文}}};
\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c1.250);
\draw [->] (h2.north) .. controls +(north:0.6) and +(270:0.9) .. (c1.270);
\draw [->] (h3.north) .. controls +(north:0.6) and +(290:0.9) .. (c1.290);
\draw [->] ([yshift=0.3em]s1.west) .. controls +(west:1) and +(east:1) .. (c1.-30);
\draw [->] (c1.0) .. controls +(east:1) and +(west:1) .. ([yshift=0em]f1.west);
}
{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c2) at ([yshift=-2em]t1.south) {\tiny{$\vectorn{\emph{C}}_2$}};
\draw [->] ([xshift=-0.7em]c2.west) -- ([xshift=-0.1em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f2.west);
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\vectorn{\emph{C}}_3$}};
\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f3.west);
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\textbf{C}_4$}};
\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f4.west);
}
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
		\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\scriptsize{$\textbf{编码器输入: 我\ \ \ \ }$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};
\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
		\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\scriptsize{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\scriptsize{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);
\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (input1) (pos1)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (input2) (pos2)] (box2) {};
\end{pgfonlayer}
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!10];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!30];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{standard} = [rounded corners=3pt]
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
		\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\scriptsize{$\textbf{编码器输入: 我\ \ \ \ }$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};
\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
		\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\scriptsize{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\scriptsize{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);
\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);
\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};
\node [ugreen,font=\scriptsize] (count) at ([xshift=-1.5em,yshift=-1em]encoder.south) {$6\times$};
\node [red,font=\scriptsize] (count) at ([xshift=10.8em,yshift=0em]decoder.south) {$\times 6$};
\end{scope}
\end{tikzpicture}
\ No newline at end of file
......@@ -21,10 +21,770 @@
% CHAPTER 12
%----------------------------------------------------------------------------------------
\chapter{神经机器翻译模型训练}
\chapter{基于自注意力的模型}
%----------------------------------------------------------------------------------------
% NEW SECTION
% NEW SECTION 12.1
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{注意力机制}
\label{sec:12.1}
\section{}
\parinterval 第二章提到过“上帝是不公平的”,这个观点主要是表达了:世界上事物之间的联系不是均匀的,有些事物之间的联系会很强,而其他的联系可能很弱。自然语言也完美地契合了这个观点。比如,再重新看一下前面提到的根据上下文补全缺失单词的例子,
\vspace{0.8em}
\centerline{中午\ \ 吃饭\ \ \ \ \ \ \ 下午\ 篮球\ \ \ 现在\ \ 饿\ \ \ \underline{\quad \quad \quad}}
\vspace{0.8em}
\noindent 之所以能想到在横线处填“吃饭”、“吃东西”很有可能是因为看到了“没\ 吃饭”、 “很\ 饿”等关键信息。也就是这些关键的片段对预测缺失的单词起着关键性作用。而预测“吃饭”与前文中的“ 中午”、“又”之间的联系似乎不那么紧密。也就是说,在形成 “吃饭”的逻辑时,在潜意识里会更注意“没吃饭”、“很饿”等关键信息。也就是我们的关注度并不是均匀地分布在整个句子上的。
\parinterval 这个现象可以用注意力机制进行解释。注意力机制的概念来源于生物学的一些现象:当待接收的信息过多时,人类会选择性地关注部分信息而忽略其他信息。它在人类的视觉、听觉、嗅觉等方面均有体现,当我们在感受事物时,大脑会自动过滤或衰减部分信息,仅关注其中少数几个部分。例如,当看到图\ref{fig:12-20}时,往往不是“均匀地”看图像中的所有区域,可能最先注意到的是大狗头上戴的帽子,然后才会关注图片中其他的部分。
\parinterval 那么注意力机制和神经机器翻译又有什么关系呢?它如何解决神经机器翻译的问题呢?下面就一起来看一看。
%----------------------------------------------
\begin{figure}[htp]
\centering
\includegraphics[scale=0.2]{./Chapter12/Figures/dog-hat.jpg}
\caption{戴帽子的狗}
\label{fig:12-20}
\end{figure}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.1.2
%----------------------------------------------------------------------------------------
\subsection{翻译中的注意力机制}
\parinterval 在第十章已经介绍过,早期的神经机器翻译只使用循环神经网络最后一个单元的输出作为整个序列的表示,这种方式有两个明显的缺陷:
\begin{itemize}
\vspace{0.5em}
\item 首先,虽然编码器把一个源语句子的表示传递给解码器,但是一个维度固定的向量所能包含的信息是有限的,随着源语序列的增长,将整个句子的信息编码到一个固定维度的向量中可能会造成源语句子信息的丢失。显然,在翻译较长的句子时,解码端可能无法获取完整的源语信息,降低翻译性能;
\vspace{0.5em}
\item 此外,当生成某一个目标语单词时,并不是均匀的使用源语句子中的单词信息。更普遍的情况是,系统会参考与这个目标语单词相对应的源语单词进行翻译。这有些类似于词对齐的作用,即翻译是基于单词之间的某种对应关系。但是,使用单一的源语表示根本无法区分源语句子的不同部分,更不用说对源语单词和目标语单词之间的联系进行建模了。
\vspace{0.5em}
\end{itemize}
\parinterval 更直观的,如图\ref{fig:12-21},目标语中的“very long”仅依赖于源文中的“很长”。这时如果将所有源语编码成一个固定的实数向量,“很长”的信息就很可能被其他词的信息淹没掉。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-attention-of-source-and-target-words}
\caption{源语词和目标语词的关注度}
\label{fig:12-21}
\end{figure}
%----------------------------------------------
\parinterval 显然,以上问题的根本原因在于所使用的表示模型还比较“弱”。因此需要一个更强大的表示模型,在生成目标语单词时能够有选择地获取源语句子中更有用的部分。更准确的说,对于要生成的目标语单词,相关性更高的源语片段应该在源语句子的表示中体现出来,而不是将所有的源语单词一视同仁。在神经机器翻译中引入注意力机制正是为了达到这个目的\upcite{bahdanau2014neural,DBLP:journals/corr/LuongPM15}。实际上,除了机器翻译,注意力机制也被成功地应用于图像处理、语音识别、自然语言处理等其他任务。而正是注意力机制的引入,使得包括机器翻译在内很多自然语言处理系统得到了飞跃发展。
\parinterval 神经机器翻译中的注意力机制并不复杂。对于每个目标语单词$y_j$,系统生成一个源语表示向量$\vectorn{\emph{C}}_j$与之对应,$\vectorn{\emph{C}}_j$会包含生成$y_j$所需的源语的信息,或者说$\vectorn{\emph{C}}_j$是一种包含目标语单词与源语单词对应关系的源语表示。相比用一个静态的表示$\vectorn{\emph{C}}$,注意力机制使用的是动态的表示$\vectorn{\emph{C}}_j$。$\vectorn{\emph{C}}_j$也被称作对于目标语位置$j$的上下文向量。图\ref{fig:12-22}对比了未引入注意力机制和引入了注意力机制的编码器-解码器结构。可以看出,在注意力模型中,对于每一个目标单词的生成,都会额外引入一个单独的上下文向量参与运算。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-encoder-decoder-with-attention}
\caption{(a)不使用和(b)使用注意力机制的翻译模型对比}
\label{fig:12-22}
\end{figure}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.1.3
%----------------------------------------------------------------------------------------
\subsection{上下文向量的计算}
\label{sec:12.1.3}
\parinterval 那么注意力机制是如何针对不同单词生成不同的上下文向量呢?这里,可以将注意力机制看做是一种对接收到的信息的加权处理。对于更重要的信息赋予更高的权重即更高的关注度,对于贡献度较低的信息分配较低的权重,弱化其对结果的影响。这样,$\vectorn{\emph{C}}_j$可以包含更多对当前目标语位置有贡献的源语片段的信息。
\parinterval 根据这种思想,上下文向量$\vectorn{\emph{C}}_j$被定义为对不同时间步编码器输出的状态序列$\{ \vectorn{\emph{h}}_1, \vectorn{\emph{h}}_2,...,\vectorn{\emph{h}}_m \}$进行加权求和,如下:
\begin{eqnarray}
\vectorn{\emph{C}}_j=\sum_{i} \alpha_{i,j} \vectorn{\emph{h}}_i
\label{eq:12-22}
\end{eqnarray}
\noindent 其中,$\alpha_{i,j}${\small\sffamily\bfseries{注意力权重}}\index{注意力权重}(Attention Weight)\index{Attention Weight},它表示目标语第$j$个位置与源语第$i$个位置之间的相关性大小。这里,将每个时间步编码器的输出$\vectorn{\emph{h}}_i$ 看作源语位置$i$的表示结果。进行翻译时,解码端可以根据当前的位置$j$,通过控制不同$\vectorn{\emph{h}}_i$的权重得到$\vectorn{\emph{C}}_j$,使得对目标语位置$j$贡献大的$\vectorn{\emph{h}}_i$$\vectorn{\emph{C}}_j$的影响增大。也就是说,$\vectorn{\emph{C}}_j$实际上就是\{${\vectorn{\emph{h}}_1, \vectorn{\emph{h}}_2,...,\vectorn{\emph{h}}_m}$\}的一种组合,只不过不同的$\vectorn{\emph{h}}_i$会根据对目标端的贡献给予不同的权重。图\ref{fig:12-23}展示了上下文向量$\vectorn{\emph{C}}_j$的计算过程。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-calculation-process-of-context-vector-c}
\caption{上下文向量$\vectorn{\emph{C}}_j$的计算过程}
\label{fig:12-23}
\end{figure}
%----------------------------------------------
\parinterval 如图\ref{fig:12-23}所示,注意力权重$\alpha_{i,j}$的计算分为两步:
\begin{itemize}
\vspace{0.5em}
\item 计算目标语上一时刻循环单元的输出$\vectorn{\emph{s}}_{j-1}$与源语第$i$个位置的表示$\vectorn{\emph{h}}_i$之间的相关性,用来表示目标语位置$j$对源语位置$i$的关注程度,记为$\beta_{i,j}$,由函数$a(\cdot)$实现:
\begin{eqnarray}
\beta_{i,j} = a(\vectorn{\emph{s}}_{j-1},\vectorn{\emph{h}}_i)
\label{eq:12-23}
\end{eqnarray}
$a(\cdot)$可以被看作是目标语表示和源语表示的一种“统一化”,即把源语和目标语表示映射在同一个语义空间,进而语义相近的内容有更大的相似性。该函数有多种计算方式,比如,向量乘、向量夹角、单层神经网络等,数学表达如下:
\begin{eqnarray}
a (\vectorn{\emph{s}},\vectorn{\emph{h}}) = \left\{ \begin{array}{ll}
\vectorn{\emph{s}} \vectorn{\emph{h}}^{\textrm{T}} & \textrm{向量乘} \\
\textrm{cos}(\vectorn{\emph{s}}, \vectorn{\emph{h}}) & \textrm{向量夹角} \\
\vectorn{\emph{s}} \vectorn{\emph{W}} \vectorn{\emph{h}}^{\textrm{T}} & \textrm{线性模型} \\
\textrm{TanH}(\vectorn{\emph{W}}[\vectorn{\emph{s}},\vectorn{\emph{h}}])\vectorn{\emph{v}}^{\textrm{T}} & \textrm{拼接}[\vectorn{\emph{s}},\vectorn{\emph{h}}]+\textrm{单层网络}
\end{array}
\right.
\label{eq:12-24}
\end{eqnarray}
其中$\vectorn{\emph{W}}$$\vectorn{\emph{v}}$是可学习的参数。
\vspace{0.5em}
\item 进一步,利用Softmax函数,将相关性系数$\beta_{i,j}$进行指数归一化处理,得到注意力权重$\alpha_{i,j}$
\vspace{0.5em}
\begin{eqnarray}
\alpha_{i,j}=\frac{\textrm{exp}(\beta_{i,j})} {\sum_{i'} \textrm{exp}(\beta_{i',j})}
\label{eq:12-25}
\end{eqnarray}
\vspace{0.5em}
最终,\{$\alpha_{i,j}$\}可以被看作是一个矩阵,它的长为目标语言句子长度,宽为源语言句子长度,矩阵中的每一项对应一个$\alpha_{i,j}$。图\ref{fig:12-24}给出了\{$\alpha_{i,j}$\}的一个矩阵表示。图中蓝色方框的大小表示不同的注意力权重$\alpha_{i,j}$的大小,方框越大,源语言位置$i$和目标语言位置$j$的相关性越高。能够看到,对于互译的中英文句子,\{$\alpha_{i,j}$\}可以较好地反映两种语言之间不同位置的对应关系。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-matrix-representation-of-attention-weights-between-chinese-english-sentence-pairs}
\caption{一个汉英句对之间的注意力权重{$\alpha_{i,j}$}的矩阵表示}
\label{fig:12-24}
\end{figure}
%----------------------------------------------
\vspace{0.5em}
\end{itemize}
\parinterval\ref{fig:12-25}展示了一个上下文向量的计算过程实例。首先,计算目标语第一个单词“Have”与源语中的所有单词的相关性,即注意力权重,对应图中第一列$\alpha_{i,1}$,则当前时刻所使用的上下文向量$\vectorn{\emph{C}}_1 = \sum_{i=1}^8 \alpha_{i,1} \vectorn{\emph{h}}_i$;然后,计算第二个单词“you”的注意力权重对应第二列$\alpha_{i,2}$,其上下文向量$\vectorn{\emph{C}}_2 = \sum_{i=1}^8 \alpha_{i,2} \vectorn{\emph{h}}_i$,以此类推,可以得到任意目标语位置$j$的上下文向量$\vectorn{\emph{C}}_j$。很容易看出,不同目标语单词的上下文向量对应的源语言词的权重$\alpha_{i,j}$是不同的,不同的注意力权重为不同位置赋予了不同重要性,对应了注意力机制的思想。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-example-of-context-vector-calculation-process}
\caption{上下文向量计算过程实例}
\label{fig:12-25}
\end{figure}
%----------------------------------------------
\parinterval\ref{sec:10.3.1}节中,使用公式\ref{eq:10-5}描述了目标语单词生成概率$ \funp{P} (y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}})$。在引入注意力机制后,不同时刻的上下文向量$\vectorn{\emph{C}}_j$替换了传统模型中固定的句子表示$\vectorn{\emph{C}}$。描述如下:
\begin{eqnarray}
\funp{P} (y_j | \vectorn{\emph{y}}_{<j},\vectorn{\emph{x}}) \equiv \funp{P} (y_j | \vectorn{\emph{s}}_{j-1},y_{j-1},\vectorn{\emph{C}}_j )
\label{eq:12-26}
\end{eqnarray}
\parinterval 这样,可以在生成每个$y_j$时动态地使用不同的源语言表示$\vectorn{\emph{C}}_j$,并更准确地捕捉源语和目标语不同位置之间的相关性。表\ref{tab:12-7}展示了引入注意力机制前后译文单词生成公式的对比。
\vspace{0.5em}
%----------------------------------------------
\begin{table}[htp]
\centering
\caption{引入注意力机制前后译文单词生成公式}
\label{tab:12-7}
\begin{tabular}{ l | l }
\rule{0pt}{13pt} 引入注意力之前 &引入注意力之后 \\ \hline
\rule{0pt}{16pt} $\textrm{“have”} = \argmax_{y_1} \funp{P} (y_1 | \vectorn{\emph{C}} , y_0)$ &$\textrm{“have”} = \argmax_{y_1} \funp{P} (y_1 | \vectorn{\emph{C}}_1 , y_0)$ \\
\rule{0pt}{16pt} $\textrm{“you”} = \argmax_{y_2} \funp{P} (y_2 | \vectorn{\emph{s}}_1 , y_1)$ &$\textrm{“you”} = \argmax_{y_2} \funp{P} (y_2 | \vectorn{\emph{s}}_1, \vectorn{\emph{C}}_2 , y_1)$ \\
\end{tabular}
\end{table}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.1.4
%----------------------------------------------------------------------------------------
\subsection{注意力机制的解读}
\label{sec:12.1.4}
\vspace{0.5em}
\parinterval 从前面的描述可以看出,注意力机制在机器翻译中就是要回答一个问题:给定一个目标语位置$j$和一系列源语的不同位置上的表示\{${\vectorn{\emph{h}}_i}$\},如何得到一个新的表示$\hat{\vectorn{\emph{h}}}$,使得它与目标语位置$j$对应得最好?
\parinterval 那么,如何理解这个过程?注意力机制的本质又是什么呢?换一个角度来看,实际上,目标语位置$j$本质上是一个查询,我们希望从源语端找到与之最匹配的源语位置,并返回相应的表示结果。为了描述这个问题,可以建立一个查询系统。假设有一个库,里面包含若干个$\mathrm{key}$-$\mathrm{value}$单元,其中$\mathrm{key}$代表这个单元的索引关键字,$\mathrm{value}$代表这个单元的值。比如,对于学生信息系统,$\mathrm{key}$可以是学号,$\mathrm{value}$可以是学生的身高。当输入一个查询$\mathrm{query}$,我们希望这个系统返回与之最匹配的结果。也就是,希望找到匹配的$\mathrm{key}$,并输出其对应的$\mathrm{value}$。比如,当查询某个学生的身高信息时,可以输入学生的学号,之后在库中查询与这个学号相匹配的记录,并把这个记录中的$\mathrm{value}$(即身高)作为结果返回。
\parinterval\ref{fig:12-26}展示了一个这样的查询系统。里面包含四个$\mathrm{key}$-$\mathrm{value}$单元,当输入查询$\mathrm{query}$,就把$\mathrm{query}$与这四个$\mathrm{key}$逐个进行匹配,如果完全匹配就返回相应的$\mathrm{value}$。在图中的例子中,$\mathrm{query}$$\mathrm{key}_3$是完全匹配的(因为都是横纹),因此系统返回第三个单元的值,即$\mathrm{value}_3$。当然,如果库中没有与$\mathrm{query}$匹配的$\mathrm{key}$,则返回一个空结果。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-query-model-corresponding-to-traditional-query-model-vs-attention-mechanism}
\caption{传统查询模型}
\label{fig:12-26}
\end{figure}
%----------------------------------------------
\parinterval 也可以用这个系统描述翻译中的注意力问题,其中,$\mathrm{query}$即目标语位置$j$的某种表示,$\mathrm{key}$$\mathrm{value}$即源语每个位置$i$上的${\vectorn{\emph{h}}_i}$(这里$\mathrm{key}$$\mathrm{value}$是相同的)。但是,这样的系统在机器翻译问题上并不好用,因为目标语的表示和源语的表示都在多维实数空间上,所以无法要求两个实数向量像字符串一样进行严格匹配,或者说这种严格匹配的模型可能会导致$\mathrm{query}$几乎不会命中任何的$\mathrm{key}$。既然无法严格精确匹配,注意力机制就采用了一个“模糊”匹配的方法。这里定义每个$\mathrm{key}_i$$\mathrm{query}$都有一个0~1之间的匹配度,这个匹配度描述了$\mathrm{key}_i$$\mathrm{query}$之间的相关程度,记为$\alpha_i$。而查询的结果(记为$\overline{\mathrm{value}}$)也不再是某一个单元的$\mathrm{value}$,而是所有单元$\mathrm{value}$$\alpha_i$的加权和:
\begin{eqnarray}
\overline{\mathrm{value}} = \sum_i \alpha_i \cdot {\mathrm{value}}_i
\label{eq:12-27}
\end{eqnarray}
\noindent 也就是说所有的$\mathrm{value}_i$都会对查询结果有贡献,只是贡献度不同罢了。可以通过设计$\alpha_i$来捕捉$\mathrm{key}$$\mathrm{query}$之间的相关性,以达到相关度越大的$\mathrm{key}$所对应的$\mathrm{value}$对结果的贡献越大。
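\parinterval 下面用一段NumPy代码来模拟这种“模糊”匹配的查询过程(这里的$\mathrm{key}$、$\mathrm{value}$和$\mathrm{query}$都是随意构造的玩具数据,仅用来说明公式\ref{eq:12-27}的计算方式,并非真实系统的实现):
\begin{verbatim}
import numpy as np

keys   = np.array([[1.0, 0.0],
                   [0.8, 0.2],
                   [0.0, 1.0]])              # 每行是一个key的向量表示
values = np.array([170.0, 165.0, 180.0])     # 与key一一对应的value(如“身高”)
query  = np.array([0.9, 0.1])                # 待查询的query

score = keys @ query                         # query与每个key的匹配程度
alpha = np.exp(score) / np.exp(score).sum()  # 归一化为0~1之间的匹配度 alpha_i
result = (alpha * values).sum()              # 公式(12-27):所有value的加权和
print(alpha.round(2), result)
\end{verbatim}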
\parinterval 重新回到神经机器翻译问题上来。这种基于模糊匹配的查询模型可以很好的满足对注意力建模的要求。实际上,公式\ref{eq:12-27}中的$\alpha_i$就是前面提到的注意力权重,它可以由注意力函数a($\cdot$)计算得到。这样,$\overline{\mathrm{value}}$就是得到的上下文向量,它包含了所有\{$\vectorn{\emph{h}}_i$\}的信息,只是不同$\vectorn{\emph{h}}_i$的贡献度不同罢了。图\ref{fig:12-27}展示了将基于模糊匹配的查询模型应用于注意力机制的实例。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-query-model-corresponding-to-attention-mechanism}
\caption{注意力机制所对应的查询模型}
\label{fig:12-27}
\end{figure}
%----------------------------------------------
\parinterval 最后,从统计学的角度,如果把$\alpha_i$作为每个$\mathrm{value}_i$出现的概率的某种估计,即:$ \funp{P} (\mathrm{value}_i$) $= \alpha_i$,于是可以把公式\ref{eq:12-27}重写为:
\begin{eqnarray}
\overline{\mathrm{value}} = \sum_i \funp{P} ( {\mathrm{value}}_i) \cdot {\mathrm{value}}_i
\label{eq:12-28}
\end{eqnarray}
\noindent 显然, $\overline{\mathrm{value}}$就是$\mathrm{value}_i$在分布$ \funp{P}( \mathrm{value}_i$)下的期望,即
\begin{equation}
\mathbb{E}_{\sim \\ \funp{P} ( {\mathrm{\mathrm{value}}}_i )} ({\mathrm{value}}_i) = \sum_i \funp{P} ({\mathrm{value}}_i) \cdot {\mathrm{value}}_i
\label{eq:12-29}
\end{equation}
从这个观点看,注意力机制实际上是得到了一个变量($\mathrm{value}$)的期望。当然,严格意义上说,$\alpha_i$并不是从概率角度定义的,这里也并不是要追求严格的统计学意义。不过这确实说明了,往往看似简单的模型背后的数学原理可能会很深刻。
%----------------------------------------------------------------------------------------
% NEW SECTION 12.2
%----------------------------------------------------------------------------------------
\section{自注意力机制}
\vspace{0.5em}
\label{sec:12.2}
\parinterval 自注意力机制与注意力机制究竟有什么不同?首先回顾一下循环神经网络处理文字序列的过程。如图\ref{fig:12-36}所示,对于单词序列$\{ w_1,...,w_m \}$,处理第$m$个单词$w_m$时(绿色方框部分),需要输入前一时刻的信息(即处理单词$w_{m-1}$),而$w_{m-1}$又依赖于$w_{m-2}$,以此类推。也就是说,如果想建立$w_m$$w_1$之间的关系,需要$m-1$次信息传递。对于长序列来说,词汇之间信息传递距离过长会导致信息在传递过程中丢失,同时这种按顺序建模的方式也使得系统对序列的处理十分缓慢。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-dependencies-between-words-in-a-recurrent-neural-network}
\caption{循环神经网络中单词之间的依赖关系}
\label{fig:12-36}
\end{figure}
%----------------------------------------------
\parinterval 那么能否摆脱这种顺序传递信息的方式,直接对不同位置单词之间的关系进行建模,即将信息传递的距离拉近为1?{\small\sffamily\bfseries{自注意力机制}}\index{自注意力机制}(Self-Attention)\index{Self-Attention}的提出便有效解决了这个问题\upcite{DBLP:journals/corr/LinFSYXZB17}。图\ref{fig:12-37}给出了自注意力机制对序列进行建模的示例。对于单词$w_m$,自注意力机制直接建立它与前$m-1$个单词之间的关系。也就是说,$w_m$与序列中所有其他单词的距离都是1。这种方式很好地解决了长距离依赖问题,同时由于单词之间的联系都是相互独立的,因此也大大提高了模型的并行度。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-dependencies-between-words-of-attention}
\caption{自注意力机制中单词之间的依赖关系}
\label{fig:12-37}
\end{figure}
%----------------------------------------------
\parinterval 自注意力机制也可以被看做是一个序列表示模型。比如,对于每个目标位置$j$,都生成一个与之对应的源语言句子表示,它的形式为:
\begin{eqnarray}
\vectorn{\emph{C}}_j = \sum_i \alpha_{i,j}\vectorn{\emph{h}}_i
\label{eq:12-4201}
\end{eqnarray}
\noindent 其中$\vectorn{\emph{h}}_i$ 为源语句子每个位置的表示结果,$\alpha_{i,j}$是目标位置$j$$\vectorn{\emph{h}}_i$的注意力权重。而自注意力机制不仅可以处理两种语言句子之间的对应,它也可以对单语句子进行表示。以源语句子为例,自注意力机制将序列中每个位置的表示$\vectorn{\emph{h}}_i$看作$\mathrm{query}$(查询),并且将所有位置的表示看作$\mathrm{key}$(键)和$\mathrm{value}$(值)。自注意力模型通过计算当前位置与所有位置的匹配程度,也就是在注意力机制中提到的注意力权重,来对各个位置的$\mathrm{value}$进行加权求和。得到的结果可以被看作是在这个句子中当前位置的抽象表示。这个过程,可以叠加多次,形成多层注意力模型,对输入序列中各个位置进行更深层的表示。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-example-of-self-attention-mechanism-calculation}
\caption{自注意力计算实例}
\label{fig:12-38}
\end{figure}
%----------------------------------------------
\parinterval 举个例子,如图\ref{fig:12-38}所示,一个汉语句子包含5个词。这里,用$\vectorn{\emph{h}}$(“你”)表示“你”当前的表示结果。如果把“你”看作目标,这时$\mathrm{query}$就是$\vectorn{\emph{h}}$(“你”),$\mathrm{key}$$\mathrm{value}$是图中所有位置的表示,即:{$\vectorn{\emph{h}}$(“你”)、$\vectorn{\emph{h}}$(“什么”)、$\vectorn{\emph{h}}$(“也”)、$\vectorn{\emph{h}}$(“没”)、$\vectorn{\emph{h}}$(“ 学”)}。在自注意力模型中,首先计算$\mathrm{query}$$\mathrm{key}$的相关度,这里用$\alpha_i$表示$\vectorn{\emph{h}}$(“你”)和位置$i$的表示之间的相关性。然后,把$\alpha_i$作为权重,对不同位置上的$\mathrm{value}$进行加权求和。最终,得到新的表示结果$\tilde{\vectorn{\emph{h}}}$ (“你” ):
\begin{eqnarray}
\tilde{\vectorn{\emph{h}}} (\textrm{“你”} ) = \alpha_1 {\vectorn{\emph{h}}} (\textrm{“你”} )
+ \alpha_2 {\vectorn{\emph{h}}} (\textrm{“什么 ”})
+ \alpha_3 {\vectorn{\emph{h}}} (\textrm{“也”} )
+ \alpha_4 {\vectorn{\emph{h}}} (\textrm{“没”} )
+\alpha_5 {\vectorn{\emph{h}}} (\textrm{“学”} ) \nonumber \\
\label{eq:12-42}
\end{eqnarray}
\parinterval 同理,也可以用同样的方法处理这个句子中的其他单词。可以看出,在注意力机制中,并不是使用类似于循环神经网络的记忆能力去访问历史信息。序列中所有单词之间的信息都是通过同一种操作($\mathrm{query}$$\mathrm{key}$的相关度)进行处理。这样,表示结果$\tilde{\vectorn{\emph{h}}} (\textrm{“你”})$在包含“你”这个单词的信息的同时,也包含了序列中其他词的信息。也就是,序列中每一个位置的表示结果中,都包含了其他位置的信息。从这个角度说,$\tilde{\vectorn{\emph{h}}} (\textrm{“你”})$已经不再是单词“你”自身的表示结果,而是一种在单词“你”的位置上的全局信息的表示。
\parinterval 通常,也把生成\{ $\tilde{\vectorn{\emph{h}}}(\vectorn{\emph{w}}_i)$ \}的过程称为{\small\sffamily\bfseries{特征提取}}\index{特征提取},而实现这个过程的模型被称为特征提取器。循环神经网络、自注意力模型都是典型的特征提取器。特征提取是神经机器翻译系统的关键步骤,在随后的内容中可以看到自注意力模型是一个非常适合机器翻译任务的特征提取器。
%----------------------------------------------------------------------------------------
% NEW SECTION 12.2
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{Transformer}
\parinterval 本小节将以Transformer为例,详细地展示自注意力机制在神经机器翻译中的应用。首先回顾一下{\chapterten}介绍的循环神经网络,虽然它很强大,但是也存在一些弊端。其中比较突出的问题是,循环神经网络每个循环单元都有向前依赖性,也就是当前时间步的处理依赖前一时间步处理的结果。这个性质可以使序列的“历史”信息不断被传递,但是也造成模型运行效率的下降。特别是对于自然语言处理任务,序列往往较长,无论是传统的RNN结构,还是更为复杂的LSTM结构,都需要很多次循环单元的处理才能够捕捉到单词之间的长距离依赖。由于需要多个循环单元的处理,距离较远的两个单词之间的信息传递变得很复杂。
\parinterval 针对这些问题,谷歌的研究人员提出了一种全新的模型$\ \dash\ $Transformer\upcite{NIPS2017_7181}。与循环神经网络等传统模型不同,Transformer模型仅仅使用一种被称作自注意力机制的方法和标准的前馈神经网络,完全不依赖任何循环单元或者卷积操作。自注意力机制的优点在于可以直接对序列中任意两个单元之间的关系进行建模,这使得长距离依赖等问题可以更好地被求解。此外,自注意力机制非常适合在GPU 上进行并行化,因此模型训练的速度更快。表\ref{tab:12-11}对比了RNN、CNN、Transformer三种模型的时间复杂度。
%----------------------------------------------
\begin{table}[htp]
\centering
\caption{ RNN、CNN、Transformer的对比\upcite{NIPS2017_7181}$n$表示序列长度,$d$表示隐层大小,$k$表示卷积核大小) }
\label{tab:12-11}
\begin{tabular}{l | l l l}
\rule{0pt}{20pt} Layer Type & \begin{tabular}[l]{@{}l@{}}Complexity\\ per Layer\end{tabular} & \begin{tabular}[l]{@{}l@{}}Sequential\\ Operations\end{tabular} & \begin{tabular}[l]{@{}l@{}}Maximum\\ Path Length\end{tabular} \\ \hline
\rule{0pt}{13pt}Self-Attention &$O(n^2\cdot d)$ &$O(1)$ &$O(1)$ \\
\rule{0pt}{13pt}Recurrent &$O(n \cdot d^2)$ &$O(n)$ &$O(n)$ \\
\rule{0pt}{13pt}Convolutional &$O(k\cdot n \cdot d^2)$ &$O(1)$ &$O(\mathrm{log}_k(n))$
\end{tabular}
\end{table}
%----------------------------------------------
\parinterval Transformer在被提出之后,很快就席卷了整个自然语言处理领域。实际上,Transformer也可以当作一种表示模型,因此也被大量地使用在自然语言处理的其他领域,甚至图像处理和语音处理中也能看到它的影子。比如,目前非常流行的BERT等预训练模型就是基于Transformer。表\ref{tab:12-12}展示了Transformer在WMT英德和英法机器翻译任务上的性能。它能用更少的计算量(FLOPs)达到比其他模型更好的翻译品质\footnote{FLOPs = floating point operations,即浮点运算次数。它是度量模型训练所需计算量的常用单位}。
%----------------------------------------------
\begin{table}[htp]
\centering
\caption{ 不同翻译模型性能对比\upcite{NIPS2017_7181}}
\label{tab:12-12}
\begin{tabular}{l l l l}
\multicolumn{1}{l|}{\multirow{2}{*}{\#}} & \multicolumn{2}{c}{BLEU} & \multirow{2}{*}{\parbox{6em}{Training Cost (FLOPs)}} \\
\multicolumn{1}{l|}{} & EN-DE & EN-FR & \\ \hline
\multicolumn{1}{l|}{GNMT+RL} & 24.6 & 39.92 & 1.4$\times 10^{20}$ \\
\multicolumn{1}{l|}{ConvS2S} & 25.16 & 40.46 & 1.5$\times 10^{20}$ \\
\multicolumn{1}{l|}{MoE} & 26.03 & 40.56 & 1.2$\times 10^{20}$ \\
\multicolumn{1}{l|}{Transformer (Big)} & {\small\sffamily\bfseries{28.4}} & {\small\sffamily\bfseries{41.8}} & 2.3$\times 10^{19}$ \\
\end{tabular}
\end{table}
%----------------------------------------------
\parinterval 注意,Transformer并不简单等同于自注意力机制。Transformer模型还包含了很多优秀的技术,比如:多头注意力、新的训练学习率调整策略等等。这些因素一起组成了真正的Transformer。下面就一起看一看自注意力机制和Transformer是如何工作的。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.1
%----------------------------------------------------------------------------------------
\subsection{Transformer架构}
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-transformer}
\caption{ Transformer结构}
\label{fig:12-39}
\end{figure}
%----------------------------------------------
\parinterval\ref{fig:12-39}展示了经典的Transformer结构。解码器由若干层组成(绿色虚线框就代表一层)。每一层(layer)的输入都是一个向量序列,输出是同样大小的向量序列,而Transformer层的作用是对输入进行进一步的抽象,得到新的表示结果。不过这里的层并不是指单一的神经网络结构,它里面由若干不同的模块组成,包括:
\begin{itemize}
\vspace{0.5em}
\item {\small\sffamily\bfseries{自注意力子层}}\index{自注意力子层}(Self-attention Sub-layer)\index{Self-attention Sub-layer}:使用自注意力机制对输入的序列进行新的表示;
\vspace{0.5em}
\item {\small\sffamily\bfseries{前馈神经网络子层}}\index{前馈神经网络子层}(Feed-forward Sub-layer)\index{Feed-forward Sub-layer}:使用全连接的前馈神经网络对输入向量序列进行进一步变换;
\vspace{0.5em}
\item {\small\sffamily\bfseries{残差连接}}\index{残差连接}(Residual Connection,标记为“Add”)\index{Residual Connection}:对于自注意力子层和前馈神经网络子层,都有一个从输入直接到输出的额外连接,也就是一个跨子层的直连。残差连接可以使深层网络的信息传递更为有效;
\vspace{0.5em}
\item {\small\sffamily\bfseries{层正则化}}\index{层正则化}(Layer Normalization)\index{Layer Normalization}:自注意力子层和前馈神经网络子层进行最终输出之前,会对输出的向量进行层正则化,规范结果向量取值范围,这样易于后面进一步的处理。
\vspace{0.5em}
\end{itemize}
\parinterval 以上操作就构成了Transformer的一层,各个模块执行的顺序可以简单描述为:Self-Attention $\to$ Residual Connection $\to$ Layer Normalization $\to$ Feed Forward Network $\to$ Residual Connection $\to$ Layer Normalization。编码器可以包含多个这样的层,比如,可以构建一个六层编码器,每层都执行上面的操作。最上层的结果作为整个编码的结果,会被传入解码器。
\parinterval 解码器的结构与编码器十分类似。它也是由若干层组成,每一层包含编码器中的所有结构,即:自注意力子层、前馈神经网络子层、残差连接和层正则化模块。此外,为了捕捉源语言的信息,解码器又引入了一个额外的{\small\sffamily\bfseries{编码-解码注意力子层}}\index{编码-解码注意力子层}(Encoder-decoder Attention Sub-layer)\index{Encoder-decoder Attention Sub-layer}。这个新的子层,可以帮助模型使用源语言句子的表示信息生成目标语不同位置的表示。编码-解码注意力子层仍然基于自注意力机制,因此它和自注意力子层的结构是相同的,只是$\mathrm{query}$$\mathrm{key}$$\mathrm{value}$的定义不同。比如,在解码端,自注意力子层的$\mathrm{query}$$\mathrm{key}$$\mathrm{value}$是相同的,它们都等于解码端每个位置的表示。而在编码-解码注意力子层中,$\mathrm{query}$是解码端每个位置的表示,此时$\mathrm{key}$$\mathrm{value}$是相同的,等于编码端每个位置的表示。图\ref{fig:12-40}给出了这两种不同注意力子层输入的区别。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-self-att-vs-enco-deco-att}
\caption{ 注意力模型的输入(自注意力子层 vs 编码-解码注意力子层)}
\label{fig:12-40}
\end{figure}
%----------------------------------------------
\parinterval 此外,编码端和解码端都有输入的词序列。编码端的词序列输入是为了对其进行表示,进而解码端能从编码端访问到源语言句子的全部信息。解码端的词序列输入是为了进行目标语的生成,本质上它和语言模型是一样的,在得到前$n-1$个单词的情况下输出第$n$个单词。除了输入的词序列的词嵌入,Transformer中也引入了位置嵌入,以表示每个位置信息。原因是,自注意力机制没有显性地对位置进行表示,因此也无法考虑词序。在输入中引入位置信息可以让自注意力机制间接地感受到每个词的位置,进而保证对序列表示的合理性。最终,整个模型的输出由一个Softmax层完成,它和循环神经网络中的输出层是完全一样的(\ref{sec:10.3.2}节)。
\parinterval 在进行更详细的介绍前,先利用图\ref{fig:12-39}简单了解一下Transformer模型是如何进行翻译的。首先,Transformer将源语“我\ \ 好”的{\small\bfnew{词嵌入}}\index{词嵌入}(Word Embedding)\index{Word Embedding}融合{\small\bfnew{位置编码}}\index{位置编码}(Position Embedding)\index{Position Embedding}后作为输入。然后,编码器对输入的源语句子进行逐层抽象,得到包含丰富的上下文信息的源语表示并传递给解码器。解码器的每一层,使用自注意力子层对输入解码端的表示进行加工,之后再使用编码-解码注意力子层融合源语句子的表示信息。就这样逐词生成目标语译文单词序列。解码器的每个位置的输入是当前单词(比如,“I”),而这个位置输出是下一个单词(比如,“am”),这个设计和标准的神经语言模型是完全一样的。
\parinterval 了解到这里,可能大家还有很多疑惑,比如,什么是位置编码?Transformer的自注意力机制具体是怎么进行计算的,其结构是怎样的?Add \& LayerNorm又是什么?等等。下面就一一展开介绍。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.2
%----------------------------------------------------------------------------------------
\subsection{位置编码}
\parinterval 在使用循环神经网络进行序列的信息提取时,每个时刻的运算都要依赖前一个时刻的输出,具有一定的时序性,这也与语言具有顺序的特点相契合。而采用自注意力机制对源语言和目标语言序列进行处理时,直接对当前位置和序列中的任意位置进行建模,忽略了词之间的顺序关系,例如图\ref{fig:12-41}中两个语义不同的句子,通过自注意力得到的表示$\tilde{\vectorn{\emph{h}}}$(“机票”)却是相同的。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-calculation-of-context-vector-c}
\caption{“机票”的更进一步抽象表示$\tilde{\vectorn{\emph{h}}}$的计算}
\label{fig:12-41}
\end{figure}
%----------------------------------------------
\parinterval 为了解决这个问题,Transformer在原有的词向量输入基础上引入了位置编码,来表示单词之间的顺序关系。位置编码在Transformer结构中的位置如图\ref{fig:12-42},它是Transformer成功的一个重要因素。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-transformer-input-and-position-encoding}
\caption{Transformer输入与位置编码}
\label{fig:12-42}
\end{figure}
%----------------------------------------------
\parinterval 位置编码的计算方式有很多种,Transformer使用不同频率的正余弦函数:
\begin{eqnarray}
\textrm{PE}(pos,2i) = \textrm{sin} (\frac{pos}{10000^{2i/d_{model}}})
\label{eq:12-43}
\end{eqnarray}
\begin{eqnarray}
\textrm{PE}(pos,2i+1) = \textrm{cos} (\frac{pos}{10000^{2i/d_{model}}})
\label{eq:12-44}
\end{eqnarray}
\noindent 式中PE($\cdot$)表示位置编码的函数,$pos$表示单词的位置,$i$代表位置编码向量中的第几维,$d_{model}$是Transformer的一个基础参数,表示每个位置的隐层大小。因为,正余弦函数的编码各占一半,因此当位置编码的维度为512 时,$i$ 的范围是0-255。 在Transformer中,位置编码的维度和词嵌入向量的维度相同(均为$d_{model}$),模型通过将二者相加作为模型输入,如图\ref{fig:12-43}所示。
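\parinterval 公式\ref{eq:12-43}和公式\ref{eq:12-44}的计算可以用几行NumPy代码来示意(这里的函数名与参数仅为示例性假设,并非某个具体系统的接口):
\begin{verbatim}
import numpy as np

def positional_encoding(max_len, d_model):
    # 返回形状为 (max_len, d_model) 的位置编码矩阵
    pe = np.zeros((max_len, d_model))
    pos = np.arange(max_len)[:, None]         # 位置 pos
    idx = np.arange(0, d_model, 2)            # 即公式中的 2i
    div = np.power(10000.0, idx / d_model)    # 10000^{2i/d_model}
    pe[:, 0::2] = np.sin(pos / div)           # 偶数维使用sin,对应公式(12-43)
    pe[:, 1::2] = np.cos(pos / div)           # 奇数维使用cos,对应公式(12-44)
    return pe

pe = positional_encoding(max_len=100, d_model=512)
# 模型输入 = 词嵌入 + 对应位置的编码,例如:x = emb + pe[:seq_len]
\end{verbatim}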
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-a-combination-of-position-encoding-and-word-encoding}
\caption{位置编码与词编码的组合}
\label{fig:12-43}
\end{figure}
%----------------------------------------------
\parinterval 那么为什么通过这种计算方式可以很好的表示位置信息?有几方面原因。首先,正余弦函数是具有上下界的周期函数,用正余弦函数可将长度不同的序列的位置编码的范围都固定到[-1,1],这样在与词的编码进行相加时,不至于产生太大差距。另外位置编码的不同维度对应不同的正余弦曲线,这为多维的表示空间赋予一定意义。最后,根据三角函数的性质:
\begin{eqnarray}
\textrm{sin}(\alpha + \beta) &=& \textrm{sin}\alpha \cdot \textrm{cos} \beta + \textrm{cos} \alpha \cdot \textrm{sin} \beta \nonumber \\
\textrm{cos}(\alpha + \beta) &=& \textrm{cos} \alpha \cdot \textrm{cos} \beta - \textrm{sin} \alpha \cdot \textrm{sin} \beta
\label{eq:12-45}
\end{eqnarray}
\parinterval 可以得到“$pos+k$”的位置编码为:
\begin{eqnarray}
\textrm{PE}(pos+k,2i) &=& \textrm{PE}(pos,2i) \times \textrm{PE}(k,2i+1) + \nonumber \\
& & \textrm{PE}(pos,2i+1) \times \textrm{PE}(k,2i)\\
\textrm{PE}(pos+k ,2i+1) &=& \textrm{PE}(pos,2i+1) \times \textrm{PE}(k,2i+1) - \nonumber \\
& & \textrm{PE}(pos,2i) \times \textrm{PE}(k,2i)
\label{eq:12-46}
\end{eqnarray}
\noindent 即对于任意固定的偏移量$k$$\textrm{PE}(pos+k)$能被表示成$\textrm{PE}(pos)$的线性函数,换句话说,位置编码可以表示词之间的距离。在实践中发现,位置编码对Transformer系统的性能有很大影响。对其进行改进也会带来进一步的性能提升\upcite{Shaw2018SelfAttentionWR}
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.3
%----------------------------------------------------------------------------------------
\subsection{基于点乘的注意力机制}
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-position-of-self-attention-mechanism-in-the-model}
\caption{自注意力机制在模型中的位置}
\label{fig:12-44}
\end{figure}
%----------------------------------------------
\parinterval Transformer模型摒弃了循环单元和卷积等结构,完全基于注意力机制来构造模型,其中包含着大量的注意力计算。比如,可以通过自注意力机制对源语言和目标语言序列进行信息提取,并通过编码-解码注意力对双语句对之间的关系进行提取。图\ref{fig:12-44}中红色方框部分是Transformer中使用自注意力机制的模块。
\parinterval\ref{sec:12.1.3}节中已经介绍,自注意力机制中至关重要的是获取相关性系数,也就是在融合不同位置的表示向量时各位置的权重。不同于\ref{sec:12.1.3}节介绍的注意力机制的相关性系数计算方式,Transformer模型采用了一种基于点乘的方法来计算相关性系数。这种方法也称为{\small\bfnew{点乘注意力}}\index{点乘注意力}(Scaled Dot-Product Attention)\index{Scaled Dot-Product Attention}机制。它的运算并行度高,同时并不消耗太多的存储空间。
\parinterval 具体来看,在注意力机制的计算过程中,包含三个重要的参数,分别是Query,\\Key和Value。在下面的描述中,分别用$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$$\vectorn{\emph{V}}$对它们进行表示,其中$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$的维度为$L\times d_k$$\vectorn{\emph{V}}$的维度为$L\times d_v$。这里,$L$为序列的长度,$d_k$$d_v$分别表示每个Key和Value的大小,通常设置为$d_k=d_v=d_{model}$
\parinterval 在自注意力机制中,$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$$\vectorn{\emph{V}}$都是相同的,对应着源语言或目标语言的表示。而在编码-解码注意力机制中,由于要对双语之间的信息进行建模,因此,将目标语每个位置的表示视为编码-解码注意力机制的$\vectorn{\emph{Q}}$,源语言句子的表示视为$\vectorn{\emph{K}}$$\vectorn{\emph{V}}$
\parinterval 在得到$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$$\vectorn{\emph{V}}$后,便可以进行注意力机制的运算,这个过程可以被形式化为:
\begin{eqnarray}
\textrm{Attention}(\vectorn{\emph{Q}},\vectorn{\emph{K}},\vectorn{\emph{V}}) = \textrm{Softmax}
( \frac{\vectorn{\emph{Q}}\vectorn{\emph{K}}^{T}} {\sqrt{d_k}} + \vectorn{\emph{Mask}} ) \vectorn{\emph{V}}
\label{eq:12-47}
\end{eqnarray}
\noindent 首先,通过对$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$的转置进行点乘操作,计算得到一个维度大小为$L \times L$的相关性矩阵,即$\vectorn{\emph{Q}}\vectorn{\emph{K}}^{T}$,它表示一个序列上任意两个位置的相关性。再通过系数1/$\sqrt{d_k}$进行放缩操作,放缩可以尽量减少相关性矩阵的方差,具体体现在运算过程中实数矩阵中的数值不会过大,有利于模型训练。
\parinterval 在此基础上,通过对相关性矩阵累加一个掩码矩阵,来屏蔽掉矩阵中的无用信息。比如,在编码端对句子的补齐,在解码端则屏蔽掉未来信息,这一部分内容将在下一小节进行详细介绍。随后,使用Softmax函数对相关性矩阵在行的维度上进行归一化操作,这可以理解为对第$i$行进行归一化,结果对应了$\vectorn{\emph{V}}$中不同位置上向量的注意力权重。对于$\mathrm{value}$的加权求和,可以直接用相关性系数和$\vectorn{\emph{V}}$进行矩阵乘法得到,即$\textrm{Softmax}
( \frac{\vectorn{\emph{Q}}\vectorn{\emph{K}}^{T}} {\sqrt{d_k}} + \vectorn{\emph{Mask}} )$$\vectorn{\emph{V}}$进行矩阵乘。最终得到自注意力的输出,它和输入的$\vectorn{\emph{V}}$的大小是一模一样的。图\ref{fig:12-45}展示了点乘注意力计算的全过程。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-point-product-attention-model}
\caption{点乘注意力模型}
\label{fig:12-45}
\end{figure}
%----------------------------------------------
\parinterval 下面举个简单的例子介绍点乘注意力的具体计算过程。如图\ref{fig:12-46}所示,用黄色、蓝色和橙色的矩阵分别表示$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$$\vectorn{\emph{V}}$$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$$\vectorn{\emph{V}}$中的每一个小格都对应一个单词在模型中的表示(即一个向量)。首先,通过点乘、放缩、掩码等操作得到相关性矩阵,即粉色部分。其次,将得到的中间结果矩阵(粉色)的每一行使用Softmax激活函数进行归一化操作,得到最终的权重矩阵,也就是图中的红色矩阵。红色矩阵中的每一行都对应一个注意力分布。最后,按行对$\vectorn{\emph{V}}$进行加权求和,便得到了每个单词通过点乘注意力机制计算得到的表示。这里面,主要的计算消耗是两次矩阵乘法,即$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}^{T}$的乘法、相关性矩阵和$\vectorn{\emph{V}}$的乘法。这两个操作都可以在GPU上高效地完成,因此可以一次性计算出序列中所有单词之间的注意力权重,并完成所有位置表示的加权求和过程,这样大大提高了模型的计算速度。
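\parinterval 下面给出公式\ref{eq:12-47}的一个NumPy示意实现(这只是一个说明计算流程的草图,函数名与数据均为假设,且为简洁起见省略了批次维度):
\begin{verbatim}
import numpy as np

def dot_product_attention(Q, K, V, mask=None):
    # Q、K形状为 (L, d_k),V形状为 (L, d_v),mask形状为 (L, L)
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)              # 相关性矩阵并放缩,形状 (L, L)
    if mask is not None:
        scores = scores + mask                   # 累加掩码矩阵
    scores = scores - scores.max(axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)  # 按行做Softmax
    return weights @ V                           # 用注意力权重对V加权求和

L, d = 5, 8
Q = K = V = np.random.rand(L, d)                 # 自注意力中Q、K、V相同
print(dot_product_attention(Q, K, V).shape)      # 输出与V的大小相同:(5, 8)
\end{verbatim}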
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-process-of-5}
\caption{\ref{eq:12-47}的执行过程示例}
\label{fig:12-46}
\end{figure}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.4
%----------------------------------------------------------------------------------------
\subsection{掩码操作}
\parinterval 在公式\ref{eq:12-47}中提到了Mask(掩码),它的目的是对向量中某些值进行掩盖,避免无关位置的数值对运算造成影响。Transformer中的Mask主要应用在注意力机制中的相关性系数计算,具体方式是在相关性系数矩阵上累加一个Mask矩阵。该矩阵在需要Mask的位置的值为负无穷-inf(具体实现时是一个绝对值很大的负数,比如-1e9),其余位置为0,这样在进行了Softmax归一化操作之后,被掩码掉的位置计算得到的权重便近似为0,也就是说对无用信息分配的权重为0,从而避免了其对结果产生影响。Transformer包含两种Mask:
\begin{itemize}
\vspace{0.5em}
\item Padding Mask。在批量处理多个样本时(训练或解码),由于要对源语言和目标语言的输入进行批次化处理,而每个批次内序列的长度不一样,为了方便对批次内序列进行矩阵表示,需要进行对齐操作,即在较短的序列后面填充0来占位(padding操作)。而这些填充的位置没有意义,不参与注意力机制的计算,因此,需要进行Mask操作,屏蔽其影响。
\vspace{0.5em}
\item Future Mask。对于解码器来说,由于在预测的时候是自左向右进行的,即第$t$时刻解码器的输出只能依赖于$t$时刻之前的输出。同时,为了保证训练和解码的行为一致,避免在训练过程中观测到目标语端每个位置未来的信息,因此需要对未来信息进行屏蔽。具体的做法是:构造一个上三角值全为-inf的Mask矩阵,也就是说,在解码端计算中,在当前位置,通过Future Mask把序列之后的信息屏蔽掉了,避免了$t$时刻之后的位置对当前的计算产生影响。图\ref{fig:12-47}给出了一个具体的实例,这种Mask矩阵的一种简单构造方式见下面的示例代码。
%----------------------------------------------
% 图3.10
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-mask-instance-for-future-positions-in-transformer}
\caption{Transformer中对于未来位置进行的屏蔽的Mask实例}
\label{fig:12-47}
\end{figure}
%----------------------------------------------
\vspace{0.5em}
\end{itemize}
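\parinterval 作为补充,下面用一小段NumPy代码示意Future Mask矩阵的构造方式(函数名与具体数值均为示例性假设):
\begin{verbatim}
import numpy as np

def future_mask(L, neg=-1e9):
    # 上三角(不含对角线)填充一个绝对值很大的负数,其余位置为0
    return np.triu(np.ones((L, L)), k=1) * neg

print(future_mask(4))
# 每一行对应一个解码位置,该位置之后的列在Softmax之后权重近似为0
\end{verbatim}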
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.5
%----------------------------------------------------------------------------------------
\subsection{多头注意力}
\parinterval Transformer中使用的另一项重要技术是{\small\sffamily\bfseries{多头注意力}}\index{多头注意力}(Multi-head Attention)\index{Multi-head Attention}。“多头”可以理解成将原来的$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$$\vectorn{\emph{V}}$按照隐层维度平均切分成多份。假设切分$h$份,那么最终会得到$\vectorn{\emph{Q}} = \{ \vectorn{\emph{q}}_1, \vectorn{\emph{q}}_2,...,\vectorn{\emph{q}}_h \}$$\vectorn{\emph{K}}=\{ \vectorn{\emph{k}}_1,\vectorn{\emph{k}}_2,...,\vectorn{\emph{k}}_h \}$$\vectorn{\emph{V}}=\{ \vectorn{\emph{v}}_1, \vectorn{\emph{v}}_2,...,\vectorn{\emph{v}}_h \}$。多头注意力机制就是用每一个切分得到的$\vectorn{\emph{q}}$$\vectorn{\emph{k}}$$\vectorn{\emph{v}}$独立的进行注意力计算。即第$i$个头的注意力计算结果$\vectorn{\emph{head}}_i = \textrm{Attention}(\vectorn{\emph{q}}_i,\vectorn{\emph{k}}_i, \vectorn{\emph{v}}_i)$
\parinterval 下面根据图\ref{fig:12-48}详细介绍多头注意力的计算过程:
\begin{itemize}
\vspace{0.5em}
\item 首先将$\vectorn{\emph{Q}}$$\vectorn{\emph{K}}$$\vectorn{\emph{V}}$分别通过线性变换的方式映射为$h$个子集(机器翻译任务中,$h$一般为8)。即$\vectorn{\emph{q}}_i = \vectorn{\emph{Q}}\vectorn{\emph{W}}_i^Q $$\vectorn{\emph{k}}_i = \vectorn{\emph{K}}\vectorn{\emph{W}}_i^K $$\vectorn{\emph{v}}_i = \vectorn{\emph{V}}\vectorn{\emph{W}}_i^V $,其中$i$表示第$i$个头, $\vectorn{\emph{W}}_i^Q \in \mathbb{R}^{d_{model} \times d_k}$, $\vectorn{\emph{W}}_i^K \in \mathbb{R}^{d_{model} \times d_k}$, $\vectorn{\emph{W}}_i^V \in \mathbb{R}^{d_{model} \times d_v}$是参数矩阵; $d_k=d_v=d_{model} / h$,对于不同的头采用不同的变换矩阵,这里$d_{model}$是Transformer的一个参数,表示每个隐层向量的维度;
\vspace{0.5em}
\item 其次对每个头分别执行点乘注意力操作,并得到每个头的注意力操作的输出$\vectorn{\emph{head}}_i$
\vspace{0.5em}
\item 最后将$h$个头的注意力输出在最后一维$d_v$进行拼接(Concat)重新得到维度为$h \times d_v$的输出,并通过对其左乘一个权重矩阵$\vectorn{\emph{W}}^o$进行线性变换,从而对多头计算得到的信息进行融合,且将多头注意力输出的维度映射为模型的隐层大小(即$d_{model}$),这里参数矩阵$\vectorn{\emph{W}}^o \in \mathbb{R}^{h \times d_v \times d_{model}}$
\vspace{0.5em}
\end{itemize}
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-multi-head-attention-model}
\caption{多头注意力模型}
\label{fig:12-48}
\end{figure}
%----------------------------------------------
\parinterval 多头机制具体的计算公式如下:
\begin{eqnarray}
\textrm{MultiHead}(\vectorn{\emph{Q}}, \vectorn{\emph{K}} , \vectorn{\emph{V}})& = & \textrm{Concat} (\vectorn{\emph{head}}_1, ... , \vectorn{\emph{head}}_h ) \vectorn{\emph{W}}^o \label{eq:12-48} \\
\vectorn{\emph{head}}_i & = &\textrm{Attention} (\vectorn{\emph{Q}}\vectorn{\emph{W}}_i^Q , \vectorn{\emph{K}}\vectorn{\emph{W}}_i^K , \vectorn{\emph{V}}\vectorn{\emph{W}}_i^V )
\label{eq:12-49}
\end{eqnarray}
\parinterval 多头机制的好处是允许模型在不同的表示子空间里学习。在很多实验中发现,不同表示空间的头捕获的信息是不同的,比如,在使用Transformer处理自然语言时,有的头可以捕捉句法信息,有的头可以捕捉词法信息。
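\parinterval 结合公式\ref{eq:12-48}和公式\ref{eq:12-49},多头注意力的计算过程可以用如下NumPy代码来示意。这只是一个按头循环的慢速草图,参数均为随机构造;实际系统通常会用批量矩阵运算一次性完成所有头的计算:
\begin{verbatim}
import numpy as np

def softmax(x):
    x = x - x.max(axis=-1, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=-1, keepdims=True)

def multi_head_attention(Q, K, V, W_q, W_k, W_v, W_o):
    h = len(W_q)                                  # 头的个数
    d_k = W_q[0].shape[1]
    heads = []
    for i in range(h):
        q, k, v = Q @ W_q[i], K @ W_k[i], V @ W_v[i]       # 映射到第i个子空间
        heads.append(softmax(q @ k.T / np.sqrt(d_k)) @ v)  # 第i个头的输出 head_i
    concat = np.concatenate(heads, axis=-1)       # 拼接,形状 (L, h*d_k)
    return concat @ W_o                           # 融合多头信息并映射回 d_model

L, d_model, h = 5, 512, 8
d_k = d_model // h
X = np.random.rand(L, d_model)
W_q = [np.random.rand(d_model, d_k) for _ in range(h)]
W_k = [np.random.rand(d_model, d_k) for _ in range(h)]
W_v = [np.random.rand(d_model, d_k) for _ in range(h)]
W_o = np.random.rand(h * d_k, d_model)
print(multi_head_attention(X, X, X, W_q, W_k, W_v, W_o).shape)   # (5, 512)
\end{verbatim}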
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.6
%----------------------------------------------------------------------------------------
\subsection{残差网络和层正则化}
\parinterval Transformer编码器、解码器分别由多层网络组成(通常为6层),每层网络又包含多个子层(自注意力网络、前馈神经网络)。因此Transformer实际上是一个很深的网络结构。再加上前面介绍的点乘注意力机制,包含很多线性和非线性变换;另外,注意力函数Attention($\cdot$)的计算也涉及多层网络,整个网络的信息传递非常复杂。从反向传播的角度来看,每次回传的梯度都会经过若干步骤,容易产生梯度爆炸或者消失。
\parinterval 解决这个问题的一种办法就是使用{\small\sffamily\bfseries{残差连接}}\index{残差连接}\upcite{DBLP:journals/corr/HeZRS15}。残差连接是一种用来训练深层网络的技术,其结构如图\ref{fig:12-49},即在子层之前通过增加直接连接的方式,将底层信息直接传递给上层。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-residual-network-structure}
\caption{残差网络结构}
\label{fig:12-49}
\end{figure}
%----------------------------------------------
\parinterval 残差连接从广义上讲也叫{\small\bfnew{短连接}}\index{短连接}(Short-cut Connection)\index{Short-cut Connection},指的是这种短距离的连接。它的思想很简单,就是把层和层之间的距离拉近。如图\ref{fig:12-49}所示,子层1通过残差连接跳过了子层2,直接和子层3进行信息传递。使信息传递变得更高效,有效解决了深层网络训练过程中容易出现的梯度消失/爆炸问题,使得深层网络的训练更加容易。其计算公式为:
\begin{eqnarray}
x_{l+1} = x_l + \mathcal{F} (x_l)
\label{eq:12-50}
\end{eqnarray}
\noindent 其中$x_l$表示第$l$层的输入,$\mathcal{F} (x_l)$是子层运算。如果$l=2$,那么公式\ref{eq:12-50}可以解释为:第3层的输入$x_3$等于第2层的输入$x_2$加上第2层的子层运算结果$\mathcal{F} (x_2)$。图\ref{fig:12-50}中的红色方框展示了Transformer中残差连接的位置。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-position-of-difference-and-layer-regularization-in-the-model}
\caption{残差和层正则化在模型中的位置}
\label{fig:12-50}
\end{figure}
%----------------------------------------------
\parinterval 在Transformer的训练过程中,由于引入了残差操作,将前面所有层的输出加到一起。这样会导致不同层(或子层)的结果之间的差异性很大,造成训练过程不稳定、训练时间较长。为了避免这种情况,在每层中加入了层正则化操作\upcite{Ba2016LayerN}。层正则化的计算公式如下:
\begin{eqnarray}
\textrm{LN}(x) = g \cdot \frac{x- \mu} {\sigma} + b
\label{eq:12-51}
\end{eqnarray}
\noindent 该公式使用均值$\mu$和方差$\sigma$对样本进行平移缩放,将数据规范化为均值为0,方差为1的标准分布。$g$$b$是可学习的参数。
\parinterval 在Transformer中经常使用的层正则化操作有两种结构,分别是{\small\bfnew{后正则化}}\index{后正则化}(Post-norm)\index{Post-norm}{\small\bfnew{前正则化}}\index{前正则化}(Pre-norm)\index{Pre-norm},结构如图\ref{fig:12-51}所示。后正则化中先进行残差连接再进行层正则化,而前正则化则是在子层输入之前进行层正则化操作。在很多实践中已经发现,前正则化的方式更有利于信息传递,因此适合训练深层的Transformer模型\upcite{WangLearning}
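\parinterval 层正则化(公式\ref{eq:12-51})以及后正则化、前正则化两种结构可以用如下代码来示意。这里用一个简单的函数代替真实的子层运算,仅为概念性说明,并非实际系统的实现:
\begin{verbatim}
import numpy as np

def layer_norm(x, g=1.0, b=0.0, eps=1e-6):
    # 公式(12-51):用均值和方差对最后一维做平移缩放
    mu = x.mean(axis=-1, keepdims=True)
    sigma = x.std(axis=-1, keepdims=True)
    return g * (x - mu) / (sigma + eps) + b

def post_norm(x, f):
    return layer_norm(x + f(x))       # 后正则化:先残差连接,再层正则化

def pre_norm(x, f):
    return x + f(layer_norm(x))       # 前正则化:先层正则化,再子层运算和残差连接

x = np.random.rand(5, 8)
sublayer = lambda t: 0.5 * t          # 用一个简单变换代替子层运算 F(x)
print(post_norm(x, sublayer).shape, pre_norm(x, sublayer).shape)
\end{verbatim}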
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-different-regularization-methods}
\caption{不同正则化方式 }
\label{fig:12-51}
\end{figure}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.7
%----------------------------------------------------------------------------------------
\subsection{前馈全连接网络子层}
\parinterval 在Transformer的结构中,每一个编码层或者解码层中都包含一个前馈神经网络,它在模型中的位置如图\ref{fig:12-52}中红色方框所示。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-position-of-feedforward-neural-network-in-the-model}
\caption{前馈神经网络在模型中的位置}
\label{fig:12-52}
\end{figure}
%----------------------------------------------
\parinterval Transformer使用了全连接网络。全连接网络的作用主要体现在将经过注意力操作之后的表示映射到新的空间中,新的空间会有利于接下来的非线性变换等操作。实验证明,去掉全连接网络会对模型的性能造成影响。Transformer的全连接前馈神经网络包含两次线性变换和一次非线性变换(ReLU激活函数:ReLU$(x)=\textrm{max}(0,x)$),每层的前馈神经网络参数不共享,计算公式如下:
\begin{eqnarray}
\textrm{FFN}(\vectorn{\emph{x}}) = \textrm{max} (0,\vectorn{\emph{x}}\vectorn{\emph{W}}_1 + \vectorn{\emph{b}}_1)\vectorn{\emph{W}}_2 + \vectorn{\emph{b}}_2
\label{eq:12-52}
\end{eqnarray}
\noindent 其中,$\vectorn{\emph{W}}_1$$\vectorn{\emph{W}}_2$$\vectorn{\emph{b}}_1$$\vectorn{\emph{b}}_2$为模型的参数。通常情况下,前馈神经网络的隐层维度要比注意力部分的隐层维度大,而且研究人员发现这种设置对Transformer是至关重要的。 比如,注意力部分的隐层维度为512,前馈神经网络部分的隐层维度为2048。当然,继续增大前馈神经网络的隐层大小,比如设为4096,甚至8192,还可以带来性能的增益,但是前馈部分的存储消耗较大,需要更大规模GPU 设备的支持。因此在具体实现时,往往需要在翻译准确性和存储/速度之间找到一个平衡。
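\parinterval 公式\ref{eq:12-52}对应的前馈神经网络非常容易实现,下面给出一个NumPy草图(参数为随机构造,维度取正文中提到的512和2048,仅为示意):
\begin{verbatim}
import numpy as np

def feed_forward(x, W1, b1, W2, b2):
    # 两次线性变换,中间使用ReLU激活:max(0, xW1+b1)W2+b2
    return np.maximum(0, x @ W1 + b1) @ W2 + b2

d_model, d_ff = 512, 2048                      # 前馈部分的隐层通常大于注意力部分
x = np.random.rand(5, d_model)
W1, b1 = np.random.rand(d_model, d_ff), np.zeros(d_ff)
W2, b2 = np.random.rand(d_ff, d_model), np.zeros(d_model)
print(feed_forward(x, W1, b1, W2, b2).shape)   # (5, 512)
\end{verbatim}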
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.8
%----------------------------------------------------------------------------------------
\subsection{训练}
\parinterval 与前面介绍的神经机器翻译模型的训练一样,Transformer的训练流程为:首先对模型进行初始化,然后在编码器输入包含结束符的源语言单词序列。前面已经介绍过,解码端每个位置单词的预测都要依赖已经生成的序列。在解码端输入包含起始符号的目标语序列,通过起始符号预测目标语的第一个单词,用真实的目标语的第一个单词去预测第二个单词,以此类推,然后用真实的目标语序列和预测的结果比较,计算它的损失。Transformer使用了{\small\bfnew{交叉熵损失}}\index{交叉熵损失}(Cross Entropy Loss)\index{Cross Entropy Loss}函数,损失越小说明模型的预测越接近真实输出。然后利用反向传播来调整模型中的参数。由于Transformer 将任意时刻输入信息之间的距离拉近为1,摒弃了RNN中每一个时刻的计算都要基于前一时刻的计算这种具有时序性的训练方式,因此Transformer中训练的不同位置可以并行化训练,大大提高了训练效率。
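\parinterval 这里的交叉熵损失可以用一小段代码来说明。下面的预测分布是随意构造的玩具数据,仅用于展示损失的计算方式,真实系统中预测分布由模型的Softmax层输出:
\begin{verbatim}
import numpy as np

def cross_entropy_loss(probs, target_ids):
    # probs: 每个目标语位置上的预测分布,形状 (n, V)
    # target_ids: 每个位置的真实答案(词表中的编号),形状 (n,)
    picked = probs[np.arange(len(target_ids)), target_ids]
    return -np.mean(np.log(picked + 1e-9))     # 预测越接近真实答案,损失越小

probs = np.array([[0.7, 0.1, 0.1, 0.05, 0.05],
                  [0.1, 0.6, 0.1, 0.1, 0.1],
                  [0.2, 0.2, 0.4, 0.1, 0.1]])
print(cross_entropy_loss(probs, np.array([0, 1, 2])))
\end{verbatim}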
%----------------------------------------------
%\begin{figure}[htp]
%\centering
%\input{./Chapter12/Figures/figure-structure-of-the-network-during-transformer-training}
%\caption{Transformer训练时网络的结构}
%\label{fig:12-53}
%\end{figure}
%----------------------------------------------
\parinterval 需要注意的是,Transformer也包含很多工程方面的技巧。首先,在训练优化器方面,需要注意以下几点:
\begin{itemize}
\vspace{0.5em}
\item Transformer使用Adam优化器优化参数,并设置$\beta_1=0.9$$\beta_2=0.98$$\epsilon=10^{-9}$
\item Transformer在学习率中同样应用了学习率{\small\bfnew{预热}}\index{预热}(Warmup)\index{Warmup}策略,其计算公式如下:
\begin{eqnarray}
lrate = d_{model}^{-0.5} \cdot \textrm{min} (step^{-0.5} , step \cdot warmup\_steps^{-1.5})
\label{eq:12-53}
\end{eqnarray}
\vspace{0.5em}
其中,$step$表示更新的次数(或步数)。通常设置网络更新的前4000步为预热阶段,即$warmup\_steps=4000$。Transformer的学习率曲线如图\ref{fig:12-54}所示。在训练初期,学习率从一个较小的初始值逐渐增大(线性增长),当到达一定的步数,学习率再逐渐减小。这样做可以减缓在训练初期的不稳定现象,同时在模型达到相对稳定之后,通过逐渐减小的学习率让模型进行更细致的调整。这种学习率的调整方法是Transformer的一个很大的工程贡献,其计算方式的一个简单示例见下面的代码。
\vspace{0.5em}
\end{itemize}
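\parinterval 下面是公式\ref{eq:12-53}的一个简单实现示例(函数名为示例性假设):
\begin{verbatim}
def transformer_lrate(step, d_model=512, warmup_steps=4000):
    # 预热阶段学习率线性增长,之后按 step^{-0.5} 衰减
    step = max(step, 1)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for step in (100, 1000, 4000, 20000, 100000):
    print(step, round(transformer_lrate(step), 6))
\end{verbatim}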
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-lrate-of-transformer}
\caption{Transformer模型的学习率曲线}
\label{fig:12-54}
\end{figure}
%----------------------------------------------
\parinterval 另外,Transformer为了提高模型训练的效率和性能,还进行了以下几方面的操作:
\begin{itemize}
\vspace{0.5em}
\item {\small\bfnew{小批量训练}}\index{小批量训练}(Mini-batch Training)\index{Mini-batch Training}:每次使用一定数量的样本进行训练,即每次从样本中选择一小部分数据进行训练。这种方法的收敛较快,同时易于提高设备的利用率。批次大小通常设置为2048/4096(token数即每个批次中的单词个数)。每一个批次中的句子并不是随机选择的,模型通常会根据句子长度进行排序,选取长度相近的句子组成一个批次。这样做可以减少padding数量,提高训练效率,如图\ref{fig:12-55}
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-comparison-of-the-number-of-padding-in-batch}
\caption{batch中padding数量对比(白色部分为padding)}
\label{fig:12-55}
\end{figure}
%----------------------------------------------
\vspace{0.5em}
\item {\small\bfnew{Dropout}}\index{Dropout}:由于Transformer模型网络结构较为复杂,会导致过度拟合训练数据,从而对未见数据的预测结果变差。这种现象也被称作{\small\sffamily\bfseries{过拟合}}\index{过拟合}(Over Fitting)\index{Over fitting}。为了避免这种现象,Transformer加入了Dropout操作\upcite{JMLR:v15:srivastava14a}。Transformer中这四个地方用到了Dropout:词嵌入和位置编码、残差连接、注意力操作和前馈神经网络。Dropout比例通常设置为$0.1$
\vspace{0.5em}
\item {\small\bfnew{标签平滑}}\index{标签平滑}(Label Smoothing)\index{Label Smoothing}:在计算损失的过程中,需要用预测概率去拟合真实概率。在分类任务中,往往使用One-hot向量代表真实概率,即真实答案位置那一维对应的概率为1,其余维为0,而拟合这种概率分布会造成两个问题:1)无法保证模型的泛化能力,容易造成过拟合;2) 1和0概率鼓励所属类别和其他类别之间的差距尽可能加大,会造成模型过于相信预测的类别。因此Transformer里引入标签平滑\upcite{Szegedy_2016_CVPR}来缓解这种现象,简单地说就是给正确答案以外的类别分配一定的概率,而不是采用非0即1的概率。这样,可以学习一个比较平滑的概率分布,从而提升泛化能力。这种做法的一个简单实现示例见下面的代码。
\vspace{0.5em}
\end{itemize}
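\parinterval 标签平滑的计算方式十分简单,下面给出一种常见实现方式的示意代码(平滑系数取0.1,数据为玩具示例):
\begin{verbatim}
import numpy as np

def smooth_label(one_hot, epsilon=0.1):
    # 将One-hot分布与均匀分布按 (1-epsilon):epsilon 的比例混合
    V = one_hot.shape[-1]
    return (1.0 - epsilon) * one_hot + epsilon / V

y = np.zeros(8)
y[3] = 1.0                   # 正确答案在第4维
print(smooth_label(y))       # 正确答案的概率略小于1,其余类别分到少量概率
\end{verbatim}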
\parinterval 不同的Transformer可以适应不同的任务,常见的Transformer模型有Transformer Base、Transformer Big和Transformer Deep\upcite{NIPS2017_7181,WangLearning},具体设置如下:
\begin{itemize}
\vspace{0.5em}
\item Transformer Base:标准的Transformer结构,编码器和解码器均包含6层,隐层维度为512,前馈神经网络维度为2048,多头注意力机制为8头,Dropout设为0.1。
\vspace{0.5em}
\item Transformer Big:为了提升网络的容量,使用更宽的网络。在Base的基础上增大隐层维度至1024,前馈神经网络的维度变为4096,多头注意力机制为16头,Dropout设为0.3。
\vspace{0.5em}
\item Transformer Deep:加深编码器网络层数可以进一步提升网络的性能,它的参数设置与Transformer Base基本一致,但是层数增加到48层,同时使用Pre-Norm作为层正则化的结构。
\vspace{0.5em}
\end{itemize}
\parinterval 在WMT'16数据上的实验对比如表\ref{tab:12-13}所示。可以看出,Transformer Base的BLEU得分虽不如另外两种模型,但其参数量是最少的。而Transformer Deep的性能整体好于Transformer Big。
%----------------------------------------------
\begin{table}[htp]
\centering
\caption{三种Transformer模型的对比}
\label{tab:12-13}
\begin{tabular}{l | l l l}
\multirow{2}{*}{系统} & \multicolumn{2}{c}{BLEU[\%]} & \# of \\
& EN-DE & EN-FR & params \\ \hline
Transformer Base & 27.3 & 38.1 & 65$\times 10^{6}$ \\
Transformer Big & 28.4 & 41.8 & 213$\times 10^{6}$ \\
Transformer Deep(48层) & 30.2 & 43.1 & 194$\times 10^{6}$ \\
\end{tabular}
\end{table}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION 12.2.9
%----------------------------------------------------------------------------------------
\subsection{推断}
\parinterval Transformer解码器生成目标语的过程和前面介绍的循环网络翻译模型类似,都是从左往右生成,且下一个单词的预测依赖已经生成的上一个单词。其具体推断过程如图\ref{fig:12-56}所示,其中$\vectorn{\emph{C}}_i$是编码-解码注意力的结果,解码器首先根据“<eos>”和$\vectorn{\emph{C}}_1$生成第一个单词“how”,然后根据“how”和$\vectorn{\emph{C}}_2$生成第二个单词“are”,以此类推,当解码器生成“<eos>”时结束推断。
\parinterval 但是,Transformer在推断阶段无法对所有位置进行并行化操作,因为对于每一个目标语单词都需要对前面所有单词进行注意力操作,因此它推断速度非常慢。可以采用的加速手段有:低精度\upcite{DBLP:journals/corr/CourbariauxB16}、Cache(缓存需要重复计算的变量)\upcite{DBLP:journals/corr/abs-1805-00631}、共享注意力网络等\upcite{Xiao2019SharingAW}。关于模型的推断技术将会在{\chapterfourteen}进一步深入介绍。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter12/Figures/figure-decode-of-transformer}
\caption{Transformer推断过程示例}
\label{fig:12-56}
\end{figure}
%----------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SECTION 12.3
%----------------------------------------------------------------------------------------
\section{小结及深入阅读}
\parinterval 注意力机制的使用是机器翻译乃至整个自然语言处理近几年获得成功的重要因素之一\upcite{Liu_2019_CVPR,DBLP:journals/corr/abs-1811-00498,MoradiInterrogating}。早期,有研究者尝试将注意力机制和统计机器翻译的词对齐进行统一\upcite{WangNeural}。近两年,也有研究发现注意力模型可以捕捉一些语言现象\upcite{DBLP:journals/corr/abs-1905-09418}。比如,在Transformer的多头注意力中,不同头往往会捕捉到不同的信息:有些头对低频词更加敏感,有些头更适合词意消歧,甚至有些头可以捕捉句法信息。此外,由于注意力机制增加了模型的复杂性,而且随着网络层数的增多,神经机器翻译中也存在大量的冗余,因此研发轻量的注意力模型也是具有实践意义的方向\upcite{Xiao2019SharingAW}。
\ No newline at end of file
......@@ -3999,13 +3999,13 @@
//biburl = {https://dblp.org/rec/journals/ijufks/Hochreiter98.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{BENGIO1994Learning,
author ={Y. {Bengio} and P. {Simard} and P. {Frasconi}},
journal ={IEEE Transactions on Neural Networks},
title ={Learning long-term dependencies with gradient descent is difficult},
year ={1994},
volume ={5},
number ={2},
pages ={157-166},
}
@inproceedings{NIPS2017_7181,
......@@ -4526,7 +4526,7 @@ pages ={157-166},
November 3-7, 2019},
//publisher = {Association for Computational Linguistics},
pages = {1061--1070},
year = {2019},
  year      = {2019},
//url = {https://doi.org/10.18653/v1/D19-1098},
//doi = {10.18653/v1/D19-1098},
//timestamp = {Thu, 12 Dec 2019 13:23:46 +0100},
......@@ -4627,28 +4627,7 @@ pages ={157-166},
//biburl = {https://dblp.org/rec/conf/acl/LiLLMS19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Zhang2017PriorKI,
author = {Jiacheng Zhang and
Yang Liu and
Huanbo Luan and
Jingfang Xu and
Maosong Sun},
//editor = {Regina Barzilay and
Min-Yen Kan},
title = {Prior Knowledge Integration for Neural Machine Translation using Posterior
Regularization},
publisher = {Proceedings of the 55th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2017, Vancouver, Canada, July 30 - August 4, Volume
1: Long Papers},
pages = {1514--1523},
//publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/P17-1139},
//doi = {10.18653/v1/P17-1139},
//timestamp = {Tue, 20 Aug 2019 11:59:06 +0200},
//biburl = {https://dblp.org/rec/conf/acl/ZhangLLXS17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Werlen2018DocumentLevelNM,
author = {Lesly Miculicich Werlen and
Dhananjay Ram and
......@@ -4711,21 +4690,7 @@ pages ={157-166},
//biburl = {https://dblp.org/rec/journals/corr/abs-1906-00532.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/CourbariauxB16,
author = {Matthieu Courbariaux and
Yoshua Bengio},
title = {BinaryNet: Training Deep Neural Networks with Weights and Activations
Constrained to +1 or -1},
journal = {CoRR},
volume = {abs/1602.02830},
year = {2016},
//url = {http://arxiv.org/abs/1602.02830},
//archivePrefix = {arXiv},
//eprint = {1602.02830},
//timestamp = {Mon, 13 Aug 2018 16:46:57 +0200},
//biburl = {https://dblp.org/rec/journals/corr/CourbariauxB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Zhang2018SpeedingUN,
author = {Wen Zhang and
Liang Huang and
......@@ -4800,28 +4765,31 @@ pages ={157-166},
//biburl = {https://dblp.org/rec/journals/corr/HintonVD15.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Sun2019PatientKD,
author = {Siqi Sun and
Yu Cheng and
Zhe Gan and
Jingjing Liu},
//editor = {Kentaro Inui and
Jing Jiang and
Vincent Ng and
Xiaojun Wan},
title = {Patient Knowledge Distillation for {BERT} Model Compression},
publisher = {Proceedings of the 2019 Conference on Empirical Methods in Natural
Language Processing and the 9th International Joint Conference on
Natural Language Processing, {EMNLP-IJCNLP} 2019, Hong Kong, China,
November 3-7, 2019},
pages = {4322--4331},
//publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/D19-1441},
//doi = {10.18653/v1/D19-1441},
//timestamp = {Mon, 06 Apr 2020 14:36:31 +0200},
//biburl = {https://dblp.org/rec/conf/emnlp/SunCGL19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Ott2018ScalingNM,
title = {Scaling Neural Machine Translation},
author = {Myle Ott and Sergey Edunov and David Grangier and Michael Auli},
publisher = {Workshop on Machine Translation},
year = {2018}
}
@inproceedings{Lin2020TowardsF8,
title = {Towards Fully 8-bit Integer Inference for the Transformer Model},
author = {Y. Lin and Yanyang Li and Tengbo Liu and Tong Xiao and T. Liu and Jingbo Zhu},
publisher = {International Joint Conference on Artificial Intelligence},
year = {2020}
}
@inproceedings{kim-rush-2016-sequence,
title = {Sequence-Level Knowledge Distillation},
author = {Kim, Yoon and Rush, Alexander M.},
publisher = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
month = nov,
year = {2016},
//address = {Austin, Texas},
//publisher = {Association for Computational Linguistics},
//url = {https://www.aclweb.org/anthology/D16-1139},
//doi = {10.18653/v1/D16-1139},
pages = {1317--1327},
}
%%%%% chapter 10------------------------------------------------------
......@@ -4835,6 +4803,138 @@ pages ={157-166},
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 12------------------------------------------------------
@inproceedings{DBLP:journals/corr/LinFSYXZB17,
author = {Zhouhan Lin and
Minwei Feng and
C{\'{\i}}cero Nogueira dos Santos and
Mo Yu and
Bing Xiang and
Bowen Zhou and
Yoshua Bengio},
title = {A Structured Self-Attentive Sentence Embedding},
publisher = {5th International Conference on Learning Representations, {ICLR} 2017,
Toulon, France, April 24-26, 2017, Conference Track Proceedings},
//publisher = {OpenReview.net},
year = {2017},
//url = {https://openreview.net/forum?id=BJC\_jUqxe},
//timestamp = {Thu, 25 Jul 2019 14:25:44 +0200},
//biburl = {https://dblp.org/rec/conf/iclr/LinFSYXZB17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Shaw2018SelfAttentionWR,
author = {Peter Shaw and
Jakob Uszkoreit and
Ashish Vaswani},
//editor = {Marilyn A. Walker and
Heng Ji and
Amanda Stent},
title = {Self-Attention with Relative Position Representations},
publisher = {Proceedings of the 2018 Conference of the North American Chapter of
the Association for Computational Linguistics: Human Language Technologies,
NAACL-HLT, New Orleans, Louisiana, USA, June 1-6, 2018, Volume 2 (Short
Papers)},
pages = {464--468},
//publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://doi.org/10.18653/v1/n18-2074},
//doi = {10.18653/v1/n18-2074},
//timestamp = {Tue, 28 Jan 2020 10:30:17 +0100},
//biburl = {https://dblp.org/rec/conf/naacl/ShawUV18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:journals/corr/HeZRS15,
author = {Kaiming He and
Xiangyu Zhang and
Shaoqing Ren and
Jian Sun},
title = {Deep Residual Learning for Image Recognition},
publisher = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
pages = {770--778},
//publisher = {{IEEE} Computer Society},
year = {2016},
//url = {https://doi.org/10.1109/CVPR.2016.90},
//doi = {10.1109/CVPR.2016.90},
//timestamp = {Wed, 16 Oct 2019 14:14:50 +0200},
//biburl = {https://dblp.org/rec/conf/cvpr/HeZRS16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Ba2016LayerN,
author = {Lei Jimmy Ba and
Jamie Ryan Kiros and
Geoffrey E. Hinton},
title = {Layer Normalization},
journal = {CoRR},
volume = {abs/1607.06450},
year = {2016},
//url = {http://arxiv.org/abs/1607.06450},
//archivePrefix = {arXiv},
//eprint = {1607.06450},
//timestamp = {Tue, 23 Jul 2019 17:33:23 +0200},
//biburl = {https://dblp.org/rec/journals/corr/BaKH16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{JMLR:v15:srivastava14a,
author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
journal = {Journal of Machine Learning Research},
year = {2014},
volume = {15},
pages = {1929-1958},
//url = {http://jmlr.org/papers/v15/srivastava14a.html}
}
@inproceedings{Szegedy_2016_CVPR,
author = {Christian Szegedy and
Vincent Vanhoucke and
Sergey Ioffe and
Jonathon Shlens and
Zbigniew Wojna},
title = {Rethinking the Inception Architecture for Computer Vision},
publisher = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
pages = {2818--2826},
//publisher = {{IEEE} Computer Society},
year = {2016},
//url = {https://doi.org/10.1109/CVPR.2016.308},
//doi = {10.1109/CVPR.2016.308},
//timestamp = {Wed, 16 Oct 2019 14:14:50 +0200},
//biburl = {https://dblp.org/rec/conf/cvpr/SzegedyVISW16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:journals/corr/abs-1805-00631,
author = {Biao Zhang and
Deyi Xiong and
Jinsong Su},
//editor = {Iryna Gurevych and
Yusuke Miyao},
title = {Accelerating Neural Transformer via an Average Attention Network},
publisher = {Proceedings of the 56th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2018, Melbourne, Australia, July 15-20, 2018, Volume
1: Long Papers},
pages = {1789--1798},
//publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://www.aclweb.org/anthology/P18-1166/},
//doi = {10.18653/v1/P18-1166},
//timestamp = {Mon, 16 Sep 2019 13:46:41 +0200},
//biburl = {https://dblp.org/rec/conf/acl/XiongZS18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/CourbariauxB16,
author = {Matthieu Courbariaux and
Yoshua Bengio},
title = {BinaryNet: Training Deep Neural Networks with Weights and Activations
Constrained to +1 or -1},
journal = {CoRR},
volume = {abs/1602.02830},
year = {2016},
//url = {http://arxiv.org/abs/1602.02830},
//archivePrefix = {arXiv},
//eprint = {1602.02830},
//timestamp = {Mon, 13 Aug 2018 16:46:57 +0200},
//biburl = {https://dblp.org/rec/journals/corr/CourbariauxB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
%%%%% chapter 12------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......