Commit 09af97e2 by zhoutao

Merge branch 'caorunzhe' into 'zhoutao'

Caorunzhe2zhoutao-5-19

See merge request !221
parents 2c4d575b 36e4ae9c
@@ -2,7 +2,8 @@
 \definecolor{ugreen}{rgb}{0,0.5,0}
 \begin{tikzpicture}
 \node [thick,rounded corners=3mm,draw=blue!50!black!50,top color=white,bottom color=blue!50!black!20] (s) at (0,0) {输入文字};
-\node [thick,minimum width=6em,draw=red!50!black!50,top color=white,bottom color=red!50!black!20,anchor=west] (mt) at ([xshift=0.5in]s.east) {\begin{tabular}{c}机器翻译\\???\end{tabular}};
+\node [thick,minimum width=6em,minimum height=3em,draw=red!50!black!50,anchor=west] (mt) at ([xshift=0.5in]s.east) {};
+\node [thick,minimum width=6em,minimum height=1.5em,draw=red!50!black!50,top color=white,bottom color=red!50!black!20,anchor=north] (mt1) at (mt.north) {机器翻译};
 \node [thick,rounded corners=3mm,draw=blue!50!black!50,top color=white,bottom color=blue!50!black!20,anchor=west] (t) at ([xshift=0.5in]mt.east) {输出译文};
 {
 \node [draw=ugreen,fill=ugreen,minimum width=3.1em,minimum height=1.5em,anchor=south west] (part1) at (mt.south west) {{\color{white} 1}};
...
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode
 %----------------------------------------------------------------------------------------
+% 机器翻译:统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
 %----------------------------------------------------------------------------------------
 % CONFIGURATIONS
 %----------------------------------------------------------------------------------------
@@ -188,7 +196,7 @@
 \includegraphics[scale=0.3]{./Chapter1/Figures/figure-wmt-participation.jpg}
 \includegraphics[scale=0.3]{./Chapter1/Figures/figure-wmt-bestresults.jpg}
 \setlength{\belowcaptionskip}{-1.5em}
-\caption{国际机器翻译大赛(左:WMT19参赛队伍;右:WMT19最终个项目最好分数结果)}
+\caption{国际机器翻译大赛(左:WMT19参赛队伍;右:WMT19各项目的最好分数结果)}
 \label{fig:1-6}
 \end{figure}
 %-------------------------------------------
...
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode
 %----------------------------------------------------------------------------------------
+% 机器翻译:统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+%----------------------------------------------------------------------------------------
 % CONFIGURATIONS
 %----------------------------------------------------------------------------------------
@@ -939,9 +947,9 @@ I cannot see without my reading \underline{\ \ \ \ \ \ \ \ }
 \end{eqnarray}
 \begin{eqnarray}
 c_{\textrm{KN}}(\cdot) = \left\{\begin{array}{ll}
 \textrm{count}(\cdot) & \textrm{for\ highest\ order} \\
 \textrm{catcount}(\cdot) & \textrm{for\ lower\ order}
 \end{array}\right.
 \label{eq:2-41}
 \end{eqnarray}
 \noindent 其中catcount$(\cdot)$表示的是基于某个单个词作为第$n$个词的$n$-gram的种类数目。
...
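Note: in eq. (2-41) above, catcount(·) is the Kneser-Ney continuation count, the number of distinct n-gram types whose final word is the word in question. A minimal C++ illustration of the quantity (toy bigram data; names and layout are illustrative, not from the book's code):

#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

// catcount(w): number of distinct bigram types (u, w) that end in w,
// i.e. the continuation count Kneser-Ney substitutes for the raw count
// in lower-order models.
int catcount(const std::vector<std::pair<std::string, std::string>>& bigrams,
             const std::string& w) {
    std::set<std::string> contexts;  // distinct left contexts of w
    for (const auto& b : bigrams)
        if (b.second == w) contexts.insert(b.first);
    return static_cast<int>(contexts.size());
}

int main() {
    // "York" has a high raw count but only one distinct context,
    // so its continuation count stays small
    std::vector<std::pair<std::string, std::string>> bigrams = {
        {"New", "York"}, {"New", "York"}, {"New", "York"},
        {"the", "house"}, {"a", "house"}, {"my", "house"}};
    std::cout << catcount(bigrams, "York") << "\n";   // 1
    std::cout << catcount(bigrams, "house") << "\n";  // 3
    return 0;
}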
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode
 %----------------------------------------------------------------------------------------
+% 机器翻译:统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+%----------------------------------------------------------------------------------------
 % CONFIGURATIONS
 %----------------------------------------------------------------------------------------
...
@@ -23,7 +23,7 @@
 \node [anchor=north] (eq1) at ([xshift=2em,yshift=-2em]exampleeq.south west) {$\sum\limits_{y_1=0}^{2} \sum\limits_{y_2=0}^{2} \alpha(1,y_1)\alpha(2,y_2)$};
 \node [anchor=west] (eq1part2) at ([xshift=-1em,yshift=-3em]eq1.west) {$=$};
 \node [anchor=west] (eq1part3) at ([xshift=-0.5em]eq1part2.east) {$\sum\limits_{y_1=0}^{2} \sum\limits_{y_2=0}^{2} \prod\limits_{x=1}^{2} $};
-\node [anchor=west,inner sep=2pt] (eq1part4) at ([xshift=-0.3em]eq1part3.east) {$\alpha(x,y_x)$};
+\node [anchor=west,inner sep=2pt] (eq1part4) at ([xshift=-0.3em]eq1part3.east) {};
 }
 {
@@ -31,7 +31,7 @@
 \node [anchor=west] (eq2part2) at ([yshift=-1.5em]eq2.west) {$(\alpha(2,0)+\alpha(2,1)+\alpha(2,2))$};
 \node [anchor=west] (eq2part3) at ([xshift=2.1in]eq1part2.east){$=$};
 \node [anchor=west] (eq2part4) at ([xshift=-0.5em]eq2part3.east){$\prod\limits_{x=1}^{2} \sum\limits_{y=0}^{2}$};
-\node [anchor=west,inner sep=2pt] (eq2part5) at ([xshift=-0.3em]eq2part4.east){$\alpha(x,y)$};
+\node [anchor=west,inner sep=2pt] (eq2part5) at ([xshift=-0.3em]eq2part4.east){};
 }
 }
@@ -64,8 +64,8 @@
 \node [anchor=west,inner sep=2pt,fill=blue!20] (feqpart5) at ([xshift=-0.3em]feqpart4.east) {$f(s_j|t_i)$};
 }
-\draw [->,thick] (eq1part4.south) .. controls +(south:2.5em) and +(north:2.5em) .. (feqpart2.north);
+\draw [->,thick] ([xshift=1.4em]eq1part4.south) .. controls +(south:2.5em) and +(north:2.5em) .. (feqpart2.north);
-\draw [->,thick] (eq2part5.south) .. controls +(south:1.5em) and +(north:1.5em) .. (feqpart5.north);
+\draw [->,thick] ([xshift=1.2em]eq2part5.south) .. controls +(south:1.5em) and +(north:1.5em) .. (feqpart5.north);
+\node [anchor=west,inner sep=2pt,fill=blue!20] (eq1part4) at ([xshift=-0.3em]eq1part3.east) {\footnotesize{$\alpha(x,y_x)$}};
+\node [anchor=west,inner sep=2pt,fill=blue!20] (eq2part5) at ([xshift=-0.3em]eq2part4.east){\footnotesize{$\alpha(x,y)$}};
...
@@ -12,16 +12,16 @@
-\node [anchor=west,inner sep=2pt,minimum height=2.5em] (eq1) at (0,0) {${\textrm{P}(\tau,\pi|\mathbf{t}) = \prod_{i=1}^{l}{\textrm{P}(\varphi_i|\varphi_{1}^{i-1},\mathbf{t})} \ \ \times \ \ {\textrm{P}(\varphi_0|\varphi_{1}^{l},\mathbf{t})}\ \ \times}$};
+\node [anchor=west,inner sep=2pt,minimum height=2.5em] (eq1) at (0,0) {${\textrm{P}(\tau,\pi|\mathbf{t}) = \prod_{i=1}^{l}\hspace{6.0em} \times \ \hspace{5.5em}\times}$};
-\node [anchor=north west,inner sep=2pt,minimum height=2.5em] (eq2) at ([xshift=-16.06em,yshift=0.0em]eq1.south east) {${\prod_{i=0}^l{\prod_{k=1}^{\varphi_i}{\textrm{P}(\tau_{ik}|\tau_{i1}^{k-1},\tau_{1}^{i-1},\varphi_{0}^{l},\mathbf{t} )}} \ \ \times}$};
+\node [anchor=north west,inner sep=2pt,minimum height=2.5em] (eq2) at ([xshift=-16.06em,yshift=0.0em]eq1.south east) {${\prod_{i=0}^l{\prod_{k=1}^{\varphi_i}\hspace{9.6em}} \ \ \times}$};
-\node [anchor=north west,inner sep=2pt,minimum height=2.5em] (eq3) at ([xshift=-15.9em,yshift=0.0em]eq2.south east) {${\prod_{i=1}^l{\prod_{k=1}^{\varphi_i}{\textrm{P}(\pi_{ik}|\pi_{i1}^{k-1},\pi_{1}^{i-1},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t} )}} \ \ \times}$};
+\node [anchor=north west,inner sep=2pt,minimum height=2.5em] (eq3) at ([xshift=-16.05em,yshift=0.0em]eq2.south east) {${\prod_{i=1}^l{\prod_{k=1}^{\varphi_i}}\hspace{11.5em} \times}$};
-\node [anchor=north west,inner sep=2pt,minimum height=2.5em] (eq4) at ([xshift=-17.40em,yshift=0.0em]eq3.south east) {{${\prod_{k=1}^{\varphi_0}{\textrm{P}(\pi_{0k}|\pi_{01}^{k-1},\pi_{1}^{l},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t} )}}$}};
+\node [anchor=north west,inner sep=2pt,minimum height=2.5em] (eq4) at ([xshift=-17.44em,yshift=0.0em]eq3.south east) {{${\prod_{k=1}^{\varphi_0}}$}};
-\node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=red!30] (part1) at ([xshift=-13.8em,yshift=0.0em]eq1.east) {{${\textrm{P}(\varphi_i|\varphi_{1}^{i-1},\mathbf{t})}$}};
+\node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=red!30] (part1) at ([xshift=-13.4em,yshift=0.0em]eq1.east) {{${\textrm{P}(\varphi_i|\varphi_{1}^{i-1},\mathbf{t})}$}};
-\node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=blue!30] (part2) at ([xshift=-6.3em,yshift=0.0em]eq1.east) {{${\textrm{P}(\varphi_0|\varphi_{1}^{l},\mathbf{t})}$}};
+\node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=blue!30] (part2) at ([xshift=-6.4em,yshift=0.0em]eq1.east) {{${\textrm{P}(\varphi_0|\varphi_{1}^{l},\mathbf{t})}$}};
 \node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=green!30] (part3) at ([xshift=-11em,yshift=0.0em]eq2.east) {{${\textrm{P}(\tau_{ik}|\tau_{i1}^{k-1},\tau_{1}^{i-1},\varphi_{0}^{l},\mathbf{t} )}$}};
-\node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=yellow!30] (part4) at ([xshift=-12.53em,yshift=0.0em]eq3.east) {{${\textrm{P}(\pi_{ik}|\pi_{i1}^{k-1},\pi_{1}^{i-1},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t} )}$}};
+\node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=yellow!30] (part4) at ([xshift=-12.5em,yshift=0.0em]eq3.east) {{${\textrm{P}(\pi_{ik}|\pi_{i1}^{k-1},\pi_{1}^{i-1},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t} )}$}};
-\node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=gray!30] (part5) at ([xshift=-10.4em,yshift=0.0em]eq4.east) {{${\textrm{P}(\pi_{0k}|\pi_{01}^{k-1},\pi_{1}^{l},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t} )}$}};
+\node [anchor=west,inner sep=2pt,minimum height=2.0em,fill=gray!30] (part5) at ([xshift=0.0em,yshift=0.0em]eq4.east) {{${\textrm{P}(\pi_{0k}|\pi_{01}^{k-1},\pi_{1}^{l},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t} )}$}};
 \end{tikzpicture}
@@ -29,3 +29,34 @@
...
@@ -16,7 +16,7 @@
 \node [chartnode2,anchor=north west] (cell41) at ([yshift=0.04em]cell31.south west) {};
 \node [chartnode2,anchor=west] (cell42) at ([xshift=-0.04em]cell41.east) {};
 \node [chartnode2,anchor=west] (cell43) at ([xshift=-0.04em]cell42.east) {};
-\node [chartnode2,anchor=west] (cell44) at ([xshift=-0.02em]cell43.east) {};
+\node [chartnode2,anchor=west] (cell44) at ([xshift=-0.04em]cell43.east) {};
 \node [anchor=east] (s1) at (cell11.west) {\scriptsize{}};
 \node [anchor=east] (s2) at (cell21.west) {\scriptsize{喜欢}};
...
@@ -127,11 +127,6 @@
 }
 {
-\node [anchor=center,selectnode,fill=red!20] (c2) at (alig11.center) {\footnotesize{5.1}};
-\node [anchor=center,selectnode,fill=red!20] (c3) at (alig2.center) {\footnotesize{5.5}};
-}
-{
 \node [anchor=center,selectnode] (c2) at (alig11.center) {\footnotesize{5.1}};
 \node [anchor=center,selectnode] (c3) at (alig2.center) {\footnotesize{5.5}};
 \node [anchor=center,selectnode,fill=red!20] (c5) at (alig21.center) {\footnotesize{8.5}};
@@ -183,11 +178,6 @@
 }
 {
-\node [anchor=center,selectnode,fill=red!20] (c2) at (alig11.center) {\footnotesize{5.1}};
-\node [anchor=center,selectnode,fill=red!20] (c3) at (alig2.center) {\footnotesize{5.5}};
-}
-{
 \node [anchor=center,selectnode] (c2) at (alig11.center) {\footnotesize{5.1}};
 \node [anchor=center,selectnode] (c3) at (alig2.center) {\footnotesize{5.5}};
 \node [anchor=center,selectnode,fill=red!20] (c5) at (alig21.center) {\footnotesize{8.5}};
@@ -210,15 +200,4 @@
 \end{tikzpicture}
@@ -45,7 +45,7 @@
 \begin{pgfonlayer}{background}
 \node [draw,dashed,rectangle,inner sep=1em,thick,red,rounded corners=5pt] (box) [fit = (bsn0) (bsn1) (bsn2) (bsn3) (bsn4) (bsn5)] {};
-\node [anchor=north west] (boxlabel) at ([xshift=2em,yshift=-1em]box.north east) {隐含结构};
+\node [anchor=north west] (boxlabel) at ([xshift=2em,yshift=-2em]box.north east) {隐含结构};
 \end{pgfonlayer}
 \end{scope}
...
@@ -42,13 +42,13 @@
 \begin{scope}[xshift=1.5in,yshift=1.6in]
 {
 \node[anchor=center, minimum size=10pt,draw] (cell1and1) at (0,0) {};
-\node[anchor=west, minimum size=10pt,draw] (cell2and1) at ([xshift=-0.04em]cell1and1.east) {};
+\node[anchor=west, minimum size=10pt,draw] (cell2and1) at ([xshift=-0.02em]cell1and1.east) {};
-\node[anchor=west, minimum size=10pt,draw] (cell3and1) at ([xshift=-0.04em]cell2and1.east) {};
+\node[anchor=west, minimum size=10pt,draw] (cell3and1) at ([xshift=-0.02em]cell2and1.east) {};
 \node[anchor=west, minimum size=10pt] (cell4and1) at ([xshift=0]cell3and1.east) {\scriptsize{$\dots$}};
 \node[anchor=west, minimum size=10pt,draw] (cellnand1) at ([xshift=0]cell4and1.east) {};
 \node[anchor=south, minimum size=10pt,draw] (cell1and2) at ([yshift=-0.04em]cell1and1.north) {};
-\node[anchor=west, minimum size=10pt,draw] (cell2and2) at ([xshift=-0.04em]cell1and2.east) {};
+\node[anchor=west, minimum size=10pt,draw] (cell2and2) at ([xshift=-0.02em]cell1and2.east) {};
 \node[anchor=south, minimum size=10pt,draw] (cell1and3) at ([yshift=-0.04em]cell1and2.north) {};
 \node[anchor=south, minimum size=10pt] (cell1and4) at ([yshift=0]cell1and3.north) {\scriptsize{$\vdots$}};
...
@@ -11,33 +11,60 @@
 \node[anchor=west] (s4) at ([xshift=2em]s3.east) {一个};
 \node[anchor=west] (s5) at ([xshift=1.6em]s4.east) {苹果};
-\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=3em] (t11) at ([yshift=-0.5em]s1.south) {table};
-\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=3em] (t12) at ([yshift=-0.2em]t11.south) {desk};
-\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=3em] (t21) at ([yshift=-0.5em]s2.south) {on};
-\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=3em] (t22) at ([yshift=-0.2em]t21.south) {up};
-\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=3.4em] (t31) at ([yshift=-0.5em]s3.south) {have};
-\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=3.4em] (t32) at ([yshift=-0.2em]t31.south) {there is};
-\node [anchor=north,inner sep=2pt,fill=orange!20,minimum height=1.5em,minimum width=3em] (t41) at ([yshift=-0.5em]s4.south) {one};
-\node [anchor=north,inner sep=2pt,fill=orange!20,minimum height=1.5em,minimum width=3em] (t42) at ([yshift=-0.2em]t41.south) {an};
-\node [anchor=north,inner sep=2pt,fill=purple!20,minimum height=1.5em,minimum width=3em] (t51) at ([yshift=-0.5em]s5.south) {apple};
-\node [anchor=north,inner sep=2pt,fill=purple!20,minimum height=1.5em,minimum width=3em] (t52) at ([yshift=-0.2em]t51.south) {apples};
-\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=7.2em] (t13) at ([yshift=-3.7em]t12.south west) {on table};
-\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=7.2em] (t14) at ([yshift=-0.2em]t13.south west) {on the table};
-\node [anchor=north west,inner sep=2pt,fill=orange!20,minimum height=1.5em,minimum width=7.35em] (t43) at ([yshift=-0.2em]t42.south west) {one apple};
-\node [anchor=north west,inner sep=2pt,fill=orange!20,minimum height=1.5em,minimum width=7.35em] (t44) at ([yshift=-0.2em]t43.south west) {an apple};
-\node [anchor=north west,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=7.25em] (t23) at ([yshift=-0.2em]t22.south west) {upon there};
-\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=11.5em] (t15) at ([yshift=-1.95em]t12.south west) {upon the table};
-\node [anchor=north west,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=11.8em] (t33) at ([yshift=-3.7em]t32.south west) {there is an apple};
-\node [anchor=north west,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=11.8em] (t34) at ([yshift=-0.2em]t33.south west) {have an apple...};
+{\tiny
+\node[anchor=south,blue] (fn1) at ([xshift=-2.7em,yshift=-0.5em]s1.south) {0};
+\node[anchor=south,blue] (fn1) at ([xshift=3.5em,yshift=-0.5em]s1.south) {1};
+\node[anchor=south,blue] (fn1) at ([xshift=3.5em,yshift=-0.5em]s2.south) {2};
+\node[anchor=south,blue] (fn1) at ([xshift=3.5em,yshift=-0.5em]s3.south) {3};
+\node[anchor=south,blue] (fn1) at ([xshift=3.5em,yshift=-0.5em]s4.south) {4};
+\node[anchor=south,blue] (fn1) at ([xshift=2.5em,yshift=-0.5em]s5.south) {5};
+}
+\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (t11) at ([yshift=-0.5em]s1.south) {table};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl11) at (t11.north west) {\tiny{{\color{white} \textbf{0-1}}}};
+\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (t12) at ([yshift=-0.2em]t11.south) {desk};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl12) at (t12.north west) {\tiny{{\color{white} \textbf{0-1}}}};
+\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=2em,minimum width=3em] (t21) at ([yshift=-0.5em]s2.south) {on};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl21) at (t21.north west) {\tiny{{\color{white} \textbf{1-2}}}};
+\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=2em,minimum width=3em] (t22) at ([yshift=-0.2em]t21.south) {up};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl22) at (t22.north west) {\tiny{{\color{white} \textbf{1-2}}}};
+\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=2em,minimum width=3.4em] (t31) at ([yshift=-0.5em]s3.south) {have};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl31) at (t31.north west) {\tiny{{\color{white} \textbf{2-3}}}};
+\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=2em,minimum width=3.4em] (t32) at ([yshift=-0.2em]t31.south) {there is};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl32) at (t32.north west) {\tiny{{\color{white} \textbf{2-3}}}};
+\node [anchor=north,inner sep=2pt,fill=orange!20,minimum height=2em,minimum width=3em] (t41) at ([yshift=-0.5em]s4.south) {one};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl41) at (t41.north west) {\tiny{{\color{white} \textbf{3-4}}}};
+\node [anchor=north,inner sep=2pt,fill=orange!20,minimum height=2em,minimum width=3em] (t42) at ([yshift=-0.2em]t41.south) {an};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl42) at (t42.north west) {\tiny{{\color{white} \textbf{3-4}}}};
+\node [anchor=north,inner sep=2pt,fill=purple!20,minimum height=2em,minimum width=3em] (t51) at ([yshift=-0.5em]s5.south) {apple};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl51) at (t51.north west) {\tiny{{\color{white} \textbf{4-5}}}};
+\node [anchor=north,inner sep=2pt,fill=purple!20,minimum height=2em,minimum width=3em] (t52) at ([yshift=-0.2em]t51.south) {apples};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl52) at (t52.north west) {\tiny{{\color{white} \textbf{4-5}}}};
+\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=7.2em] (t13) at ([yshift=-4.7em]t12.south west) {on table};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl13) at (t13.north west) {\tiny{{\color{white} \textbf{0-2}}}};
+\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=7.2em] (t14) at ([yshift=-0.2em]t13.south west) {on the table};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl14) at (t14.north west) {\tiny{{\color{white} \textbf{0-2}}}};
+\node [anchor=north west,inner sep=2pt,fill=orange!20,minimum height=2em,minimum width=7.35em] (t43) at ([yshift=-0.2em]t42.south west) {one apple};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl43) at (t43.north west) {\tiny{{\color{white} \textbf{3-5}}}};
+\node [anchor=north west,inner sep=2pt,fill=orange!20,minimum height=2em,minimum width=7.35em] (t44) at ([yshift=-0.2em]t43.south west) {an apple};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl44) at (t44.north west) {\tiny{{\color{white} \textbf{3-5}}}};
+\node [anchor=north west,inner sep=2pt,fill=green!20,minimum height=2em,minimum width=7.25em] (t23) at ([yshift=-0.2em]t22.south west) {upon there};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl23) at (t23.north west) {\tiny{{\color{white} \textbf{1-3}}}};
+\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=11.5em] (t15) at ([yshift=-2.45em]t12.south west) {upon the table};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl15) at (t15.north west) {\tiny{{\color{white} \textbf{0-3}}}};
+\node [anchor=north west,inner sep=2pt,fill=blue!20,minimum height=2em,minimum width=11.8em] (t33) at ([yshift=-4.7em]t32.south west) {there is an apple};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl33) at (t33.north west) {\tiny{{\color{white} \textbf{2-5}}}};
+\node [anchor=north west,inner sep=2pt,fill=blue!20,minimum height=2em,minimum width=11.8em] (t34) at ([yshift=-0.2em]t33.south west) {have an apple...};
+\node [anchor=north west,inner sep=1pt,fill=black,minimum height=0.5em] (tl34) at (t34.north west) {\tiny{{\color{white} \textbf{2-5}}}};
 \end{scope}
 \end{tikzpicture}
...
@@ -15,7 +15,7 @@
 [.\node(n8){NN}; \node(cw3){回答}; ]
 ]
 ]
-[.\node(n9){VP};
+[.\node[minimum height=0.7em,minimum width=1.8em](n9){};
 [.\node(n10){VV}; \node(cw4){表示}; ]
 [.\node(n11){NN}; \node(cw5){满意}; ]
 ]
@@ -24,7 +24,7 @@
 \node[anchor=north,minimum size=18pt,align=center] (tw1) at ([yshift=-6.0em]cw1.south){he\\\footnotesize{1}};
 \node[anchor=west,minimum size=18pt,align=center] (tw2) at ([yshift=-0.1em,xshift=1.1em]tw1.east){was\\\footnotesize{2}};
-\node[anchor=west,minimum size=18pt,align=center] (tw3) at ([yshift=0.1em,xshift=1.1em]tw2.east){satisfied\\\footnotesize{3}};
+\node[anchor=west,minimum size=18pt,align=center,minimum height=2.6em,minimum width=4em] (tw3) at ([yshift=0.1em,xshift=1.1em]tw2.east){};
 \node[anchor=west,minimum size=18pt,align=center] (tw4) at ([xshift=1.1em]tw3.east){with\\\footnotesize{4}};
 \node[anchor=west,minimum size=18pt,align=center] (tw5) at ([xshift=1.1em]tw4.east){the\\\footnotesize{5}};
 \node[anchor=west,minimum size=18pt,align=center] (tw6) at ([yshift=-0.1em,xshift=1.1em]tw5.east){answer\\\footnotesize{6}};
@@ -66,7 +66,7 @@
 \node [anchor=west] (land2label) at (land2.east) {\footnotesize{不可信}};
 {
-\node [draw,thick,red,fill=red!20] [fit = (n9)] (var1) {{\color{black} VP}};
+\node [draw,thick,red,fill=red!20,anchor=north,minimum height=2.2em,minimum width=2.5em] (var1) at ([yshift=1.2em]n9.north) {{\color{black} VP}};
 \node [draw,thick,red,fill=red!20] [fit = (tw3)] (var1v2) {{\color{black} \large{VP}}};
 \node [anchor=west] (var1label) at ([yshift=0.5em]var1.east) {\footnotesize{\red{变量}}};
 \node [anchor=south] (var1v2label) at ([xshift=-2em]var1v2.north) {\footnotesize{\red{变量}}};
...
@@ -75,7 +75,7 @@
 }
 {
-\node [anchor=south west,draw=red,thick,fill=red!20,inner sep=0pt,minimum height = 2em, minimum width=6em] (ps) at ([xshift=0em,yshift=0em]cfrag2.south west) {};
+\node [anchor=south west,draw=red,thick,fill=red!20,inner sep=0pt,minimum height = 2em, minimum width=5.4em] (ps) at ([xshift=0em,yshift=0em]cfrag2.south west) {};
 \node [anchor=south west,draw=red,thick,fill=red!20,inner sep=0pt] (pt) [fit = (tw14) (tw15) (tw16)] {};
 }
 \end{pgfonlayer}
...
@@ -43,7 +43,7 @@
 \end{scope}
 \node[scale=0.9,anchor=north,minimum size=18pt] (tw11) at ([xshift=-0.3em,yshift=-1.2em]cfrag1.south){he};
-\node[scale=0.9,anchor=west,minimum size=18pt] (tw12) at ([yshift=-0.1em,xshift=0.5em]tw11.east){was};
+\node[scale=0.9,anchor=west,minimum size=18pt] (tw12) at ([yshift=-0.1em,xshift=0.5em]tw11.east){\red{was}};
 \node[scale=0.9,anchor=west,minimum size=18pt] (tw13) at ([yshift=0.1em,xshift=0.5em]tw12.east){satisfied};
 \node[scale=0.9,anchor=west,minimum size=18pt] (tw14) at ([xshift=0.5em]tw13.east){with};
 \node[scale=0.9,anchor=west,minimum size=18pt] (tw15) at ([xshift=0.5em]tw14.east){the};
@@ -77,37 +77,37 @@
 {
 \node [fill=blue,circle,inner sep=2pt] (rlabel7) at (cfrag7.south west) {{\color{white} \tiny{7}}};
 }
+\begin{pgfonlayer}{background}
 {
-\node [fill=white,draw=red,thick] (tw12label) at (tw12) {\red{was}};
+\node [fill=white,draw=red,thick,minimum height=1.2em,minimum width=2.2em] (tw12label) at (tw12) {};
 }
 {
-\draw [->,red] ([xshift=0.2em]tw12label.north west) .. controls +(north:0.4) and +(south:0.4) .. ([xshift=0em]cfrag1.south);
+\node [fill=green!20,inner sep=0pt] (cfrag1back) [fit = (cfrag1)] {};
 }
 {
-\draw [->,red] ([xshift=0.8em]tw12label.north west) -- ([xshift=0.8em,yshift=18.4em]tw12label.north west);
+\node [fill=green!20,inner sep=0pt] (cfrag1back) [fit = (cfrag4)] {};
 }
 {
-\draw [->,red] ([xshift=0.2em]tw12label.north) .. controls +(north:7em) and +(south:11em) .. ([xshift=0em,yshift=0em]cfrag6.south);
+\node [fill=green!20,inner sep=0pt] (cfrag1back) [fit = (cfrag6)] {};
 }
 {
-\draw [->,red] ([xshift=0.6em]tw12label.north) -- ([xshift=-2em]cfrag4.south);
+\node [fill=green!20,inner sep=0pt] (cfrag1back) [fit = (cfrag7)] {};
 }
+\end{pgfonlayer}
-\begin{pgfonlayer}{background}
 {
-\node [fill=green!20,inner sep=0pt] (cfrag1back) [fit = (cfrag1)] {};
+\draw [->,red] ([xshift=0.2em]tw12label.north west) .. controls +(north:0.4) and +(south:0.4) .. ([xshift=0em]cfrag1.south);
 }
 {
-\node [fill=green!20,inner sep=0pt] (cfrag1back) [fit = (cfrag4)] {};
+\draw [->,red] ([xshift=0.8em]tw12label.north west) -- ([xshift=0.8em,yshift=18.4em]tw12label.north west);
 }
 {
-\node [fill=green!20,inner sep=0pt] (cfrag1back) [fit = (cfrag6)] {};
+\draw [->,red] ([xshift=0.2em]tw12label.north) .. controls +(north:7em) and +(south:11em) .. ([xshift=0em,yshift=0em]cfrag6.south);
 }
 {
-\node [fill=green!20,inner sep=0pt] (cfrag1back) [fit = (cfrag7)] {};
+\draw [->,red] ([xshift=0.6em]tw12label.north) -- ([xshift=-2em]cfrag4.south);
 }
-\end{pgfonlayer}
 }
 \end{scope}
...
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode
 %----------------------------------------------------------------------------------------
+% 机器翻译:统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+%----------------------------------------------------------------------------------------
 % CONFIGURATIONS configurations
 %----------------------------------------------------------------------------------------
 \renewcommand\figurename{图}%将figure改为图
@@ -780,7 +788,7 @@ dr = start_i-end_{i-1}-1
 \subsubsection{翻译候选匹配}
 \parinterval 在解码时,首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语,每个可能的译文也被称作{\small\bfnew{翻译候选}}\index{翻译候选}(Translation Candidate)\index{Translation Candidate}。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语,之后在短语表中找到相应的翻译即可。比如,图\ref{fig:4-27}展示了句子``桌子\ 上\ 有\ 一个\ 苹果''的翻译候选匹配结果。可以看到,不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的跨度中。比如,``upon the table''是短语``桌子 上 有''的翻译候选,即对应源语言跨度[0,3]。\\ \\ \\
 %----------------------------------------------
 \begin{figure}[htp]
...
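Note: the matching procedure described in the hunk above is a plain span enumeration: walk over every phrase of the source sentence and look it up in the phrase table, keeping the candidates under the span they cover. A minimal C++ sketch (the toy phraseTable and all identifiers are illustrative, not the book's implementation):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    // toy phrase table: source phrase -> translation candidates
    std::unordered_map<std::string, std::vector<std::string>> phraseTable = {
        {"桌子 上", {"on table", "on the table"}},
        {"桌子 上 有", {"upon the table"}},
        {"一个 苹果", {"one apple", "an apple"}}};

    std::vector<std::string> src = {"桌子", "上", "有", "一个", "苹果"};

    // enumerate every source span [i, j) and look it up in the table;
    // matched candidates are kept under the span they cover
    for (size_t i = 0; i < src.size(); ++i) {
        std::string span;
        for (size_t j = i; j < src.size(); ++j) {
            if (!span.empty()) span += " ";
            span += src[j];
            auto it = phraseTable.find(span);
            if (it == phraseTable.end()) continue;
            for (const auto& t : it->second)
                std::cout << "[" << i << "," << j + 1 << "] " << span
                          << " -> " << t << "\n";
        }
    }
    return 0;
}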
Book/Chapter5/Figures/deep-learning.jpg (image replaced: 28.6 KB -> 82.3 KB)
Book/Chapter5/Figures/feature-engineering.jpg (image replaced: 31.9 KB -> 100 KB)
-%%%------------------------------------------------------------------------------------------------------------
 \begin{tikzpicture}
 \begin{scope}
 \node [anchor=west] (w0) at (0,0) {\footnotesize{$w_{i-3}$}};
@@ -7,11 +6,13 @@
 \node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny(index)};
 \node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny(index)};
 \node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny(index)};
-\node [anchor=south,draw,inner sep=3pt] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{i-3} \textbf{C}$}};
-\node [anchor=south,draw,inner sep=3pt] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{i-2} \textbf{C}$}};
-\node [anchor=south,draw,inner sep=3pt] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{i-1} \textbf{C}$}};
-\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$\textbf{h}_0=\textrm{Tanh}([\textbf{e}_0,\textbf{e}_1,\textbf{e}_2] \textbf{H} + \textbf{d})$}};
-\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$\textbf{y}=\textrm{Softmax}(\textbf{h}_0 \textbf{U})$}};
+\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{i-3} \textbf{C}$}};
+\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{i-2} \textbf{C}$}};
+\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{i-1} \textbf{C}$}};
+\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h0) at ([yshift=1.5em]e1.north) {\tiny{$\textbf{h}_0=\textrm{Tanh}([\textbf{e}_0,\textbf{e}_1,\textbf{e}_2] \textbf{H} + \textbf{d})$}};
+\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h1) at ([yshift=1.5em]h0.north) {\tiny{$\textbf{y}=\textrm{Softmax}(\textbf{h}_0 \textbf{U})$}};
 \node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\footnotesize{$\textrm{P}(w_i|w_{i-3}w_{i-2}w_{i-1})$}};
 \draw [->,line width=1pt] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
@@ -37,17 +38,7 @@
 }
 \end{pgfonlayer}
-{
-\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{i-3} \textbf{C}$}};
-\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{i-2} \textbf{C}$}};
-\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{i-1} \textbf{C}$}};
-}
-{
-\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h0) at ([yshift=1.5em]e1.north) {\tiny{$\textbf{h}_0=\textrm{Tanh}([\textbf{e}_0,\textbf{e}_1,\textbf{e}_2] \textbf{H} + \textbf{d})$}};
-\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h1) at ([yshift=1.5em]h0.north) {\tiny{$\textbf{y}=\textrm{Softmax}(\textbf{h}_0 \textbf{U})$}};
-}
-\end{scope}
-\end{tikzpicture}
-%%%------------------------------------------------------------------------------------------------------------
 \end{scope}
 \end{tikzpicture}
\ No newline at end of file
@@ -8,7 +8,7 @@
 \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
 \addtocounter{mycount1}{1};
 }
-\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s}$};
+\node [anchor=south] (varlabel) at (0,0.6) {$\mathbf{s}$};
 \node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(a)}};
 \end{scope}
@@ -20,7 +20,7 @@
 \node [fill=green!20,inner sep=0pt,minimum height=0.48cm,minimum width=0.48cm] at (\x,\y) {$1$};
 \addtocounter{mycount1}{1};
 }
-\node [anchor=south] (varlabel) at (0,0.1) {$\textbf{b}$};
+\node [anchor=south] (varlabel) at (0,0.1) {$\mathbf{b}$};
 \node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(b)}};
 \end{scope}
@@ -34,7 +34,7 @@
 \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
 \addtocounter{mycount1}{1};
 }
-\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s}$};
+\node [anchor=south] (varlabel) at (0,0.6) {$\mathbf{s}$};
 \end{scope}
 \begin{scope}[yshift=-1in,xshift=1.5in]
 \setcounter{mycount1}{1}
@@ -49,8 +49,8 @@
 \node [fill=purple!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$1$};
 \addtocounter{mycount1}{1};
 }
-\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\textbf{+}$}};
+\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\mathbf{+}$}};
-\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{b}$};
+\node [anchor=south] (varlabel) at (0,0.6) {$\mathbf{b}$};
 \node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(c)}};
 \end{scope}
 \begin{scope}[yshift=-1in,xshift=3in]
@@ -61,8 +61,8 @@
 \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
 \addtocounter{mycount1}{1};
 }
-\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\textbf{=}$}};
+\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\mathbf{=}$}};
-\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s+b}$};
+\node [anchor=south] (varlabel) at (0,0.6) {$\mathbf{s}+\mathbf{b}$};
 \end{scope}
...
@@ -59,8 +59,8 @@
 \draw [thick,->] (layer3.north) -- (y.south);
 \node [anchor=west,align=left] (xshape) at (x.east) {\tiny{shape: 3*4*5}};
 \node [anchor=west,align=left] (yshape) at (y.east) {\tiny{shape: 3*4*4}};
-\node [anchor=south west,align=left,inner sep=2pt] (l1shape) at (layer1.north) {\tiny{shape: 3*4*3}};
+\node [anchor=south west,align=left,inner sep=2pt] (l1shape) at ([xshift=0.3em]layer1.north) {\tiny{shape: 3*4*3}};
-\node [anchor=south west,align=left,inner sep=2pt] (l2shape) at (layer2.north) {\tiny{shape: 3*4*6}};
+\node [anchor=south west,align=left,inner sep=2pt] (l2shape) at ([xshift=0.3em]layer2.north) {\tiny{shape: 3*4*6}};
 \end{tikzpicture}
 \end{center}
 \end{tcolorbox}
...
 %%%------------------------------------------------------------------------------------------------------------
-\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
+\begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
 \begin{flushleft}
 {\scriptsize
 \begin{tabbing}
-\texttt{XTensor tensor;} \hspace{12em} \= // 声明张量tensor \\
+\texttt{XTensor tensor;} \hspace{14em} \= // 声明张量tensor \\
 \texttt{int sizes[6] = \{2,3,4,2,3,4\};} \> // 张量的形状为2*3*4*2*3*4 \\
 \texttt{InitTensor(\&tensor, 6, sizes, X\_FLOAT);} \> // 定义形状为sizes的6阶张量
 \end{tabbing}
@@ -12,11 +12,11 @@
 \end{tcolorbox}
 \hspace{0.1in} \scriptsize{(a) NiuTensor定义张量程序}
 \\
-\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
+\begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
 \begin{flushleft}
 {\scriptsize
 \begin{tabbing}
-\texttt{XTensor a, b, c;} \hspace{11.5em} \= // 声明张量tensor \\
+\texttt{XTensor a, b, c;} \hspace{13.5em} \= // 声明张量tensor \\
 \texttt{InitTensor1D(\&a, 10, X\_INT);} \> // 10维的整数型向量\\
 \texttt{InitTensor1D(\&b, 10);} \> // 10维的向量,缺省类型(浮点)\\
 \texttt{InitTensor4D(\&c, 10, 20, 30, 40);} \> // 10*20*30*40的4阶张量(浮点)
@@ -26,11 +26,11 @@
 \end{tcolorbox}
 \hspace{0.1in} \scriptsize{(b) 定义张量的简便方式程序}
 \\
-\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
+\begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
 \begin{flushleft}
 {\scriptsize
 \begin{tabbing}
-\texttt{XTensor tensorGPU;} \hspace{10.5em} \= // 声明张量tensor \\
+\texttt{XTensor tensorGPU;} \hspace{12.5em} \= // 声明张量tensor \\
 \texttt{InitTensor2D(\&tensorGPU, 10, 20,} $\backslash$ \> // 在编号为0的GPU上定义张量 \\
 \hspace{6.7em} \texttt{X\_FLOAT, 0);}
 \end{tabbing}
...
 %------------------------------------------------------------------------------------------------------------
-\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
+\begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
 \begin{flushleft}
 {\scriptsize
 \begin{tabbing}
-\texttt{\#include "source/tensor/XTensor.h"} \hspace{4em} \= // 引用XTensor定义的头文件 \\
+\texttt{\#include "source/tensor/XTensor.h"} \hspace{6em} \= // 引用XTensor定义的头文件 \\
 \texttt{using namespace nts;} \> // 引用nts命名空间 \\
 \ \\
...
 %%%------------------------------------------------------------------------------------------------------------
-\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
+\begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
 \begin{flushleft}
 {\scriptsize
 \begin{tabbing}
-\texttt{XTensor a, b, c, d, e;} \hspace{7em} \= // 声明张量tensor \\
+\texttt{XTensor a, b, c, d, e;} \hspace{9em} \= // 声明张量tensor \\
 \texttt{InitTensor3D(\&a, 2, 3, 4);} \> // a为2*3*4的3阶张量 \\
 \texttt{InitTensor3D(\&b, 2, 3, 4);} \> // b为2*3*4的3阶张量 \\
 \texttt{InitTensor3D(\&c, 2, 3, 4);} \> // c为2*3*4的3阶张量 \\
@@ -19,11 +19,11 @@
 \end{tcolorbox}
 \hspace{0.1in} \scriptsize{(a) 张量进行1阶运算}
 \\
-\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
+\begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
 \begin{flushleft}
 {\scriptsize
 \begin{tabbing}
-\texttt{XTensor a, b, c;} \hspace{10.0em} \= // 声明张量tensor \\
+\texttt{XTensor a, b, c;} \hspace{12.0em} \= // 声明张量tensor \\
 \texttt{InitTensor4D(\&a, 2, 2, 3, 4);} \> // a为2*2*3*4的4阶张量 \\
 \texttt{InitTensor2D(\&b, 4, 5);} \> // b为4*5的矩阵 \\
 \texttt{a.SetDataRand();} \> // 随机初始化a \\
...
-%%%------------------------------------------------------------------------------------------------------------
 \begin{tikzpicture}
 \begin{scope}
 \tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
@@ -28,10 +27,10 @@
 \draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);
 {
-\node [anchor=south,rnnnode] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{RNN Cell}};
+\node [anchor=south,rnnnode,fill=blue!30!white] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{Softmax($\cdot$)}};
-\node [anchor=south,rnnnode] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{RNN Cell}};
+\node [anchor=south,rnnnode,fill=blue!30!white] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{Softmax($\cdot$)}};
-\node [anchor=south,rnnnode] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{RNN Cell}};
+\node [anchor=south,rnnnode,fill=blue!30!white] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{Softmax($\cdot$)}};
-\node [anchor=south,rnnnode] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{RNN Cell}};
+\node [anchor=south,rnnnode,fill=blue!30!white] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{Softmax($\cdot$)}};
 \node [anchor=south,rnnnode,fill=blue!30!white] (node31) at ([yshift=1.5em]node21.north) {\scriptsize{Softmax($\cdot$)}};
 \node [anchor=south,rnnnode,fill=blue!30!white] (node32) at ([yshift=1.5em]node22.north) {\scriptsize{Softmax($\cdot$)}};
@@ -40,13 +39,6 @@
 }
 {
-\node [anchor=south,rnnnode,fill=blue!30!white] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{Softmax($\cdot$)}};
-\node [anchor=south,rnnnode,fill=blue!30!white] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{Softmax($\cdot$)}};
-\node [anchor=south,rnnnode,fill=blue!30!white] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{Softmax($\cdot$)}};
-\node [anchor=south,rnnnode,fill=blue!30!white] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{Softmax($\cdot$)}};
-}
-{
 \draw [->,thick] ([yshift=0.1em]node31.north)--([yshift=1em]node31.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
 \draw [->,thick] ([yshift=0.1em]node32.north)--([yshift=1em]node32.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
 \draw [->,thick] ([yshift=0.1em]node33.north)--([yshift=1em]node33.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
@@ -64,7 +56,7 @@
 \draw [->,thick] ([xshift=0.1em]node24.east)--([xshift=1em]node24.east);
 }
-{
 \draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
 \draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
 \draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
@@ -75,9 +67,7 @@
 \draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
 \draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
 \draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);
-}
-\end{scope}
-\end{tikzpicture}
-%%%------------------------------------------------------------------------------------------------------------
 \end{scope}
 \end{tikzpicture}
\ No newline at end of file
@@ -36,8 +36,8 @@
 %% weight and bias
 {\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (b0) at ([yshift=2em,xshift=-0.5em]b.north) {\scriptsize{$b_1$}};}
 {\node [anchor=center,rotate=-59,fill=white,inner sep=1pt] (w2) at ([yshift=1em,xshift=-1.0em]x1.north) {\scriptsize{$w_1$}};}
-{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=1.2em,xshift=0em]n10.north) {\scriptsize{$w'_1$}};}
+{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=1.2em,xshift=-0.2em]n10.north) {\scriptsize{$w'_1$}};}
-{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=1.2em,xshift=-0em]n11.north) {\scriptsize{$w'_2$}};}
+{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=1.2em,xshift=0.2em]n11.north) {\scriptsize{$w'_2$}};}
 {\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=3.4em,xshift=1.5em]b.north) {\scriptsize{$b_2$}};}
 {\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=2em,xshift=0.5em]x1.north) {\scriptsize{$w_2$}};}
...
 % !Mode:: "TeX:UTF-8"
 % !TEX encoding = UTF-8 Unicode
+%----------------------------------------------------------------------------------------
+% 机器翻译:统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
 \part{神经机器翻译}
 %----------------------------------------------------------------------------------------
 % CONFIGURATIONS
@@ -126,6 +134,7 @@
 %------------------------------------------------------------------------------
 \begin{figure}[htp]
 \centering
+\subfigcapskip=8pt
 \subfigure[基于特征工程的机器学习方法做图像分类]{
 \begin{minipage}{.9\textwidth}
...
@@ -393,7 +402,7 @@
 \subsubsection{线性映射}
-\parinterval {\small\sffamily\bfseries{线性映射}}\index{线性映射}( Linear Mapping)\index{Linear Mapping}或{\small\sffamily\bfseries{线性变换}}\index{线性变换}(Linear Transformation)\index{Linear Transformation}是从一个向量空间V到另一个向量空间W的映射函数$ f:v\rightarrow w$,且该映射函数保持加法运算和数量乘法运算,即对于空间V中任何两个向量$ \mathbf u $和$ \mathbf v $以及任何标量$ c $,有:
+\parinterval {\small\sffamily\bfseries{线性映射}}\index{线性映射}( Linear Mapping)\index{Linear Mapping}或{\small\sffamily\bfseries{线性变换}}\index{线性变换}(Linear Transformation)\index{Linear Transformation}是从一个向量空间V到另一个向量空间W的映射函数$ f:v\rightarrow w$,且该映射函数保持加法运算和数量乘法运算,即对于空间V中任何两个向量$ \mathbf u $ 和 $ \mathbf v $以及任何标量$ c $,有:
 \begin{eqnarray}
 f(\mathbf u+\mathbf v)&=&f(\mathbf u)+f(\mathbf v)\label{eq:5-9}\\
 f(c\mathbf v)&=&cf(\mathbf v)
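Note: the two conditions in the hunk above are satisfied, for instance, by multiplication with a fixed matrix $\mathbf A$, i.e. $f(\mathbf v)=\mathbf v\mathbf A$ (an illustrative example, not part of the diff):

% Illustration: for any fixed matrix A, the map f(v) = vA is linear,
% since both defining properties follow from matrix algebra:
\begin{eqnarray}
f(\mathbf u+\mathbf v)&=&(\mathbf u+\mathbf v)\mathbf A=\mathbf u\mathbf A+\mathbf v\mathbf A=f(\mathbf u)+f(\mathbf v) \nonumber \\
f(c\mathbf v)&=&(c\mathbf v)\mathbf A=c(\mathbf v\mathbf A)=cf(\mathbf v) \nonumber
\end{eqnarray}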
@@ -499,7 +508,7 @@ l_p(\mathbf x) & = & {\Vert{\mathbf x}\Vert}_p \nonumber \\
 \end{figure}
 %-------------------------------------------
-\parinterval 同样,人工神经元是人工神经网络的基本单元。在人们的想象中,人工神经元应该与生物神经元类似。但事实上,二者在形态上是有明显差别的。如图\ref{fig:5-4} 是一个典型的人工神经元,其本质是一个形似$ y=f(\mathbf x\cdot \mathbf w+b) $的函数。显而易见,一个神经元主要由$ \mathbf x $,$ \mathbf w $,$ b $,$ f $四个部分构成。其中$ \mathbf x $是一个形如$ (x_0,x_1,\dots,x_n) $的实数向量,在一个神经元中担任``输入''的角色。$ \mathbf w $是一个权重矩阵,其中的每一个元素都对应着一个输入和一个输出,代表着``某输入对某输出的贡献程度'',通常也被理解为神经元连接的{\small\sffamily\bfseries{权重}}\index{权重}(weight)\index{weight}。$ b $被称作偏置,是一个实数。$ f $被称作激活函数,其本质是一个非线性函数。可见,一个人工神经元的功能是将输入向量与权重矩阵右乘(做内积)后,加上偏置量,经过一个非线性激活函数得到一个标量结果。
+\parinterval 同样,人工神经元是人工神经网络的基本单元。在人们的想象中,人工神经元应该与生物神经元类似。但事实上,二者在形态上是有明显差别的。如图\ref{fig:5-4} 是一个典型的人工神经元,其本质是一个形似$ y=f(\mathbf x\cdot \mathbf w+b) $的函数。显而易见,一个神经元主要由$ \mathbf x $,$ \mathbf w $,$ b $,$ f $四个部分构成。其中$ \mathbf x $是一个形如$ (x_0,x_1,\dots,x_n) $ 的实数向量,在一个神经元中担任``输入''的角色。$ \mathbf w $是一个权重矩阵,其中的每一个元素都对应着一个输入和一个输出,代表着``某输入对某输出的贡献程度'',通常也被理解为神经元连接的{\small\sffamily\bfseries{权重}}\index{权重}(weight)\index{weight}。$ b $被称作偏置,是一个实数。$ f $被称作激活函数,其本质是一个非线性函数。可见,一个人工神经元的功能是将输入向量与权重矩阵右乘(做内积)后,加上偏置量,经过一个非线性激活函数得到一个标量结果。
 %----------------------------------------------
 \begin{figure}[htp]
...@@ -1249,7 +1258,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c ...@@ -1249,7 +1258,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c
\parinterval 下面用几个实例来说明搭建神经网络的过程。搭建神经网络的过程本质上就是定义前向计算的过程。首先构造一个单层神经网络。如图\ref{fig:5-39}(a)所示,简单地定义输入、权重和偏置后,定义激活函数为Sigmoid函数,输入$ \mathbf x $经过线性变换和激活函数,得到输出$ \mathbf y $。
\parinterval\ref{fig:5-39}(b)是一个构造三层神经网络的程序示例。在第一层中,$ \mathbf x $作为输入,$ \mathbf h1 $作为输出,其中$ \mathbf h1={\rm{Sigmoid}}(\mathbf x\cdot \mathbf w1+\mathbf b1) $。在第二层中,$ \mathbf h1 $作为输入,$ \mathbf h2 $作为输出,其中$ \mathbf h2={\rm{Tanh}}(\mathbf h1\cdot \mathbf w2) $。在第三层中,$ \mathbf h2 $作为输入,$ \mathbf y $作为输出,其中$ \mathbf y={\rm{ReLU}}(\mathbf h2\cdot \mathbf w3) $$ \mathbf y $也会作为整个神经网络的输出。 \parinterval\ref{fig:5-39}(b)是一个构造三层神经网络的程序示例。在第一层中,$ \mathbf x $作为输入,$ \mathbf h1 $作为输出,其中$ \mathbf h1={\rm{Sigmoid}}(\mathbf x\cdot \mathbf w1+\mathbf b1) $。在第二层中,$ \mathbf h1 $作为输入,$ \mathbf h2 $作为输出,其中$ \mathbf h2={\rm{Tanh}}(\mathbf h1\cdot \mathbf w2) $。在第三层中,$ \mathbf h2 $作为输入,$ \mathbf y $ 作为输出,其中$ \mathbf y={\rm{ReLU}}(\mathbf h2\cdot \mathbf w3) $$ \mathbf y $也会作为整个神经网络的输出。
%---------------------------------------------- %----------------------------------------------
\begin{figure}[htp] \begin{figure}[htp]
...@@ -1380,7 +1389,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c ...@@ -1380,7 +1389,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c
\parinterval 从优化的角度看,梯度下降是一种典型的 {\small\bfnew{基于梯度的方法}}\index{基于梯度的方法}(Gradient-based Method)\index{Gradient-based Method},属于基于一阶导数的方法。其他类似的方法还有牛顿法、共轭方向法、拟牛顿法等。在具体实现时,公式\ref{eq:5-29}可以有以下不同的形式。\\ \parinterval 从优化的角度看,梯度下降是一种典型的 {\small\bfnew{基于梯度的方法}}\index{基于梯度的方法}(Gradient-based Method)\index{Gradient-based Method},属于基于一阶导数的方法。其他类似的方法还有牛顿法、共轭方向法、拟牛顿法等。在具体实现时,公式\ref{eq:5-29}可以有以下不同的形式。\\
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1398,7 +1407,7 @@ J(\mathbf w)&=&\frac{1}{n}\sum_{i=1}^{n}{L(\mathbf x_i,\mathbf {\widetilde y}_i; ...@@ -1398,7 +1407,7 @@ J(\mathbf w)&=&\frac{1}{n}\sum_{i=1}^{n}{L(\mathbf x_i,\mathbf {\widetilde y}_i;
\parinterval 不过,这种方法的缺点也十分明显,因为要在全部训练数据上最小化损失,每一次参数更新都需要计算在所有样本上的损失。在使用海量数据进行训练的情况下,这种计算是非常消耗时间的。当训练数据规模很大时,很少使用这种方法。 \parinterval 不过,这种方法的缺点也十分明显,因为要在全部训练数据上最小化损失,每一次参数更新都需要计算在所有样本上的损失。在使用海量数据进行训练的情况下,这种计算是非常消耗时间的。当训练数据规模很大时,很少使用这种方法。
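\parinterval 作为示意,批量梯度下降的一次参数更新可以写成如下草图。这里假设存在一个能返回单个样本上梯度的函数grad\_fn,该接口纯属示例:

\begin{verbatim}
def batch_gd_step(w, data, grad_fn, alpha):
    # 每次更新都要在全部 n 个样本上累积梯度,数据量大时非常耗时
    n = len(data)
    grad = sum(grad_fn(x, y, w) for x, y in data) / n
    return w - alpha * grad
\end{verbatim}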
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1416,7 +1425,7 @@ J(\mathbf w)&=&L(\mathbf x_i,\mathbf {\widetilde y}_i;\mathbf w) ...@@ -1416,7 +1425,7 @@ J(\mathbf w)&=&L(\mathbf x_i,\mathbf {\widetilde y}_i;\mathbf w)
\parinterval 因为随机梯度下降算法每次优化的只是某一个样本上的损失,所以它的问题也非常明显:单个样本上的损失无法代表在全部样本上的损失,因此参数更新的效率低,方法收敛速度极慢。即使在目标函数为强凸函数的情况下,SGD仍旧无法做到线性收敛。 \parinterval 因为随机梯度下降算法每次优化的只是某一个样本上的损失,所以它的问题也非常明显:单个样本上的损失无法代表在全部样本上的损失,因此参数更新的效率低,方法收敛速度极慢。即使在目标函数为强凸函数的情况下,SGD仍旧无法做到线性收敛。
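\parinterval 与之对应,随机梯度下降每次更新只使用一个随机抽取的样本。下面的草图沿用上例中假设的grad\_fn接口:

\begin{verbatim}
import random

def sgd_step(w, data, grad_fn, alpha):
    x, y = random.choice(data)           # 随机抽取单个样本
    return w - alpha * grad_fn(x, y, w)  # 只在该样本的损失上做一次更新
\end{verbatim}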
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1440,7 +1449,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y ...@@ -1440,7 +1449,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y
\parinterval 梯度下降算法的一个核心是要得到目标函数相对于参数的梯度。下面将介绍三种常见的求梯度方法:数值微分、符号微分和自动微分,深度学习实现过程中多是采用自动微分方法计算梯度\cite{baydin2017automatic} \parinterval 梯度下降算法的一个核心是要得到目标函数相对于参数的梯度。下面将介绍三种常见的求梯度方法:数值微分、符号微分和自动微分,深度学习实现过程中多是采用自动微分方法计算梯度\cite{baydin2017automatic}
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1464,7 +1473,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y ...@@ -1464,7 +1473,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y
\parinterval 尽管数值微分不适用于大模型中的梯度求解,但是由于其非常简单,因此经常被用于检验其他梯度计算方法的正确性。比如在实现反向传播的时候(详见\ref{sec:5.4.6}节),可以检验求导是否正确(Gradient Check),这个过程就是利用数值微分实现的。\\ \\ \parinterval 尽管数值微分不适用于大模型中的梯度求解,但是由于其非常简单,因此经常被用于检验其他梯度计算方法的正确性。比如在实现反向传播的时候(详见\ref{sec:5.4.6}节),可以检验求导是否正确(Gradient Check),这个过程就是利用数值微分实现的。\\ \\
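\parinterval 下面给出一个用数值微分做梯度检验的示意实现。这里采用中心差分形式,loss\_fn接口与eps取值均为示例性假设:

\begin{verbatim}
import numpy as np

def numerical_grad(loss_fn, w, eps=1e-6):
    # 假设 w 是浮点型的 NumPy 数组
    grad = np.zeros_like(w)
    for i in range(w.size):
        w_pos, w_neg = w.copy(), w.copy()
        w_pos.flat[i] += eps
        w_neg.flat[i] -= eps
        # 中心差分:(L(w+eps) - L(w-eps)) / (2*eps)
        grad.flat[i] = (loss_fn(w_pos) - loss_fn(w_neg)) / (2 * eps)
    return grad

# 用法:检验解析梯度 analytic_grad 与数值梯度是否足够接近
# assert np.allclose(analytic_grad, numerical_grad(loss_fn, w), atol=1e-4)
\end{verbatim}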
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\noindent {\small\sffamily\bfseries{ 符号微分\index{符号微分}(Symbolic Differentiation)\index{Symbolic Differentiation}}} \noindent {\small\sffamily\bfseries{ 符号微分\index{符号微分}(Symbolic Differentiation)\index{Symbolic Differentiation}}}
...@@ -1499,7 +1508,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\ ...@@ -1499,7 +1508,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
%-------------------------------------------------------------------- %--------------------------------------------------------------------
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1570,7 +1579,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\ ...@@ -1570,7 +1579,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
%------------------------------------------- %-------------------------------------------
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1589,7 +1598,7 @@ w_{t+1}&=&w_t-\alpha v_t ...@@ -1589,7 +1598,7 @@ w_{t+1}&=&w_t-\alpha v_t
\parinterval 这里的``梯度''不再只是现在的损失函数的梯度,而是之前的梯度的加权和。在原始的梯度下降算法中,如果在某个参数状态下,梯度方向变化特别大,甚至与上一次参数更新中梯度方向成90度夹角,下一次参数更新中梯度方向可能又是一次90度的改变,这时参数优化路径将会呈``锯齿''状(如图\ref{fig:5-46}所示),优化效率极慢。而Momentum梯度下降算法不会让梯度发生90度的变化,而是让梯度慢慢发生改变:如果当前的梯度方向与之前的梯度方向相同,在原梯度方向上加速更新参数;如果当前的梯度方向与之前的梯度方向相反,并不会产生一个急转弯,而是尽量把优化路径平滑地进行改变。这样做的优点也非常明显,一方面杜绝了``锯齿''状优化路径的出现,另一方面将优化幅度变得更加平滑,不会导致频频跳过最优点。
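\parinterval Momentum的更新过程可以示意如下。这里采用常见的指数加权写法,系数beta的取值与具体形式均为假设,应以正文公式为准:

\begin{verbatim}
def momentum_step(w, v, grad, alpha, beta=0.9):
    v = beta * v + (1 - beta) * grad  # "梯度"变为之前梯度的加权和
    w = w - alpha * v                 # 对应 w_{t+1} = w_t - alpha * v_t
    return w, v
\end{verbatim}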
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1608,7 +1617,7 @@ w_{t+1}&=&w_t-\eta \frac{1}{\sqrt{z_t}}\cdot \frac{\partial L}{\partial w_t} ...@@ -1608,7 +1617,7 @@ w_{t+1}&=&w_t-\eta \frac{1}{\sqrt{z_t}}\cdot \frac{\partial L}{\partial w_t}
\parinterval 这里新出现了变量$ z $,它保存了以前的所有梯度值的平方和,在更新参数时,通过乘以$ \frac{1}{\sqrt{z_t}} $ ,就可以调整学习的尺度。这意味着,变动较大(被大幅度更新)的参数的学习率将变小。也就是说,可以按参数的元素进行学习率衰减,使变动大的参数的学习率逐渐减小。 \parinterval 这里新出现了变量$ z $,它保存了以前的所有梯度值的平方和,在更新参数时,通过乘以$ \frac{1}{\sqrt{z_t}} $ ,就可以调整学习的尺度。这意味着,变动较大(被大幅度更新)的参数的学习率将变小。也就是说,可以按参数的元素进行学习率衰减,使变动大的参数的学习率逐渐减小。
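\parinterval AdaGrad的更新过程可以示意如下。代码中额外加入的eps仅是为了数值稳定而做的常见假设,正文公式中并没有这一项:

\begin{verbatim}
import numpy as np

def adagrad_step(w, z, grad, eta, eps=1e-8):
    z = z + grad * grad                      # 累积所有历史梯度的平方和
    w = w - eta * grad / (np.sqrt(z) + eps)  # 变动大的参数,学习率逐渐减小
    return w, z
\end{verbatim}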
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1629,7 +1638,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}}\cdot \frac{\partial L}{\partial w ...@@ -1629,7 +1638,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}}\cdot \frac{\partial L}{\partial w
\parinterval RMSProp与AdaGrad相比,学习率的分母部分(即两种梯度下降算法迭代公式中的$ z $)的计算由累积方式变成了指数衰减移动平均。于是,每个参数的学习率并不是呈衰减趋势,而是既可以变小也可以变大,从而避免AdaGrad算法中学习率不断单调下降以至于过早衰减的缺点。 \parinterval RMSProp与AdaGrad相比,学习率的分母部分(即两种梯度下降算法迭代公式中的$ z $)的计算由累积方式变成了指数衰减移动平均。于是,每个参数的学习率并不是呈衰减趋势,而是既可以变小也可以变大,从而避免AdaGrad算法中学习率不断单调下降以至于过早衰减的缺点。
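\parinterval 对应地,RMSProp可以示意如下。其中衰减系数rho的取值为示例性假设:

\begin{verbatim}
import numpy as np

def rmsprop_step(w, z, grad, eta, rho=0.9, eps=1e-8):
    z = rho * z + (1 - rho) * grad * grad  # 指数衰减移动平均,而非简单累积
    w = w - eta * grad / np.sqrt(z + eps)  # 对应正文中的迭代公式
    return w, z
\end{verbatim}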
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
......
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
...@@ -43,15 +51,15 @@ ...@@ -43,15 +51,15 @@
\parinterval 神经机器翻译的迅速崛起确实让所有人都有些措手不及,甚至有一种一觉醒来天翻地覆的感觉。也有人评价,神经机器翻译的出现给整个机器翻译领域带来了前所未有的发展机遇。不过,客观地看,机器翻译达到今天这样的状态也是一种历史必然,其中有几方面原因: \parinterval 神经机器翻译的迅速崛起确实让所有人都有些措手不及,甚至有一种一觉醒来天翻地覆的感觉。也有人评价,神经机器翻译的出现给整个机器翻译领域带来了前所未有的发展机遇。不过,客观地看,机器翻译达到今天这样的状态也是一种历史必然,其中有几方面原因:
\begin{itemize} \begin{itemize}
\vspace{0.5em} \vspace{0.3em}
\item 自上世纪末所发展起来的基于数据驱动的方法为神经机器翻译提供了很好的基础。本质上,神经机器翻译仍然是一种基于统计建模的数据驱动的方法,因此无论是对问题的基本建模方式,还是训练统计模型所使用到的带标注数据,都可以复用机器翻译领域以前的研究成果。特别是机器翻译长期的发展已经积累了大量的双语、单语数据,这些数据在统计机器翻译时代就发挥了很大作用。随着时间的推移,数据规模和质量又得到进一步提升,包括一些评测基准、任务设置都已经非常完备,研究者可以直接在数据条件全部具备的情况下开展神经机器翻译的研究工作,这些都省去了大量的时间成本。从这个角度说,神经机器翻译是站在巨人的肩膀上才发展起来的。 \item 自上世纪末所发展起来的基于数据驱动的方法为神经机器翻译提供了很好的基础。本质上,神经机器翻译仍然是一种基于统计建模的数据驱动的方法,因此无论是对问题的基本建模方式,还是训练统计模型所使用到的带标注数据,都可以复用机器翻译领域以前的研究成果。特别是机器翻译长期的发展已经积累了大量的双语、单语数据,这些数据在统计机器翻译时代就发挥了很大作用。随着时间的推移,数据规模和质量又得到进一步提升,包括一些评测基准、任务设置都已经非常完备,研究者可以直接在数据条件全部具备的情况下开展神经机器翻译的研究工作,这些都省去了大量的时间成本。从这个角度说,神经机器翻译是站在巨人的肩膀上才发展起来的。
\vspace{0.5em} \vspace{0.3em}
\item 深度学习经过长时间的酝酿终于爆发,为机器翻译等自然语言处理任务提供了新的思路和技术手段。神经机器翻译的不断壮大伴随着深度学习技术的发展。在深度学习的视角下,语言文字可以被表示成抽象的实数向量。这种文字的表示方法可以被自动学习,为机器翻译建模提供了更大的灵活性。相对于神经机器翻译,深度学习的发展更加曲折。虽然深度学习经过了漫长的起伏过程,但是神经机器翻译恰好出现在深度学习逐渐走向成熟的阶段。反过来说,受到深度学习及相关技术空前发展的影响,自然语言处理的范式也发生了变化,神经机器翻译的出现只是这种趋势下的一种必然。 \item 深度学习经过长时间的酝酿终于爆发,为机器翻译等自然语言处理任务提供了新的思路和技术手段。神经机器翻译的不断壮大伴随着深度学习技术的发展。在深度学习的视角下,语言文字可以被表示成抽象的实数向量。这种文字的表示方法可以被自动学习,为机器翻译建模提供了更大的灵活性。相对于神经机器翻译,深度学习的发展更加曲折。虽然深度学习经过了漫长的起伏过程,但是神经机器翻译恰好出现在深度学习逐渐走向成熟的阶段。反过来说,受到深度学习及相关技术空前发展的影响,自然语言处理的范式也发生了变化,神经机器翻译的出现只是这种趋势下的一种必然。
\vspace{0.5em} \vspace{0.3em}
\item 此外,计算机算力的提升也为神经机器翻译提供了很好的支撑。与很多神经网络方法一样,神经机器翻译也依赖大量的基于浮点数的矩阵运算。在2000年前,大规模的矩阵运算仍然依赖非常昂贵的CPU集群系统,但是随着GPU等相关技术的发展,在相对低成本的设备上已经可以完成非常复杂的浮点并行运算。这使得包括神经机器翻译在内的很多基于深度学习的系统可以进行大规模实验,随着实验周期的缩短,相关研究和系统的迭代周期大大缩短。实际上,计算机硬件运算能力一直是稳定提升的,神经机器翻译只是受益于运算能力的阶段性突破。 \item 此外,计算机算力的提升也为神经机器翻译提供了很好的支撑。与很多神经网络方法一样,神经机器翻译也依赖大量的基于浮点数的矩阵运算。在2000年前,大规模的矩阵运算仍然依赖非常昂贵的CPU集群系统,但是随着GPU等相关技术的发展,在相对低成本的设备上已经可以完成非常复杂的浮点并行运算。这使得包括神经机器翻译在内的很多基于深度学习的系统可以进行大规模实验,随着实验周期的缩短,相关研究和系统的迭代周期大大缩短。实际上,计算机硬件运算能力一直是稳定提升的,神经机器翻译只是受益于运算能力的阶段性突破。
\vspace{0.5em} \vspace{0.3em}
\item 还有,翻译需求的不断增加也为机器翻译技术提供了新的机会。在近几年,无论是翻译品质,还是翻译语种数量,甚至不同的翻译场景,都对机器翻译有了更高的要求。人们迫切需要一种品质更高、翻译效果稳定的机器翻译方法,神经机器翻译恰好满足了这些要求。当然,应用端需求的增加也会反推机器翻译技术的发展,二者相互促进。 \item 还有,翻译需求的不断增加也为机器翻译技术提供了新的机会。在近几年,无论是翻译品质,还是翻译语种数量,甚至不同的翻译场景,都对机器翻译有了更高的要求。人们迫切需要一种品质更高、翻译效果稳定的机器翻译方法,神经机器翻译恰好满足了这些要求。当然,应用端需求的增加也会反推机器翻译技术的发展,二者相互促进。
\vspace{0.5em} \vspace{0.3em}
\end{itemize} \end{itemize}
\parinterval 至今,神经机器翻译已经成为带有时代特征的标志性方法。当然,机器翻译的发展也远没有达到终点。下面将介绍神经机器翻译的起源和优势,以便读者在正式了解神经机器翻译的技术方法前对其现状有一个充分的认识。 \parinterval 至今,神经机器翻译已经成为带有时代特征的标志性方法。当然,机器翻译的发展也远没有达到终点。下面将介绍神经机器翻译的起源和优势,以便读者在正式了解神经机器翻译的技术方法前对其现状有一个充分的认识。
...@@ -59,7 +67,7 @@ ...@@ -59,7 +67,7 @@
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% NEW SUB-SECTION % NEW SUB-SECTION
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{-0.2em} \vspace{-0.3em}
\subsection{神经机器翻译的起源} \subsection{神经机器翻译的起源}
\parinterval 从广义上讲,神经机器翻译是一种基于人工神经网络的方法,它把翻译过程描述为可以用人工神经网络表示的函数。所有的训练和推断都在这些函数上进行。由于神经机器翻译中的神经网络可以用连续可微函数表示,因此这类方法也可以用基于梯度的方法进行优化,相关技术非常成熟。更为重要的是,在神经网络的设计中,研究者引入了{\small\bfnew{分布式表示}} \index{分布式表示}(Distributed Representation)\index{Distributed Representation}的概念,这也是近些年自然语言处理领域的重要成果之一。传统统计机器翻译仍然把词序列看作离散空间里的由多个特征函数描述的点,类似于$n$-gram语言模型,这类模型对数据稀疏问题非常敏感。此外,人工设计特征也在一定程度上限制了模型对问题的表示能力。神经机器翻译把文字序列表示为实数向量,一方面避免了特征工程繁重的工作,另一方面使得系统可以对文字序列的``表示''进行学习。可以说,神经机器翻译的成功很大程度上源自``表示学习''这种自然语言处理的新范式的出现。在表示学习的基础上,注意力机制、深度神经网络等技术都被应用于神经机器翻译,使其得以进一步发展。
...@@ -883,10 +891,12 @@ a (\mathbf{s},\mathbf{h}) = \left\{ \begin{array}{ll} ...@@ -883,10 +891,12 @@ a (\mathbf{s},\mathbf{h}) = \left\{ \begin{array}{ll}
其中$\mathbf{W}$$\mathbf{v}$是可学习的参数。 其中$\mathbf{W}$$\mathbf{v}$是可学习的参数。
\vspace{0.5em} \vspace{0.5em}
\item 进一步,利用Softmax函数,将相关性系数$\beta_{i,j}$进行指数归一化处理,得到注意力权重$\alpha_{i,j}$ \item 进一步,利用Softmax函数,将相关性系数$\beta_{i,j}$进行指数归一化处理,得到注意力权重$\alpha_{i,j}$
\vspace{0.5em}
\begin{eqnarray} \begin{eqnarray}
\alpha_{i,j}=\frac{\textrm{exp}(\beta_{i,j})} {\sum_{i'} \textrm{exp}(\beta_{i',j})} \alpha_{i,j}=\frac{\textrm{exp}(\beta_{i,j})} {\sum_{i'} \textrm{exp}(\beta_{i',j})}
\label{eq:6-25} \label{eq:6-25}
\end{eqnarray} \end{eqnarray}
\vspace{0.5em}
最终,\{$\alpha_{i,j}$\}可以被看作是一个矩阵,它的长为目标语言句子长度,宽为源语言句子长度,矩阵中的每一项对应一个$\alpha_{i,j}$。图\ref{fig:6-24}给出了\{$\alpha_{i,j}$\}的一个矩阵表示。图中蓝色方框的大小表示不同的注意力权重$\alpha_{i,j}$的大小,方框越大,源语言位置$i$和目标语言位置$j$的相关性越高。能够看到,对于互译的中英文句子,\{$\alpha_{i,j}$\}可以较好地反映两种语言之间不同位置的对应关系。
...@@ -920,6 +930,7 @@ a (\mathbf{s},\mathbf{h}) = \left\{ \begin{array}{ll} ...@@ -920,6 +930,7 @@ a (\mathbf{s},\mathbf{h}) = \left\{ \begin{array}{ll}
\end{eqnarray} \end{eqnarray}
\parinterval 这样,可以在生成每个$y_j$时动态地使用不同的源语言表示$\mathbf{C}_j$,并更准确地捕捉源语言和目标语言不同位置之间的相关性。表\ref{tab:6-7}展示了引入注意力机制前后译文单词生成公式的对比。
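\parinterval 把上述两步合在一起,某个目标语位置上注意力权重与上下文向量$\mathbf{C}_j$的计算可以示意如下。其中score\_fn对应上文的相关性函数$a(\cdot)$,减去最大值是实现中常见的数值稳定技巧,这些细节均为示例性假设:

\begin{verbatim}
import numpy as np

def attention(s, h, score_fn):
    # s: 目标语当前位置的查询; h: 源语言各位置的表示,形状为 (m, d)
    beta = np.array([score_fn(s, h_i) for h_i in h])  # 相关性系数 beta
    alpha = np.exp(beta - beta.max())                 # 减最大值:数值稳定技巧
    alpha = alpha / alpha.sum()                       # Softmax归一化得到注意力权重
    context = np.dot(alpha, h)                        # 上下文向量:各位置表示的加权和
    return context, alpha
\end{verbatim}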
\vspace{0.5em}
%---------------------------------------------- %----------------------------------------------
\begin{table}[htp] \begin{table}[htp]
...@@ -941,6 +952,7 @@ a (\mathbf{s},\mathbf{h}) = \left\{ \begin{array}{ll} ...@@ -941,6 +952,7 @@ a (\mathbf{s},\mathbf{h}) = \left\{ \begin{array}{ll}
\subsubsection{注意力机制的解读} \subsubsection{注意力机制的解读}
\label{sec:6.3.4.3} \label{sec:6.3.4.3}
\vspace{0.5em}
\parinterval 从前面的描述可以看出,注意力机制在机器翻译中就是要回答一个问题:给定一个目标语位置$j$和一系列源语言的不同位置上的表示\{${\mathbf{h}_i}$\},如何得到一个新的表示$\hat{\mathbf{h}}$,使得它与目标语位置$j$对应得最好? \parinterval 从前面的描述可以看出,注意力机制在机器翻译中就是要回答一个问题:给定一个目标语位置$j$和一系列源语言的不同位置上的表示\{${\mathbf{h}_i}$\},如何得到一个新的表示$\hat{\mathbf{h}}$,使得它与目标语位置$j$对应得最好?
\parinterval 那么,如何理解这个过程?注意力机制的本质又是什么呢?换一个角度来看,实际上,目标语位置$j$本质上是一个查询,我们希望从源语言端找到与之最匹配的源语言位置,并返回相应的表示结果。为了描述这个问题,可以建立一个查询系统。假设有一个库,里面包含若干个$\mathrm{key}$-$\mathrm{value}$单元,其中$\mathrm{key}$代表这个单元的索引关键字,$\mathrm{value}$代表这个单元的值。比如,对于学生信息系统,$\mathrm{key}$可以是学号,$\mathrm{value}$可以是学生的身高。当输入一个查询$\mathrm{query}$,我们希望这个系统返回与之最匹配的结果。也就是,希望找到匹配的$\mathrm{key}$,并输出其对应的$\mathrm{value}$。比如,当查询某个学生的身高信息时,可以输入学生的学号,之后在库中查询与这个学号相匹配的记录,并把这个记录中的$\mathrm{value}$(即身高)作为结果返回。 \parinterval 那么,如何理解这个过程?注意力机制的本质又是什么呢?换一个角度来看,实际上,目标语位置$j$本质上是一个查询,我们希望从源语言端找到与之最匹配的源语言位置,并返回相应的表示结果。为了描述这个问题,可以建立一个查询系统。假设有一个库,里面包含若干个$\mathrm{key}$-$\mathrm{value}$单元,其中$\mathrm{key}$代表这个单元的索引关键字,$\mathrm{value}$代表这个单元的值。比如,对于学生信息系统,$\mathrm{key}$可以是学号,$\mathrm{value}$可以是学生的身高。当输入一个查询$\mathrm{query}$,我们希望这个系统返回与之最匹配的结果。也就是,希望找到匹配的$\mathrm{key}$,并输出其对应的$\mathrm{value}$。比如,当查询某个学生的身高信息时,可以输入学生的学号,之后在库中查询与这个学号相匹配的记录,并把这个记录中的$\mathrm{value}$(即身高)作为结果返回。
...@@ -1045,7 +1057,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1045,7 +1057,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
\vspace{0.5em} \vspace{0.5em}
\end{eqnarray} \end{eqnarray}
其中$U(a,b)$表示以$[a,b]$为范围的均匀分布,$6$是固定值。 其中$U(a,b)$表示以$[a,b]$为范围的均匀分布,$6$是固定值。\\
\end{itemize} \end{itemize}
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
...@@ -1054,10 +1066,12 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1054,10 +1066,12 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
\vspace{-0.5em} \vspace{-0.5em}
\subsubsection{优化策略} \subsubsection{优化策略}
\vspace{0.5em}
\parinterval 公式\ref{eq:6-30}展示了最基本的优化策略,也被称为标准的SGD优化器。实际上,训练神经机器翻译模型时,还有非常多的优化器可以选择,在第五章也有详细介绍,这里考虑Adam优化器。 Adam 通过对梯度的{\small\bfnew{一阶矩估计}}\index{一阶矩估计}(First Moment Estimation)\index{First Moment Estimation}{\small\bfnew{二阶矩估计}}\index{二阶矩估计}(Second Moment Estimation)\index{Second Moment Estimation}进行综合考虑,计算出更新步长。 \parinterval 公式\ref{eq:6-30}展示了最基本的优化策略,也被称为标准的SGD优化器。实际上,训练神经机器翻译模型时,还有非常多的优化器可以选择,在第五章也有详细介绍,这里考虑Adam优化器。 Adam 通过对梯度的{\small\bfnew{一阶矩估计}}\index{一阶矩估计}(First Moment Estimation)\index{First Moment Estimation}{\small\bfnew{二阶矩估计}}\index{二阶矩估计}(Second Moment Estimation)\index{Second Moment Estimation}进行综合考虑,计算出更新步长。
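\parinterval 按照Adam原论文中的标准形式,其一次参数更新可以示意如下(超参数取值为常见默认值,仅作参考):

\begin{verbatim}
import numpy as np

def adam_step(w, m, v, grad, t, alpha=0.001,
              beta1=0.9, beta2=0.999, eps=1e-8):
    # t 从1开始计数
    m = beta1 * m + (1 - beta1) * grad         # 梯度的一阶矩估计
    v = beta2 * v + (1 - beta2) * grad * grad  # 梯度的二阶矩估计
    m_hat = m / (1 - beta1 ** t)               # 偏差修正
    v_hat = v / (1 - beta2 ** t)
    w = w - alpha * m_hat / (np.sqrt(v_hat) + eps)
    return w, m, v
\end{verbatim}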
\parinterval\ref{tab:6-8}从效果上对比了Adam和SGD的区别。通常,Adam收敛的比较快,不同任务基本上可以使用一套配置进行优化,虽性能不算差,但很难达到最优效果。相反,SGD虽能通过在不同的数据集上进行调整,来达到最优的结果,但是收敛速度慢。因此需要根据不同的需求来选择合适的优化器。若需要快得到模型的初步结果,选择Adam较为合适,若是需要在一个任务上得到最优的结果,选择SGD更为合适。 \parinterval\ref{tab:6-8}从效果上对比了Adam和SGD的区别。通常,Adam收敛的比较快,不同任务基本上可以使用一套配置进行优化,虽性能不算差,但很难达到最优效果。相反,SGD虽能通过在不同的数据集上进行调整,来达到最优的结果,但是收敛速度慢。因此需要根据不同的需求来选择合适的优化器。若需要快得到模型的初步结果,选择Adam较为合适,若是需要在一个任务上得到最优的结果,选择SGD更为合适。
\vspace{0.5em}
%---------------------------------------------- %----------------------------------------------
\begin{table}[htp] \begin{table}[htp]
\centering \centering
...@@ -1065,8 +1079,8 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1065,8 +1079,8 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
\label{tab:6-8} \label{tab:6-8}
\begin{tabular}{l | l l } \begin{tabular}{l | l l }
&使用 &性能 \\ \hline &使用 &性能 \\ \hline
\rule{0pt}{13pt} Adam &一套配置包打天下 &不算差,但没到极限 \\ \rule{0pt}{15pt} Adam &一套配置包打天下 &不算差,但没到极限 \\
\rule{0pt}{13pt} SGD &换一个任务就得调 &效果好 \\ \rule{0pt}{15pt} SGD &换一个任务就得调 &效果好 \\
\end{tabular} \end{tabular}
\end{table} \end{table}
%---------------------------------------------- %----------------------------------------------
...@@ -1077,12 +1091,14 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1077,12 +1091,14 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
\vspace{-1.5em} \vspace{-1.5em}
\subsubsection{梯度裁剪} \subsubsection{梯度裁剪}
\vspace{0.5em}
\parinterval 需要注意的是,训练循环神经网络时,反向传播使得网络层之间的梯度重复相乘,在网络层数过深时,如果连乘因子小于1可能造成梯度指数级的减少,甚至趋近于0,导致网络无法优化,也就是梯度消失问题。当连乘因子大于1时,可能会导致梯度的乘积变得异常大,造成梯度爆炸的问题。在这种情况下需要使用``梯度裁剪''来防止梯度超过阈值。梯度裁剪在第五章已经介绍过,这里简单回顾一下。梯度裁剪的具体公式如下: \parinterval 需要注意的是,训练循环神经网络时,反向传播使得网络层之间的梯度重复相乘,在网络层数过深时,如果连乘因子小于1可能造成梯度指数级的减少,甚至趋近于0,导致网络无法优化,也就是梯度消失问题。当连乘因子大于1时,可能会导致梯度的乘积变得异常大,造成梯度爆炸的问题。在这种情况下需要使用``梯度裁剪''来防止梯度超过阈值。梯度裁剪在第五章已经介绍过,这里简单回顾一下。梯度裁剪的具体公式如下:
\begin{eqnarray} \begin{eqnarray}
\mathbf{w}' = \mathbf{w} \cdot \frac{\gamma} {\textrm{max}(\gamma,\| \mathbf{w} \|_2)} \mathbf{w}' = \mathbf{w} \cdot \frac{\gamma} {\textrm{max}(\gamma,\| \mathbf{w} \|_2)}
\label{eq:6-33} \label{eq:6-33}
\end{eqnarray} \end{eqnarray}
\vspace{0.5em}
\noindent 其中$\gamma$是手工设定的梯度大小阈值, $\| \cdot \|_2$是L2范数,$\mathbf{w}'$表示裁剪后的梯度。这个公式的含义在于只要梯度大小超过阈值,就按照阈值与当前梯度大小的比例进行放缩。
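\parinterval 公式\ref{eq:6-33}的实现非常直接,如下面的草图所示(基于NumPy,仅为示意):

\begin{verbatim}
import numpy as np

def clip_gradient(grad, gamma):
    norm = np.linalg.norm(grad)             # L2范数
    return grad * gamma / max(gamma, norm)  # 超过阈值则按比例缩放,否则不变
\end{verbatim}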
...@@ -1091,6 +1107,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1091,6 +1107,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\subsubsection{学习率策略} \subsubsection{学习率策略}
\vspace{0.5em}
\parinterval 在公式\ref{eq:6-30}中, $\alpha$决定了每次参数更新时更新的步幅大小,称之为{\small\bfnew{学习率}}\index{学习率}(Learning Rate)\index{Learning Rate}。学习率作为基于梯度方法中的重要超参数,它决定目标函数能否收敛到较好的局部最优点以及收敛的速度。合理的学习率能够使模型快速、稳定地达到较好的状态。但是,如果学习率太小,收敛过程会很慢;而学习率太大,则模型的状态可能会出现震荡,很难达到稳定,甚至使模型无法收敛。图\ref{fig:6-28} 对比了不同学习率对优化过程的影响。 \parinterval 在公式\ref{eq:6-30}中, $\alpha$决定了每次参数更新时更新的步幅大小,称之为{\small\bfnew{学习率}}\index{学习率}(Learning Rate)\index{Learning Rate}。学习率作为基于梯度方法中的重要超参数,它决定目标函数能否收敛到较好的局部最优点以及收敛的速度。合理的学习率能够使模型快速、稳定地达到较好的状态。但是,如果学习率太小,收敛过程会很慢;而学习率太大,则模型的状态可能会出现震荡,很难达到稳定,甚至使模型无法收敛。图\ref{fig:6-28} 对比了不同学习率对优化过程的影响。
...@@ -1104,6 +1121,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1104,6 +1121,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
%---------------------------------------------- %----------------------------------------------
\parinterval 不同优化器需要的学习率不同,比如Adam一般使用0.001或0.0001,而SGD则在0.1$\sim$1之间进行挑选。在梯度下降法中,给定的是统一的学习率,整个优化过程中都以确定的步长进行更新。因此,无论使用哪个优化器,为了保证训练又快又好,通常都需要根据当前的更新次数来动态调整学习率的大小。
\vspace{0.5em}
%---------------------------------------------- %----------------------------------------------
\begin{figure}[htp] \begin{figure}[htp]
...@@ -1115,6 +1133,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1115,6 +1133,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
%---------------------------------------------- %----------------------------------------------
\parinterval\ref{fig:6-29}展示了一种常用的学习率调整策略。它分为两个阶段:预热阶段和衰减阶段。模型训练初期梯度通常很大,如果直接使用较大的学习率很容易让模型陷入局部最优。学习率的预热阶段便是通过在训练初期使学习率从小到大逐渐增加来减缓在初始阶段模型``跑偏''的现象。一般来说,初始学习率太高会使得模型进入一种损失函数曲面非常不平滑的区域,进而使得模型进入一种混乱状态,后续的优化过程很难取得很好的效果。一个常用的学习率预热方法是{\small\bfnew{逐渐预热}}\index{逐渐预热}(Gradual Warmup)\index{Gradual Warmup}。假设预热的更新次数为$T'$,初始学习率为$\alpha_0$,则预热阶段第$t$次更新的学习率为: \parinterval\ref{fig:6-29}展示了一种常用的学习率调整策略。它分为两个阶段:预热阶段和衰减阶段。模型训练初期梯度通常很大,如果直接使用较大的学习率很容易让模型陷入局部最优。学习率的预热阶段便是通过在训练初期使学习率从小到大逐渐增加来减缓在初始阶段模型``跑偏''的现象。一般来说,初始学习率太高会使得模型进入一种损失函数曲面非常不平滑的区域,进而使得模型进入一种混乱状态,后续的优化过程很难取得很好的效果。一个常用的学习率预热方法是{\small\bfnew{逐渐预热}}\index{逐渐预热}(Gradual Warmup)\index{Gradual Warmup}。假设预热的更新次数为$T'$,初始学习率为$\alpha_0$,则预热阶段第$t$次更新的学习率为:
\vspace{0.5em}
\begin{eqnarray} \begin{eqnarray}
\alpha_t = \frac{t}{T'} \alpha_0 \quad,\quad 1 \leq t \leq T' \alpha_t = \frac{t}{T'} \alpha_0 \quad,\quad 1 \leq t \leq T'
\label{eq:6-34} \label{eq:6-34}
...@@ -1129,6 +1148,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1129,6 +1148,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
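\parinterval 把预热与衰减两个阶段合起来,学习率调度器的一个草图如下。注意,衰减阶段这里假设采用指数衰减,实际系统中也可以替换为其他衰减函数:

\begin{verbatim}
def learning_rate(t, alpha0, warmup_steps, decay=0.9998):
    # 预热阶段:学习率从小到大线性增加(对应正文中的预热公式)
    if t <= warmup_steps:
        return alpha0 * t / warmup_steps
    # 衰减阶段:这里假设采用指数衰减
    return alpha0 * decay ** (t - warmup_steps)
\end{verbatim}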
\vspace{0.5em} \vspace{0.5em}
\subsubsection{并行训练} \subsubsection{并行训练}
\vspace{0.5em}
\parinterval 机器翻译是自然语言处理中很``重''的任务。因为数据量巨大而且模型较为复杂,模型训练的时间往往很长。比如,使用一千万句的训练数据,性能优异的系统往往需要几天甚至一周的时间。更大规模的数据会导致训练时间更长。特别是使用多层网络同时增加模型容量时(比如增加隐层宽度时),神经机器翻译的训练会更加缓慢。对于这个问题,一个思路是从模型训练算法上进行改进。比如前面提到的Adam就是一种高效的训练策略。另一种思路是利用多设备进行加速,也称作分布式训练。 \parinterval 机器翻译是自然语言处理中很``重''的任务。因为数据量巨大而且模型较为复杂,模型训练的时间往往很长。比如,使用一千万句的训练数据,性能优异的系统往往需要几天甚至一周的时间。更大规模的数据会导致训练时间更长。特别是使用多层网络同时增加模型容量时(比如增加隐层宽度时),神经机器翻译的训练会更加缓慢。对于这个问题,一个思路是从模型训练算法上进行改进。比如前面提到的Adam就是一种高效的训练策略。另一种思路是利用多设备进行加速,也称作分布式训练。
\vspace{0.5em} \vspace{0.5em}
...@@ -1139,8 +1159,8 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1139,8 +1159,8 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
\label{tab:6-9} \label{tab:6-9}
\begin{tabular}{l | p{12em} p{12em} } \begin{tabular}{l | p{12em} p{12em} }
&优点 &缺点 \\ \hline &优点 &缺点 \\ \hline
\rule{0pt}{13pt} 数据并行 &并行度高,理论上有多大的batch(批次)就可以有多少个设备并行计算 &模型不能大于单个设备的极限 \\ \rule{0pt}{15pt} 数据并行 &并行度高,理论上有多大的batch(批次)就可以有多少个设备并行计算 &模型不能大于单个设备的极限 \\
\rule{0pt}{13pt} 模型并行 &可以对很大的模型进行运算 &只能有限并行,比如多少层就多少个设备 \\ \rule{0pt}{15pt} 模型并行 &可以对很大的模型进行运算 &只能有限并行,比如多少层就多少个设备 \\
\end{tabular} \end{tabular}
\end{table} \end{table}
%---------------------------------------------- %----------------------------------------------
...@@ -1149,6 +1169,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1149,6 +1169,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
\begin{itemize} \begin{itemize}
\vspace{0.5em} \vspace{0.5em}
\item {\small\bfnew{数据并行}}\index{数据并行}。如果一台设备能完整放下一个神经机器翻译模型,那么数据并行可以把一个大批次均匀切分成$n$个小批次,然后分发到$n$个设备上并行计算,最后把结果汇总,相当于把运算时间变为原来的${1}/{n}$,数据并行的过程如图\ref{fig:6-30}所示。不过,需要注意的是,多设备并行需要在不同设备间传输数据,特别是多个GPU的情况,设备间传输的带宽十分有限,设备间传输数据往往会造成额外的时间消耗\cite{xiao2017fast}。通常,数据并行的训练速度无法随着设备数量增加呈线性增长。不过这个问题也有很多优秀的解决方案,比如采用多个设备的异步训练,但是这些内容已经超出本章的内容,因此这里不做过多讨论。
%---------------------------------------------- %----------------------------------------------
...@@ -1221,10 +1242,11 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1221,10 +1242,11 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION % NEW SUBSUB-SECTION
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{1.0em}
\subsubsection{贪婪搜索} \subsubsection{贪婪搜索}
\vspace{0.6em}
\parinterval\ref{fig:6-32}展示了一个基于贪婪方法的神经机器翻译解码过程。每一个时间步的单词预测都依赖于其前一步单词的生成。在解码第一个单词时,由于没有之前的单词信息,会用<sos>进行填充,作为起始的单词,且会用一个零向量(可以理解为没有之前时间步的信息)表示第0步的中间层状态。 \parinterval\ref{fig:6-32}展示了一个基于贪婪方法的神经机器翻译解码过程。每一个时间步的单词预测都依赖于其前一步单词的生成。在解码第一个单词时,由于没有之前的单词信息,会用<sos>进行填充,作为起始的单词,且会用一个零向量(可以理解为没有之前时间步的信息)表示第0步的中间层状态。
\vspace{0.8em}
%---------------------------------------------- %----------------------------------------------
\begin{figure}[htp] \begin{figure}[htp]
...@@ -1235,6 +1257,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1235,6 +1257,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
\end{figure} \end{figure}
%---------------------------------------------- %----------------------------------------------
\vspace{0.2em}
\parinterval 解码端的每一步Softmax层会输出所有单词的概率,由于是基于贪心的方法,这里会选择概率最大(top-1)的单词作为输出。这个过程可以参考图\ref{fig:6-33}的内容。我们选择分布中概率最大的单词``Have''作为得到的第一个单词,并再次送入解码器,作为第二步的输入同时预测下一个单词。以此类推,直到生成句子的终止符为止,就得到了完整的译文。 \parinterval 解码端的每一步Softmax层会输出所有单词的概率,由于是基于贪心的方法,这里会选择概率最大(top-1)的单词作为输出。这个过程可以参考图\ref{fig:6-33}的内容。我们选择分布中概率最大的单词``Have''作为得到的第一个单词,并再次送入解码器,作为第二步的输入同时预测下一个单词。以此类推,直到生成句子的终止符为止,就得到了完整的译文。
\parinterval 贪婪搜索的优点在于速度快。在对翻译速度有较高要求的场景中,贪婪搜索是一种十分有效的对系统加速的方法。而且贪婪搜索的原理非常简单,易于快速实现原型。不过,由于每一步只保留一个最好的局部结果,贪婪搜索往往会带来翻译品质上的损失。
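\parinterval 贪婪搜索的解码循环可以示意如下。其中decoder\_step表示一步解码(输入上一个单词和状态,返回Softmax概率分布与新状态),该接口以及sos\_id、eos\_id等命名均为示例性假设:

\begin{verbatim}
def greedy_decode(decoder_step, init_state, sos_id, eos_id, max_len=100):
    word, state, output = sos_id, init_state, []
    for _ in range(max_len):
        probs, state = decoder_step(word, state)  # Softmax输出所有单词的概率
        word = int(probs.argmax())                # 贪婪:取概率最大(top-1)的单词
        if word == eos_id:                        # 生成终止符则停止
            break
        output.append(word)
    return output
\end{verbatim}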
...@@ -1253,8 +1276,10 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1253,8 +1276,10 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\subsubsection{束搜索} \subsubsection{束搜索}
\vspace{0.5em}
\parinterval 束搜索是一种启发式图搜索算法。相比于全搜索,它可以减少搜索所占用的空间和时间,在每一步扩展的时候,剪掉一些质量比较差的结点,保留下一些质量较高的结点。具体到机器翻译任务,对于每一个目标语位置,束搜索选择了概率最大的前$K$个单词进行扩展(其中$K$叫做束宽度,或简称为束宽)。如图\ref{fig:6-34}所示,假设\{$y_1, y_2,..., y_n$\}表示生成的目标语序列,且$K=3$,则束搜索的具体过程为:在预测第一个位置时,可以通过模型得到$y_1$的概率分布,选取概率最大的前3个单词作为候选结果(假设分别为``have'', ``has'', ``it'')。在预测第二个位置的单词时,模型针对已经得到的三个候选结果(``have'', ``has'', ``it'')计算第二个单词的概率分布。例如,可以将``have''作为第二步的输入,计算$y_2$的概率分布。此时,译文序列的概率为
\begin{eqnarray} \begin{eqnarray}
\textrm{P} (y_2,y_1 | \mathbf{x}) & = & \textrm{P} (y_2, \textrm{``have''} | \mathbf{x}) \nonumber \\ \textrm{P} (y_2,y_1 | \mathbf{x}) & = & \textrm{P} (y_2, \textrm{``have''} | \mathbf{x}) \nonumber \\
& = & \textrm{P}(y_2 | \textrm{``have''} , \mathbf{x}) \cdot \textrm{P} (\textrm{``have''} | \mathbf{x}) & = & \textrm{P}(y_2 | \textrm{``have''} , \mathbf{x}) \cdot \textrm{P} (\textrm{``have''} | \mathbf{x})
...@@ -1317,10 +1342,12 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1317,10 +1342,12 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\subsection{实例-GNMT} \subsection{实例-GNMT}
\vspace{0.5em}
\parinterval 循环神经网络在机器翻译中有很多成功的应用,比如RNNSearch\cite{bahdanau2014neural}、Nematus\cite{DBLP:journals/corr/SennrichFCBHHJL17}等系统就被很多研究者作为实验系统。在众多基于循环神经网络的系统中,Google's Neural Machine Translation System(GNMT)系统是非常成功的一个\cite{Wu2016GooglesNM}。GNMT是谷歌2016年发布的神经机器翻译系统。当时,神经机器翻译有三个弱点:训练和推理速度较慢、在翻译稀有单词上缺乏鲁棒性和有时无法完整翻译源语言句子中的所有单词。GNMT的提出有效地缓解了上述问题。
\parinterval GNMT使用了编码器-解码器结构,构建了一个8层的深度网络,每层网络均由LSTM组成,且在编码器-解码器之间使用了多层注意力连接。其结构如图\ref{fig:6-35}所示,编码器只有最下面2层为双向LSTM。GNMT在束搜索中也加入了长度惩罚和覆盖度因子来确保输出高质量的翻译结果(公式\ref{eq:6-41})。
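\parinterval 作为参考,GNMT论文中长度惩罚的一种常见写法以及据此对候选译文重打分的过程可以示意如下。其中alpha=0.6仅为示例取值,覆盖度项的细节这里从略:

\begin{verbatim}
def length_penalty(length, alpha=0.6):
    # GNMT中长度惩罚的一种常见写法:lp(Y) = (5+|Y|)^alpha / (5+1)^alpha
    return ((5.0 + length) ** alpha) / ((5.0 + 1.0) ** alpha)

def rescore(log_prob, length, coverage=0.0):
    # 候选译文得分 = 对数概率 / 长度惩罚 + 覆盖度项(覆盖度细节从略)
    return log_prob / length_penalty(length) + coverage
\end{verbatim}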
\vspace{0.5em}
%---------------------------------------------- %----------------------------------------------
\begin{figure}[htp] \begin{figure}[htp]
...@@ -1402,6 +1429,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\ ...@@ -1402,6 +1429,7 @@ L(\mathbf{Y},\widehat{\mathbf{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbf{y}_j,\
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\subsection{自注意力模型} \subsection{自注意力模型}
\vspace{0.5em}
\label{sec:6.4.1} \label{sec:6.4.1}
\parinterval 首先,再回顾一下循环神经网络处理文字序列的过程。如图\ref{fig:6-36}所示,对于单词序列$\{ w_1,...,w_m \}$,处理第$m$个单词$w_m$时(绿色方框部分),需要输入前一时刻的信息(即处理单词$w_{m-1}$),而$w_{m-1}$又依赖于$w_{m-2}$,以此类推。也就是说,如果想建立$w_m$$w_1$之间的关系,需要$m-1$次信息传递。对于长序列来说,词汇之间信息传递距离过长会导致信息在传递过程中丢失,同时这种按顺序建模的方式也使得系统对序列的处理十分缓慢。 \parinterval 首先,再回顾一下循环神经网络处理文字序列的过程。如图\ref{fig:6-36}所示,对于单词序列$\{ w_1,...,w_m \}$,处理第$m$个单词$w_m$时(绿色方框部分),需要输入前一时刻的信息(即处理单词$w_{m-1}$),而$w_{m-1}$又依赖于$w_{m-2}$,以此类推。也就是说,如果想建立$w_m$$w_1$之间的关系,需要$m-1$次信息传递。对于长序列来说,词汇之间信息传递距离过长会导致信息在传递过程中丢失,同时这种按顺序建模的方式也使得系统对序列的处理十分缓慢。
......
...@@ -15,52 +15,52 @@ ...@@ -15,52 +15,52 @@
% Skeleton % Skeleton
\begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}] \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
\coordinate (aux12) at (0,0); \coordinate (aux12) at (0,0);
\node[auxnode,label={-45:12}] () at (aux12) {}; %\node[auxnode,label={-45:12}] () at (aux12) {};
\coordinate (aux22) at ([yshift=\base]aux12); \coordinate (aux22) at ([yshift=\base]aux12);
\node[auxnode,label={-45:22}] () at (aux22) {}; %\node[auxnode,label={-45:22}] () at (aux22) {};
\coordinate (aux23) at ([xshift=0.5\base]aux22); \coordinate (aux23) at ([xshift=0.5\base]aux22);
\node[auxnode,label={-45:23}] () at (aux23) {}; %\node[auxnode,label={-45:23}] () at (aux23) {};
\coordinate (aux32) at ([yshift=0.4\base]aux22); \coordinate (aux32) at ([yshift=0.4\base]aux22);
\node[auxnode,label={-45:32}] () at (aux32) {}; %\node[auxnode,label={-45:32}] () at (aux32) {};
\ExtractX{$([xshift=\base]aux23)$} \ExtractX{$([xshift=\base]aux23)$}
\ExtractY{$([yshift=\base]aux32)$} \ExtractY{$([yshift=\base]aux32)$}
\coordinate (aux44) at (\XCoord,\YCoord); \coordinate (aux44) at (\XCoord,\YCoord);
\node[auxnode,label={-45:44}] () at (aux44) {}; %\node[auxnode,label={-45:44}] () at (aux44) {};
\coordinate (aux45) at ([xshift=\base]aux44); \coordinate (aux45) at ([xshift=\base]aux44);
\node[auxnode,label={-45:45}] () at (aux45) {}; %\node[auxnode,label={-45:45}] () at (aux45) {};
\coordinate (aux46) at ([xshift=1.3\base]aux45); \coordinate (aux46) at ([xshift=1.3\base]aux45);
\node[auxnode,label={-45:46}] () at (aux46) {}; %\node[auxnode,label={-45:46}] () at (aux46) {};
\ExtractX{$(aux23)$} \ExtractX{$(aux23)$}
\ExtractY{$([yshift=\base]aux44)$} \ExtractY{$([yshift=\base]aux44)$}
\coordinate (aux53) at (\XCoord,\YCoord); \coordinate (aux53) at (\XCoord,\YCoord);
\node[auxnode,label={-45:53}] () at (aux53) {}; %\node[auxnode,label={-45:53}] () at (aux53) {};
\coordinate (aux56) at ([yshift=\base]aux46); \coordinate (aux56) at ([yshift=\base]aux46);
\node[auxnode,label={-45:56}] () at (aux56) {}; %\node[auxnode,label={-45:56}] () at (aux56) {};
\ExtractX{$(aux45)$} \ExtractX{$(aux45)$}
\ExtractY{$([yshift=0.5\base]aux56)$} \ExtractY{$([yshift=0.5\base]aux56)$}
\coordinate (aux65) at (\XCoord,\YCoord); \coordinate (aux65) at (\XCoord,\YCoord);
\node[auxnode,label={-45:65}] () at (aux65) {}; %\node[auxnode,label={-45:65}] () at (aux65) {};
\ExtractX{$([xshift=-\base]aux12)$} \ExtractX{$([xshift=-\base]aux12)$}
\ExtractY{$([yshift=\base]aux65)$} \ExtractY{$([yshift=\base]aux65)$}
\coordinate (aux71) at (\XCoord,\YCoord); \coordinate (aux71) at (\XCoord,\YCoord);
\node[auxnode,label={-45:71}] () at (aux71) {}; %\node[auxnode,label={-45:71}] () at (aux71) {};
\coordinate (aux75) at ([yshift=\base]aux65); \coordinate (aux75) at ([yshift=\base]aux65);
\node[auxnode,label={-45:75}] () at (aux75) {}; %\node[auxnode,label={-45:75}] () at (aux75) {};
\ExtractX{$(aux56)$} \ExtractX{$(aux56)$}
\ExtractY{$(aux75)$} \ExtractY{$(aux75)$}
\coordinate (aux76) at (\XCoord,\YCoord); \coordinate (aux76) at (\XCoord,\YCoord);
\node[auxnode,label={-45:76}] () at (aux76) {}; %\node[auxnode,label={-45:76}] () at (aux76) {};
\coordinate (aux78) at ([xshift=1.7\base]aux76); \coordinate (aux78) at ([xshift=1.7\base]aux76);
\node[auxnode,label={-45:78}] () at (aux78) {}; %\node[auxnode,label={-45:78}] () at (aux78) {};
\coordinate (aux87) at ([shift={(0.7\base,1.3\base)}]aux76); \coordinate (aux87) at ([shift={(0.7\base,1.3\base)}]aux76);
\node[auxnode,label={-45:87}] () at (aux87) {}; %\node[auxnode,label={-45:87}] () at (aux87) {};
\end{scope} \end{scope}
\begin{scope} \begin{scope}
......
...@@ -16,52 +16,52 @@ ...@@ -16,52 +16,52 @@
% Skeleton % Skeleton
\begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}] \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
\coordinate (aux12) at (0,0); \coordinate (aux12) at (0,0);
\node[auxnode,label={-45:12}] () at (aux12) {}; %\node[auxnode,label={-45:12}] () at (aux12) {};
\coordinate (aux22) at ([yshift=\base]aux12); \coordinate (aux22) at ([yshift=\base]aux12);
\node[auxnode,label={-45:22}] () at (aux22) {}; %\node[auxnode,label={-45:22}] () at (aux22) {};
\coordinate (aux23) at ([xshift=0.5\base]aux22); \coordinate (aux23) at ([xshift=0.5\base]aux22);
\node[auxnode,label={-45:23}] () at (aux23) {}; %\node[auxnode,label={-45:23}] () at (aux23) {};
\coordinate (aux32) at ([yshift=0.4\base]aux22); \coordinate (aux32) at ([yshift=0.4\base]aux22);
\node[auxnode,label={-45:32}] () at (aux32) {}; %\node[auxnode,label={-45:32}] () at (aux32) {};
\ExtractX{$([xshift=\base]aux23)$} \ExtractX{$([xshift=\base]aux23)$}
\ExtractY{$([yshift=\base]aux32)$} \ExtractY{$([yshift=\base]aux32)$}
\coordinate (aux44) at (\XCoord,\YCoord); \coordinate (aux44) at (\XCoord,\YCoord);
\node[auxnode,label={-45:44}] () at (aux44) {}; %\node[auxnode,label={-45:44}] () at (aux44) {};
\coordinate (aux45) at ([xshift=\base]aux44); \coordinate (aux45) at ([xshift=\base]aux44);
\node[auxnode,label={-45:45}] () at (aux45) {}; %\node[auxnode,label={-45:45}] () at (aux45) {};
\coordinate (aux46) at ([xshift=1.3\base]aux45); \coordinate (aux46) at ([xshift=1.3\base]aux45);
\node[auxnode,label={-45:46}] () at (aux46) {}; %\node[auxnode,label={-45:46}] () at (aux46) {};
\ExtractX{$(aux23)$} \ExtractX{$(aux23)$}
\ExtractY{$([yshift=\base]aux44)$} \ExtractY{$([yshift=\base]aux44)$}
\coordinate (aux53) at (\XCoord,\YCoord); \coordinate (aux53) at (\XCoord,\YCoord);
\node[auxnode,label={-45:53}] () at (aux53) {}; %\node[auxnode,label={-45:53}] () at (aux53) {};
\coordinate (aux56) at ([yshift=\base]aux46); \coordinate (aux56) at ([yshift=\base]aux46);
\node[auxnode,label={-45:56}] () at (aux56) {}; %\node[auxnode,label={-45:56}] () at (aux56) {};
\ExtractX{$(aux45)$} \ExtractX{$(aux45)$}
\ExtractY{$([yshift=0.5\base]aux56)$} \ExtractY{$([yshift=0.5\base]aux56)$}
\coordinate (aux65) at (\XCoord,\YCoord); \coordinate (aux65) at (\XCoord,\YCoord);
\node[auxnode,label={-45:65}] () at (aux65) {}; %\node[auxnode,label={-45:65}] () at (aux65) {};
\ExtractX{$([xshift=-\base]aux12)$} \ExtractX{$([xshift=-\base]aux12)$}
\ExtractY{$([yshift=\base]aux65)$} \ExtractY{$([yshift=\base]aux65)$}
\coordinate (aux71) at (\XCoord,\YCoord); \coordinate (aux71) at (\XCoord,\YCoord);
\node[auxnode,label={-45:71}] () at (aux71) {}; %\node[auxnode,label={-45:71}] () at (aux71) {};
\coordinate (aux75) at ([yshift=\base]aux65); \coordinate (aux75) at ([yshift=\base]aux65);
\node[auxnode,label={-45:75}] () at (aux75) {}; %\node[auxnode,label={-45:75}] () at (aux75) {};
\ExtractX{$(aux56)$} \ExtractX{$(aux56)$}
\ExtractY{$(aux75)$} \ExtractY{$(aux75)$}
\coordinate (aux76) at (\XCoord,\YCoord); \coordinate (aux76) at (\XCoord,\YCoord);
\node[auxnode,label={-45:76}] () at (aux76) {}; %\node[auxnode,label={-45:76}] () at (aux76) {};
\coordinate (aux78) at ([xshift=1.7\base]aux76); \coordinate (aux78) at ([xshift=1.7\base]aux76);
\node[auxnode,label={-45:78}] () at (aux78) {}; %\node[auxnode,label={-45:78}] () at (aux78) {};
\coordinate (aux87) at ([shift={(0.7\base,1.3\base)}]aux76); \coordinate (aux87) at ([shift={(0.7\base,1.3\base)}]aux76);
\node[auxnode,label={-45:87}] () at (aux87) {}; %\node[auxnode,label={-45:87}] () at (aux87) {};
\end{scope} \end{scope}
\begin{scope} \begin{scope}
......
...@@ -16,52 +16,52 @@ ...@@ -16,52 +16,52 @@
% Skeleton % Skeleton
\begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}] \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
\coordinate (aux12) at (0,0); \coordinate (aux12) at (0,0);
\node[auxnode,label={-45:12}] () at (aux12) {}; %\node[auxnode,label={-45:12}] () at (aux12) {};
\coordinate (aux22) at ([yshift=\base]aux12); \coordinate (aux22) at ([yshift=\base]aux12);
\node[auxnode,label={-45:22}] () at (aux22) {}; %\node[auxnode,label={-45:22}] () at (aux22) {};
\coordinate (aux23) at ([xshift=0.5\base]aux22); \coordinate (aux23) at ([xshift=0.5\base]aux22);
\node[auxnode,label={-45:23}] () at (aux23) {}; %\node[auxnode,label={-45:23}] () at (aux23) {};
\coordinate (aux32) at ([yshift=0.4\base]aux22); \coordinate (aux32) at ([yshift=0.4\base]aux22);
\node[auxnode,label={-45:32}] () at (aux32) {}; %\node[auxnode,label={-45:32}] () at (aux32) {};
\ExtractX{$([xshift=\base]aux23)$} \ExtractX{$([xshift=\base]aux23)$}
\ExtractY{$([yshift=\base]aux32)$} \ExtractY{$([yshift=\base]aux32)$}
\coordinate (aux44) at (\XCoord,\YCoord); \coordinate (aux44) at (\XCoord,\YCoord);
\node[auxnode,label={-45:44}] () at (aux44) {}; %\node[auxnode,label={-45:44}] () at (aux44) {};
\coordinate (aux45) at ([xshift=\base]aux44); \coordinate (aux45) at ([xshift=\base]aux44);
\node[auxnode,label={-45:45}] () at (aux45) {}; %\node[auxnode,label={-45:45}] () at (aux45) {};
\coordinate (aux46) at ([xshift=1.3\base]aux45); \coordinate (aux46) at ([xshift=1.3\base]aux45);
\node[auxnode,label={-45:46}] () at (aux46) {}; %\node[auxnode,label={-45:46}] () at (aux46) {};
\ExtractX{$(aux23)$} \ExtractX{$(aux23)$}
\ExtractY{$([yshift=\base]aux44)$} \ExtractY{$([yshift=\base]aux44)$}
\coordinate (aux53) at (\XCoord,\YCoord); \coordinate (aux53) at (\XCoord,\YCoord);
\node[auxnode,label={-45:53}] () at (aux53) {}; %\node[auxnode,label={-45:53}] () at (aux53) {};
\coordinate (aux56) at ([yshift=\base]aux46); \coordinate (aux56) at ([yshift=\base]aux46);
\node[auxnode,label={-45:56}] () at (aux56) {}; %\node[auxnode,label={-45:56}] () at (aux56) {};
\ExtractX{$(aux45)$} \ExtractX{$(aux45)$}
\ExtractY{$([yshift=0.5\base]aux56)$} \ExtractY{$([yshift=0.5\base]aux56)$}
\coordinate (aux65) at (\XCoord,\YCoord); \coordinate (aux65) at (\XCoord,\YCoord);
\node[auxnode,label={-45:65}] () at (aux65) {}; %\node[auxnode,label={-45:65}] () at (aux65) {};
\ExtractX{$([xshift=-\base]aux12)$} \ExtractX{$([xshift=-\base]aux12)$}
\ExtractY{$([yshift=\base]aux65)$} \ExtractY{$([yshift=\base]aux65)$}
\coordinate (aux71) at (\XCoord,\YCoord); \coordinate (aux71) at (\XCoord,\YCoord);
\node[auxnode,label={-45:71}] () at (aux71) {}; %\node[auxnode,label={-45:71}] () at (aux71) {};
\coordinate (aux75) at ([yshift=\base]aux65); \coordinate (aux75) at ([yshift=\base]aux65);
\node[auxnode,label={-45:75}] () at (aux75) {}; %\node[auxnode,label={-45:75}] () at (aux75) {};
\ExtractX{$(aux56)$} \ExtractX{$(aux56)$}
\ExtractY{$(aux75)$} \ExtractY{$(aux75)$}
\coordinate (aux76) at (\XCoord,\YCoord); \coordinate (aux76) at (\XCoord,\YCoord);
\node[auxnode,label={-45:76}] () at (aux76) {}; %\node[auxnode,label={-45:76}] () at (aux76) {};
\coordinate (aux78) at ([xshift=1.7\base]aux76); \coordinate (aux78) at ([xshift=1.7\base]aux76);
\node[auxnode,label={-45:78}] () at (aux78) {}; %\node[auxnode,label={-45:78}] () at (aux78) {};
\coordinate (aux87) at ([shift={(0.7\base,1.3\base)}]aux76); \coordinate (aux87) at ([shift={(0.7\base,1.3\base)}]aux76);
\node[auxnode,label={-45:87}] () at (aux87) {}; %\node[auxnode,label={-45:87}] () at (aux87) {};
\end{scope} \end{scope}
\begin{scope} \begin{scope}
......
...@@ -15,60 +15,60 @@ ...@@ -15,60 +15,60 @@
% Skeleton % Skeleton
\begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}] \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
\coordinate (aux12) at (0,0); \coordinate (aux12) at (0,0);
\node[auxnode,label={-45:12}] () at (aux12) {}; %\node[auxnode,label={-45:12}] () at (aux12) {};
\coordinate (aux22) at ([yshift=\base]aux12); \coordinate (aux22) at ([yshift=\base]aux12);
\node[auxnode,label={-45:22}] () at (aux22) {}; %\node[auxnode,label={-45:22}] () at (aux22) {};
\coordinate (aux21) at ([xshift=-2\base]aux22); \coordinate (aux21) at ([xshift=-2\base]aux22);
\node[auxnode,label={-45:21}] () at (aux21) {}; %\node[auxnode,label={-45:21}] () at (aux21) {};
\coordinate (aux23) at ([xshift=\base]aux22); \coordinate (aux23) at ([xshift=\base]aux22);
\node[auxnode,label={-45:23}] () at (aux23) {}; %\node[auxnode,label={-45:23}] () at (aux23) {};
\coordinate (aux24) at ([xshift=\base]aux23); \coordinate (aux24) at ([xshift=\base]aux23);
\node[auxnode,label={-45:24}] () at (aux24) {}; %\node[auxnode,label={-45:24}] () at (aux24) {};
\coordinate (aux25) at ([xshift=\base]aux24); \coordinate (aux25) at ([xshift=\base]aux24);
\node[auxnode,label={-45:25}] () at (aux25) {}; %\node[auxnode,label={-45:25}] () at (aux25) {};
\coordinate (aux26) at ([xshift=\base]aux25); \coordinate (aux26) at ([xshift=\base]aux25);
\node[auxnode,label={-45:26}] () at (aux26) {}; %\node[auxnode,label={-45:26}] () at (aux26) {};
\coordinate (aux27) at ([xshift=\base]aux26); \coordinate (aux27) at ([xshift=\base]aux26);
\node[auxnode,label={-45:27}] () at (aux27) {}; %\node[auxnode,label={-45:27}] () at (aux27) {};
\coordinate (aux28) at ([xshift=\base]aux27); \coordinate (aux28) at ([xshift=\base]aux27);
\node[auxnode,label={-45:28}] () at (aux28) {}; %\node[auxnode,label={-45:28}] () at (aux28) {};
\coordinate (aux29) at ([xshift=2\base]aux28); \coordinate (aux29) at ([xshift=2\base]aux28);
\node[auxnode,label={-45:29}] () at (aux29) {}; %\node[auxnode,label={-45:29}] () at (aux29) {};
\coordinate (aux33) at ([yshift=\base]aux23); \coordinate (aux33) at ([yshift=\base]aux23);
\node[auxnode,label={-45:33}] () at (aux33) {}; %\node[auxnode,label={-45:33}] () at (aux33) {};
\coordinate (aux34) at ([yshift=\base]aux24); \coordinate (aux34) at ([yshift=\base]aux24);
\node[auxnode,label={-45:34}] () at (aux34) {}; %\node[auxnode,label={-45:34}] () at (aux34) {};
\coordinate (aux35) at ([yshift=\base]aux25); \coordinate (aux35) at ([yshift=\base]aux25);
\node[auxnode,label={-45:35}] () at (aux35) {}; %\node[auxnode,label={-45:35}] () at (aux35) {};
\coordinate (aux37) at ([yshift=\base]aux27); \coordinate (aux37) at ([yshift=\base]aux27);
\node[auxnode,label={-45:37}] () at (aux37) {}; %\node[auxnode,label={-45:37}] () at (aux37) {};
\coordinate (aux45) at ([yshift=\base]aux35); \coordinate (aux45) at ([yshift=\base]aux35);
\node[auxnode,label={-45:45}] () at (aux45) {}; %\node[auxnode,label={-45:45}] () at (aux45) {};
\coordinate (aux55) at ([yshift=\base]aux45); \coordinate (aux55) at ([yshift=\base]aux45);
\node[auxnode,label={-45:55}] () at (aux55) {}; %\node[auxnode,label={-45:55}] () at (aux55) {};
\ExtractX{$(aux21)$} \ExtractX{$(aux21)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux51) at (\XCoord,\YCoord); \coordinate (aux51) at (\XCoord,\YCoord);
\node[auxnode,label={-45:51}] () at (aux51) {}; %\node[auxnode,label={-45:51}] () at (aux51) {};
\ExtractX{$(aux23)$} \ExtractX{$(aux23)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux53) at (\XCoord,\YCoord); \coordinate (aux53) at (\XCoord,\YCoord);
\node[auxnode,label={-45:53}] () at (aux53) {}; %\node[auxnode,label={-45:53}] () at (aux53) {};
\ExtractX{$(aux28)$} \ExtractX{$(aux28)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux58) at (\XCoord,\YCoord); \coordinate (aux58) at (\XCoord,\YCoord);
\node[auxnode,label={-45:58}] () at (aux58) {}; %\node[auxnode,label={-45:58}] () at (aux58) {};
\ExtractX{$(aux29)$} \ExtractX{$(aux29)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux59) at (\XCoord,\YCoord); \coordinate (aux59) at (\XCoord,\YCoord);
\node[auxnode,label={-45:59}] () at (aux59) {}; %\node[auxnode,label={-45:59}] () at (aux59) {};
\coordinate (aux68) at ([yshift=\base]aux58); \coordinate (aux68) at ([yshift=\base]aux58);
\node[auxnode,label={-45:68}] () at (aux68) {}; %\node[auxnode,label={-45:68}] () at (aux68) {};
\end{scope} \end{scope}
\begin{scope} \begin{scope}
......
...@@ -16,60 +16,60 @@ ...@@ -16,60 +16,60 @@
% Skeleton % Skeleton
\begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}] \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
\coordinate (aux12) at (0,0); \coordinate (aux12) at (0,0);
\node[auxnode,label={-45:12}] () at (aux12) {}; %\node[auxnode,label={-45:12}] () at (aux12) {};
\coordinate (aux22) at ([yshift=\base]aux12); \coordinate (aux22) at ([yshift=\base]aux12);
\node[auxnode,label={-45:22}] () at (aux22) {}; %\node[auxnode,label={-45:22}] () at (aux22) {};
\coordinate (aux21) at ([xshift=-2\base]aux22); \coordinate (aux21) at ([xshift=-2\base]aux22);
\node[auxnode,label={-45:21}] () at (aux21) {}; %\node[auxnode,label={-45:21}] () at (aux21) {};
\coordinate (aux23) at ([xshift=\base]aux22); \coordinate (aux23) at ([xshift=\base]aux22);
\node[auxnode,label={-45:23}] () at (aux23) {}; %\node[auxnode,label={-45:23}] () at (aux23) {};
\coordinate (aux24) at ([xshift=\base]aux23); \coordinate (aux24) at ([xshift=\base]aux23);
\node[auxnode,label={-45:24}] () at (aux24) {}; %\node[auxnode,label={-45:24}] () at (aux24) {};
\coordinate (aux25) at ([xshift=\base]aux24); \coordinate (aux25) at ([xshift=\base]aux24);
\node[auxnode,label={-45:25}] () at (aux25) {}; %\node[auxnode,label={-45:25}] () at (aux25) {};
\coordinate (aux26) at ([xshift=\base]aux25); \coordinate (aux26) at ([xshift=\base]aux25);
\node[auxnode,label={-45:26}] () at (aux26) {}; %\node[auxnode,label={-45:26}] () at (aux26) {};
\coordinate (aux27) at ([xshift=\base]aux26); \coordinate (aux27) at ([xshift=\base]aux26);
\node[auxnode,label={-45:27}] () at (aux27) {}; %\node[auxnode,label={-45:27}] () at (aux27) {};
\coordinate (aux28) at ([xshift=\base]aux27); \coordinate (aux28) at ([xshift=\base]aux27);
\node[auxnode,label={-45:28}] () at (aux28) {}; %\node[auxnode,label={-45:28}] () at (aux28) {};
\coordinate (aux29) at ([xshift=2\base]aux28); \coordinate (aux29) at ([xshift=2\base]aux28);
\node[auxnode,label={-45:29}] () at (aux29) {}; %\node[auxnode,label={-45:29}] () at (aux29) {};
\coordinate (aux33) at ([yshift=\base]aux23); \coordinate (aux33) at ([yshift=\base]aux23);
\node[auxnode,label={-45:33}] () at (aux33) {}; %\node[auxnode,label={-45:33}] () at (aux33) {};
\coordinate (aux34) at ([yshift=\base]aux24); \coordinate (aux34) at ([yshift=\base]aux24);
\node[auxnode,label={-45:34}] () at (aux34) {}; %\node[auxnode,label={-45:34}] () at (aux34) {};
\coordinate (aux35) at ([yshift=\base]aux25); \coordinate (aux35) at ([yshift=\base]aux25);
\node[auxnode,label={-45:35}] () at (aux35) {}; %\node[auxnode,label={-45:35}] () at (aux35) {};
\coordinate (aux37) at ([yshift=\base]aux27); \coordinate (aux37) at ([yshift=\base]aux27);
\node[auxnode,label={-45:37}] () at (aux37) {}; %\node[auxnode,label={-45:37}] () at (aux37) {};
\coordinate (aux45) at ([yshift=\base]aux35); \coordinate (aux45) at ([yshift=\base]aux35);
\node[auxnode,label={-45:45}] () at (aux45) {}; %\node[auxnode,label={-45:45}] () at (aux45) {};
\coordinate (aux55) at ([yshift=\base]aux45); \coordinate (aux55) at ([yshift=\base]aux45);
\node[auxnode,label={-45:55}] () at (aux55) {}; %\node[auxnode,label={-45:55}] () at (aux55) {};
\ExtractX{$(aux21)$} \ExtractX{$(aux21)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux51) at (\XCoord,\YCoord); \coordinate (aux51) at (\XCoord,\YCoord);
\node[auxnode,label={-45:51}] () at (aux51) {}; %\node[auxnode,label={-45:51}] () at (aux51) {};
\ExtractX{$(aux23)$} \ExtractX{$(aux23)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux53) at (\XCoord,\YCoord); \coordinate (aux53) at (\XCoord,\YCoord);
\node[auxnode,label={-45:53}] () at (aux53) {}; %\node[auxnode,label={-45:53}] () at (aux53) {};
\ExtractX{$(aux28)$} \ExtractX{$(aux28)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux58) at (\XCoord,\YCoord); \coordinate (aux58) at (\XCoord,\YCoord);
\node[auxnode,label={-45:58}] () at (aux58) {}; %\node[auxnode,label={-45:58}] () at (aux58) {};
\ExtractX{$(aux29)$} \ExtractX{$(aux29)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux59) at (\XCoord,\YCoord); \coordinate (aux59) at (\XCoord,\YCoord);
\node[auxnode,label={-45:59}] () at (aux59) {}; %\node[auxnode,label={-45:59}] () at (aux59) {};
\coordinate (aux68) at ([yshift=\base]aux58); \coordinate (aux68) at ([yshift=\base]aux58);
\node[auxnode,label={-45:68}] () at (aux68) {}; %\node[auxnode,label={-45:68}] () at (aux68) {};
\end{scope} \end{scope}
\begin{scope} \begin{scope}
......
...@@ -15,60 +15,60 @@ ...@@ -15,60 +15,60 @@
% Skeleton % Skeleton
\begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}] \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
\coordinate (aux12) at (0,0); \coordinate (aux12) at (0,0);
\node[auxnode,label={-45:12}] () at (aux12) {}; %\node[auxnode,label={-45:12}] () at (aux12) {};
\coordinate (aux22) at ([yshift=\base]aux12); \coordinate (aux22) at ([yshift=\base]aux12);
\node[auxnode,label={-45:22}] () at (aux22) {}; %\node[auxnode,label={-45:22}] () at (aux22) {};
\coordinate (aux21) at ([xshift=-2\base]aux22); \coordinate (aux21) at ([xshift=-2\base]aux22);
\node[auxnode,label={-45:21}] () at (aux21) {}; %\node[auxnode,label={-45:21}] () at (aux21) {};
\coordinate (aux23) at ([xshift=\base]aux22); \coordinate (aux23) at ([xshift=\base]aux22);
\node[auxnode,label={-45:23}] () at (aux23) {}; %\node[auxnode,label={-45:23}] () at (aux23) {};
\coordinate (aux24) at ([xshift=\base]aux23); \coordinate (aux24) at ([xshift=\base]aux23);
\node[auxnode,label={-45:24}] () at (aux24) {}; %\node[auxnode,label={-45:24}] () at (aux24) {};
\coordinate (aux25) at ([xshift=\base]aux24); \coordinate (aux25) at ([xshift=\base]aux24);
\node[auxnode,label={-45:25}] () at (aux25) {}; %\node[auxnode,label={-45:25}] () at (aux25) {};
\coordinate (aux26) at ([xshift=\base]aux25); \coordinate (aux26) at ([xshift=\base]aux25);
\node[auxnode,label={-45:26}] () at (aux26) {}; %\node[auxnode,label={-45:26}] () at (aux26) {};
\coordinate (aux27) at ([xshift=\base]aux26); \coordinate (aux27) at ([xshift=\base]aux26);
\node[auxnode,label={-45:27}] () at (aux27) {}; %\node[auxnode,label={-45:27}] () at (aux27) {};
\coordinate (aux28) at ([xshift=\base]aux27); \coordinate (aux28) at ([xshift=\base]aux27);
\node[auxnode,label={-45:28}] () at (aux28) {}; %\node[auxnode,label={-45:28}] () at (aux28) {};
\coordinate (aux29) at ([xshift=2\base]aux28); \coordinate (aux29) at ([xshift=2\base]aux28);
\node[auxnode,label={-45:29}] () at (aux29) {}; %\node[auxnode,label={-45:29}] () at (aux29) {};
\coordinate (aux33) at ([yshift=\base]aux23); \coordinate (aux33) at ([yshift=\base]aux23);
\node[auxnode,label={-45:33}] () at (aux33) {}; %\node[auxnode,label={-45:33}] () at (aux33) {};
\coordinate (aux34) at ([yshift=\base]aux24); \coordinate (aux34) at ([yshift=\base]aux24);
\node[auxnode,label={-45:34}] () at (aux34) {}; %\node[auxnode,label={-45:34}] () at (aux34) {};
\coordinate (aux35) at ([yshift=\base]aux25); \coordinate (aux35) at ([yshift=\base]aux25);
\node[auxnode,label={-45:35}] () at (aux35) {}; %\node[auxnode,label={-45:35}] () at (aux35) {};
\coordinate (aux37) at ([yshift=\base]aux27); \coordinate (aux37) at ([yshift=\base]aux27);
\node[auxnode,label={-45:37}] () at (aux37) {}; %\node[auxnode,label={-45:37}] () at (aux37) {};
\coordinate (aux45) at ([yshift=\base]aux35); \coordinate (aux45) at ([yshift=\base]aux35);
\node[auxnode,label={-45:45}] () at (aux45) {}; %\node[auxnode,label={-45:45}] () at (aux45) {};
\coordinate (aux55) at ([yshift=\base]aux45); \coordinate (aux55) at ([yshift=\base]aux45);
\node[auxnode,label={-45:55}] () at (aux55) {}; %\node[auxnode,label={-45:55}] () at (aux55) {};
\ExtractX{$(aux21)$} \ExtractX{$(aux21)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux51) at (\XCoord,\YCoord); \coordinate (aux51) at (\XCoord,\YCoord);
\node[auxnode,label={-45:51}] () at (aux51) {}; %\node[auxnode,label={-45:51}] () at (aux51) {};
\ExtractX{$(aux23)$} \ExtractX{$(aux23)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux53) at (\XCoord,\YCoord); \coordinate (aux53) at (\XCoord,\YCoord);
\node[auxnode,label={-45:53}] () at (aux53) {}; %\node[auxnode,label={-45:53}] () at (aux53) {};
\ExtractX{$(aux28)$} \ExtractX{$(aux28)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux58) at (\XCoord,\YCoord); \coordinate (aux58) at (\XCoord,\YCoord);
\node[auxnode,label={-45:58}] () at (aux58) {}; %\node[auxnode,label={-45:58}] () at (aux58) {};
\ExtractX{$(aux29)$} \ExtractX{$(aux29)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux59) at (\XCoord,\YCoord); \coordinate (aux59) at (\XCoord,\YCoord);
\node[auxnode,label={-45:59}] () at (aux59) {}; %\node[auxnode,label={-45:59}] () at (aux59) {};
\coordinate (aux68) at ([yshift=\base]aux58); \coordinate (aux68) at ([yshift=\base]aux58);
\node[auxnode,label={-45:68}] () at (aux68) {}; %\node[auxnode,label={-45:68}] () at (aux68) {};
\end{scope} \end{scope}
\begin{scope} \begin{scope}
......
...@@ -16,60 +16,60 @@ ...@@ -16,60 +16,60 @@
% Skeleton % Skeleton
\begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}] \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
\coordinate (aux12) at (0,0); \coordinate (aux12) at (0,0);
\node[auxnode,label={-45:12}] () at (aux12) {}; %\node[auxnode,label={-45:12}] () at (aux12) {};
\coordinate (aux22) at ([yshift=\base]aux12); \coordinate (aux22) at ([yshift=\base]aux12);
\node[auxnode,label={-45:22}] () at (aux22) {}; %\node[auxnode,label={-45:22}] () at (aux22) {};
\coordinate (aux21) at ([xshift=-2\base]aux22); \coordinate (aux21) at ([xshift=-2\base]aux22);
\node[auxnode,label={-45:21}] () at (aux21) {}; %\node[auxnode,label={-45:21}] () at (aux21) {};
\coordinate (aux23) at ([xshift=\base]aux22); \coordinate (aux23) at ([xshift=\base]aux22);
\node[auxnode,label={-45:23}] () at (aux23) {}; %\node[auxnode,label={-45:23}] () at (aux23) {};
\coordinate (aux24) at ([xshift=\base]aux23); \coordinate (aux24) at ([xshift=\base]aux23);
\node[auxnode,label={-45:24}] () at (aux24) {}; %\node[auxnode,label={-45:24}] () at (aux24) {};
\coordinate (aux25) at ([xshift=\base]aux24); \coordinate (aux25) at ([xshift=\base]aux24);
\node[auxnode,label={-45:25}] () at (aux25) {}; %\node[auxnode,label={-45:25}] () at (aux25) {};
\coordinate (aux26) at ([xshift=\base]aux25); \coordinate (aux26) at ([xshift=\base]aux25);
\node[auxnode,label={-45:26}] () at (aux26) {}; %\node[auxnode,label={-45:26}] () at (aux26) {};
\coordinate (aux27) at ([xshift=\base]aux26); \coordinate (aux27) at ([xshift=\base]aux26);
\node[auxnode,label={-45:27}] () at (aux27) {}; %\node[auxnode,label={-45:27}] () at (aux27) {};
\coordinate (aux28) at ([xshift=\base]aux27); \coordinate (aux28) at ([xshift=\base]aux27);
\node[auxnode,label={-45:28}] () at (aux28) {}; %\node[auxnode,label={-45:28}] () at (aux28) {};
\coordinate (aux29) at ([xshift=2\base]aux28); \coordinate (aux29) at ([xshift=2\base]aux28);
\node[auxnode,label={-45:29}] () at (aux29) {}; %\node[auxnode,label={-45:29}] () at (aux29) {};
\coordinate (aux33) at ([yshift=\base]aux23); \coordinate (aux33) at ([yshift=\base]aux23);
\node[auxnode,label={-45:33}] () at (aux33) {}; %\node[auxnode,label={-45:33}] () at (aux33) {};
\coordinate (aux34) at ([yshift=\base]aux24); \coordinate (aux34) at ([yshift=\base]aux24);
\node[auxnode,label={-45:34}] () at (aux34) {}; %\node[auxnode,label={-45:34}] () at (aux34) {};
\coordinate (aux35) at ([yshift=\base]aux25); \coordinate (aux35) at ([yshift=\base]aux25);
\node[auxnode,label={-45:35}] () at (aux35) {}; %\node[auxnode,label={-45:35}] () at (aux35) {};
\coordinate (aux37) at ([yshift=\base]aux27); \coordinate (aux37) at ([yshift=\base]aux27);
\node[auxnode,label={-45:37}] () at (aux37) {}; %\node[auxnode,label={-45:37}] () at (aux37) {};
\coordinate (aux45) at ([yshift=\base]aux35); \coordinate (aux45) at ([yshift=\base]aux35);
\node[auxnode,label={-45:45}] () at (aux45) {}; %\node[auxnode,label={-45:45}] () at (aux45) {};
\coordinate (aux55) at ([yshift=\base]aux45); \coordinate (aux55) at ([yshift=\base]aux45);
\node[auxnode,label={-45:55}] () at (aux55) {}; %\node[auxnode,label={-45:55}] () at (aux55) {};
\ExtractX{$(aux21)$} \ExtractX{$(aux21)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux51) at (\XCoord,\YCoord); \coordinate (aux51) at (\XCoord,\YCoord);
\node[auxnode,label={-45:51}] () at (aux51) {}; %\node[auxnode,label={-45:51}] () at (aux51) {};
\ExtractX{$(aux23)$} \ExtractX{$(aux23)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux53) at (\XCoord,\YCoord); \coordinate (aux53) at (\XCoord,\YCoord);
\node[auxnode,label={-45:53}] () at (aux53) {}; %\node[auxnode,label={-45:53}] () at (aux53) {};
\ExtractX{$(aux28)$} \ExtractX{$(aux28)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux58) at (\XCoord,\YCoord); \coordinate (aux58) at (\XCoord,\YCoord);
\node[auxnode,label={-45:58}] () at (aux58) {}; %\node[auxnode,label={-45:58}] () at (aux58) {};
\ExtractX{$(aux29)$} \ExtractX{$(aux29)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux59) at (\XCoord,\YCoord); \coordinate (aux59) at (\XCoord,\YCoord);
\node[auxnode,label={-45:59}] () at (aux59) {}; %\node[auxnode,label={-45:59}] () at (aux59) {};
\coordinate (aux68) at ([yshift=\base]aux58); \coordinate (aux68) at ([yshift=\base]aux58);
\node[auxnode,label={-45:68}] () at (aux68) {}; %\node[auxnode,label={-45:68}] () at (aux68) {};
\end{scope} \end{scope}
\begin{scope} \begin{scope}
......
...@@ -43,32 +43,32 @@ ...@@ -43,32 +43,32 @@
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\scriptsize{$\langle$sos$\rangle$}}; \node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\scriptsize{$\langle$sos$\rangle$}};
{ {
\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\scriptsize{Have}}; \node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\scriptsize{}};
\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\scriptsize{Have}}; \node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\scriptsize{}};
\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\scriptsize{Have}}; \node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\scriptsize{Have}};
} }
{ {
\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\scriptsize{you}}; \node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\scriptsize{}};
\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\scriptsize{you}}; \node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\scriptsize{}};
\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\scriptsize{you}}; \node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\scriptsize{you}};
} }
{ {
\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\scriptsize{Have}}; \node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\scriptsize{}};
\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\scriptsize{Have}}; \node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\scriptsize{}};
\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\scriptsize{Have}}; \node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\scriptsize{Have}};
} }
{ {
\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\scriptsize{you}}; \node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\scriptsize{}};
\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\scriptsize{you}}; \node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\scriptsize{}};
\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\scriptsize{you}}; \node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\scriptsize{you}};
} }
{ {
\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\scriptsize{learned}}; \node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\scriptsize{}};
\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\scriptsize{learned}}; \node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\scriptsize{}};
\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\scriptsize{learned}}; \node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\scriptsize{learned}};
} }
...@@ -121,7 +121,7 @@ ...@@ -121,7 +121,7 @@
} }
{ {
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\scriptsize{$\textbf{C}_2$}}; \node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20,text=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\scriptsize{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\scriptsize{$\textbf{C}_2$}}; \node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\scriptsize{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\scriptsize{$\textbf{C}_2$}}; \node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\scriptsize{$\textbf{C}_2$}};
\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west); \draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
...@@ -129,8 +129,8 @@ ...@@ -129,8 +129,8 @@
} }
{ {
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\scriptsize{$\textbf{C}_3$}}; \node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20,text=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\scriptsize{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\scriptsize{$\textbf{C}_3$}}; \node [circle,draw,inner sep=2pt,fill=orange!20,text=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\scriptsize{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\scriptsize{$\textbf{C}_3$}}; \node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\scriptsize{$\textbf{C}_3$}};
\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west); \draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west); \draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
......
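上面两段结点代码绘制的 $\textbf{C}_2$、$\textbf{C}_3$ 即注意力机制输出的上下文向量。作为补充示意(记法为通用写法,并非特指书中某一实现):上下文向量是编码器各位置表示的加权和,
\begin{eqnarray}
\textbf{C}_j = \sum_{i} \alpha_{i,j} \textbf{h}_i \nonumber
\end{eqnarray}
其中 $\textbf{h}_i$ 为编码器第 $i$ 个位置的表示,$\alpha_{i,j}$ 为归一化后的注意力权重。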
...@@ -11,8 +11,10 @@ ...@@ -11,8 +11,10 @@
\begin{scope} \begin{scope}
\coordinate (batch0) at (0,0); \coordinate (batch0) at (0,0);
\foreach \i [count=\j from 0,evaluate=\i as \k using int(4-\i)] in {1,2,3} \foreach \i [count=\j from 0,evaluate=\i as \k using int(4-\i)] in {1,2}
\node [samplenode,anchor=south west,font=\scriptsize] (batch\i) at ([shift={(-1em,-0.5em)}]batch\j.south west) {句子\k}; \node [samplenode,anchor=south west,font=\scriptsize] (batch\i) at ([shift={(-1em,-0.5em)}]batch\j.south west) {\qquad \k};
\foreach \i [count=\j from 2,evaluate=\i as \k using int(4-\i)] in {3}
\node [samplenode,anchor=south west,font=\scriptsize] (batch\i) at ([shift={(-1em,-0.5em)}]batch\j.south west) {句子 \k};
\draw [decorate,decoration={brace}] (batch1.south east) to node [auto,rotate=30,anchor=north,font=\scriptsize] {batch大小} (batch3.south east); \draw [decorate,decoration={brace}] (batch1.south east) to node [auto,rotate=30,anchor=north,font=\scriptsize] {batch大小} (batch3.south east);
\node [samplenode,anchor=west,font=\scriptsize] (sample2) at ([xshift=4em]batch2.east) {句子2}; \node [samplenode,anchor=west,font=\scriptsize] (sample2) at ([xshift=4em]batch2.east) {句子2};
......
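上图中的结点表示若干句子组成一个批次(batch大小即每批的句子数)。下面给出一个按长度排序后再分批的最小示意(Python,函数名与数据均为举例,并非书中系统的实现):
\begin{verbatim}
# 按句子长度排序后切分批次的玩具示意:
# 长度相近的句子放在同一批,可减少填充(padding)带来的浪费
def make_batches(sentences, batch_size):
    ordered = sorted(sentences, key=len)
    return [ordered[i:i + batch_size]
            for i in range(0, len(ordered), batch_size)]

if __name__ == "__main__":
    data = [["你", "好"], ["谢谢"],
            ["今天", "天气", "不错"], ["我", "很", "好", "。"]]
    for batch in make_batches(data, batch_size=2):
        print(batch)
\end{verbatim}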
...@@ -3,23 +3,23 @@ ...@@ -3,23 +3,23 @@
\begin{tikzpicture} \begin{tikzpicture}
\begin{scope} \begin{scope}
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear0) at (0,0) {\footnotesize{Linear}}; \node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear0) at (0,0) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\footnotesize{Linear}}; \node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\footnotesize{Linear}}; \node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\footnotesize{Linear}};
\node [anchor=north] (Q) at ([xshift=0em,yshift=-1em]Linear02.south) {\footnotesize{$\mathbf{Q}$}}; \node [anchor=north] (Q) at ([xshift=0em,yshift=-1em]Linear02.south) {\footnotesize{$\mathbf{Q}$}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear1) at ([xshift=1.5em]Linear0.east) {\footnotesize{Linear}}; \node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear1) at ([xshift=1.5em]Linear0.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\footnotesize{Linear}}; \node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\footnotesize{Linear}}; \node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\footnotesize{Linear}};
\node [anchor=north] (K) at ([xshift=0em,yshift=-1em]Linear12.south) {\footnotesize{$\mathbf{K}$}}; \node [anchor=north] (K) at ([xshift=0em,yshift=-1em]Linear12.south) {\footnotesize{$\mathbf{K}$}};
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear2) at ([xshift=1.5em]Linear1.east) {\footnotesize{Linear}}; \node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white,text=ugreen!20!white] (Linear2) at ([xshift=1.5em]Linear1.east) {\footnotesize{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\footnotesize{Linear}}; \node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt,text=ugreen!20!white] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\footnotesize{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\footnotesize{Linear}}; \node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\footnotesize{Linear}};
\node [anchor=north] (V) at ([xshift=0em,yshift=-1em]Linear22.south) {\footnotesize{$\mathbf{V}$}}; \node [anchor=north] (V) at ([xshift=0em,yshift=-1em]Linear22.south) {\footnotesize{$\mathbf{V}$}};
\node [anchor=south,draw=black!30,minimum width=12em,minimum height=2em,inner sep=4pt,fill=blue!20!white] (Scale) at ([yshift=1em]Linear1.north) {\footnotesize{Scaled Dot-Product Attention}}; \node [anchor=south,draw=black!30,minimum width=12em,minimum height=2em,inner sep=4pt,fill=blue!20!white] (Scale) at ([yshift=1em]Linear1.north) {\footnotesize{}};
\node [anchor=south west,draw=black!50,minimum width=12em,minimum height=2em,fill=blue!20!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\footnotesize{Scaled Dot-Product Attention}}; \node [anchor=south west,draw=black!50,minimum width=12em,minimum height=2em,fill=blue!20!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\footnotesize{}};
\node [anchor=south west,fill=blue!20!white,draw,minimum width=12em,minimum height=2em,inner sep=4pt] (Scale2) at ([shift={(-0.2em,-0.2em)}]Scale1.south west) {\footnotesize{Scaled Dot-Product Attention}}; \node [anchor=south west,fill=blue!20!white,draw,minimum width=12em,minimum height=2em,inner sep=4pt] (Scale2) at ([shift={(-0.2em,-0.2em)}]Scale1.south west) {\footnotesize{Scaled Dot-Product Attention}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=yellow!30] (Concat) at ([yshift=1em]Scale2.north) {\footnotesize{Concat}}; \node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=yellow!30] (Concat) at ([yshift=1em]Scale2.north) {\footnotesize{Concat}};
......
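上面的代码绘制的是多头注意力的结构:$\mathbf{Q}$、$\mathbf{K}$、$\mathbf{V}$ 分别经过多组线性变换(Linear),送入缩放点积注意力(Scaled Dot-Product Attention),再做拼接(Concat)。下面是一个最小的数值示意(基于numpy,头数与维度均为举例,并非书中实现):
\begin{verbatim}
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    # 缩放点积 + softmax,按最后一维归一化
    scores = Q @ K.swapaxes(-1, -2) / np.sqrt(Q.shape[-1])
    w = np.exp(scores - scores.max(-1, keepdims=True))
    return (w / w.sum(-1, keepdims=True)) @ V

def multi_head_attention(Q, K, V, Wq, Wk, Wv, Wo, h):
    # 线性变换 -> 切成 h 个头 -> 各头注意力 -> 拼接 -> 输出变换
    def split(x):                       # [n,d] -> [h,n,d/h]
        n, d = x.shape
        return x.reshape(n, h, d // h).transpose(1, 0, 2)
    heads = scaled_dot_product_attention(
        split(Q @ Wq), split(K @ Wk), split(V @ Wv))
    concat = heads.transpose(1, 0, 2).reshape(Q.shape[0], -1)
    return concat @ Wo                  # Concat 后的线性变换

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x = rng.normal(size=(4, 8))         # 4 个位置,8 维表示
    Wq, Wk, Wv, Wo = (rng.normal(size=(8, 8)) for _ in range(4))
    print(multi_head_attention(x, x, x, Wq, Wk, Wv, Wo, h=2).shape)
\end{verbatim}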
...@@ -17,60 +17,60 @@ ...@@ -17,60 +17,60 @@
% Skeleton % Skeleton
\begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}] \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
\coordinate (aux12) at (0,0); \coordinate (aux12) at (0,0);
\node[auxnode,label={-45:12}] () at (aux12) {}; %\node[auxnode,label={-45:12}] () at (aux12) {};
\coordinate (aux22) at ([yshift=\base]aux12); \coordinate (aux22) at ([yshift=\base]aux12);
\node[auxnode,label={-45:22}] () at (aux22) {}; %\node[auxnode,label={-45:22}] () at (aux22) {};
\coordinate (aux21) at ([xshift=-2\base]aux22); \coordinate (aux21) at ([xshift=-2\base]aux22);
\node[auxnode,label={-45:21}] () at (aux21) {}; %\node[auxnode,label={-45:21}] () at (aux21) {};
\coordinate (aux23) at ([xshift=\base]aux22); \coordinate (aux23) at ([xshift=\base]aux22);
\node[auxnode,label={-45:23}] () at (aux23) {}; %\node[auxnode,label={-45:23}] () at (aux23) {};
\coordinate (aux24) at ([xshift=\base]aux23); \coordinate (aux24) at ([xshift=\base]aux23);
\node[auxnode,label={-45:24}] () at (aux24) {}; %\node[auxnode,label={-45:24}] () at (aux24) {};
\coordinate (aux25) at ([xshift=\base]aux24); \coordinate (aux25) at ([xshift=\base]aux24);
\node[auxnode,label={-45:25}] () at (aux25) {}; %\node[auxnode,label={-45:25}] () at (aux25) {};
\coordinate (aux26) at ([xshift=\base]aux25); \coordinate (aux26) at ([xshift=\base]aux25);
\node[auxnode,label={-45:26}] () at (aux26) {}; %\node[auxnode,label={-45:26}] () at (aux26) {};
\coordinate (aux27) at ([xshift=\base]aux26); \coordinate (aux27) at ([xshift=\base]aux26);
\node[auxnode,label={-45:27}] () at (aux27) {}; %\node[auxnode,label={-45:27}] () at (aux27) {};
\coordinate (aux28) at ([xshift=\base]aux27); \coordinate (aux28) at ([xshift=\base]aux27);
\node[auxnode,label={-45:28}] () at (aux28) {}; %\node[auxnode,label={-45:28}] () at (aux28) {};
\coordinate (aux29) at ([xshift=2\base]aux28); \coordinate (aux29) at ([xshift=2\base]aux28);
\node[auxnode,label={-45:29}] () at (aux29) {}; %\node[auxnode,label={-45:29}] () at (aux29) {};
\coordinate (aux33) at ([yshift=\base]aux23); \coordinate (aux33) at ([yshift=\base]aux23);
\node[auxnode,label={-45:33}] () at (aux33) {}; %\node[auxnode,label={-45:33}] () at (aux33) {};
\coordinate (aux34) at ([yshift=\base]aux24); \coordinate (aux34) at ([yshift=\base]aux24);
\node[auxnode,label={-45:34}] () at (aux34) {}; %\node[auxnode,label={-45:34}] () at (aux34) {};
\coordinate (aux35) at ([yshift=\base]aux25); \coordinate (aux35) at ([yshift=\base]aux25);
\node[auxnode,label={-45:35}] () at (aux35) {}; %\node[auxnode,label={-45:35}] () at (aux35) {};
\coordinate (aux37) at ([yshift=\base]aux27); \coordinate (aux37) at ([yshift=\base]aux27);
\node[auxnode,label={-45:37}] () at (aux37) {}; %\node[auxnode,label={-45:37}] () at (aux37) {};
\coordinate (aux45) at ([yshift=\base]aux35); \coordinate (aux45) at ([yshift=\base]aux35);
\node[auxnode,label={-45:45}] () at (aux45) {}; %\node[auxnode,label={-45:45}] () at (aux45) {};
\coordinate (aux55) at ([yshift=\base]aux45); \coordinate (aux55) at ([yshift=\base]aux45);
\node[auxnode,label={-45:55}] () at (aux55) {}; %\node[auxnode,label={-45:55}] () at (aux55) {};
\ExtractX{$(aux21)$} \ExtractX{$(aux21)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux51) at (\XCoord,\YCoord); \coordinate (aux51) at (\XCoord,\YCoord);
\node[auxnode,label={-45:51}] () at (aux51) {}; %\node[auxnode,label={-45:51}] () at (aux51) {};
\ExtractX{$(aux23)$} \ExtractX{$(aux23)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux53) at (\XCoord,\YCoord); \coordinate (aux53) at (\XCoord,\YCoord);
\node[auxnode,label={-45:53}] () at (aux53) {}; %\node[auxnode,label={-45:53}] () at (aux53) {};
\ExtractX{$(aux28)$} \ExtractX{$(aux28)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux58) at (\XCoord,\YCoord); \coordinate (aux58) at (\XCoord,\YCoord);
\node[auxnode,label={-45:58}] () at (aux58) {}; %\node[auxnode,label={-45:58}] () at (aux58) {};
\ExtractX{$(aux29)$} \ExtractX{$(aux29)$}
\ExtractY{$(aux55)$} \ExtractY{$(aux55)$}
\coordinate (aux59) at (\XCoord,\YCoord); \coordinate (aux59) at (\XCoord,\YCoord);
\node[auxnode,label={-45:59}] () at (aux59) {}; %\node[auxnode,label={-45:59}] () at (aux59) {};
\coordinate (aux68) at ([yshift=\base]aux58); \coordinate (aux68) at ([yshift=\base]aux58);
\node[auxnode,label={-45:68}] () at (aux68) {}; %\node[auxnode,label={-45:68}] () at (aux68) {};
\end{scope} \end{scope}
\begin{scope} \begin{scope}
......
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
...@@ -443,7 +451,7 @@ y = f(x) ...@@ -443,7 +451,7 @@ y = f(x)
\parinterval 正则化的一种实现是在训练目标中引入一个正则项。在神经机器翻译中,引入正则项的训练目标为: \parinterval 正则化的一种实现是在训练目标中引入一个正则项。在神经机器翻译中,引入正则项的训练目标为:
\begin{eqnarray} \begin{eqnarray}
\hat{\mathbf{w}}=\argmax_{\mathbf{w}}L(\mathbf{w}) + \lambda R(\mathbf{w}) \widehat{\mathbf{w}}=\argmax_{\mathbf{w}}L(\mathbf{w}) + \lambda R(\mathbf{w})
\label{eq:7-2} \label{eq:7-2}
\end{eqnarray} \end{eqnarray}
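其中 $L(\mathbf{w})$ 为原始的训练目标,$R(\mathbf{w})$ 为正则项,$\lambda$ 用于控制正则化的强度。举一个常见的例子(仅作示意):若采用L2正则化,则
\begin{eqnarray}
R(\mathbf{w}) = \| \mathbf{w} \|_2^2 = \sum_{k} w_k^2 \nonumber
\end{eqnarray}
即对权重的平方和进行约束;类似地,L1正则化取 $R(\mathbf{w})=\sum_{k} |w_k|$,它更倾向于产生稀疏的权重。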
...@@ -1854,11 +1862,13 @@ L_{\textrm{seq}} = - \textrm{logP}_{\textrm{s}}(\hat{\mathbf{y}} | \mathbf{x}) ...@@ -1854,11 +1862,13 @@ L_{\textrm{seq}} = - \textrm{logP}_{\textrm{s}}(\hat{\mathbf{y}} | \mathbf{x})
\vspace{0.5em} \vspace{0.5em}
\item 无指导机器翻译。无指导机器翻译不需要双语语料即可训练翻译模型,因此在稀缺资源机器翻译场景中有非常大的潜力,得到了广泛关注。目前无指导机器翻译主要有两种范式:第一种是先得到词典的翻译,然后得到短语表的翻译和相应的统计机器翻译系统,最后使用统计机器翻译系统生成伪双语平行语料来训练神经机器翻译系统\cite{DBLP:conf/acl/ArtetxeLA19};第二种是先预训练语言模型来初始化神经机器翻译系统的编码器和解码器,然后使用回译以及降噪自编码器来训练神经机器翻译系统\cite{lample2019cross}。尽管目前无指导机器翻译在富资源的语种上取得了很大进展,但是离实际应用还有很远的距离。比如,目前无指导系统都依赖于大量单语数据,而实际上稀缺资源的语种不但双语语料少,单语语料也少;此外,这些系统还无法在中英这类字母表重合少、需要大范围调序的远距离语种对上取得可接受的结果;使用大量单语数据训练无指导系统还面临数据来自不同领域的问题\cite{DBLP:journals/corr/abs-2004-05516}。设计更鲁棒、使用单语数据更高效的无指导机器翻译方法乃至新范式会是未来的趋势。 \item 无指导机器翻译。无指导机器翻译不需要双语语料即可训练翻译模型,因此在稀缺资源机器翻译场景中有非常大的潜力,得到了广泛关注。目前无指导机器翻译主要有两种范式:第一种是先得到词典的翻译,然后得到短语表的翻译和相应的统计机器翻译系统,最后使用统计机器翻译系统生成伪双语平行语料来训练神经机器翻译系统\cite{DBLP:conf/acl/ArtetxeLA19};第二种是先预训练语言模型来初始化神经机器翻译系统的编码器和解码器,然后使用回译以及降噪自编码器来训练神经机器翻译系统\cite{lample2019cross}。尽管目前无指导机器翻译在富资源的语种上取得了很大进展,但是离实际应用还有很远的距离。比如,目前无指导系统都依赖于大量单语数据,而实际上稀缺资源的语种不但双语语料少,单语语料也少;此外,这些系统还无法在中英这类字母表重合少、需要大范围调序的远距离语种对上取得可接受的结果;使用大量单语数据训练无指导系统还面临数据来自不同领域的问题\cite{DBLP:journals/corr/abs-2004-05516}。设计更鲁棒、使用单语数据更高效的无指导机器翻译方法乃至新范式会是未来的趋势。
\vspace{0.5em} \vspace{0.5em}
\item 更多上下文信息的建模。由于人类语言潜在的歧义性,传统的神经机器翻译在单句翻译中可能会出现歧义。为此,一些研究工作在翻译过程中尝试引入更多的上下文信息,比如多模态翻译、基于树的翻译或者篇章级翻译。多模态翻译的目标就是在给定一个图片和其源语描述的情况下,生成目标语言的描述。一般做法就是通过一个额外的编码器来提取图像特征\cite{DBLP:journals/corr/ElliottFH15,DBLP:conf/acl/HitschlerSR16},然后通过权重门控机制、注意力网络等融合到系统中\cite{DBLP:conf/wmt/HuangLSOD16} \item 图片翻译。由于人类语言潜在的歧义性,传统的神经机器翻译在单句翻译中可能会出现歧义。为此,一些研究工作在翻译过程中尝试引入更多的上下文信息,比如多模态翻译、基于树的翻译或者篇章级翻译。其中,图片翻译的目标是在给定一张图片及其源语言描述的情况下,生成目标语言的描述。一般做法是通过一个额外的编码器来提取图像特征\cite{DBLP:journals/corr/ElliottFH15,DBLP:conf/acl/HitschlerSR16},然后通过权重门控机制、注意力网络等融合到系统中\cite{DBLP:conf/wmt/HuangLSOD16}
\parinterval 基于树的翻译是指在翻译模型中引入句法结构树或依存树,从而引入更多的句法信息。一种常用的做法是将句法树进行序列化,从而保留序列到序列的模型结构\cite{DBLP:conf/emnlp/CurreyH18,DBLP:conf/acl/SaundersSGB18}。在此基础上,一些研究工作引入了更多的解析结果\cite{DBLP:conf/acl/SumitaUZTM18,DBLP:conf/coling/ZaremoodiH18}。同时,也有一些研究工作直接使用Tree-LSTMs等网络结构\cite{DBLP:conf/acl/TaiSM15,DBLP:conf/iclr/ShenTSC19}来直接表示树结构,并将其应用到神经机器翻译模型中\cite{DBLP:conf/acl/EriguchiHT16,Yang2017TowardsBH,DBLP:conf/acl/ChenHCC17} \vspace{0.5em}
\item 基于树的翻译。这类方法在翻译模型中引入句法结构树或依存树,从而利用更多的句法信息。一种常用的做法是将句法树进行序列化,从而保留序列到序列的模型结构\cite{DBLP:conf/emnlp/CurreyH18,DBLP:conf/acl/SaundersSGB18}。在此基础上,一些研究工作引入了更多的解析结果\cite{DBLP:conf/acl/SumitaUZTM18,DBLP:conf/coling/ZaremoodiH18}。同时,也有一些研究工作使用Tree-LSTMs等网络结构\cite{DBLP:conf/acl/TaiSM15,DBLP:conf/iclr/ShenTSC19}来直接表示树结构,并将其应用到神经机器翻译模型中\cite{DBLP:conf/acl/EriguchiHT16,Yang2017TowardsBH,DBLP:conf/acl/ChenHCC17}
\parinterval 篇章级翻译是为了引入篇章级上下文信息,来处理篇章翻译中译文不连贯,主谓不一致等歧义现象。为此,一些研究人员针对该问题进行了改进,主要可以分为两类方法:一种是将当前句子与上下文进行句子级的拼接,不改变模型的结构\cite{DBLP:conf/discomt/TiedemannS17},另外一种是采用额外的编码器来捕获篇章信息\cite{DBLP:journals/corr/JeanLFC17,DBLP:journals/corr/abs-1805-10163,DBLP:conf/emnlp/ZhangLSZXZL18}。编码器的结构除了传统的RNN、自注意力网络,还有利用层级注意力来编码之前的多句上文\cite{Werlen2018DocumentLevelNM,tan-etal-2019-hierarchical},使用可选择的稀疏注意力机制对整个文档进行篇章建模\cite{DBLP:conf/naacl/MarufMH19},使用记忆网络、缓存机制等对篇章中的关键词进行提取\cite{DBLP:conf/coling/KuangXLZ18,DBLP:journals/tacl/TuLSZ18}或者采用两阶段解码的方式\cite{DBLP:conf/aaai/XiongH0W19,DBLP:conf/acl/VoitaST19}。除了从建模角度引入上下文信息,也有一些工作使用篇章级修正模型\cite{DBLP:conf/emnlp/VoitaST19}或者语言模型\cite{DBLP:journals/corr/abs-1910-00553}对句子级翻译模型的译文进行修正,或者通过自学习在解码过程中保持翻译连贯性\cite{DBLP:journals/corr/abs-2003-05259} \vspace{0.5em}
\item 篇章级翻译。可以通过引入篇章级上下文信息来处理篇章翻译中译文不连贯、主谓不一致等问题。为此,一些研究人员进行了改进,主要可以分为两类方法:一种是将当前句子与上下文进行句子级的拼接,不改变模型的结构\cite{DBLP:conf/discomt/TiedemannS17};另外一种是采用额外的编码器来捕获篇章信息\cite{DBLP:journals/corr/JeanLFC17,DBLP:journals/corr/abs-1805-10163,DBLP:conf/emnlp/ZhangLSZXZL18}。编码器的结构除了传统的RNN、自注意力网络,还可以利用层级注意力来编码之前的多句上文\cite{Werlen2018DocumentLevelNM,tan-etal-2019-hierarchical},使用可选择的稀疏注意力机制对整个文档进行篇章建模\cite{DBLP:conf/naacl/MarufMH19},使用记忆网络、缓存机制等对篇章中的关键词进行提取\cite{DBLP:conf/coling/KuangXLZ18,DBLP:journals/tacl/TuLSZ18},或者采用两阶段解码的方式\cite{DBLP:conf/aaai/XiongH0W19,DBLP:conf/acl/VoitaST19}。除了从建模角度引入上下文信息,也有一些工作使用篇章级修正模型\cite{DBLP:conf/emnlp/VoitaST19}或者语言模型\cite{DBLP:journals/corr/abs-1910-00553}对句子级翻译模型的译文进行修正,或者通过自学习在解码过程中保持翻译连贯性\cite{DBLP:journals/corr/abs-2003-05259}
\vspace{0.5em} \vspace{0.5em}
\item 语音翻译。在日常生活中,语音翻译也有很大的需求。针对语音到文本翻译的特点,最简单的做法是使用自动语音识别(ASR)将语音转换成文本,然后送入文本翻译模型进行翻译\cite{DBLP:conf/icassp/Ney99,DBLP:conf/interspeech/MatusovKN05}。然而为了避免流水线中的错误传播和高延迟问题,现在通常采用端到端的建模方法\cite{DBLP:conf/naacl/DuongACBC16,DBLP:journals/corr/BerardPSB16}。同时,针对语音翻译数据稀缺的问题,一些研究工作采用多种方法来缓解,包括预训练\cite{DBLP:conf/naacl/BansalKLLG19}、多任务学习\cite{Weiss2017SequencetoSequenceMC,DBLP:conf/icassp/BerardBKP18}、课程学习\cite{DBLP:conf/interspeech/KanoS017}、注意力传递\cite{DBLP:journals/tacl/SperberNNW19}和知识精炼\cite{DBLP:conf/interspeech/LiuXZHWWZ19,DBLP:conf/icassp/JiaJMWCCALW19} \item 语音翻译。在日常生活中,语音翻译也有很大的需求。针对语音到文本翻译的特点,最简单的做法是使用自动语音识别(ASR)将语音转换成文本,然后送入文本翻译模型进行翻译\cite{DBLP:conf/icassp/Ney99,DBLP:conf/interspeech/MatusovKN05}。然而为了避免流水线中的错误传播和高延迟问题,现在通常采用端到端的建模方法\cite{DBLP:conf/naacl/DuongACBC16,DBLP:journals/corr/BerardPSB16}。同时,针对语音翻译数据稀缺的问题,一些研究工作采用多种方法来缓解,包括预训练\cite{DBLP:conf/naacl/BansalKLLG19}、多任务学习\cite{Weiss2017SequencetoSequenceMC,DBLP:conf/icassp/BerardBKP18}、课程学习\cite{DBLP:conf/interspeech/KanoS017}、注意力传递\cite{DBLP:journals/tacl/SperberNNW19}和知识精炼\cite{DBLP:conf/interspeech/LiuXZHWWZ19,DBLP:conf/icassp/JiaJMWCCALW19}
\vspace{0.5em} \vspace{0.5em}
......
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{node} = [minimum height=1.0*1.2em,draw=teal,fill=teal!10] \tikzstyle{node} = [minimum height=1.0*1.2em,draw,fill=green!20]
\tikzstyle{legend} = [minimum height=1.0*1.2em,minimum width=1.0*1.2em,draw] \tikzstyle{legend} = [minimum height=1.0*1.2em,minimum width=1.0*1.2em,draw]
\tikzstyle{node2} = [minimum width=1.0*1.2em,minimum height=4.1*1.2em,draw=blue,fill=blue!10] \tikzstyle{node2} = [minimum width=1.0*1.2em,minimum height=4.1*1.2em,draw,fill=blue!20]
\node[node,minimum width=2.8*1.2em] (node1) at (0,0) {}; \node[node,minimum width=2.8*1.2em] (node1) at (0,0) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node2) at (node1.south west) {}; \node[node,minimum width=4.0*1.2em,anchor=north west] (node2) at (node1.south west) {};
\node[node,minimum width=3.2*1.2em,anchor=north west] (node3) at (node2.south west) {}; \node[node,minimum width=3.2*1.2em,anchor=north west] (node3) at (node2.south west) {};
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
\node[node,minimum width=2.8*1.2em,anchor=north west] (node6) at (node5.south west) {}; \node[node,minimum width=2.8*1.2em,anchor=north west] (node6) at (node5.south west) {};
\node[node,minimum width=3.2*1.2em,anchor=north west] (node7) at (node6.south west) {}; \node[node,minimum width=3.2*1.2em,anchor=north west] (node7) at (node6.south west) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node8) at (node7.south west) {}; \node[node,minimum width=4.0*1.2em,anchor=north west] (node8) at (node7.south west) {};
\node[font=\footnotesize,anchor=east] (line1) at (node1.west) {gpu1}; \node[font=\footnotesize,anchor=east] (line1) at (node1.west) {GPU1};
\node[font=\footnotesize,anchor=east] (line2) at (node2.west) {gpu2}; \node[font=\footnotesize,anchor=east] (line2) at (node2.west) {GPU2};
\node[font=\footnotesize,anchor=east] (line3) at (node3.west) {gpu3}; \node[font=\footnotesize,anchor=east] (line3) at (node3.west) {GPU3};
\node[font=\footnotesize,anchor=east] (line4) at (node4.west) {gpu4}; \node[font=\footnotesize,anchor=east] (line4) at (node4.west) {GPU4};
\node[node2,anchor = north west] (grad2) at ([xshift=0.3em]node5.north east) {}; \node[node2,anchor = north west] (grad2) at ([xshift=0.3em]node5.north east) {};
\draw[->] (-1.4em*1.2,-3.62*1.2em) -- (9em*1.2,-3.62*1.2em); \draw[->,thick] (-1.4em*1.2,-3.62*1.2em) -- (9em*1.2,-3.62*1.2em);
\node[node,minimum width=2.8*1.2em] (node9) at (16em,0) {}; \node[node,minimum width=2.8*1.2em] (node9) at (16em,0) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node10) at (node9.south west) {}; \node[node,minimum width=4.0*1.2em,anchor=north west] (node10) at (node9.south west) {};
...@@ -29,11 +29,11 @@ ...@@ -29,11 +29,11 @@
\node[node,minimum width=3.2*1.2em,anchor=north west] (node15) at (node11.north east) {}; \node[node,minimum width=3.2*1.2em,anchor=north west] (node15) at (node11.north east) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node16) at (node12.north east) {}; \node[node,minimum width=4.0*1.2em,anchor=north west] (node16) at (node12.north east) {};
\node[node2,anchor = north west] (grad3) at ([xshift=0.5em]node13.north east) {}; \node[node2,anchor = north west] (grad3) at ([xshift=0.5em]node13.north east) {};
\node[font=\footnotesize,anchor=east] (line1) at (node9.west) {gpu1}; \node[font=\footnotesize,anchor=east] (line1) at (node9.west) {GPU1};
\node[font=\footnotesize,anchor=east] (line2) at (node10.west) {gpu2}; \node[font=\footnotesize,anchor=east] (line2) at (node10.west) {GPU2};
\node[font=\footnotesize,anchor=east] (line3) at (node11.west) {gpu3}; \node[font=\footnotesize,anchor=east] (line3) at (node11.west) {GPU3};
\node[font=\footnotesize,anchor=east] (line4) at (node12.west) {gpu4}; \node[font=\footnotesize,anchor=east] (line4) at (node12.west) {GPU4};
\draw[->] (13.6*1.2em,-3.62*1.2em) -- (20.5*1.2em,-3.62*1.2em); \draw[->,thick] (node12.south west) -- ([xshift=3em]node16.south east);
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (node1) (node2) (node3) (node4)] (box1) {}; \node [rectangle,inner sep=-0.0em,draw] [fit = (node1) (node2) (node3) (node4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (node5) (node6) (node7) (node8)] (box2) {}; \node [rectangle,inner sep=-0.0em,draw] [fit = (node5) (node6) (node7) (node8)] (box2) {};
...@@ -46,9 +46,9 @@ ...@@ -46,9 +46,9 @@
\node[legend] (legend3) at (2em,2em) {}; \node[legend] (legend3) at (2em,2em) {};
\node[font=\footnotesize,anchor=west] (idle) at (legend3.east) {:空闲}; \node[font=\footnotesize,anchor=west] (idle) at (legend3.east) {:空闲};
\node[legend,anchor=west,draw=teal,fill=teal!10] (legend4) at ([xshift = 2em]idle.east) {}; \node[legend,anchor=west,draw,fill=green!20] (legend4) at ([xshift = 2em]idle.east) {};
\node[font=\footnotesize,anchor=west] (FB) at (legend4.east) {:前向/反向}; \node[font=\footnotesize,anchor=west] (FB) at (legend4.east) {:前向/反向};
\node[legend,anchor=west,draw=blue,fill=blue!10] (legend5) at ([xshift = 2em]FB.east) {}; \node[legend,anchor=west,draw,fill=blue!20] (legend5) at ([xshift = 2em]FB.east) {};
\node[font=\footnotesize,anchor=west] (grad_sync) at (legend5.east) {:梯度更新}; \node[font=\footnotesize,anchor=west] (grad_sync) at (legend5.east) {:梯度更新};
\end{tikzpicture} \end{tikzpicture}
\ No newline at end of file
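上图对比的是多GPU训练中各设备的时序:绿色为前向/反向计算,蓝色为梯度更新,空白为空闲等待。下面是同步数据并行一次参数更新的最小示意(纯Python玩具实现,损失与梯度均为假设的标量例子,仅用于说明流程):
\begin{verbatim}
# 同步数据并行:各"GPU"对各自的批次求梯度,
# 汇总平均后统一更新参数(对应图中的"梯度更新"阶段)
def toy_grad(w, batch):
    # 假设损失为 0.5*(w-x)^2,其梯度为 (w-x)
    return sum(w - x for x in batch) / len(batch)

def sync_step(w, shards, lr=0.1):
    grads = [toy_grad(w, b) for b in shards]   # 各设备前向/反向
    g = sum(grads) / len(grads)                # 梯度同步(求平均)
    return w - lr * g                          # 统一更新

if __name__ == "__main__":
    w, shards = 0.0, [[1.0, 2.0], [3.0], [2.0, 2.0], [1.5]]
    for _ in range(10):
        w = sync_step(w, shards)
    print(round(w, 3))
\end{verbatim}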
\begin{tikzpicture} \begin{tikzpicture}
\begin{scope}\small \begin{scope}\small
...@@ -12,7 +11,7 @@ ...@@ -12,7 +11,7 @@
\draw [-,very thick,draw=ublue] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:7em) and +(south:0em) .. ([xshift=17em,yshift=9em]n1.north); \draw [-,very thick,draw=ublue] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:7em) and +(south:0em) .. ([xshift=17em,yshift=9em]n1.north);
{\footnotesize {\footnotesize
\node [anchor=south] (n4) at ([xshift=7em,yshift=5em]n1.north) {性能快速爬升阶段}; \node [anchor=south] (n4) at ([xshift=9.5em,yshift=5em]n1.north) {性能快速爬升阶段(红色)};
\node [anchor=west] (n5) at ([xshift=0em,yshift=-2em]n4.west) {数据的作用会非常明显}; \node [anchor=west] (n5) at ([xshift=0em,yshift=-2em]n4.west) {数据的作用会非常明显};
\draw [-,very thick,draw=red] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:5.9em) and +(south:0em) .. ([xshift=10em,yshift=9.6em]n1.north); \draw [-,very thick,draw=red] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:5.9em) and +(south:0em) .. ([xshift=10em,yshift=9.6em]n1.north);
......
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{node} = [minimum height=1.0*1.2em,draw=teal,fill=teal!10] \tikzstyle{node} = [minimum height=1.0*1.2em,draw,fill=green!20]
\node[node,minimum width=2.0*1.2em] (sent1) at (0,0) {}; \node[node,minimum width=2.0*1.2em] (sent1) at (0,0) {};
\node[node,minimum width=5.0*1.2em,anchor=north west] (sent2) at (sent1.south west) {}; \node[node,minimum width=5.0*1.2em,anchor=north west] (sent2) at (sent1.south west) {};
\node[node,minimum width=1.0*1.2em,anchor=north west] (sent3) at (sent2.south west) {}; \node[node,minimum width=1.0*1.2em,anchor=north west] (sent3) at (sent2.south west) {};
...@@ -11,15 +11,15 @@ ...@@ -11,15 +11,15 @@
\node[node,minimum width=4.5*1.2em,anchor=north west] (sent7) at (sent6.south west) {}; \node[node,minimum width=4.5*1.2em,anchor=north west] (sent7) at (sent6.south west) {};
\node[node,minimum width=5*1.2em,anchor=north west] (sent8) at (sent7.south west) {}; \node[node,minimum width=5*1.2em,anchor=north west] (sent8) at (sent7.south west) {};
\node[font=\footnotesize,anchor=east] (line1) at (sent1.west) {sent1}; \node[font=\footnotesize,anchor=east] (line1) at (sent1.west) {句子1};
\node[font=\footnotesize,anchor=east] (line2) at (sent2.west) {sent2}; \node[font=\footnotesize,anchor=east] (line2) at (sent2.west) {句子2};
\node[font=\footnotesize,anchor=east] (line3) at (sent3.west) {sent3}; \node[font=\footnotesize,anchor=east] (line3) at (sent3.west) {句子3};
\node[font=\footnotesize,anchor=east] (line4) at (sent4.west) {sent4}; \node[font=\footnotesize,anchor=east] (line4) at (sent4.west) {句子4};
\node[font=\footnotesize,anchor=east] (line5) at (sent5.west) {sent1}; \node[font=\footnotesize,anchor=east] (line5) at (sent5.west) {句子1};
\node[font=\footnotesize,anchor=east] (line6) at (sent6.west) {sent2}; \node[font=\footnotesize,anchor=east] (line6) at (sent6.west) {句子2};
\node[font=\footnotesize,anchor=east] (line7) at (sent7.west) {sent3}; \node[font=\footnotesize,anchor=east] (line7) at (sent7.west) {句子3};
\node[font=\footnotesize,anchor=east] (line8) at (sent8.west) {sent4}; \node[font=\footnotesize,anchor=east] (line8) at (sent8.west) {句子4};
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent1) (sent2) (sent3) (sent4)] (box1) {}; \node [rectangle,inner sep=-0.0em,draw] [fit = (sent1) (sent2) (sent3) (sent4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent5) (sent6) (sent7) (sent8)] (box2) {}; \node [rectangle,inner sep=-0.0em,draw] [fit = (sent5) (sent6) (sent7) (sent8)] (box2) {};
......
...@@ -3,16 +3,16 @@ ...@@ -3,16 +3,16 @@
\node[] (do) at (0,0) {{\red do}}; \node[] (do) at (0,0) {{\red do}};
\node[anchor = west] (does) at ([xshift = 1em]do.east) {{\red do}es}; \node[anchor = west] (does) at ([xshift = 1em]do.east) {{\red do}es};
\node[anchor = west] (doing) at ([xshift = 0.7em]does.east) {{\red do}ing}; \node[anchor = west] (doing) at ([xshift = 0.7em]does.east) {{\red do}ing};
\node[anchor = north] (do_root) at ([yshift = -1em]does.south) {do}; \node[anchor = north] (do_root) at ([yshift = -1.5em]does.south) {do};
\node[anchor = west] (new) at ([xshift = 2em]doing.east) {{\red new}}; \node[anchor = west] (new) at ([xshift = 2em]doing.east) {{\red new}};
\node[anchor = west] (newer) at ([xshift = 1em]new.east) {{\red new}er}; \node[anchor = west] (newer) at ([xshift = 1em]new.east) {{\red new}er};
\node[anchor = west] (newest) at ([xshift = 0.7em]newer.east) {{\red new}est}; \node[anchor = west] (newest) at ([xshift = 0.7em]newer.east) {{\red new}est};
\node[anchor = north] (new_root) at ([yshift = -1em]newer.south) {new}; \node[anchor = north] (new_root) at ([yshift = -1.5em]newer.south) {new};
\draw [->] (do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(do.south); \draw [->] ([yshift=0.2em]do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(do.south);
\draw [->] (do_root.north) -- (does.south); \draw [->] (do_root.north) -- (does.south);
\draw [->] (do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(doing.south); \draw [->] ([yshift=0.2em]do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(doing.south);
\draw [->] (new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(new.south); \draw [->] ([yshift=0.2em]new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(new.south);
\draw [->] (new_root.north) -- (newer.south); \draw [->] (new_root.north) -- (newer.south);
\draw [->] (new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(newest.south); \draw [->] ([yshift=0.2em]new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(newest.south);
\end{tikzpicture} \end{tikzpicture}
\ No newline at end of file
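上图展示了 do、new 两个词根与其屈折变化形式(does、doing、newer、newest)之间的共享结构,这也是子词切分方法的直观动机。一个按已知词根切分单词的极简示意(Python,词根集合与函数名均为举例):
\begin{verbatim}
# 按给定词根切分单词的玩具示意,
# 对应图中 do -> {do, does, doing} 的共享结构
def split_by_root(word, roots):
    for root in sorted(roots, key=len, reverse=True):
        if word.startswith(root):
            suffix = word[len(root):]
            return [root] + ([suffix] if suffix else [])
    return [word]

if __name__ == "__main__":
    for w in ["do", "does", "doing", "new", "newer", "newest"]:
        print(w, "->", split_by_root(w, {"do", "new"}))
\end{verbatim}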
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
...@@ -208,7 +216,7 @@ S = N(b^{\infty}(V(\mathbf{s}|\mathbf{t};2))) \cup (\mathop{\cup}\limits_{ij} N( ...@@ -208,7 +216,7 @@ S = N(b^{\infty}(V(\mathbf{s}|\mathbf{t};2))) \cup (\mathop{\cup}\limits_{ij} N(
\parinterval 为了理解这个公式,先介绍几个概念。 \parinterval 为了理解这个公式,先介绍几个概念。
\begin{itemize} \begin{itemize}
\item $V(\mathbf{s}|\mathbf{t})$表示Viterbi词对齐,$V(\mathbf{s}|\mathbf{t},1)$$V(\mathbf{s}|\mathbf{t},2)$$V(\mathbf{s}|\mathbf{t},3)$就分别对应了模型1、2 和3 的Viterbi 词对齐; \item $V(\mathbf{s}|\mathbf{t})$表示Viterbi词对齐,$V(\mathbf{s}|\mathbf{t},1)$$V(\mathbf{s}|\mathbf{t},2)$$V(\mathbf{s}|\mathbf{t},3)$就分别对应了模型1、2 和3 的Viterbi 词对齐;
\item 把那些满足第$j$个源语言单词对应第$i$个目标语言单词($a_j=i$)的词对齐构成的集合记为$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$。通常称这些对齐中$j$$i$被``钉''在了一起。在$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$中使$\textrm{P}(\mathbf{a}|\mathbf{s},\mathbf{t})$达到最大的那个词对齐被记为$V_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$ \item 把那些满足第$j$个源语言单词对应第$i$个目标语言单词($a_j=i$)的词对齐构成的集合记为$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$。通常称这些对齐中$j$$i$被``钉''在了一起。在$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$中使$\textrm{P}(\mathbf{a}|\mathbf{s},\mathbf{t})$达到最大的那个词对齐被记为$V_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$
\item 如果两个词对齐,通过交换两个词对齐连接就能互相转化,则称它们为邻居。一个词对齐$\mathbf{a}$的所有邻居记为$N(\mathbf{a})$ \item 如果两个词对齐,通过交换两个词对齐连接就能互相转化,则称它们为邻居。一个词对齐$\mathbf{a}$的所有邻居记为$N(\mathbf{a})$
\end{itemize} \end{itemize}
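也就是说,Viterbi词对齐是使对齐概率最大的那个词对齐(示意,记法沿用上文):
\begin{eqnarray}
V(\mathbf{s}|\mathbf{t}) = \argmax_{\mathbf{a}} \textrm{P}(\mathbf{a}|\mathbf{s},\mathbf{t}) \nonumber
\end{eqnarray}
相应地,$V_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$ 可以看作是把搜索范围限制在 $\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$ 内的同样操作。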
......
% !Mode:: "TeX:UTF-8" % !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
\renewcommand\figurename{} \renewcommand\figurename{}
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
......
\indexentry{组合性翻译|hyperpage}{10} \indexentry{源语言|hyperpage}{17}
\indexentry{Compositional Translation|hyperpage}{10} \indexentry{Source Language|hyperpage}{17}
\indexentry{短语|hyperpage}{10} \indexentry{目标语言|hyperpage}{17}
\indexentry{短语切分|hyperpage}{15} \indexentry{Target Language|hyperpage}{17}
\indexentry{Phrasal Segmentation|hyperpage}{15} \indexentry{机器翻译|hyperpage}{18}
\indexentry{短语对|hyperpage}{15} \indexentry{Machine Translation|hyperpage}{18}
\indexentry{推导|hyperpage}{15} \indexentry{数据驱动|hyperpage}{23}
\indexentry{Derivation|hyperpage}{15} \indexentry{Data-Driven|hyperpage}{23}
\indexentry{生成式模型|hyperpage}{18} \indexentry{编码器-解码器|hyperpage}{30}
\indexentry{Generative Model|hyperpage}{18} \indexentry{encoder-decoder|hyperpage}{30}
\indexentry{判别式模型|hyperpage}{18} \indexentry{质量评价|hyperpage}{32}
\indexentry{Discriminative Model|hyperpage}{18} \indexentry{Quality Evaluation|hyperpage}{32}
\indexentry{对数线性模型|hyperpage}{19} \indexentry{无参考答案的评价|hyperpage}{32}
\indexentry{Log-linear Model|hyperpage}{19} \indexentry{Quality Estimation|hyperpage}{32}
\indexentry{短语抽取|hyperpage}{20} \indexentry{$n$元语法单元|hyperpage}{33}
\indexentry{Phrase Extraction|hyperpage}{20} \indexentry{$n$-gram准确率|hyperpage}{34}
\indexentry{词汇化翻译概率|hyperpage}{23} \indexentry{$n$-gram Precision|hyperpage}{34}
\indexentry{Lexical Translation Probability|hyperpage}{23} \indexentry{短句惩罚因子|hyperpage}{34}
\indexentry{短语表|hyperpage}{23} \indexentry{Brevity Penalty|hyperpage}{34}
\indexentry{Phrase Table|hyperpage}{23} \indexentry{分词|hyperpage}{50}
\indexentry{调序|hyperpage}{24} \indexentry{Segmentation|hyperpage}{50}
\indexentry{Reordering|hyperpage}{24} \indexentry{句法分析|hyperpage}{51}
\indexentry{模型训练|hyperpage}{28} \indexentry{Parsing|hyperpage}{51}
\indexentry{Model Training|hyperpage}{28} \indexentry{预处理|hyperpage}{51}
\indexentry{权重调优|hyperpage}{28} \indexentry{Pre-processing|hyperpage}{51}
\indexentry{Weight Tuning|hyperpage}{28} \indexentry{后处理|hyperpage}{51}
\indexentry{最小错误率训练|hyperpage}{28} \indexentry{Post-processing|hyperpage}{51}
\indexentry{Minimum Error Rate Training|hyperpage}{28} \indexentry{事件|hyperpage}{52}
\indexentry{调优集合|hyperpage}{28} \indexentry{Event|hyperpage}{52}
\indexentry{Tuning Set|hyperpage}{28} \indexentry{随机事件|hyperpage}{52}
\indexentry{线搜索|hyperpage}{29} \indexentry{随机变量|hyperpage}{52}
\indexentry{Line Search|hyperpage}{29} \indexentry{Random Variable|hyperpage}{52}
\indexentry{格搜索|hyperpage}{30} \indexentry{概率|hyperpage}{52}
\indexentry{Grid Search|hyperpage}{30} \indexentry{Probability|hyperpage}{52}
\indexentry{覆盖度模型|hyperpage}{32} \indexentry{估计|hyperpage}{52}
\indexentry{Coverage Model|hyperpage}{32} \indexentry{估计值|hyperpage}{52}
\indexentry{翻译候选|hyperpage}{32} \indexentry{Estimate|hyperpage}{52}
\indexentry{Translation Candidate|hyperpage}{32} \indexentry{概率分布函数|hyperpage}{53}
\indexentry{翻译假设|hyperpage}{33} \indexentry{概率密度函数|hyperpage}{53}
\indexentry{Translation Hypothesis|hyperpage}{33} \indexentry{联合概率|hyperpage}{53}
\indexentry{剪枝|hyperpage}{34} \indexentry{Joint Probability|hyperpage}{53}
\indexentry{Pruning|hyperpage}{34} \indexentry{条件概率|hyperpage}{53}
\indexentry{束剪枝|hyperpage}{34} \indexentry{Conditional Probability|hyperpage}{53}
\indexentry{Beam Pruning|hyperpage}{34} \indexentry{边缘概率|hyperpage}{54}
\indexentry{直方图剪枝|hyperpage}{34} \indexentry{Marginal Probability|hyperpage}{54}
\indexentry{Histogram Pruning|hyperpage}{34} \indexentry{全概率公式|hyperpage}{55}
\indexentry{阈值剪枝|hyperpage}{34} \indexentry{Law of Total Probability|hyperpage}{55}
\indexentry{Threshold Pruning|hyperpage}{34} \indexentry{贝叶斯法则|hyperpage}{56}
\indexentry{假设重组|hyperpage}{34} \indexentry{Bayes' Rule|hyperpage}{56}
\indexentry{Hypothesis Recombination|hyperpage}{34} \indexentry{熵|hyperpage}{57}
\indexentry{基于层次短语的模型|hyperpage}{38} \indexentry{Entropy|hyperpage}{57}
\indexentry{Hierarchical Phrase-based Model|hyperpage}{38} \indexentry{自信息|hyperpage}{57}
\indexentry{同步上下文无关文法|hyperpage}{39} \indexentry{Self-information|hyperpage}{57}
\indexentry{Synchronous Context-free Grammar|hyperpage}{39} \indexentry{相对熵|hyperpage}{58}
\indexentry{基于层次短语的文法|hyperpage}{40} \indexentry{Relative Entropy|hyperpage}{58}
\indexentry{Hierarchical Phrase-based Grammar|hyperpage}{40} \indexentry{交叉熵|hyperpage}{58}
\indexentry{推导|hyperpage}{41} \indexentry{Cross-entropy|hyperpage}{58}
\indexentry{Derivation|hyperpage}{41} \indexentry{分词|hyperpage}{59}
\indexentry{胶水规则|hyperpage}{41} \indexentry{Segmentation|hyperpage}{59}
\indexentry{Glue Rule|hyperpage}{41} \indexentry{单词|hyperpage}{59}
\indexentry{乔姆斯基范式|hyperpage}{45} \indexentry{Word|hyperpage}{59}
\indexentry{Chomsky Normal Form|hyperpage}{45} \indexentry{词|hyperpage}{59}
\indexentry{跨度|hyperpage}{45} \indexentry{词法分析|hyperpage}{59}
\indexentry{Span|hyperpage}{45} \indexentry{Lexical Analysis|hyperpage}{59}
\indexentry{自下而上的分析|hyperpage}{46} \indexentry{标注数据|hyperpage}{61}
\indexentry{Top-down Parsing|hyperpage}{46} \indexentry{Annotated Data|hyperpage}{61}
\indexentry{束剪枝|hyperpage}{48} \indexentry{训练|hyperpage}{61}
\indexentry{Beam Pruning|hyperpage}{48} \indexentry{Training|hyperpage}{61}
\indexentry{立方剪枝|hyperpage}{50} \indexentry{推断|hyperpage}{61}
\indexentry{Cube Pruning|hyperpage}{50} \indexentry{Inference|hyperpage}{61}
\indexentry{序列化|hyperpage}{53} \indexentry{参数估计|hyperpage}{63}
\indexentry{线性化|hyperpage}{53} \indexentry{Parameter Estimation|hyperpage}{63}
\indexentry{Linearization|hyperpage}{53} \indexentry{偏置|hyperpage}{63}
\indexentry{树到串翻译规则|hyperpage}{55} \indexentry{Bias|hyperpage}{63}
\indexentry{Tree-to-String Translation Rule|hyperpage}{55} \indexentry{语言模型|hyperpage}{67}
\indexentry{树到树翻译规则|hyperpage}{55} \indexentry{Language Model|hyperpage}{67}
\indexentry{Tree-to-Tree Translation Rule|hyperpage}{55} \indexentry{语言建模|hyperpage}{67}
\indexentry{树片段|hyperpage}{56} \indexentry{Language Modeling|hyperpage}{67}
\indexentry{Tree Fragment|hyperpage}{56} \indexentry{极大似然估计|hyperpage}{68}
\indexentry{同步树替换文法规则|hyperpage}{57} \indexentry{人工神经网络方法|hyperpage}{68}
\indexentry{Synchronous Tree Substitution Grammar Rule|hyperpage}{57} \indexentry{未登录词|hyperpage}{69}
\indexentry{边缘集合|hyperpage}{63} \indexentry{Out-of-Vocabulary Word, OOV Word|hyperpage}{69}
\indexentry{Frontier Set|hyperpage}{63} \indexentry{加法平滑|hyperpage}{70}
\indexentry{最小规则|hyperpage}{64} \indexentry{Additive Smoothing|hyperpage}{70}
\indexentry{Minimal Rules|hyperpage}{64} \indexentry{古德-图灵估计法|hyperpage}{71}
\indexentry{二叉化|hyperpage}{67} \indexentry{Good-Turing Estimate|hyperpage}{71}
\indexentry{Binarization|hyperpage}{67} \indexentry{句法|hyperpage}{74}
\indexentry{基于短语的特征|hyperpage}{72} \indexentry{Syntax|hyperpage}{74}
\indexentry{基于句法的特征|hyperpage}{72} \indexentry{短语结构分析|hyperpage}{74}
\indexentry{有向超图|hyperpage}{73} \indexentry{Phrase Structure Parsing|hyperpage}{74}
\indexentry{Directed Hyper-graph|hyperpage}{73} \indexentry{依存分析|hyperpage}{74}
\indexentry{超边|hyperpage}{73} \indexentry{Dependency Parsing|hyperpage}{74}
\indexentry{Hyper-edge|hyperpage}{73} \indexentry{成分分析|hyperpage}{75}
\indexentry{半环分析|hyperpage}{73} \indexentry{完全分析|hyperpage}{75}
\indexentry{Semi-ring Parsing|hyperpage}{73} \indexentry{Full Parsing|hyperpage}{75}
\indexentry{组合|hyperpage}{75} \indexentry{终结符|hyperpage}{75}
\indexentry{Composition|hyperpage}{75} \indexentry{Terminal|hyperpage}{75}
\indexentry{基于串的解码|hyperpage}{76} \indexentry{预终结符|hyperpage}{75}
\indexentry{String-based Decoding|hyperpage}{76} \indexentry{Pre-terminal|hyperpage}{75}
\indexentry{基于树的解码|hyperpage}{76} \indexentry{非终结符|hyperpage}{75}
\indexentry{Tree-based Decoding|hyperpage}{76} \indexentry{Non-terminal|hyperpage}{75}
\indexentry{Lexicalized Norm Form|hyperpage}{79} \indexentry{上下文无关文法|hyperpage}{76}
\indexentry{Context-Free Grammar|hyperpage}{76}
\indexentry{产生式规则|hyperpage}{77}
\indexentry{Production Rule|hyperpage}{77}
\indexentry{推导|hyperpage}{78}
\indexentry{Derivation|hyperpage}{78}
\indexentry{句子|hyperpage}{79}
\indexentry{Sentence|hyperpage}{79}
\indexentry{语言|hyperpage}{79}
\indexentry{Language|hyperpage}{79}
\indexentry{歧义|hyperpage}{79}
\indexentry{Ambiguity|hyperpage}{79}
\indexentry{消歧|hyperpage}{79}
\indexentry{Disambiguation|hyperpage}{79}
\indexentry{最左优先推导|hyperpage}{79}
\indexentry{Left-most Derivation|hyperpage}{79}
\indexentry{概率上下文无关文法|hyperpage}{81}
\indexentry{Probabilistic Context-Free Grammar|hyperpage}{81}
\indexentry{树库|hyperpage}{82}
\indexentry{Treebank|hyperpage}{82}
\indexentry{生成模型|hyperpage}{83}
\indexentry{Generative Model|hyperpage}{83}
\indexentry{判别模型|hyperpage}{83}
\indexentry{Discriminative Model|hyperpage}{83}
\indexentry{流畅度|hyperpage}{88}
\indexentry{Fluency|hyperpage}{88}
\indexentry{准确性|hyperpage}{88}
\indexentry{Accuracy|hyperpage}{88}
\indexentry{充分性|hyperpage}{88}
\indexentry{Adequacy|hyperpage}{88}
\indexentry{翻译候选|hyperpage}{89}
\indexentry{Translation Candidate|hyperpage}{89}
\indexentry{训练|hyperpage}{91}
\indexentry{Training|hyperpage}{91}
\indexentry{解码|hyperpage}{91}
\indexentry{Decoding|hyperpage}{91}
\indexentry{推断|hyperpage}{91}
\indexentry{Inference|hyperpage}{91}
\indexentry{词对齐|hyperpage}{96}
\indexentry{Word Alignment|hyperpage}{96}
\indexentry{词对齐连接|hyperpage}{96}
\indexentry{解码|hyperpage}{99}
\indexentry{Decoding|hyperpage}{99}
\indexentry{噪声信道模型|hyperpage}{102}
\indexentry{Noisy Channel Model|hyperpage}{102}
\indexentry{词对齐|hyperpage}{104}
\indexentry{Word Alignment|hyperpage}{104}
\indexentry{非对称的词对齐|hyperpage}{105}
\indexentry{Asymmetric Word Alignment|hyperpage}{105}
\indexentry{空对齐|hyperpage}{105}
\indexentry{拉格朗日乘数法|hyperpage}{113}
\indexentry{The Lagrange Multiplier Method|hyperpage}{113}
\indexentry{期望最大化|hyperpage}{115}
\indexentry{Expectation Maximization|hyperpage}{115}
\indexentry{期望频次|hyperpage}{116}
\indexentry{Expected Count|hyperpage}{116}
\indexentry{产出率|hyperpage}{119}
\indexentry{繁衍率|hyperpage}{119}
\indexentry{Fertility|hyperpage}{119}
\indexentry{扭曲度|hyperpage}{121}
\indexentry{Distortion|hyperpage}{121}
\indexentry{概念单元|hyperpage}{123}
\indexentry{概念|hyperpage}{123}
\indexentry{Concept|hyperpage}{123}
\indexentry{缺陷|hyperpage}{125}
\indexentry{Deficiency|hyperpage}{125}
\indexentry{凸函数|hyperpage}{129}
\indexentry{Convex function|hyperpage}{129}
\indexentry{对称化|hyperpage}{130}
\indexentry{Symmetrization|hyperpage}{130}
\indexentry{系统偏置|hyperpage}{131}
\indexentry{System Bias|hyperpage}{131}
\indexentry{组合性翻译|hyperpage}{136}
\indexentry{Compositional Translation|hyperpage}{136}
\indexentry{短语|hyperpage}{136}
\indexentry{短语切分|hyperpage}{141}
\indexentry{Phrasal Segmentation|hyperpage}{141}
\indexentry{短语对|hyperpage}{141}
\indexentry{推导|hyperpage}{141}
\indexentry{Derivation|hyperpage}{141}
\indexentry{生成式模型|hyperpage}{144}
\indexentry{Generative Model|hyperpage}{144}
\indexentry{判别式模型|hyperpage}{144}
\indexentry{Discriminative Model|hyperpage}{144}
\indexentry{对数线性模型|hyperpage}{145}
\indexentry{Log-linear Model|hyperpage}{145}
\indexentry{短语抽取|hyperpage}{146}
\indexentry{Phrase Extraction|hyperpage}{146}
\indexentry{词汇化翻译概率|hyperpage}{149}
\indexentry{Lexical Translation Probability|hyperpage}{149}
\indexentry{短语表|hyperpage}{150}
\indexentry{Phrase Table|hyperpage}{150}
\indexentry{调序|hyperpage}{150}
\indexentry{Reordering|hyperpage}{150}
\indexentry{模型训练|hyperpage}{154}
\indexentry{Model Training|hyperpage}{154}
\indexentry{权重调优|hyperpage}{154}
\indexentry{Weight Tuning|hyperpage}{154}
\indexentry{最小错误率训练|hyperpage}{154}
\indexentry{Minimum Error Rate Training|hyperpage}{154}
\indexentry{调优集合|hyperpage}{154}
\indexentry{Tuning Set|hyperpage}{154}
\indexentry{线搜索|hyperpage}{155}
\indexentry{Line Search|hyperpage}{155}
\indexentry{格搜索|hyperpage}{156}
\indexentry{Grid Search|hyperpage}{156}
\indexentry{覆盖度模型|hyperpage}{158}
\indexentry{Coverage Model|hyperpage}{158}
\indexentry{翻译候选|hyperpage}{158}
\indexentry{Translation Candidate|hyperpage}{158}
\indexentry{翻译假设|hyperpage}{159}
\indexentry{Translation Hypothesis|hyperpage}{159}
\indexentry{剪枝|hyperpage}{160}
\indexentry{Pruning|hyperpage}{160}
\indexentry{束剪枝|hyperpage}{160}
\indexentry{Beam Pruning|hyperpage}{160}
\indexentry{直方图剪枝|hyperpage}{160}
\indexentry{Histogram Pruning|hyperpage}{160}
\indexentry{阈值剪枝|hyperpage}{160}
\indexentry{Threshold Pruning|hyperpage}{160}
\indexentry{假设重组|hyperpage}{160}
\indexentry{Hypothesis Recombination|hyperpage}{160}
\indexentry{基于层次短语的模型|hyperpage}{164}
\indexentry{Hierarchical Phrase-based Model|hyperpage}{164}
\indexentry{同步上下文无关文法|hyperpage}{165}
\indexentry{Synchronous Context-free Grammar|hyperpage}{165}
\indexentry{基于层次短语的文法|hyperpage}{166}
\indexentry{Hierarchical Phrase-based Grammar|hyperpage}{166}
\indexentry{推导|hyperpage}{167}
\indexentry{Derivation|hyperpage}{167}
\indexentry{胶水规则|hyperpage}{167}
\indexentry{Glue Rule|hyperpage}{167}
\indexentry{乔姆斯基范式|hyperpage}{171}
\indexentry{Chomsky Normal Form|hyperpage}{171}
\indexentry{跨度|hyperpage}{171}
\indexentry{Span|hyperpage}{171}
\indexentry{自下而上的分析|hyperpage}{172}
\indexentry{Bottom-up Parsing|hyperpage}{172}
\indexentry{束剪枝|hyperpage}{174}
\indexentry{Beam Pruning|hyperpage}{174}
\indexentry{立方剪枝|hyperpage}{176}
\indexentry{Cube Pruning|hyperpage}{176}
\indexentry{序列化|hyperpage}{179}
\indexentry{线性化|hyperpage}{179}
\indexentry{Linearization|hyperpage}{179}
\indexentry{树到串翻译规则|hyperpage}{181}
\indexentry{Tree-to-String Translation Rule|hyperpage}{181}
\indexentry{树到树翻译规则|hyperpage}{181}
\indexentry{Tree-to-Tree Translation Rule|hyperpage}{181}
\indexentry{树片段|hyperpage}{182}
\indexentry{Tree Fragment|hyperpage}{182}
\indexentry{同步树替换文法规则|hyperpage}{183}
\indexentry{Synchronous Tree Substitution Grammar Rule|hyperpage}{183}
\indexentry{边缘集合|hyperpage}{189}
\indexentry{Frontier Set|hyperpage}{189}
\indexentry{最小规则|hyperpage}{190}
\indexentry{Minimal Rules|hyperpage}{190}
\indexentry{二叉化|hyperpage}{194}
\indexentry{Binarization|hyperpage}{194}
\indexentry{基于短语的特征|hyperpage}{198}
\indexentry{基于句法的特征|hyperpage}{198}
\indexentry{有向超图|hyperpage}{199}
\indexentry{Directed Hyper-graph|hyperpage}{199}
\indexentry{超边|hyperpage}{199}
\indexentry{Hyper-edge|hyperpage}{199}
\indexentry{半环分析|hyperpage}{200}
\indexentry{Semi-ring Parsing|hyperpage}{200}
\indexentry{组合|hyperpage}{201}
\indexentry{Composition|hyperpage}{201}
\indexentry{基于串的解码|hyperpage}{201}
\indexentry{String-based Decoding|hyperpage}{201}
\indexentry{基于树的解码|hyperpage}{201}
\indexentry{Tree-based Decoding|hyperpage}{201}
\indexentry{Lexicalized Normal Form|hyperpage}{205}
\indexentry{人工神经网络|hyperpage}{211}
\indexentry{Artificial Neural Networks|hyperpage}{211}
\indexentry{神经网络|hyperpage}{211}
\indexentry{Neural Networks|hyperpage}{211}
\indexentry{深度学习|hyperpage}{212}
\indexentry{Deep Learning|hyperpage}{212}
\indexentry{连接主义|hyperpage}{213}
\indexentry{Connectionism|hyperpage}{213}
\indexentry{分布式表示|hyperpage}{213}
\indexentry{Distributed Representation|hyperpage}{213}
\indexentry{符号主义|hyperpage}{213}
\indexentry{Symbolicism|hyperpage}{213}
\indexentry{端到端学习|hyperpage}{215}
\indexentry{End-to-End Learning|hyperpage}{215}
\indexentry{表示学习|hyperpage}{215}
\indexentry{Representation Learning|hyperpage}{215}
\indexentry{分布式表示|hyperpage}{216}
\indexentry{Distributed Representation|hyperpage}{216}
\indexentry{标量|hyperpage}{217}
\indexentry{Scalar|hyperpage}{217}
\indexentry{向量|hyperpage}{217}
\indexentry{Vector|hyperpage}{217}
\indexentry{矩阵|hyperpage}{217}
\indexentry{Matrix|hyperpage}{217}
\indexentry{转置|hyperpage}{218}
\indexentry{Transpose|hyperpage}{218}
\indexentry{按元素加法|hyperpage}{218}
\indexentry{Element-wise Addition|hyperpage}{218}
\indexentry{数乘|hyperpage}{219}
\indexentry{Scalar Multiplication|hyperpage}{219}
\indexentry{按元素乘积|hyperpage}{220}
\indexentry{Element-wise Product|hyperpage}{220}
\indexentry{线性映射|hyperpage}{220}
\indexentry{Linear Mapping|hyperpage}{220}
\indexentry{线性变换|hyperpage}{220}
\indexentry{Linear Transformation|hyperpage}{220}
\indexentry{范数|hyperpage}{221}
\indexentry{Norm|hyperpage}{221}
\indexentry{欧几里得范数|hyperpage}{222}
\indexentry{Euclidean Norm|hyperpage}{222}
\indexentry{Frobenius 范数|hyperpage}{222}
\indexentry{Frobenius Norm|hyperpage}{222}
\indexentry{权重|hyperpage}{223}
\indexentry{Weight|hyperpage}{223}
\indexentry{张量|hyperpage}{233}
\indexentry{Tensor|hyperpage}{233}
\indexentry{阶|hyperpage}{233}
\indexentry{Rank|hyperpage}{233}
\indexentry{广播机制|hyperpage}{237}
\indexentry{向量化|hyperpage}{237}
\indexentry{Vectorization|hyperpage}{237}
\indexentry{前向传播|hyperpage}{241}
\indexentry{计算图|hyperpage}{242}
\indexentry{Computation Graph|hyperpage}{242}
\indexentry{模型参数|hyperpage}{243}
\indexentry{Model Parameters|hyperpage}{243}
\indexentry{训练|hyperpage}{243}
\indexentry{Training|hyperpage}{243}
\indexentry{有标注数据|hyperpage}{243}
\indexentry{Annotated Data/Labeled Data|hyperpage}{243}
\indexentry{有指导的训练|hyperpage}{243}
\indexentry{有监督的训练|hyperpage}{243}
\indexentry{Supervised Training|hyperpage}{243}
\indexentry{训练数据集合|hyperpage}{244}
\indexentry{Training Data Set|hyperpage}{244}
\indexentry{损失函数|hyperpage}{244}
\indexentry{Loss Function|hyperpage}{244}
\indexentry{目标函数|hyperpage}{244}
\indexentry{Objective Function|hyperpage}{244}
\indexentry{代价函数|hyperpage}{246}
\indexentry{Cost Function|hyperpage}{246}
\indexentry{梯度下降方法|hyperpage}{246}
\indexentry{Gradient Descent Method|hyperpage}{246}
\indexentry{参数更新的规则|hyperpage}{246}
\indexentry{Update Rule|hyperpage}{246}
\indexentry{学习率|hyperpage}{246}
\indexentry{Learning Rate|hyperpage}{246}
\indexentry{基于梯度的方法|hyperpage}{246}
\indexentry{Gradient-based Method|hyperpage}{246}
\indexentry{批量梯度下降|hyperpage}{247}
\indexentry{Batch Gradient Descent|hyperpage}{247}
\indexentry{随机梯度下降|hyperpage}{247}
\indexentry{Stochastic Gradient Descent|hyperpage}{247}
\indexentry{小批量梯度下降|hyperpage}{247}
\indexentry{Mini-Batch Gradient Descent|hyperpage}{247}
\indexentry{数值微分|hyperpage}{248}
\indexentry{Numerical Differentiation|hyperpage}{248}
\indexentry{截断误差|hyperpage}{248}
\indexentry{Truncation Error|hyperpage}{248}
\indexentry{舍入误差|hyperpage}{248}
\indexentry{Round-off Error|hyperpage}{248}
\indexentry{符号微分|hyperpage}{249}
\indexentry{Symbolic Differentiation|hyperpage}{249}
\indexentry{表达式膨胀|hyperpage}{249}
\indexentry{Expression Swell|hyperpage}{249}
\indexentry{自动微分|hyperpage}{249}
\indexentry{Automatic Differentiation|hyperpage}{249}
\indexentry{反向模式|hyperpage}{250}
\indexentry{Backward Mode|hyperpage}{250}
\indexentry{学习率|hyperpage}{251}
\indexentry{Learning Rate|hyperpage}{251}
\indexentry{Momentum|hyperpage}{251}
\indexentry{AdaGrad|hyperpage}{252}
\indexentry{衰减|hyperpage}{252}
\indexentry{Decay|hyperpage}{252}
\indexentry{RMSprop|hyperpage}{252}
\indexentry{Adam|hyperpage}{253}
\indexentry{数据并行|hyperpage}{253}
\indexentry{同步更新|hyperpage}{254}
\indexentry{Synchronous Update|hyperpage}{254}
\indexentry{异步更新|hyperpage}{254}
\indexentry{Asynchronous Update|hyperpage}{254}
\indexentry{参数服务器|hyperpage}{254}
\indexentry{Parameter Server|hyperpage}{254}
\indexentry{梯度消失|hyperpage}{255}
\indexentry{Gradient Vanishing|hyperpage}{255}
\indexentry{梯度爆炸|hyperpage}{255}
\indexentry{Gradient Explosion|hyperpage}{255}
\indexentry{梯度裁剪|hyperpage}{256}
\indexentry{Gradient Clipping|hyperpage}{256}
\indexentry{批量归一化|hyperpage}{257}
\indexentry{Batch Normalization|hyperpage}{257}
\indexentry{层归一化|hyperpage}{257}
\indexentry{Layer Normalization|hyperpage}{257}
\indexentry{残差网络|hyperpage}{257}
\indexentry{Residual Networks|hyperpage}{257}
\indexentry{跳接|hyperpage}{257}
\indexentry{Shortcut Connection|hyperpage}{257}
\indexentry{过拟合|hyperpage}{258}
\indexentry{Overfitting|hyperpage}{258}
\indexentry{正则化|hyperpage}{258}
\indexentry{Regularization|hyperpage}{258}
\indexentry{反向传播|hyperpage}{259}
\indexentry{Back Propagation|hyperpage}{259}
\indexentry{神经语言模型|hyperpage}{265}
\indexentry{Neural Language Model|hyperpage}{265}
\indexentry{前馈神经网络语言模型|hyperpage}{266}
\indexentry{Feed-forward Neural Network Language Model|hyperpage}{266}
\indexentry{循环神经网络|hyperpage}{268}
\indexentry{Recurrent Neural Network|hyperpage}{268}
\indexentry{循环神经网络语言模型|hyperpage}{268}
\indexentry{RNNLM|hyperpage}{268}
\indexentry{循环单元|hyperpage}{268}
\indexentry{RNN Cell|hyperpage}{268}
\indexentry{自注意力机制|hyperpage}{270}
\indexentry{Self-Attention Mechanism|hyperpage}{270}
\indexentry{注意力权重|hyperpage}{270}
\indexentry{Attention Weight|hyperpage}{270}
\indexentry{困惑度|hyperpage}{271}
\indexentry{Perplexity|hyperpage}{271}
\indexentry{One-hot编码|hyperpage}{271}
\indexentry{独热编码|hyperpage}{271}
\indexentry{分布式表示|hyperpage}{272}
\indexentry{Distributed Representation|hyperpage}{272}
\indexentry{词嵌入|hyperpage}{272}
\indexentry{Word Embedding|hyperpage}{272}
\indexentry{句子表示模型|hyperpage}{274}
\indexentry{句子的表示|hyperpage}{274}
\indexentry{表示学习|hyperpage}{274}
\indexentry{Representation Learning|hyperpage}{274}
\indexentry{可解释机器学习|hyperpage}{278}
\indexentry{Explainable Machine Learning|hyperpage}{278}
\indexentry{神经机器翻译|hyperpage}{281}
\indexentry{Neural Machine Translation|hyperpage}{281}
\indexentry{分布式表示|hyperpage}{283}
\indexentry{Distributed Representation|hyperpage}{283}
\indexentry{特征工程|hyperpage}{289}
\indexentry{Feature Engineering|hyperpage}{289}
\indexentry{编码器-解码器模型|hyperpage}{290}
\indexentry{Encoder-Decoder Paradigm|hyperpage}{290}
\indexentry{编码器-解码器框架|hyperpage}{290}
\indexentry{循环神经网络|hyperpage}{295}
\indexentry{Recurrent Neural Network,RNN|hyperpage}{295}
\indexentry{词嵌入|hyperpage}{297}
\indexentry{Word Embedding|hyperpage}{297}
\indexentry{表示学习|hyperpage}{297}
\indexentry{Representation Learning|hyperpage}{297}
\indexentry{生成|hyperpage}{297}
\indexentry{Generation|hyperpage}{297}
\indexentry{长短时记忆|hyperpage}{302}
\indexentry{Long Short-Term Memory|hyperpage}{302}
\indexentry{遗忘|hyperpage}{302}
\indexentry{记忆更新|hyperpage}{303}
\indexentry{输出|hyperpage}{303}
\indexentry{门循环单元|hyperpage}{304}
\indexentry{Gated Recurrent Unit,GRU|hyperpage}{304}
\indexentry{注意力权重|hyperpage}{309}
\indexentry{Attention Weight|hyperpage}{309}
\indexentry{一阶矩估计|hyperpage}{315}
\indexentry{First Moment Estimation|hyperpage}{315}
\indexentry{二阶矩估计|hyperpage}{315}
\indexentry{Second Moment Estimation|hyperpage}{315}
\indexentry{学习率|hyperpage}{316}
\indexentry{Learning Rate|hyperpage}{316}
\indexentry{逐渐预热|hyperpage}{316}
\indexentry{Gradual Warmup|hyperpage}{316}
\indexentry{分段常数衰减|hyperpage}{317}
\indexentry{Piecewise Constant Decay|hyperpage}{317}
\indexentry{数据并行|hyperpage}{318}
\indexentry{模型并行|hyperpage}{318}
\indexentry{全搜索|hyperpage}{320}
\indexentry{Full Search|hyperpage}{320}
\indexentry{贪婪搜索|hyperpage}{320}
\indexentry{Greedy Search|hyperpage}{320}
\indexentry{束搜索|hyperpage}{320}
\indexentry{Beam Search|hyperpage}{320}
\indexentry{自回归模型|hyperpage}{320}
\indexentry{Autoregressive Model|hyperpage}{321}
\indexentry{非自回归模型|hyperpage}{321}
\indexentry{Non-autoregressive Model|hyperpage}{321}
\indexentry{自注意力机制|hyperpage}{326}
\indexentry{Self-Attention|hyperpage}{326}
\indexentry{特征提取|hyperpage}{327}
\indexentry{自注意力子层|hyperpage}{328}
\indexentry{Self-attention Sub-layer|hyperpage}{328}
\indexentry{前馈神经网络子层|hyperpage}{328}
\indexentry{Feed-forward Sub-layer|hyperpage}{328}
\indexentry{残差连接|hyperpage}{328}
\indexentry{Residual Connection|hyperpage}{328}
\indexentry{层正则化|hyperpage}{328}
\indexentry{Layer Normalization|hyperpage}{328}
\indexentry{编码-解码注意力子层|hyperpage}{329}
\indexentry{Encoder-decoder Attention Sub-layer|hyperpage}{329}
\indexentry{词嵌入|hyperpage}{329}
\indexentry{Word Embedding|hyperpage}{329}
\indexentry{位置编码|hyperpage}{329}
\indexentry{Position Embedding|hyperpage}{329}
\indexentry{点乘注意力|hyperpage}{333}
\indexentry{Scaled Dot-Product Attention|hyperpage}{333}
\indexentry{多头注意力|hyperpage}{335}
\indexentry{Multi-head Attention|hyperpage}{335}
\indexentry{残差连接|hyperpage}{336}
\indexentry{短连接|hyperpage}{337}
\indexentry{Short-cut Connection|hyperpage}{337}
\indexentry{后正则化|hyperpage}{338}
\indexentry{Post-norm|hyperpage}{338}
\indexentry{前正则化|hyperpage}{338}
\indexentry{Pre-norm|hyperpage}{338}
\indexentry{交叉熵损失|hyperpage}{339}
\indexentry{Cross Entropy Loss|hyperpage}{339}
\indexentry{预热|hyperpage}{339}
\indexentry{Warmup|hyperpage}{339}
\indexentry{小批量训练|hyperpage}{340}
\indexentry{Mini-batch Training|hyperpage}{340}
\indexentry{Dropout|hyperpage}{340}
\indexentry{过拟合|hyperpage}{340}
\indexentry{Overfitting|hyperpage}{340}
\indexentry{标签平滑|hyperpage}{340}
\indexentry{Label Smoothing|hyperpage}{340}
\indexentry{序列到序列的转换/生成问题|hyperpage}{342}
\indexentry{Sequence-to-Sequence Problem|hyperpage}{342}
\indexentry{未登录词|hyperpage}{353}
\indexentry{Out of Vocabulary Word,OOV Word|hyperpage}{353}
\indexentry{子词切分|hyperpage}{353}
\indexentry{Sub-word Segmentation|hyperpage}{353}
\indexentry{标准化|hyperpage}{353}
\indexentry{Normalization|hyperpage}{353}
\indexentry{数据清洗|hyperpage}{353}
\indexentry{Data Cleaning|hyperpage}{353}
\indexentry{数据选择|hyperpage}{355}
\indexentry{Data Selection|hyperpage}{355}
\indexentry{数据过滤|hyperpage}{355}
\indexentry{Data Filtering|hyperpage}{355}
\indexentry{开放词表|hyperpage}{358}
\indexentry{Open-Vocabulary|hyperpage}{358}
\indexentry{子词|hyperpage}{359}
\indexentry{Sub-word|hyperpage}{359}
\indexentry{字节对编码|hyperpage}{359}
\indexentry{双字节编码|hyperpage}{359}
\indexentry{Byte Pair Encoding,BPE|hyperpage}{359}
\indexentry{正则化|hyperpage}{362}
\indexentry{Regularization|hyperpage}{362}
\indexentry{过拟合问题|hyperpage}{362}
\indexentry{Overfitting Problem|hyperpage}{362}
\indexentry{反问题|hyperpage}{362}
\indexentry{Inverse Problem|hyperpage}{362}
\indexentry{适定的|hyperpage}{363}
\indexentry{Well-posed|hyperpage}{363}
\indexentry{不适定问题|hyperpage}{363}
\indexentry{Ill-posed Problem|hyperpage}{363}
\indexentry{降噪|hyperpage}{363}
\indexentry{Denoising|hyperpage}{363}
\indexentry{泛化|hyperpage}{364}
\indexentry{Generalization|hyperpage}{364}
\indexentry{标签平滑|hyperpage}{365}
\indexentry{Label Smoothing|hyperpage}{365}
\indexentry{相互适应|hyperpage}{366}
\indexentry{Co-Adaptation|hyperpage}{366}
\indexentry{集成学习|hyperpage}{368}
\indexentry{Ensemble Learning|hyperpage}{368}
\indexentry{容量|hyperpage}{369}
\indexentry{Capacity|hyperpage}{369}
\indexentry{宽残差网络|hyperpage}{369}
\indexentry{Wide Residual Network|hyperpage}{369}
\indexentry{探测任务|hyperpage}{371}
\indexentry{Probing Task|hyperpage}{371}
\indexentry{表面信息|hyperpage}{371}
\indexentry{Surface Information|hyperpage}{371}
\indexentry{语法信息|hyperpage}{371}
\indexentry{Syntactic Information|hyperpage}{371}
\indexentry{语义信息|hyperpage}{371}
\indexentry{Semantic Information|hyperpage}{371}
\indexentry{词嵌入|hyperpage}{371}
\indexentry{Embedding|hyperpage}{371}
\indexentry{数据并行|hyperpage}{372}
\indexentry{Data Parallelism|hyperpage}{372}
\indexentry{模型并行|hyperpage}{372}
\indexentry{Model Parallelism|hyperpage}{372}
\indexentry{小批量训练|hyperpage}{372}
\indexentry{Mini-batch Training|hyperpage}{372}
\indexentry{课程学习|hyperpage}{374}
\indexentry{Curriculum Learning|hyperpage}{374}
\indexentry{推断|hyperpage}{375}
\indexentry{Inference|hyperpage}{375}
\indexentry{解码|hyperpage}{375}
\indexentry{Decoding|hyperpage}{375}
\indexentry{准确性|hyperpage}{375}
\indexentry{Accuracy|hyperpage}{375}
\indexentry{时延|hyperpage}{375}
\indexentry{Latency|hyperpage}{375}
\indexentry{存储|hyperpage}{375}
\indexentry{Memory|hyperpage}{375}
\indexentry{搜索错误|hyperpage}{375}
\indexentry{Search Error|hyperpage}{375}
\indexentry{模型错误|hyperpage}{375}
\indexentry{Modeling Error|hyperpage}{375}
\indexentry{重排序|hyperpage}{377}
\indexentry{Re-ranking|hyperpage}{377}
\indexentry{双向推断|hyperpage}{377}
\indexentry{Bidirectional Inference|hyperpage}{377}
\indexentry{批量推断|hyperpage}{381}
\indexentry{Batch Inference|hyperpage}{381}
\indexentry{批量处理|hyperpage}{381}
\indexentry{Batching|hyperpage}{381}
\indexentry{二值网络|hyperpage}{383}
\indexentry{Binarized Neural Networks|hyperpage}{383}
\indexentry{自回归翻译|hyperpage}{383}
\indexentry{Autoregressive Translation|hyperpage}{383}
\indexentry{非自回归翻译|hyperpage}{383}
\indexentry{Non-autoregressive Translation|hyperpage}{383}
\indexentry{繁衍率|hyperpage}{383}
\indexentry{Fertility|hyperpage}{383}
\indexentry{偏置|hyperpage}{385}
\indexentry{Bias|hyperpage}{385}
\indexentry{退化|hyperpage}{385}
\indexentry{Degenerate|hyperpage}{385}
\indexentry{过翻译|hyperpage}{386}
\indexentry{Over Translation|hyperpage}{386}
\indexentry{欠翻译|hyperpage}{386}
\indexentry{Under Translation|hyperpage}{386}
\indexentry{充分性|hyperpage}{387}
\indexentry{Adequacy|hyperpage}{387}
\indexentry{系统融合|hyperpage}{388}
\indexentry{System Combination|hyperpage}{388}
\indexentry{假设选择|hyperpage}{388}
\indexentry{Hypothesis Selection|hyperpage}{388}
\indexentry{多样性|hyperpage}{388}
\indexentry{Diversity|hyperpage}{388}
\indexentry{重排序|hyperpage}{389}
\indexentry{Re-ranking|hyperpage}{389}
\indexentry{混淆网络|hyperpage}{390}
\indexentry{Confusion Network|hyperpage}{390}
\indexentry{动态线性层聚合方法|hyperpage}{394}
\indexentry{Dynamic Linear Combination of Layers,DLCL|hyperpage}{394}
\indexentry{相互适应|hyperpage}{398}
\indexentry{Co-adaptation|hyperpage}{398}
\indexentry{数据增强|hyperpage}{401}
\indexentry{Data Augmentation|hyperpage}{401}
\indexentry{回译|hyperpage}{401}
\indexentry{Back Translation|hyperpage}{401}
\indexentry{迭代式回译|hyperpage}{401}
\indexentry{Iterative Back Translation|hyperpage}{401}
\indexentry{前向翻译|hyperpage}{402}
\indexentry{Forward Translation|hyperpage}{402}
\indexentry{预训练|hyperpage}{402}
\indexentry{Pre-training|hyperpage}{402}
\indexentry{微调|hyperpage}{402}
\indexentry{Fine-tuning|hyperpage}{402}
\indexentry{多任务学习|hyperpage}{404}
\indexentry{Multitask Learning|hyperpage}{404}
\indexentry{模型压缩|hyperpage}{405}
\indexentry{Model Compression|hyperpage}{405}
\indexentry{学习难度|hyperpage}{405}
\indexentry{Learning Difficulty|hyperpage}{406}
\indexentry{教师模型|hyperpage}{406}
\indexentry{Teacher Model|hyperpage}{406}
\indexentry{学生模型|hyperpage}{406}
\indexentry{Student Model|hyperpage}{406}
\indexentry{基于单词的知识精炼|hyperpage}{406}
\indexentry{Word-level Knowledge Distillation|hyperpage}{406}
\indexentry{基于序列的知识精炼|hyperpage}{407}
\indexentry{Sequence-level Knowledge Distillation|hyperpage}{407}
\indexentry{中间层输出|hyperpage}{408}
\indexentry{Hint-based Knowledge Transfer|hyperpage}{408}
\indexentry{注意力分布|hyperpage}{408}
\indexentry{Attention To Attention Transfer|hyperpage}{408}
\indexentry{循环一致性|hyperpage}{410}
\indexentry{Cycle Consistency|hyperpage}{410}
\indexentry{翻译中回译|hyperpage}{411}
\indexentry{On-the-fly Back-translation|hyperpage}{411}
\indexentry{网络结构搜索技术|hyperpage}{414}
\indexentry{Neural Architecture Search,NAS|hyperpage}{414}
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
\defcounter {refsection}{0}\relax
\select@language {english}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {I}{机器翻译基础}}{15}{part.1}
\ttl@starttoc {default@1}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {1}机器翻译简介}{17}{chapter.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.1}机器翻译的概念}{17}{section.1.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.2}机器翻译简史}{20}{section.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.1}人工翻译}{20}{subsection.1.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.2}机器翻译的萌芽}{21}{subsection.1.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.3}机器翻译的受挫}{22}{subsection.1.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.4}机器翻译的快速成长}{23}{subsection.1.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.5}机器翻译的爆发}{24}{subsection.1.2.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.3}机器翻译现状}{25}{section.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.4}机器翻译方法}{27}{section.1.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.1}基于规则的机器翻译}{27}{subsection.1.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.2}基于实例的机器翻译}{28}{subsection.1.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.3}统计机器翻译}{29}{subsection.1.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.4}神经机器翻译}{30}{subsection.1.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.5}对比分析}{31}{subsection.1.4.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.5}翻译质量评价}{32}{section.1.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.1}人工评价}{32}{subsection.1.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.2}自动评价}{33}{subsection.1.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{BLEU}{33}{section*.17}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{TER}{35}{section*.18}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于检测点的评价}{35}{section*.19}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.6}机器翻译应用}{36}{section.1.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.7}开源项目与评测}{38}{section.1.7}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.1}开源机器翻译系统}{38}{subsection.1.7.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{统计机器翻译开源系统}{39}{section*.21}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经机器翻译开源系统}{40}{section*.22}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.2}常用数据集及公开评测任务}{42}{subsection.1.7.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.8}推荐学习资源}{44}{section.1.8}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {2}词法、语法及统计建模基础}{49}{chapter.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.1}问题概述 }{50}{section.2.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.2}概率论基础}{51}{section.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.1}随机变量和概率}{52}{subsection.2.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.2}联合概率、条件概率和边缘概率}{53}{subsection.2.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.3}链式法则}{54}{subsection.2.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.4}贝叶斯法则}{55}{subsection.2.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.5}KL距离和熵}{57}{subsection.2.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{信息熵}{57}{section*.29}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{KL距离}{58}{section*.31}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{交叉熵}{58}{section*.32}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.3}中文分词}{59}{section.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.1}基于词典的分词方法}{60}{subsection.2.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.2}基于统计的分词方法}{61}{subsection.2.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{统计模型的学习与推断}{61}{section*.36}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{掷骰子游戏}{62}{section*.38}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{全概率分词方法}{64}{section*.42}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.4}$n$-gram语言模型 }{66}{section.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.4.1}建模}{67}{subsection.2.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.4.2}未登录词和平滑算法}{69}{subsection.2.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{加法平滑方法}{70}{section*.48}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{古德-图灵估计法}{71}{section*.50}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Kneser-Ney平滑方法}{72}{section*.52}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.5}句法分析(短语结构分析)}{74}{section.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.1}句子的句法树表示}{74}{subsection.2.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.2}上下文无关文法}{76}{subsection.2.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.3}规则和推导的概率}{81}{subsection.2.5.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.6}小结及深入阅读}{83}{section.2.6}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {II}{统计机器翻译}}{85}{part.2}
\ttl@stoptoc {default@1}
\ttl@starttoc {default@2}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {3}基于词的机器翻译模型}{87}{chapter.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.1}什么是基于词的翻译模型}{87}{section.3.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.2}构建一个简单的机器翻译系统}{89}{section.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.1}如何进行翻译?}{89}{subsection.3.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{机器翻译流程}{90}{section*.65}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{人工翻译 vs. 机器翻译}{91}{section*.67}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.2}基本框架}{91}{subsection.3.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.3}单词翻译概率}{92}{subsection.3.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{什么是单词翻译概率?}{92}{section*.69}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从一个双语平行数据中学习?}{93}{section*.71}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从大量的双语平行数据中学习?}{94}{section*.72}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.4}句子级翻译模型}{95}{subsection.3.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基础模型}{95}{section*.74}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{生成流畅的译文}{97}{section*.76}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.5}解码}{99}{subsection.3.2.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.3}基于词的翻译建模}{101}{section.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3.1}噪声信道模型}{101}{subsection.3.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3.2}统计机器翻译的三个基本问题}{104}{subsection.3.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{词对齐}{104}{section*.86}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于词对齐的翻译模型}{105}{section*.89}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于词对齐的翻译实例}{107}{section*.91}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.4}IBM模型1-2}{107}{section.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.1}IBM模型1}{108}{subsection.3.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.2}IBM模型2}{109}{subsection.3.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.3}解码及计算优化}{110}{subsection.3.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.4}训练}{112}{subsection.3.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{目标函数}{112}{section*.96}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{优化}{113}{section*.98}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.5}IBM模型3-5及隐马尔可夫模型}{119}{section.3.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.1}基于产出率的翻译模型}{119}{subsection.3.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.2}IBM 模型3}{122}{subsection.3.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.3}IBM 模型4}{123}{subsection.3.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.4} IBM 模型5}{125}{subsection.3.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.5}隐马尔可夫模型}{126}{subsection.3.5.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{隐马尔可夫模型}{126}{section*.110}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{词对齐模型}{127}{section*.112}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.6}解码和训练}{128}{subsection.3.5.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.6}问题分析}{129}{section.3.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.1}词对齐及对称化}{129}{subsection.3.6.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.2}Deficiency}{130}{subsection.3.6.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.3}句子长度}{131}{subsection.3.6.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.4}其他问题}{131}{subsection.3.6.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.7}小结及深入阅读}{132}{section.3.7}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {4}基于短语和句法的机器翻译模型}{135}{chapter.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.1}翻译中的结构信息}{135}{section.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.1}更大粒度的翻译单元}{136}{subsection.4.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.2}句子的结构信息}{138}{subsection.4.1.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.2}基于短语的翻译模型}{140}{section.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.1}机器翻译中的短语}{140}{subsection.4.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.2}数学建模及判别式模型}{143}{subsection.4.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于翻译推导的建模}{143}{section*.124}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{对数线性模型}{144}{section*.125}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{搭建模型的基本流程}{145}{section*.126}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.3}短语抽取}{146}{subsection.4.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{与词对齐一致的短语}{147}{section*.129}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{获取词对齐}{148}{section*.133}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{度量双语短语质量}{149}{section*.135}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.4}调序}{150}{subsection.4.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于距离的调序}{151}{section*.139}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于方向的调序}{151}{section*.141}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于分类的调序}{152}{section*.144}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.5}特征}{153}{subsection.4.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.6}最小错误率训练}{154}{subsection.4.2.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.7}栈解码}{157}{subsection.4.2.7}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译候选匹配}{158}{section*.149}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译假设扩展}{159}{section*.151}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{剪枝}{160}{section*.153}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{解码中的栈结构}{161}{section*.155}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.3}基于层次短语的模型}{162}{section.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.1}同步上下文无关文法}{164}{subsection.4.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{文法定义}{165}{section*.160}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{推导}{166}{section*.161}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{胶水规则}{167}{section*.162}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{处理流程}{168}{section*.163}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.2}层次短语规则抽取}{168}{subsection.4.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.3}翻译模型及特征}{170}{subsection.4.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.4}CKY解码}{171}{subsection.4.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.5}立方剪枝}{174}{subsection.4.3.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.4}基于语言学句法的模型}{177}{section.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.1}基于句法的翻译模型分类}{179}{subsection.4.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.2}基于树结构的文法}{179}{subsection.4.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树到树翻译规则}{182}{section*.179}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于树结构的翻译推导}{183}{section*.181}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树到串翻译规则}{185}{section*.184}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.3}树到串翻译规则抽取}{186}{subsection.4.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树的切割与最小规则}{186}{section*.186}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{空对齐处理}{190}{section*.192}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{组合规则}{191}{section*.194}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{SPMT规则}{191}{section*.196}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{句法树二叉化}{192}{section*.198}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.4}树到树翻译规则抽取}{194}{subsection.4.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于节点对齐的规则抽取}{195}{section*.202}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于对齐矩阵的规则抽取}{195}{section*.205}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.5}句法翻译模型的特征}{196}{subsection.4.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.6}基于超图的推导空间表示}{199}{subsection.4.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.7}基于树的解码 vs 基于串的解码}{201}{subsection.4.4.7}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于树的解码}{202}{section*.213}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于串的解码}{204}{section*.216}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.5}小结及深入阅读}{206}{section.4.5}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {III}{神经机器翻译}}{209}{part.3}
\ttl@stoptoc {default@2}
\ttl@starttoc {default@3}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {5}人工神经网络和神经语言建模}{211}{chapter.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.1}深度学习与人工神经网络}{212}{section.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.1}发展简史}{212}{subsection.5.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{早期的人工神经网络和第一次寒冬}{212}{section*.218}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经网络的第二次高潮和第二次寒冬}{213}{section*.219}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深度学习和神经网络方法的崛起}{214}{section*.220}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.2}为什么需要深度学习}{215}{subsection.5.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{端到端学习和表示学习}{215}{section*.222}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深度学习的效果}{216}{section*.224}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.2}神经网络基础}{216}{section.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.1}线性代数基础}{216}{subsection.5.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{标量、向量和矩阵}{217}{section*.226}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵的转置}{218}{section*.227}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵加法和数乘}{218}{section*.228}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵乘法和矩阵点乘}{219}{section*.229}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{线性映射}{220}{section*.230}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{范数}{221}{section*.231}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.2}人工神经元和感知机}{222}{subsection.5.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{感知机\ \raisebox {0.5mm}{------}\ 最简单的人工神经元模型}{223}{section*.234}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元内部权重}{224}{section*.237}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元的输入\ \raisebox {0.5mm}{------}\ 离散 vs 连续}{225}{section*.239}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元内部的参数学习}{225}{section*.241}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.3}多层神经网络}{226}{subsection.5.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{线性变换和激活函数}{226}{section*.243}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{单层神经网络$\rightarrow $多层神经网络}{229}{section*.250}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.4}函数拟合能力}{230}{subsection.5.2.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.3}神经网络的张量实现}{233}{section.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.1} 张量及其计算}{233}{subsection.5.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{\ 张量}{233}{section*.259}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{张量的矩阵乘法}{235}{section*.262}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{张量的单元操作}{236}{section*.264}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.2}张量的物理存储形式}{237}{subsection.5.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.3}使用开源框架实现张量计算}{238}{subsection.5.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.4}前向传播与计算图}{241}{subsection.5.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.5}神经网络实例}{242}{subsection.5.3.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.4}神经网络的参数训练}{243}{section.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.1}损失函数}{244}{subsection.5.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.2}基于梯度的参数优化}{245}{subsection.5.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度下降}{246}{section*.278}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度获取}{248}{section*.280}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于梯度的方法的变种和改进}{250}{section*.284}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.3}参数更新的并行化策略}{253}{subsection.5.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.4}梯度消失、梯度爆炸和稳定性训练}{255}{subsection.5.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{易于优化的激活函数}{255}{section*.287}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度裁剪}{256}{section*.291}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{稳定性训练}{257}{section*.292}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.5}过拟合}{258}{subsection.5.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.6}反向传播}{259}{subsection.5.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{输出层的反向传播}{260}{section*.295}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{隐藏层的反向传播}{262}{section*.299}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{程序实现}{264}{section*.302}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.5}神经语言模型}{265}{section.5.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.1}基于神经网络的语言建模}{265}{subsection.5.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于前馈神经网络的语言模型}{266}{section*.305}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于循环神经网络的语言模型}{268}{section*.308}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于自注意力机制的语言模型}{269}{section*.310}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{语言模型的评价}{271}{section*.312}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.2}单词表示模型}{271}{subsection.5.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{One-hot编码}{271}{section*.313}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{分布式表示}{272}{section*.315}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.3}句子表示模型及预训练}{273}{subsection.5.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{简单的上下文表示模型}{274}{section*.319}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{ELMO模型}{275}{section*.322}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{GPT模型}{275}{section*.324}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{BERT模型}{276}{section*.326}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{为什么要预训练?}{277}{section*.328}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.6}小结及深入阅读}{278}{section.5.6}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {6}神经机器翻译模型}{281}{chapter.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.1}神经机器翻译的发展简史}{281}{section.6.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.1}神经机器翻译的起源}{283}{subsection.6.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.2}神经机器翻译的品质 }{285}{subsection.6.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.3}神经机器翻译的优势 }{288}{subsection.6.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.2}编码器-解码器框架}{290}{section.6.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.1}框架结构}{290}{subsection.6.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.2}表示学习}{291}{subsection.6.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.3}简单的运行实例}{292}{subsection.6.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.4}机器翻译范式的对比}{293}{subsection.6.2.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.3}基于循环神经网络的翻译模型及注意力机制}{294}{section.6.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.1}建模}{295}{subsection.6.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.2}输入(词嵌入)及输出(Softmax)}{298}{subsection.6.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.3}循环神经网络结构}{301}{subsection.6.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{循环神经单元(RNN)}{301}{section*.351}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{长短时记忆网络(LSTM)}{302}{section*.352}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{门控循环单元(GRU)}{304}{section*.355}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{双向模型}{305}{section*.357}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{多层循环神经网络}{306}{section*.359}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.4}注意力机制}{306}{subsection.6.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译中的注意力机制}{307}{section*.362}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{上下文向量的计算}{309}{section*.365}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{注意力机制的解读}{312}{section*.370}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.5}训练}{313}{subsection.6.3.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{损失函数}{314}{section*.373}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{参数初始化}{314}{section*.374}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{优化策略}{315}{section*.375}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度裁剪}{315}{section*.377}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{学习率策略}{316}{section*.378}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{并行训练}{317}{section*.381}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.6}推断}{320}{subsection.6.3.6}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{贪婪搜索}{321}{section*.386}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{束搜索}{321}{section*.389}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{长度惩罚}{323}{section*.391}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.7}实例-GNMT}{324}{subsection.6.3.7}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.4}Transformer}{324}{section.6.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.1}自注意力模型}{326}{subsection.6.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.2}Transformer架构}{328}{subsection.6.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.3}位置编码}{330}{subsection.6.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.4}基于点乘的注意力机制}{332}{subsection.6.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.5}掩码操作}{334}{subsection.6.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.6}多头注意力}{335}{subsection.6.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.7}残差网络和层正则化}{336}{subsection.6.4.7}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.8}前馈全连接网络子层}{338}{subsection.6.4.8}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.9}训练}{339}{subsection.6.4.9}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.10}推断}{341}{subsection.6.4.10}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.5}序列到序列问题及应用}{342}{section.6.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.1}自动问答}{342}{subsection.6.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.2}自动文摘}{343}{subsection.6.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.3}文言文翻译}{343}{subsection.6.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.4}对联生成}{344}{subsection.6.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.5}古诗生成}{344}{subsection.6.5.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.6}小结及深入阅读}{346}{section.6.6}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {7}神经机器翻译实战 \ \raisebox {0.5mm}{------}\ 参加一次比赛}{349}{chapter.7}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.1}神经机器翻译并不简单}{349}{section.7.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.1}影响神经机器翻译性能的因素}{350}{subsection.7.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.2}搭建神经机器翻译系统的步骤 }{351}{subsection.7.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.3}架构选择 }{352}{subsection.7.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.2}数据处理}{352}{section.7.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.1}分词}{353}{subsection.7.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.2}标准化}{354}{subsection.7.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.3}数据清洗}{355}{subsection.7.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.4}子词切分}{357}{subsection.7.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{大词表和OOV问题}{358}{section*.428}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{子词}{358}{section*.430}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{双字节编码(BPE)}{359}{section*.432}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{其他方法}{362}{section*.435}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.3}建模与训练}{362}{section.7.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.1}正则化}{362}{subsection.7.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{L1/L2正则化}{364}{section*.437}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{标签平滑}{365}{section*.438}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Dropout}{366}{section*.440}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Layer Dropout}{368}{section*.443}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.2}增大模型容量}{369}{subsection.7.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{宽网络}{369}{section*.445}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深网络}{370}{section*.447}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{增大输入层和输出层表示能力}{371}{section*.449}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{大模型的分布式计算}{372}{section*.450}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.3}大批量训练}{372}{subsection.7.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{为什么需要大批量训练}{372}{section*.451}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何构建批次}{373}{section*.454}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.4}推断}{375}{section.7.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.1}推断优化}{375}{subsection.7.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{推断系统的架构}{375}{section*.456}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{自左向右推断 vs 自右向左推断}{376}{section*.458}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.1}什么是基于词的翻译模型}{11}{section.1.1}% \contentsline {subsubsection}{推断加速}{377}{section*.459}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.2}构建一个简单的机器翻译系统}{13}{section.1.2}% \contentsline {subsection}{\numberline {7.4.2}译文长度控制}{384}{subsection.7.4.2}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.1}如何进行翻译?}{13}{subsection.1.2.1}% \contentsline {subsubsection}{长度惩罚因子}{385}{section*.465}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{机器翻译流程}{14}{section*.7}% \contentsline {subsubsection}{译文长度范围约束}{386}{section*.467}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{人工翻译 vs. 机器翻译}{15}{section*.9}% \contentsline {subsubsection}{覆盖度模型}{386}{section*.468}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.2}基本框架}{15}{subsection.1.2.2}% \contentsline {subsection}{\numberline {7.4.3}多模型集成}{387}{subsection.7.4.3}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.3}单词翻译概率}{16}{subsection.1.2.3}% \contentsline {subsubsection}{假设选择}{388}{section*.469}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{什么是单词翻译概率?}{16}{section*.11}% \contentsline {subsubsection}{局部预测融合}{389}{section*.471}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从一个双语平行数据中学习?}{17}{section*.13}% \contentsline {subsubsection}{译文重组}{390}{section*.473}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从大量的双语平行数据中学习?}{18}{section*.14}% \contentsline {section}{\numberline {7.5}进阶技术}{391}{section.7.5}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.4}句子级翻译模型}{19}{subsection.1.2.4}% \contentsline {subsection}{\numberline {7.5.1}深层模型}{391}{subsection.7.5.1}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基础模型}{19}{section*.16}% \contentsline {subsubsection}{Post-Norm vs Pre-Norm}{392}{section*.476}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{生成流畅的译文}{21}{section*.18}% \contentsline {subsubsection}{层聚合}{394}{section*.479}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.5}解码}{23}{subsection.1.2.5}% \contentsline {subsubsection}{深层模型的训练加速}{395}{section*.481}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.3}基于词的翻译建模}{26}{section.1.3}% \contentsline {subsubsection}{渐进式训练}{395}{section*.482}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.3.1}噪声信道模型}{26}{subsection.1.3.1}%
+\contentsline {subsubsection}{分组稠密连接}{396}{section*.484}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.3.2}统计机器翻译的三个基本问题}{28}{subsection.1.3.2}%
+\contentsline {subsubsection}{学习率重置策略}{397}{section*.486}
\defcounter {refsection}{0}\relax
-\contentsline {subsubsection}{词对齐}{29}{section*.27}%
+\contentsline {subsubsection}{深层模型的鲁棒性训练}{398}{section*.488}
\defcounter {refsection}{0}\relax
-\contentsline {subsubsection}{基于词对齐的翻译模型}{30}{section*.30}%
+\contentsline {subsection}{\numberline {7.5.2}单语数据的使用}{400}{subsection.7.5.2}
\defcounter {refsection}{0}\relax
-\contentsline {subsubsection}{基于词对齐的翻译实例}{31}{section*.32}%
+\contentsline {subsubsection}{伪数据}{401}{section*.492}
\defcounter {refsection}{0}\relax
-\contentsline {section}{\numberline {1.4}IBM模型1-2}{32}{section.1.4}%
+\contentsline {subsubsection}{预训练}{402}{section*.495}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.4.1}IBM模型1}{32}{subsection.1.4.1}%
+\contentsline {subsubsection}{联合训练}{404}{section*.498}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.4.2}IBM模型2}{34}{subsection.1.4.2}%
+\contentsline {subsection}{\numberline {7.5.3}知识精炼}{404}{subsection.7.5.3}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.4.3}解码及计算优化}{35}{subsection.1.4.3}%
+\contentsline {subsubsection}{什么是知识精炼}{405}{section*.500}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.4.4}训练}{36}{subsection.1.4.4}%
+\contentsline {subsubsection}{知识精炼的基本方法}{406}{section*.501}
\defcounter {refsection}{0}\relax
-\contentsline {subsubsection}{目标函数}{36}{section*.37}%
+\contentsline {subsubsection}{机器翻译中的知识精炼}{408}{section*.503}
\defcounter {refsection}{0}\relax
-\contentsline {subsubsection}{优化}{37}{section*.39}%
+\contentsline {subsection}{\numberline {7.5.4}双向训练}{408}{subsection.7.5.4}
\defcounter {refsection}{0}\relax
-\contentsline {section}{\numberline {1.5}IBM模型3-5及隐马尔可夫模型}{42}{section.1.5}%
+\contentsline {subsubsection}{有监督对偶学习}{410}{section*.505}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.5.1}基于产出率的翻译模型}{44}{subsection.1.5.1}%
+\contentsline {subsubsection}{无监督对偶学习}{410}{section*.506}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.5.2}IBM 模型3}{46}{subsection.1.5.2}%
+\contentsline {subsubsection}{翻译中回译}{411}{section*.508}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.5.3}IBM 模型4}{48}{subsection.1.5.3}%
+\contentsline {section}{\numberline {7.6}小结及深入阅读}{412}{section.7.6}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.5.4} IBM 模型5}{49}{subsection.1.5.4}%
+\contentsline {part}{\@mypartnumtocformat {IV}{附录}}{417}{part.4}
+\ttl@stoptoc {default@3}
+\ttl@starttoc {default@4}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.5.5}隐马尔可夫模型}{51}{subsection.1.5.5}%
+\contentsline {chapter}{\numberline {A}附录A}{419}{Appendix.1.A}
\defcounter {refsection}{0}\relax
-\contentsline {subsubsection}{隐马尔可夫模型}{51}{section*.51}%
+\contentsline {section}{\numberline {A.1}基准数据集}{419}{section.1.A.1}
\defcounter {refsection}{0}\relax
-\contentsline {subsubsection}{词对齐模型}{52}{section*.53}%
+\contentsline {section}{\numberline {A.2}平行语料}{420}{section.1.A.2}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.5.6}解码和训练}{53}{subsection.1.5.6}%
+\contentsline {section}{\numberline {A.3}相关工具}{421}{section.1.A.3}
\defcounter {refsection}{0}\relax
-\contentsline {section}{\numberline {1.6}问题分析}{54}{section.1.6}%
+\contentsline {subsection}{\numberline {A.3.1}数据预处理工具}{421}{subsection.1.A.3.1}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.6.1}词对齐及对称化}{54}{subsection.1.6.1}%
+\contentsline {subsection}{\numberline {A.3.2}评价工具}{422}{subsection.1.A.3.2}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.6.2}Deficiency}{55}{subsection.1.6.2}%
+\contentsline {chapter}{\numberline {B}附录B}{423}{Appendix.2.B}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.6.3}句子长度}{56}{subsection.1.6.3}%
+\contentsline {section}{\numberline {B.1}IBM模型3训练方法}{423}{section.2.B.1}
\defcounter {refsection}{0}\relax
-\contentsline {subsection}{\numberline {1.6.4}其他问题}{56}{subsection.1.6.4}%
+\contentsline {section}{\numberline {B.2}IBM模型4训练方法}{425}{section.2.B.2}
\defcounter {refsection}{0}\relax
-\contentsline {section}{\numberline {1.7}小结及深入阅读}{57}{section.1.7}%
+\contentsline {section}{\numberline {B.3}IBM模型5训练方法}{427}{section.2.B.3}
\contentsfinish
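The hunk above is the regenerated table-of-contents (.toc) file: each `-`/`+` pair trades an old Chapter 1 entry for a newly numbered Chapter 7 or appendix entry, since the whole book is now built at once. For reading the hunk, a `\contentsline` row encodes the heading level, the numbered title, the page, and (when hyperref is loaded) a link anchor. A rough sketch of the field layout, shown against one entry copied from the new file:

% \contentsline{<level>}{\numberline{<number>}<title>}{<page>}{<anchor>}
%   <level>  : part / chapter / section / subsection / subsubsection
%   <anchor> : fourth argument, added by hyperref for PDF links
\contentsline {subsection}{\numberline {7.5.2}单语数据的使用}{400}{subsection.7.5.2}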
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% BASIC CONFIGURATIONS
...@@ -98,7 +105,7 @@
{\large
\noindent {\color{red} 在此感谢所有为本书做出贡献的人} \\
-\noindent 曹润柘、曾信、孟霞、单韦乔、姜雨帆、王子扬、刘辉、许诺、李北、刘继强、张哲旸、周书含、周涛、张裕浩、李炎洋、林野、陈贺轩、刘晓倩、牛蕊、田丰宁、杜权 \\
+\noindent 曹润柘、曾信、孟霞、单韦乔、姜雨帆、王子扬、刘辉、许诺、李北、刘继强、张哲旸、周书含、周涛、张裕浩、李炎洋、林野、陈贺轩、刘晓倩、牛蕊、田丰宁、杜权、张春良、王会珍、张俐、杨木润、宁义明、李洋、秦浩 \\
}
%----------------------------------------------------------------------------------------
...@@ -122,14 +129,14 @@
% CHAPTERS
%----------------------------------------------------------------------------------------
-%\include{Chapter1/chapter1}
+\include{Chapter1/chapter1}
-%\include{Chapter2/chapter2}
+\include{Chapter2/chapter2}
-%\include{Chapter3/chapter3}
+\include{Chapter3/chapter3}
\include{Chapter4/chapter4}
-%\include{Chapter5/chapter5}
+\include{Chapter5/chapter5}
-%\include{Chapter6/chapter6}
+\include{Chapter6/chapter6}
-%\include{Chapter7/chapter7}
+\include{Chapter7/chapter7}
-%\include{ChapterAppend/chapterappend}
+\include{ChapterAppend/chapterappend}
%----------------------------------------------------------------------------------------
......
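The hunk above switches every chapter from a commented-out stub to an active `\include`, so the whole book now builds in one pass. A hypothetical alternative for day-to-day editing, not part of this commit, would be to leave all `\include` lines active and select chapters in the preamble with the standard `\includeonly` mechanism, which skips the other chapters while keeping their page and section numbers from the saved .aux files:

% Hypothetical preamble line (file names as in the repository):
\includeonly{Chapter4/chapter4,Chapter7/chapter7} % build only these two chapters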
...@@ -273,10 +273,10 @@
\newtheorem{problem}{Problem}[chapter]
\newtheorem{exerciseT}{Example}[chapter]
\theoremstyle{blacknumex}
-\newtheorem{exampleT}{Example}[chapter]
+\newtheorem{exampleT}{实例}[chapter]
\theoremstyle{blacknumbox}
\newtheorem{vocabulary}{Vocabulary}[chapter]
-\newtheorem{definitionT}{Definition}[section]
+\newtheorem{definitionT}{定义}[section]
\newtheorem{corollaryT}[dummy]{Corollary}
\theoremstyle{ocrenum}
\newtheorem{proposition}[dummy]{Proposition}
......
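The two renamed theorem environments above give the book Chinese headings: definitionT is numbered within each section under the blacknumbox style, and exampleT is numbered within each chapter under the blacknumex style. A minimal usage sketch with placeholder bodies (the text is illustrative, not taken from the book):

\begin{definitionT} % rendered with the heading “定义”, numbered per section
  ... % placeholder definition text
\end{definitionT}

\begin{exampleT}    % rendered with the heading “实例”, numbered per chapter
  ... % placeholder example text
\end{exampleT}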