updates of section 5

28b2ae5b · xiaotong · bd69d653 · 28b2ae5b · 28b2ae5b · 28b2ae5b
Commit 28b2ae5b authored Apr 22, 2020 by xiaotong
--- a/Book/Chapter5/Figures/fig-back-propagation-hid.tex
+++ b/Book/Chapter5/Figures/fig-back-propagation-hid.tex
@@ -7,20 +7,20 @@
 \node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
 \node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
 \draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
-\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
+\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\scriptsize{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
-\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
+\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\scriptsize{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
 \draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);
 {
-\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{反向传播}};
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\scriptsize{反向传播}};
 }
 {
-\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{反向传播}};
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\scriptsize{反向传播}};
 }
 {
-\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{反向传播}};
+\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\scriptsize{反向传播}};
 }
 {
@@ -33,7 +33,7 @@
 }
 {
-\node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
+\node [anchor=south] (slabel) at (s.north) {$\frac{\partial L}{\partial \textbf{s}^{k}}$};
 }
 {

--- a/Book/Chapter5/Figures/fig-back-propagation-output1.tex
+++ b/Book/Chapter5/Figures/fig-back-propagation-output1.tex
@@ -7,8 +7,8 @@
 \draw [->] (s.east) -- (h2.west);
 \draw [->] (h2.east) -- (l.west);
-\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\tiny{求梯度{$\frac{\partial L}{\partial \textbf{h}^K} = ?$}}};
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\scriptsize{求梯度{$\frac{\partial L}{\partial \textbf{h}^K} = ?$}}};
-\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{求梯度{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\scriptsize{求梯度{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
 \draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
 \draw [-,very thick,red] ([yshift=0.5em]h2.north) -- ([yshift=1.5em]h2.north);
 \draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);

--- a/Book/Chapter5/Figures/fig-back-propagation-output2.tex
+++ b/Book/Chapter5/Figures/fig-back-propagation-output2.tex
@@ -2,19 +2,19 @@
 \begin{tikzpicture}
 \begin{scope}
 \node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
-\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
+\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=6.0em]h.east) {$\textbf{s}^{K}$};
 \draw [->] (h.east) -- (s.west);
-\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
+\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\scriptsize{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
 \node [anchor=south west] (slabel) at ([yshift=1em,xshift=0.3em]s.north) {\scriptsize{\red{\textbf{{已经得到：$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$}}}}};
 \draw [->,red] ([yshift=0.3em]slabel.south) .. controls +(south:0.5) and +(north:0.5) .. ([xshift=0.5em]s.north);
 {
-\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\tiny{{$\frac{\partial L}{\partial \textbf{w}^K} = ?$, $\frac{\partial L}{\partial \textbf{h}^{K-1}} = ?$}}};
+\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\scriptsize{{$\frac{\partial L}{\partial \textbf{w}^K} = ?$, $\frac{\partial L}{\partial \textbf{h}^{K-1}} = ?$}}};
 \draw [-,very thick,red] ([yshift=0.5em]h.north) -- ([yshift=1.5em]h.north);
 \draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
 }

--- a/Book/Chapter5/Figures/fig-bert.tex
+++ b/Book/Chapter5/Figures/fig-bert.tex
@@ -33,7 +33,7 @@
 \node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};
 \node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.5em]t1.west) {\tiny{TRM}};
-\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\tiny{: Transformer}};
+\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\scriptsize{: Transformer Block}};
 \draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm0.south);
 \draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.south);

--- a/Book/Chapter5/Figures/fig-code-back-propagation-1.tex
+++ b/Book/Chapter5/Figures/fig-code-back-propagation-1.tex
 %%%------------------------------------------------------------------------------------------------------------
 \begin{tcolorbox}
-[bicolor,sidebyside,width=12cm,righthand width=4cm,size=title,frame engine=empty,
+[bicolor,sidebyside,width=13cm,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
 \begin{tabbing}
@@ -14,16 +14,16 @@
 \texttt{} \\
 \texttt{CrossEntropyBackward(dh[4], y, gold);} \\
 \texttt{SoftmaxBackward(y, s[4], dh[4], ds[4]);}\\
-\texttt{MMul(h[3], {\tiny X\_TRANS}, ds[4], {\tiny X\_NOTRANS}, dw[4]);}\\
+\texttt{MMul(h[3], {\scriptsize X\_TRANS}, ds[4], {\scriptsize X\_NOTRANS}, dw[4]);}\\
-\texttt{MMul(ds[4], {\tiny X\_NOTRANS}, w[4], {\tiny X\_RANS}, dh[3]);}\\
+\texttt{MMul(ds[4], {\scriptsize X\_NOTRANS}, w[4], {\scriptsize X\_TRANS}, dh[3]);}\\
 \texttt{} \\
 \texttt{dh[2] = dh[3];}\\
 \texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);}\\
-\texttt{MMul(h[1], {\tiny X\_TRANS}, ds[2], {\tiny X\_NOTRANS}, dw[2]);}\\
+\texttt{MMul(h[1], {\scriptsize X\_TRANS}, ds[2], {\scriptsize X\_NOTRANS}, dw[2]);}\\
-\texttt{MMul(ds[2], {\tiny X\_NOTRANS}, w[2], {\tiny X\_TRANS}, dh[2]);}\\
+\texttt{MMul(ds[2], {\scriptsize X\_NOTRANS}, w[2], {\scriptsize X\_TRANS}, dh[2]);}\\
@@ -46,10 +46,10 @@
 \begin{tikzpicture}
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\scriptsize{x (input)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\tiny{h1 = Relu(x * w1)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\scriptsize{h1 = Relu(x * w1)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\tiny{h2 = Relu(h1 * w2)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\scriptsize{h2 = Relu(h1 * w2)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\tiny{h3 = h2 + h1}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\scriptsize{h3 = h2 + h1}};
 {\draw [->,thick] (h1.north) -- (h2.south);}
 {\draw [->,thick] (h2.north) -- (h3.south);}

--- a/Book/Chapter5/Figures/fig-code-back-propagation-2.tex
+++ b/Book/Chapter5/Figures/fig-code-back-propagation-2.tex
 %%%------------------------------------------------------------------------------------------------------------
 \begin{tcolorbox}
-[bicolor,sidebyside,width=12cm,righthand width=4cm,size=title,frame engine=empty,
+[bicolor,sidebyside,width=13cm,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
 \begin{tabbing}
@@ -14,16 +14,16 @@
 \texttt{} \\
 \texttt{CrossEntropyBackward(dh[4], y, gold);} \\
 \texttt{SoftmaxBackward(y, s[4], dh[4], ds[4]);}\\
-\texttt{MMul(h[3], {\tiny X\_TRANS}, ds[4], {\tiny X\_NOTRANS}, dw[4]);}\\
+\texttt{MMul(h[3], {\scriptsize X\_TRANS}, ds[4], {\scriptsize X\_NOTRANS}, dw[4]);}\\
-\texttt{MMul(ds[4], {\tiny X\_NOTRANS}, w[4], {\tiny X\_RANS}, dh[3]);}\\
+\texttt{MMul(ds[4], {\scriptsize X\_NOTRANS}, w[4], {\scriptsize X\_TRANS}, dh[3]);}\\
 \texttt{} \\
 \texttt{dh[2] = dh[3];}\\
 \texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);}\\
-\texttt{MMul(h[1], {\tiny X\_TRANS}, ds[2], {\tiny X\_NOTRANS}, dw[2]);}\\
+\texttt{MMul(h[1], {\scriptsize X\_TRANS}, ds[2], {\scriptsize X\_NOTRANS}, dw[2]);}\\
-\texttt{MMul(ds[2], {\tiny X\_NOTRANS}, w[2], {\tiny X\_TRANS}, dh[2]);}\\
+\texttt{MMul(ds[2], {\scriptsize X\_NOTRANS}, w[2], {\scriptsize X\_TRANS}, dh[2]);}\\
@@ -46,10 +46,10 @@
 \begin{tikzpicture}
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\scriptsize{x (input)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\tiny{h1 = Relu(x * w1)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\scriptsize{h1 = Relu(x * w1)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\tiny{h2 = Relu(h1 * w2)}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\scriptsize{h2 = Relu(h1 * w2)}};
-\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\tiny{h3 = h2 + h1}};
+\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\scriptsize{h3 = h2 + h1}};
 {\draw [->,thick] (h1.north) -- (h2.south);}
 {\draw [->,thick] (h2.north) -- (h3.south);}

--- a/Book/Chapter5/Figures/fig-forward-propagation-hid.tex
+++ b/Book/Chapter5/Figures/fig-forward-propagation-hid.tex
@@ -7,8 +7,8 @@
 \node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
 \node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
 \draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
-\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
+\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\scriptsize{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
-\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
+\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\scriptsize{$\textbf{h}^k = f^k(\textbf{s}^{k})$}};
 \draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);

--- a/Book/Chapter5/Figures/fig-gpt.tex
+++ b/Book/Chapter5/Figures/fig-gpt.tex
@@ -33,7 +33,7 @@
 \node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};
 \node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.5em]t1.west) {\tiny{TRM}};
-\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\tiny{: Transformer}};
+\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\scriptsize{: Transformer Block}};
 \draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm0.south);
 \draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.south);

--- a/Book/Chapter5/chapter5.tex
+++ b/Book/Chapter5/chapter5.tex
@@ -1683,31 +1683,49 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end{figure}
 %-------------------------------------------
-\parinterval  定义：
+\parinterval  下面是一些符号的定义：
-\parinterval  $ h_i^k $：第$ k $层第$ i $个神经元的输出。
+\begin{itemize}
+\item  $ h_i^k $：第$ k $层第$ i $个神经元的输出；
-\parinterval  $ \mathbf h^k $：第$ k $层的输出。若第$ k $层有$ n $个神经元，则$ \mathbf h^k=(h_1^k,h_2^k,\dots,h_n^k) $。
+\item  $ \mathbf h^k $：第$ k $层的输出。若第$ k $层有$ n $个神经元，则：
+       \begin{equation}
+       \mathbf h^k=(h_1^k,h_2^k,\dots,h_n^k)
+       \end{equation}
-\parinterval  $ w_{j,i}^k $：第$ k-1 $层神经元$ j $与第$ k $层神经元$ i $的连接权重。
+\item  $ w_{j,i}^k $：第$ k-1 $层神经元$ j $与第$ k $层神经元$ i $的连接权重；
-\parinterval  $ \mathbf w^k $：第$ k-1 $层与第$ k $层的连接权重。若第$ k-1 $层有$ m $个神经元，第$ k $层有$ n $个神经元，则$ \mathbf w^k = \begin{pmatrix} w_{1,1}^k & w_{1,2}^k & \dots & w_{1,n}^k\\w_{2,1}^k & \dots & \dots & \dots\\ \dots & \dots & \dots & \dots\\w_{m,1}^k & \dots & \dots & w_{m,n}^k\end{pmatrix} $。
+\item  $ \mathbf w^k $：第$ k-1 $层与第$ k $层的连接权重。若第$ k-1 $层有$ m $个神经元，第$ k $层有$ n $个神经元，则：
+       \begin{equation}
+       \mathbf w^k = \begin{pmatrix} w_{1,1}^k & w_{1,2}^k & \dots & w_{1,n}^k\\w_{2,1}^k & \dots & \dots & \dots\\ \dots & \dots & \dots & \dots\\w_{m,1}^k & \dots & \dots & w_{m,n}^k\end{pmatrix}
+       \end{equation}
-\parinterval  $ \mathbf h^K $：整个网络的输出。
+\item  $ \mathbf h^K $：整个网络的输出；
-\parinterval  $ \mathbf s^k $：第$ k $层的线性变换结果，$ \mathbf s^k=\mathbf h^{k-1}\mathbf w^k=\sum{h_j^{k-1}w_{j,i}^k}$。
+\item  $ \mathbf s^k $：第$ k $层的线性变换结果，有：
+       \begin{eqnarray}
+       \mathbf s^k & = & \mathbf h^{k-1}\mathbf w^k \nonumber \\
+                   & = & \sum_{j}{h_j^{k-1}w_{j,i}^k}
+       \end{eqnarray}
-\parinterval  $ f^k $：第$ k $层的激活函数，$ \mathbf h_k=f^k(\mathbf s^k)$。
+\item  $ f^k $：第$ k $层的激活函数，$ \mathbf h^k=f^k(\mathbf s^k)$。
+\end{itemize}
-\parinterval  在神经网络的第$ k $层，前向计算过程为：
+\vspace{0.5em}
+\parinterval  于是，在神经网络的第$ k $层，前向计算过程为：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
-\mathbf h^k&=&f^k(\mathbf s^k)\;\;=\;\;f^k(\mathbf h^{k-1}\mathbf w^k)\;\;=\;\;f^k(\sum_{j}{h_j^{k-1}w_{j,i}^k})
+\mathbf h^k & = & f^k(\mathbf s^k) \nonumber \\
+            & = & f^k(\mathbf h^{k-1}\mathbf w^k) \nonumber \\
+            & = & f^k(\sum_{j}{h_j^{k-1}w_{j,i}^k})
 \label{eqa1.46}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
 %--5.4.6.1输出层的反向传播---------------------
-\subsubsection{（一）输出层的反向传播}\index{Chapter5.4.6.1}
+\subsubsection{输出层的反向传播}\index{Chapter5.4.6.1}
 \parinterval  反向传播是由输出层开始计算梯度，之后逆向传播到每一层网络，直至到达输入层。这里首先讨论输出层的反向传播机制。输出层（即第$ K $层）可以被描述为：
 %公式--------------------------------------------------------------------
@@ -1736,7 +1754,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \vspace{0.5em}
 \item 第二步，获取$ \frac{\partial L}{\partial \mathbf s^K} $，即计算损失函数$ L $关于中间状态$ \mathbf s^K $的梯度，并将梯度向前传递；
 \vspace{0.5em}
-\item 第三步，获取$ \frac{\partial L}{\partial \mathbf h^{K-1}} $和$ \frac{\partial L}{\partial \mathbf w^K} $，即计算损失函数$ L $关于第$ K-1 $层输出结果$ \mathbf h^{K-1} $的梯度，并将梯度向前传递；同时计算损失函数$ L $关于参数$ \mathbf w^K $的梯度，并用于参数更新。
+\item 第三步，获取$ \frac{\partial L}{\partial \mathbf h^{K-1}} $和$ \frac{\partial L}{\partial \mathbf w^K} $，即计算损失函数$ L $关于第$ K-1 $层输出结果$ \mathbf h^{K-1} $的梯度，并将梯度向前传递；同时计算损失函数$ L $关于第$K$层参数$ \mathbf w^K $的梯度，并用于参数更新。
 \end{itemize}
 \vspace{0.5em}
@@ -1751,7 +1769,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end{figure}
 %-------------------------------------------
-\parinterval  在这一阶段，计算的目标是得到损失函数$ L $关于第$ K $层中间状态$ \mathbf s^K $的梯度，这里令$ {\pi}^K= \frac{\partial L}{\partial \mathbf s^K} $，利用链式法则有：
+\parinterval  在第一阶段，计算的目标是得到损失函数$ L $关于第$ K $层中间状态$ \mathbf s^K $的梯度，这里令$ {\pi}^K= \frac{\partial L}{\partial \mathbf s^K} $，利用链式法则有：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 {\pi}^K&=& \frac{\partial L}{\partial \mathbf s^K}\nonumber\\
@@ -1772,7 +1790,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end{spacing}
 \vspace{0.5em}
-\parinterval  这个过程可以得到$ \mathbf s^K $节点处的梯度$ {\pi}^K= \frac{\partial L}{\partial \mathbf s^K} $，在后续的过程中可以直接使用作为前一层提供的梯度计算结果，而不需要从$ \mathbf h^K $节点处重新计算。这也体现了自动微分与符号微分的差别，对于计算图的每一个阶段，并不需要得到完成的微分表达式，而是通过前一层提供的梯度，直接计算当前的梯度即可，这样避免了大量的重复计算。
+\parinterval  这个过程可以得到$ \mathbf s^K $节点处的梯度$ {\pi}^K= \frac{\partial L}{\partial \mathbf s^K} $，在后续的过程中可以直接使用其作为前一层提供的梯度计算结果，而不需要从$ \mathbf h^K $节点处重新计算。这也体现了自动微分与符号微分的差别，对于计算图的每一个阶段，并不需要得到完整的微分表达式，而是通过前一层提供的梯度，直接计算当前的梯度即可，这样避免了大量的重复计算。
 \parinterval  在得到$ {\pi}^K= \frac{\partial L}{\partial \mathbf s^K} $之后，下一步的目标是：1）计算损失函数$ L $相对于第$ K-1 $层与输出层之间连接权重$ \mathbf w^K $的梯度；2）计算损失函数$ L $相对于神经网络第$ K-1 $层输出结果$ \mathbf h^{K-1} $的梯度。这部分内容如图\ref{fig:back-propagation-output2}所示。
 %----------------------------------------------
@@ -1796,7 +1814,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \label{eqa1.50}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-其中${[\cdot]}^{\rm T}$表示转置操作\footnote{如果$ \mathbf h^{K-1} $是一个向量，$ {\left[\mathbf h^{K-1}\right]}^{\rm T} $表示向量的转置，比如，行向量变成列向量；如果$ \mathbf h^{K-1} $ 是一个高阶张量，$ {\left[\mathbf h^{K-1}\right]}^{\rm T} $表示沿着张量最后两个方向的转置}。
+其中${[\cdot]}^{\rm T}$表示转置操作\footnote{如果$ \mathbf h^{K-1} $是一个向量，$ {\left[\mathbf h^{K-1}\right]}^{\rm T} $表示向量的转置，比如，行向量变成列向量；如果$ \mathbf h^{K-1} $ 是一个高阶张量，$ {\left[\mathbf h^{K-1}\right]}^{\rm T} $表示沿着张量最后两个方向的转置。}。
 \vspace{0.5em}
 \item 计算$ \frac{\partial L}{\partial \mathbf h^{K-1}} $ ：与求解$ \frac{\partial L}{\partial \mathbf w^K} $类似，可以得到
 %公式--------------------------------------------------------------------
@@ -1809,7 +1827,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end{itemize}
 \vspace{0.5em}
 %--5.4.6.2隐藏层的反向传播---------------------
-\subsubsection{（二）隐藏层的反向传播}\index{Chapter5.4.6.2}
+\subsubsection{隐藏层的反向传播}\index{Chapter5.4.6.2}
 \parinterval  对于第$ k $个隐藏层，有：
 %公式--------------------------------------------------------------------
@@ -1819,7 +1837,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \label{eqa1.53}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\noindent  其中，$ \mathbf h^k $、$ \mathbf s^k $、$ \mathbf h^{k-1} $、$\mathbf w^k $和分别表示隐藏层的输出、中间状态、输入和参数矩阵。隐藏层的前向计算过程如图\ref{fig:forward-propagation-hid}所示，第$ k-1 $ 层神经元的输出$ \mathbf h^{k-1} $经过线性变换和激活函数后，将计算结果$ \mathbf h^k $向后一层传递。
+\noindent  其中，$ \mathbf h^k $、$ \mathbf s^k $、$ \mathbf h^{k-1} $、$\mathbf w^k $分别表示隐藏层的输出、中间状态、隐藏层的输入和参数矩阵。隐藏层的前向计算过程如图\ref{fig:forward-propagation-hid}所示，第$ k-1 $ 层神经元的输出$ \mathbf h^{k-1} $经过线性变换和激活函数后，将计算结果$ \mathbf h^k $向后一层传递。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -1872,9 +1890,9 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \parinterval  综合输出层和隐藏层的反向传播方法，可以得到神经网络中任意位置和任意参数的梯度信息。只需要根据网络的拓扑结构，逆向访问每一个节点，并执行上述反向计算过程。
 %--5.4.6.3程序实现---------------------
-\subsubsection{（三）程序实现}\index{Chapter5.4.6.3}
+\subsubsection{程序实现}\index{Chapter5.4.6.3}
-\parinterval  在了解了反向传播的原理之后，实现反向传播就变得非常容易了。实际上，现在主流的深度学习框架都支持自动微分。这里，为了进一步说明反向传播的过程，这里使用NiuTensor工具构建两个简单的实例。我们分别尝试手动编写反向传播代码和使用NiuTensor自带的自动微分模块。
+\parinterval  在了解了反向传播的原理之后，实现反向传播就变得非常容易了。实际上，现在主流的深度学习框架都支持自动微分。这里，为了进一步说明反向传播的过程，使用NiuTensor工具构建两个简单的实例，并分别尝试手动编写反向传播代码和使用NiuTensor自带的自动微分模块。
 \parinterval  图\ref{fig:code-back-propagation-1}展示了实现一个简单的神经网络的反向传播代码。这种反向传播的实现方式正是上一节内容的代码实现：按层实现自动微分并将梯度向前一层传播。
 %----------------------------------------------
@@ -1887,7 +1905,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end{figure}
 %-------------------------------------------
-\parinterval  此外，NiuTensor还提供了一种更简单的一步完成神经网络反向传播的实现方式。如图\ref{fig:code-back-propagation-2}所示，在完成神经网络的搭建后，无论前向计算过程是怎样的，直接利用Backward 函数就可以实现整个神经网络的反向传播，用户可以完全不用关心其求解过程。
+\parinterval  此外，NiuTensor还提供了一种更简单的一步完成神经网络反向传播的实现方式。如图\ref{fig:code-back-propagation-2}所示，在完成神经网络的搭建后，无论前向计算过程是怎样的，直接利用Backward 函数就可以实现整个神经网络的反向传播，系统开发人员可以完全不用关心其求解过程。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -1901,9 +1919,10 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 %--5.5神经语言模型-----------------------------------------
 \section{神经语言模型}\label{sec5:nlm}\index{Chapter5.5}
-\parinterval  神经网络给我们提供了一种工具，只要将问题的输入和输出定义好，就可以学习输入和输出之间的对应关系。显然，很多自然语言处理任务都可以用神经网络进行实现。比如，在机器翻译中，可以把输入的源语言句子和输出的目标语句子用神经网络建模；在文本分类中，可以把输入的文本内容和输出的类别标签进行神经网络建模，等等。
+\parinterval  神经网络给我们提供了一种工具，只要将问题的输入和输出定义好，就可以学习输入和输出之间的对应关系。显然，很多自然语言处理任务都可以用神经网络进行实现。比如，在机器翻译中，可以把输入的源语言句子和输出的目标语言句子用神经网络建模；在文本分类中，可以把输入的文本内容和输出的类别标签进行神经网络建模，等等。
 \parinterval  为了更好地理解神经网络和深度学习在自然语言处理中的应用，这里介绍一种基于神经网络的语言建模方法\ \dash \ {\small\sffamily\bfseries{神经语言模型}}（Neural Language Model）。可以说，神经语言模型是深度学习时代下自然语言处理的标志性成果，它所涉及的许多概念至今仍是研究的热点，比如：词嵌入、表示学习、预训练等。此外，神经语言模型也为机器翻译的建模提供了很好的思路。从某种意义上说，机器翻译的深度学习建模的很多灵感均来自神经语言模型，二者在一定程度上是统一的。
 %--5.5.1基于神经网络的语言建模---------------------
 \subsection{基于神经网络的语言建模}\index{Chapter5.5.1}
@@ -1914,7 +1933,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \label{eqa1.57}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\parinterval  由于$ {\rm P}(w_m|w_1\dots w_{m-1}) $需要建模$ m-1 $个词构成的历史信息，这个模型仍然很复杂。于是就有了基于局部历史的$ n-{\rm{gram}} $语言模型，即：
+\parinterval  由于$ {\rm P}(w_m|w_1\dots w_{m-1}) $需要建模$ m-1 $个词构成的历史信息，这个模型仍然很复杂。于是就有了基于局部历史的$n$-gram语言模型，即：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 {\rm P}(w_m|w_1\dots w_{m-1})&=&{\rm P}(w_m|w_{m-n+1}\dots w_{m-1})
@@ -1928,22 +1947,26 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \label{}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\parinterval  $ w_{m-n+1}\dots w_m $也被称作$ n-{\rm{gram}} $，即$ n $元语法单元。$ n-{\rm{gram}} $语言模型是一种典型的基于离散表示的模型。在这个模型中，所有的词都被看作是离散的符号。因此，不同单词之间是``完全''不同的。另一方面，语言现象是十分多样的，即使在很大的语料库上也无法得到所有$ n-{\rm{gram}} $的准确统计。甚至很多$ n-{\rm{gram}} $在训练数据中从未出现过。由于不同$ n-{\rm{gram}} $间没有建立直接的联系， $ n-{\rm{gram}} $语言模型往往面临数据稀疏的问题。比如，虽然在训练数据中见过``景色''这个词，但是测试数据中却出现了``风景''这个词，恰巧``风景''在训练数据中没有出现过。即使``风景''和``景色''表达的是相同的意思，$ n-{\rm{gram}} $语言模型仍然会把``风景''看作未登录词，赋予一个很低的概率值。
-\parinterval  上面这个问题的本质是$ n-{\rm{gram}} $语言模型对词使用了离散化表示，即每个单词都孤立的对应词表中的一个索引，词与词之间在语义上没有任何``重叠''。神经语言模型重新定义了这个问题。这里并不需要显性的通过统计离散的$ n-{\rm{gram}} $的频度，而是直接设计一个神经网络模型$ g(\cdot)$来估计单词生成的概率，
+\noindent 这里，$ w_{m-n+1}\dots w_m $也被称作$n$-gram，即$ n $元语法单元。$n$-gram语言模型是一种典型的基于离散表示的模型。在这个模型中，所有的词都被看作是离散的符号。因此，不同单词之间是``完全''不同的。另一方面，语言现象是十分多样的，即使在很大的语料库上也无法得到所有$n$-gram的准确统计。甚至很多$n$-gram在训练数据中从未出现过。由于不同$n$-gram 间没有建立直接的联系， $n$-gram语言模型往往面临数据稀疏的问题。比如，虽然在训练数据中见过``景色''这个词，但是测试数据中却出现了``风景''这个词，恰巧``风景''在训练数据中没有出现过。即使``风景''和``景色''表达的是相同的意思，$n$-gram语言模型仍然会把``风景''看作未登录词，赋予一个很低的概率值。
+\parinterval  上面这个问题的本质是$n$-gram语言模型对词使用了离散化表示，即每个单词都孤立地对应词表中的一个索引，词与词之间在语义上没有任何``重叠''。神经语言模型重新定义了这个问题。这里并不需要显性地统计离散的$n$-gram的频度，而是直接设计一个神经网络模型$ g(\cdot)$来估计单词生成的概率，
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 {\rm P}(w_m|w_1\dots w_{m-1})&=&g(w_1\dots w_m)
 \label{eqa1.59}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\parinterval  $ g(w_1\dots w_m) $实际上是一个多层神经网络。与$ n-{\rm{gram}} $语言模型不同的是$ g(w_1\dots \\ w_m) $并不包含对$ w_1\dots w_m $的任何假设，比如，在神经网络模型中，单词不再是离散的符号，而是连续空间上的点。这样两个单词之间也不再是简单的非0即1的关系，而是具有可计算的距离。此外，由于没有对$ w_1\dots w_m $进行任何结构性的假设，神经语言模型对问题进行端到端学习。通过设计不同的神经网络$ g(\cdot)$，可以从不同的角度``定义''序列的表示问题。当然，这么说可能还有一些抽象，下面就一起看看神经语言模型究竟是什么样子的。
+\parinterval  $ g(w_1\dots w_m) $实际上是一个多层神经网络。与$n$-gram语言模型不同的是$ g(w_1\dots \\ w_m) $并不包含对$ w_1\dots w_m $的任何假设，比如，在神经网络模型中，单词不再是离散的符号，而是连续空间上的点。这样两个单词之间也不再是简单的非0即1的关系，而是具有可计算的距离。此外，由于没有对$ w_1\dots w_m $进行任何结构性的假设，神经语言模型对问题进行端到端学习。通过设计不同的神经网络$ g(\cdot)$，可以从不同的角度``定义''序列的表示问题。当然，这么说可能还有一些抽象，下面就一起看看神经语言模型究竟是什么样子的。
 %--5.5.1.1基于前馈神经网络的语言模型---------------------
-\subsubsection{（一）基于前馈神经网络的语言模型}\index{Chapter5.5.1.1}
+\subsubsection{基于前馈神经网络的语言模型}\index{Chapter5.5.1.1}
-\parinterval  最具代表性的神经语言模型是Bengio等人在2013年提出的{\small\sffamily\bfseries{前馈神经网络语言模型}}（Feed-forward Neural Network Language Model，简称FNNLM）。这种语言模型的目标是计算$ {\rm P}(w_m|w_{m-n+1}\dots w_{m-1}) $，之后将多个$ n-{\rm{gram}} $的概率相乘得到整个序列的概率\cite{bengio2003neural}，即$ {\rm P}(w_1w_2\dots w_m)={\rm P}(w_1)P(w_2|w_1)\dots {\rm P}(w_m|w_{m-n+1}\dots w_{m-1}) $。
+\parinterval  最具代表性的神经语言模型是Bengio等人提出的{\small\sffamily\bfseries{前馈神经网络语言模型}}（Feed-forward Neural Network Language Model，简称FNNLM）。这种语言模型的目标是用神经网络计算$ {\rm P}(w_m|w_{m-n+1}\dots w_{m-1}) $，之后将多个$n$-gram的概率相乘得到整个序列的概率\cite{bengio2003neural}。
-\parinterval  为了有一个直观的认识，这里以$ 4-{\rm{gram}} $的FNNLM语言模型为例，即根据前三个单词$ w_{i-3} $、 $ w_{i-2} $ 、$ w_{i-1} $预测当前单词$ w_i $的概率。如图1所示，$ w_{i-3} $、 $ w_{i-2} $ 、$ w_{i-1} $为该语言模型的输入（绿色方框），输入为每个词的One-hot向量表示（维度大小与词表大小一致），每个One-hot向量仅一维为1，其余为0，比如：$ (0,0,1,\dots,0) $表示词表中第三个单词。之后把One-hot向量乘以一个矩阵$ \mathbf C $得到单词的分布式表示（紫色方框）。令$ w_i $为第$ i $个词的One-hot表示，$ \mathbf e_i $为第$ i $个词的分布式表示，有：
+\parinterval  为了有一个直观的认识，这里以4-gram的FNNLM语言模型为例，即根据前三个单词$ w_{i-3} $、 $ w_{i-2} $ 、$ w_{i-1} $预测当前单词$ w_i $的概率。如图\ref{fig:4-gram}所示，$ w_{i-3} $、 $ w_{i-2} $ 、$ w_{i-1} $为该语言模型的输入（绿色方框），输入为每个词的One-hot向量表示（维度大小与词表大小一致），每个One-hot向量仅一维为1，其余为0，比如：$ (0,0,1,\dots,0) $表示词表中第三个单词。之后把One-hot向量乘以一个矩阵$ \mathbf C $得到单词的分布式表示（紫色方框）。令$ w_i $为第$ i $个词的One-hot表示，$ \mathbf e_i $为第$ i $个词的分布式表示，有：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 \mathbf e_i&=&w_i\mathbf C
@@ -1960,7 +1983,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 %公式--------------------------------------------------------------------
 \parinterval  输出$ \mathbf y $是词表上的一个分布，通过$ w_i $可以索引到相应的概率$ {\rm P}(w_i|w_{i-1},w_{i-2},w_{i-3}) $。\\$ \mathbf U $、$ \mathbf H $和$ \mathbf d $是模型的参数。从结构上看，FNNLM主要有三层：1）词的分布式表示层，即把输入的离散的单词变为分布式表示对应的实数向量；2）隐藏层，即将得到的词的分布式表示进行线性和非线性变换；3）输出层，根据隐藏层的输出预测单词的概率分布。这三层堆叠在一起构成了整个网络，而且也可以加入从词的分布式表示直接到输出层的连接（红色虚线箭头）。
-\parinterval  值得注意的是，在FNNLM中，单词已经不再是一个孤立的符号串，而是被表示为一个实数向量。这样，两个单词之间可以通过向量计算某种相似度或距离。这导致相似的单词会具有相似的分布，进而缓解$ n-{\rm{gram}} $语言模型的问题\ \dash \ 明明意思很相近的两个词但是概率估计的结果差异性却很大。
+\parinterval  值得注意的是，在FNNLM中，单词已经不再是一个孤立的符号串，而是被表示为一个实数向量。这样，两个单词之间可以通过向量计算某种相似度或距离。这导致相似的单词会具有相似的分布，进而缓解$n$-gram语言模型的问题\ \dash \ 明明意思很相近的两个词但是概率估计的结果差异性却很大。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -1973,17 +1996,17 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \parinterval  在FNNLM中，所有的参数、输入、输出都是连续变量，因此FNNLM也是典型的一个连续空间模型。通过使用交叉熵等损失函数，FNNLM很容易进行优化。比如，可以使用梯度下降方法对FNNLM的模型参数进行训练。
-\parinterval  FNNLM的实现也非常简单，图\ref{fig:code-FNNLM}展示了基于NiuTensor的FNNLM的部分代码。需要注意的是，在程序实现时， Tanh函数一般会用hardTanh函数代替。因为 Tanh函数中的指数运算容易导致溢出：
+\parinterval  FNNLM的实现也非常简单，图\ref{fig:code-FNNLM}展示了基于NiuTensor的FNNLM的部分代码。需要注意的是，在程序实现时， Tanh函数一般会用HardTanh函数代替。因为 Tanh函数中的指数运算容易导致溢出：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
 {\rm{Tanh}}(x)&=&\frac{{\rm{exp}}(x)-{\rm{exp}}(-x)}{{\rm{exp}}(x)+{\rm{exp}}(-x)}
 \label{}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
-\noindent 而hardTanh函数不存在这个问题，因此具有数值计算的稳定性。hardTanh函数表达式如下：
+\noindent 而HardTanh函数不存在这个问题，因此具有数值计算的稳定性。HardTanh函数表达式如下：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
-{\rm{hardTanh}}(x)&=&\begin{cases} -1 & x<-1\\x & -1\leqslant x\leqslant 1\\1 & x>1\end{cases}
+{\rm{HardTanh}}(x)&=&\begin{cases} -1 & x<-1\\x & -1\leqslant x\leqslant 1\\1 & x>1\end{cases}
 \label{}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
@@ -1997,15 +2020,16 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end{figure}
 %-------------------------------------------
-\parinterval  虽然Bengio等人提出的FNNLM模型形式简单，却为处理自然语言提供了一个全新的视角。首先，该模型重新定义了``词是什么''\ \dash \ 它并非词典的一项，而是可以用一个连续实数向量进行表示的可计算的``量''。此外，由于$ n-{\rm{gram}} $不再是离散的符号序列，模型不需要记录$ n-{\rm{gram}} $，所以很好的缓解了上面所提到的数据稀疏问题，模型体积也大大减小。
+\parinterval  虽然FNNLM模型形式简单，却为处理自然语言提供了一个全新的视角。首先，该模型重新定义了``词是什么''\ \dash \ 它并非词典的一项，而是可以用一个连续实数向量进行表示的可计算的``量''。此外，由于$n$-gram不再是离散的符号序列，模型不需要记录$n$-gram，所以很好地缓解了上面所提到的数据稀疏问题，模型体积也大大减小。
+\parinterval  当然，FNNLM模型也引发后人的许多思考，比如：神经网络每一层都学到了什么？是词法、句法、还是一些其他知识？如何理解词的分布式表示？等等。在随后的内容中也会看到，随着近几年深度学习和自然语言处理的发展，部分问题已经得到了很好的解答，但是仍有许多问题需要进一步探索。
-\parinterval  当然，FNNLM这也引发后人的许多思考，比如：神经网络每一层都学到了什么？是词法、句法、还是一些其他知识？如何理解词的分布式表示？等等。在随后的内容中也会看到，随着近几年深度学习和自然语言处理的发展，部分问题已经得到了很好的解答，但是仍有许多问题需要进一步探索。
 %--5.5.1.2基于循环神经网络的语言模型---------------------
-\subsubsection{（二）基于循环神经网络的语言模型}\index{Chapter5.5.1.2}
+\subsubsection{基于循环神经网络的语言模型}\index{Chapter5.5.1.2}
-\parinterval  FNNLM模型固然有效，但是和传统的$ n-{\rm{gram}} $语言模型一样需要依赖有限上下文假设，也就是$ w_i $的生成概率只依赖于之前的$ n-1 $个单词。很自然的一个想法是引入更大范围的历史信息，这样可以扑捉单词间的长距离依赖。
+\parinterval  FNNLM模型固然有效，但是和传统的$n$-gram语言模型一样需要依赖有限上下文假设，也就是$ w_i $的生成概率只依赖于之前的$ n-1 $个单词。很自然的一个想法是引入更大范围的历史信息，这样可以捕捉单词间的长距离依赖。
-\parinterval  对于这个问题，可以通过{\small\sffamily\bfseries{循环神经网络}}（Recurrent Neural Network，或RNN）进行求解。通过引入循环单元这种特殊的结构，循环神经网络可以对任意长度的历史进行建模，因此在一定程度上解决了传统基于$ n-{\rm{gram}} $的有限历史的问题。正是基于这个优点，{\small\sffamily\bfseries{循环神经网络语言模型}}（RNNLM）应运而生\cite{mikolov2010recurrent}。
+\parinterval  对于这个问题，可以通过{\small\sffamily\bfseries{循环神经网络}}（Recurrent Neural Network，或RNN）进行求解。通过引入循环单元这种特殊的结构，循环神经网络可以对任意长度的历史进行建模，因此在一定程度上解决了传统$n$-gram语言模型有限历史的问题。正是基于这个优点，{\small\sffamily\bfseries{循环神经网络语言模型}}（RNNLM）应运而生\cite{mikolov2010recurrent}。
 \parinterval  在循环神经网络中，输入和输出都是一个序列，分别记为$ (\mathbf x_1,\dots,\mathbf x_m) $和$ (\mathbf y_1,\dots,\\ \mathbf y_m) $。它们都可以被看作是时序序列，其中每个时刻$ t $都对应一个输入$ \mathbf x_t $和输出$ \mathbf y_t $。循环神经网络的核心是{\small\sffamily\bfseries{循环单元}}（RNN Cell），它读入前一个时刻循环单元的输出和当前时刻的输入，生成当前时刻循环单元的输出。图\ref{fig:rnn-LM}展示了一个简单的循环单元结构，对于时刻$ t $，循环单元的输出被定义为：
 %公式--------------------------------------------------------------------
@@ -2017,7 +2041,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \noindent  其中，$ \mathbf h_t $表示$ t $时刻循环单元的输出，$ \mathbf h_{t-1} $表示$ t-1 $时刻循环单元的输出，$ \mathbf U $和$ \mathbf W $是模型的参数。可以看出，循环单元的结构其实很简单，只是一个对$ \mathbf h_{t-1} $和$ \mathbf x_t $的线性变换再加上一个Tanh函数。通过读入上一时刻的输出，当前时刻可以访问以前的历史信息。这个过程可以循环执行，这样就完成了对所有历史信息的建模。$ \mathbf h_t $可以被看作是序列在$ t $时刻的一种表示，也可以被看作是网络的一个隐藏层。进一步，$ \mathbf h_t $可以被送入输出层，得到$ t $时刻的输出：
 %公式--------------------------------------------------------------------
 \begin{eqnarray}
-\mathbf Y_t&=&{\rm{Softmax}}(\mathbf h_t\mathbf V)
+\mathbf y_t&=&{\rm{Softmax}}(\mathbf h_t\mathbf V)
 \label{eqa1.64}
 \end{eqnarray}
 %公式--------------------------------------------------------------------
@@ -2036,11 +2060,11 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \parinterval  RNNLM体现了一种``记忆''的能力。对于每一个时刻，循环单元都会保留一部分``以前''的信息，并加入``现在''的信息。从这个角度说，RNNLM本质上是一种记忆模型。在简单的循环单元结构的基础上，也有很多改进工作，如LSTM、GRU等模型，这部分内容将会在第六章进行介绍。
 %--5.5.1.3基于自注意力机制的语言模型---------------------
-\subsubsection{（三）基于自注意力机制的语言模型}\index{Chapter5.5.1.3}
+\subsubsection{基于自注意力机制的语言模型}\index{Chapter5.5.1.3}
-\parinterval  通过引入记忆历史的能力，RNNLM缓解了$ n-{\rm{gram}} $模型中有限上下文的局限性，但依旧存在一些问题。随着序列变长，不同单词之间信息传递路径变长，信息传递的效率变低。对于长序列，很难通过很多次的循环单元操作保留很长的历史信息。过长的序列还容易引起梯度消失和梯度爆炸问题（详见\ref{sec:5.4.4}节），增加模型训练的难度。
+\parinterval  通过引入记忆历史的能力，RNNLM缓解了$n$-gram模型中有限上下文的局限性，但依旧存在一些问题。随着序列变长，不同单词之间信息传递路径变长，信息传递的效率变低。对于长序列，很难通过很多次的循环单元操作保留很长的历史信息。过长的序列还容易引起梯度消失和梯度爆炸问题（详见\ref{sec:5.4.4}节），增加模型训练的难度。
-\parinterval  对于这个问题，研究者又提出了一种新的结构---自注意力机制（Self-Attention Mechanism）。自注意力是一种特殊的神经网络结构，它可以对序列上任意两个词的相互作用直接进行建模，这样也就避免了循环神经网络中随着距离变长信息传递步骤增多的缺陷。在自然语言处理领域，自注意力机制被成功的应用在机器翻译，形成了著名的Transformer模型\cite{NIPS2017_7181}。第六章会系统的介绍自注意力机制和Transformer模型。
+\parinterval  对于这个问题，研究者又提出了一种新的结构$\ \dash \ ${\small\bfnew{自注意力机制}}（Self-Attention Mechanism）。自注意力是一种特殊的神经网络结构，它可以对序列上任意两个词的相互作用直接进行建模，这样也就避免了循环神经网络中随着距离变长信息传递步骤增多的缺陷。在自然语言处理领域，自注意力机制被成功地应用在机器翻译中，形成了著名的Transformer模型\cite{NIPS2017_7181}。第六章会系统地介绍自注意力机制和Transformer模型。
 \parinterval  这里，先简单了解一下基于Transformer的语言模型结构（图\ref{fig:transformer-LM}）。与FNNLM\\和RNNLM一样，Transformer首先对输入单词进行分布式表示，同时加上每个位置的编码构成了整个模型的输入（蓝色方框）。之后，利用自注意力机制对输入的向量进行处理（绿色方框）。自注意力的结果会被送入一个前馈神经网络，之后再送给Softmax输出层（橙色方框）。
 %----------------------------------------------
@@ -2053,9 +2077,9 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \end{figure}
 %-------------------------------------------
-\parinterval  在传统的语言模型中，给定一个单词$ w_i $，其他单词对它的影响并没有显性的被建模。而在基于注意力机制的语言模型中，当前需要预测的单词会更加关注与该位置联系较大的单词。具体来说，注意力机制会计算位置$ i $与其他任意位置之间的相关度，称为{\small\sffamily\bfseries{注意力权重}}（Attention Weights），通过这个权重可以更多的使用与$ w_i $关联紧密的位置的信息。举个简单的例子，在``我 喜欢 学习 数学''这个句子中，需要预测``数学''这个词，通过注意力机制很可能知道``数学''与``学习''的联系更紧密，所以在预测过程中``学习''所占的权重会更大，预测结果会更加精确。
+\parinterval  在传统的语言模型中，给定一个单词$ w_i $，其他单词对它的影响并没有显性的被建模。而在基于注意力机制的语言模型中，当前需要预测的单词会更加关注与该位置联系较大的单词。具体来说，注意力机制会计算位置$ i $与其他任意位置之间的相关度，称为{\small\sffamily\bfseries{注意力权重}}（Attention Weight），通过这个权重可以更多的使用与$ w_i $关联紧密的位置的信息。举个简单的例子，在``我\ 喜欢\ 学习\ 数学''这个句子中，需要预测``数学''这个词，通过注意力机制很可能知道``数学''与``学习''的联系更紧密，所以在预测过程中``学习''所占的权重会更大，预测结果会更加精确。
 %--5.5.1.4语言模型的评价---------------------
-\subsubsection{（四）语言模型的评价}\index{Chapter5.5.1.4}
+\subsubsection{语言模型的评价}\index{Chapter5.5.1.4}
 \parinterval  在使用语言模型时，往往需要知道模型的质量。{\small\sffamily\bfseries{困惑度}}（Perplexity，PPL）是一种衡量语言模型的好坏的指标。对于一个真实的词序列$ w_1\dots w_m $，困惑度被定义为
 %公式--------------------------------------------------------------------
@@ -2069,8 +2093,9 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \subsection{单词表示模型}\index{Chapter5.5.2}
 \parinterval  在神经语言建模中，每个单词都会被表示为一个实数向量。这对应了一种单词的表示模型。下面就来看看传统的单词表示模型和这种基于实数向量的单词表示模型有何不同。
 %--5.5.2.1One-hot编码---------------------
-\subsubsection{（一）One-hot编码}\index{Chapter5.5.2.1}
+\subsubsection{One-hot编码}\index{Chapter5.5.2.1}
 \parinterval  {\small\sffamily\bfseries{One-hot编码}}（也称{\small\sffamily\bfseries{独热编码}}）是传统的单词表示方法。One-hot编码把单词表示为词汇表大小的0-1向量，其中只有该词所对应的那一项是1，而其余所有项都是零。举个简单的例子，假如有一个词典，里面包含10k个单词，并进行编号。那么每个单词都可以表示为一个10k维的One-hot向量，它仅在对应编号那个维度为1，其他维度都为0，如图\ref{fig:one-hot}所示。
 %----------------------------------------------
@@ -2084,8 +2109,9 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 %-------------------------------------------
 \parinterval  One-hot编码的优点是形式简单、易于计算，而且这种表示与词典具有很好的对应关系，因此每个编码都可以进行解释。但是，One-hot编码把单词都看作是相互正交的向量。这导致所有单词之间没有任何的相关性。只要是不同的单词，在One-hot编码下都是完全不同的东西。比如，大家可能会期望诸如``桌子''和``椅子''之类的词具有一些相似性，但是One-hot编码把它们看作相似度为0的两个单词。
 %--5.5.2.2分布式表示---------------------
-\subsubsection{（二）分布式表示}\index{Chapter5.5.2.2}
+\subsubsection{分布式表示}\index{Chapter5.5.2.2}
 \parinterval  神经语言模型中使用的是一种{\small\sffamily\bfseries{分布式表示}}（Distributed Representation）。在神经语言模型里，每个单词不再是完全正交的0-1向量，而是在多维实数空间中的一个点，具体表现为一个实数向量。很多时候，也会把单词的这种分布式表示叫做{\small\sffamily\bfseries{词嵌入}}（Word Embedding）。
@@ -2102,7 +2128,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \parinterval  那么，分布式表示中每个维度的含义是什么？可以把每一维度都理解为一种属性，比如一个人的身高、体重等。但是，神经网络模型更多的是把每个维度看作是单词的一种抽象``刻画''，是一种统计意义上的``语义''，而非简单的人工归纳的事物的一个个属性。使用这种连续空间的表示的好处在于，表示的内容（实数向量）可以进行计算和学习，因此可以通过模型训练得到更适用于自然语言处理的单词表示结果。
-\parinterval  为了方便理解，看一个简单的例子。假如现在有个``预测下一个单词''的任务：有这样一个句子``屋里 要 摆放 一个 \rule[-3pt]{1cm}{0.05em}''，其中下划线的部分表示需要预测的下一个单词。如果模型在训练数据中看到过类似于``摆放 一个 桌子''这样的片段，那么就可以很自信的预测出``桌子''。另一方面，我也知道，实际上与``桌子''相近的单词，如``椅子''，也是可以预测的单词的。但是，``椅子''恰巧没有出现在训练数据中，这时如果用One-hot编码来表示单词，显然无法把``椅子''填到下划线处的；而如果使用单词的分布式表示，很容易就知道 ``桌子''与``椅子''是相似的，因此预测`` 椅子''在一定程度上也是合理的。
+\parinterval  为了方便理解，看一个简单的例子。假如现在有个``预测下一个单词''的任务：有这样一个句子``屋里 要 摆放 一个 \rule[-3pt]{1cm}{0.05em}''，其中下划线的部分表示需要预测的下一个单词。如果模型在训练数据中看到过类似于``摆放 一个 桌子''这样的片段，那么就可以很自信的预测出``桌子''。另一方面，很容易知道，实际上与``桌子''相近的单词，如``椅子''，也是可以预测的单词。但是，``椅子''恰巧没有出现在训练数据中，这时如果用One-hot编码来表示单词，显然无法把``椅子''填到下划线处；而如果使用单词的分布式表示，很容易就知道``桌子''与``椅子''是相似的，因此预测``椅子''在一定程度上也是合理的。
 \begin{example}
 屋里 要 摆放 一个 \_\_\_\_\_ \hspace{0.5em} \quad \quad 预测下个词
@@ -2111,7 +2137,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 \qquad \qquad \quad 屋里 要 摆放 一个{ \blue{椅子}} \hspace{0.5em}\quad\quad 没见过，但是仍然是合理预测
 \end{example}
-\parinterval  关于单词的分布式表示还有一个经典的例子：通过词嵌入可以得到如下关系：``国王''=``女王''-``女人'' +``男人' '。从这个例子可以看出，词嵌入也具有一些代数性质，比如，词的分布式表示可以通过加、减等代数运算相互转换。图\ref{fig:word-graph}展示了词嵌入在一个二维平面上的投影，不难发现，含义相近的单词分布比较临近。
+\parinterval  关于单词的分布式表示还有一个经典的例子：通过词嵌入可以得到如下关系：$\textrm{``国王''}=\textrm{``女王''}-\textrm{``女人''} +\textrm{``男人''}$。从这个例子可以看出，词嵌入也具有一些代数性质，比如，词的分布式表示可以通过加、减等代数运算相互转换。图\ref{fig:word-graph}展示了词嵌入在一个二维平面上的投影，不难发现，含义相近的单词分布比较临近。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -2137,7 +2163,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
 %--5.5.3句子表示模型及预训练---------------------
 \subsection{句子表示模型及预训练}\index{Chapter5.5.3}
-\parinterval  目前，词嵌入已经成为诸多自然语言处理系统的标配，也衍生出很多有趣的研究法方向，甚至有人开玩笑的喊出``embedding everything''的口号。但是，冷静的看，词嵌入依旧存在一些问题：每个词都对应唯一的向量表示，那么对于一词多义现象，词义需要通过上下文进行区分，这时使用简单的词嵌入式是无法处理的。有一个著名的例子：
+\parinterval  目前，词嵌入已经成为诸多自然语言处理系统的标配，也衍生出很多有趣的研究方向，甚至有人开玩笑的喊出``embed everything''的口号。但是，冷静的看，词嵌入依旧存在一些问题：每个词都对应唯一的向量表示，那么对于一词多义现象，词义需要通过上下文进行区分，这时使用简单的词嵌入是无法处理的。有一个著名的例子：
 \vspace{0.3em}
 \begin{example}
 Jobs was the CEO of {\red{\underline{apple}}}.
@@ -2147,9 +2173,9 @@ Jobs was the CEO of {\red{\underline{apple}}}.
 \parinterval  这两句中``apple''的语义显然是不同的，第一个句子中的上下文``Jobs''和``CEO''可以帮助我们判断``apple''是一个公司名字，而不是水果。但是词嵌入只有一个结果，因此无法区分这两种情况。这个例子给我们一个启发：在一个句子中，不能孤立的看待单词，应同时考虑其上下文的信息。也就是需要一个能包含句子中上下文信息的表示模型。
 %--5.5.3.1简单的上下文表示模型---------------------
-\subsubsection{（一）简单的上下文表示模型}\index{Chapter5.5.3.1}
+\subsubsection{简单的上下文表示模型}\index{Chapter5.5.3.1}
-\parinterval  回忆一下神经语言模型的结构，它需要在每个位置预测单词生成的概率。这个概率是由若干层神经网络进行计算后，通过输出层得到的。实际上，在送入输出层之前，系统已经得到了这个位置的一个向量（隐藏层的输出），因此可以把它看作是含有一部分上下文信息的表示结果。以RNN为例，图\ref{fig:rnn-model}展示了一个由四个词组成的句子，这里使用了一个两层循环神经网络对其进行建模。可以看到，对于第三个位置，RNN已经积累了从第1个单词到第3个单词的信息，因此可以看作是单词1-3（``乔布斯 就职 于''）的一种表示；另一方面，第4个单词的词嵌入可以看作是``苹果''自身的表示。这样，可以把第3 个位置RNN的输出和第4个位置的词嵌入进行合并，就得到了第4个位置上含有上下文信息的表示结果。从另一个角度说，我们得到了``苹果''的一种新的表示，它不仅包含苹果这个词自身的信息，也包含它前文的信息。
+\parinterval  回忆一下神经语言模型的结构，它需要在每个位置预测单词生成的概率。这个概率是由若干层神经网络进行计算后，通过输出层得到的。实际上，在送入输出层之前，系统已经得到了这个位置的一个向量（隐藏层的输出），因此可以把它看作是含有一部分上下文信息的表示结果。以RNN为例，图\ref{fig:rnn-model}展示了一个由四个词组成的句子，这里使用了一个两层循环神经网络对其进行建模。可以看到，对于第3个位置，RNN已经积累了从第1个单词到第3个单词的信息，因此可以看作是单词1-3（``乔布斯\ 就职\ 于''）的一种表示；另一方面，第4个单词的词嵌入可以看作是``苹果''自身的表示。这样，可以把第3个位置RNN的输出和第4个位置的词嵌入进行合并，就得到了第4个位置上含有上下文信息的表示结果。从另一个角度说，这里得到了``苹果''的一种新的表示，它不仅包含``苹果''这个词自身的信息，也包含它前文的信息。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -2162,7 +2188,7 @@ Jobs was the CEO of {\red{\underline{apple}}}.
 \parinterval  在自然语言处理中，{\small\sffamily\bfseries{句子表示模型}}是指把输入的句子进行分布式表示。不过表示的形式不一定是一个单独的向量。现在广泛使用的句子表示模型可以被描述为：给定一个输入的句子$ \{ w_1,\dots ,w_m\} $，得到一个表示序列$ \{ \mathbf h_1,\dots ,\mathbf h_m\} $，其中$ \mathbf h_i $是句子在第$ i $个位置的表示结果。$ \{ \mathbf h_1,\dots ,\mathbf h_m\} $就被看作是{\small\sffamily\bfseries{句子的表示}}，它可以被送入下游模块。比如，在机器翻译任务中，可以用这种模型表示源语言句子，然后通过这种表示结果进行目标语译文的生成；在序列标注（如词性标注）任务中，可以对输入的句子进行表示，然后在这个表示之上构建标签预测模块。很多自然语言处理任务都可以用句子表示模型进行建模，因此句子的表示模型也是应用最广泛的深度学习模型之一。而学习这种表示的过程也被称作{\small\sffamily\bfseries{表示学习}}（Representation Learning）。
-\parinterval  句子表示模型有两种训练方法。最简单的方法是把它作为目标系统中的一个模块进行训练，比如把句子表示模型作为机器翻译系统的一部分。也就是，我们并不单独训练句子表示模型，而是把它作为一个内部模块放到其他系统中。另一种方法是把句子表示作为独立的模块，用外部系统进行训练，之后把训练好的表示模型放入目标系统中，再进行微调。这种方法构成了一种新的范式：预训练+微调（pre-training + fine-tuning）。图\ref{fig:model-training}对比了这两种不同的方法。
+\parinterval  句子表示模型有两种训练方法。最简单的方法是把它作为目标系统中的一个模块进行训练，比如把句子表示模型作为机器翻译系统的一部分。也就是，并不单独训练句子表示模型，而是把它作为一个内部模块放到其他系统中。另一种方法是把句子表示作为独立的模块，用外部系统进行训练，之后把训练好的表示模型放入目标系统中，再进行微调。这种方法构成了一种新的范式：预训练+微调（pre-training + fine-tuning）。图\ref{fig:model-training}对比了这两种不同的方法。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -2173,13 +2199,13 @@ Jobs was the CEO of {\red{\underline{apple}}}.
 \end{figure}
 %-------------------------------------------
-\parinterval  目前，句子表示模型的预训练方法在多项自然语言处理任务上取得了很好的效果。预训练模型也成为了当今自然语言处理中的热点方向，相关系统也在很多评测任务上刷榜。除了上面提到的简单的神经语言模型的方法，还有各式各样基于预训练的表示模型被提出。接下来，会简单介绍其中比较有代表性的三种模型---ELMO、 GPT 和BERT。
+\parinterval  目前，句子表示模型的预训练方法在多项自然语言处理任务上取得了很好的效果。预训练模型也成为了当今自然语言处理中的热点方向，相关系统也在很多评测任务上刷榜。除了上面提到的简单的神经语言模型的方法，还有各式各样基于预训练的表示模型被提出。接下来，会简单介绍其中比较有代表性的三种模型$\ \dash\ $ELMO、 GPT 和BERT。
 %--5.5.3.2ELMO模型---------------------
-\subsubsection{（二）ELMO模型}\index{Chapter5.5.3.2}
+\subsubsection{ELMO模型}\index{Chapter5.5.3.2}
 \parinterval  ELMO（Embedding from Language Models）掀起了基于语言模型的预训练的热潮\cite{peters2018deep}。ELMO的论文也获得了自然语言处理领域顶级会议NAACL2018的最佳论文。
-\parinterval  在ELMO中，作者认为词的表示应该能够包含丰富的句子结构信息，并且能够对多义词进行建模。而传统的词嵌入（例如word2vec）是上下文无关的，所以他们利用语言模型来获得一个上下文相关的预训练表示。EMLO基于双向LSTM语言模型\footnote{ LSTM（Long Short-Term Memory），即长短时记忆模型，是一种循环神经网络结构。}，由一个正向语言模型和一个反向语言模型构成，目标函数是最大化这两个方向语言模型的似然。简单来说，ELMO就是一个预训练好的双向语言模型，对于每个句子都可以生成相应的句子表示结果，这个结果会作为输入的特征被送入下游任务中。比如，ELMO在问答、文本蕴含、情感分析等多个任务中都表现出非常好的效果。
+\parinterval  在ELMO中，作者认为词的表示应该能够包含丰富的句子结构信息，并且能够对多义词进行建模。而传统的词嵌入（例如word2vec）是上下文无关的，所以他们利用语言模型来获得一个上下文相关的预训练表示。ELMO基于双向LSTM语言模型\footnote{ LSTM（Long Short-Term Memory），即长短时记忆模型，是一种循环神经网络结构。}，由一个正向语言模型和一个反向语言模型构成，目标函数是最大化这两个方向语言模型的似然（图\ref{fig:elmo}）。简单来说，ELMO就是一个预训练好的双向语言模型，对于每个句子都可以生成相应的句子表示结果，这个结果会作为输入的特征被送入下游任务中。比如，ELMO在问答、文本蕴含、情感分析等多个任务中都表现出非常好的效果。
 %----------------------------------------------
 % 图
 \begin{figure}[htp]
@@ -2191,7 +2217,7 @@ Jobs was the CEO of {\red{\underline{apple}}}.
 %-------------------------------------------
 %--5.5.3.3GPT模型---------------------
-\subsubsection{（三）GPT模型}\index{Chapter5.5.3.3}
+\subsubsection{GPT模型}\index{Chapter5.5.3.3}
 \parinterval  GPT（Generative Pre-Training）也是一种基于语言建模的句子表示模型\cite{radford2018improving}。该工作的贡献在于利用Transformer结构代替了LSTM。而且该模型基于Pre-training + Fine-tuning的框架，预训练的结果作为下游系统的句子表示模块的参数初始值，因此可以更好的适应目标任务。
@@ -2207,7 +2233,7 @@ Jobs was the CEO of {\red{\underline{apple}}}.
 %-------------------------------------------
 %--5.5.3.4BERT模型---------------------
-\subsubsection{（四）BERT模型}\index{Chapter5.5.3.4}
+\subsubsection{BERT模型}\index{Chapter5.5.3.4}
 \parinterval  BERT（Bidirectional Encoder Representations from Transformers）是另一个非常有代表性的基于预训练的句子表示模型\cite{devlin2018bert}。某种意义上，BERT把基于预训练的句子表示模型推向了新的高潮。BERT的论文也获得了NAACL2019最佳论文奖。
@@ -2233,7 +2259,7 @@ Jobs was the CEO of {\red{\underline{apple}}}.
 %-------------------------------------------
 %--5.5.3.5为什么要预训练？---------------------
-\subsubsection{（五）为什么要预训练？}\index{Chapter5.5.3.5}
+\subsubsection{为什么要预训练？}\index{Chapter5.5.3.5}
 \parinterval  基于预训练的句子表示模型确实给自然语言处理带来了新的思路。相比传统的基于目标任务的建模和训练方法，预训练有如下优势：

--- a/Book/mt-book-xelatex.idx
+++ b/Book/mt-book-xelatex.idx
-\indexentry{Chapter5.1|hyperpage}{10}
+\indexentry{Chapter1.1|hyperpage}{13}
-\indexentry{Chapter5.1.1|hyperpage}{10}
+\indexentry{Chapter1.2|hyperpage}{16}
-\indexentry{Chapter5.1.1.1|hyperpage}{10}
+\indexentry{Chapter1.3|hyperpage}{21}
-\indexentry{Chapter5.1.1.2|hyperpage}{11}
+\indexentry{Chapter1.4|hyperpage}{22}
-\indexentry{Chapter5.1.1.3|hyperpage}{12}
+\indexentry{Chapter1.4.1|hyperpage}{22}
-\indexentry{Chapter5.1.2|hyperpage}{13}
+\indexentry{Chapter1.4.2|hyperpage}{24}
-\indexentry{Chapter5.1.2.1|hyperpage}{13}
+\indexentry{Chapter1.4.3|hyperpage}{25}
-\indexentry{Chapter5.1.2.2|hyperpage}{14}
+\indexentry{Chapter1.4.4|hyperpage}{26}
-\indexentry{Chapter5.2|hyperpage}{14}
+\indexentry{Chapter1.4.5|hyperpage}{27}
-\indexentry{Chapter5.2.1|hyperpage}{14}
+\indexentry{Chapter1.5|hyperpage}{28}
-\indexentry{Chapter5.2.1.1|hyperpage}{15}
+\indexentry{Chapter1.5.1|hyperpage}{28}
-\indexentry{Chapter5.2.1.2|hyperpage}{16}
+\indexentry{Chapter1.5.2|hyperpage}{29}
-\indexentry{Chapter5.2.1.3|hyperpage}{16}
+\indexentry{Chapter1.5.2.1|hyperpage}{29}
-\indexentry{Chapter5.2.1.4|hyperpage}{17}
+\indexentry{Chapter1.5.2.2|hyperpage}{31}
-\indexentry{Chapter5.2.1.5|hyperpage}{18}
+\indexentry{Chapter1.5.2.3|hyperpage}{31}
-\indexentry{Chapter5.2.1.6|hyperpage}{19}
+\indexentry{Chapter1.6|hyperpage}{32}
-\indexentry{Chapter5.2.2|hyperpage}{20}
+\indexentry{Chapter1.7|hyperpage}{34}
-\indexentry{Chapter5.2.2.1|hyperpage}{21}
+\indexentry{Chapter1.7.1|hyperpage}{34}
-\indexentry{Chapter5.2.2.2|hyperpage}{22}
+\indexentry{Chapter1.7.1.1|hyperpage}{34}
-\indexentry{Chapter5.2.2.3|hyperpage}{23}
+\indexentry{Chapter1.7.1.2|hyperpage}{36}
-\indexentry{Chapter5.2.2.4|hyperpage}{23}
+\indexentry{Chapter1.7.2|hyperpage}{38}
-\indexentry{Chapter5.2.3|hyperpage}{24}
+\indexentry{Chapter1.8|hyperpage}{40}
-\indexentry{Chapter5.2.3.1|hyperpage}{24}
+\indexentry{Chapter2.1|hyperpage}{46}
-\indexentry{Chapter5.2.3.2|hyperpage}{26}
+\indexentry{Chapter2.2|hyperpage}{47}
-\indexentry{Chapter5.2.4|hyperpage}{27}
+\indexentry{Chapter2.2.1|hyperpage}{47}
-\indexentry{Chapter5.3|hyperpage}{31}
+\indexentry{Chapter2.2.2|hyperpage}{49}
-\indexentry{Chapter5.3.1|hyperpage}{32}
+\indexentry{Chapter2.2.3|hyperpage}{50}
-\indexentry{Chapter5.3.1.1|hyperpage}{32}
+\indexentry{Chapter2.2.4|hyperpage}{51}
-\indexentry{Chapter5.3.1.2|hyperpage}{34}
+\indexentry{Chapter2.2.5|hyperpage}{53}
-\indexentry{Chapter5.3.1.3|hyperpage}{35}
+\indexentry{Chapter2.2.5.1|hyperpage}{53}
-\indexentry{Chapter5.3.2|hyperpage}{36}
+\indexentry{Chapter2.2.5.2|hyperpage}{54}
-\indexentry{Chapter5.3.3|hyperpage}{36}
+\indexentry{Chapter2.2.5.3|hyperpage}{54}
-\indexentry{Chapter5.3.4|hyperpage}{38}
+\indexentry{Chapter2.3|hyperpage}{55}
-\indexentry{Chapter5.3.5|hyperpage}{41}
+\indexentry{Chapter2.3.1|hyperpage}{56}
-\indexentry{Chapter5.4|hyperpage}{42}
+\indexentry{Chapter2.3.2|hyperpage}{57}
-\indexentry{Chapter5.4.1|hyperpage}{43}
+\indexentry{Chapter2.3.2.1|hyperpage}{57}
-\indexentry{Chapter5.4.2|hyperpage}{44}
+\indexentry{Chapter2.3.2.2|hyperpage}{58}
-\indexentry{Chapter5.4.2.1|hyperpage}{44}
+\indexentry{Chapter2.3.2.3|hyperpage}{60}
-\indexentry{Chapter5.4.2.2|hyperpage}{46}
+\indexentry{Chapter2.4|hyperpage}{62}
-\indexentry{Chapter5.4.2.3|hyperpage}{49}
+\indexentry{Chapter2.4.1|hyperpage}{63}
-\indexentry{Chapter5.4.3|hyperpage}{52}
+\indexentry{Chapter2.4.2|hyperpage}{65}
-\indexentry{Chapter5.4.4|hyperpage}{54}
+\indexentry{Chapter2.4.2.1|hyperpage}{66}
-\indexentry{Chapter5.4.4.1|hyperpage}{54}
+\indexentry{Chapter2.4.2.2|hyperpage}{67}
-\indexentry{Chapter5.4.4.2|hyperpage}{55}
+\indexentry{Chapter2.4.2.3|hyperpage}{68}
-\indexentry{Chapter5.4.4.3|hyperpage}{56}
+\indexentry{Chapter2.5|hyperpage}{70}
-\indexentry{Chapter5.4.5|hyperpage}{57}
+\indexentry{Chapter2.5.1|hyperpage}{70}
-\indexentry{Chapter5.4.6|hyperpage}{58}
+\indexentry{Chapter2.5.2|hyperpage}{72}
-\indexentry{Chapter5.4.6.1|hyperpage}{59}
+\indexentry{Chapter2.5.3|hyperpage}{76}
-\indexentry{Chapter5.4.6.2|hyperpage}{61}
+\indexentry{Chapter2.6|hyperpage}{78}
-\indexentry{Chapter5.4.6.3|hyperpage}{62}
+\indexentry{Chapter3.1|hyperpage}{83}
-\indexentry{Chapter5.5|hyperpage}{64}
+\indexentry{Chapter3.2|hyperpage}{85}
-\indexentry{Chapter5.5.1|hyperpage}{64}
+\indexentry{Chapter3.2.1|hyperpage}{85}
-\indexentry{Chapter5.5.1.1|hyperpage}{65}
+\indexentry{Chapter3.2.1.1|hyperpage}{85}
-\indexentry{Chapter5.5.1.2|hyperpage}{67}
+\indexentry{Chapter3.2.1.2|hyperpage}{86}
-\indexentry{Chapter5.5.1.3|hyperpage}{68}
+\indexentry{Chapter3.2.1.3|hyperpage}{87}
-\indexentry{Chapter5.5.1.4|hyperpage}{69}
+\indexentry{Chapter3.2.2|hyperpage}{87}
-\indexentry{Chapter5.5.2|hyperpage}{70}
+\indexentry{Chapter3.2.3|hyperpage}{88}
-\indexentry{Chapter5.5.2.1|hyperpage}{70}
+\indexentry{Chapter3.2.3.1|hyperpage}{88}
-\indexentry{Chapter5.5.2.2|hyperpage}{70}
+\indexentry{Chapter3.2.3.2|hyperpage}{88}
-\indexentry{Chapter5.5.3|hyperpage}{72}
+\indexentry{Chapter3.2.3.3|hyperpage}{90}
-\indexentry{Chapter5.5.3.1|hyperpage}{72}
+\indexentry{Chapter3.2.4|hyperpage}{91}
-\indexentry{Chapter5.5.3.2|hyperpage}{74}
+\indexentry{Chapter3.2.4.1|hyperpage}{91}
-\indexentry{Chapter5.5.3.3|hyperpage}{74}
+\indexentry{Chapter3.2.4.2|hyperpage}{93}
-\indexentry{Chapter5.5.3.4|hyperpage}{75}
+\indexentry{Chapter3.2.5|hyperpage}{95}
-\indexentry{Chapter5.5.3.5|hyperpage}{76}
+\indexentry{Chapter3.3|hyperpage}{98}
-\indexentry{Chapter5.6|hyperpage}{76}
+\indexentry{Chapter3.3.1|hyperpage}{98}
+\indexentry{Chapter3.3.2|hyperpage}{100}
+\indexentry{Chapter3.3.2.1|hyperpage}{101}
+\indexentry{Chapter3.3.2.2|hyperpage}{101}
+\indexentry{Chapter3.3.2.3|hyperpage}{103}
+\indexentry{Chapter3.4|hyperpage}{104}
+\indexentry{Chapter3.4.1|hyperpage}{104}
+\indexentry{Chapter3.4.2|hyperpage}{106}
+\indexentry{Chapter3.4.3|hyperpage}{107}
+\indexentry{Chapter3.4.4|hyperpage}{108}
+\indexentry{Chapter3.4.4.1|hyperpage}{108}
+\indexentry{Chapter3.4.4.2|hyperpage}{109}
+\indexentry{Chapter3.5|hyperpage}{115}
+\indexentry{Chapter3.5.1|hyperpage}{115}
+\indexentry{Chapter3.5.2|hyperpage}{118}
+\indexentry{Chapter3.5.3|hyperpage}{119}
+\indexentry{Chapter3.5.4|hyperpage}{121}
+\indexentry{Chapter3.5.5|hyperpage}{122}
+\indexentry{Chapter3.5.5|hyperpage}{125}
+\indexentry{Chapter3.6|hyperpage}{125}
+\indexentry{Chapter3.6.1|hyperpage}{125}
+\indexentry{Chapter3.6.2|hyperpage}{126}
+\indexentry{Chapter3.6.4|hyperpage}{127}
+\indexentry{Chapter3.6.5|hyperpage}{128}
+\indexentry{Chapter3.7|hyperpage}{128}
+\indexentry{Chapter4.1|hyperpage}{131}
+\indexentry{Chapter4.1.1|hyperpage}{132}
+\indexentry{Chapter4.1.2|hyperpage}{134}
+\indexentry{Chapter4.2|hyperpage}{136}
+\indexentry{Chapter4.2.1|hyperpage}{136}
+\indexentry{Chapter4.2.2|hyperpage}{139}
+\indexentry{Chapter4.2.2.1|hyperpage}{139}
+\indexentry{Chapter4.2.2.2|hyperpage}{140}
+\indexentry{Chapter4.2.2.3|hyperpage}{141}
+\indexentry{Chapter4.2.3|hyperpage}{142}
+\indexentry{Chapter4.2.3.1|hyperpage}{142}
+\indexentry{Chapter4.2.3.2|hyperpage}{143}
+\indexentry{Chapter4.2.3.3|hyperpage}{144}
+\indexentry{Chapter4.2.4|hyperpage}{146}
+\indexentry{Chapter4.2.4.1|hyperpage}{146}
+\indexentry{Chapter4.2.4.2|hyperpage}{147}
+\indexentry{Chapter4.2.4.3|hyperpage}{148}
+\indexentry{Chapter4.2.5|hyperpage}{149}
+\indexentry{Chapter4.2.6|hyperpage}{149}
+\indexentry{Chapter4.2.7|hyperpage}{153}
+\indexentry{Chapter4.2.7.1|hyperpage}{154}
+\indexentry{Chapter4.2.7.2|hyperpage}{154}
+\indexentry{Chapter4.2.7.3|hyperpage}{155}
+\indexentry{Chapter4.2.7.4|hyperpage}{156}
+\indexentry{Chapter4.3|hyperpage}{157}
+\indexentry{Chapter4.3.1|hyperpage}{159}
+\indexentry{Chapter4.3.1.1|hyperpage}{160}
+\indexentry{Chapter4.3.1.2|hyperpage}{161}
+\indexentry{Chapter4.3.1.3|hyperpage}{162}
+\indexentry{Chapter4.3.1.4|hyperpage}{163}
+\indexentry{Chapter4.3.2|hyperpage}{163}
+\indexentry{Chapter4.3.3|hyperpage}{165}
+\indexentry{Chapter4.3.4|hyperpage}{166}
+\indexentry{Chapter4.3.5|hyperpage}{169}
+\indexentry{Chapter4.4|hyperpage}{172}
+\indexentry{Chapter4.4.1|hyperpage}{173}
+\indexentry{Chapter4.4.2|hyperpage}{176}
+\indexentry{Chapter4.4.2.1|hyperpage}{177}
+\indexentry{Chapter4.4.2.2|hyperpage}{178}
+\indexentry{Chapter4.4.2.3|hyperpage}{180}
+\indexentry{Chapter4.4.3|hyperpage}{181}
+\indexentry{Chapter4.4.3.1|hyperpage}{182}
+\indexentry{Chapter4.4.3.2|hyperpage}{186}
+\indexentry{Chapter4.4.3.3|hyperpage}{186}
+\indexentry{Chapter4.4.3.4|hyperpage}{187}
+\indexentry{Chapter4.4.3.5|hyperpage}{188}
+\indexentry{Chapter4.4.4|hyperpage}{189}
+\indexentry{Chapter4.4.4.1|hyperpage}{190}
+\indexentry{Chapter4.4.4.2|hyperpage}{191}
+\indexentry{Chapter4.4.5|hyperpage}{193}
+\indexentry{Chapter4.4.5|hyperpage}{194}
+\indexentry{Chapter4.4.7|hyperpage}{196}
+\indexentry{Chapter4.4.7.1|hyperpage}{197}
+\indexentry{Chapter4.4.7.2|hyperpage}{198}
+\indexentry{Chapter4.5|hyperpage}{200}
+\indexentry{Chapter5.1|hyperpage}{206}
+\indexentry{Chapter5.1.1|hyperpage}{206}
+\indexentry{Chapter5.1.1.1|hyperpage}{206}
+\indexentry{Chapter5.1.1.2|hyperpage}{207}
+\indexentry{Chapter5.1.1.3|hyperpage}{208}
+\indexentry{Chapter5.1.2|hyperpage}{209}
+\indexentry{Chapter5.1.2.1|hyperpage}{209}
+\indexentry{Chapter5.1.2.2|hyperpage}{210}
+\indexentry{Chapter5.2|hyperpage}{210}
+\indexentry{Chapter5.2.1|hyperpage}{210}
+\indexentry{Chapter5.2.1.1|hyperpage}{211}
+\indexentry{Chapter5.2.1.2|hyperpage}{212}
+\indexentry{Chapter5.2.1.3|hyperpage}{212}
+\indexentry{Chapter5.2.1.4|hyperpage}{213}
+\indexentry{Chapter5.2.1.5|hyperpage}{214}
+\indexentry{Chapter5.2.1.6|hyperpage}{215}
+\indexentry{Chapter5.2.2|hyperpage}{216}
+\indexentry{Chapter5.2.2.1|hyperpage}{217}
+\indexentry{Chapter5.2.2.2|hyperpage}{218}
+\indexentry{Chapter5.2.2.3|hyperpage}{219}
+\indexentry{Chapter5.2.2.4|hyperpage}{219}
+\indexentry{Chapter5.2.3|hyperpage}{220}
+\indexentry{Chapter5.2.3.1|hyperpage}{220}
+\indexentry{Chapter5.2.3.2|hyperpage}{222}
+\indexentry{Chapter5.2.4|hyperpage}{223}
+\indexentry{Chapter5.3|hyperpage}{227}
+\indexentry{Chapter5.3.1|hyperpage}{228}
+\indexentry{Chapter5.3.1.1|hyperpage}{228}
+\indexentry{Chapter5.3.1.2|hyperpage}{230}
+\indexentry{Chapter5.3.1.3|hyperpage}{231}
+\indexentry{Chapter5.3.2|hyperpage}{232}
+\indexentry{Chapter5.3.3|hyperpage}{232}
+\indexentry{Chapter5.3.4|hyperpage}{234}
+\indexentry{Chapter5.3.5|hyperpage}{237}
+\indexentry{Chapter5.4|hyperpage}{238}
+\indexentry{Chapter5.4.1|hyperpage}{239}
+\indexentry{Chapter5.4.2|hyperpage}{240}
+\indexentry{Chapter5.4.2.1|hyperpage}{240}
+\indexentry{Chapter5.4.2.2|hyperpage}{242}
+\indexentry{Chapter5.4.2.3|hyperpage}{245}
+\indexentry{Chapter5.4.3|hyperpage}{248}
+\indexentry{Chapter5.4.4|hyperpage}{250}
+\indexentry{Chapter5.4.4.1|hyperpage}{250}
+\indexentry{Chapter5.4.4.2|hyperpage}{251}
+\indexentry{Chapter5.4.4.3|hyperpage}{252}
+\indexentry{Chapter5.4.5|hyperpage}{253}
+\indexentry{Chapter5.4.6|hyperpage}{254}
+\indexentry{Chapter5.4.6.1|hyperpage}{255}
+\indexentry{Chapter5.4.6.2|hyperpage}{257}
+\indexentry{Chapter5.4.6.3|hyperpage}{258}
+\indexentry{Chapter5.5|hyperpage}{259}
+\indexentry{Chapter5.5.1|hyperpage}{260}
+\indexentry{Chapter5.5.1.1|hyperpage}{261}
+\indexentry{Chapter5.5.1.2|hyperpage}{263}
+\indexentry{Chapter5.5.1.3|hyperpage}{265}
+\indexentry{Chapter5.5.1.4|hyperpage}{266}
+\indexentry{Chapter5.5.2|hyperpage}{266}
+\indexentry{Chapter5.5.2.1|hyperpage}{266}
+\indexentry{Chapter5.5.2.2|hyperpage}{267}
+\indexentry{Chapter5.5.3|hyperpage}{268}
+\indexentry{Chapter5.5.3.1|hyperpage}{269}
+\indexentry{Chapter5.5.3.2|hyperpage}{270}
+\indexentry{Chapter5.5.3.3|hyperpage}{270}
+\indexentry{Chapter5.5.3.4|hyperpage}{271}
+\indexentry{Chapter5.5.3.5|hyperpage}{272}
+\indexentry{Chapter5.6|hyperpage}{273}
+\indexentry{Chapter6.1|hyperpage}{275}
+\indexentry{Chapter6.1.1|hyperpage}{277}
+\indexentry{Chapter6.1.2|hyperpage}{279}
+\indexentry{Chapter6.1.3|hyperpage}{282}
+\indexentry{Chapter6.2|hyperpage}{284}
+\indexentry{Chapter6.2.1|hyperpage}{284}
+\indexentry{Chapter6.2.2|hyperpage}{285}
+\indexentry{Chapter6.2.3|hyperpage}{286}
+\indexentry{Chapter6.2.4|hyperpage}{287}
+\indexentry{Chapter6.3|hyperpage}{288}
+\indexentry{Chapter6.3.1|hyperpage}{290}
+\indexentry{Chapter6.3.2|hyperpage}{292}
+\indexentry{Chapter6.3.3|hyperpage}{296}
+\indexentry{Chapter6.3.3.1|hyperpage}{296}
+\indexentry{Chapter6.3.3.2|hyperpage}{296}
+\indexentry{Chapter6.3.3.3|hyperpage}{298}
--- a/Book/mt-book-xelatex.ptc
+++ b/Book/mt-book-xelatex.ptc
 \boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax 
 \babel@toc {english}{}
 \defcounter {refsection}{0}\relax 
-\contentsline {part}{\@mypartnumtocformat {I}{神经机器翻译}}{7}{part.1}%
+\contentsline {part}{\@mypartnumtocformat {I}{机器翻译基础}}{7}{part.1}%
 \ttl@starttoc {default@1}
 \defcounter {refsection}{0}\relax 
-\contentsline {chapter}{\numberline {1}人工神经网络和神经语言建模}{9}{chapter.1}%
+\contentsline {chapter}{\numberline {1}机器翻译简介}{9}{chapter.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {section}{\numberline {1.1}深度学习与人工神经网络}{10}{section.1.1}%
+\contentsline {section}{\numberline {1.1}机器翻译的概念}{9}{section.1.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.1.1}发展简史}{10}{subsection.1.1.1}%
+\contentsline {section}{\numberline {1.2}机器翻译简史}{12}{section.1.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{早期的人工神经网络和第一次寒冬}{10}{section*.2}%
+\contentsline {subsection}{\numberline {1.2.1}人工翻译}{12}{subsection.1.2.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{神经网络的第二次高潮和第二次寒冬}{11}{section*.3}%
+\contentsline {subsection}{\numberline {1.2.2}机器翻译的萌芽}{13}{subsection.1.2.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{深度学习和神经网络方法的崛起}{12}{section*.4}%
+\contentsline {subsection}{\numberline {1.2.3}机器翻译的受挫}{14}{subsection.1.2.3}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.1.2}为什么需要深度学习}{13}{subsection.1.1.2}%
+\contentsline {subsection}{\numberline {1.2.4}机器翻译的快速成长}{15}{subsection.1.2.4}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{端到端学习和表示学习}{13}{section*.6}%
+\contentsline {subsection}{\numberline {1.2.5}机器翻译的爆发}{16}{subsection.1.2.5}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{深度学习的效果}{14}{section*.8}%
+\contentsline {section}{\numberline {1.3}机器翻译现状}{17}{section.1.3}%
 \defcounter {refsection}{0}\relax 
-\contentsline {section}{\numberline {1.2}神经网络基础}{14}{section.1.2}%
+\contentsline {section}{\numberline {1.4}机器翻译方法}{18}{section.1.4}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.2.1}线性代数基础}{14}{subsection.1.2.1}%
+\contentsline {subsection}{\numberline {1.4.1}基于规则的机器翻译}{18}{subsection.1.4.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{标量、向量和矩阵}{15}{section*.10}%
+\contentsline {subsection}{\numberline {1.4.2}基于实例的机器翻译}{20}{subsection.1.4.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{矩阵的转置}{16}{section*.11}%
+\contentsline {subsection}{\numberline {1.4.3}统计机器翻译}{21}{subsection.1.4.3}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{矩阵加法和数乘}{16}{section*.12}%
+\contentsline {subsection}{\numberline {1.4.4}神经机器翻译}{22}{subsection.1.4.4}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{矩阵乘法和矩阵点乘}{17}{section*.13}%
+\contentsline {subsection}{\numberline {1.4.5}对比分析}{23}{subsection.1.4.5}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{线性映射}{18}{section*.14}%
+\contentsline {section}{\numberline {1.5}翻译质量评价}{24}{section.1.5}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{范数}{19}{section*.15}%
+\contentsline {subsection}{\numberline {1.5.1}人工评价}{24}{subsection.1.5.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.2.2}人工神经元和感知机}{20}{subsection.1.2.2}%
+\contentsline {subsection}{\numberline {1.5.2}自动评价}{25}{subsection.1.5.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{感知机\ \raisebox {0.5mm}{------}\ 最简单的人工神经元模型}{21}{section*.18}%
+\contentsline {subsubsection}{BLEU}{25}{section*.15}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{神经元内部权重}{22}{section*.21}%
+\contentsline {subsubsection}{TER}{27}{section*.16}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{神经元的输入\ \raisebox {0.5mm}{------}\ 离散 vs 连续}{23}{section*.23}%
+\contentsline {subsubsection}{基于检测点的评价}{27}{section*.17}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{神经元内部的参数学习}{23}{section*.25}%
+\contentsline {section}{\numberline {1.6}机器翻译应用}{28}{section.1.6}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.2.3}多层神经网络}{24}{subsection.1.2.3}%
+\contentsline {section}{\numberline {1.7}开源项目与评测}{30}{section.1.7}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{线性变换和激活函数}{24}{section*.27}%
+\contentsline {subsection}{\numberline {1.7.1}开源机器翻译系统}{30}{subsection.1.7.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{单层神经网络$\rightarrow $多层神经网络}{26}{section*.34}%
+\contentsline {subsubsection}{统计机器翻译开源系统}{30}{section*.19}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.2.4}函数拟合能力}{27}{subsection.1.2.4}%
+\contentsline {subsubsection}{神经机器翻译开源系统}{32}{section*.20}%
 \defcounter {refsection}{0}\relax 
-\contentsline {section}{\numberline {1.3}神经网络的张量实现}{31}{section.1.3}%
+\contentsline {subsection}{\numberline {1.7.2}常用数据集及公开评测任务}{34}{subsection.1.7.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.3.1} 张量及其计算}{32}{subsection.1.3.1}%
+\contentsline {section}{\numberline {1.8}推荐学习资源}{36}{section.1.8}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{张量}{32}{section*.44}%
+\contentsline {chapter}{\numberline {2}词法、语法及统计建模基础}{41}{chapter.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{张量的矩阵乘法}{34}{section*.47}%
+\contentsline {section}{\numberline {2.1}问题概述 }{42}{section.2.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{张量的单元操作}{35}{section*.49}%
+\contentsline {section}{\numberline {2.2}概率论基础}{43}{section.2.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.3.2}张量的物理存储形式}{36}{subsection.1.3.2}%
+\contentsline {subsection}{\numberline {2.2.1}随机变量和概率}{43}{subsection.2.2.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.3.3}使用开源框架实现张量计算}{36}{subsection.1.3.3}%
+\contentsline {subsection}{\numberline {2.2.2}联合概率、条件概率和边缘概率}{45}{subsection.2.2.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.3.4}前向传播与计算图}{38}{subsection.1.3.4}%
+\contentsline {subsection}{\numberline {2.2.3}链式法则}{46}{subsection.2.2.3}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.3.5}神经网络实例}{41}{subsection.1.3.5}%
+\contentsline {subsection}{\numberline {2.2.4}贝叶斯法则}{47}{subsection.2.2.4}%
 \defcounter {refsection}{0}\relax 
-\contentsline {section}{\numberline {1.4}神经网络的参数训练}{42}{section.1.4}%
+\contentsline {subsection}{\numberline {2.2.5}KL距离和熵}{49}{subsection.2.2.5}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.4.1}损失函数}{43}{subsection.1.4.1}%
+\contentsline {subsubsection}{信息熵}{49}{section*.27}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.4.2}基于梯度的参数优化}{44}{subsection.1.4.2}%
+\contentsline {subsubsection}{KL距离}{50}{section*.29}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{梯度下降}{44}{section*.67}%
+\contentsline {subsubsection}{交叉熵}{50}{section*.30}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{梯度获取}{46}{section*.69}%
+\contentsline {section}{\numberline {2.3}中文分词}{51}{section.2.3}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{基于梯度的方法的变种和改进}{49}{section*.73}%
+\contentsline {subsection}{\numberline {2.3.1}基于词典的分词方法}{52}{subsection.2.3.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.4.3}参数更新的并行化策略}{52}{subsection.1.4.3}%
+\contentsline {subsection}{\numberline {2.3.2}基于统计的分词方法}{53}{subsection.2.3.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.4.4}梯度消失、梯度爆炸和稳定性训练}{54}{subsection.1.4.4}%
+\contentsline {subsubsection}{统计模型的学习与推断}{53}{section*.34}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{易于优化的激活函数}{54}{section*.76}%
+\contentsline {subsubsection}{掷骰子游戏}{54}{section*.36}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{梯度裁剪}{55}{section*.80}%
+\contentsline {subsubsection}{全概率分词方法}{56}{section*.40}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{稳定性训练}{56}{section*.81}%
+\contentsline {section}{\numberline {2.4}$n$-gram语言模型}{58}{section.2.4}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.4.5}过拟合}{57}{subsection.1.4.5}%
+\contentsline {subsection}{\numberline {2.4.1}建模}{59}{subsection.2.4.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.4.6}反向传播}{58}{subsection.1.4.6}%
+\contentsline {subsection}{\numberline {2.4.2}未登录词和平滑算法}{61}{subsection.2.4.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（一）输出层的反向传播}{59}{section*.84}%
+\contentsline {subsubsection}{加法平滑方法}{62}{section*.46}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（二）隐藏层的反向传播}{61}{section*.88}%
+\contentsline {subsubsection}{古德-图灵估计法}{63}{section*.48}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（三）程序实现}{62}{section*.91}%
+\contentsline {subsubsection}{Kneser-Ney平滑方法}{64}{section*.50}%
 \defcounter {refsection}{0}\relax 
-\contentsline {section}{\numberline {1.5}神经语言模型}{64}{section.1.5}%
+\contentsline {section}{\numberline {2.5}句法分析（短语结构分析）}{66}{section.2.5}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.5.1}基于神经网络的语言建模}{64}{subsection.1.5.1}%
+\contentsline {subsection}{\numberline {2.5.1}句子的句法树表示}{66}{subsection.2.5.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（一）基于前馈神经网络的语言模型}{65}{section*.94}%
+\contentsline {subsection}{\numberline {2.5.2}上下文无关文法}{68}{subsection.2.5.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（二）基于循环神经网络的语言模型}{67}{section*.97}%
+\contentsline {subsection}{\numberline {2.5.3}规则和推导的概率}{72}{subsection.2.5.3}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（三）基于自注意力机制的语言模型}{68}{section*.99}%
+\contentsline {section}{\numberline {2.6}小结及深入阅读}{74}{section.2.6}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（四）语言模型的评价}{69}{section*.101}%
+\contentsline {part}{\@mypartnumtocformat {II}{统计机器翻译}}{77}{part.2}%
+\ttl@stoptoc {default@1}
+\ttl@starttoc {default@2}
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.5.2}单词表示模型}{70}{subsection.1.5.2}%
+\contentsline {chapter}{\numberline {3}基于词的机器翻译模型}{79}{chapter.3}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（一）One-hot编码}{70}{section*.102}%
+\contentsline {section}{\numberline {3.1}什么是基于词的翻译模型}{79}{section.3.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（二）分布式表示}{70}{section*.104}%
+\contentsline {section}{\numberline {3.2}构建一个简单的机器翻译系统}{81}{section.3.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsection}{\numberline {1.5.3}句子表示模型及预训练}{72}{subsection.1.5.3}%
+\contentsline {subsection}{\numberline {3.2.1}如何进行翻译？}{81}{subsection.3.2.1}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（一）简单的上下文表示模型}{72}{section*.108}%
+\contentsline {subsubsection}{机器翻译流程}{82}{section*.63}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（二）ELMO模型}{74}{section*.111}%
+\contentsline {subsubsection}{人工翻译 vs. 机器翻译}{83}{section*.65}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（三）GPT模型}{74}{section*.113}%
+\contentsline {subsection}{\numberline {3.2.2}基本框架}{83}{subsection.3.2.2}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（四）BERT模型}{75}{section*.115}%
+\contentsline {subsection}{\numberline {3.2.3}单词翻译概率}{84}{subsection.3.2.3}%
 \defcounter {refsection}{0}\relax 
-\contentsline {subsubsection}{（五）为什么要预训练？}{76}{section*.117}%
+\contentsline {subsubsection}{什么是单词翻译概率？}{84}{section*.67}%
 \defcounter {refsection}{0}\relax 
-\contentsline {section}{\numberline {1.6}小结及深入阅读}{76}{section.1.6}%
+\contentsline {subsubsection}{如何从一个双语平行数据中学习？}{84}{section*.69}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{如何从大量的双语平行数据中学习？}{86}{section*.70}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.2.4}句子级翻译模型}{87}{subsection.3.2.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基础模型}{87}{section*.72}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{生成流畅的译文}{89}{section*.74}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.2.5}解码}{91}{subsection.3.2.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {3.3}基于词的翻译建模}{94}{section.3.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.3.1}噪声信道模型}{94}{subsection.3.3.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.3.2}统计机器翻译的三个基本问题}{96}{subsection.3.3.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{词对齐}{97}{section*.83}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于词对齐的翻译模型}{97}{section*.86}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于词对齐的翻译实例}{99}{section*.88}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {3.4}IBM模型1-2}{100}{section.3.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.4.1}IBM模型1}{100}{subsection.3.4.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.4.2}IBM模型2}{102}{subsection.3.4.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.4.3}解码及计算优化}{103}{subsection.3.4.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.4.4}训练}{104}{subsection.3.4.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{目标函数}{104}{section*.93}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{优化}{105}{section*.95}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {3.5}IBM模型3-5及隐马尔可夫模型}{111}{section.3.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.5.1}基于产出率的翻译模型}{111}{subsection.3.5.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.5.2}IBM 模型3}{114}{subsection.3.5.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.5.3}IBM 模型4}{115}{subsection.3.5.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.5.4}IBM 模型5}{117}{subsection.3.5.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.5.5}隐马尔可夫模型}{118}{subsection.3.5.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{隐马尔可夫模型}{119}{section*.107}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{词对齐模型}{120}{section*.109}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.5.6}解码和训练}{121}{subsection.3.5.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {3.6}问题分析}{121}{section.3.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.6.1}词对齐及对称化}{121}{subsection.3.6.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.6.2}Deficiency}{122}{subsection.3.6.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.6.3}句子长度}{123}{subsection.3.6.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {3.6.4}其他问题}{124}{subsection.3.6.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {3.7}小结及深入阅读}{124}{section.3.7}%
+\defcounter {refsection}{0}\relax 
+\contentsline {chapter}{\numberline {4}基于短语和句法的机器翻译模型}{127}{chapter.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {4.1}翻译中的结构信息}{127}{section.4.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.1.1}更大粒度的翻译单元}{128}{subsection.4.1.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.1.2}句子的结构信息}{130}{subsection.4.1.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {4.2}基于短语的翻译模型}{132}{section.4.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.2.1}机器翻译中的短语}{132}{subsection.4.2.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.2.2}数学建模及判别式模型}{135}{subsection.4.2.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于翻译推导的建模}{135}{section*.121}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{对数线性模型}{136}{section*.122}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{搭建模型的基本流程}{137}{section*.123}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.2.3}短语抽取}{138}{subsection.4.2.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{与词对齐一致的短语}{138}{section*.126}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{获取词对齐}{139}{section*.130}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{度量双语短语质量}{140}{section*.132}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.2.4}调序}{142}{subsection.4.2.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于距离的调序}{142}{section*.136}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于方向的调序}{143}{section*.138}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于分类的调序}{144}{section*.141}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.2.5}特征}{145}{subsection.4.2.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.2.6}最小错误率训练}{145}{subsection.4.2.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.2.7}栈解码}{149}{subsection.4.2.7}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{翻译候选匹配}{150}{section*.146}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{翻译假设扩展}{150}{section*.148}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{剪枝}{151}{section*.150}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{解码中的栈结构}{152}{section*.152}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {4.3}基于层次短语的模型}{153}{section.4.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.3.1}同步上下文无关文法}{155}{subsection.4.3.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{文法定义}{156}{section*.157}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{推导}{157}{section*.158}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{胶水规则}{158}{section*.159}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{处理流程}{159}{section*.160}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.3.2}层次短语规则抽取}{159}{subsection.4.3.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.3.3}翻译模型及特征}{161}{subsection.4.3.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.3.4}CYK解码}{162}{subsection.4.3.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.3.5}立方剪枝}{165}{subsection.4.3.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {4.4}基于语言学句法的模型}{168}{section.4.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.4.1}基于句法的翻译模型分类}{169}{subsection.4.4.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.4.2}基于树结构的文法}{172}{subsection.4.4.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{树到树翻译规则}{173}{section*.176}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于树结构的翻译推导}{174}{section*.178}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{树到串翻译规则}{176}{section*.181}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.4.3}树到串翻译规则抽取}{177}{subsection.4.4.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{树的切割与最小规则}{178}{section*.183}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{空对齐处理}{182}{section*.189}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{组合规则}{182}{section*.191}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{SPMT规则}{183}{section*.193}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{句法树二叉化}{184}{section*.195}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.4.4}树到树翻译规则抽取}{185}{subsection.4.4.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于节点对齐的规则抽取}{186}{section*.199}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于对齐矩阵的规则抽取}{187}{section*.202}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.4.5}句法翻译模型的特征}{189}{subsection.4.4.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.4.6}基于超图的推导空间表示}{190}{subsection.4.4.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {4.4.7}基于树的解码 vs 基于串的解码}{192}{subsection.4.4.7}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于树的解码}{193}{section*.209}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于串的解码}{194}{section*.212}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {4.5}小结及深入阅读}{196}{section.4.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {part}{\@mypartnumtocformat {III}{神经机器翻译}}{199}{part.3}%
+\ttl@stoptoc {default@2}
+\ttl@starttoc {default@3}
+\defcounter {refsection}{0}\relax 
+\contentsline {chapter}{\numberline {5}人工神经网络和神经语言建模}{201}{chapter.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {5.1}深度学习与人工神经网络}{202}{section.5.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.1.1}发展简史}{202}{subsection.5.1.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{早期的人工神经网络和第一次寒冬}{202}{section*.214}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{神经网络的第二次高潮和第二次寒冬}{203}{section*.215}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{深度学习和神经网络方法的崛起}{204}{section*.216}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.1.2}为什么需要深度学习}{205}{subsection.5.1.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{端到端学习和表示学习}{205}{section*.218}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{深度学习的效果}{206}{section*.220}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {5.2}神经网络基础}{206}{section.5.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.2.1}线性代数基础}{206}{subsection.5.2.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{标量、向量和矩阵}{207}{section*.222}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{矩阵的转置}{208}{section*.223}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{矩阵加法和数乘}{208}{section*.224}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{矩阵乘法和矩阵点乘}{209}{section*.225}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{线性映射}{210}{section*.226}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{范数}{211}{section*.227}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.2.2}人工神经元和感知机}{212}{subsection.5.2.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{感知机\ \raisebox {0.5mm}{------}\ 最简单的人工神经元模型}{213}{section*.230}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{神经元内部权重}{214}{section*.233}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{神经元的输入\ \raisebox {0.5mm}{------}\ 离散 vs 连续}{215}{section*.235}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{神经元内部的参数学习}{215}{section*.237}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.2.3}多层神经网络}{216}{subsection.5.2.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{线性变换和激活函数}{216}{section*.239}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{单层神经网络$\rightarrow $多层神经网络}{218}{section*.246}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.2.4}函数拟合能力}{219}{subsection.5.2.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {5.3}神经网络的张量实现}{223}{section.5.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.3.1}张量及其计算}{224}{subsection.5.3.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{张量}{224}{section*.256}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{张量的矩阵乘法}{226}{section*.259}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{张量的单元操作}{227}{section*.261}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.3.2}张量的物理存储形式}{228}{subsection.5.3.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.3.3}使用开源框架实现张量计算}{228}{subsection.5.3.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.3.4}前向传播与计算图}{230}{subsection.5.3.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.3.5}神经网络实例}{233}{subsection.5.3.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {5.4}神经网络的参数训练}{234}{section.5.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.4.1}损失函数}{235}{subsection.5.4.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.4.2}基于梯度的参数优化}{236}{subsection.5.4.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{梯度下降}{236}{section*.279}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{梯度获取}{238}{section*.281}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于梯度的方法的变种和改进}{241}{section*.285}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.4.3}参数更新的并行化策略}{244}{subsection.5.4.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.4.4}梯度消失、梯度爆炸和稳定性训练}{246}{subsection.5.4.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{易于优化的激活函数}{246}{section*.288}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{梯度裁剪}{247}{section*.292}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{稳定性训练}{248}{section*.293}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.4.5}过拟合}{249}{subsection.5.4.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.4.6}反向传播}{250}{subsection.5.4.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{输出层的反向传播}{251}{section*.296}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{隐藏层的反向传播}{253}{section*.300}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{程序实现}{254}{section*.303}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {5.5}神经语言模型}{255}{section.5.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.5.1}基于神经网络的语言建模}{256}{subsection.5.5.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于前馈神经网络的语言模型}{257}{section*.306}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于循环神经网络的语言模型}{259}{section*.309}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{基于自注意力机制的语言模型}{261}{section*.311}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{语言模型的评价}{262}{section*.313}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.5.2}单词表示模型}{262}{subsection.5.5.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{One-hot编码}{262}{section*.314}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{分布式表示}{263}{section*.316}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {5.5.3}句子表示模型及预训练}{264}{subsection.5.5.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{简单的上下文表示模型}{265}{section*.320}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{ELMO模型}{266}{section*.323}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{GPT模型}{266}{section*.325}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{BERT模型}{267}{section*.327}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{为什么要预训练？}{268}{section*.329}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {5.6}小结及深入阅读}{269}{section.5.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {chapter}{\numberline {6}神经机器翻译模型}{271}{chapter.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {6.1}神经机器翻译的发展简史}{271}{section.6.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.1.1}神经机器翻译的起源}{273}{subsection.6.1.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.1.2}神经机器翻译的品质}{275}{subsection.6.1.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.1.3}神经机器翻译的优势}{278}{subsection.6.1.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {6.2}编码器-解码器框架}{280}{section.6.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.2.1}框架结构}{280}{subsection.6.2.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.2.2}表示学习}{281}{subsection.6.2.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.2.3}简单的运行实例}{282}{subsection.6.2.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.2.4}机器翻译范式的对比}{283}{subsection.6.2.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {6.3}基于循环神经网络的翻译模型及注意力机制}{284}{section.6.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.3.1}建模}{286}{subsection.6.3.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.3.2}输入（词嵌入）及输出（Softmax）}{288}{subsection.6.3.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.3.3}循环神经网络结构}{292}{subsection.6.3.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{循环神经单元（RNN）}{292}{section*.351}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{长短时记忆网络（LSTM）}{292}{section*.352}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{门控循环单元（GRU）}{294}{section*.355}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{双向模型}{295}{section*.357}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{多层循环神经网络}{297}{section*.359}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.3.4}注意力机制}{297}{subsection.6.3.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{翻译中的注意力机制}{298}{section*.362}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{上下文向量的计算}{299}{section*.365}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{注意力机制的解读}{302}{section*.370}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.3.5}训练}{304}{subsection.6.3.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{损失函数}{305}{section*.373}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{参数初始化}{305}{section*.374}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{优化策略}{306}{section*.375}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{梯度裁剪}{306}{section*.377}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{学习率策略}{307}{section*.378}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{并行训练}{308}{section*.381}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.3.6}推断}{309}{subsection.6.3.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{贪婪搜索}{311}{section*.385}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{束搜索}{312}{section*.388}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{长度惩罚}{313}{section*.390}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.3.7}实例-GNMT}{314}{subsection.6.3.7}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {6.4}Transformer}{316}{section.6.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.1}自注意力模型}{317}{subsection.6.4.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.2}Transformer架构}{318}{subsection.6.4.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.3}位置编码}{320}{subsection.6.4.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.4}基于点乘的注意力机制}{322}{subsection.6.4.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.5}掩码操作}{324}{subsection.6.4.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.6}多头注意力}{326}{subsection.6.4.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.7}残差网络和层正则化}{327}{subsection.6.4.7}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.8}前馈全连接网络子层}{328}{subsection.6.4.8}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.9}训练}{329}{subsection.6.4.9}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.4.10}推断}{332}{subsection.6.4.10}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {6.5}序列到序列问题及应用}{332}{section.6.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.5.1}自动问答}{333}{subsection.6.5.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.5.2}自动文摘}{333}{subsection.6.5.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.5.3}文言文翻译}{333}{subsection.6.5.3}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.5.4}对联生成}{335}{subsection.6.5.4}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {6.5.5}古诗生成}{335}{subsection.6.5.5}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {6.6}小结及深入阅读}{335}{section.6.6}%
+\defcounter {refsection}{0}\relax 
+\contentsline {part}{\@mypartnumtocformat {IV}{附录}}{339}{part.4}%
+\ttl@stoptoc {default@3}
+\ttl@starttoc {default@4}
+\defcounter {refsection}{0}\relax 
+\contentsline {chapter}{\numberline {A}附录A}{341}{appendix.1.A}%
+\defcounter {refsection}{0}\relax 
+\contentsline {chapter}{\numberline {B}附录B}{343}{appendix.2.B}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {B.1}IBM模型3训练方法}{343}{section.2.B.1}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {B.2}IBM模型4训练方法}{345}{section.2.B.2}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {B.3}IBM模型5训练方法}{347}{section.2.B.3}%
 \contentsfinish 
--- a/Book/mt-book-xelatex.tex
+++ b/Book/mt-book-xelatex.tex
@@ -112,13 +112,13 @@
 %	CHAPTERS
 %----------------------------------------------------------------------------------------
-%\include{Chapter1/chapter1}
+\include{Chapter1/chapter1}
-%\include{Chapter2/chapter2}
+\include{Chapter2/chapter2}
-%\include{Chapter3/chapter3}
+\include{Chapter3/chapter3}
-%\include{Chapter4/chapter4}
+\include{Chapter4/chapter4}
 \include{Chapter5/chapter5}
-%\include{Chapter6/chapter6}
+\include{Chapter6/chapter6}
-%\include{ChapterAppend/chapterappend}
+\include{ChapterAppend/chapterappend}