合并分支 'caorunzhe' 到 'shanweiqiao'

Caorunzhe 查看合并请求 !117

合并分支 'caorunzhe' 到 'shanweiqiao'
Caorunzhe 查看合并请求 !117
b514cf71 · 单韦乔 · 59a98acd · 49986b6f · b514cf71 · 59a98acd
Commit b514cf71 authored Aug 29, 2020 by 单韦乔
--- a/Chapter1/Figures/comparison-between-interlingua-based-and-transfer-based-translation.tex
+++ b/Chapter1/Figures/comparison-between-interlingua-based-and-transfer-based-translation.tex
@@ -14,10 +14,10 @@
 \node [datanode,anchor=north] (s4) at ([yshift=-4.5em]s3.south) {{ \small{语言4}}};
 \node [circle,anchor=north west,inner sep=2pt,fill=blue!20] (m1) at ([xshift=0.8em,yshift=-0.5em]s1.south east) {{ \small{中间语言}}};
-\draw [<->,very thick] (s1.south) -- (m1.west);
+\draw [<->,very thick] (s1.south) -- (m1.170);
-\draw [<->,very thick] (s2.north) -- (m1.west);
+\draw [<->,very thick] (s2.north) -- (m1.190);
-\draw [<->,very thick] (s3.south) -- (m1.east);
+\draw [<->,very thick] (s3.south) -- (m1.10);
-\draw [<->,very thick] (s4.north) -- (m1.east);
+\draw [<->,very thick] (s4.north) -- (m1.-10);
 \node [anchor=north] (l) at ([xshift=5em,yshift=-1em]s2.south) {\footnotesize{(a) 基于中间语言的方法}};

--- a/Chapter3/Figures/fig-cover.jpg
+++ b/Chapter3/Figures/fig-cover.jpg
--- a/Chapter3/Figures/figure-a-simple-pre-processing-process.tex
+++ b/Chapter3/Figures/figure-a-simple-pre-processing-process.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545} % dark blue, used below for the boxes and arrows
+\definecolor{ugreen}{rgb}{0,0.5,0} % green, used below for the raw and the segmented sentence
+%%% outline: left-to-right pipeline -- raw sentence -> boxed 分词系统 -> segmented sentence -> boxed MT系统 -> "..."
+%-------------------------------------------------------------------------
+ \vspace{0.5em}
+    \begin{tikzpicture}
+    {\small
+    \node [ugreen] (input) at (0,0) {猫喜欢吃鱼};
+    \node [draw,thick,anchor=west,ublue] (preprocessing) at ([xshift=1em]input.east) {分词系统};
+    \node [ugreen,anchor=west] (mtinput) at ([xshift=1em]preprocessing.east) {猫/喜欢/吃/鱼};
+    \node [draw,thick,anchor=west,ublue] (smt) at ([xshift=1em]mtinput.east) {MT系统};
+    \node [anchor=west] (mtoutput) at ([xshift=1em]smt.east) {...};
+    \draw [->,thick,ublue] ([xshift=0.1em]input.east) -- ([xshift=-0.2em]preprocessing.west); % arrows connect neighbouring nodes; the small xshifts keep the tips off the node borders
+    \draw [->,thick,ublue] ([xshift=0.2em]preprocessing.east) -- ([xshift=-0.1em]mtinput.west);
+    \draw [->,thick,ublue] ([xshift=0.1em]mtinput.east) -- ([xshift=-0.2em]smt.west);
+    \draw [->,thick,ublue] ([xshift=0.2em]smt.east) -- ([xshift=-0.1em]mtoutput.west);
+    }
+    \end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-analysis-of-sentence-participle&syntactic.tex
+++ b/Chapter3/Figures/figure-analysis-of-sentence-participle&syntactic.tex
+%%% outline: bracket-notation (\Tree) phrase-structure tree over the words 猫 / 喜欢 / 吃 鱼; the innermost VP is attached to its leaf with a roof-style edge
+%-------------------------------------------------------------------------
+\vspace{0.3em}
+\begin{tikzpicture}
+\begin{scope}[scale=1.0,level distance=30pt,sibling distance=15pt,grow'=up] % growth direction is set to "up"; level/sibling distances control the tree spacing
+{
+\Tree[.\node(sn0){IP};
+          [.\node(sn1){NP};
+               [.\node(sn2){NN}; \node(sw1){猫}; ]
+          ]
+          [.\node(sn3){VP};
+               [.\node(sn4){VV}; \node(sw2){喜欢}; ]
+               [.\node(sn5){VP}; \edge[roof]; \node(sw3){吃 \ 鱼}; ]
+          ]
+     ]
+}
+\end{scope}
+\end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-crf-to-deal-with-sequence-problems.tex
+++ b/Chapter3/Figures/figure-crf-to-deal-with-sequence-problems.tex
+% Chain of label nodes y_1 ... y_n (green boxes) joined left to right, each also
+% joined to one shared observation box X = x_1, ..., x_n (red) drawn below them.
+\begin{tikzpicture}
+	% \tikzset{<name>/.style=...} replaces the deprecated \tikzstyle; the style stays local to this picture.
+	\tikzset{hide/.style={draw,line width=1pt,inner sep=2pt,fill=green!30,minimum size=2em}}
+		% top row: label nodes, with an unboxed \cdots placeholder between y_3 and y_{n-1}
+		\node[hide] (y1) at (0,0){$y_1$};
+		\node[anchor=west,hide](y2)at([xshift=2em]y1.east){$y_2$};
+		\node[anchor=west,hide](y3)at([xshift=2em]y2.east){$y_3$};
+		\node[anchor=west,line width=1pt,inner sep=2pt,minimum size=2em](dots)at([xshift=2em]y3.east){$\cdots$};
+		\node[anchor=west,hide](yn-1)at([xshift=2em]dots.east){$y_{n-1}$};
+		\node[anchor=west,hide](yn)at([xshift=2em]yn-1.east){$y_n$};
+		% observation box, placed 3em below y3 and shifted 2em to the right
+		\node[anchor=north,draw,line width=1pt,inner sep=2pt,fill=red!30,minimum height=2em,minimum width=12em](see)at ([yshift=-3em,xshift=2em]y3.south){$X=x_1,x_2,\ldots,x_{n-1},x_n$};
+		% captions above the label row and below the observation box
+		\node[anchor=south,font=\footnotesize] at ([yshift=1em,xshift=2em]y3.north){(待预测的隐藏状态序列)};
+		\node[anchor=north,font=\footnotesize] at ([yshift=-1em]see.south){(可见状态序列)};
+		% undirected edges between neighbouring labels
+		\draw[line width=1pt] (y1.east) -- (y2.west);
+		\draw[line width=1pt] (y2.east) -- (y3.west);
+		\draw[line width=1pt] (y3.east) -- (dots.west);
+		\draw[line width=1pt] (dots.east) -- (yn-1.west);
+		\draw[line width=1pt] (yn-1.east) -- (yn.west);
+		% every label node is tied to the same point (see.north) of the observation box
+		\draw[line width=1pt] (y1.south) -- (see.north);
+		\draw[line width=1pt] (y2.south) -- (see.north);
+		\draw[line width=1pt] (y3.south) -- (see.north);
+		\draw[line width=1pt] (yn-1.south) -- (see.north);
+		\draw[line width=1pt] (yn.south) -- (see.north);
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-cross-type-word-segmentation-ambiguity.tex
+++ b/Chapter3/Figures/figure-cross-type-word-segmentation-ambiguity.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545} % dark blue, used for the dictionary title and frame
+\definecolor{ugreen}{rgb}{0,0.5,0} % defined but not referenced in this figure
+%%% outline: six-entry dictionary box on the left, the sentence 确实现在物价很高 laid out one character per node on the right, and two "命中" (hit) boxes pointing back to dictionary entries 3 and 6
+%-------------------------------------------------------------------------
+\begin{tikzpicture}
+\begin{scope}
+{
+{\small
+\node [anchor=north west] (entry1) at (0,0) {\textbf{1:} 很};
+\node [anchor=north west] (entry2) at ([yshift=0.1em]entry1.south west) {\textbf{2:} 高};
+\node [anchor=north west] (entry3) at ([yshift=0.1em]entry2.south west) {\textbf{3:} 现在};
+\node [anchor=north west] (entry4) at ([yshift=0.1em]entry3.south west) {\textbf{4:} 物价};
+\node [anchor=north west] (entry5) at ([yshift=0.1em]entry4.south west) {\textbf{5:} 确实};
+\node [anchor=north west] (entry6) at ([yshift=0.1em]entry5.south west) {\textbf{6:} 实现};
+\node [anchor=south west] (dictionarylabel) at (entry1.north west) {{\color{ublue} 分词词典}};
+}
+}
+\begin{pgfonlayer}{background}
+{
+\node[rectangle,draw=ublue, inner sep=0.2em] [fit = (entry1) (entry2) (entry3) (entry4) (entry5) (entry6) (dictionarylabel)] {}; % frame fitted around the title and all entries
+}
+\end{pgfonlayer}
+\end{scope}
+{
+\begin{scope}[xshift=1.2in,yshift=1em] % sentence row: c1..c8, each node anchored to the east side of the previous one
+\node [anchor=west] (c1) at (0,0) {确};
+\node [anchor=west] (c2) at ([xshift=0em]c1.east) {实};
+\node [anchor=west] (c3) at ([xshift=0em]c2.east) {现};
+\node [anchor=west] (c4) at ([xshift=0em]c3.east) {在};
+\node [anchor=west] (c5) at ([xshift=0em]c4.east) {物};
+\node [anchor=west] (c6) at ([xshift=0em]c5.east) {价};
+\node [anchor=west] (c7) at ([xshift=0em]c6.east) {很};
+\node [anchor=west] (c8) at ([xshift=0em]c7.east) {高};
+\end{scope}
+}
+{
+\node [anchor=west,thick,draw,minimum width=3.4em,minimum height=1.5em] (w1) at (c3.west){}; % solid box starting at c3 (现); its arrow below comes from entry3 (现在)
+\draw [->,thick] (entry3.30) ..controls +(70:1) and +(south:1.5).. ([xshift=0.3em]w1.south) node [pos=0.5, above] {\footnotesize{命中}};
+}
+{
+\node [anchor=west,very thick,draw,dotted,minimum width=3.4em,minimum height=1.9em,red] (w3) at (c2.west){}; % red dotted box starting one character earlier, at c2 (实); its arrow below comes from entry6 (实现)
+\draw [->,very thick,dotted,red] ([yshift=-0.2em]entry6.30) ..controls +(60:2) and +(south:3).. ([xshift=-0.6em]w3.south) node [pos=0.5, below] {\color{red}{\footnotesize{命中}}};
+}
+\end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-evaluation-of-probability-for-grammar.tex
+++ b/Chapter3/Figures/figure-evaluation-of-probability-for-grammar.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline: a two-tree "treebank" box on the left and three worked
+%%% relative-frequency estimates of rule probabilities on the right
+%-------------------------------------------------------------------------
+\begin{tikzpicture}
+% first tree of the treebank: 狗 喜欢 吃 骨头
+\begin{scope}[sibling distance=0pt,level distance=17pt]
+{\footnotesize
+\Tree[.\node[inner sep=2pt](t1n1){IP};
+          [.\node[inner sep=2pt](t1n2){NP};
+               [.\node[inner sep=2pt](t1n3){NN}; 狗 ]
+          ]
+          [.\node[inner sep=2pt](t1n4){VP};
+               [.\node[inner sep=2pt](t1n5){VV}; 喜欢 ]
+               [.\node[inner sep=2pt](t1n6){VP};
+                     [.\node[inner sep=2pt](t1n7){VV}; 吃 ]
+                     [.\node[inner sep=2pt](t1n8){NN}; 骨头 ]
+               ]
+          ]
+     ]
+}
+\end{scope}
+% second tree, shifted 7em down: 请 看 一 个 新 句子
+\begin{scope}[sibling distance=0pt,level distance=17pt,yshift=-7em]
+{\footnotesize
+\Tree[.\node[inner sep=2pt](t2n1){IP};
+          [.\node[inner sep=2pt](t2n2){VP};
+               [.\node[inner sep=2pt](t2n3){VV}; \node[](t2w1){请}; ]
+          ]
+          [.\node[inner sep=2pt](t2n4){IP};
+               [.\node[inner sep=2pt](t2n5){VP};
+                    [.\node[inner sep=2pt](t2n6){VV}; 看 ]
+               ]
+               [.\node[inner sep=2pt](t2n7){NP};
+                    [.\node[inner sep=2pt](t2n8){QP};
+                          [.\node[inner sep=2pt](t2n9){CD}; 一 ]
+                          [.\node[inner sep=2pt](t2n10){M}; 个 ]
+                    ]
+                    [.\node[inner sep=2pt](t2n11){ADJP};
+                          [.\node[inner sep=2pt](t2n12){JJ}; 新 ]
+                    ]
+                    [.\node[inner sep=2pt](t2n13){NP};
+                          [.\node[inner sep=2pt](t2n14){NN}; \node[](t2wn){句子}; ]
+                    ]
+               ]
+          ]
+     ]
+}
+\end{scope}
+% title plus a frame fitted around both trees (t2w1 / t2wn are the leftmost and rightmost named leaves of tree 2)
+\node [anchor=south] (treebanklabel) at (t1n1.north) {{\color{ublue} 数据：树库}};
+\begin{pgfonlayer}{background}
+\node[rectangle,draw=ublue, inner sep=0.2em] [fit = (treebanklabel) (t1n1) (t2w1) (t2wn)] (treebank) {};
+\end{pgfonlayer}
+% Worked estimates. The counts are written outside \textrm{} in both numerator and
+% denominator so that every "=" is typeset the same way (math mode).
+\node [anchor=north west] (math1) at ([xshift=2em]treebank.north east) {P(VP $\to$ VV NN)};
+\node [anchor=north west] (math1part2) at ([xshift=-1em,yshift=0.2em]math1.south west) {$=\frac{\textrm{``VP''和``VV NN''同时出现的次数}=1}{\textrm{``VP''出现的次数}=4}$};
+\node [anchor=north west] (math1part3) at ([yshift=0.2em]math1part2.south west){$=\frac{1}{4}$};
+\node [anchor=north west] (math2) at ([yshift=-6em]math1.north west) {P(NP $\to$ NN)};
+\node [anchor=north west] (math2part2) at ([xshift=-1em,yshift=0.2em]math2.south west) {$=\frac{\textrm{``NP''和``NN''同时出现的次数}=2}{\textrm{``NP''出现的次数}=3}$};
+\node [anchor=north west] (math2part3) at ([yshift=0.2em]math2part2.south west){$=\frac{2}{3}$};
+\node [anchor=north west] (math3) at ([yshift=-6em]math2.north west) {P(IP $\to$ NP NP)};
+\node [anchor=north west] (math3part2) at ([xshift=-1em,yshift=0.2em]math3.south west) {$=\frac{\textrm{``IP''和``NP NP''同时出现的次数}=0}{\textrm{``IP''出现的次数}=3}$};
+\node [anchor=north west] (math3part3) at ([yshift=0.2em]math3part2.south west){$=\frac{0}{3}$};
+% The option-less \path statements that used to follow drew nothing and only
+% touched borders of nodes already inside the picture, so they were removed.
+\end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-example-of-derivation.tex
+++ b/Chapter3/Figures/figure-example-of-derivation.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline: left minipage = step-by-step derivation IP => ... => 猫 喜欢 吃 鱼, each arrow
+%%% annotated with the rule applied; right minipage = the rule list r_1..r_8 and the parse tree
+%-------------------------------------------------------------------------
+\begin{minipage}[t]{0.38\linewidth}
+{\small
+\begin{eqnarray}
+& &\textrm{IP} \nonumber \\
+& \overset{r_8}{\Rightarrow} & \textrm{NP VP} \nonumber \\
+& \overset{r_5}{\Rightarrow} & \textrm{NN VP} \nonumber \\
+& \overset{r_1}{\Rightarrow} & \textrm{猫 VP} \nonumber \\
+& \overset{r_7}{\Rightarrow} & \textrm{猫 VV VP} \nonumber \\
+& \overset{r_2}{\Rightarrow} & \textrm{猫 喜欢 VP} \nonumber \\
+& \overset{r_6}{\Rightarrow} & \textrm{猫 喜欢 VV NN} \nonumber \\
+& \overset{r_3}{\Rightarrow} & \textrm{猫 喜欢 吃 NN} \nonumber \\
+& \overset{r_4}{\Rightarrow} & \textrm{猫 喜欢 吃 鱼} \nonumber
+\end{eqnarray}
+}
+\end{minipage}
+\hfill
+\begin{minipage}[t]{0.55\linewidth}
+\vspace{1.0em}
+\begin{center}
+\begin{tikzpicture}
+% Rule list in two columns (odd-numbered rules left, even-numbered right).
+% It is emitted once only: a second identical copy used to overprint the same text.
+\begin{scope}
+{\scriptsize
+\node [anchor=west,inner sep=2pt] (r1) at (0,0) {$r_1$: NN $\to$ 猫};
+\node [anchor=west,inner sep=2pt] (r2) at ([xshift=3em]r1.east) {$r_2$: VV $\to$ 喜欢};
+\node [anchor=north west,inner sep=2pt] (r3) at ([yshift=-0.2em]r1.south west) {$r_3$: VV $\to$ 吃};
+\node [anchor=north west,inner sep=2pt] (r4) at ([yshift=-0.2em]r2.south west) {$r_4$: NN $\to$ 鱼};
+\node [anchor=north west,inner sep=2pt] (r5) at ([yshift=-0.2em]r3.south west) {$r_5$: NP $\to$ NN};
+\node [anchor=north west,inner sep=2pt] (r6) at ([yshift=-0.2em]r4.south west) {$r_6$: VP $\to$ VV NN};
+\node [anchor=north west,inner sep=2pt] (r7) at ([yshift=-0.2em]r5.south west) {$r_7$: VP $\to$ VV VP};
+\node [anchor=north west,inner sep=2pt] (r8) at ([yshift=-0.2em]r6.south west) {$r_8$: IP $\to$ NP VP};
+}
+\end{scope}
+% Complete parse tree. Seven partially white copies of this same tree used to be
+% stacked underneath it at the same position; they added nothing to the final
+% picture and overprinted the black labels, so only the finished tree is kept.
+% Leaf names are now unique (sw1 was previously reused for 猫, 吃 and 鱼).
+\begin{scope}[xshift=4.5em,yshift=-5.5em,level distance=20pt,sibling distance=13pt]
+\Tree[.IP
+          [.NP
+               [.NN \node(sw1){猫}; ]
+          ]
+          [.VP
+               [.VV \node(sw2){喜欢}; ]
+               [.VP
+                     [.VV \node(sw3){吃}; ]
+                     [.NN \node(sw4){鱼}; ]
+               ]
+          ]
+     ]
+\end{scope}
+\end{tikzpicture}
+\end{center}
+\end{minipage}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-example-of-hmm-in-coin-toss.tex
+++ b/Chapter3/Figures/figure-example-of-hmm-in-coin-toss.tex
+% Coin-toss example: a row of six boxes labelled C,B,A,B,C,A (style "hide"), each with a
+% circle below it showing 正 or 反 (style "see"), followed by a legend explaining the symbols.
+\begin{tikzpicture}
+	% \tikzset{<name>/.style=...} replaces the deprecated \tikzstyle; both styles stay local to this picture.
+	\tikzset{
+		hide/.style={draw,inner sep=2pt,line width=1pt,align=center,drop shadow,fill=green!20,font=\footnotesize,minimum height=1.8em,minimum width=1.8em},
+		see/.style={draw,inner sep=2pt,line width=1pt,align=center,drop shadow,fill=red!30,font=\footnotesize,minimum height=1.2em,minimum width=1.2em,circle}
+	}
+		% top row of boxes
+		\node[hide] (h1) at (0,0){C};
+		\node[hide,anchor=west] (h2) at ([xshift=2em]h1.east){B};
+		\node[hide,anchor=west] (h3) at ([xshift=2em]h2.east){A};
+		\node[hide,anchor=west] (h4) at ([xshift=2em]h3.east){B};
+		\node[hide,anchor=west] (h5) at ([xshift=2em]h4.east){C};
+		\node[hide,anchor=west] (h6) at ([xshift=2em]h5.east){A};
+		% one circle below each box
+		\node[see,anchor=north] (s1) at ([yshift=-1.6em]h1.south){正};
+		\node[see,anchor=north] (s2) at ([yshift=-1.6em]h2.south){正};
+		\node[see,anchor=north] (s3) at ([yshift=-1.6em]h3.south){反};
+		\node[see,anchor=north] (s4) at ([yshift=-1.6em]h4.south){反};
+		\node[see,anchor=north] (s5) at ([yshift=-1.6em]h5.south){正};
+		\node[see,anchor=north] (s6) at ([yshift=-1.6em]h6.south){反};
+		% black horizontal arrows between neighbouring boxes, each labelled 1/3
+		\draw[->,line width=1.4pt] (h1.east) -- node[above]{$\frac{1}{3}$}(h2.west);
+		\draw[->,line width=1.4pt] (h2.east) -- node[above]{$\frac{1}{3}$}(h3.west);
+		\draw[->,line width=1.4pt] (h3.east) -- node[above]{$\frac{1}{3}$}(h4.west);
+		\draw[->,line width=1.4pt] (h4.east) -- node[above]{$\frac{1}{3}$}(h5.west);
+		\draw[->,line width=1.4pt] (h5.east) -- node[above]{$\frac{1}{3}$}(h6.west);
+		% blue vertical arrows from each box to its circle, labelled with a probability
+		\draw[->,line width=1.4pt,blue!60] (h1.south) -- node[right,black]{\footnotesize $0.7$}(s1.north);
+		\draw[->,line width=1.4pt,blue!60] (h2.south) -- node[right,black]{\footnotesize $0.5$}(s2.north);
+		\draw[->,line width=1.4pt,blue!60] (h3.south) -- node[right,black]{\footnotesize $0.7$}(s3.north);
+		\draw[->,line width=1.4pt,blue!60] (h4.south) -- node[right,black]{\footnotesize $0.5$}(s4.north);
+		\draw[->,line width=1.4pt,blue!60] (h5.south) -- node[right,black]{\footnotesize $0.7$}(s5.north);
+		\draw[->,line width=1.4pt,blue!60] (h6.south) -- node[right,black]{\footnotesize $0.7$}(s6.north);
+		% grey rounded panel behind the whole chain
+		\begin{pgfonlayer}{background}
+			\node [draw,rectangle,inner sep=1em,rounded corners=2pt,fill=gray!20] [fit = (h1)(s1)(h6) (s6)] (box) {};
+		\end{pgfonlayer}
+		% legend: sample box, sample circle, sample black arrow, sample blue arrow
+		\node[anchor=north,font=\footnotesize] (note) at ([xshift=0.4em,yshift=-1.4em]s1.south){图示说明：};
+		\node[anchor=north,hide,minimum height=1em,minimum width=1em] (one_h) at ([xshift=-0.6em,yshift=-1em]note.south){};
+		\node[anchor=west,font=\scriptsize] at ([xshift=0.2em]one_h.east){一个隐含状态};
+		\node[anchor=north,see] (one_s) at ([yshift=-1.4em]one_h.south){};
+		\node[anchor=west,font=\scriptsize] at ([xshift=0.2em]one_s.east){一个可见状态};
+		\draw[->,line width=1.4pt] ([xshift=8em]one_h.east) -- ([xshift=9em]one_h.east);
+		\node[anchor=west,align=left,font=\scriptsize] at ([xshift=9.2em]one_h.east){从一个隐含状态到下一个隐含状态\\的转换，该过程隐含着转移概率};
+		\draw[->,line width=1.4pt,blue!60] ([yshift=-2em,xshift=8.5em]one_h.east) --([yshift=-3em,xshift=8.5em]one_h.east) ;
+		% the manual line break sits after the comma so that the second line does not start with "，"
+		\node[anchor=west,align=left,font=\scriptsize] at ([yshift=-2.5em,xshift=9.2em]one_h.east){从一个隐含状态到可见状态的输出，\\该过程隐含着发射概率};
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-example-of-hmm.tex
+++ b/Chapter3/Figures/figure-example-of-hmm.tex
+% Grid figure: row labels "state A".."state D" on the left, column headers T,F,F,T on top,
+% a 4x4 grid of circles (top row filled red), and labelled arrows between adjacent columns.
+\begin{tikzpicture}
+	% \tikzset{<name>/.style=...} replaces the deprecated \tikzstyle; the style stays local to this picture.
+	\tikzset{unit/.style={draw,circle,line width=0.8pt,align=center,fill=green!30,minimum size=1em}}
+		% empty corner cell; a diagonal is drawn through it further below
+		\node[minimum width=3em,minimum height=1.8em] (o) at (0,0){};
+		% row labels
+		\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_A) at ([xshift=-1em,yshift=-1em]o.south){state A};
+		\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_B) at ([yshift=-2em]state_A.south){state B};
+		\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_C) at ([yshift=-2em]state_B.south){state C};
+		\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_D) at ([yshift=-2em]state_C.south){state D};
+		% column headers
+		\node[anchor=west,inner sep=1pt,font=\footnotesize] (c1) at ([yshift=0.2em,xshift=2em]o.east){T};
+		\node[anchor=west,inner sep=1pt,font=\footnotesize] (c2) at ([xshift=5em]c1.east){F};
+		\node[anchor=west,inner sep=1pt,font=\footnotesize] (c3) at ([xshift=5em]c2.east){F};
+		\node[anchor=west,inner sep=1pt,font=\footnotesize] (c4) at ([xshift=5em]c3.east){T};
+		% grid circles u<row><column>; the first circle of every column is filled red
+		\node[anchor=north,unit,fill=red!30] (u11) at ([yshift=-1.6em]c1.south){};
+		\node[anchor=north,unit] (u21) at ([yshift=-1.6em]u11.south){};
+		\node[anchor=north,unit] (u31) at ([yshift=-1.6em]u21.south){};
+		\node[anchor=north,unit] (u41) at ([yshift=-1.6em]u31.south){};
+		\node[anchor=north,unit,fill=red!30] (u12) at ([yshift=-1.6em]c2.south){};
+		\node[anchor=north,unit] (u22) at ([yshift=-1.6em]u12.south){};
+		\node[anchor=north,unit] (u32) at ([yshift=-1.6em]u22.south){};
+		\node[anchor=north,unit] (u42) at ([yshift=-1.6em]u32.south){};
+		\node[anchor=north,unit,fill=red!30] (u13) at ([yshift=-1.6em]c3.south){};
+		\node[anchor=north,unit] (u23) at ([yshift=-1.6em]u13.south){};
+		\node[anchor=north,unit] (u33) at ([yshift=-1.6em]u23.south){};
+		\node[anchor=north,unit] (u43) at ([yshift=-1.6em]u33.south){};
+		\node[anchor=north,unit,fill=red!30] (u14) at ([yshift=-1.6em]c4.south){};
+		\node[anchor=north,unit] (u24) at ([yshift=-1.6em]u14.south){};
+		\node[anchor=north,unit] (u34) at ([yshift=-1.6em]u24.south){};
+		\node[anchor=north,unit] (u44) at ([yshift=-1.6em]u34.south){};
+		% diagonal through the corner cell, with one label on each side of it
+		\draw[line width=1pt] (o.north west)--(o.south east);
+		% NOTE(review): the column headers are T/F and the row labels are state A-D, yet the label
+		% on the column side reads "隐藏状态" and the one on the row side reads "可见状态".
+		% The two labels look swapped -- confirm against the chapter text before changing them.
+		\node[anchor=south west,align=center,font=\tiny] at ([yshift=-1.4em,xshift=-1.2em]o.45){$i+1$位置\\隐藏状态};
+		\node[anchor=north east,align=center,font=\tiny] at ([yshift=1.2em,xshift=1.2em]o.-135){$i$位置\\可见状态};
+		% arrows between adjacent columns; labels along the red top row are tinted red
+		\draw[->,line width=1pt] (u11.east) -- node[above,red!50,font=\footnotesize]{0.65}(u12.west);
+		\draw[->,line width=1pt] (u12.east) -- node[above,red!50,font=\footnotesize]{0.55}(u13.west);
+		\draw[->,line width=1pt] (u12.east) -- node[right,pos=0.6,font=\footnotesize]{0.45}(u23.west);
+		\draw[->,line width=1pt] (u13.east) -- node[above,red!50,font=\footnotesize]{0.5}(u14.west);
+		\draw[->,line width=1pt] (u13.east) -- node[right,pos=0.6,font=\footnotesize]{0.5}(u24.west);
+		\draw[->,line width=1pt] (u11.east) -- node[right,font=\footnotesize]{0.35}(u22.west);
+		\draw[->,line width=1pt] (u22.east) -- node[left,pos=0.4,font=\footnotesize]{0.3}(u13.west);
+		\draw[->,line width=1pt] (u22.east) -- node[font=\footnotesize]{0.2}(u23.west);
+		\draw[->,line width=1pt] (u22.east) -- node[font=\footnotesize]{0.2}(u33.west);
+		\draw[->,line width=1pt] (u22.east) -- node[below,font=\footnotesize]{0.3}(u43.west);
+		\draw[->,line width=1pt] (u23.east) -- node[left,pos=0.4,font=\footnotesize]{0.35}(u14.west);
+		\draw[->,line width=1pt] (u23.east) -- node[font=\footnotesize]{0.15}(u24.west);
+		\draw[->,line width=1pt] (u23.east) -- node[font=\footnotesize]{0.15}(u34.west);
+		\draw[->,line width=1pt] (u23.east) -- node[below,font=\footnotesize]{0.35}(u44.west);
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-example-of-word-segmentation-based-on-dictionary.tex
+++ b/Chapter3/Figures/figure-example-of-word-segmentation-based-on-dictionary.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545} % dark blue, used for the dictionary title and frame
+\definecolor{ugreen}{rgb}{0,0.5,0} % defined but not referenced in this figure
+%%% outline: dictionary box (left); sentence 确实现在物价很高 as single-character nodes c1..c8 with "起始" markers above; a second copy bc1..bc8 below with slash separators; a red "命中:2" box on the last character
+%-------------------------------------------------------------------------
+\begin{tikzpicture}
+\begin{scope}
+{
+{\small
+\node [anchor=north west] (entry1) at (0,0) {\textbf{1:} 很};
+\node [anchor=north west] (entry2) at ([yshift=0.1em]entry1.south west) {\textbf{2:} 高};
+\node [anchor=north west] (entry3) at ([yshift=0.1em]entry2.south west) {\textbf{3:} 现在};
+\node [anchor=north west] (entry4) at ([yshift=0.1em]entry3.south west) {\textbf{4:} 物价};
+\node [anchor=north west] (entry5) at ([yshift=0.1em]entry4.south west) {\textbf{5:} 确实};
+\node [anchor=north west] (entry6) at ([yshift=0.1em]entry5.south west) {\textbf{6:} 实现};
+\node [anchor=south west] (dictionarylabel) at (entry1.north west) {{\color{ublue} 分词词典}};
+}
+}
+\begin{pgfonlayer}{background}
+{
+\node[rectangle,draw=ublue, inner sep=0.2em] [fit = (entry1) (entry2) (entry3) (entry4) (entry5) (entry6) (dictionarylabel)] {}; % frame fitted around the title and all entries
+}
+\end{pgfonlayer}
+\end{scope}
+{
+\begin{scope}[xshift=1.2in,yshift=1em] % upper sentence row: c1..c8, each node anchored to the east side of the previous one
+\node [anchor=west] (c1) at (0,0) {确};
+\node [anchor=west] (c2) at ([xshift=0em]c1.east) {实};
+\node [anchor=west] (c3) at ([xshift=0em]c2.east) {现};
+\node [anchor=west] (c4) at ([xshift=0em]c3.east) {在};
+\node [anchor=west] (c5) at ([xshift=0em]c4.east) {物};
+\node [anchor=west] (c6) at ([xshift=0em]c5.east) {价};
+\node [anchor=west] (c7) at ([xshift=0em]c6.east) {很};
+\node [anchor=west] (c8) at ([xshift=0em]c7.east) {高};
+\end{scope}
+}
+\begin{scope}[xshift=1.2in,yshift=-4em] % lower sentence row: same characters as bc1..bc8, 5em below the upper row
+{
+\node [anchor=west] (bc1) at (0,0) {确};
+\node [anchor=west] (bc2) at ([xshift=0em]bc1.east) {实};
+}
+{
+\node [anchor=west] (bc3) at ([xshift=0em]bc2.east) {现};
+\node [anchor=west] (bc4) at ([xshift=0em]bc3.east) {在};
+}
+{
+\node [anchor=west] (bc5) at ([xshift=0em]bc4.east) {物};
+\node [anchor=west] (bc6) at ([xshift=0em]bc5.east) {价};
+}
+{
+\node [anchor=west] (bc7) at ([xshift=0em]bc6.east) {很};
+}
+{
+\node [anchor=west] (bc8) at ([xshift=0em]bc7.east) {高};
+}
+{
+\draw [-,very thick] ([xshift=-0.3em,yshift=0.1em]bc2.south east) -- ([xshift=0.3em,yshift=-0.1em]bc3.north west); % slash across the bc2|bc3 boundary; the next three draws do the same after bc4, bc6 and bc7, giving 确实/现在/物价/很/高
+}
+{
+\draw [-,very thick] ([xshift=-0.3em,yshift=0.1em]bc4.south east) -- ([xshift=0.3em,yshift=-0.1em]bc5.north west);
+}
+{
+\draw [-,very thick] ([xshift=-0.3em,yshift=0.1em]bc6.south east) -- ([xshift=0.3em,yshift=-0.1em]bc7.north west);
+}
+{
+\draw [-,very thick] ([xshift=-0.3em,yshift=0.1em]bc7.south east) -- ([xshift=0.3em,yshift=-0.1em]bc8.north west);
+}
+\end{scope}
+{
+\draw [<-,thick] ([yshift=-0.2em]c1.north west) -- ([yshift=0.2em]c1.north west); % short downward arrow plus "起始" label at the left edge of c1; repeated below for c3, c5, c7 and c8
+\node [anchor=south] (b1) at ([yshift=0.0em]c1.north west) {\scriptsize{起始}};
+}
+{
+\draw [<-,thick] ([yshift=-0.2em]c3.north west) -- ([yshift=0.2em]c3.north west);
+\node [anchor=south] (b2) at ([yshift=0.0em]c3.north west) {\scriptsize{起始}};
+}
+{
+\draw [<-,thick] ([yshift=-0.2em]c5.north west) -- ([yshift=0.2em]c5.north west);
+\node [anchor=south] (b3) at ([yshift=0.0em]c5.north west) {\scriptsize{起始}};
+}
+{
+\draw [<-,thick] ([yshift=-0.2em]c7.north west) -- ([yshift=0.2em]c7.north west);
+\node [anchor=south] (b4) at ([yshift=0.0em]c7.north west) {\scriptsize{起始}};
+}
+{
+\draw [<-,thick] ([yshift=-0.2em]c8.north west) -- ([yshift=0.2em]c8.north west);
+\node [anchor=south] (b5) at ([yshift=0.0em]c8.north west) {\scriptsize{起始}};
+}
+{
+\node [anchor=west,thick,draw,red,minimum width=1.6em,minimum height=1.3em] (w18) at ([xshift=0.1em]c8.west){}; % red box over c8 (高); the label below it reads 命中:2, and dictionary entry 2 is 高
+\node [anchor=north] (l18) at ([yshift=0.2em]w18.south) {{\color{red} \footnotesize{命中:2}}};
+}
+\end{tikzpicture}
--- a/Chapter3/Figures/figure-examples-of-chinese-word-segmentation-based-on-1-gram-model.tex
+++ b/Chapter3/Figures/figure-examples-of-chinese-word-segmentation-based-on-1-gram-model.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline
+%-------------------------------------------------------------------------
+\begin{tikzpicture}
+{\scriptsize
+\node [anchor=north west] (entry1) at (0,0) {\textbf{1:} 这 / 是 / 数据};
+\node [anchor=north west] (entry2) at ([yshift=0.1em]entry1.south west) {\textbf{2:} 现在 / 已经 / 实现};
+\node [anchor=north west] (entry3) at ([yshift=0.1em]entry2.south west) {\textbf{3:} 确实 / 有 / 很 / 多};
+\node [anchor=north west] (entry4) at ([yshift=0.1em]entry3.south west) {...};
+\node [anchor=south west] (corpuslabel) at (entry1.north west) {{\color{ublue} \textbf{学习用数据}}};
+\begin{pgfonlayer}{background}
+\node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (entry1) (entry2) (entry3) (entry4) (corpuslabel)] (corpus) {};
+\end{pgfonlayer}
+}
+\node [anchor=west,ugreen] (P) at ([xshift=5.2em,yshift=-0.8em]corpus.east){\large{\funp{P}($\cdot$)}};
+\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{统计模型}}}};
+\begin{pgfonlayer}{background}
+\node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (P) (modellabel)] (model) {};
+\end{pgfonlayer}
+\draw [->,very thick,ublue] ([xshift=0.2em]corpus.east) -- ([xshift=4.2em]corpus.east)  node [pos=0.5, above] {\color{red}{\scriptsize{统计学习}}};
+\draw [->,very thick,ublue] ([xshift=0.2em]model.east) -- ([xshift=4.2em]model.east)  node [pos=0.5, above] {\color{red}{\scriptsize{搜索\&计算}}};
+{\scriptsize
+\node [anchor=north west] (sentlabel) at ([xshift=6.2em,yshift=-1em]model.north east) {\color{red}{自动分词系统}};
+\node [anchor=north west] (sent) at (sentlabel.south west) {\textbf{对任意句子进行分词}};
+}
+\begin{pgfonlayer}{background}
+\node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (sentlabel) (sent)] (segsystem) {};
+\end{pgfonlayer}
+{\footnotesize
+{
+\node [anchor=west] (label1) at (0,6em) {实际上，通过学习我们得到了一个分词模型\funp{P}($\cdot$)，给定任意的分词结果};
+\node [anchor=north west] (label1part2) at ([yshift=0.5em]label1.south west) {$W=w_1 w_2...w_n$，都能通过\funp{P}($W$)=$\funp{P}(w_1) \cdot \funp{P}(w_2) \cdot ... \cdot \funp{P}(w_n)$ 计算这种分词的\hspace{0.13em} };
+\node [anchor=north west] (label1part3) at ([yshift=0.5em]label1part2.south west) {概率值};
+}
+\begin{pgfonlayer}{background}
+{
+\node[rectangle,fill=blue!10,thick,dotted,inner sep=0.2em] [fit = (label1) (label1part2) (label1part3)] (label1content) {};
+}
+\end{pgfonlayer}
+{
+\draw [-,thick,red,dotted] ([yshift=0.3em]modellabel.north) ..controls +(north:0.5) and +(south:0.5).. ([xshift=-3em]label1content.south);
+}
+}
+{\footnotesize
+{
+\node [anchor=west] (label1) at (0,-6.8em) {\textbf{自动分词系统}：对任意的数据句子$S$，找到最佳的分词结果$W^{*}$输出};
+}
+{
+\node [anchor=north west] (label2) at (label1.south west) {假设输入$S$=“确实现在数据很多”};
+}
+{
+\node [anchor=north west,draw,thick,inner sep=2pt] (data11) at (label2.south west) {枚举所有可能的切分};
+}
+{
+\node [anchor=west,draw,thick,inner sep=2pt] (data12) at ([xshift=4em]data11.east) {计算每种切分的概率};
+}
+{
+\node [anchor=west,draw,thick,inner sep=2pt] (data13) at ([xshift=4.0em]data12.east) {选择最佳结果};
+}
+{
+\draw [->,thick] ([xshift=0.1em]data11.east) -- ([xshift=-0.1em]data12.west);
+}
+{
+\draw [->,thick] ([xshift=0.1em]data12.east) -- ([xshift=-0.1em]data13.west);
+}
+{\scriptsize
+{
+\node [anchor=north west] (data21) at (data11.south west) {确/实现/在/数/据很/多};
+}
+{
+\node [anchor=north west] (data22) at (data12.south west) {$\funp{P}(\textrm{确}) \cdot \funp{P}(\textrm{实现}) \cdot \funp{P}(\textrm{在}) \cdot \funp{P}(\textrm{数}) \cdot $};
+}
+\node [anchor=north west,minimum height=1.6em] (data23) at (data13.south west) {};
+\node [anchor=north west,minimum height=1.6em] (data31) at ([yshift=0.3em]data21.south west) {};
+{
+\node [anchor=north west] (data32) at ([yshift=0.3em]data22.south west) {$\funp{P}(\textrm{据很}) \cdot  \funp{P}(\textrm{多}) = 2.13 \times 10^{-45}$};
+}
+\node [anchor=north west,minimum height=1.6em] (data33) at ([yshift=0.3em]data23.south west) {};
+{
+\node [anchor=north west] (data41) at (data31.south west) {确实/现在/数据/很多};
+}
+{
+\node [anchor=north west] (data42) at (data32.south west) {$\funp{P}(\textrm{确实}) \cdot \funp{P}(\textrm{现在}) \cdot \funp{P}(\textrm{数据}) \cdot $};
+}
+{
+\node [anchor=north west] (data43) at ([yshift=-0.2em,xshift=-2em]data33.south west) {\color{red}{\textbf{输出}}};
+\draw [->,red,thick] (data43.west)--([xshift=-1em]data43.west);
+}
+{
+\node [anchor=north west] (data51) at (data41.south west) {...};
+}
+{
+\node [anchor=north west] (data52) at ([yshift=0.3em]data42.south west) {$\funp{P}(\textrm{很}) \cdot  \funp{P}(\textrm{多}) = 1.54 \times 10^{-25}$};
+}
+\node [anchor=north west] (data53) at ([yshift=0.3em]data43.south west) {};
+}
+}
+\begin{pgfonlayer}{background}
+{
+\node[rectangle,fill=blue!10,thick,dotted,inner sep=0.1em] [fit = (label1) (data11) (data13) (data51) (data52) (data53)] (segcontent) {};
+}
+\end{pgfonlayer}
+{
+\draw [-,thick,red,dotted] (segcontent.north) ..controls +(north:0.7) and +(south:0.7).. (segsystem.south);
+}
+\end{tikzpicture}
--- a/Chapter3/Figures/figure-labeling-named-entities-in-bio-format.tex
+++ b/Chapter3/Figures/figure-labeling-named-entities-in-bio-format.tex
+\begin{tikzpicture}
+	% BIO tagging example: a row of boxed tokens separated by slashes, with the
+	% tag of every token written (rotated by -90 degrees) underneath its box.
+	\tikzset{
+		unit/.style={draw,inner sep=2pt,line width=0.8pt,align=center,drop shadow,fill=red!30,font=\footnotesize,minimum height=1.2em,minimum width=1.8em},
+		lab/.style={inner sep=0pt,align=center,rotate=-90,font=\scriptsize},
+		divider/.style={anchor=west,inner sep=0pt,font=\footnotesize}
+	}
+	% The first token fixes the origin of the row.
+	\node[unit] (n11) at (0,0){北京};
+	% Every further token: a slash 0.5em right of the previous box, then the new box 1.2em right of it.
+	\foreach \prev/\cur/\word in {n11/n12/是,n12/n13/中华,n13/n14/人民,n14/n15/共和,n15/n16/国,n16/n17/的,n17/n18/首都}{
+		\node[divider] at ([xshift=0.5em]\prev.east){\Large{/}};
+		\node[unit,anchor=west] (\cur) at ([xshift=1.2em]\prev.east){\word};
+	}
+	% Tags: "O" is placed 0.8em below its box, the X-GPE tags 1.4em below.
+	\foreach \tok/\lbl/\dy in {n11/B-GPE/-1.4em,n12/O/-0.8em,n13/B-GPE/-1.4em,n14/I-GPE/-1.4em,n15/I-GPE/-1.4em,n16/I-GPE/-1.4em,n17/O/-0.8em,n18/O/-0.8em}{
+		\node[lab,anchor=north] at ([yshift=\dy,xshift=0.2em]\tok.south){\lbl};
+	}
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-labeling-named-entities-in-bioes-format.tex
+++ b/Chapter3/Figures/figure-labeling-named-entities-in-bioes-format.tex
+\begin{tikzpicture}
+	% BIOES tagging example: same token row as the BIO figure, but with
+	% S- (single-token entity) and E- (entity end) tags in the label row.
+	\tikzset{
+		unit/.style={draw,inner sep=2pt,line width=0.8pt,align=center,drop shadow,fill=red!30,font=\footnotesize,minimum height=1.2em,minimum width=1.8em},
+		lab/.style={inner sep=0pt,align=center,rotate=-90,font=\scriptsize},
+		divider/.style={anchor=west,inner sep=0pt,font=\footnotesize}
+	}
+	% The first token fixes the origin of the row.
+	\node[unit] (n11) at (0,0){北京};
+	% Every further token: a slash 0.5em right of the previous box, then the new box 1.2em right of it.
+	\foreach \prev/\cur/\word in {n11/n12/是,n12/n13/中华,n13/n14/人民,n14/n15/共和,n15/n16/国,n16/n17/的,n17/n18/首都}{
+		\node[divider] at ([xshift=0.5em]\prev.east){\Large{/}};
+		\node[unit,anchor=west] (\cur) at ([xshift=1.2em]\prev.east){\word};
+	}
+	% Tags: "O" is placed 0.8em below its box, the X-GPE tags 1.4em below.
+	\foreach \tok/\lbl/\dy in {n11/S-GPE/-1.4em,n12/O/-0.8em,n13/B-GPE/-1.4em,n14/I-GPE/-1.4em,n15/I-GPE/-1.4em,n16/E-GPE/-1.4em,n17/O/-0.8em,n18/O/-0.8em}{
+		\node[lab,anchor=north] at ([yshift=\dy,xshift=0.2em]\tok.south){\lbl};
+	}
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-mt-system-as-a-black-box.tex
+++ b/Chapter3/Figures/figure-mt-system-as-a-black-box.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline: input sentence on the left, output sentence on the right, and one solid box captioned "MT 系统" between them
+%-------------------------------------------------------------------------
+\vspace{0.3em}
+\begin{tikzpicture}
+\begin{scope}
+\node [] (input) at (0,0) {{\scriptsize 猫喜欢吃鱼}};
+\node [] (output) at ([xshift=3.35in]input.east) {{\scriptsize Cats like eating fish}}; % output text is anchored 3.35in to the right of the input text
+\draw[->,thick] ([xshift=-1pt]input.east) -- ([xshift=8pt]input.east); % short arrow leaving the input
+\draw[->,thick] ([xshift=-10pt]output.west) -- ([xshift=-0pt]output.west); % short arrow entering the output
+%{
+%\draw[->,thick] ([xshift=-12pt]mtengine.west) -- ([xshift=-2pt]mtengine.west);
+%\draw[->,thick] ([xshift=2pt]mtengine.east) -- ([xshift=12pt]mtengine.east);
+%}
+{
+\node[minimum height=4em,minimum width=4.5em,fill=white] (inputmarking) at (0.88in,-0.39in) {}; % white, text-less spacer nodes; they are only referenced by the two fit boxes below
+\node[minimum height=4em,minimum width=5.2em,fill=white] (outputmarking) at (2.57in,-0.39in) {};
+}
+\node [anchor=south] (inputlabel) at ([yshift=-0.5em]input.north) {{\scriptsize \color{red}{\textbf{输入}}}};
+\node [anchor=south] (outputlabel) at ([yshift=-0.5em]output.north) {{\scriptsize \color{red}{\textbf{输出}}}};
+{
+\node [anchor=west] (mtinputlabel) at ([xshift=0.32in]inputlabel.east) {{\scriptsize \color{red}{\textbf{}}}}; % placeholder labels with empty text, also only used for the fit boxes
+\node [anchor=west] (mtoutputlabel) at ([xshift=0.88in]mtinputlabel.east) {{\scriptsize \color{red}{\textbf{}}}};
+\node[rectangle,draw=ublue, inner sep=0mm] [fit = (mtinputlabel) (mtoutputlabel) (inputmarking) (outputmarking)] {}; % ublue outline fitted around placeholders and spacers
+}
+{
+\node[rectangle,fill=ublue,inner sep=0mm] [fit = (mtinputlabel) (mtoutputlabel) (inputmarking) (outputmarking)] {{\color{white} \textbf{\Large{MT 系统}}}}; % solid ublue box over the same fit area, with the white caption
+}
+%\begin{scope}[scale=0.9,xshift=1.2in,yshift=-1.2in,level distance=20pt,sibling distance=0pt]
+%
+%\end{scope}
+\end{scope}
+\end{tikzpicture}
--- a/Chapter3/Figures/figure-mt=language-analysis+translation-engine.tex
+++ b/Chapter3/Figures/figure-mt=language-analysis+translation-engine.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline: input sentence, source-side tree, boxed "MT系统" node, target-side tree and output sentence, with a frame around the part between input and output
+%-------------------------------------------------------------------------
+\vspace{0.3em}
+\begin{tikzpicture}
+\begin{scope}
+\node [] (input) at (0,0) {{\scriptsize 猫喜欢吃鱼}};
+{
+\begin{scope}[scale=0.8,xshift=0.9in,yshift=-0.87in,level distance=20pt,sibling distance=-1pt,grow'=up] % source-side tree; grow'=up makes the tree grow upwards
+{\scriptsize
+\Tree[.\node(sn0){IP};
+          [.\node(sn1){NP};
+               [.\node(sn2){NN}; \node(sw1){猫}; ]
+          ]
+          [.\node(sn3){VP};
+               [.\node(sn4){VV}; \node(sw2){喜欢}; ]
+               [.\node(sn5){VP}; \edge[roof]; \node(sw3){吃 \ 鱼}; ]
+          ]
+     ]
+}
+\end{scope}
+\node [anchor=west,draw,thick,inner sep=3pt,ublue] (mtengine) at ([xshift=1.05in]input.east) {{\scriptsize MT系统}}; % boxed engine node, 1.05in to the right of the input text
+\begin{scope}[scale=0.8,xshift=3.0in,yshift=-0.87in,level distance=20pt,sibling distance=-3pt,grow'=up] % target-side tree, same vertical offset, shifted 3.0in
+{\scriptsize
+\Tree[.\node(tn0){S};
+          [.\node(tn1){NP};
+               [.\node(tn2){NNS}; \node(tw1){cats}; ]
+          ]
+          [.\node(tn3){VP};
+               [.\node(tn4){VB}; \node(tw2){like}; ]
+               [.\node(tn5){VP}; \edge[roof]; \node(tw3){eating fish}; ]
+          ]
+     ]
+}
+\end{scope}
+}
+\node [] (output) at ([xshift=3.35in]input.east) {{\scriptsize Cats like eating fish}};
+\draw[->,thick] ([xshift=-1pt]input.east) -- ([xshift=8pt]input.east);
+\draw[->,thick] ([xshift=-10pt]output.west) -- ([xshift=-0pt]output.west);
+{
+\draw[->,thick] ([xshift=-12pt]mtengine.west) -- ([xshift=-2pt]mtengine.west); % short arrows into and out of the engine box
+\draw[->,thick] ([xshift=2pt]mtengine.east) -- ([xshift=12pt]mtengine.east);
+}
+{
+\node[minimum height=4em,minimum width=4.5em] (inputmarking) at (0.85in,-0.39in) {}; % text-less spacer nodes without fill or border; they are only referenced by the fit frame below
+\node[minimum height=4em,minimum width=5.2em] (outputmarking) at (2.55in,-0.39in) {};
+}
+\node [anchor=south] (inputlabel) at ([yshift=-0.5em]input.north) {{\scriptsize \color{red}{\textbf{输入}}}};
+\node [anchor=south] (outputlabel) at ([yshift=-0.5em]output.north) {{\scriptsize \color{red}{\textbf{输出}}}};
+{
+\node [anchor=west] (mtinputlabel) at ([xshift=0.29in]inputlabel.east) {{\scriptsize \color{red}{\textbf{实际的输入}}}};
+\node [anchor=west] (mtoutputlabel) at ([xshift=1.0in]mtinputlabel.east) {{\scriptsize \color{red}{\textbf{实际的输出}}}};
+\node[rectangle,draw=ublue, inner sep=0mm] [fit = (mtinputlabel) (mtoutputlabel) (inputmarking) (outputmarking)] {}; % ublue frame fitted around both inner labels and both spacers
+}
+\end{scope}
+\end{tikzpicture}
--- a/Chapter3/Figures/figure-ner-based-on-hmm.tex
+++ b/Chapter3/Figures/figure-ner-based-on-hmm.tex
+\begin{tikzpicture}
+	% NER example drawn as a chain: eight tag boxes (green) linked left to right,
+	% each with an arrow down to the word box (red) it belongs to.
+	\tikzset{
+		class/.style={draw,inner sep=2pt,line width=1pt,align=center,drop shadow,fill=green!20,font=\footnotesize,minimum height=1.6em,minimum width=3.4em,rotate=-90},
+		word/.style={draw,inner sep=2pt,line width=1pt,align=center,drop shadow,fill=red!30,font=\footnotesize,minimum height=1.4em,minimum width=1.6em},
+		hmm link/.style={->,line width=1.4pt}
+	}
+	\coordinate (o) at (0,0);
+	% Tag boxes, one every 4em. `class` rotates the node by -90 degrees, so its
+	% anchors rotate too: north/south end up on the right/left side, east at the bottom.
+	\foreach \idx/\lbl/\dx in {1/B-GPE/0em,2/O/4em,3/B-GPE/8em,4/I-GPE/12em,5/I-GPE/16em,6/I-GPE/20em,7/O/24em,8/O/28em}{
+		\node[anchor=west,class] (c\idx) at ([xshift=\dx]o.east){\lbl};
+	}
+	% Word boxes, positioned relative to the (rotated) north anchor of their tag box.
+	\foreach \idx/\txt in {1/北京,2/是,3/中华,4/人民,5/共和,6/国,7/的,8/首都}{
+		\node[anchor=north,word] (w\idx) at ([xshift=-0.8em,yshift=-4em]c\idx.north){\txt};
+	}
+	% Links between consecutive tag boxes.
+	\foreach \src/\dst in {1/2,2/3,3/4,4/5,5/6,6/7,7/8}{
+		\draw[hmm link] (c\src.north) -- (c\dst.south);
+	}
+	% Links from every tag box to its word box.
+	\foreach \idx in {1,...,8}{
+		\draw[hmm link] (c\idx.east) -- (w\idx.north);
+	}
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-perspectives-of-expert-ordinary-and-syntactic-parser.tex
+++ b/Chapter3/Figures/figure-perspectives-of-expert-ordinary-and-syntactic-parser.tex
+%%% outline: three candidate parse trees of the same four words side by side in a tabular, followed by one verdict row each for the linguist, for "us", and for the parser (a probability per tree)
+%-------------------------------------------------------------------------
+\begin{tabular}{l c c c}
+& % column 1 is left empty in the tree row; it holds the row titles further down
+\begin{tikzpicture}
+\begin{scope}[sibling distance=-5pt, level distance=20pt]
+{\footnotesize
+\Tree[.IP
+          [.VP
+               [.VP
+                    [.NN 猫 ]
+                    [.VV 喜欢 ]
+               ]
+               [.VV 吃 ]
+          ]
+          [.NP
+               [.NN 鱼 ]
+          ]
+     ]
+}
+\end{scope}
+\end{tikzpicture}
+& % second candidate tree
+\begin{tikzpicture}
+\begin{scope}[sibling distance=-5pt, level distance=20pt]
+{\footnotesize
+\Tree[.IP
+          [.NP
+               [.NN 猫 ]
+          ]
+          [.VP
+               [.VV 喜欢 ]
+               [.VP
+                     [.VV 吃 ]
+                     [.NN 鱼 ]
+               ]
+          ]
+     ]
+}
+\end{scope}
+\end{tikzpicture}
+& % third candidate tree
+\begin{tikzpicture}
+\begin{scope}[sibling distance=-5pt, level distance=20pt]
+{\footnotesize
+\Tree[.IP
+          [.NP
+               [.NN 猫 ]
+          ]
+          [.VP
+              [.VP
+                   [.VV 喜欢 ]
+                   [.VV 吃 ]
+              ]
+              [.NP
+                   [.NN 鱼 ]
+              ]
+          ]
+     ]
+}
+\end{scope}
+\end{tikzpicture}
+\\ 
+语言学家: & 不对 & 对 & 不对  \\ 
+我们: & 似乎对了 & 比较肯定 & 不太可能 \\ 
+分析器: & $\textrm{P}=0.2$ & $\textrm{P}=0.6$ & $\textrm{P}=0.1$
+\end{tabular}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-phrase-structure-tree-and-dependency-tree.tex
+++ b/Chapter3/Figures/figure-phrase-structure-tree-and-dependency-tree.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline
+%-------------------------------------------------------------------------
+% Left: phrase-structure tree of "猫 喜欢 吃 鱼" with colour-coded constituents.
+% Right: dependency tree of the same sentence with labelled arcs.
+\begin{tikzpicture}
+\begin{scope}[sibling distance=7pt,level distance=22pt]
+% The tree is drawn once, in its final colouring. Earlier versions stacked
+% three additional copies (uncoloured / partly coloured) at exactly the same
+% position; they were completely hidden by this one, overprinted all text and
+% edges, and redefined every node name. Leaf nodes now carry unique names
+% (sw1..sw4) instead of reusing sw1.
+\Tree[.\node[inner sep=1pt,fill=red!20](sn0){IP\scriptsize{:句子}};
+          [.\node[inner sep=1pt,fill=red!20](sn1){NP\scriptsize{:名}};
+               [.\node[inner sep=1pt,fill=blue!20](sn2){NN\tiny{:名词}}; \node(sw1){猫}; ]
+          ]
+          [.\node[inner sep=1pt,fill=red!20](sn3){VP\scriptsize{:动}};
+               [.\node[inner sep=1pt,fill=blue!20](sn4){VV\tiny{:动词}}; \node(sw2){喜欢}; ]
+               [.\node[inner sep=1pt,fill=green!20](sn5){VP\scriptsize{:动}};
+                     [.\node[inner sep=1pt,fill=green!20](sn6){VV\tiny{:动词}}; \node(sw3){吃}; ]
+                     [.\node[inner sep=1pt,fill=green!20](sn7){NN\tiny{:名词}}; \node(sw4){鱼}; ]
+               ]
+          ]
+     ]
+\end{scope}
+% Dependency tree: four word nodes placed by hand, joined by three coloured arcs,
+% each arc carrying a small red relation label.
+\begin{scope}[xshift=1.7in,yshift=-0.4in]
+\node [inner sep=2pt] (w1) at (0,0) {猫};
+\node [anchor=west,inner sep=2pt] (w2) at ([xshift=0.8em,yshift=3em]w1.east) {喜欢};
+\node [anchor=west,inner sep=2pt] (w3) at ([xshift=4.5em]w1.east) {吃};
+\node [anchor=west,inner sep=2pt] (w4) at ([xshift=2em,yshift=-3em]w3.east) {鱼};
+\draw [-,ultra thick,ublue] (w1.north east) -- (w2.south) node [pos=0.3, above, xshift=-0.5em] {\color{red}{\tiny{主谓}}};
+\draw [-,ultra thick,red] (w3.north west) -- (w2.south) node [pos=0.3, above, xshift=0.5em] {\color{red}{\tiny{连动}}};
+\draw [-,ultra thick,ugreen] (w3.south east) -- (w4.north west) node [pos=0.5, above, xshift=0.5em] {\color{red}{\tiny{谓宾}}};
+\end{scope}
+\end{tikzpicture}
--- a/Chapter3/Figures/figure-probability-values-corresponding-to-different-derivations.tex
+++ b/Chapter3/Figures/figure-probability-values-corresponding-to-different-derivations.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline
+%-------------------------------------------------------------------------
+% One sentence at the top, three candidate derivations (trees) below it, and
+% under each tree the probability P(d_i) of that derivation. A fourth, dotted
+% arrow and "..." stand for the remaining derivations.
+\begin{tikzpicture}
+\begin{scope}
+\node [] (sent) at (0,0) {\textbf{猫 喜欢 吃 鱼}};
+\end{scope}
+% Derivation d_1 (left). grow'=up makes the trees grow upwards.
+\begin{scope}[xshift=-8em,yshift=-9em,sibling distance=-5pt,level distance=17pt,grow'=up]
+{\footnotesize
+\Tree[.IP
+          [.VP
+               [.VP
+                    [.NN 猫 ]
+                    [.VV 喜欢 ]
+               ]
+               [.VV 吃 ]
+          ]
+          [.NP
+               [.NN 鱼 ]
+          ]
+     ]
+}
+\end{scope}
+% Derivation d_2 (middle).
+\begin{scope}[xshift=-1em,yshift=-9em,sibling distance=-5pt,level distance=17pt,grow'=up]
+{\footnotesize
+\Tree[.IP
+          [.NP
+               [.NN 猫 ]
+          ]
+          [.VP
+               [.VV 喜欢 ]
+               [.VP
+                     [.VV 吃 ]
+                     [.NN 鱼 ]
+               ]
+          ]
+     ]
+}
+\end{scope}
+% Derivation d_3 (right).
+\begin{scope}[xshift=8em,yshift=-9em,sibling distance=-5pt, level distance=17pt,grow'=up]
+{\footnotesize
+\Tree[.IP
+          [.NP
+               [.NN 猫 ]
+          ]
+          [.VP
+              [.VP
+                   [.VV 喜欢 ]
+                   [.VV 吃 ]
+              ]
+              [.NP
+                   [.NN 鱼 ]
+              ]
+          ]
+     ]
+}
+\end{scope}
+% Curved arrows from the sentence to each derivation; the dotted one leads to "...".
+\draw [->,thick,ublue] ([xshift=-2em]sent.south) ..controls + (south:2em) and +(north:2em).. ([xshift=-8em,yshift=-2em]sent.south);
+\draw [->,thick,ublue] ([xshift=-1em]sent.south) ..controls + (south:2em) and +(north:2em).. ([xshift=-2em,yshift=-2em]sent.south);
+\draw [->,thick,ublue] ([xshift=0em]sent.south) ..controls + (south:2em) and +(north:2em).. ([xshift=6.5em,yshift=-2em]sent.south);
+\draw [->,thick,ublue,dotted] ([xshift=1em]sent.south) ..controls + (south:1.5em) and +(north:2.5em).. ([xshift=12.5em,yshift=-2em]sent.south);
+\node [anchor=north west] (others) at (11.8em,-3em) {...};
+% Probability captions. Each caption is assembled from three nodes:
+% "P(" + derivation name + ")=value".
+\node [] (d1) at (-9em,-10em) {$d_1$};
+\node [] (d2) at (0em,-10em) {$d_2$};
+% Fixed: the third derivation was captioned $d_2$ a second time.
+\node [] (d3) at (8.5em,-10em) {$d_3$};
+\node [anchor=east] (d1p) at ([xshift=0.4em]d1.west) {$\textrm{P}($};
+\node [anchor=west] (d1p2) at ([xshift=-0.4em]d1.east) {$)=0.0123$};
+\node [anchor=east] (d2p) at ([xshift=0.4em]d2.west) {$\textrm{P}($};
+\node [anchor=west] (d2p2) at ([xshift=-0.4em]d2.east) {$)=0.4031$};
+\node [anchor=east] (d3p) at ([xshift=0.4em]d3.west) {$\textrm{P}($};
+\node [anchor=west] (d3p2) at ([xshift=-0.4em]d3.east) {$)=0.0056$};
+\end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-process-of-statistical-syntax-analysis.tex
+++ b/Chapter3/Figures/figure-process-of-statistical-syntax-analysis.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline: three framed boxes from left to right (sample trees, model P(.), sentence-analysis box), with labelled arrows leaving the first two
+%-------------------------------------------------------------------------
+\begin{tikzpicture}
+{\scriptsize
+\begin{scope}[level distance=10pt,xshift=1.5em,yshift=-0.5em] % first sample tree of the data box
+{\tiny
+\Tree[.\node[inner sep=1pt](t1n1){IP};
+     [.\node[inner sep=1pt](t1n2){VP};
+         [.\node[inner sep=1pt](t1n3){VV}; \node[](t1w1){看}; ]
+     ]
+     [.\node[inner sep=1pt](t1n4){NP};
+         [.\node[inner sep=1pt](t1n5){M}; 个 ]
+         [.\node[inner sep=1pt](t1n6){NN}; 例子 ]
+     ]
+]
+}
+\end{scope}
+\begin{scope}[level distance=10pt,xshift=6.5em,yshift=-0.5em] % second sample tree, shifted 6.5em
+{\tiny
+\Tree[.\node[inner sep=1pt](t2n1){IP};
+     [.\node[inner sep=1pt](t2n2){NP};
+         [.\node[inner sep=1pt](t2n3){PN}; 你 ]
+     ]
+     [.\node[inner sep=1pt](t2n4){VP};
+         [.\node[inner sep=1pt](t2n5){VV}; 看到 ]
+         [.\node[inner sep=1pt](t2n6){AS}; \node[](t2wn){了}; ]
+     ]
+]
+}
+\end{scope}
+\node [anchor=west] (othertrees) at ([xshift=1em]t2n1.east) {...};
+\node [anchor=south west] (corpuslabel) at ([xshift=-1em]t1n1.north west) {{\color{ublue} \textbf{学习用数据}}};
+\begin{pgfonlayer}{background}
+\node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (t1n1) (t1w1) (t2wn) (othertrees) (corpuslabel)] (corpus) {}; % white, shadowed frame on the background layer around the trees and their caption
+\end{pgfonlayer}
+}
+\node [anchor=west,ugreen] (P) at ([xshift=5.95em,yshift=-0.8em]corpus.east){\large{P($\cdot$)}}; % content of the model box, placed to the right of the corpus frame
+\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{统计分析模型}}}};
+\begin{pgfonlayer}{background}
+\node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (P) (modellabel)] (model) {};
+\end{pgfonlayer}
+\draw [->,very thick,ublue] ([xshift=0.2em]corpus.east) -- ([xshift=4.2em]corpus.east)  node [pos=0.5, above] {\color{red}{\scriptsize{统计学习}}}; % 4em arrows leaving the corpus frame and the model frame, each with a red caption above
+\draw [->,very thick,ublue] ([xshift=0.2em]model.east) -- ([xshift=4.2em]model.east)  node [pos=0.5, above] {\color{red}{\scriptsize{搜索\&计算}}};
+{\scriptsize
+\node [anchor=north west] (sentlabel) at ([xshift=6.2em,yshift=-1em]model.north east) {{\color{ublue} {\scriptsize \textbf{统计分析模型}}}}; % NOTE(review): same caption text as (modellabel) although this box is named (parser) - possibly a copy-paste slip, confirm the intended title
+\node [anchor=north west] (sent) at (sentlabel.south west) {\textbf{对任意句子进行分析}};
+}
+\begin{pgfonlayer}{background}
+\node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (sentlabel) (sent)] (parser) {};
+\end{pgfonlayer}
+\end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-process-sequence-labeling-by-classfication.tex
+++ b/Chapter3/Figures/figure-process-sequence-labeling-by-classfication.tex
+\begin{tikzpicture}
+	% Two rows of four circles (green on top, red below), each row inside a dashed
+	% frame, with a classifier box between the rows and arrows pointing upwards
+	% from the red frame to the classifier and from the classifier to the green frame.
+	\tikzset{
+		unit/.style={draw,minimum size=1em,circle},
+		group box/.style={rectangle,inner sep=2pt,rounded corners=4pt,dashed,line width=1.5pt}
+	}
+	% Top row: each green circle sits 1.8em to the right of the previous one.
+	\node[unit,fill=green!20] (g1) at (0,0){};
+	\foreach \prev/\cur in {g1/g2,g2/g3,g3/g4}{
+		\node[anchor=west,unit,fill=green!20] (\cur) at ([xshift=1.8em]\prev.east){};
+	}
+	% Bottom row: one red circle 4em below every green circle.
+	\foreach \idx in {1,...,4}{
+		\node[anchor=north,unit,fill=red!30] (r\idx) at ([yshift=-4em]g\idx.south){};
+	}
+	% Dashed frames around both rows, kept on the background layer.
+	\begin{pgfonlayer}{background}
+		\node[draw=green!20,group box,fit=(g1)(g2)(g3)(g4)] (box1) {};
+		\node[draw=red!30,group box,fit=(r1)(r2)(r3)(r4)] (box2) {};
+	\end{pgfonlayer}
+	% Classifier box, 1em below the green frame.
+	\node[anchor=north,draw,inner sep=2pt,rounded corners=2pt,fill=blue!30,minimum width=6em] (cla) at ([yshift=-1em]box1.south){分类器};
+	% Row captions, horizontally offset by 1.4em from the second circle of each row.
+	\node[anchor=south,font=\scriptsize] at ([yshift=0.4em,xshift=1.4em]g2.north){(待预测标签)};
+	\node[anchor=north,font=\scriptsize] at ([yshift=-0.4em,xshift=1.4em]r2.south){(待标注标签)};
+	\draw[->,thick] (cla.north) -- (box1.south);
+	\draw[->,thick] (box2.north) -- (cla.south);
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-process-sequence-labeling-by-crf.tex
+++ b/Chapter3/Figures/figure-process-sequence-labeling-by-crf.tex
+\begin{tikzpicture}
+	% A row of four green circles joined by undirected edges, all of them also
+	% joined to one single red circle placed below the row.
+	\tikzset{unit/.style={draw,minimum size=1em,circle}}
+	% Green row: each circle sits 1.8em to the right of the previous one.
+	\node[unit,fill=green!20] (g1) at (0,0){};
+	\foreach \prev/\cur in {g1/g2,g2/g3,g3/g4}{
+		\node[anchor=west,unit,fill=green!20] (\cur) at ([xshift=1.8em]\prev.east){};
+	}
+	% The single red circle, positioned relative to the second green circle.
+	\node[anchor=north,unit,fill=red!30] (r1) at ([yshift=-1.8em,xshift=1.4em]g2.south){};
+	% Captions above the green row and below the red circle.
+	\node[anchor=south,font=\scriptsize] at ([yshift=0.4em,xshift=1.4em]g2.north){(待预测标签)};
+	\node[anchor=north,font=\scriptsize] at ([yshift=-0.4em]r1.south){(待标注标签)};
+	% Edges between neighbouring green circles (no arrow tips).
+	\foreach \prev/\cur in {g1/g2,g2/g3,g3/g4}{
+		\draw[-,thick] (\prev.east) -- (\cur.west);
+	}
+	% Edges from every green circle to the red circle.
+	\foreach \gnode in {g1,g2,g3,g4}{
+		\draw[-,thick] (\gnode.south) -- (r1.north);
+	}
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-process-sequence-labeling-by-hmm.tex
+++ b/Chapter3/Figures/figure-process-sequence-labeling-by-hmm.tex
+\begin{tikzpicture}
+	% A row of four green circles linked left to right by arrows, each with an
+	% arrow down to its own red circle.
+	\tikzstyle{unit} = [draw,minimum size=1em,circle]
+		% Green row. Fixed: g1 was filled with green!2 (almost white) while the
+		% other three circles use green!20.
+		\node[unit,fill=green!20] (g1) at (0,0){};
+		\node[anchor=west,unit,fill=green!20]	(g2)at([xshift=1.8em]g1.east){};
+		\node[anchor=west,unit,fill=green!20]	(g3)at([xshift=1.8em]g2.east){};
+		\node[anchor=west,unit,fill=green!20]	(g4)at([xshift=1.8em]g3.east){};
+		% Red row: one circle 1.8em below every green circle.
+		\node[anchor=north,unit,fill=red!30]	(r1)at([yshift=-1.8em]g1.south){};
+		\node[anchor=north,unit,fill=red!30]	(r2)at([yshift=-1.8em]g2.south){};
+		\node[anchor=north,unit,fill=red!30]	(r3)at([yshift=-1.8em]g3.south){};
+		\node[anchor=north,unit,fill=red!30]	(r4)at([yshift=-1.8em]g4.south){};
+		% Row captions, horizontally offset by 1.4em from the second circle of each row.
+		\node[anchor=south,font=\scriptsize] at ([yshift=0.4em,xshift=1.4em]g2.north){(待预测标签)};
+		\node[anchor=north,font=\scriptsize] at ([yshift=-0.4em,xshift=1.4em]r2.south){(待标注标签)};
+		% Arrows along the green row.
+		\draw[->,thick] (g1.east) -- (g2.west);
+		\draw[->,thick] (g2.east) -- (g3.west);
+		\draw[->,thick] (g3.east) -- (g4.west);
+		% Arrows from each green circle to the red circle below it.
+		\draw[->,thick] (g1.south) -- (r1.north);
+		\draw[->,thick] (g2.south) -- (r2.north);
+		\draw[->,thick] (g3.south) -- (r3.north);
+		\draw[->,thick] (g4.south) -- (r4.north);
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-rules-of-grammar.tex
+++ b/Chapter3/Figures/figure-rules-of-grammar.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline
+%-------------------------------------------------------------------------
+% Eight grammar rules r_1..r_8 laid out in two columns and colour-coded by
+% kind, followed by a three-line legend explaining each colour.
+\begin{tikzpicture}
+\small
+% Rules. Each rule is declared once, directly in its final colour. An earlier
+% version first drew an unfilled copy of all eight nodes with the same names at
+% the same positions; that copy was entirely covered by the filled nodes below.
+% Left column: r1, r3, r5, r7. Right column (6em right of r1): r2, r4, r6, r8.
+\node [anchor=west,inner sep=2pt,fill=blue!20] (r1) at (0,0) {$r_1$: NN $\to$ 猫};
+\node [anchor=west,inner sep=2pt,fill=blue!20] (r2) at ([xshift=6em]r1.east) {$r_2$: VV $\to$ 喜欢};
+\node [anchor=north west,inner sep=2pt,fill=blue!20] (r3) at ([yshift=-0.2em]r1.south west) {$r_3$: VV $\to$ 吃};
+\node [anchor=north west,inner sep=2pt,fill=blue!20] (r4) at ([yshift=-0.2em]r2.south west) {$r_4$: NN $\to$ 鱼};
+\node [anchor=north west,inner sep=2pt,fill=red!20] (r5) at ([yshift=-0.2em]r3.south west) {$r_5$: NP $\to$ NN};
+\node [anchor=north west,inner sep=2pt,fill=green!20] (r6) at ([yshift=-0.2em]r4.south west) {$r_6$: VP $\to$ VV NN};
+\node [anchor=north west,inner sep=2pt,fill=green!20] (r7) at ([yshift=-0.2em]r5.south west) {$r_7$: VP $\to$ VV VP};
+\node [anchor=north west,inner sep=2pt,fill=green!20] (r8) at ([yshift=-0.2em]r6.south west) {$r_8$: IP $\to$ NP VP};
+% Legend: a coloured rule list followed by its explanation, one line per colour.
+\node [anchor=north west,fill=blue!20] (sent1) at ([yshift=-0.4em]r7.south west) {$r_1,r_2,r_3,r_4$};
+\node [anchor=west] (sent1part2) at (sent1.east) {为生成单词词性的规则};
+\node [anchor=north west,fill=red!20] (sent2) at ([yshift=-0.2em]sent1.south west) {$r_5$};
+\node [anchor=west] (sent2part2) at (sent2.east) {为单变量规则，它将词性NN进一步抽象为名词短语NP};
+\node [anchor=north west,fill=green!20] (sent3) at ([yshift=-0.2em]sent2.south west){$r_6,r_7,r_8$};
+\node [anchor=west] (sent3part2) at (sent3.east) {为句法结构规则，比如$r_8$表示了主(NP)+谓(VP)结构};
+\end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-transition-prob-and-launch-prob-in-coin-toss-game.tex
+++ b/Chapter3/Figures/figure-transition-prob-and-launch-prob-in-coin-toss-game.tex
+% Figure: transition and emission probability tables of the coin-toss HMM example.
+% Two tables are drawn side by side, each in its own scope: the transition
+% table at the origin and the emission table shifted 8cm to the right.
+% NOTE(review): \bfnew, \funp, the `fit' key and the `background' pgf layer are
+% not defined in this file -- presumably provided by the including document; confirm.
+\begin{tikzpicture}
+	% ---------------- left table: transition probabilities ----------------
+	\begin{scope}
+	% (o): empty top-left header cell; every other cell is positioned relative to it
+	\node[minimum width=3em,minimum height=1.5em] (o) at (0,0){};
+	% column headers (ca, cb, cc), chained left to right
+	\node[anchor=west,inner sep=0pt] (ca) at ([yshift=0.2em,xshift=1.4em]o.east){\scriptsize\bfnew{硬币A}};
+	\node[anchor=west,inner sep=0pt] (cb) at ([xshift=1.4em]ca.east){\scriptsize\bfnew{硬币B}};
+	\node[anchor=west,inner sep=0pt] (cc) at ([xshift=1.4em]cb.east){\scriptsize\bfnew{硬币C}};
+	% row headers (ra, rb, rc), chained top to bottom
+	\node[anchor=north,inner sep=0pt] (ra) at ([yshift=-0.6em,xshift=-0.4em]o.south){\scriptsize\bfnew{硬币A}};
+	\node[anchor=north,inner sep=0pt] (rb) at ([yshift=-1.4em]ra.south){\scriptsize\bfnew{硬币B}};
+	\node[anchor=north,inner sep=0pt] (rc) at ([yshift=-1.4em]rb.south){\scriptsize\bfnew{硬币C}};
+	% table cells n<row><col>, laid out column by column below each column header;
+	% every entry is 1/3
+	\node[anchor=north,inner sep=0pt] (n11) at ([yshift=-0.9em]ca.south){\small{$\frac{1}{3}$}};
+	\node[anchor=north,inner sep=0pt] (n21) at ([yshift=-1em]n11.south){\small{$\frac{1}{3}$}};
+	\node[anchor=north,inner sep=0pt] (n31) at ([yshift=-1em]n21.south){\small{$\frac{1}{3}$}};
+	\node[anchor=north,inner sep=0pt] (n12) at ([yshift=-0.9em]cb.south){\small{$\frac{1}{3}$}};
+	\node[anchor=north,inner sep=0pt] (n22) at ([yshift=-1em]n12.south){\small{$\frac{1}{3}$}};
+	\node[anchor=north,inner sep=0pt] (n32) at ([yshift=-1em]n22.south){\small{$\frac{1}{3}$}};
+	\node[anchor=north,inner sep=0pt] (n13) at ([yshift=-0.9em]cc.south){\small{$\frac{1}{3}$}};
+	\node[anchor=north,inner sep=0pt] (n23) at ([yshift=-1em]n13.south){\small{$\frac{1}{3}$}};
+	\node[anchor=north,inner sep=0pt] (n33) at ([yshift=-1em]n23.south){\small{$\frac{1}{3}$}};
+	% diagonal split of the header cell, with one axis label on each side of it
+	\draw[thick] (o.north west) -- (o.south east);
+	\node[anchor=south west] at ([yshift=-1em,xshift=-1.4em]o.45){\tiny{第$i+1$次}};
+	\node[anchor=north east] at ([yshift=1em,xshift=1em]o.-135){\tiny{第$i$次}};
+	% shaded rounded box behind the table, fitted around (o), (n33) and (cc)
+	\begin{pgfonlayer}{background}
+        	\node [rectangle,inner sep=0.5em,rounded corners=2pt,fill=red!10] [fit = (o)(n33)(cc) ] (box0) {};
+    	\end{pgfonlayer}
+   % table title, centred above the shaded box
+   \node[anchor=south] at (box0.north){\scriptsize{转移概率$\funp{P}$(第$i+1$次|第$i$次)}};
+	\end{scope}
+	% ---------------- right table: emission probabilities ----------------
+	% node names are reused from the left table; the later definitions take over
+	\begin{scope}[xshift=8cm]
+	\node[minimum width=3em,minimum height=1.5em] (o) at (0,0){};
+	% column headers: the two visible outcomes
+	\node[anchor=west,inner sep=0pt] (ca) at ([yshift=0.2em,xshift=1.4em]o.east){\scriptsize\bfnew{正面}};
+	\node[anchor=west,inner sep=0pt] (cb) at ([xshift=1.4em]ca.east){\scriptsize\bfnew{反面}};
+	% row headers: the three coins
+	\node[anchor=north,inner sep=0pt] (ra) at ([yshift=-0.6em,xshift=-0.4em]o.south){\scriptsize\bfnew{硬币A}};
+	\node[anchor=north,inner sep=0pt] (rb) at ([yshift=-1.5em]ra.south){\scriptsize\bfnew{硬币B}};
+	\node[anchor=north,inner sep=0pt] (rc) at ([yshift=-1.5em]rb.south){\scriptsize\bfnew{硬币C}};
+	% first column (0.3, 0.5, 0.7) and second column (0.7, 0.5, 0.3); each row sums to 1
+	\node[anchor=north,inner sep=0pt] (n11) at ([yshift=-1.2em]ca.south){\footnotesize{$0.3$}};
+	\node[anchor=north,inner sep=0pt] (n21) at ([yshift=-1.7em]n11.south){\footnotesize{$0.5$}};
+	\node[anchor=north,inner sep=0pt] (n31) at ([yshift=-1.7em]n21.south){\footnotesize{$0.7$}};
+	\node[anchor=north,inner sep=0pt] (n12) at ([yshift=-1.2em]cb.south){\footnotesize{$0.7$}};
+	\node[anchor=north,inner sep=0pt] (n22) at ([yshift=-1.7em]n12.south){\footnotesize{$0.5$}};
+	\node[anchor=north,inner sep=0pt] (n32) at ([yshift=-1.7em]n22.south){\footnotesize{$0.3$}};
+	% diagonal split of the header cell, with one axis label on each side of it
+	\draw[thick] (o.north west) -- (o.south east);
+	\node[anchor=south west] at ([yshift=-1em,xshift=-1.4em]o.45){\tiny{可见}};
+	\node[anchor=north east] at ([yshift=1em,xshift=1em]o.-135){\tiny{隐藏}};
+	% shaded box; unlike the left table, the fit list also names (rc) and (cb)
+	\begin{pgfonlayer}{background}
+        	\node [rectangle,inner sep=0.5em,rounded corners=2pt,fill=red!10] [fit = (o)(n32)(rc)(cb) ] (box1) {};
+    	\end{pgfonlayer}
+   % table title, centred above the shaded box
+   \node[anchor=south] at (box1.north){\scriptsize{发射概率$\funp{P}$(可见状态|隐藏状态)}};
+	\end{scope}
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter3/Figures/figure-two-different-derivation-of-regulation.tex
+++ b/Chapter3/Figures/figure-two-different-derivation-of-regulation.tex
+%%% Figure: one parse tree and two different derivations that generate it
+% Layout: three top-aligned minipages side by side
+%   (1) the parse tree, (2) derivation 1, (3) derivation 2.
+% Both derivations use the same rules r_1..r_8 and end in the same word
+% sequence; they differ only in the order in which the rules are applied.
+% NOTE(review): \Tree is presumably provided by tikz-qtree and \overset by
+% amsmath, both loaded by the including document -- confirm.
+%-------------------------------------------------------------------------
+% (1) parse tree; the leaf words are named sw1..sw4 from left to right so that
+%     each leaf can be referenced unambiguously
+\begin{minipage}[t]{0.32\linewidth}
+\vspace{1.5em}
+\begin{tikzpicture}
+\begin{scope}[sibling distance=3pt, level distance=25pt]
+{\small
+\Tree[.IP
+          [.NP
+               [.NN \node(sw1){猫}; ]
+          ]
+          [.VP
+               [.VV \node(sw2){喜欢}; ]
+               [.VP
+                     [.VV \node(sw3){吃}; ]
+                     [.NN \node(sw4){鱼}; ]
+               ]
+          ]
+     ]
+}
+\end{scope}
+\end{tikzpicture}
+\end{minipage}
+\hfill
+% (2) derivation 1: every step rewrites the leftmost non-terminal
+\begin{minipage}[t]{0.32\linewidth}
+{\small
+\begin{eqnarray}
+& & \textrm{\textbf{推导1}} \nonumber \\
+& & \textrm{IP} \nonumber \\
+& \overset{r_8}{\Rightarrow} & \textrm{NP VP} \nonumber \\
+& \overset{r_5}{\Rightarrow} & \textrm{NN VP} \nonumber \\
+& \overset{r_1}{\Rightarrow} & \textrm{猫 VP} \nonumber \\
+& \overset{r_7}{\Rightarrow} & \textrm{猫 VV VP} \nonumber \\
+& \overset{r_2}{\Rightarrow} & \textrm{猫 喜欢 VP} \nonumber \\
+& \overset{r_6}{\Rightarrow} & \textrm{猫 喜欢 VV NN} \nonumber \\
+& \overset{r_3}{\Rightarrow} & \textrm{猫 喜欢 吃 NN} \nonumber \\
+& \overset{r_4}{\Rightarrow} & \textrm{猫 喜欢 吃 鱼} \nonumber
+\end{eqnarray}
+}
+\end{minipage}
+\hfill
+% (3) derivation 2: the same rules applied in a different order
+\begin{minipage}[t]{0.32\linewidth}
+{\small
+\begin{eqnarray}
+& & \textrm{\textbf{推导2}} \nonumber \\
+& & \textrm{IP} \nonumber \\
+& \overset{r_8}{\Rightarrow} & \textrm{NP VP} \nonumber \\
+& \overset{r_7}{\Rightarrow} & \textrm{NP VV VP} \nonumber \\
+& \overset{r_2}{\Rightarrow} & \textrm{NP 喜欢 VP} \nonumber \\
+& \overset{r_6}{\Rightarrow} & \textrm{NP 喜欢 VV NN} \nonumber \\
+& \overset{r_4}{\Rightarrow} & \textrm{NP 喜欢 VV 鱼} \nonumber \\
+& \overset{r_5}{\Rightarrow} & \textrm{NN 喜欢 VV 鱼} \nonumber \\
+& \overset{r_3}{\Rightarrow} & \textrm{NN 喜欢 吃 鱼} \nonumber \\
+& \overset{r_1}{\Rightarrow} & \textrm{猫 喜欢 吃 鱼} \nonumber
+\end{eqnarray}
+}
+\end{minipage}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-usage-of-regulation.tex
+++ b/Chapter3/Figures/figure-usage-of-regulation.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% Figure: a single derivation step u => v obtained by applying rule r
+% Each highlighted node is drawn exactly once, already filled. An unfilled copy
+% at the same position would be completely covered by the fill, so none is drawn.
+% NOTE(review): \overset presumably comes from amsmath in the including document -- confirm.
+%-------------------------------------------------------------------------
+\begin{tikzpicture}
+\begin{scope}
+% the formula that the three black dotted arrows below point into
+\node [anchor=west] (arule) at (0,0) {\large{$u \overset{r}{\Rightarrow} v$}};
+% u: the string before rewriting; its first symbol (orange) is the one the rule replaces
+\node [anchor=west,inner sep=2pt,fill=orange!40] (u) at ([xshift=-4em,yshift=-4em]arule.west) {VV};
+\node [anchor=west] (upart2) at (u.east) {NN};
+\node [anchor=east] (unumber) at (u.west) {$u:$};
+% r: the rule; left-hand side in orange, right-hand side in red
+\node [anchor=west,inner sep=2pt,fill=orange!40] (r) at ([xshift=-3em,yshift=2.5em]u.west) {VV};
+\node [anchor=west] (rpart2) at (r.east) {$\to$};
+\node [anchor=east] (rnumber) at (r.west) {$r:$};
+\node [anchor=west,inner sep=2pt,fill=red!20] (rpart3) at (rpart2.east) {吃};
+% v: the string after rewriting; the newly produced symbol is in red
+\node [anchor=west] (v) at ([xshift=5.5em]u.east) {$v$:};
+\node [anchor=west,inner sep=2pt,fill=red!20] (vpart2) at (v.east) {吃};
+\node [anchor=west] (vpart3) at (vpart2.east) {NN};
+\node [anchor=west] (arrow) at ([xshift=3em]u.east) {$\Rightarrow$};
+% dotted arrows tying u, v and r to their symbols in the formula
+\draw [<-,dotted,thick] ([xshift=0.6em]arule.south west) ..controls +(south:0.7) and +(north:0.7).. ([xshift=-1.0em]upart2.north east);
+\draw [<-,dotted,thick] ([xshift=-0.6em]arule.south east) ..controls +(south:0.7) and +(north:0.7).. ([xshift=1.0em]v.north west);
+\draw [<-,dotted,thick] ([xshift=-0.4em,yshift=-0.5em]arule.north) ..controls +(180:0.7) and +(50:0.7) .. ([xshift=1em]rpart2.north west);
+% blue arrow from the rule's left-hand side to the matching symbol in u
+\draw [->,dotted,thick,ublue] (r.south) ..controls +(south:0.5) and +(north:0.5).. ([yshift=0]u.north);
+\end{scope}
+\end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/Figures/figure-word-segmentation-based-on-statistics.tex
+++ b/Chapter3/Figures/figure-word-segmentation-based-on-statistics.tex
+\definecolor{ublue}{rgb}{0.152,0.250,0.545}
+\definecolor{ugreen}{rgb}{0,0.5,0}
+%%% outline
+% Figure: pipeline of statistics-based word segmentation, drawn left to right:
+%   training-data box -> statistical-model box -> candidate segmentations of a
+%   new sentence with their probabilities, the selected one framed in red.
+% NOTE(review): \funp, the `fit' and `drop shadow' keys and the `background'
+% pgf layer are not defined in this file -- presumably provided by the
+% including document; confirm.
+%-------------------------------------------------------------------------
+\vspace{-0.5em}
+\begin{tikzpicture}
+% --- training data: segmented example sentences stacked top to bottom ---
+{\scriptsize
+{
+\node [anchor=north west] (entry1) at (0,0) {\textbf{1:} 这 / 是 / 数据};
+\node [anchor=north west] (entry2) at ([yshift=0.1em]entry1.south west) {\textbf{2:} 现在 / 已经 / 实现};
+\node [anchor=north west] (entry3) at ([yshift=0.1em]entry2.south west) {\textbf{3:} 确实 / 有 / 很 / 多};
+\node [anchor=north west] (entry4) at ([yshift=0.1em]entry3.south west) {...};
+\node [anchor=south west] (corpuslabel) at (entry1.north west) {{\color{ublue} \textbf{学习用数据}}};
+}
+% framed box with shadow, fitted around the entries and their label, on the background layer
+\begin{pgfonlayer}{background}
+{
+\node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (entry1) (entry2) (entry3) (entry4) (corpuslabel)] (corpus) {};
+}
+\end{pgfonlayer}
+}
+% --- statistical model: the symbol P(.) with a label above it ---
+{
+\node [anchor=west,ugreen] (P) at ([xshift=5.2em,yshift=-0.8em]corpus.east){\large{\funp{P}($\cdot$)}};
+\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{统计模型}}}};
+}
+% framed box with shadow around the model symbol and its label
+\begin{pgfonlayer}{background}
+{
+\node[rectangle,draw=ublue,thick,inner sep=0.2em,fill=white,drop shadow] [fit = (P) (modellabel)] (model) {};
+}
+\end{pgfonlayer}
+% --- arrow leaving the data box to the right, labelled above its midpoint ---
+{
+\draw [->,very thick,ublue] ([xshift=0.2em]corpus.east) -- ([xshift=4.2em]corpus.east)  node [pos=0.5, above] {\color{red}{\scriptsize{统计学习}}};
+}
+% --- arrow leaving the model box to the right, labelled above its midpoint ---
+{
+\draw [->,very thick,ublue] ([xshift=0.2em]model.east) -- ([xshift=4.2em]model.east)  node [pos=0.5, above] {\color{red}{\scriptsize{搜索\&计算}}};
+}
+{\scriptsize
+% --- the new, unsegmented input sentence with its label ---
+{
+\node [anchor=north west] (sentlabel) at ([xshift=6.8em,yshift=2em]model.north east) {\color{red}{新的句子}};
+\node [anchor=north west] (sent) at (sentlabel.south west) {\textbf{确实现在数据很多}};
+}
+% --- three candidate segmentations, stacked below the sentence ---
+{
+\node [anchor=north west] (seg1) at ([xshift=1.0em]sent.south west) {确/实现/在/数/据很/多};
+\node [anchor=north west] (seg2) at (seg1.south west) {确实/现在/数据/很/多};
+\node [anchor=north west] (seg3) at (seg2.south west) {确实/现在/数/据/很/多};
+}
+% --- "P(" ... ")=value" wrapped around each candidate, plus a trailing ellipsis row ---
+{
+\node [anchor=north west] (seg4) at ([xshift=-1.0em,yshift=0.4em]seg3.south west) {...};
+\node [anchor=east,ugreen] (p1seg1) at ([xshift=0.5em]seg1.west) {P(};
+\node [anchor=west,ugreen] (p2seg1) at ([xshift=-0.5em]seg1.east) {)=0.1};
+\node [anchor=east,ugreen] (p1seg2) at ([xshift=0.5em]seg2.west) {P(};
+\node [anchor=west,ugreen] (p2seg2) at ([xshift=-0.5em]seg2.east) {)=0.6};
+\node [anchor=east,ugreen] (p1seg3) at ([xshift=0.5em]seg3.west) {P(};
+\node [anchor=west,ugreen] (p2seg3) at ([xshift=-0.5em]seg3.east) {)=0.2};
+}
+% --- dashed red frame anchored at the end of the second candidate's row (the 0.6 one),
+%     connected by an arrow to the output label ---
+{
+\node [anchor=east,draw,dashed,red,thick,minimum width=13em,minimum height=1.4em] (final) at (p2seg2.east) {};
+\node [anchor=west,red] (finallabel) at ([xshift=3.1em]sentlabel.east) {输出概率最大的结果};
+%\node [anchor=north east,red] (finallabel2) at ([yshift=0.5em]finallabel.south east) {的结果};
+\draw [->,thick,red] ([xshift=0.0em,yshift=-0.5em]final.north east) ..controls +(east:0.3) and +(south:0.0).. ([xshift=1.0em]finallabel.south);
+}
+}
+\end{tikzpicture}
+%---------------------------------------------------------------------
--- a/Chapter3/chapter3.tex
+++ b/Chapter3/chapter3.tex
@@ -15,16 +15,847 @@
 \renewcommand\figurename{图}%将figure改为图
 \renewcommand\tablename{表}%将table改为表
-\chapterimage{fig-NEU-3.jpg} % Chapter heading image
 %----------------------------------------------------------------------------------------
 %	CHAPTER 3
 %----------------------------------------------------------------------------------------
+\chapter{词法分析和语法分析基础}
-\chapter{词法分析和语法分析}
+\parinterval 机器翻译并非是一个孤立的系统，它依赖于很多模块，并且需要很多学科知识的融合。其中就会用到一些自然语言处理工具来对不同语言的文字进行分析。因此，在正式开始机器翻译内容的介绍之前，本章会对相关的词法分析和语法分析知识进行概述，包括：分词、命名实体识别、成分句法分析。它们都是自然语言处理中的经典问题，而且在机器翻译中也经常被使用。本章会重点介绍这些任务的定义和求解问题的思路。其中也会使用到统计建模方法，可以被看作是第二章内容的延伸。
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
 %----------------------------------------------------------------------------------------
-\section{}
+\section{问题概述}
+\parinterval 很多时候机器翻译系统被看作是孤立的``黑盒''系统（图\ref{fig:3.1-1}(a)）。将一段文本作为输入送入机器翻译系统之后，系统输出翻译好的译文。但是真实的机器翻译系统非常复杂，因为系统看到的输入和输出实际上只是一些符号串，这些符号并没有任何意义，因此需要进一步对这些符号串进行处理才能更好的使用它们。比如，需要定义翻译中最基本的单元是什么？符号串是否具有结构信息？如何用数学工具刻画这些基本单元和结构？
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+ 	\subfigure[机器翻译系统被看作一个黑盒] {\input{./Chapter3/Figures/figure-mt-system-as-a-black-box}  }
+ 	\subfigure[机器翻译系统 = 前/后处理 + 翻译引擎] {\input{./Chapter3/Figures/figure-mt=language-analysis+translation-engine}}
+	\caption{机器翻译系统的结构}
+    \label{fig:3.1-1}
+\end{figure}
+%-------------------------------------------
+\parinterval 图\ref{fig:3.1-1}(b)展示了一个机器翻译系统的输入和输出形式。可以看到，输入的中文词串``猫喜欢吃鱼''被加工成一个新的结构（图\ref{fig:3.1-2}）。直觉上，这个结构有些奇怪，因为上面多了很多新的符号，而且还有一些线将不同符号进行连接。实际上这就是语言分析中对句子常用的结构表示\ \dash \ 短语结构树。这里会涉及两方面问题：
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\bfnew{分词}}\index{分词}（Segmentation）\index{Segmentation}：这个过程会把词串进行切分，切割成最小的单元。因为只有知道了什么是待处理字符串的最小单元，机器翻译系统才能对其进行表示、分析和生成。
+\vspace{0.5em}
+\item {\small\bfnew{句法分析}}\index{句法分析}（Parsing）\index{Parsing}：这个过程会对分词的结果进行进一步分析。比如，可以对句子进行浅层分析，得到一些句子中实体的信息（如人名、地名等）。也可以对句子进行更深层次的分析，得到完整的句法结构，类似于图\ref{fig:3.1-2}中的结果。这种结构可以被看作是对句子的进一步抽象，被称为成分句法树，比如，NP+VP就可以表示由名词短语（NP）和动词短语（VP）构成的主谓结构。利用这些信息，机器翻译可以更加准确地对语言的结构进行分析和生成。
+\vspace{0.5em}
+\end{itemize}
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-analysis-of-sentence-participle&syntactic}
+\caption{中文句子``猫喜欢吃鱼''的分析结果（分词和句法分析）}
+\label{fig:3.1-2}
+\end{figure}
+%-------------------------------------------
+\parinterval 类似地，机器翻译输出的结果也可以包含同样的信息。甚至系统输出英文译文之后，还有一个额外的步骤来把部分英文单词的大小写恢复出来，比如，上例中句首单词``Cats''的首字母要大写。
+\parinterval 一般来说，在送入机器翻译系统前需要对文字序列进行处理和加工，这个过程被称为{\small\bfnew{预处理}}\index{预处理}（Preprocessing）\index{Preprocessing}。同理，在机器翻译模型输出译文后进行的处理被称作{\small\bfnew{后处理}}\index{后处理}（Postprocessing）\index{Postprocessing}。这两个过程对机器翻译性能影响很大，比如，在神经机器翻译里，不同的分词策略可能会造成翻译性能的天差地别。
+\parinterval 值得注意的是，有些观点认为，对于机器翻译来说，不论是分词还是句法分析，并不要求符合人的认知和语言学约束。换句话说，机器翻译所使用的``单词''和``结构''本身并不是为了符合人类的解释，它们更直接目的是为了进行翻译。从系统开发的角度，有时候即使使用一些与人类的语言习惯有差别的处理，仍然会带来性能的提升，比如在神经机器翻译中，在传统分词的基础上进一步使用{\small\bfnew{双字节编码}}\index{双字节编码}（Byte Pair Encoding，BPE）\index{Byte Pair Encoding}子词切分会使得机器翻译性能大幅提高。当然，自然语言处理中语言学信息的使用一直是学界关注的焦点。甚至关于语言学结构对机器翻译是否有作用这个问题也有争论。但是不能否认的是，无论是语言学的知识，还是计算机自己学习到的知识，对机器翻译都是有价值的。在后续章节会看到，这两种类型的知识对机器翻译帮助很大。
+\parinterval 剩下的问题是如何进行句子的切分和结构的分析。思路有很多，一种常用的方法是对问题进行概率化，用统计模型来描述问题并求解之。比如，一个句子切分的好坏，并不是非零即一的判断，而是要估计出这种切分的可能性大小，最终选择可能性最大的结果进行输出。这也是一种典型的用统计建模的方式来描述自然语言处理问题的方法。
+\parinterval 本章将会对上述问题及求解问题的方法进行介绍。并将统计建模应用到中文分词、命名实体识别和成分句法分析等任务。
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
+\section{中文分词}
+\parinterval 对于机器翻译系统而言，输入的是已经切分好的单词序列，而不是原始的字符串（图\ref{fig:3.2-1}）。比如，对于一个中文句子，单词之间是没有间隔的，因此需要把一个个的单词切分出来，这样机器翻译系统可以区分不同的翻译单元。甚至，可以对语言学上的单词进行进一步切分，得到词片段序列（比如：中国人$\to$中国/人）。可以把上述过程看作是一种分词（Segmentation）过程，即：将一个输入的自然语言字符串切割成单元序列（token序列），每个单元都对应可以处理的最小单位。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-a-simple-pre-processing-process}
+\caption{一个简单的预处理流程}
+\label{fig:3.2-1}
+\end{figure}
+%-------------------------------------------
+\parinterval 分词得到的单元序列可以是语言学上的词序列，也可以是根据其他方式定义的基本处理单元。在本章中，把分词得到的一个个单元称为{\small\bfnew{单词}}\index{单词}（Word）\index{Word}，或词，尽管这些单元可以不是语言学上的完整单词。而这个过程也被称作{\small\bfnew{词法分析}}\index{词法分析}（Lexical Analysis）\index{Lexical Analysis}。除了汉语，词法分析在日语、泰语等单词之间无明确分割符的语言中有着广泛的应用，芬兰语、维吾尔语等一些形态学十分丰富的语言，也需要使用词法分析来解决复杂的词尾、词缀变化等形态学变化。
+\parinterval 在机器翻译中，分词系统的好坏往往会决定译文的质量。分词的目的是定义系统处理的基本单元，那么什么叫做``词'' 呢？关于词的定义有很多，比如：
+\vspace{0.5em}
+\begin{definition} 词
+\vspace{0.5em}
+语言里最小的可以独立运用的单位。
+\begin{flushright}——《新华字典》\end{flushright}
+单词（word），含有语义内容或语用内容，且能被单独念出来的最小单位。
+\begin{flushright}——《维基百科》\end{flushright}
+語句中具有完整概念，能獨立自由運用的基本單位。
+\begin{flushright}——《国语辞典》\end{flushright}
+\end{definition}
+\parinterval 从语言学的角度来看，人们普遍认为词是可以单独运用的、包含意义的基本单位。这样可以使用有限的词组合出无限的句子，这也正体现出自然语言的奇妙之处。不过，机器翻译并不仅仅局限在语言学定义的单词。比如，神经机器翻译中广泛使用的BPE子词切分方法，可以被理解为将词的一部分也进行切开，也就是得到词片段送给机器翻译系统使用。比如，对如下英文字符串，可以得到如下切分结果：
+\parinterval Interesting \; $\to$ \; Interest/ing  selection \hspace{0.08em} $\to$ \;se/lect/ion  procession \hspace{0.43em} $\to$ \; pro/cess/ion
+\parinterval Interested \hspace{0.62em} $\to$ \; Interest/ed   selecting \hspace{0.34em} $\to$ \; se/lect/ing  processing \hspace{0.22em} $\to$ \; pro/cess/ing
+\parinterval Interests \hspace{1.17em} $\to$ \; Interest/s   selected \hspace{1.24em} $\to$ \; se/lect/ed   processed \hspace{0.82em} $\to$ \; pro/cess/ed 
+\vspace{0.5em}
+\parinterval 词法分析的重要性在自然语言处理领域已经有共识。如果切分的颗粒度很大，获得的单词的歧义也很小，比如``中华人民共和国''整体作为一个单词不存在歧义，而如果单独的一个单词``国''，可能会代表``中国''、``美国''等不同的国家，存在歧义。但是随着切分颗粒度的增大，特定单词出现的频度也随之降低，低频词容易和噪音混淆，系统很难进行学习。因此，处理这些问题并开发适合翻译任务的分词系统是机器翻译的第一步。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{基于词典的分词方法}
+\parinterval 计算机并不能像人类一样在概念上理解``词''，因此需要使用其他方式让计算机可以进行分词。一个最简单的方法就是给定一个词典，在这个词典中出现的汉字组合就是所定义的``词''。也就是，通过一个词典定义一个标准，符合这个标准定义的字符串都是合法的``词''。
+\parinterval 在使用基于词典的分词方法时，只需预先加载词典到计算机中，扫描输入句子，查询每个词串是否出现在词典中。如图\ref{fig:3.2-2}所示，有一个包含六个词的词典，给定输入句子``确实现在物价很高''后，分词系统自左至右遍历输入句子的每个字，发现词串``确实''在词典中出现，说明``确实''是一个``词''，进行分词操作并在切分该``词''之后重复这个过程。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-example-of-word-segmentation-based-on-dictionary}
+\caption{基于词典进行分词的实例}
+\label{fig:3.2-2}
+\end{figure}
+%-------------------------------------------
+\parinterval 但是，基于词典的分词方法很``硬''。这是因为自然语言非常灵活，经常出现歧义，用词典定义的合法单词之间有重叠的交叉型歧义就很难解决。图\ref{fig:3.2-3}就给出了上面例子中的交叉型歧义，从词典中查看，``实现''和``现在''都是合法的单词，但是在句子中二者有重叠，因此词典无法告诉系统哪个结果是正确的。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-cross-type-word-segmentation-ambiguity}
+\caption{交叉型分词歧义}
+\label{fig:3.2-3}
+\end{figure}
+%-------------------------------------------
+\parinterval 类似的例子在生活中也很常见。再比如``答辩结束的和尚未答辩的同学都请留在教室''一句中，正常的分词结果是``答辩/结束/的/和/尚未/答辩/的/同学/都/请/留在/教室''，但是由于``尚未''、``和尚''都是常见词汇，使用基于词典的分词方法在这时很容易出现切分错误。
+\parinterval 基于词典的分词方法是典型的基于规则的方法，完全依赖于人工给定的词典。在遇到歧义时，需要人工定义消除歧义的规则，比如，可以自左向右扫描每次匹配最长的单词，这是一种简单的启发式的消歧策略。图\ref{fig:3.2-2}中的例子实际上就是使用这种策略得到的分词结果。但是，启发式的消歧方法对人工的依赖程度很高，而且启发式规则也不能处理所有的情况。所以说简单的基于词典的方法还不能很好地解决分词问题。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{基于统计的分词方法}
+\parinterval 既然基于词典的方法有很多问题，那么就需要一种更为有效的方法。在上文中提到，想要搭建一个分词系统，需要让计算机知道什么是``词''，那么可不可以给出已经切分好的分词数据，让计算机在这些数据中学习到规律呢？答案是肯定的，利用``数据''来让计算机明白``词''的定义，让计算机直接在数据中学到知识，这就是一个典型的基于统计建模的学习过程。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{1. 统计模型的学习与推断}
+\parinterval 在分词任务中，数据驱动主要指用已经分词切分好的数据``喂''给系统，这个数据也被称作{\small\bfnew{标注数据}}\index{标注数据}（Annotated Data）\index{Annotated Data}。在获得标注数据后，系统自动学习一个统计模型来描述分词的过程，而这个模型会把分词的``知识''作为参数保存在模型中。当送入一个新的需要分词的句子时，可以利用学习到的模型对所有可能的分词结果进行预测，并进行概率化的描述，最终选择概率最大的结果作为输出。这个方法就是基于统计的分词方法。具体来说，可以分为两个步骤：
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\bfnew{训练}}\index{训练}（Training）\index{Training}。利用标注数据，对统计模型的参数进行学习。
+\vspace{0.5em}
+\item {\small\bfnew{推断}}\index{推断}（Inference）\index{Inference}。利用学习到的模型和参数，对新的句子进行切分。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 图\ref{fig:3.2-4}给出了一个基于统计建模的汉语分词实例。左侧是标注数据，其中每个句子是已经经过人工标注的分词结果（单词用斜杠分开）。之后，建立一个统计模型，记为$\funp{P}(\cdot)$。模型通过在标注数据上的学习来对问题进行描述，即学习$\funp{P}(\cdot)$。最后，对于新的未分词的句子，使用模型$\funp{P}(\cdot)$对每个可能的切分方式进行概率估计，之后选择概率最高的切分结果输出。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-word-segmentation-based-on-statistics}
+\caption{基于统计的自动分词流程}
+\label{fig:3.2-4}
+\end{figure}
+%-------------------------------------------
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{2. 全概率分词方法}
+\parinterval 上述过程的核心在于从标注数据中学习一种对分词现象的统计描述，即句子的分词结果概率$\funp{P}(\cdot)$。如何让计算机利用分好词的数据学习到分词知识呢？本书的{\chaptertwo}曾介绍如何对单词概率进行统计建模，而对分词现象的统计描述就是在单词概率的基础上，基于独立性假设获取的\footnote{即假定所有词的出现都是相互独立的。}。虽然独立性假设并不能完美描述分词过程中单词之间的关系，但是它大大化简了分词问题的复杂度。
+\parinterval 如图\ref{fig:3.2-5}所示，可以利用大量人工标注好的分词数据，通过统计学习方法获得一个统计模型$\funp{P}(\cdot)$，给定任意分词结果$W = w_1w_2 \ldots w_m$，都能通过$\funp{P}(W)=\funp{P}(w_1) \cdot \funp{P}(w_2) \cdot \ldots \cdot \funp{P}(w_m) 
+$计算这种切分的概率值。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-examples-of-chinese-word-segmentation-based-on-1-gram-model}
+\caption{基于1-gram语言模型的中文分词实例}
+\label{fig:3.2-5}
+\end{figure}
+%-------------------------------------------
+\parinterval 以``确实现在数据很多''这个实例来说，如果把这句话按照``确实/现在/数据/很/多''这样的方式进行切分，这个句子切分的概率$\funp{P}$(确实/现在/数据/很/多) 可以通过每个词出现概率相乘的方式进行计算。
+\begin{eqnarray}
+&\funp{P}&\textrm{(确实/现在/数据/很/多)} \nonumber \\
+& = &\funp{P}\textrm{(确实)} \cdot \funp{P}\textrm{(现在)} \cdot \funp{P}\textrm{(数据)} \cdot \funp{P}\textrm{(很)} \cdot \funp{P}\textrm{(多)} 
+\label{eq:3.2-1}
+\end{eqnarray}
+\parinterval 经过充分训练的统计模型$\funp{P}(\cdot)$就是得到的分词模型。对于输入的新句子S，通过这个模型找到最佳的分词结果$W^{*}$输出。假设输入句子S是``确实现在数据很多''，可以通过列举获得不同切分方式的概率，其中概率最高的切分方式，就是系统的目标输出。
+\parinterval 这种分词方法也被称作基于1-gram语言模型的分词，或全概率分词，使用标注好的分词数据进行学习，获得分词模型。这种方法最大的优点是整个学习过程（模型训练过程）和推断过程（处理新句子进行切分的过程）都是全自动进行的。这种方法虽然简单，但是其效率很高，因此被广泛应用在工业界系统里。
+\parinterval 当然，真正的分词系统还需要解决很多其他问题，比如使用动态规划等方法高效搜索最优解以及如何处理未见过的词等等，由于本节的重点是介绍中文分词的基础方法和统计建模思想，因此无法覆盖所有中文分词的技术内容，有兴趣的读者可以参考\ref{sec3:summary}节的相关文献做进一步深入研究。
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
+\section{命名实体识别}
+\parinterval 在人类使用语言的过程中，单词往往不是独立出现的。很多时候，多个单词会组合成一个更大的单元，来表达特定的意思。其中，最典型的代表是{\small\bfnew{命名实体}}\index{命名实体}（Named Entity）\index{Named Entity}。通常，命名实体是指名词性的专用短语，例如公司名称、品牌名称、产品名称等专有名词和行业术语。准确地识别出这些命名实体，是提高机器翻译质量的关键。比如，在技术文献中，往往需要对术语进行识别并进行准确翻译。因此引入{\small\bfnew{命名实体识别}}\index{命名实体识别}（Named Entity Recognition）\index{Named Entity Recognition}可以帮助系统对特定术语进行更加细致的处理。
+\parinterval 从句法分析的角度来说，命名实体识别是一种浅层句法分析任务。它在分词的基础上，进一步对句子浅层结构进行识别。包括词性标注、组块识别在内的很多任务都可以被看作是浅层句法分析的内容。本节会以命名实体识别为例，对基于序列标注的浅层句法分析方法进行介绍。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{序列标注任务}
+\parinterval 命名实体识别是一种典型的序列标注任务。对于一个输入的序列，它会生成一个相同长度的输出序列。输入序列的每一个位置，都有一个与之对应的输出，输出的内容是这个位置所对应的标签（或者类别）。比如，对于命名实体识别，每个位置的标签可以被看作是一种命名实体``开始''和``结束''的标志，而命名实体识别的任务就是得到这种``开始''和``结束''标注的序列。不仅如此，分词、词性标注、组块识别等也都可以被看作是序列标注任务。
+\parinterval 通常来说，序列标注任务中首先需要定义标注策略，即使用什么样的格式来对序列进行标注。为了便于描述，这里假设输入序列为一个个单词\footnote{广义上，序列标注任务并不限制输入序列的形式，比如，字符、单词、多个单词构成的词组都可以作为序列标注的输入单元。}。常用的标注策略有：
+\begin{itemize}
+\vspace{0.5em}
+\item BIO（Beginning-inside-outside）格式。以命名实体识别为例，B代表一个命名实体的开始，I表示一个命名实体的其它部分，O表示一个非命名实体单元。
+\vspace{0.5em}
+\item BIOES格式。与BIO格式相比，多出了标签E（End）和S（Single）。仍然以命名实体识别为例，E和S分别用于标注一个命名实体的结束位置和仅含一个单词的命名实体。
+\vspace{0.5em}
+\end{itemize}
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+ 	\subfigure[BIO格式标注命名实体] {\input{./Chapter3/Figures/figure-labeling-named-entities-in-bio-format} }
+ 	\subfigure[BIOES格式标注命名实体] {\input{./Chapter3/Figures/figure-labeling-named-entities-in-bioes-format}}
+	\caption{BIO和BIOES格式对比}
+    \label{fig:3.3-1}
+\end{figure}
+%-------------------------------------------
+%
+\parinterval 图\ref{fig:3.3-1}给出了不同标注格式所对应的标注结果\footnote{标注中的``GPE''在命名实体识别任务中被用于标注``地缘政治实体''。}，可以看出文本序列中的非命名实体直接被标注为``O''，而命名实体的标注则被分为了两部分：位置和命名实体类别，图中的``B''、``I''、``E''等标注出了位置信息，而``GPE''则标注出了命名实体类别。可以看到，命名实体的识别结果可以通过BIO、BIOES这类序列标注结果归纳出来：例如在BIOES格式中，标签``B-GPE''后面的标签只会是``I-GPE''或``E-GPE''，而不会是其他的标签。同时，在命名实体识别任务中涉及到实体边界的确定，而``BIO''或``BIOES''的标注格式本身就暗含着边界问题：在``BIO''格式下，实体左边界只能在``B''的左边，右边界只能在``B''或``I''的右边；在``BIOES''格式下，实体左边界只能在``B''或``S''的左边，右边界只能在``E''和``S''的右边。
+\parinterval 图\ref{fig:3.3-1}的例子中的命名实体识别以单词为基本标注单位，有时也会在输入的字序列上进行命名实体识别。其方法与基于词序列的命名实体识别是一样的。因此，这里仍然以基于词序列的方法为例进行介绍。
+\parinterval 对于像命名实体识别这样的任务，早期的方法主要是基于词典和规则的方法。这些方法依赖于手工构造的规则模板，通过字符串匹配的方式识别出文本中的命名实体\upcite{rau1991extracting}\upcite{张小衡1997中文机构名称的识别与分析}。严格意义上来说，那时命名实体识别还并没有被看作是一种序列标注。
+\parinterval 序列标注这个概念更多的是出现在基于统计建模的方法中。许多统计机器学习方法都被成功应用于命名实体识别任务，例如{\small\bfnew{隐马尔可夫模型}}\index{隐马尔可夫模型}（Hidden Markov Model，HMM）\index{HMM}、{\small\bfnew{条件随机场}}\index{条件随机场}（Conditional Random Fields，CRFs）\index{Conditional Random Fields}、{\small\bfnew{最大熵}}\index{最大熵}（Maximum Entropy，ME）\index{ME}模型和{\small\bfnew{支持向量机}}\index{支持向量机}（Support Vector Machine，SVM）\index{SVM}等。而命名实体识别也成为了验证机器学习方法有效的代表性任务之一。本节将对几类基础统计方法进行介绍。其中会涉及概率图模型、统计分类器等方法。特别是统计分类的概念，在后续章节的学习中也会被使用到。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{基于概率图模型的方法}
+\parinterval {\small\bfnew{概率图模型}}\index{概率图模型}（Probabilistic Graphical Model）\index{Probabilistic Graphical Model}是使用图表示变量及变量间概率依赖关系的方法。在概率图模型中，可以根据可观测变量推测出未知变量的条件概率分布等信息。如果把序列标注任务中的输入序列看作观测变量，而把输出序列看作需要预测的未知变量，那么就可以把概率图模型应用于命名实体识别等序列标注任务。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{1. 特征} \label{sec3:feature}
+\parinterval 概率图模型中的一个基础概念是``特征''。简单来说，{\small\bfnew{特征}}\index{特征}（Feature）\index{Feature}是指能够反映事物在某方面表现或行为的一种属性，如现实生活中小鸟的羽毛颜色、喙的形状、翼展长度等就是小鸟的特征；命名实体识别任务中的每个词的词根、词性和上下文组合也可以被看做是识别出命名实体可以采用的特征。
+\parinterval 从统计建模的角度看，特征的形式可以非常灵活。比如，可以分为连续型特征和离散型特征，前者通常用于表示取值蕴含数值大小关系的信息，如人的身高和体重，后者通常用于表示取值不蕴含数值大小关系的信息，例如人的性别。正是由于这种灵活性，系统开发者可以通过定义多样的特征从多个不同的角度对目标问题进行建模。而这种设计特征的过程也被称作{\small\bfnew{特征工程}}\index{特征工程}（Feature Engineering）\index{Feature Engineering}。
+\parinterval 设计更好的特征也成为了很多机器学习方法的关键。通常有两个因素需要考虑：
+\begin{itemize}
+\vspace{0.5em}
+\item 样本在这些特征上的差异度，即特征对于样本的区分能力。比如，可以考虑优先选择样本特征值方差较大即区分能力强的特征\footnote{方差如果很小，意味着样本在这个特征上基本上没有差异，那么这个特征对于样本的区分并没有什么用。}；
+\vspace{0.5em}
+\item 特征与任务目标的相关性。优先选择相关性高的特征。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 回到命名实体识别任务上来。对于输入的每个单词，可以将其表示为一个单词和对应的{\small\bfnew{词特征}}\index{词特征}（word feature）\index{word feature}的组合，记作$\langle w, f \rangle$。通过这样的表示，就可以将原始的单词序列转换为词特征序列。命名实体识别中的特征可以分为两大类，一种是单词对应各个标签的特征；另一种是标签之间组合的特征。常用的特征包括词根、词缀、词性或者标签的固定搭配等。表\ref{tab:3.3-1}展示了命名实体识别任务中一些典型的特征。
+\begin{table}[htp]{
+\begin{center}
+\caption{命名实体识别中常用的特征}
+{
+\begin{tabular}{c|c|c}
+特征名 & 示例文本 & 释义 \\
+\hline
+\rule{0pt}{10pt} LocSuffix & 沈阳\underline{市} & 地名后缀 \\
+\rule{0pt}{10pt} FourDigitYear & \underline{2020} & 四位数年份 \\
+\rule{0pt}{10pt} OtherDigit & \underline{202020} & 其他数字 \\
+\rule{0pt}{10pt} NamePrefix & \underline{张}三 & 姓名前缀 \\
+\rule{0pt}{10pt} ShortName & \underline{东大}成立120周年 & 缩略词 \\
+\end{tabular}
+\label{tab:3.3-1}
+}
+\end{center}
+}\end{table}
+\parinterval 在相当长的一段时期内，基于特征工程的方法都是自然语言处理领域的主流。虽然深度学习的逐渐兴起使得系统研发人员可以逐步摆脱繁重的特征设计工作。但是很多传统的模型和方法在今天仍然被广泛使用。比如，在当今最先进的序列标注模型中\upcite{lample2016neural}，本节即将介绍的条件随机场模型仍然是一个主要部件。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{2. 经典概率图模型}
+\noindent a) {\small\bfnew{隐马尔可夫模型}}
+\parinterval 隐马尔可夫模型是一种经典的序列模型\upcite{Baum1966Statistical}\upcite{baum1970maximization}。它在语音识别、自然语言处理的很多领域得到了广泛的应用。隐马尔可夫模型的本质是一个概率模型，用来描述一个含有隐含参数的马尔可夫过程，这个过程隐含着状态的转移和可见状态的概率。
+\parinterval 这里用一个简单的``抛硬币''游戏来对这些概念进行说明：假设有三枚质地不同的硬币A、B、C，已知这三个硬币抛出正面的概率分别为0.3、0.5、0.7。在游戏中，游戏发起者在上述三枚硬币中选择一枚硬币上抛，每枚硬币被挑选到的概率可能会受上次被挑选的硬币的影响，且每枚硬币正面向上的概率都各不相同。不停的重复挑选硬币、上抛硬币的过程，会得到一串硬币的正反序列，例如：抛硬币6次，得到：正正反反正反。游戏挑战者通过观察6次后获得的硬币正反序列，猜测每次选择的究竟是哪一枚硬币。
+\parinterval 在上面的例子中，每次挑选并上抛硬币后得到的``正面''或``反面''即为``可见状态''；再次挑选并上抛硬币会获得新的``可见状态''，这个过程即为``状态的转移''；经过6次反复挑选上抛后得到的硬币正反序列叫做可见状态序列，由每个回合的可见状态构成。此外，在这个游戏中还暗含着一个会对最终``可见状态序列''产生影响的``隐含状态序列''\ \dash \ 每次挑选的硬币形成的序列，例如CBABCA。
+\parinterval 实际上，隐马尔可夫模型在处理序列问题时的关键依据是两个至关重要的概率关系，并且这两个概率关系也始终贯穿于``抛硬币''的游戏中。一方面，隐马尔可夫模型中用{\small\bfnew{发射概率}}\index{发射概率}（Emission Probability）\index{Emission Probability}来描述隐含状态和可见状态之间存在的输出概率（即A、B、C 抛出正面的输出概率为0.3、0.5、0.7）；同样的，隐马尔可夫模型还会描述系统隐藏状态的{\small\bfnew{转移概率}}\index{转移概率}（Transition Probability）\index{Transition Probability}，在这个例子中，A 的下一个状态是A、B、C 的概率都是1/3。B、C 的下一个状态是A、B、C 的转移概率也同样是1/3。图\ref{fig:3.3-2}展示了在``抛硬币''游戏中的转移概率和发射概率，它们都可以被看做是条件概率矩阵。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-transition-prob-and-launch-prob-in-coin-toss-game}
+\caption{``抛硬币''游戏中的转移概率和发射概率}
+\label{fig:3.3-2}
+\end{figure}
+%-------------------------------------------
+%
+\parinterval 由于隐含状态序列之间存在转移概率，并且隐马尔可夫模型中隐含状态和可见状态之间存在着发射概率，因此根据可见状态的转移猜测隐藏状态序列并非无迹可循。图\ref{fig:3.3-3}描述了如何使用隐马尔可夫模型根据``抛硬币''结果推测挑选的硬币序列。可见，通过隐藏状态之间的联系（绿色方框及它们之间的连线）可以对有序的状态进行描述，进而得到隐藏状态序列所对应的可见状态序列（红色圆圈）。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-example-of-hmm-in-coin-toss}
+\caption{抛硬币的隐马尔可夫模型实例}
+\label{fig:3.3-3}
+\end{figure}
+%-------------------------------------------
+\parinterval 一般来说，隐马尔可夫模型中包含下面三个问题：
+\begin{itemize}
+\vspace{0.5em}
+\item 可见状态序列的概率计算：即给定模型（转移概率和发射概率），根据可见状态序列（抛硬币的结果），计算在该模型下得到这个结果的概率，这个问题的解决需要用到前后向算法\upcite{baum1970maximization}。
+\vspace{0.5em}
+\item 参数学习：即给定硬币种类（隐含状态数量），根据多个可见状态序列（抛硬币的结果），估计模型的参数（转移概率和发射概率），这个问题的求解需要用到EM算法\upcite{1977Maximum}。
+\vspace{0.5em}
+\item 解码：即给定模型（转移概率和发射概率）和可见状态序列（抛硬币的结果），计算在可见状态序列的情况下，最可能出现的对应的隐藏状态序列，这个问题的求解需要用到基于动态规划的方法，通常被称作{\small\bfnew{维特比算法}}\index{维特比算法}（Viterbi Algorithm）\index{Viterbi Algorithm}\upcite{1967Error}。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 隐马尔可夫模型处理序列标注问题的基本思路是：
+\begin{itemize}
+\vspace{0.5em}
+\item [(1)]根据可见状态序列（输入序列）和其对应的隐藏状态序列（标记序列）样本，估算模型的转移概率和发射概率；
+\vspace{0.5em}
+\item [(2)]对于给定的可见状态序列，预测概率最大的隐藏状态序列，比如，根据输入的词序列预测最有可能的命名实体标记序列。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 这里，用输入文本$\mathbf{X}$表示可见状态序列，待预测标签$\mathbf{Y}$表示隐藏状态序列。一种简单的办法是使用相对频度估计得到转移概率和发射概率估计值（公式\ref{eq:3.3-1}和公式\ref{eq:3.3-2}）。
+\begin{eqnarray}
+\funp{P}(y_i|y_{i-1}) = \frac{{\textrm{Count}}(y_{i-1},y_i)}{{\textrm{Count}}(y_{i-1})}
+\label{eq:3.3-1}
+\end{eqnarray}
+\begin{eqnarray}
+\funp{P}(x_i|y_{i}) = \frac{{\textrm{Count}}(x_i,y_i)}{{\textrm{Count}}(y_i)}
+\label{eq:3.3-2}
+\end{eqnarray}
+\parinterval 其中${\rm{Count}}()$表示训练集中某种现象出现的次数。$i$表示序列的第$i$个位置。$x_i$是某可见状态，$y_i$是某隐藏状态。
+\parinterval 在获得转移概率和发射概率的基础上，对于一个句子进行命名实体识别可以被描述为：在观测序列$\mathbf{X}$（可见状态，即输入的词序列）的条件下，最大化标签序列$\mathbf{Y}$（隐藏状态，即标记序列）的概率，即
+\begin{eqnarray}
+\hat{\mathbf{Y}} = \arg\max_{\mathbf{Y}}\funp{P}(\mathbf{Y}|\mathbf{X})
+\label{eq:3.3-3}
+\end{eqnarray}
+\parinterval 根据贝叶斯定理，该概率被分解为$\funp{P}(\mathbf{Y}|\mathbf{X})=\frac{\funp{P}(\mathbf{Y})\funp{P}(\mathbf{X}|\mathbf{Y})}{\funp{P}(\mathbf{X})}$，其中$\funp{P}(\mathbf{X})$是固定概率，因为$\mathbf{X}$在这个过程中可以被认为是不变的。因此只需考虑如何求解分子，则可将问题转换为求解公式\ref{eq:3.3-4}：
+\begin{eqnarray}
+\hat{\mathbf{Y}} = \arg\max_{\mathbf{Y}}\funp{P}(\mathbf{Y})\funp{P}(\mathbf{X}|\mathbf{Y})
+\label{eq:3.3-4}
+\end{eqnarray}
+\parinterval 在隐马尔可夫模型中隐含着这样的假设：某隐藏状态的概率仅由上一个隐藏状态决定（``抛硬币''游戏中，每枚硬币被挑选到的概率可能会受上次被挑选的硬币的影响）。这就意味着在隐马尔可夫模型中有：
+\begin{eqnarray}
+\hat{\mathbf{Y}} = \arg\max_{\mathbf{Y}}\prod_{i=1}^{m}\funp{P}(x_i|y_i)\funp{P}(y_i|y_{i-1})
+\label{eq:3.3-5}
+\end{eqnarray}
+\parinterval 图\ref{fig:3.3-4}展示了基于隐马尔可夫模型的命名实体识别模型。值得注意的是，这种描述序列生成的过程也可以被应用于机器翻译，在第五章还将看到隐马尔可夫模型在翻译建模中的应用。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-ner-based-on-hmm}
+\caption{基于隐马尔可夫模型的命名实体识别（解码过程）}
+\label{fig:3.3-4}
+\end{figure}
+%-------------------------------------------
+\noindent b) {\small\bfnew{条件随机场}}
+\parinterval 在隐马尔可夫模型中隐藏着这样的假设：某隐藏状态的概率仅由上一个隐藏状态决定。这个假设也会带来一些问题。举一个例子：在某个隐马尔可夫模型中，隐藏状态集合为\{A、B、C、D\}，可见状态集合为\{T、F\}，待预测的可见序列为T F F T。其中隐藏状态A可能的后继隐藏状态集合为\{A、B\}，隐藏状态B可能的后继隐藏状态集合为\{A、B、C、D\}，于是有：
+\begin{eqnarray}
+\funp{P}({\rm A}|{\rm A})+\funp{P}({\rm B}|{\rm A})=1
+\label{eq:3.3-6}
+\end{eqnarray}
+\begin{eqnarray}
+\funp{P}({\rm A}|{\rm B})+\funp{P}({\rm B}|{\rm B})+\funp{P}({\rm C}|{\rm B})+\funp{P}({\rm D}|{\rm B})=1
+\label{eq:3.3-7}
+\end{eqnarray}
+\parinterval 其中，$\funp{P}(y|x)$表示由状态$x$转移到状态$y$的概率。由于式\ref{eq:3.3-6}中的概率项数量少于式\ref{eq:3.3-7}，这就导致在统计中获得的$\funp{P}({\rm A}|{\rm A})$、$\funp{P}({\rm B}|{\rm A})$的值很可能会比$\funp{P}({\rm A}|{\rm B})$、$\funp{P}({\rm B}|{\rm B})$、$\funp{P}({\rm C}|{\rm B})$、$\funp{P}({\rm D}|{\rm B})$要大。如图\ref{fig:3.3-5}所示，假设初始隐藏状态是A，图中线上的概率值是对应的转移概率与发射概率的乘积，比如从图中隐藏状态A开始，下一个隐藏状态是A且可见状态是F的概率是0.45，下一个隐藏状态是B且可见状态是F的概率是0.55。图中可以看出，由于状态A的转移概率有较大的值，当可见状态序列为T F F T时，隐马尔可夫模型计算出的最有可能的隐藏状态序列为A A A A。但是如果对训练集进行统计可能会发现，当可见序列为T F F T 时，对应的隐藏状态是A A A A的概率可能是比较大的，但也可能是比较小的。这个例子中出现预测偏差的主要原因是：由于状态A的转移概率比其他状态的转移概率要大得多，隐藏状态的预测一直停留在状态A。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-example-of-hmm}
+\caption{隐马尔可夫实例}
+\label{fig:3.3-5}
+\end{figure}
+%-------------------------------------------
+\parinterval 上述现象也被称作{\small\bfnew{标注偏置}}\index{标注偏置}（Label Bias）\index{Label Bias}。条件随机场模型在隐马尔可夫模型的基础上，解决了这个问题\upcite{lafferty2001conditional}。在条件随机场模型中，以全局范围的统计归一化代替了隐马尔可夫模型中的局部归一化。除此之外，条件随机场模型中并非使用概率计算而是特征函数的方式对可见状态序列$\mathbf{X}$对应的隐藏状态序列$\mathbf{Y}$的概率进行计算。
+\parinterval 条件随机场中一般有若干个特征函数，都是经过设计的、能够反映序列规律的一些二元函数\footnote{二元函数的函数值一般非1即0}，并且每个特征函数都有其对应的权重$\lambda$。特征函数一般由两部分组成：能够反映隐藏序列之间转移规则的转移特征$\textbf{t}(y_{i-1},y_i,\mathbf{X},i)$和状态特征$\textbf{s}(y_i,\mathbf{X},i)$。其中$y_i$和$y_{i-1}$分别是位置$i$和前一个位置的隐藏状态，$\mathbf{X}$则是可见状态序列。转移特征$\textbf{t}(y_{i-1},y_i,\mathbf{X},i)$反映了两个相邻的隐藏状态之间的转换关系，而状态特征$\textbf{s}(y_i,\mathbf{X},i)$则反映了第$i$个可见状态应该对应什么样的隐藏状态。这两部分共同组成了一个特征函数$F(y_{i-1},y_i,\mathbf{X},i)$，即
+\begin{eqnarray}
+F(y_{i-1},y_i,\mathbf{X},i)=\textbf{t}(y_{i-1},y_i,\mathbf{X},i)+\textbf{s}(y_i,\mathbf{X},i)
+\label{eq:3.3-8}
+\end{eqnarray}
+\parinterval 实际上，基于特征函数的方法更像是对隐藏状态序列的一种打分：根据人为设计的规则（特征函数），测试隐藏状态之间的转换以及隐藏状态与可见状态之间的对应关系是否符合这种规则。在处理序列问题时，假设可见状态序列$\mathbf{X}$的长度和待预测隐藏状态序列$\mathbf{Y}$的长度均为$I$，且共设计了$K$个特征函数，则有：
+\begin{eqnarray}
+\funp{P}(\mathbf{Y}|\mathbf{X})=\frac{1}{Z(\mathbf{X})}\exp(\sum_{i=1}^{I}\sum_{k=1}^{K}\lambda_kF_k(y_{i-1},y_i,\mathbf{X},i))
+\label{eq:3.3-9}
+\end{eqnarray}
+\parinterval 公式\ref{eq:3.3-9}中的$Z(\mathbf{X})$即为上面提到的实现全局统计归一化的归一化因子，其计算方式为：
+\begin{eqnarray}
+Z(\mathbf{X})=\sum_{\mathbf{Y}}\exp(\sum_{i=1}^{I}\sum_{k=1}^{K}\lambda_kF_k(y_{i-1},y_i,\mathbf{X},i))
+\label{eq:3.3-10}
+\end{eqnarray}
+\parinterval 由公式\ref{eq:3.3-10}可以看出，归一化因子的求解依赖于整个可见状态序列和每个位置的隐藏状态。条件随机场模型中的归一化是一种全局范围的归一化方式。图\ref{fig:3.3-6}为条件随机场模型处理序列问题的示意图。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\input{./Chapter3/Figures/figure-crf-to-deal-with-sequence-problems}
+\caption{条件随机场模型处理序列问题}
+\label{fig:3.3-6}
+\end{figure}
+%-------------------------------------------
+\parinterval 条件随机场模型处理命名实体识别任务时，可见状态序列对应着文本内容，隐藏状态序列对应着待预测的标签。对于命名实体识别任务，需要单独设计若干适合命名实体识别任务的特征函数。例如在使用BIOES标准标注命名实体识别任务时，标签``B-ORG''\footnote{表示机构实体名的开始标签}后面的标签必然是``I-ORG''或是``E-ORG''，而绝不可能是``O''，针对此规则可以设计相应特征函数。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{基于分类器的方法}
+\parinterval 基于概率图的模型将序列表示为有向图或无向图，用图中的节点表示文本或是标签，如图\ref{fig:3.3-7}(a)、(b)所示。这种方法增加了建模的复杂度。既然这里要得到每个位置的类别输出，另一种更加直接的方法是使用分类器对每个位置进行独立预测。分类器是机器学习中广泛使用的方法，它可以根据输入自动地对类别进行预测。如图\ref{fig:3.3-7}(c)所示，对于序列标注任务，如果把每一个位置所对应的所有特征看作是输入，而把这个位置对应的标注看作输出，就可以直接使用分类器预测每个位置的标注结果。从这个角度说，隐马尔可夫模型等方法实际上也是在进行一种``分类''操作，只不过这些方法考虑了不同位置输出（或隐藏状态）之间的依赖。
+%----------------------------------------------
+\begin{figure}[htp]
+\centering
+\begin{tabular}{l l l}
+\subfigure[HMM处理序列标注]{\input{./Chapter3/Figures/figure-process-sequence-labeling-by-hmm}} & \subfigure[CRF处理序列标注]{\input{./Chapter3/Figures/figure-process-sequence-labeling-by-crf}} & \subfigure[分类算法处理序列标注]{\input{./Chapter3/Figures/figure-process-sequence-labeling-by-classfication}}
+\end{tabular}
+\caption{HMM、CRF、分类算法三种方法对比}
+\label{fig:3.3-7}
+\end{figure}
+%-------------------------------------------
+\parinterval 值得注意的是分类模型可以被应用于序列标注之外的很多任务。在后面的章节中还会看到，机器翻译中的很多模型也借鉴了统计分类模型的思想。其中使用到的基础数学模型和特征定义形式，与这里提到的分类器本质上是一样的。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{1. 分类任务与分类器}
+\parinterval 无论在日常生活中还是在研究工作中，都会遇到各种各样的分类问题，例如挑选西瓜时需要区分``好瓜''和``坏瓜''、编辑看到一篇新闻稿件时要对稿件进行分门别类。事实上，在机器学习中，对``分类任务''的定义会更宽泛而并不拘泥于``类别''的概念：在对样本进行预测时，只要预测标签集合是有限的且预测标签是离散的，就可认定其为分类任务。
+\parinterval 具体来说，分类任务目标是训练一个可以根据输入数据预测离散标签的分类器（也可称为分类模型）。在有监督的分类任务中\footnote{与之相对应的，还有无监督、半监督分类任务，不过这些内容不是本书讨论的重点。读者可以参看\upcite{周志华2016《机器学习》}\upcite{李航2012统计学习方法}对相关概念进行了解。}，训练数据集合通常由形似$(\mathbf{x}_i,y_i)$的带标注数据构成：$\mathbf{x}_i=(x_1,x_2,\ldots,x_m)$作为分类器的输入数据（通常被称作一个训练样本），例如文本分类任务中的单词表示、图像分类任务中的像素表示；$y_i$作为输入数据对应的{\small\bfnew{标签}}\index{标签}（label）\index{label}，反映了输入数据对应的``类别''。若标签集合大小为$n$，则分类任务的本质是通过对训练数据集合的学习，建立一个从$m$维样本空间到$n$维标签空间的映射关系。更确切地说，分类任务的最终目标是学习一个条件概率分布：$\funp{P}(Y|X)$，其中$X=(\mathbf{x}_1,\mathbf{x}_2,\ldots,\mathbf{x}_m)$代表着所有可能出现的输入数据集合，$Y=(y_1,y_2,\ldots,y_n)$代表着标签集合，通过比较$\funp{P}(y_1|\mathbf{x}_i)$、$\funp{P}(y_2|\mathbf{x}_i)$、$\ldots$、$\funp{P}(y_n|\mathbf{x}_i)$之间的大小关系从而确定输入数据$\mathbf{x}_i$最终的类别标签。
+\parinterval 与概率图模型一样，分类模型中也依赖特征定义。其定义形式与\ref{sec3:feature}节的描述一致，这里不再赘述。分类任务一般根据类别数量分为二分类任务和多分类任务。二分类任务是最经典的分类任务，只需要对输出进行非零即一的预测。多分类任务则可以有多种处理手段，比如，可以将其``拆解''为多个二分类任务求解，或者直接让模型输出多个类别中的一个。在命名实体识别中，往往会使用多类别分类模型。比如，在BIO标注下，有三个类别（B、I和O）。一般来说，类别数量越大分类的难度也越大。比如，BIOES标注包含5个类别，因此使用同样的分类器，它要比BIO标注下的分类问题难度大。另一方面，更多的类别有助于准确的刻画目标问题。因此在实践中需要在类别数量和分类难度之间找到一种平衡。
+\parinterval 在机器翻译和语言建模中也会遇到类似的问题，比如，生成单词的过程可以被看做是一个分类问题，类别数量就是词表的大小。显然，词表越大可以覆盖更多样的词法变化，但是过大的词表里会包含很多低频词，其计算复杂度会显著增加。但是，过小的词表又无法包含足够多的单词。因此，在设计这类系统的时候对词表大小的选择（类别数量的选择）是十分重要的，往往要通过大量的实验得到最优的设置。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{2. 经典的分类模型}
+\parinterval 经过多年的发展，研究者提出了很多分类模型。由于篇幅所限，本书无法一一列举这些模型，这里仅列出了部分经典的模型。关于分类模型更全面的介绍可以参考相关文献\upcite{harrington2013机器学习实战}\upcite{李航2012统计学习方法}。
+\begin{itemize}
+\vspace{0.5em}
+\item K-近邻分类算法。K-近邻分类算法通过计算不同特征值之间的距离进行分类，这种方法适用于可以提取到数值型特征\footnote{即可以用数值大小对某方面特征进行衡量。}的分类问题。该方法的基本思想为：将提取到的特征分别作为坐标轴，建立一个$n$维坐标系（当特征数量为$n$时），此时每个样本都将成为该$n$维空间的一个点，将未知样本与已知类别样本的空间距离作为分类依据进行分类。
+\vspace{0.5em}
+\item 支持向量机。支持向量机是一种二分类模型，其思想是通过线性超平面将不同输入划分为正例和负例，并使线性超平面与不同输入的距离都达到最大。与K-近邻分类算法类似，支持向量机也适用于可以提取到数值型特征的分类问题。
+\vspace{0.5em}
+\item 最大熵模型。最大熵模型是根据最大熵原理提出的一种分类模型，其基本思想是：以在训练数据集中学习到的经验知识作为一种``约束''，并在符合约束的前提下，在若干合理的条件概率分布中选择``使条件熵最大''的模型。
+\vspace{0.5em}
+\item 决策树分类算法。决策树分类算法是一种基于实例的归纳学习方法：将样本中某些决定性特征作为决策树的节点，根据特征表现对样本进行划分，最终根节点到每个叶子节点均形成一条分类的路径规则。这种分类方法适用于可以提取到离散型特征\footnote{即特征值是离散的。}的分类问题。
+\vspace{0.5em}
+\item 朴素贝叶斯分类算法。朴素贝叶斯算法是以贝叶斯定理（其主要内容为$\funp{P}(B_i|A)=\frac{\funp{P}(B_i)\funp{P}(A|B_i)}{\sum_{j=1}^n\funp{P}(B_j)\funp{P}(A|B_j)}$）为基础并且假设特征之间相互独立的方法：以特征之间相互独立作为前提假设，学习从输入到输出的联合概率分布，并以后验概率最大的输出作为最终类别。
+\vspace{0.5em}
+\end{itemize}
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
+\section{句法分析（短语结构分析）}
+\parinterval 前面两节已经介绍了什么叫做``词''、如何对分词问题进行统计建模。同时，也介绍了如何对多个单词构成的命名实体进行识别。无论是分词还是命名实体识别都是句子浅层信息的一种表示。对于一个自然语言句子来说，它更深层次的结构信息可以通过更完整的句法结构来描述，而句法结构也是机器翻译和自然语言处理其他任务中常用的知识之一。 
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{句子的句法树表示}
+\parinterval {\small\bfnew{句法}}\index{句法}（Syntax）\index{Syntax}是研究句子的每个组成部分和它们之间的组合方式。一般来说，句法和语言是相关的，比如，英文是主谓宾结构，而日语是主宾谓结构。因此不同的语言也会有不同的句法描述方式。自然语言处理领域最常用的两种句法分析形式是{\small\bfnew{短语结构分析}}\index{短语结构分析}（Phrase Structure Parsing）\index{Phrase Structure Parsing}和{\small\bfnew{依存分析}}\index{依存分析}（Dependency Parsing）\index{Dependency Parsing}。图\ref{fig:3.4-1}展示了这两种句法表示形式的实例。其中，左侧是短语结构树。它描述的是短语的结构功能，比如``吃''是动词（记为VV），``鱼''是名词（记为NN），``吃\ 鱼''组成动词短语，这个短语再与``喜欢''这一动词组成新的动词短语。短语结构树的每个子树都是一个句法功能单元，比如，子树VP(VV(吃) NN(鱼))就表示了``吃\ 鱼''这个动词短语的结构，其中子树根节点VP是句法功能标记。短语结构树利用嵌套的方式描述了语言学的功能。短语结构树中，每个词都有词性（或词类），不同的词或者短语可以组成名动结构、动宾结构等语言学短语结构。短语结构分析一般也被称为{\small\bfnew{成分分析}}\index{成分分析}（Constituency Parsing）\index{Constituency Parsing}或{\small\bfnew{完全分析}}\index{完全分析}（Full Parsing）\index{Full Parsing}。
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+\input{./Chapter3/Figures/figure-phrase-structure-tree-and-dependency-tree}
+    \caption{短语结构树(左)和依存树(右)}
+    \label{fig:3.4-1}
+\end{figure}
+%---------------------------
+\parinterval 图\ref{fig:3.4-1}右侧展示的是另一种句法结构，被称作依存句法树。依存句法树表示了句子中单词和单词之间的依存关系。比如，从这个例子可以了解，``猫''依赖``喜欢''，``吃''依赖``喜欢''，``鱼''依赖``吃''。
+\parinterval 短语结构树和依存句法树的结构和功能有很大不同。短语结构树的叶子节点是单词，中间节点是词性或者短语句法标记。在短语结构分析中，通常把单词称作{\small\bfnew{终结符}}\index{终结符}（Terminal）\index{Terminal}，把词性称为{\small\bfnew{预终结符}}\index{预终结符}（Pre-terminal）\index{Pre-terminal}，而把其他句法标记称为{\small\bfnew{非终结符}}\index{非终结符}（Non-terminal）\index{Non-terminal}。依存句法树没有预终结符和非终结符，所有的节点都是句子里的单词，通过不同节点间的连线表示句子中各个单词之间的依存关系。每个依存关系实际上都是有方向的，头和尾分别指向``接受''和``发出''依存关系的词。依存关系也可以进行分类，图\ref{fig:3.4-1}中我们对每个依存关系的类型都进行了标记，这也被称作是有标记的依存分析。如果不生成这些标记，这样的句法分析被称作无标记的依存分析。
+\parinterval 虽然短语结构树和依存树的句法表现形式有很大不同，但是它们在某些条件下能相互转化。比如，可以使用启发性规则将短语结构树自动转化为依存树。从应用的角度，依存分析由于形式更加简单，而且直接建模词语之间的依赖，因此在自然语言处理领域中受到很多关注。在机器翻译中，无论是哪种句法树结构，都已经被证明会对机器翻译系统产生帮助。特别是短语结构树，在机器翻译中的应用历史更长，研究更为深入，因此本节将会以短语结构分析为例介绍句法分析的相关概念。
+\parinterval 而句法分析到底是什么呢？简单的理解，句法分析就是在小学语文课程中学习的句子成分的分析，以及对句子中各个成分内部、外部关系的判断。更规范一些的定义，可以参照百度百科和维基百科对句法分析的解释。
+%-------------------------------------------
+\begin{definition} 句法分析
+句法分析(Parsing)就是指对句子中的词语语法功能进行分析。
+\begin{flushright}——《百度百科》\end{flushright}
+在自然语言或者计算机语言中，句法分析是利用形式化的文法规则对一个符号串进行分析的过程。
+\begin{flushright}——《维基百科（译文）》\end{flushright}
+\end{definition}
+%-------------------------------------------
+\parinterval 上面的定义中，句法分析包含三个重要的概念：
+\begin{itemize}
+\vspace{0.5em}
+\item 形式化的文法：描述语言结构的定义，由文法规则组成。
+\vspace{0.5em}
+\item 符号串：在本节中，符号串就是指词串，由前面提到的分词系统生成。
+\vspace{0.5em}
+\item 分析：使用形式文法对符号串进行分析的具体方法，在这里指实现分析的计算机算法。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 以上三点是实现一个句法分析器的要素。本节的后半部分会对相关的概念和技术方法进行介绍。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{上下文无关文法}
+\parinterval 句法树是对句子的一种抽象。这种树形结构表达了一种对句子结构的归纳过程，比如，从树的叶子开始，把每一个树节点看作一次抽象，最终形成一个根节点。那这个过程如何用计算机来实现呢？这就需要使用到形式文法。
+\parinterval 形式文法是分析自然语言的一种重要工具。根据乔姆斯基的定义\upcite{chomsky2002syntactic}，形式文法分为四种类型：无限制文法（0型文法）、上下文相关文法（1型文法）、上下文无关文法（2型文法）和正规文法（3型文法）。不同类型的文法有不同的应用，比如，正规文法可以用来描述有限状态自动机，因此也会被使用在语言模型等系统中。对于短语结构分析问题，常用的是{\small\bfnew{上下文无关文法}}\index{上下文无关文法}（Context-Free Grammar）\index{Context-Free Grammar}。上下文无关文法的具体形式如下：
+%-------------------------------------------
+\vspace{0.5em}
+\begin{definition} 上下文无关文法
+一个上下文无关文法可以被视为一个系统$G=\langle N,\Sigma,R,S\rangle$，其中
+\begin{itemize}
+\vspace{0.5em}
+\item $N$为一个非终结符集合
+\vspace{0.5em}
+\item $\Sigma$为一个终结符集合
+\vspace{0.5em}
+\item $R$为一个规则（产生式）集合，每条规则 $r \in R$的形式为$X \to Y_1Y_2...Y_n$，其中$X \in N$, $Y_i \in N \cup \Sigma$
+\vspace{0.5em}
+\item $S$为一个起始符号集合且$S \subseteq N$
+\vspace{0.5em}
+\end{itemize}
+\end{definition}
+%-------------------------------------------
+\parinterval 举例说明，假设有上下文无关文法$G=\langle N,\Sigma,R,S\rangle$，可以用它描述一个简单中文句法结构。其中非终结符集合为不同的中文句法标记
+\begin{eqnarray}
+N=\{\textrm{NN},\textrm{VV},\textrm{NP},\textrm{VP},\textrm{IP}\} \nonumber
+\label{eq:3.4-1}
+\end{eqnarray}
+这里，\textrm{NN}代表名词，\textrm{VV}代表动词，\textrm{NP}代表名词短语，\textrm{VP}代表动词短语，\textrm{IP}代表单句。进一步，把终结符集合定义为
+\begin{eqnarray}
+\Sigma = \{\text{猫,喜欢,吃,鱼}\} \nonumber
+\label{eq:3.4-2}
+\end{eqnarray}
+再定义起始符集合为
+\begin{eqnarray}
+S=\{\textrm{IP}\} \nonumber
+\label{eq:3.4-3}
+\end{eqnarray}
+最后，文法的规则集定义如图\ref{fig:3.4-2}所示（其中$r_i$为规则的编号）
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+ \input{./Chapter3/Figures/figure-rules-of-grammar}
+ \caption{一个示例文法的规则集}
+     \label{fig:3.4-2}
+ \end{figure}
+%---------------------------
+\parinterval 上面这个文法蕴含了不同``层次''的句法信息。比如，规则$r_1$、$r_2$、$r_3$和$r_4$表达了词性对单词的抽象；规则$r_6$、$r_7$和$r_8$表达了短语结构的抽象，其中，规则$r_8$描述了汉语中名词短语(主语)+动词短语(谓语)的结构。在实际应用中，像$r_8$这样的规则可以覆盖很大的片段（试想一下一个包含50个词的主谓结构的句子，可以使用$r_8$进行描述）。
+\parinterval 上下文无关文法的规则是一种{\small\bfnew{产生式规则}}\index{产生式规则}（Production Rule）\index{Production Rule}，形如$\alpha \to \beta $，它表示把规则左端的非终结符$\alpha$替换为规则右端的符号序列$\beta$。 通常，$\alpha$被称作规则的左部（Left-hand Side），$\beta$被称作规则的右部（Right-hand Side）。使用右部$\beta$替换左部$\alpha$的过程也被称作规则的使用，而这个过程的逆过程称为规约。规则的使用可以如下定义：
+%-------------------------------------------
+\vspace{0.5em}
+\begin{definition} 上下文无关文法规则的使用
+一个符号序列$u$可以通过使用规则$r$替换其中的某个非终结符，并得到符号序列$v$，于是$v$是在$u$上使用$r$的结果，记为$u \overset{r}{\Rightarrow} v$：
+\begin{center}
+\input{./Chapter3/Figures/figure-usage-of-regulation}
+\end{center}
+\end{definition}
+%-------------------------------------------
+\parinterval 给定起始非终结符，可以不断地使用规则，最终生成一个终结符串，这个过程也被称为{\small\bfnew{推导}}\index{推导}（Derivation）\index{Derivation}。形式化的定义为：
+%-------------------------------------------
+\vspace{0.5em}
+\begin{definition} 推导
+给定一个文法$G=\langle N,\Sigma,R,S\rangle$，对于一个字符串序 \\
+列$s_0,s_1,...,s_n$和规则序列$r_1,r_2,...,r_n$，满足
+\vspace{-0.5em}
+\begin{displaymath}
+s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{\Rightarrow} ... \overset{r_{n}}{\Rightarrow} s_n
+\end{displaymath}
+且
+\begin{itemize}
+\vspace{0.5em}
+\item $\forall i \in [0,n], s_i \in (N\cup\Sigma)^*$ \hspace{3.5em} $\lhd$ $s_i$为合法的字符串
+\vspace{0.5em}
+\item $\forall j \in [1,n], r_j \in R$ \hspace{6.3em} $\lhd$ $r_j$为$G$的规则
+\vspace{0.5em}
+\item $s_0 \in S$ \hspace{10.9em} $\lhd$ $s_0$为起始非终结符
+\vspace{0.5em}
+\item $s_n \in \Sigma^{*}$ \hspace{10.4em} $\lhd$ $s_n$为终结符序列
+\vspace{0.5em}
+\end{itemize}
+\vspace{0.8em}
+则$s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{\Rightarrow} ... \overset{r_{n}}{\Rightarrow} s_n$为一个推导
+\end{definition}
+%-------------------------------------------
+\parinterval 比如，使用前面的示例文法，可以对``猫 喜欢 吃 鱼''进行分析，并形成句法分析树（图\ref{fig:3.4-3}）。从起始非终结符IP开始，使用唯一拥有IP作为左部的规则$r_8$推导出NP和VP，之后依次使用规则$r_5$、$r_1$、$r_7$、$r_2$、$r_6$、$r_3$、$r_4$，得到了完整的句法树。
+%-------------------------------------------
+\begin{figure}[htp]
+    \centering
+\input{./Chapter3/Figures/figure-example-of-derivation}
+	\caption{上下文无关文法推导实例}
+    \label{fig:3.4-3}
+\end{figure}
+%-------------------------------------------
+\parinterval 通常，可以把推导简记为$d=r_1 \circ r_2 \circ ... \circ r_n$，其中$ \circ $表示规则的组合。显然，$d$也对应了树形结构，也就是句法分析结果。从这个角度看，推导就是描述句法分析树的一种方式。此外，规则的推导也把规则的使用过程与生成的字符串对应起来。一个推导所生成的字符串，也被称作文法所产生的一个{\small\bfnew{句子}}\index{句子}（Sentence）\index{Sentence}。而一个文法所能生成的所有句子的集合是这个文法所对应的{\small\bfnew{语言}}\index{语言}（Language）\index{Language}。
+\parinterval 但是，句子和规则的推导并不是一一对应的。同一个句子，往往有很多推导的方式，这种现象被称为{\small\bfnew{歧义}}\index{歧义}（Ambiguity）\index{Ambiguity}。甚至同一棵句法树，也可以对应不同的推导。图\ref{fig:3.4-4} 给出同一棵句法树所对应的两种不同的规则推导。
+%-------------------------------------------
+\begin{figure}[htp]
+    \centering
+\input{./Chapter3/Figures/figure-two-different-derivation-of-regulation}
+\setlength{\abovecaptionskip}{-0.5em}
+	\caption{同一棵句法树对应的不同规则推导}
+    \label{fig:3.4-4}
+\end{figure}
+%-------------------------------------------
+\parinterval 显然，规则顺序的不同会导致句法树的推导这一确定的过程变得不确定。因此，需要进行{\small\bfnew{消歧}}\index{消歧}（Disambiguation）\index{Disambiguation}。这里，可以使用启发式方法：要求规则使用都服从最左优先原则，这样得到的推导被称为{\small\bfnew{最左优先推导}}\index{最左优先推导}（Left-most Derivation）\index{Left-most Derivation}。图\ref{fig:3.4-4}中的推导1 就是符合最左优先原则的推导。
+%-------------------------------------------
+\begin{figure}[htp]
+    \centering
+\input{./Chapter3/Figures/figure-perspectives-of-expert-ordinary-and-syntactic-parser}
+	\caption{如何选择最佳的句法分析结果 - 专家、普通人和句法分析器的视角}
+    \label{fig:3.4-5}
+\end{figure}
+%-------------------------------------------
+\parinterval 这样，对于一个上下文无关文法，每一棵句法树都有唯一的最左推导与之对应。于是，句法分析可以被描述为：对于一个句子找到能够生成它的最佳推导，这个推导所对应的句法树就是这个句子的句法分析结果。
+\parinterval 不过问题又回来了，怎样才能知道什么样的推导或者句法树是``最佳''的呢？如图\ref{fig:3.4-5}所示，对于语言学专家，他们可以很确定地分辨出哪些句法树是正确的，哪些句法树是错误的。甚至普通人也可以通过一些课本中学到的知识产生一些模糊的判断。而计算机如何进行判别呢？沿着前面介绍的统计建模的思想，计算机可以得出不同句法树出现的概率，进而选择概率最高的句法树作为输出，而这正是统计句法分析所做的事情。
+\parinterval 在统计句法分析中，需要对每个推导进行统计建模，于是定义一个模型$\funp{P}( \cdot )$，对于任意的推导$d$，都可以用$\funp{P}(d)$计算出推导$d$的概率。这样，给定一个输入句子，我们可以对所有可能的推导用$\funp{P}(d)$计算其概率值，并选择概率最大的结果作为句法分析的结果输出（图\ref{fig:3.4-6}）。
+%-------------------------------------------
+\begin{figure}[htp]
+    \centering
+\input{./Chapter3/Figures/figure-probability-values-corresponding-to-different-derivations}
+	\caption{不同推导（句法树）对应的概率值}
+    \label{fig:3.4-6}
+\end{figure}
+%-------------------------------------------
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{规则和推导的概率}
+\parinterval 对句法树进行概率化，首先要对使用的规则进行概率化。为了达到这个目的，可以使用{\small\bfnew{概率上下文无关文法}}\index{概率上下文无关文法}（Probabilistic Context-Free Grammar）\index{Probabilistic Context-Free Grammar}，它是上下文无关文法的一种扩展。
+%-------------------------------------------
+\vspace{0.5em}
+\begin{definition} 概率上下文无关文法
+一个概率上下文无关文法可以被视为一个系统$G=\langle N,\Sigma,R,S\rangle$，其中
+\begin{itemize}
+\vspace{0.5em}
+\item $N$为一个非终结符集合
+\vspace{0.5em}
+\item $\Sigma$为一个终结符集合
+\vspace{0.5em}
+\item $R$为一个规则(产生式)集合，每条规则 $r \in R$的形式为$p:X \to Y_1Y_2...Y_n$，其中$X \in N$, $Y_i \in N \cup \Sigma$，每个$r$都对应一个概率$p$，表示其生成的可能性。
+\vspace{0.5em}
+\item $S$为一个起始符号集合且$S \subseteq N$
+\vspace{0.5em}
+\end{itemize}
+\end{definition}
+%-------------------------------------------
+\parinterval 概率上下文无关文法与传统上下文无关文法的区别在于，每条规则都会有一个概率，描述规则生成的可能性。具体来说，规则$\alpha \to \beta$的概率可以被定义为：
+\begin{eqnarray}
+\funp{P}(\alpha \to \beta)=\funp{P}(\beta | \alpha)
+\label{eq:3.4-4}
+\end{eqnarray}
+\noindent 即，在给定规则左部的情况下生成规则右部的可能性。进一步，在上下文无关文法中，每条规则之间的使用都是相互独立的 \footnote{如果是上下文有关文法，规则会形如 $a\alpha b\to a\beta b$，这时$\alpha \to \beta $的过程会依赖前后上下文$a$和$b$}。因此可以把$\funp{P}(d)$分解为规则概率的乘积：
+\begin{eqnarray}
+\funp{P}(d) & = & \funp{P}(r_1 \circ r_2 \circ ... \circ r_n) \nonumber \\
+& = & \funp{P}(r_1) \cdot \funp{P}(r_2) \cdots \funp{P}(r_n)
+\label{eq:3.4-5}
+\end{eqnarray}
+\parinterval 这个模型可以很好的解释词串的生成过程。比如，对于规则集
+\begin{eqnarray}
+r_3: & &\textrm{VV} \to \text{吃}\nonumber \\
+r_4: & & \textrm{NN} \to \text{鱼}\nonumber \\
+r_6: & & \textrm{VP} \to \textrm{VV}\ \textrm{NN} \nonumber
+\label{eq:3.4-6}
+\end{eqnarray}
+\parinterval 可以得到 $d_1=r_3 \circ r_4 \circ r_6$的概率为
+\begin{eqnarray}
+\funp{P}(d_1) & = &\funp{P}(r_3) \cdot \funp{P}(r_4) \cdot \funp{P}(r_6)\nonumber  \\
+& = & \funp{P}(\textrm{VV} \to \text{吃}) \cdot \funp{P}(\textrm{NN} \to \text{鱼}) \cdot \funp{P}(\textrm{VP} \to \textrm{VV NN})
+\label{eq:3.4-7}
+\end{eqnarray}
+\parinterval 这也对应了词串``吃\ 鱼''的生成过程。首先，从起始非终结符VP开始，使用规则$r_6$生成两个非终结符VV和NN；进一步，分别使用规则$r_3$和$r_4$从VV和NN进一步生成单词``吃''和``鱼''。整个过程的概率等于三条规则概率的乘积。
+\parinterval 新的问题又来了，如何得到规则的概率呢？这里仍然可以从数据中学习文法规则的概率。假设有人工标注的数据，它包括很多人工标注句法树的句子，称之为{\small\bfnew{树库}}\index{树库}（Treebank）\index{Treebank}。然后，对于规则$r:\alpha \to \beta$可以使用极大似然估计：
+\begin{eqnarray}
+\funp{P}(r)  = \frac{\text{规则$r$在树库中出现的次数}}{\alpha \text{在树库中出现的次数}}
+\label{eq:3.4-8}
+\end{eqnarray}
+\parinterval 图\ref{fig:3.4-7}展示了通过这种方法计算规则概率的过程。与词法分析类似，可以统计树库中规则左部和右部同时出现的次数，除以规则左部出现的全部次数，所得的结果就是所求规则的概率。这种方法也是典型的相对频度估计。但是如果规则左部和右部同时出现的次数为0时是否代表这个规则概率是0呢？遇到这种情况，可以使用平滑方法对概率进行平滑处理，具体思路可参考{\chaptertwo}内容。
+%-------------------------------------------
+\begin{figure}[htp]
+   \centering
+\input{./Chapter3/Figures/figure-evaluation-of-probability-for-grammar}
+	\caption{上下文无关文法规则概率估计}
+    \label{fig:3.4-7}
+\end{figure}
+%-------------------------------------------
+\parinterval 图\ref{fig:3.4-8}展示了基于统计的句法分析的流程。首先，通过树库上的统计，获得各个规则的概率，这样就得到了一个上下文无关句法分析模型$\funp{P}( \cdot )$。对于任意句法分析结果$d=r_1 \circ r_2 \circ ... \circ r_n$，都能通过如下公式计算其概率值：
+\begin{equation}
+\funp{P}(d)= \prod_{i=1}^{n}\funp{P}(r_i)
+\end{equation}
+%-------------------------------------------
+\begin{figure}[htp]
+    \centering
+\input{./Chapter3/Figures/figure-process-of-statistical-syntax-analysis}
+	\caption{统计句法分析的流程}
+    \label{fig:3.4-8}
+\end{figure}
+%-------------------------------------------
+\parinterval 在获取统计分析模型后，就可以使用模型对任意句子进行分析，计算每个句法分析树的概率，并输出概率最高的树作为句法分析的结果。
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
+\sectionnewpage
+\section{小结及深入阅读} \label{sec3:summary}
+\parinterval 本章将统计建模的思想应用到两个自然语言处理任务中，包括：中文分词、句法分析。它们和机器翻译有着紧密的联系，往往作为机器翻译系统输入和输出的数据加工方法。通过系统化的建模，可以发现：经过适当的假设和化简，统计模型可以很好的描述复杂的自然语言处理问题。相关概念和方法也会在后续章节的内容中被广泛使用。
+\parinterval 由于本章重点介绍如何用统计的思想对自然语言处理任务进行建模，因此并没有对具体的问题展开深入讨论。有几方面内容，读者可以继续关注：
+\begin{itemize}
+\vspace{0.5em}
+\item 在建模方面，本章介绍的分词、句法分析任务是基于人工先验知识进行模型设计的思路。也就是，问题所表达的现象被``一步一步''生成出来。这是一种典型的生成式建模思想，它把要解决的问题看作一些观测结果的隐含变量（比如，句子是观测结果，分词结果是隐含在背后的变量），之后通过对隐含变量生成观测结果的过程进行建模，以达到对问题进行数学描述的目的。这类模型一般需要依赖一些独立性假设，假设的合理性对最终的性能有较大影响。相对于{\small\sffamily\bfseries{生成模型}}\index{生成模型}（Generative Model）\index{Generative Model}，另一类方法是{\small\sffamily\bfseries{判别模型}}\index{判别模型}（Discriminative Model）\index{Discriminative Model}。本章序列标注内容中提到的一些模型就是判别式模型。它直接对给定观测结果条件下隐含变量的概率进行建模，而不描述观测结果的生成过程，这样对问题的建模更加直接，同时这类模型可以更加灵活地引入不同的特征。判别模型在自然语言处理中也有广泛应用\upcite{shannon1948mathematical}\upcite{ng2002discriminative}。 在本书的第七章也会使用到判别式模型。
+\vspace{0.5em}
+\item 此外，本章并没有对模型的推断方法进行深入介绍。比如，对于一个句子如何有效的找到概率最大的分词结果？这部分可以直接借鉴第二章中介绍的搜索方法。对于分词等问题，比较简单的解决方法是使用动态规划\upcite{huang2008advanced}。如果使用动态规划的条件不满足，可以考虑使用更加复杂的搜索策略，并配合一定剪枝方法。实际上，无论是基于$n$-gram语言模型的分词还是简单的上下文无关文法都有高效的推断方法。比如，$n$-gram语言模型可以被视为概率有限状态自动机，因此可以直接使用成熟的自动机工具。对于更复杂的句法分析问题，可以考虑使用移进-规约方法来解决推断问题\upcite{aho1972theory}。
+\vspace{0.5em}
+\item 从自然语言处理的角度来看，词法分析和语法分析的一部分内容都是典型的序列标注问题，例如本章正文部分介绍的分词和命名实体识别都可以看成序列标注的问题，此外序列标注还可以被扩展到词性标注\upcite{brants-2000-tnt}、组块识别\upcite{tsuruoka-tsujii-2005-chunk}、关键词抽取\upcite{li-etal-2003-news-oriented}、语义角色标注\upcite{chomsky1993lectures}等任务，本章着重介绍了传统的基础方法，前沿的方法大多与深度学习相结合，感兴趣的读者可以自行了解，其中比较有代表性的包括双向长短期记忆网络和条件随机场相结合的模型结构（BiLSTM-CRF）\upcite{2015Bidirectional}、双向长短期记忆网络和卷积神经网络的混合模型（BiLSTM-CNNs）\upcite{chiu2016named}、双向长短期记忆网络和softmax结构相结合的模型\upcite{vzukov2018named}等。此外，对于序列标注任务，模型性能很大程度上依赖其输入表示，因此集成或微调预训练语言模型的方法也能应用于序列标注任务\upcite{Li2020A}，常见的预训练语言模型包括BERT\upcite{devlin2018bert}、GPT\upcite{radford2018improving}、XLM\upcite{conneau2019unsupervised}等。
+\vspace{0.5em}
+\end{itemize}
\ No newline at end of file
--- a/Chapter4/Figures/Distributed-representation-of-words.png
+++ b/Chapter4/Figures/Distributed-representation-of-words.png
--- a/Chapter4/Figures/a-deeper-and-wider-grammar-tree-means-more-complex-sentence-structures.tex
+++ b/Chapter4/Figures/a-deeper-and-wider-grammar-tree-means-more-complex-sentence-structures.tex
+\begin{tikzpicture}[scale=0.5]
+\Tree[.IP
+		[.ADVP 
+			[.AD \node(e1){前几天};]
+		]
+		[.LCP
+			[.NP
+				[.CP
+					[.IP
+						[.VP
+							[.VV \node(e2){出};]
+							[.NP
+								[.NN \node(e3){事故};]
+							]
+						]
+					]
+					[.DEC \node(e4){的};]
+				]
+				[.DP
+					[.DT \node(e5){那};]
+					[.CLP
+						[.M \node(e6){条};]
+					]
+				]
+				[.NP
+					[.NN \node(e7){街};]
+				]
+			]
+			[.LC \node(e8){上};]
+		]
+		[.VP
+			[.VP
+				[.VE \node(e9){有};]
+				[.IP
+					[.NP
+						[.NN \node(e10){一家};]
+					]
+					[.VP
+						[.ADVP
+							[.AD \node(e11){非常};]
+						]
+						[.VP
+							[.VV \node(e12){气派的};]
+							[.NP
+								[.ADJP
+									[.JJ \node(e13){大};]
+								]
+								[.NP
+									[.NN \node(e14){商店};]
+								]
+							]
+						]
+					]
+				]
+			]
+			[.VP 
+				[.ADVP
+					[.AD \node(e15){，那里};]
+				]
+				[.ADVP
+					[.AD \node(e16){经常};]
+				]
+				[.VP
+					[.VV \node(e17){出售};]
+					[.NP
+						[.QP
+							[.CD \node(e18){一些};]
+						]
+						[.ADJP
+							[.JJ \node(e19){名贵};]
+						]
+						[.NP
+							[.NN \node(e20){鲜花};]
+						]
+					]
+				]
+			]
+		]
+		[.PU \node(e21){.};]
+	]
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter4/Figures/a-shallow-and-narrow-grammar-means-a-simpler-sentence-structure.tex
+++ b/Chapter4/Figures/a-shallow-and-narrow-grammar-means-a-simpler-sentence-structure.tex
+\begin{tikzpicture}[scale=0.7]
+	\Tree[.IP
+		[.NP 
+			[.NR \node(e1){俄国};]
+		]
+		[.VP
+			[.VV \node(e2){希望};]
+			[.IP
+				[.NP
+					[.NR \node(e3){伊朗};]
+				]
+				[.VP
+					[.ADVP
+						[.AD \node(e4) {没有};]
+					]
+					[.VP
+						[.VV \node(e5) {制造};]
+						[.NP
+							[.NN \node(e6){核武器};]
+							[.NN \node(e7){计划};]
+						]
+					]
+				]
+			]
+		]
+		[.PU \node(e8){.};]
+		]
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter4/Figures/absolute-match-word-alignment-1.tex
+++ b/Chapter4/Figures/absolute-match-word-alignment-1.tex
+\begin{tikzpicture}[scale=0.5]
+	\tikzstyle{cand} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=green!30]
+	\tikzstyle{ref} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=red!30]
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,minimum width=6em] (n11) at (0,0){\small\bfnew{Candidate :}};
+		\node[cand,anchor=west] (n12) at ([xshift=0.0em]n11.east){Can};
+		\node[cand,anchor=west] (n13) at ([xshift=1em]n12.east){I};
+		\node[cand,anchor=west] (n14) at ([xshift=1em]n13.east){have};
+		\node[cand,anchor=west] (n15) at ([xshift=1em]n14.east){this};
+		\node[cand,anchor=west] (n16) at ([xshift=1em]n15.east){like};
+		\node[cand,anchor=west] (n17) at ([xshift=1em]n16.east){he};
+		\node[cand,anchor=west] (n18) at ([xshift=1em]n17.east){do};
+		\node[cand,anchor=west] (n19) at ([xshift=1em]n18.east){it};
+		\node[cand,anchor=west] (n20) at ([xshift=1em]n19.east){?};
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,anchor=north,minimum width=6em] (n21) at ([yshift=-6em]n11.south){\small\bfnew{Reference :}};
+		\node[ref,anchor=west] (n22) at ([xshift=0.0em]n21.east){Can};
+		\node[ref,anchor=west] (n23) at ([xshift=1em]n22.east){I};
+		\node[ref,anchor=west] (n24) at ([xshift=1em]n23.east){eat};
+		\node[ref,anchor=west] (n25) at ([xshift=1em]n24.east){this};
+		\node[ref,anchor=west] (n26) at ([xshift=1em]n25.east){Can};
+		\node[ref,anchor=west] (n27) at ([xshift=1em]n26.east){like};
+		\node[ref,anchor=west] (n28) at ([xshift=1em]n27.east){he};
+		\node[ref,anchor=west] (n29) at ([xshift=1em]n28.east){did};
+		\node[ref,anchor=west] (n60) at ([xshift=1em]n29.east){?};
+		\draw[line width=1.6pt,blue!40] (n12.south) -- (n22.north);
+		\draw[line width=1.6pt,blue!40] (n13.south) -- (n23.north);
+		\draw[line width=1.6pt,blue!40] (n15.south) -- (n25.north);
+		\draw[line width=1.6pt,blue!40] (n16.south) -- (n27.north);
+		\draw[line width=1.6pt,blue!40] (n17.south) -- (n28.north);
+		\draw[line width=1.6pt,blue!40] (n20.south) -- (n60.north);
+\end{tikzpicture}
--- a/Chapter4/Figures/absolute-match-word-alignment-2.tex
+++ b/Chapter4/Figures/absolute-match-word-alignment-2.tex
+\begin{tikzpicture}[scale=0.5]
+	\tikzstyle{cand} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=green!30]
+	\tikzstyle{ref} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=red!30]
+	\node[align=center,minimum width=2.4em,minimum height=1.6em,minimum width=6em] (n11) at (0,0){\small\bfnew{Candidate :}};
+	\node[cand,anchor=west] (n12) at ([xshift=0.0em]n11.east){Can};
+	\node[cand,anchor=west] (n13) at ([xshift=1em]n12.east){I};
+	\node[cand,anchor=west] (n14) at ([xshift=1em]n13.east){have};
+	\node[cand,anchor=west] (n15) at ([xshift=1em]n14.east){this};
+	\node[cand,anchor=west] (n16) at ([xshift=1em]n15.east){like};
+	\node[cand,anchor=west] (n17) at ([xshift=1em]n16.east){he};
+	\node[cand,anchor=west] (n18) at ([xshift=1em]n17.east){do};
+	\node[cand,anchor=west] (n19) at ([xshift=1em]n18.east){it};
+	\node[cand,anchor=west] (n20) at ([xshift=1em]n19.east){?};
+	\node[align=center,minimum width=2.4em,minimum height=1.6em,anchor=north,minimum width=6em] (n21) at ([yshift=-6em]n11.south){\small\bfnew{Reference :}};
+	\node[ref,anchor=west] (n22) at ([xshift=0.0em]n21.east){Can};
+	\node[ref,anchor=west] (n23) at ([xshift=1em]n22.east){I};
+	\node[ref,anchor=west] (n24) at ([xshift=1em]n23.east){eat};
+	\node[ref,anchor=west] (n25) at ([xshift=1em]n24.east){this};
+	\node[ref,anchor=west] (n26) at ([xshift=1em]n25.east){Can};
+	\node[ref,anchor=west] (n27) at ([xshift=1em]n26.east){like};
+	\node[ref,anchor=west] (n28) at ([xshift=1em]n27.east){he};
+	\node[ref,anchor=west] (n29) at ([xshift=1em]n28.east){did};
+	\node[ref,anchor=west] (n30) at ([xshift=1em]n29.east){?};
+	\draw[line width=1.6pt,blue!40] (n12.south) -- (n26.north);
+	\draw[line width=1.6pt,blue!40] (n13.south) -- (n23.north);
+	\draw[line width=1.6pt,blue!40] (n15.south) -- (n25.north);
+	\draw[line width=1.6pt,blue!40] (n16.south) -- (n27.north);
+	\draw[line width=1.6pt,blue!40] (n17.south) -- (n28.north);
+	\draw[line width=1.6pt,blue!40] (n20.south) -- (n30.north);
+\end{tikzpicture}
--- a/Chapter4/Figures/determine-final-word-alignment.tex
+++ b/Chapter4/Figures/determine-final-word-alignment.tex
+\definecolor{ugreen}{rgb}{0,0.5,0}
+\begin{tikzpicture}[scale=0.5]
+	\tikzstyle{cand} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=green!30]
+	\tikzstyle{ref} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=red!30]
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,minimum width=6em] (n11) at (0,0){\small\bfnew{Candidate :}};
+		\node[cand,anchor=west] (n12) at ([xshift=0.0em]n11.east){Can};
+		\node[cand,anchor=west] (n13) at ([xshift=1em]n12.east){I};
+		\node[cand,anchor=west] (n14) at ([xshift=1em]n13.east){have};
+		\node[cand,anchor=west] (n15) at ([xshift=1em]n14.east){this};
+		\node[cand,anchor=west] (n16) at ([xshift=1em]n15.east){like};
+		\node[cand,anchor=west] (n17) at ([xshift=1em]n16.east){he};
+		\node[cand,anchor=west] (n18) at ([xshift=1em]n17.east){do};
+		\node[cand,anchor=west] (n19) at ([xshift=1em]n18.east){it};
+		\node[cand,anchor=west] (n20) at ([xshift=1em]n19.east){?};
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,anchor=north,minimum width=6em] (n21) at ([yshift=-6em]n11.south){\small\bfnew{Reference :}};
+		\node[ref,anchor=west] (n22) at ([xshift=0.0em]n21.east){Can};
+		\node[ref,anchor=west] (n23) at ([xshift=1em]n22.east){I};
+		\node[ref,anchor=west] (n24) at ([xshift=1em]n23.east){eat};
+		\node[ref,anchor=west] (n25) at ([xshift=1em]n24.east){this};
+		\node[ref,anchor=west] (n26) at ([xshift=1em]n25.east){Can};
+		\node[ref,anchor=west] (n27) at ([xshift=1em]n26.east){like};
+		\node[ref,anchor=west] (n28) at ([xshift=1em]n27.east){he};
+		\node[ref,anchor=west] (n29) at ([xshift=1em]n28.east){did};
+		\node[ref,anchor=west] (n30) at ([xshift=1em]n29.east){?};
+		\draw[line width=1.6pt,blue!40] (n12.south) -- (n22.north);
+		\draw[line width=1.6pt,blue!40] (n13.south) -- (n23.north);
+		\draw[line width=1.6pt,blue!40] (n15.south) -- (n25.north);
+		\draw[line width=1.6pt,blue!40] (n16.south) -- (n27.north);
+		\draw[line width=1.6pt,blue!40] (n17.south) -- (n28.north);
+		\draw[line width=1.6pt,blue!40] (n20.south) -- (n30.north);
+		\draw[line width=1.6pt,orange!40] (n18.south) -- (n29.north);
+		\draw[line width=1.6pt,ugreen!40](n14.south) -- (n24.north);
+\end{tikzpicture}
--- a/Chapter4/Figures/fig-cover.jpg
+++ b/Chapter4/Figures/fig-cover.jpg
--- a/Chapter4/Figures/logic-diagram-of-translation-quality-evaluation-method.tex
+++ b/Chapter4/Figures/logic-diagram-of-translation-quality-evaluation-method.tex
+%\documentclass[tikz]{standalone}
+%\usepackage{tikz}
+%\usepackage[UTF8]{ctex}
+%\usepackage{setspace}
+%\usetikzlibrary{shapes}
+%\usetikzlibrary{decorations.pathreplacing}
+%\begin{document}
+\begin{tikzpicture}[scale=0.8] % Overview diagram: a central circle split into three sectors, surrounded by one box per evaluation category. NOTE(review): ublue, ugreen, \bfnew, drop shadow, single arrow and the "background" layer are not defined in this file -- presumably supplied by the including document; confirm.
+\begin{scope}
+\tikzstyle{every node}=[scale=0.8] % every node in this scope is additionally scaled to 0.8
+% big circle at center
+\node [anchor=center,circle,draw,minimum width=10em,line width=0.2em,ublue] (base) at (0,0) {};
+\draw [-,very thick,line width=0.15em,ublue] (0,0) -- (base.90); % three radii (90, -30 and 210 degrees) divide the circle into three sectors
+\draw [-,very thick,line width=0.15em,ublue] (0,0) -- (base.-30);
+\draw [-,very thick,line width=0.15em,ublue] (0,0) -- (base.210);
+\node [anchor=south east,align=left] (autoevallabel) at ([xshift=-0.5em,yshift=0.5em]base.-10) {{\small\bfnew\footnotesize{人工构造}}\\{\small\bfnew\footnotesize{参考答案}}}; % label inside the right-hand sector
+\node [anchor=south west,align=left] (qualityestlabel) at ([xshift=0.5em,yshift=0.5em]base.190) {{\small\bfnew\footnotesize{人不参与}}\\{\small\bfnew\footnotesize{评价}}}; % label inside the left-hand sector
+\node [anchor=south,align=left] (humanevallabel) at ([yshift=1.0em]base.-90) {{\small\bfnew\footnotesize{人直接}}\\{\small\bfnew\footnotesize{进行评价}}}; % label inside the bottom sector
+% quality estimation
+\node [anchor=north east,minimum width=10em,minimum height=10em,draw=black!60,very thick,fill=ugreen!20,drop shadow] (qebox) at ([xshift=-8em]base.90) {}; % empty box to the left of the circle; its contents are separate nodes placed on top
+\node [draw,anchor=south,minimum width=10em,align=center,draw=black!60,very thick,fill=ugreen!20,drop shadow] (qelabel) at ([yshift=0.5em]qebox.north) {\small\footnotesize{需要较为复杂的建模，}\\\small\footnotesize{开发难度同机器翻译系统}}; % remark box sitting above qebox
+\node [anchor=north,minimum width=10em] (qetitle) at ([yshift=-0.2em]qebox.north) {{\small\bfnew\large{无参考答案的评价}}};
+\draw [-] ([yshift=-2em]qebox.north west) -- ([yshift=-2em]qebox.north east); % rule separating the title from the method list
+\node [anchor=north] (qemethod1) at ([yshift=-0.3em]qetitle.south) {单词级评价};
+\node [anchor=north] (qemethod2) at ([yshift=-0.3em]qemethod1.south) {短语级评价};
+\node [anchor=north] (qemethod3) at ([yshift=-0.3em]qemethod2.south) {句子级评价};
+\node [anchor=north] (qemethod4) at ([yshift=-0.3em]qemethod3.south) {篇章级评价};
+% auto evaluation
+\node [anchor=north west,minimum width=10em,minimum height=10em,draw=black!60,very thick,fill=red!20,drop shadow] (aebox) at ([xshift=8em]base.90) {}; % empty box to the right of the circle
+\node [draw,anchor=south,minimum width=10em,align=center,draw=black!60,very thick,fill=red!20,drop shadow] (aelabel) at ([yshift=0.5em]aebox.north) {\small\footnotesize{基于指标性公式}\\\small\footnotesize{和简单的建模}}; % remark box sitting above aebox
+\node [anchor=north,minimum width=10em] (aetitle) at ([yshift=-0.2em]aebox.north) {{\small\bfnew\large{有参考答案的评价}}};
+\draw [-] ([yshift=-2em]aebox.north west) -- ([yshift=-2em]aebox.north east); % rule separating the title from the metric list
+\node [anchor=north] (aemethod1) at ([yshift=-0.5em]aetitle.south) {BLEU、NIST、};
+\node [anchor=north] (aemethod2) at ([yshift=-0.3em]aemethod1.south) {GTM、Meteor、};
+\node [anchor=north] (aemethod3) at ([yshift=-0.3em]aemethod2.south) {WER、PER、TER、};
+\node [anchor=north] (aemethod4) at ([yshift=-0.3em]aemethod3.south) {HTER ...};
+% human evaluation
+\node [anchor=north,minimum width=10em,minimum height=6em,draw=black!60,very thick,fill=yellow!20,drop shadow] (hebox) at ([yshift=-4em]base.-90) {}; % empty box below the circle
+\node [anchor=north,minimum width=10em] (hetitle) at ([yshift=-0.2em]hebox.north) {{\small\bfnew\large{人工评价}}};
+\draw [-] ([yshift=-2em]hebox.north west) -- ([yshift=-2em]hebox.north east); % rule separating the title from the criteria list
+\node [anchor=north] (hemethod1) at ([yshift=-0.5em]hetitle.south) {流畅度、忠实度、};
+\node [anchor=north west] (hemethod2) at ([yshift=-0.0em]hemethod1.south west) {一致性\ \ ...}; % left-aligned with the line above
+% confidence estimation
+\node [anchor=east,align=left] (conf) at ([xshift=-6em,yshift=0.6em]hebox.west) {\small\bfnew{用于估计同}\\\small\bfnew{一个系统输}\\\small\bfnew{出的可信度}};
+\node [anchor=north,single arrow,minimum height=4.0em,fill=blue!30,rotate=-90] (arrow1) at ([yshift=-2.4em]qebox.south) {}; % arrow shape rotated by -90 degrees, positioned below qebox
+% comparing different systems
+\node [anchor=west,align=left] (com) at ([xshift=8em,yshift=0.6em]hebox.east) {\small\bfnew{用于对比}\\\small\bfnew{不同系统}\\\small\bfnew{的好坏}};
+\node [anchor=west,single arrow,minimum height=7.5em,fill=blue!30] (arrow2) at ([yshift=-1.4em,xshift=0.5em]hebox.north east) {}; % unrotated arrow starting at the right edge of hebox
+\node [anchor=north,fill=white] (arrow2label) at ([xshift=-0.5em]arrow2.south) {\footnotesize{{\color{blue} 成本高但精度高}}};
+\node [anchor=north,single arrow,minimum height=4.0em,fill=blue!30,rotate=-90] (arrow3) at ([yshift=-2.4em,xshift=2.2em]aebox.south) {}; % arrow shape rotated by -90 degrees, positioned below aebox
+\node [anchor=west,fill=white,font=\footnotesize,align=left,text=blue,inner sep=0pt] (arrow3label) at ([yshift=2.6em,xshift=0.6em]arrow3.east) {成本低\\无人工\\有偏差};
+% system optimization
+\node [anchor=west,align=left] (optimization) at ([xshift=2em]aebox.east) {\small\bfnew{用于}\\\small\bfnew{机器}\\\small\bfnew{翻译}\\\small\bfnew{系统}\\\small\bfnew{的调}\\\small\bfnew{优}}; % narrow vertical caption, broken manually every two characters
+\node [anchor=west,single arrow,minimum height=1.8em,fill=blue!30] (arrow4) at ([xshift=0.4em]aebox.east) {};
+\begin{pgfonlayer}{background}
+\draw [->,line width=0.3em,dotted,red] ([yshift=1em,xshift=0em]hebox.south east) -- ([yshift=1em,xshift=4em]hebox.south east) -- ([yshift=10em,xshift=4em]hebox.south east) node [pos=0.8,left] {\small{{\color{red} 评价标准}}}; % dotted red path from hebox: 4em to the right, then upwards; drawn on the background layer
+\end{pgfonlayer}
+% more arrows
+\draw [->,line width=0.3em,ublue] ([yshift=-0.2em]base.-90) -- ([yshift=0.2em]hebox.north); % circle -> human-evaluation box
+\draw [->,line width=0.3em,ublue] ([xshift=0.2em]base.0) -- ([xshift=2.7em]base.0); % short arrow leaving the circle to the right
+\draw [->,line width=0.3em,ublue] ([xshift=-0.2em]base.180) -- ([xshift=-2.7em]base.180); % short arrow leaving the circle to the left
+\end{scope}
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter4/Figures/match-words-with-stem.tex
+++ b/Chapter4/Figures/match-words-with-stem.tex
+\begin{tikzpicture}[scale=0.5] % Two candidate/reference word-alignment panels: nodes n11-n30 form the upper pair of rows, n31-n50 the lower pair. NOTE(review): \bfnew and drop shadow are not defined in this file -- presumably provided by the including document; confirm.
+	\tikzstyle{cand} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=green!30] % word box of a candidate row (green fill)
+	\tikzstyle{ref} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=red!30] % word box of a reference row (red fill)
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,minimum width=6em] (n11) at (0,0){\small\bfnew{Candidate :}}; % row label; minimum width is given twice, the later 6em setting takes effect
+		\node[cand,anchor=west] (n12) at ([xshift=0.0em]n11.east){Can};
+		\node[cand,anchor=west] (n13) at ([xshift=1em]n12.east){I};
+		\node[cand,anchor=west] (n14) at ([xshift=1em]n13.east){have};
+		\node[cand,anchor=west] (n15) at ([xshift=1em]n14.east){this};
+		\node[cand,anchor=west] (n16) at ([xshift=1em]n15.east){like};
+		\node[cand,anchor=west] (n17) at ([xshift=1em]n16.east){he};
+		\node[cand,anchor=west] (n18) at ([xshift=1em]n17.east){do};
+		\node[cand,anchor=west] (n19) at ([xshift=1em]n18.east){it};
+		\node[cand,anchor=west] (n20) at ([xshift=1em]n19.east){?};
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,anchor=north,minimum width=6em] (n21) at ([yshift=-6em]n11.south){\small\bfnew{Reference :}}; % upper-panel reference row, 6em below the candidate label
+		\node[ref,anchor=west] (n22) at ([xshift=0.0em]n21.east){Can};
+		\node[ref,anchor=west] (n23) at ([xshift=1em]n22.east){I};
+		\node[ref,anchor=west] (n24) at ([xshift=1em]n23.east){eat};
+		\node[ref,anchor=west] (n25) at ([xshift=1em]n24.east){this};
+		\node[ref,anchor=west] (n26) at ([xshift=1em]n25.east){Can};
+		\node[ref,anchor=west] (n27) at ([xshift=1em]n26.east){like};
+		\node[ref,anchor=west] (n28) at ([xshift=1em]n27.east){he};
+		\node[ref,anchor=west] (n29) at ([xshift=1em]n28.east){did};
+		\node[ref,anchor=west] (n30) at ([xshift=1em]n29.east){?};
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,minimum width=6em] (n31) at ([yshift=-5em]n21.south){\small\bfnew{Candidate :}}; % start of the lower panel. NOTE(review): unlike n21 and n41 this label has no anchor=north -- confirm the resulting vertical gap is intended
+		\node[cand,anchor=west] (n32) at ([xshift=0.0em]n31.east){Can};
+		\node[cand,anchor=west] (n33) at ([xshift=1em]n32.east){I};
+		\node[cand,anchor=west] (n34) at ([xshift=1em]n33.east){have};
+		\node[cand,anchor=west] (n35) at ([xshift=1em]n34.east){this};
+		\node[cand,anchor=west] (n36) at ([xshift=1em]n35.east){like};
+		\node[cand,anchor=west] (n37) at ([xshift=1em]n36.east){he};
+		\node[cand,anchor=west] (n38) at ([xshift=1em]n37.east){do};
+		\node[cand,anchor=west] (n39) at ([xshift=1em]n38.east){it};
+		\node[cand,anchor=west] (n40) at ([xshift=1em]n39.east){?};
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,anchor=north,minimum width=6em] (n41) at ([yshift=-6em]n31.south){\small\bfnew{Reference :}}; % lower-panel reference row
+		\node[ref,anchor=west] (n42) at ([xshift=0.0em]n41.east){Can};
+		\node[ref,anchor=west] (n43) at ([xshift=1em]n42.east){I};
+		\node[ref,anchor=west] (n44) at ([xshift=1em]n43.east){eat};
+		\node[ref,anchor=west] (n45) at ([xshift=1em]n44.east){this};
+		\node[ref,anchor=west] (n46) at ([xshift=1em]n45.east){Can};
+		\node[ref,anchor=west] (n47) at ([xshift=1em]n46.east){like};
+		\node[ref,anchor=west] (n48) at ([xshift=1em]n47.east){he};
+		\node[ref,anchor=west] (n49) at ([xshift=1em]n48.east){did};
+		\node[ref,anchor=west] (n50) at ([xshift=1em]n49.east){?};
+		\draw[line width=1.6pt,blue!40] (n12.south) -- (n22.north); % upper panel: candidate "Can" is linked to the first reference "Can" (n22)
+		\draw[line width=1.6pt,blue!40] (n13.south) -- (n23.north);
+		\draw[line width=1.6pt,blue!40] (n15.south) -- (n25.north);
+		\draw[line width=1.6pt,blue!40] (n16.south) -- (n27.north);
+		\draw[line width=1.6pt,blue!40] (n17.south) -- (n28.north);
+		\draw[line width=1.6pt,blue!40] (n20.south) -- (n30.north);
+		\draw[line width=2pt,orange!40] (n18.south) -- (n29.north); % "do" -- "did" link, drawn thicker and in orange
+		\draw[line width=1.6pt,blue!40] (n32.south) -- (n46.north); % lower panel: candidate "Can" is linked to the second reference "Can" (n46) instead
+		\draw[line width=1.6pt,blue!40] (n33.south) -- (n43.north);
+		\draw[line width=1.6pt,blue!40] (n35.south) -- (n45.north);
+		\draw[line width=1.6pt,blue!40] (n36.south) -- (n47.north);
+		\draw[line width=1.6pt,blue!40] (n37.south) -- (n48.north);
+		\draw[line width=1.6pt,blue!40] (n40.south) -- (n50.north);
+		\draw[line width=2pt,orange!40] (n38.south) -- (n49.north); % "do" -- "did" link, drawn thicker and in orange
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter4/Figures/representation-of-czech-reference-answer-set.tex
+++ b/Chapter4/Figures/representation-of-czech-reference-answer-set.tex
+\begin{tikzpicture}	% Lattice with five circle nodes (u1-u5) joined left to right by labelled arcs; parallel arcs between the same two nodes carry alternative Czech wordings
+	\tikzstyle{unit} = [circle,line width=1.5pt,draw,minimum size=1.5em] % empty circle used for every lattice node
+		\node[unit] (u1)at (0,0){};
+		\node[unit,anchor=west](u2) at ([xshift=5em]u1.east){}; % the xshift values set the gap to the previous node, i.e. the room for the arc labels
+		\node[unit,anchor=west](u3) at ([xshift=5em]u2.east){};
+		\node[unit,anchor=west](u4) at ([xshift=2em]u3.east){};
+		\node[unit,anchor=west](u5) at ([xshift=4em]u4.east){};
+		\draw[->,out=40,in=140,red,line width=1.5pt] (u1.north east) to  node[inner sep=0pt,color=red,above]{\footnotesize městská rada}(u2.north west); % u1 -> u2: two alternatives (red), one bent above and one bent below
+		\draw[->,out=-40,in=-140,red,line width=1.5pt] (u1.south east) to  node[inner sep=0pt,color=red,below]{\footnotesize zastupitelstvo města}(u2.south west);
+		\draw[->,out=40,in=140,teal,line width=1.5pt] (u2.north east) to  node[inner sep=0pt,color=teal,above]{\footnotesize schválila}(u3.north west); % u2 -> u3: three alternatives (teal): bent above, straight, bent below
+		\draw[->,teal,line width=1.5pt](u2.east)-- node[inner sep=0pt,color=teal,above]{\footnotesize požehnala}(u3.west);
+		\draw[->,out=-40,in=-140,teal,line width=1.5pt] (u2.south east) to  node[inner sep=0pt,color=teal,below]{\footnotesize souhlasila s}(u3.south west);
+		\draw[->,line width=1.5pt](u3.east) -- node[above]{\footnotesize nový} (u4.west); % u3 -> u4: single arc in the default colour
+		\draw[->,out=40,in=140,blue,line width=1.5pt] (u4.north east) to  node[inner sep=0pt,color=blue,above]{\footnotesize předpis}(u5.north west); % u4 -> u5: three alternatives (blue)
+		\draw[->,blue,line width=1.5pt](u4.east)-- node[inner sep=0pt,color=blue,above]{\footnotesize směrnici}(u5.west);
+		\draw[->,out=-40,in=-140,blue,line width=1.5pt] (u4.south east) to  node[inner sep=0pt,color=blue,below]{\footnotesize nařízení}(u5.south west);
+\end{tikzpicture}
--- a/Chapter4/Figures/representation-of-english-reference-answer-set.tex
+++ b/Chapter4/Figures/representation-of-english-reference-answer-set.tex
+\begin{tikzpicture}	% Lattice with six circle nodes (u1-u6) joined left to right by labelled arcs; parallel arcs between the same two nodes carry alternative English wordings
+	\tikzstyle{unit} = [circle,line width=1.5pt,draw,minimum size=1.5em] % empty circle used for every lattice node
+		\node[unit] (u1)at (0,0){};
+		\node[unit,anchor=west](u2) at ([xshift=2em]u1.east){}; % the xshift values set the gap to the previous node, i.e. the room for the arc labels
+		\node[unit,anchor=west](u3) at ([xshift=5em]u2.east){};
+		\node[unit,anchor=west](u4) at ([xshift=6em]u3.east){};
+		\node[unit,anchor=west](u5) at ([xshift=3em]u4.east){};
+		\node[unit,anchor=west](u6) at ([xshift=4em]u5.east){};
+		\draw[->,line width=1.5pt](u1.east) -- node[above]{\footnotesize the} (u2.west); % u1 -> u2: single arc in the default colour
+		\draw[->,out=40,in=140,red,line width=1.5pt] (u2.north east) to  node[inner sep=0pt,color=red,above]{\footnotesize city council}(u3.north west); % u2 -> u3: two alternatives (red), one bent above and one bent below
+		\draw[->,out=-40,in=-140,red,line width=1.5pt] (u2.south east) to  node[inner sep=0pt,color=red,below]{\footnotesize local government}(u3.south west);
+		\draw[->,out=40,in=140,teal,line width=1.5pt] (u3.north east) to  node[inner sep=0pt,color=teal,above]{\footnotesize approved}(u4.north west); % u3 -> u4: three alternatives (teal): bent above, straight, bent below
+		\draw[->,teal,line width=1.5pt](u3.east)-- node[inner sep=0pt,color=teal,above]{\footnotesize gave blessing to}(u4.west);
+		\draw[->,out=-40,in=-140,teal,line width=1.5pt] (u3.south east) to  node[inner sep=0pt,color=teal,below]{\footnotesize agreed with}(u4.south west);
+		\draw[->,line width=1.5pt](u4.east) -- node[above]{\footnotesize a new} (u5.west); % u4 -> u5: single arc in the default colour
+		\draw[->,out=40,in=140,blue,line width=1.5pt] (u5.north east) to  node[inner sep=0pt,color=blue,above]{\footnotesize regulation}(u6.north west); % u5 -> u6: three alternatives (blue)
+		\draw[->,blue,line width=1.5pt](u5.east)-- node[inner sep=0pt,color=blue,above]{\footnotesize decree}(u6.west);
+		\draw[->,out=-40,in=-140,blue,line width=1.5pt] (u5.south east) to  node[inner sep=0pt,color=blue,below]{\footnotesize directive}(u6.south west);
+\end{tikzpicture}
--- a/Chapter4/Figures/representation-of-reference-answer-set-in-hyter.tex
+++ b/Chapter4/Figures/representation-of-reference-answer-set-in-hyter.tex
+\begin{tikzpicture}	% Lattice with circle nodes u1-u7 joined left to right by labelled arcs; parallel arcs between the same two nodes carry alternative wordings
+	\tikzstyle{unit} = [circle,line width=1.5pt,draw,minimum size=1.5em] % empty circle used for every lattice node
+		\node[unit] (u1)at (0,0){};
+		\node[unit,anchor=west](u2) at ([xshift=7em]u1.east){}; % the xshift values set the gap to the previous node, i.e. the room for the arc labels
+		\node[unit,anchor=west](u3) at ([xshift=1.5em]u2.east){};
+		\node[unit,anchor=west](u4) at ([xshift=8em]u3.east){};
+		\node[unit,anchor=west](u5) at ([xshift=1.5em]u4.east){};
+		\node[unit,anchor=west](u6) at ([xshift=5em]u5.east){};
+		\node[unit,anchor=west,line width=1.5pt](u7) at ([xshift=2em]u6.east){};
+		\node[unit,anchor=west,line width=1.5pt,minimum size=1em](u8) at ([xshift=2.25em]u6.east){}; % smaller circle drawn inside u7 (offset 0.25em, diameter 0.5em smaller), giving the last node a double outline
+		\draw[->,out=40,in=140,red,line width=1.5pt] (u1.north east) to  node[inner sep=0pt,color=red,above]{\footnotesize the level of approval}(u2.north west); % u1 -> u2: three alternatives (red): bent above, straight, bent below
+		\draw[->,red,line width=1.5pt](u1.east)-- node[inner sep=0pt,color=red,above]{\footnotesize the approval rate}(u2.west);
+		\draw[->,out=-30,in=-150,red,line width=1.5pt] (u1.south east) to  node[inner sep=0pt,color=red,below]{\footnotesize the approval level}(u2.south west);
+		\draw[->,line width=1.5pt](u2.east) -- node[above]{\footnotesize for} (u3.west); % single arcs in the default colour
+		\draw[->,line width=1.5pt](u3.east) -- node[above]{\footnotesize national football team} (u4.west);
+		\draw[->,line width=1.5pt](u4.east) -- node[above]{\footnotesize was} (u5.west);
+		\draw[->,out=40,in=140,blue,line width=1.5pt] (u5.north east) to  node[inner sep=0pt,color=blue,above]{\footnotesize practically}(u6.north west); % u5 -> u6: three alternatives (blue)
+		\draw[->,blue,line width=1.5pt](u5.east)-- node[inner sep=0pt,color=blue,above]{\footnotesize close to}(u6.west);
+		\draw[->,out=-30,in=-150,blue,line width=1.5pt] (u5.south east) to  node[inner sep=0pt,color=blue,below]{\footnotesize about equal to}(u6.south west);
+		\draw[->,line width=1.5pt](u6.east) -- node[above]{\footnotesize zero} (u7.west);
+\end{tikzpicture}
--- a/Chapter4/Figures/schematic-diagram-of-phrase-level-quality assessment-task.tex
+++ b/Chapter4/Figures/schematic-diagram-of-phrase-level-quality assessment-task.tex
+%\usetikzlibrary{backgrounds} 
+%\usetikzlibrary{fit}
+\begin{tikzpicture}[scale=0.5] % Phrase-level figure: a Source word row (n1_*) above an MT word row (n2_*), boxes fitted around word groups, one tag under each MT box and a further row of gap tags. NOTE(review): uses fit, the "background" layer and ugreen, none of which are set up in this file (the \usetikzlibrary lines above are commented out) -- confirm the including document provides them.
+	\tikzstyle{unit} = [draw,inner sep=1.2pt,font=\tiny,minimum height=1em] % single word box
+	\tikzstyle{box} = [draw=blue!40,rectangle,inner xsep=1.4pt,inner ysep=3pt,line width=1.2pt] % frame fitted around the words of one group
+	\tikzstyle{bad_tag} = [fill=red!15,inner sep=1pt,align=center,font=\tiny,text=red,minimum height=0.8em] % red tag
+	\tikzstyle{ok_tag} = [fill=ugreen!15,inner sep=1pt,align=center,font=\tiny,text=ugreen,minimum height=0.8em] % green tag
+	\coordinate (o) at (0, 0); % reference point; both row labels are placed relative to it
+	\node[anchor=west,inner sep=0pt,align=center,font=\tiny] (n1_1) at ([yshift=8em]o.east){\textbf{Source}};
+	\node[unit,anchor=west,fill=green!20](n1_2) at ([xshift=1.2em]n1_1.east){Bei}; % in this row a 0.4em gap separates words that share a box, a 1em gap precedes a new box
+	\node[unit,anchor=west,fill=green!20](n1_3) at ([xshift=0.4em]n1_2.east){Patienten};
+	\node[unit,anchor=west,fill=green!20](n1_4) at ([xshift=0.4em]n1_3.east){mit};
+	\node[unit,anchor=west,fill=green!20](n1_5) at ([xshift=1em]n1_4.east){eingeschränkter};
+	\node[unit,anchor=west,fill=green!20](n1_6) at ([xshift=0.4em]n1_5.east){Nierenfunktion};
+	\node[unit,anchor=west,fill=green!20](n1_7) at ([xshift=0.4em]n1_6.east){kann};
+	\node[unit,anchor=west,fill=green!20](n1_8) at ([xshift=1em]n1_7.east){der};
+	\node[unit,anchor=west,fill=green!20](n1_9) at ([xshift=0.4em]n1_8.east){Insulinbedarf};
+	\node[unit,anchor=west,fill=green!20](n1_10) at ([xshift=1em]n1_9.east){infolge};
+	\node[unit,anchor=west,fill=green!20](n1_11) at ([xshift=0.4em]n1_10.east){des};
+	\node[unit,anchor=west,fill=green!20](n1_12) at ([xshift=1em]n1_11.east){verminderten};
+	\node[unit,anchor=west,fill=green!20](n1_13) at ([xshift=1em]n1_12.east){Insulinabbaus};
+	\node[unit,anchor=west,fill=green!20](n1_14) at ([xshift=1em]n1_13.east){verringert};
+	\node[unit,anchor=west,fill=green!20](n1_15) at ([xshift=0.4em]n1_14.east){sein};
+	\node[anchor=west,inner sep=0pt,align=center,font=\tiny] (n2_1) at ([yshift=-2em]o.east){\textbf{MT}};
+	\node[unit,anchor=west,fill=red!20](n2_2) at ([xshift=3.4em]n2_1.east){In}; % MT row; gaps between boxes vary and are hand-tuned per word
+	\node[unit,anchor=west,fill=red!20](n2_3) at ([xshift=0.4em]n2_2.east){patients};
+	\node[unit,anchor=west,fill=red!20](n2_4) at ([xshift=0.4em]n2_3.east){with};
+	\node[unit,anchor=west,fill=red!20](n2_5) at ([xshift=1.6em]n2_4.east){renal};
+	\node[unit,anchor=west,fill=red!20](n2_6) at ([xshift=0.4em]n2_5.east){impairment};
+	\node[unit,anchor=west,fill=red!20](n2_7) at ([xshift=1em]n2_6.east){,}; % the comma is not included in any fitted box below
+	\node[unit,anchor=west,fill=red!20](n2_8) at ([xshift=1em]n2_7.east){insulin};
+	\node[unit,anchor=west,fill=red!20](n2_9) at ([xshift=0.4em]n2_8.east){requirements};
+	\node[unit,anchor=west,fill=red!20](n2_10) at ([xshift=0.4em]n2_9.east){may};
+	\node[unit,anchor=west,fill=red!20](n2_11) at ([xshift=2.2em]n2_10.east){be};
+	\node[unit,anchor=west,fill=red!20](n2_12) at ([xshift=2.2em]n2_11.east){diminished};
+	\node[unit,anchor=west,fill=red!20](n2_13) at ([xshift=1.6em]n2_12.east){due};
+	\node[unit,anchor=west,fill=red!20](n2_14) at ([xshift=0.4em]n2_13.east){to};
+	\node[unit,anchor=west,fill=red!20](n2_15) at ([xshift=1.6em]n2_14.east){reduced};
+	\begin{pgfonlayer}{background}
+        \node [box] [fit = (n1_2) (n1_3) (n1_4)] (box1_1) {}; % box1_* frame the Source word groups
+        \node [box] [fit = (n1_5) (n1_6) (n1_7)] (box1_2) {};
+        \node [box] [fit = (n1_8) (n1_9)] (box1_3) {};
+        \node [box] [fit = (n1_10) (n1_11) ] (box1_4) {};
+        \node [box] [fit = (n1_12)] (box1_5) {};
+        \node [box] [fit = (n1_13)] (box1_6) {};
+        \node [box] [fit = (n1_14) (n1_15)] (box1_7) {};
+        \node [box] [fit = (n2_2) (n2_3) (n2_4)] (box2_1) {}; % box2_* frame the MT word groups
+        \node [box] [fit = (n2_5) (n2_6)] (box2_2) {};
+        \node [box] [fit = (n2_8) (n2_9) (n2_10)] (box2_3) {};
+        \node [box] [fit = (n2_11)] (box2_4) {};
+        \node [box] [fit = (n2_12)] (box2_5) {};
+        \node [box] [fit = (n2_13) (n2_14) ] (box2_6) {};
+        \node [box] [fit = (n2_15)] (box2_7) {};
+    \end{pgfonlayer}
+	\node[bad_tag,anchor=north] at ([yshift=-2pt]box2_1.south){BAD\_word\_order}; % one tag directly below each MT box
+	\node[bad_tag,anchor=north] at ([yshift=-2pt]box2_2.south){BAD};
+	\node[ok_tag,anchor=north] at ([yshift=-2pt]box2_3.south){OK};
+	\node[ok_tag,anchor=north] at ([yshift=-2pt]box2_4.south){OK};
+	\node[ok_tag,anchor=north] at ([yshift=-2pt]box2_5.south){OK};
+	\node[ok_tag,anchor=north] at ([yshift=-2pt]box2_6.south){OK};
+	\node[ok_tag,anchor=north] (tag_1) at ([yshift=-2pt]box2_7.south){OK}; % named so that the row caption can be attached to it
+	\node[ok_tag,anchor=north] (gap_1) at ([xshift=-5.6em,yshift=-3em]box2_1.south){OK}; % second tag row, 3em below the boxes; the x offsets are hand-tuned relative to each box
+	\node[ok_tag,anchor=north] (gap_2) at ([xshift=5.1em,yshift=-3em]box2_1.south){OK};
+	\node[ok_tag,anchor=north] (gap_3) at ([xshift=5.7em,yshift=-3em]box2_2.south){OK};
+	\node[ok_tag,anchor=north] (gap_4) at ([xshift=7.5em,yshift=-3em]box2_3.south){OK};
+	\node[bad_tag,anchor=north] (gap_5) at ([xshift=2.8em,yshift=-3em]box2_4.south){BAD\_omission};
+	\node[ok_tag,anchor=north] (gap_6) at ([xshift=3.5em,yshift=-3em]box2_5.south){OK};
+	\node[ok_tag,anchor=north] (gap_7) at ([xshift=2.7em,yshift=-3em]box2_6.south){OK};
+	\node[ok_tag,anchor=north] (tag_2) at ([xshift=3.1em,yshift=-3em]box2_7.south){OK}; % last tag of the second row; the caption of that row is attached to it
+	\node[anchor=west,inner sep=0pt,align=center,font=\tiny] at ([xshift=3.6em]tag_1.east){\textbf{Phrase-target tags}}; % captions at the right-hand end of the two tag rows
+	\node[anchor=west,inner sep=0pt,align=center,font=\tiny] at ([xshift=4.8em]tag_2.east){\textbf{Gap tags}};
+	\draw[magenta!50,line width=1pt] (n1_2.south) -- (n2_2.north); % magenta lines connect Source words to MT words; some words have several links
+	\draw[magenta!50,line width=1pt] (n1_3.south) -- (n2_3.north);
+	\draw[magenta!50,line width=1pt] (n1_3.south) -- (n2_4.north);
+	\draw[magenta!50,line width=1pt] (n1_4.south) -- (n2_4.north);
+	\draw[magenta!50,line width=1pt] (n1_5.south) -- (n2_5.north);
+	\draw[magenta!50,line width=1pt] (n1_6.south) -- (n2_5.north);
+	\draw[magenta!50,line width=1pt] (n1_6.south) -- (n2_6.north);
+	\draw[magenta!50,line width=1pt] (n1_7.south) -- (n2_10.north);
+	\draw[magenta!50,line width=1pt] (n1_8.south) -- (n2_12.north);
+	\draw[magenta!50,line width=1pt] (n1_9.south) -- (n2_12.north);
+	\draw[magenta!50,line width=1pt] (n1_10.south) -- (n2_13.north);
+	\draw[magenta!50,line width=1pt] (n1_10.south) -- (n2_14.north);
+	\draw[magenta!50,line width=1pt] (n1_12.south) -- (n2_15.north);
+	\draw[magenta!50,line width=1pt] (n1_13.south) -- (n2_11.north);	
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_1.north) -- ([yshift=3em]gap_1.north); % dashed gray pointers rising 3em from each tag of the second row
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_2.north) -- ([yshift=3em]gap_2.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_3.north) -- ([yshift=3em]gap_3.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_4.north) -- ([yshift=3em]gap_4.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt]([xshift=-0.8em]gap_5.north) -- ([xshift=-0.8em,yshift=3em]gap_5.north); % this pointer is shifted 0.8em left of the centre of the wide BAD_omission tag
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_6.north) -- ([yshift=3em]gap_6.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_7.north) -- ([yshift=3em]gap_7.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](tag_2.north) -- ([yshift=3em]tag_2.north);
+\end{tikzpicture}
--- a/Chapter4/Figures/schematic-diagram-of-word-level-quality-assessment-task.log
+++ b/Chapter4/Figures/schematic-diagram-of-word-level-quality-assessment-task.log
--- a/Chapter4/Figures/schematic-diagram-of-word-level-quality-assessment-task.tex
+++ b/Chapter4/Figures/schematic-diagram-of-word-level-quality-assessment-task.tex
+\definecolor{ugreen}{rgb}{0,0.5,0} % dark green used for the OK tags and for part of the PE--MT links
+\begin{tikzpicture}[scale=0.6] % Word-level figure with three word rows -- Source (n1_*), PE (n2_*) and MT (n3_*) -- plus tags above the Source row and two tag rows below the MT row
+	\tikzstyle{unit} = [draw,inner sep=3pt,font=\tiny,minimum height=1.2em] % single word box
+	\tikzstyle{bad_tag} = [fill=red!15,inner sep=1pt,align=center,font=\tiny,text=red] % red tag
+	\tikzstyle{ok_tag} = [fill=ugreen!15,inner sep=1pt,align=center,font=\tiny,text=ugreen] % green tag
+	\coordinate (o) at (0, 0); % reference point; the three row labels are placed at +5.5em, 0 and -5.5em relative to it
+	\node[anchor=west,inner sep=0pt,align=center,font=\scriptsize] (n1_1) at ([yshift=5.5em]o.east){\textbf{Source}};
+	\node[unit,anchor=west,fill=green!20](n1_2) at ([xshift=7.6em]n1_1.east){The}; % the large first offset (7.6em) indents the row relative to its label
+	\node[unit,anchor=west,fill=green!20](n1_3) at ([xshift=0.8em]n1_2.east){Sharpen};
+	\node[unit,anchor=west,fill=green!20](n1_4) at ([xshift=0.8em]n1_3.east){tool};
+	\node[unit,anchor=west,fill=green!20](n1_5) at ([xshift=0.8em]n1_4.east){sharpens};
+	\node[unit,anchor=west,fill=green!20](n1_6) at ([xshift=0.8em]n1_5.east){areas};
+	\node[unit,anchor=west,fill=green!20](n1_7) at ([xshift=0.8em]n1_6.east){in};
+	\node[unit,anchor=west,fill=green!20](n1_8) at ([xshift=0.8em]n1_7.east){an};
+	\node[unit,anchor=west,fill=green!20](n1_9) at ([xshift=0.8em]n1_8.east){image};
+	\node[unit,anchor=west,fill=green!20](n1_10) at ([xshift=0.8em]n1_9.east){.};
+	\node[anchor=west,inner sep=0pt,align=center,font=\scriptsize] (n2_1) at (o.east){\textbf{PE}};
+	\node[unit,anchor=west,fill=red!20](n2_2) at ([xshift=1em]n2_1.east){Mit};
+	\node[unit,anchor=west,fill=red!20](n2_3) at ([xshift=0.8em]n2_2.east){dem};
+	\node[unit,anchor=west,fill=red!20](n2_4) at ([xshift=0.8em]n2_3.east){Scharfzeichner};
+	\node[unit,anchor=west,fill=red!20](n2_5) at ([xshift=0.8em]n2_4.east){können};
+	\node[unit,anchor=west,fill=red!20](n2_6) at ([xshift=0.8em]n2_5.east){Sie};
+	\node[unit,anchor=west,fill=red!20](n2_7) at ([xshift=0.8em]n2_6.east){einzelne};
+	\node[unit,anchor=west,fill=red!20](n2_8) at ([xshift=0.8em]n2_7.east){Bereiche};
+	\node[unit,anchor=west,fill=red!20](n2_9) at ([xshift=0.8em]n2_8.east){in};
+	\node[unit,anchor=west,fill=red!20](n2_10) at ([xshift=0.8em]n2_9.east){einem};
+	\node[unit,anchor=west,fill=red!20](n2_11) at ([xshift=0.8em]n2_10.east){Bild};
+	\node[unit,anchor=west,fill=red!20](n2_12) at ([xshift=0.8em]n2_11.east){scharfzeichnen};
+	\node[unit,anchor=west,fill=red!20](n2_13) at ([xshift=0.8em]n2_12.east){.};
+	\node[anchor=west,inner sep=0pt,align=center,font=\scriptsize] (n3_1) at ([yshift=-5.5em]o.east){\textbf{MT}};
+	\node[unit,anchor=west,fill=blue!20](n3_2) at ([xshift=4.7em]n3_1.east){Der};
+	\node[unit,anchor=west,fill=blue!20](n3_3) at ([xshift=0.8em]n3_2.east){Schärfen-Werkezug};
+	\node[unit,anchor=west,fill=blue!20](n3_4) at ([xshift=0.8em]n3_3.east){Bereiche};
+	\node[unit,anchor=west,fill=blue!20](n3_5) at ([xshift=0.8em]n3_4.east){in};
+	\node[unit,anchor=west,fill=blue!20](n3_6) at ([xshift=0.8em]n3_5.east){einem};
+	\node[unit,anchor=west,fill=blue!20](n3_7) at ([xshift=0.8em]n3_6.east){Bild};
+	\node[unit,anchor=west,fill=blue!20](n3_8) at ([xshift=0.8em]n3_7.east){Schärfer};
+	\node[unit,anchor=west,fill=blue!20](n3_9) at ([xshift=0.8em]n3_8.east){erscheint};
+	\node[unit,anchor=west,fill=blue!20](n3_10) at ([xshift=0.8em]n3_9.east){.};
+	\node[bad_tag,anchor=south] at ([yshift=2pt]n1_2.north){BAD}; % one tag directly above each Source word
+	\node[bad_tag,anchor=south] at ([yshift=2pt]n1_3.north){BAD};
+	\node[bad_tag,anchor=south] at ([yshift=2pt]n1_4.north){BAD};
+	\node[bad_tag,anchor=south] at ([yshift=2pt]n1_5.north){BAD};
+	\node[ok_tag,anchor=south] at ([yshift=2pt]n1_6.north){OK};
+	\node[ok_tag,anchor=south] at ([yshift=2pt]n1_7.north){OK};
+	\node[ok_tag,anchor=south] at ([yshift=2pt]n1_8.north){OK};
+	\node[ok_tag,anchor=south] at ([yshift=2pt]n1_9.north){OK};
+	\node[ok_tag,anchor=south] (tag1) at ([yshift=2pt]n1_10.north){OK}; % named so that the "Source tags" caption can be attached to it
+	\node[bad_tag,anchor=north] at ([yshift=-2pt]n3_2.south){BAD}; % one tag directly below each MT word
+	\node[bad_tag,anchor=north] at ([yshift=-2pt]n3_3.south){BAD};
+	\node[ok_tag,anchor=north] at ([yshift=-2pt]n3_4.south){OK};
+	\node[ok_tag,anchor=north] at ([yshift=-2pt]n3_5.south){OK};
+	\node[ok_tag,anchor=north] at ([yshift=-2pt]n3_6.south){OK};
+	\node[ok_tag,anchor=north] at ([yshift=-2pt]n3_7.south){OK};
+	\node[bad_tag,anchor=north] at ([yshift=-2pt]n3_8.south){BAD};
+	\node[bad_tag,anchor=north] at ([yshift=-2pt]n3_9.south){BAD};
+	\node[ok_tag,anchor=north] (tag2) at ([yshift=-2pt]n3_10.south){OK}; % named so that the "MT tags" caption can be attached to it
+	\node[bad_tag,anchor=north] (gap_1)at ([xshift=-2em,yshift=-2em]n3_2.south){BAD}; % gap-tag row, 2em below the MT words; the x offsets are hand-tuned relative to each word
+	\node[ok_tag,anchor=north] (gap_2)at ([xshift=1.6em,yshift=-2em]n3_2.south){OK};
+	\node[bad_tag,anchor=north] (gap_3)at ([xshift=4.4em,yshift=-2em]n3_3.south){BAD};
+	\node[ok_tag,anchor=north] (gap_4)at ([xshift=2.5em,yshift=-2em]n3_4.south){OK};
+	\node[ok_tag,anchor=north] (gap_5)at ([xshift=1.3em,yshift=-2em]n3_5.south){OK};
+	\node[ok_tag,anchor=north] (gap_6)at ([xshift=2em,yshift=-2em]n3_6.south){OK};
+	\node[ok_tag,anchor=north] (gap_7)at ([xshift=1.7em,yshift=-2em]n3_7.south){OK};
+	\node[ok_tag,anchor=north] (gap_8)at ([xshift=2.4em,yshift=-2em]n3_8.south){OK};
+	\node[ok_tag,anchor=north] (gap_9)at ([xshift=2.5em,yshift=-2em]n3_9.south){OK};
+	\node[ok_tag,anchor=north](tag3) at ([xshift=1.3em,yshift=-2em]n3_10.south){OK}; % last tag of the gap row; the "Gap tags" caption is attached to it
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_1.north) -- ([yshift=2em]gap_1.north); % dashed gray pointers rising 2em from each gap tag
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_2.north) -- ([yshift=2em]gap_2.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_3.north) -- ([yshift=2em]gap_3.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_4.north) -- ([yshift=2em]gap_4.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_5.north) -- ([yshift=2em]gap_5.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_6.north) -- ([yshift=2em]gap_6.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_7.north) -- ([yshift=2em]gap_7.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_8.north) -- ([yshift=2em]gap_8.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](gap_9.north) -- ([yshift=2em]gap_9.north);
+	\draw[dash pattern=on 2pt off 1pt,gray,line width=1pt](tag3.north) -- ([yshift=2em]tag3.north);
+	\draw [line width=1pt](n1_2.south) -- (n2_3.north); % black lines connect Source words to PE words
+	\draw [line width=1pt](n1_3.south) -- (n2_4.north);
+	\draw [line width=1pt](n1_4.south) -- (n2_4.north);
+	\draw [line width=1pt](n1_5.south) -- (n2_12.north);
+	\draw [line width=1pt](n1_6.south) -- (n2_8.north);
+	\draw [line width=1pt](n1_7.south) -- (n2_9.north);
+	\draw [line width=1pt](n1_8.south) -- (n2_10.north);
+	\draw [line width=1pt](n1_9.south) -- (n2_11.north);
+	\draw [line width=1pt](n1_10.south) -- (n2_13.north);
+	\draw[line width=1pt,red!60] (n2_3.south) -- (n3_2.north); % PE--MT links: red where the linked MT word carries a BAD tag, green where it carries OK
+	\draw[line width=1pt,red!60] (n2_4.south) -- (n3_3.north);
+	\draw[line width=1pt,ugreen!60] (n2_8.south) -- (n3_4.north);
+	\draw[line width=1pt,ugreen!60] (n2_9.south) -- (n3_5.north);
+	\draw[line width=1pt,ugreen!60] (n2_10.south) -- (n3_6.north);
+	\draw[line width=1pt,ugreen!60] (n2_11.south) -- (n3_7.north);
+	\draw[line width=1pt,red!60] (n2_12.south) -- (n3_8.north);
+	\draw[line width=1pt,ugreen!60] (n2_13.south) -- (n3_10.north);
+	\node[anchor=west,inner sep=0pt,align=center,font=\scriptsize]  at ([xshift=4.6em]tag1.east){\textbf{Source tags}}; % captions at the right-hand end of the three tag rows
+	\node[anchor=west,inner sep=0pt,align=center,font=\scriptsize]  at ([xshift=2.6em]tag2.east){\textbf{MT tags}};
+	\node[anchor=west,inner sep=0pt,align=center,font=\scriptsize]  at ([xshift=1.1em]tag3.east){\textbf{Gap tags}};
+\end{tikzpicture}
\ No newline at end of file
--- a/Chapter4/Figures/several-common-distributed-representations.tex
+++ b/Chapter4/Figures/several-common-distributed-representations.tex
+\begin{tikzpicture} % Two heading nodes (n1, n2), each followed by a brace and a vertical list of items. NOTE(review): decorate/decoration={brace} presumably relies on the decorations.pathreplacing library being loaded by the including document -- confirm.
+	\tikzstyle{unit1} = [inner sep=1pt,align=center,font=\footnotesize] % heading text
+	\tikzstyle{unit2} = [inner sep=1pt,align=center,font=\scriptsize] % list item text
+		\node[unit1] (n1) at (0,0){单词分布式表示}; % first heading
+		\node[unit2,anchor=west] (n11) at ([xshift=1em,yshift=4em]n1.east){one-hot词向量}; % six items spaced 1.6em apart, from +4em down to -4em around the heading
+		\node[unit2,anchor=west] (n12) at ([xshift=1em,yshift=2.4em]n1.east){Word2Vec词向量};
+		\node[unit2,anchor=west] (n13) at ([xshift=1em,yshift=0.8em]n1.east){GloVe词向量};
+		\node[unit2,anchor=west] (n14) at ([xshift=1em,yshift=-0.8em]n1.east){…};
+		\node[unit2,anchor=west] (n15) at ([xshift=1em,yshift=-2.4em]n1.east){ELMO预训练词向量};
+		\node[unit2,anchor=west] (n16) at ([xshift=1em,yshift=-4em]n1.east){BERT预训练词向量};
+		\draw[decorate,decoration={brace,mirror,amplitude=2mm}] ([xshift=-0.3em]n11.west) -- ([xshift=-0.3em]n16.west); % brace running from the first to the last item, 0.3em left of the items
+		\node[unit1,anchor=west] (n2) at ([xshift=9em]n1.east){句子分布式表示}; % second heading, 9em to the right of the first
+		\node[unit2,anchor=west] (n21) at ([xshift=1em,yshift=4.2em]n2.east){RAE编码}; % seven items spaced 1.4em apart, from +4.2em down to -4.2em around the heading
+		\node[unit2,anchor=west] (n22) at ([xshift=1em,yshift=2.8em]n2.east){Doc2Vec向量};
+		\node[unit2,anchor=west] (n23) at ([xshift=1em,yshift=1.4em]n2.east){ELMO预训练句子表示};
+		\node[unit2,anchor=west] (n24) at ([xshift=1em,yshift=0em]n2.east){…};
+		\node[unit2,anchor=west] (n25) at ([xshift=1em,yshift=-1.4em]n2.east){GPT句子表示};
+		\node[unit2,anchor=west] (n26) at ([xshift=1em,yshift=-2.8em]n2.east){BERT预训练句子表示};
+		\node[unit2,anchor=west] (n27) at ([xshift=1em,yshift=-4.2em]n2.east){skip-thought向量};
+		\draw[decorate,decoration={brace,mirror,amplitude=2mm}] ([xshift=-0.3em]n21.west) -- ([xshift=-0.3em]n27.west); % brace running from the first to the last item of the second list
+\end{tikzpicture}
--- a/Chapter4/Figures/synonym-matching-word-alignment.tex
+++ b/Chapter4/Figures/synonym-matching-word-alignment.tex
+\definecolor{ugreen}{rgb}{0,0.5,0} % dark green used for the have--eat links
+\begin{tikzpicture}[scale=0.5] % Two candidate/reference word-alignment panels: nodes n11-n30 form the upper pair of rows, n31-n50 the lower pair. NOTE(review): \bfnew and drop shadow are not defined in this file -- presumably provided by the including document; confirm.
+	\tikzstyle{cand} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=green!30] % word box of a candidate row (green fill)
+	\tikzstyle{ref} = [draw,line width=1pt,align=center,minimum width=2.6em,minimum height=1.6em,drop shadow={shadow xshift=0.15em},fill=red!30] % word box of a reference row (red fill)
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,minimum width=6em] (n11) at (0,0){\small\bfnew{Candidate :}}; % row label; minimum width is given twice, the later 6em setting takes effect
+		\node[cand,anchor=west] (n12) at ([xshift=0.0em]n11.east){Can};
+		\node[cand,anchor=west] (n13) at ([xshift=1em]n12.east){I};
+		\node[cand,anchor=west] (n14) at ([xshift=1em]n13.east){have};
+		\node[cand,anchor=west] (n15) at ([xshift=1em]n14.east){this};
+		\node[cand,anchor=west] (n16) at ([xshift=1em]n15.east){like};
+		\node[cand,anchor=west] (n17) at ([xshift=1em]n16.east){he};
+		\node[cand,anchor=west] (n18) at ([xshift=1em]n17.east){do};
+		\node[cand,anchor=west] (n19) at ([xshift=1em]n18.east){it};
+		\node[cand,anchor=west] (n20) at ([xshift=1em]n19.east){?};
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,anchor=north,minimum width=6em] (n21) at ([yshift=-6em]n11.south){\small\bfnew{Reference :}}; % upper-panel reference row
+		\node[ref,anchor=west] (n22) at ([xshift=0.0em]n21.east){Can};
+		\node[ref,anchor=west] (n23) at ([xshift=1em]n22.east){I};
+		\node[ref,anchor=west] (n24) at ([xshift=1em]n23.east){eat};
+		\node[ref,anchor=west] (n25) at ([xshift=1em]n24.east){this};
+		\node[ref,anchor=west] (n26) at ([xshift=1em]n25.east){Can};
+		\node[ref,anchor=west] (n27) at ([xshift=1em]n26.east){like};
+		\node[ref,anchor=west] (n28) at ([xshift=1em]n27.east){he};
+		\node[ref,anchor=west] (n29) at ([xshift=1em]n28.east){did};
+		\node[ref,anchor=west] (n30) at ([xshift=1em]n29.east){?};
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,anchor=north,minimum width=6em] (n31) at ([yshift=-5em]n21.south){\small\bfnew{Candidate :}}; % start of the lower panel
+		\node[cand,anchor=west] (n32) at ([xshift=0.0em]n31.east){Can};
+		\node[cand,anchor=west] (n33) at ([xshift=1em]n32.east){I};
+		\node[cand,anchor=west] (n34) at ([xshift=1em]n33.east){have};
+		\node[cand,anchor=west] (n35) at ([xshift=1em]n34.east){this};
+		\node[cand,anchor=west] (n36) at ([xshift=1em]n35.east){like};
+		\node[cand,anchor=west] (n37) at ([xshift=1em]n36.east){he};
+		\node[cand,anchor=west] (n38) at ([xshift=1em]n37.east){do};
+		\node[cand,anchor=west] (n39) at ([xshift=1em]n38.east){it};
+		\node[cand,anchor=west] (n40) at ([xshift=1em]n39.east){?};
+		\node[align=center,minimum width=2.4em,minimum height=1.6em,anchor=north,minimum width=6em] (n41) at ([yshift=-6em]n31.south){\small\bfnew{Reference :}}; % lower-panel reference row: its words n42-n50 use the ref style, so the label reads "Reference" like n21
+		\node[ref,anchor=west] (n42) at ([xshift=0.0em]n41.east){Can};
+		\node[ref,anchor=west] (n43) at ([xshift=1em]n42.east){I};
+		\node[ref,anchor=west] (n44) at ([xshift=1em]n43.east){eat};
+		\node[ref,anchor=west] (n45) at ([xshift=1em]n44.east){this};
+		\node[ref,anchor=west] (n46) at ([xshift=1em]n45.east){Can};
+		\node[ref,anchor=west] (n47) at ([xshift=1em]n46.east){like};
+		\node[ref,anchor=west] (n48) at ([xshift=1em]n47.east){he};
+		\node[ref,anchor=west] (n49) at ([xshift=1em]n48.east){did};
+		\node[ref,anchor=west] (n50) at ([xshift=1em]n49.east){?};
+		\draw[line width=1.6pt,blue!40] (n12.south) -- (n22.north); % upper panel: candidate "Can" is linked to the first reference "Can" (n22)
+		\draw[line width=1.6pt,blue!40] (n13.south) -- (n23.north);
+		\draw[line width=1.6pt,blue!40] (n15.south) -- (n25.north);
+		\draw[line width=1.6pt,blue!40] (n16.south) -- (n27.north);
+		\draw[line width=1.6pt,blue!40] (n17.south) -- (n28.north);
+		\draw[line width=1.6pt,blue!40] (n20.south) -- (n30.north);
+		\draw[line width=1.6pt,orange!40] (n18.south) -- (n29.north); % "do" -- "did" link in orange
+		\draw[line width=2pt,ugreen!40](n14.south) -- (n24.north); % "have" -- "eat" link, drawn thicker and in green
+		\draw[line width=1.6pt,blue!40] (n32.south) -- (n46.north); % lower panel: candidate "Can" is linked to the second reference "Can" (n46) instead
+		\draw[line width=1.6pt,blue!40] (n33.south) -- (n43.north);
+		\draw[line width=1.6pt,blue!40] (n35.south) -- (n45.north);
+		\draw[line width=1.6pt,blue!40] (n36.south) -- (n47.north);
+		\draw[line width=1.6pt,blue!40] (n37.south) -- (n48.north);
+		\draw[line width=1.6pt,blue!40] (n40.south) -- (n50.north);
+		\draw[line width=1.6pt,orange!40] (n38.south) -- (n49.north); % "do" -- "did" link in orange
+		\draw[line width=2pt,ugreen!40](n34.south) -- (n44.north); % "have" -- "eat" link, drawn thicker and in green
+\end{tikzpicture}
--- a/Chapter4/chapter4.tex
+++ b/Chapter4/chapter4.tex
@@ -15,7 +15,7 @@
 \renewcommand\figurename{图}%将figure改为图
 \renewcommand\tablename{表}%将figure改为图
-\chapterimage{fig-NEU-4.jpg} % Chapter heading image
+\chapterimage{fig-NEU-2.jpg} % Chapter heading image
 %----------------------------------------------------------------------------------------
 %	CHAPTER 4
@@ -23,8 +23,860 @@
 \chapter{翻译质量评价}
+\parinterval 人们在使用机器翻译系统时需要评估系统输出结果的质量。这个过程也被称作机器翻译译文质量评价，简称为{\small\sffamily\bfseries{译文质量评价}}\index{译文质量评价}（Quality Evaluation of Translation）\index{Quality Evaluation of Translation}。在机器翻译的发展进程中，译文质量评价有着非常重要的作用，不论在系统研发的反复迭代中，还是在诸多的机器翻译应用场景中，都存在大量的译文质量评价环节。从某种意义上说，没有译文质量评价，机器翻译也不会发展成今天的样子。比如，本世纪初研究人员提出了译文质量自动评价方法BLEU\upcite{DBLP:conf/acl/PapineniRWZ02}。该方法使得机器系统的评价变得自动、快速、便捷，而且评价过程可以重复。正是由于BLEU等自动评价方法的提出，机器翻译研究人员可以在更短的时间内得到译文质量的评价结果，加速系统研发的进程。
+\parinterval 时至今日，译文质量评价方法已经非常丰富，针对不同的使用场景研究人员陆续提出了不同的方法。本章将会对其中的典型方法进行介绍，包括：人工评价、有参考答案自动评价、无参考答案自动评价等。相关方法及概念也会在本章的后续章节中被广泛使用。
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
+\section{译文质量评价所面临的挑战}
+\parinterval 一般来说，译文质量评价可以被看作是一个对译文进行打分或者排序的过程，打分或者排序的结果代表了翻译质量的好坏。比如，表\ref{tab:4-1}展示了一个汉译英的译文质量评价结果。这里采用了5分制打分，1代表最低分，5代表最高分。可以看出，流畅的高质量译文得分较高，相反，存在问题的译文得分较低。
+\begin{table}[htp]{
+\begin{center}
+\caption{汉译英译文质量评价实例}
+{
+\begin{tabular}{c|l|c}
+源文 & 那只敏捷的棕色狐狸跳过了那只懒惰的狗。 & 打分 \\
+\hline
+\rule{0pt}{10pt} 机器译文1 & The quick brown fox jumped over the lazy dog. & 5 \\
+\rule{0pt}{10pt} 机器译文2 & The fast brown fox jumped over a sleepy dog. & 4 \\
+\rule{0pt}{10pt} 机器译文3 & The fast brown fox jumps over the dog. & 3 \\
+\rule{0pt}{10pt} 机器译文4 & The quick brown fox jumps over dog. & 2 \\
+\rule{0pt}{10pt} 机器译文5 & A fast fox jump dog. & 1 \\
+\end{tabular}
+\label{tab:4-1}
+}
+\end{center}
+}\end{table}
+\parinterval 这里的一个核心问题是：从哪个角度对译文质量进行评价呢？常用的标准有：{\small\sffamily\bfseries{流畅度}}\index{流畅度}（Fluency）\index{Fluency}和{\small\sffamily\bfseries{忠实度}}\index{忠实度}（Adequacy）\index{Adequacy}\upcite{DBLP:journals/mt/ChurchH93}。其中流畅度是指译文在目标语言中的流畅程度，越通顺的译文流畅度越高；忠实度是指译文表达源文意思的程度，如果译文能够全面、准确的表达原文的意思，那么它具有较高的翻译忠实度。在一些极端的情况下，译文可以非常流畅，但是与原文完全不对应。或者，译文可以非常好的对应原文，但是读起来非常不连贯。这些译文都不是很好的译文。
+\parinterval 传统观点把翻译分为``信''、``达''、``雅''三个层次，而忠实度体现的是一种``信''的思想，而流畅度体现的是一种``达''的思想。不过``雅''在机器翻译评价中还不是一个常用的标准，而且机器翻译还没有达到``雅''的水平，是未来所追求的目标。
+\parinterval 给定评价标准，译文质量评价有很多实现方式。比如，可以使用人工评价的方式让评委对每个译文进行打分（\ref{Manual evaluation}节），也可以用自动评价的方式让计算机比对译文和参考答案之间的匹配的程度（\ref{Automatic evaluation with reference answers}节）。但是，自然语言的翻译是最复杂的人工智能问题之一。这不仅仅体现在相关问题的建模和系统实现的复杂性上，译文质量评价也同样面临着诸多挑战。
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{译文不唯一}}。自然语言表达的丰富性决定了同一个意思往往有很多种表达方式。同一句话，不同译者的翻译也往往存在差异。译者的背景、翻译水平、翻译所处的语境，甚至译者的情绪都会对译文产生影响。如何在评价过程中尽可能考虑多样的译文，是译文质量评价中最具挑战的问题之一。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{评价标准不唯一}}。虽然流畅度和忠实度给译文质量评价提供了很好的参考依据，但是在实践中往往会有更多样的需求。比如，在专利翻译中，术语翻译的准确性就是必须要考虑的因素，一个术语的翻译错误会导致整个译文不可用。此外，术语翻译的一致性也是非常重要的，即使同一个术语有多种正确的译文，但是在同一个专利文档中，术语翻译需要保持一致。不同的需求使得很难用统一的标准对译文质量进行评价。在实践中，往往需要针对不同应用场景设计不同的评价标准。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{自动评价与人工评价存在着偏差}}。固然使用人工的方式可以准确地评估译文质量，但是这种方式费时、费力。而且由于人工评价的主观性，其结果不易重现，也就是不同人的评价结果会有差异。这些因素也造成了人工评价不能被过于频繁的使用。翻译质量的自动评价可以充分利用计算机的计算能力，对译文与参考答案进行比对，具有速度快、结果可重现的优点，但是其精度不如人工评价。使用何种评价方法也是实践中需要考虑的重要问题之一。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{自动评价过程中会存在无法提供参考答案或者只能提供部分参考答案的情况}}。很多情况下，译文的正确答案并不容易获取。甚至对于某些低资源语种，相关的语言学家都很稀缺。这时很难进行基于标准答案的评价。如何在没有参考答案的情况下对译文质量进行估计是极具应用前景且颇具挑战的方向。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 针对以上问题，研究人员设计出多种不同的译文质量评价方法。根据人工参与方式的不同，可以分为人工评价、有参考答案的自动评价、无参考答案的自动评价。这些方法也对应了不同的使用场景。
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{人工评价}}。当需要对系统进行准确的评估时，往往采用人工评价。比如机器翻译的一些互联网上应用，在系统上线前都会采用人工评价对机器翻译系统性能进行测试。当然，这种方法的时间和人力成本是最高的。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{有参考答案的自动评价}}。由于机器翻译系统研发过程中需要频繁地对系统性能进行评价，这时可以让人标注一些正确的译文，之后把这些译文作为参考答案与机器翻译系统输出的结果进行比对。这种自动评价的结果获取成本低，可以多次重复，而且可以用于对系统结果的快速反馈，指导系统优化的方向。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{无参考答案的自动评价}}。在很多应用场景中，在系统输出译文时，使用者希望提前知道译文的质量，即使这时并没有可比对的参考答案。这样，系统使用者可以根据这个对质量的``估计''结果有选择地使用机器翻译译文。严格意义上说，这并不是一个传统的译文质量评价方法，而是一种对译文置信度和可能性的估计。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 图\ref{fig:4-2}给出了机器翻译译文评价方法的逻辑关系图。需要注意的是，很多时候，译文质量评价结果是用于机器翻译系统优化的。在随后的章节中也会看到，译文评价的结果会被用于不同的机器翻译模型优化中。甚至说，很多统计指标（如极大似然）也可以被看作是一种对译文的``评价''，这样也就把机器翻译的统计建模和译文评价联系在了一起。本章的后半部分将重点介绍传统的译文质量评价方法。与译文质量评价相关的模型优化方法将会在后续章节详细论述。
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+	\input{./Chapter4/Figures/logic-diagram-of-translation-quality-evaluation-method}
+   \caption{译文质量评价方法逻辑图}
+   \label{fig:4-2}
+\end{figure}
+%----------------------------------------------
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
+\sectionnewpage
+\section{人工评价}\label{Manual evaluation}
+\parinterval 顾名思义，人工评价是指评价者根据翻译结果好坏对译文进行评价。例如，可以根据句子的忠实度和流畅度对其进行打分，这样能够准确评定出译文是否准确翻译出源文的意思以及译文是否通顺。在人工评价时，一般由多个评价者匿名对译文打分，之后综合所有评价者的评价结果给出最终的得分。人工评价可以准确反映句子的翻译质量，是最权威、可信度最高的评价方法，但是其缺点也十分明显：需要耗费人力物力，而且评价的周期长，不能及时得到有效的反馈。因此在实际系统开发中，纯人工评价不会过于频繁的被使用，它往往和自动评价一起配合，帮助系统研发人员准确的了解当前系统的状态。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{评价策略}
+\parinterval 合理的评价指标是人工评价得以顺利进行的基础。机器译文质量的人工评价可以追溯到1966年，自动语言处理咨询委员会（ALPAC）提出{\small\sffamily\bfseries{可理解度}}\index{可理解度}（Intelligibility）\index{Intelligibility}和{\small\sffamily\bfseries{忠诚度}}\index{忠诚度}（Fidelity）\index{Fidelity}作为机器译文质量人工评价指标\upcite{DBLP:journals/mtcl/Carroll66}。1994年，{\small\sffamily\bfseries{充分性}}\index{充分性}（Adequacy）\index{Adequacy}、{\small\sffamily\bfseries{流利度}}\index{流利度}（Fluency）\index{Fluency}和{\small\sffamily\bfseries{信息性}}\index{信息性}（Informativeness）\index{Informativeness}成为ARPA MT\footnote{ARPA MT计划是美国高级研究计划局软件和智能系统技术处人类语言技术计划的一部分。}的人工评价标准\upcite{DBLP:conf/amta/WhiteOO94}。此后，有不少研究者提出了更多的机器译文质量人工评估指标，例如将{\small\sffamily\bfseries{清晰度}}\index{清晰度}（Clarity）\index{Clarity}和{\small\sffamily\bfseries{连贯性}}\index{连贯性}（Coherence）\index{Coherence}加入人工评价指标中\upcite{Miller:2005:MTS}。甚至有人将各种人工评价指标集中在一起，组成了尽可能详尽无遗的机器翻译评估框架\upcite{king2003femti}。
+\parinterval 人工评价的策略非常多。考虑不同的因素，往往会使用不同的评价方案，比如：
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{是否呈现源语言文本}}。在进行人工评价时，可以向评价者提供源语言文本或参考答案、或同时提供源语言文本和参考答案。从评价的角度，参考答案已经能够帮助评价者进行正确评价，但是源语言文本可以提供更多信息帮助评估译文的准确性。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{评价者选择}}。理想情况下，评价者应同时具有源语言和目标语言的语言能力。但是，很多时候双语能力的评价者很难招募，因此这时会考虑使用目标语为母语的评价者。配合参考答案，单语评价者也可以准确地评价译文质量。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{多个系统同时评价}}。如果有多个不同系统的译文需要评价，可以直接使用每个系统单独打分的方法。但是，如果仅仅是想了解不同译文之间的相对好坏，也可以采用竞评的方式，即对于每个句子，对不同系统根据译文质量进行排序，这样做的效率会高于直接打分，而且评价准确性也能够得到保证。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{数据选择}}。评价数据一般需要根据目标任务进行采集，为了避免和系统训练数据重复，往往会搜集最新的数据。而且，评价数据的规模越大，评价结果越科学。常用的做法是搜集一定量的评价数据，之后从中采样出所需的数据。由于不同的采样会得到不同的评价集合，这样的方法可以复用多次，得到不同的测试集。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{面向应用的评价}}。除了人工直接打分，一种更有效的方法是把机器翻译的译文嵌入到下游应用中，通过机器翻译对下游应用的改善效果评估机器翻译译文质量。比如，可以把机器翻译放入译后编辑流程中，通过对比译员翻译效率的提升来评价译文质量。再比如，把机器翻译放入线上应用中，通过点击率或者用户反馈来评价机器翻译的品质。
+\vspace{0.5em}
+\end{itemize}
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{打分标准}
+\parinterval 如何对译文进行打分是机器翻译评价的核心问题。在人工评价方法中，一种被广泛使用的方法是{\small\sffamily\bfseries{直接评估}}\index{直接评估}（Direct Assessment，DA）\index{Direct Assessment}\upcite{DBLP:conf/amta/WhiteOO94}，这种评价方法需要评价者给出对机器译文绝对的评分：在给定一个机器译文和一个参考答案的情况下，评价者直接给出1-100的分数用来表征机器译文的质量。与其类似的策略是对机器翻译质量进行等级评定\upcite{DBLP:journals/mt/PrzybockiPBS09}，常见的是在5级或7级标准中指定单一等级用以反映机器翻译质量。也有研究者提出利用语言测试技术对机器翻译质量进行评价\upcite{reeder2006direct}，其中涉及多等级内容的评价：第一等级测试简单的短语、成语、词汇等；第二等级利用简单的句子测试机器翻译在简单文本上的表现；第三等级利用稍复杂的句子测试机器翻译在复杂语法结构上的表现；第四等级测试引入更加复杂的补语结构和附加语等等。
+\parinterval 除了对译文进行简单的打分，另一种经典的人工评价方法是{\small\sffamily\bfseries{相对排序}}\index{相对排序}（Relative Ranking，RR）\index{Relative Ranking}\upcite{DBLP:conf/wmt/Callison-BurchF07}。这种方法通过对不同机器翻译的译文质量进行相对排序得到最终的评价结果。举例来说：
+\parinterval （1）在每次评价过程中，若干个等待评价的机器翻译系统被分为5个一组，评价者被提供3个连续的源文片段和1组机器翻译系统的相应译文；
+\parinterval （2）评价者需要对本组的机器译文根据其质量进行排序，不过评价者并不需要一次性将5个译文排序，而是将其两两进行比较，判出胜负或是平局。在评价过程中，由于排序是两两一组进行的，为了评价的公平性，将采用排列组合的方式进行分组和比较，若共有$n$个机器翻译系统，则会被分为 $\mathrm{C}_n^5$组，组内每个系统都将与其他4个系统进行比较，由于需要针对3个源文片段进行评价对比，则意味着每个系统都需要被比较$\mathrm{C}_n^5 \times 4 \times 3$次；
+\parinterval （3）最终根据多次比较的结果，对所有参与评价的系统进行总体排名。对于如何获取合理的总体排序，有三种常见的策略：
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{根据某系统比其他系统更好的频率进行排序}}\upcite{DBLP:conf/wmt/Callison-BurchK12}。以系统${S}_j$和系统${S}_k$为例，两个系统都被比较了$\mathrm{C}_n^5 \times 4 \times 3$次，其中系统${S}_j$获胜20次，系统${S}_k$获胜30次，总体排名中系统${S}_k$优于系统${S}_j$。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{选取与成对比较中冲突最少的排序作为最终排序}}\upcite{DBLP:conf/wmt/Lopez12}。第一种排序策略中存在冲突现象：例如在每次两两比较中，系统${S}_j$胜过系统${S}_k$的次数比系统${S}_j$不敌系统${S}_k$的次数多，若待评价系统仅有系统${S}_j$、${S}_k$，显然系统${S}_j$的排名高于系统${S}_k$。但当待评价系统很多时，可能系统${S}_j$在所有比较中获胜的频率低于系统${S}_k$，此时就出现了总体排序与局部排序不一致的冲突。因此，有研究者提出，能够与局部排序冲突最少的总体排序才是最合理的。该方法中用公式\ref{eq:4-1}定义排名的``冲突'',其中${R}$是待评价的系统集合，${S}_j$、${S}_k$分别是成对比较的两个系统，$\mathrm{Count}_{win}({S}_j,{S}_k)$和$\mathrm{Count}_{loss}({S}_j,{S}_k)$分别是${S}_j$、${S}_k$进行成对比较时系统${S}_j$胜利和失败的频率。
+\begin{eqnarray}
+\mathrm{conflict} = \sum\limits_{{{S}_j} \in R,{{S}_k} \in R,j \ne k} {{\rm{max}}(0,\mathrm{Count}_{win}({{S}_j},{{S}_k}) - \mathrm{Count}_{loss}({{S}_j},{{S}_k}))}
+\label{eq:4-1}
+\end{eqnarray}
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{根据某系统最终获胜的期望进行排序}}\upcite{DBLP:conf/iwslt/Koehn12}。以系统${S}_j$为例，若共有$n$个待评价的系统，则进行总体排序时系统${S}_j$的得分为其最终获胜的期望\footnote{根据公式\ref{eq:4-2}可以看出，该策略去除了平局的影响。}，即：
+\begin{eqnarray}
+\mathrm{score}({{S}_j}) = \frac{1}{n}\sum\limits_{k,k \ne j} {\frac{\mathrm{Count}_{win}({{S}_j},{{S}_k})}{{\mathrm{Count}_{win}({{S}_j},{{S}_k}) + \mathrm{Count}_{loss}({{S}_j},{{S}_k})}}}
+\label{eq:4-2}
+\end{eqnarray}
+\vspace{0.5em}
+\end{itemize}
+\parinterval 与相对排序相比，直接评估方法虽然更加直观，但是过度依赖评价者的主观性，因而直接评估适用于直观反映某机器翻译系统性能，而不适合用来比较机器翻译系统之间的性能差距。在需要对大量系统进行快速人工评价时，找出不同译文质量之间的相对关系要比直接准确评估译文质量简单得多，基于排序的评价方法可以大大降低评价者的工作量，所以也被系统研发人员经常使用。
+\parinterval 在实际应用中，研究者可以根据实际情况选择不同的人工评价方案，人工评价也没有统一的标准。WMT 和CCMT 机器翻译评测都有配套的人工评价方案\upcite{DBLP:conf/wmt/BojarCFHHHKLMNP15}，可以作为业界的参考标准。
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
+\sectionnewpage
+\section{有参考答案的自动评价}\label{Automatic evaluation with reference answers}
+\parinterval 由于人工评价费时费力，同时具有一定的主观性，甚至不同人在不同时刻对同一篇文章的理解都会不同，为了克服这种限制，另一种思路是将人类专家翻译的结果看作是参考答案，将译文与答案的近似程度作为评价结果。即译文与答案越接近，评价结果越好；反之，评价结果较差。这种评价方式叫做自动评价。
+\parinterval 自动评价的方式虽然不如人工评价准确，但是具有速度快、成本低、一致性高的优点，因此自动评价也是机器翻译系统研发人员所青睐的方法。
+\parinterval 随着评价技术的不断发展，自动评价方式已经具有了比较好的指导性，可以帮助使用者快速了解当前机器翻译译文的质量。在机器翻译领域，自动评价已经成为了一个重要分支，被提出的自动评价方法不下几十种。这里无法对这些方法一一列举，为了便于后续章节的描述，这里仅对一些代表性的方法进行简要介绍。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{基于词串比对的方法}
+\parinterval 这种方法比较关注译文中的词汇及译文语序，其思想是将译文看成是符号序列，通过计算机器译文和参考答案间的序列相似性来评价机器翻译的质量。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{1.基于距离的方法}
+\parinterval 基于距离的自动评价方法的基本思想是根据将机器译文转化为参考答案所需要的最小编辑步骤数来衡量机器译文的质量，基于此类思想的自动评价方法主要有{\small\sffamily\bfseries{单词错误率}}\index{单词错误率}（Word Error Rate，WER）\index{Word Error Rate}\upcite{DBLP:conf/coling/SuWC92}、{\small\sffamily\bfseries{与位置无关的单词错误率}}\index{与位置无关的单词错误率}（Position-independent word Error Rate，PER）\index{Position-independent word Error Rate}\upcite{DBLP:conf/interspeech/TillmannVNZS97}和{\small\sffamily\bfseries{翻译错误率}}\index{翻译错误率}（Translation Error Rate，TER）\index{Translation Error Rate}\upcite{snover2006study}等，下面主要介绍TER方法以深入理解此类自动评价方法的思想。
+\parinterval 翻译错误率是一种典型的基于距离的评价方法，通过评定机器译文的译后编辑工作量来衡量机器译文质量。在这里``距离''被定义为将一个序列转换成另一个序列所需要的最少编辑操作次数，操作次数越多，距离越大，序列之间的相似性越低；相反距离越小，表示一个句子越容易改写成另一个句子，序列之间的相似性越高。TER 使用的编辑操作包括：增加、删除、替换和移位。其中增加、删除、替换操作计算得到的距离被称为编辑距离。TER根据错误率的形式给出评分：
+\begin{eqnarray}
+\mathrm{score}= \frac{\mathrm{edit}(c,r)}{l}
+\label{eq:4-3}
+\end{eqnarray}
+\parinterval 其中$\mathrm{edit}(c,r)$是指机器翻译生成的译文$c$和参考答案$r$之间的距离，$l$是归一化因子,通常为参考答案的长度。在距离计算中所有的操作的代价都为1。在计算距离时，优先考虑移位操作，再计算编辑距离（即增加、删除和替换操作的次数）。直到增加、移位操作无法减少编辑距离时，将编辑距离和移位操作的次数累加得到TER计算的距离。
+\begin{example}
+Candidate：cat is standing in the ground
+\qquad\ \ \ Reference：The cat is standing on the ground
+\label{eg:4-1}
+\end{example}
+\parinterval 将Candidate序列 转换为Reference序列，需要进行一次增加操作，在句首增加``The''；一次替换操作，将``in'' 替换为``on''。所以$\mathrm{edit}(c,r)$ = 2，归一化因子$l$为Reference的长度7，所以该机器译文的TER 错误率为2/7。
+\parinterval WER和PER与TER的基本思想相同，这三种方法的主要区别在于对``错误'' 的定义和考虑的操作类型略有不同。WER使用的编辑操作包括：增加、删除、替换，由于没有移位操作，当机器译文出现词序问题时，会发生多次替代，因而一般会低估译文质量；而PER只考虑增加和删除两个动作，在不考虑词序的情况下，PER计算两个句子中出现相同单词的次数，根据翻译句子比参考答案长或短，其余操作无非是插入词或删除词，这样往往会高估译文质量。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{2.基于$\bm{n}$元语法的方法}
+\parinterval BLEU是目前使用最广泛的自动评价指标。BLEU 是Bilingual Evaluation Understudy的缩写，最早由IBM 在2002 年提出\upcite{DBLP:conf/acl/PapineniRWZ02}。该方法通过采用$n$-gram匹配的方式评定机器翻译结果和参考答案之间的相似度，机器译文越接近参考答案就认定它的质量越高。$n$-gram是指$n$个连续单词组成的单元，称为{\small\sffamily\bfseries{$\bm{n}$元语法单元}}\index{$\bm{n}$元语法单元}（见{\chapterthree}）。$n$越大表示评价时考虑的匹配片段越大。
+\parinterval BLEU 的计算首先考虑待评价机器译文中$n$-gram在参考答案中的匹配率，称为{\small\sffamily\bfseries{$\bm{n}$-gram准确率}}\index{$\bm{n}$-gram准确率}（$n$-gram Precision）\index{$n$-gram Precision}。其计算方法如下：
+\begin{eqnarray}
+\mathrm P_{n} = \frac{{{\rm{Coun}}{{\rm{t}}_{{\rm{hit}}}}}}{{{\rm{Coun}}{{\rm{t}}_{{\rm{output}}}}}}
+\label{eq:4-4}
+\end{eqnarray}
+\parinterval 其中$\mathrm{Count_{hit}}$表示机器译文中$n$-gram在参考答案中命中的次数，$\mathrm{Count_{output}}$表示机器译文中总共有多少$n$-gram。为了避免同一个词被重复计算，BLEU的定义中使用了截断的方式定义$\mathrm{Count_{hit}}$和$\mathrm{Count_{output}}$。
+\parinterval 例如，假设机器译文为``the the the the''，参考答案为``The cat is standing on the ground''（匹配时不区分大小写）。在引入截断方式之前，该译文的1-gram准确率为4/4 = 1，这显然是不合理的。在引入截断的方式之后，``the'' 在译文中出现4 次，在参考答案中出现2 次，截断操作则是取二者的最小值，即$\mathrm{Count_{hit}}$= 2，$\mathrm{Count_{output}}$= 4，该译文的1-gram准确率为2/4。
+\parinterval 译文整体的准确率等于各$n$-gram的加权平均：
+\begin{eqnarray}
+{\rm P_{{\rm{avg}}}} = \exp (\sum\limits_{n = 1}^N {{w_n} \cdot {{{\mathop{\rm logP}\nolimits} }_n}} )
+\label{eq:4-5}
+\end{eqnarray}
+\parinterval 但是，该方法更倾向于对短句子打出更高的分数。一个极端的例子是译文只有很少的几个词，但是都命中答案，准确率很高可显然不是好的译文。因此，BLEU 引入{\small\sffamily\bfseries{短句惩罚因子}}\index{短句惩罚因子}（Brevity Penalty, BP）\index{Brevity Penalty}的概念，对短句进行惩罚:
+\begin{eqnarray}
+\rm BP = \left\{ \begin{array}{l}
+1\quad \quad \;\;c > r\\
+{\rm{exp}}(1 - \frac{r}{c})\quad c \le r
+\end{array} \right.
+\label{eq:4-6}
+\end{eqnarray}
+\parinterval 其中$c$表示机器译文的句子长度，$r$表示参考答案的句子长度。最终BLEU的计算公式为：
+\begin{eqnarray}
+\mathrm {BLEU} = \mathrm {BP} \cdot \mathrm {exp}(\sum\limits_{n = 1}^N {{w_n} \cdot {{{\mathop{\mathrm {log}}\nolimits} }\mathrm P_n}} )
+\label{eq:4-7}
+\end{eqnarray}
+\parinterval 实际上，BLEU的计算也是一种综合考虑{\small\sffamily\bfseries{准确率}}\index{准确率}（Precision）\index{Precision}和{\small\sffamily\bfseries{召回率}}\index{召回率}（Recall）\index{Recall}的方法。公式中，${\rm{exp}}( \cdot )$是一种准确率的表示。BP本是一种召回率的度量，它会惩罚过短的结果。这种设计同分类系统中评价指标F1值是有相通之处的\upcite{DBLP:conf/muc/Chinchor92}。
+\parinterval 从机器翻译的发展来看，BLEU 的意义在于它给系统研发人员提供了一种简单、高效、可重复的自动评价手段，在研发机器翻译系统时可以不需要依赖人工评价。同时，BLEU 也有很多创新之处，包括引入$n$-gram的匹配，截断计数和短句惩罚等等，包括NIST 等很多评价指标都是受到BLEU 的启发。此外，BLEU本身也有很多不同的实现方式，包括IBM-BLEU\upcite{DBLP:conf/acl/PapineniRWZ02}、NIST-BLEU\upcite{doddington2002automatic}、BLEU-SBP\upcite{DBLP:conf/emnlp/ChiangDCN08}、SacreBLEU等，使用不同实现方式得到的评价结果会有差异。因此在实际使用BLEU进行评价时需要确认其实现细节，以保证结果与相关工作评价要求相符。
+\parinterval 还需要注意的是，BLEU的评价结果与所使用的参考答案数量有很大相关性。如果参考答案数量多，$n$-gram匹配的几率变大，BLEU的结果也会偏高。同一个系统，在不同数量的参考答案下进行BLEU评价，结果相差10个点都十分正常。此外，考虑测试的同源性等因素，相似系统在不同测试条件下的BLEU结果差异可能会更大，这时可以采用人工评价的方式得到更准确的评价结果。
+\parinterval 虽然BLEU被广泛使用，但也并不完美，甚至经常被人诟病。比如，它需要依赖参考答案，而且评价结果有时与人工评价不一致，同时BLEU 评价只是单纯地从匹配度的角度思考翻译质量的好坏，并没有真正考虑句子的语义是否翻译正确。但是，毫无疑问，BLEU 仍然是机器翻译中最常用的评价方法。在没有找到更好的替代方案之前，BLEU 还是机器翻译研究所使用的标准评价指标。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{基于词对齐的方法}
+\parinterval 基于词对齐的方法，顾名思义就是根据参考答案中的单词与译文中的单词之间的对齐关系对机器翻译译文进行评价。词对齐的概念也被用于统计机器翻译的建模（\chapterfive），这里借用了相同的思想来度量机器译文与参考答案之间的匹配程度。在基于$n$-gram匹配的评价方法中（如BLEU），BP可以起到一些度量召回率的作用，但是这类方法并没有对召回率进行准确的定义。与其不同的是，基于词对齐的方法在机器译文和参考答案的单词之间建立一对一的对应关系，这种评价方法在引入准确率的同时还能显性引入召回率作为评价指标。
+\parinterval 在基于词对齐的自动评价方法中，一种典型的方法是Meteor。该方法通过计算精确的word-to-word匹配来度量一个译文的质量\upcite{DBLP:conf/acl/BanerjeeL05}，并且在``绝对''匹配之外，还引入了``波特词干匹配''和``同义词''匹配。在下面的内容中，将利用实例对Meteor方法进行介绍。
+\parinterval 在Meteor方法中，首先在机器译文与参考答案之间建立单词之间的对应关系，再根据其对应关系计算精确率和召回率。
+\parinterval （1）单词之间的对应关系在建立过程中主要涉及三个模型，在对齐过程中依次使用这三个模型进行匹配：\\\\\\
+\begin{example}
+Candidate：Can I have it like he ? 
+\qquad\  Reference：Can I eat this can like him ?
+\label{eg:4-2}
+\end{example}
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{``绝对''匹配模型}}\index{``绝对''匹配模型}（Exact Module）\index{Exact Module}。绝对匹配模型在建立单词对应关系时，要求机器译文端的单词与参考答案端的单词完全一致，并且在参考答案端至多有1个单词与机器译文端的单词对应，否则会将其视为多种对应情况。对于实例\ref{eg:4-2}，使用``绝对''匹配模型，共有两种匹配结果：
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+	\subfigure[``绝对''匹配词对齐-1]{\input{./Chapter4/Figures/absolute-match-word-alignment-1}}
+	\subfigure[``绝对''匹配词对齐-2]{\input{./Chapter4/Figures/absolute-match-word-alignment-2}}
+   \caption{``绝对''匹配模型}
+   \label{fig:4-3}
+\end{figure}
+%----------------------------------------------
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{``波特词干''匹配模型}}\index{``波特词干''匹配模型}（Porter Stem Module）\index{Porter Stem Module}。该模型在``绝对''匹配结果的基础上，对尚未对齐的单词进行基于词干的匹配，只需机器译文端单词与参考答案端单词的词干相同即可，如上文中的``do''和``did''。对于图\ref{fig:4-3}的结果，再使用``波特词干'' 匹配模型，结果如下：
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+	\input{./Chapter4/Figures/match-words-with-stem}
+    \caption{同词干匹配词对齐}
+    \label{fig:4-4}
+\end{figure}
+%----------------------------------------------
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{``同义词''匹配模型}}\index{``同义词''匹配模型}（WN Synonymy Module）\index{WN Synonymy Module}。该模型在前两个模型匹配结果的基础上，对尚未对齐的单词进行同义词的匹配，即基于WordNet词典匹配机器译文与参考答案中的同义词。如上例中的``eat''和``have''。
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+    \input{./Chapter4/Figures/synonym-matching-word-alignment}
+    \caption{同义词匹配词对齐}
+    \label{fig:4-5}
+\end{figure}
+%----------------------------------------------
+\vspace{0.5em}
+\end{itemize}
+\parinterval 经过上面的处理，可以得到若干对机器译文与参考答案的对齐关系，下一步需要从中确定一个拥有最大的子集的对齐关系（即机器译文中被对齐的单词个数最多的对齐关系）。但是在上例中的两种对齐关系子集基数相同，这种情况下，需要选择一个对齐关系中交叉现象出现最少的对齐关系。于是，最终的对齐关系如图\ref{fig:4-6}所示：
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+	\input{./Chapter4/Figures/determine-final-word-alignment}
+   	\caption{确定最终词对齐}
+  	 \label{fig:4-6}
+\end{figure}
+%----------------------------------------------
+\parinterval （2）在得到机器译文与参考答案的对齐关系后，需要基于对齐关系计算准确率和召回率。
+\parinterval 准确率：机器译文中命中单词与机器译文单词总数的比值。即：
+\begin{eqnarray}
+\mathrm P = \frac {\rm{Count}_{hit}}{\rm{Count}_{candidate}}
+\label{eq:4-8}
+\end{eqnarray}
+\parinterval 召回率：机器译文中命中单词个数与参考答案单词总数的比值。即：
+\begin{eqnarray}
+\mathrm R = \frac {\rm{Count}_{hit}}{\rm{Count}_{reference}}
+\label{eq:4-9}
+\end{eqnarray}
+\parinterval 接下来，计算机器译文的得分。利用{\small\sffamily\bfseries{调和均值}}\index{调和均值}（Harmonic-mean）\index{Harmonic-mean}将准确率和召回率结合起来，并加大召回率的重要性将其权重调大，例如将召回率的权重设置为9：
+\begin{eqnarray}
+\rm F_{mean} = \frac {\rm 10PR}{\rm{R+9P}}
+\label{eq:4-10}
+\end{eqnarray}
+\parinterval 在上文提到的评价指标中，无论是准确率、召回率还是$\rm F_{mean}$，都是基于单个词汇信息衡量译文质量，而忽略了语序问题。为了将语序问题纳入到评价内容中，Meteor会考虑更长的匹配：将机器译文按照最长匹配长度分块，并对``块数''较多的机器译文给予惩罚。例如上例中，机器译文被分为了三个``块''——``Can I have this''、``like he do''、``？''。在这种情况下，看起来上例中的准确率、召回率都还不错，但最终会受到很严重的惩罚。这种罚分机制能够识别出机器译文中的词序问题，因为当待测译文词序与参考答案相差较大时，机器译文将会被分割得比较零散。这种惩罚机制的计算公式如公式\ref{eq:4-11}所示，其中$\rm Count_{chunks}$表示匹配的块数。
+\begin{eqnarray}
+\rm Penalty = 0.5 \times {\left({\frac{\rm Count_{chunks}}{\rm Count_{hit}}} \right)^3}
+\label{eq:4-11}
+\end{eqnarray}
+\parinterval Meteor评价方法的最终评分为（其中$\rm Penalty$即公式\ref{eq:4-11}给出的罚分，区别于表示准确率的$\rm P$）：
+\begin{eqnarray}
+\rm score = {F_{mean}} \times (1 - Penalty)
+\label{eq:4-12}
+\end{eqnarray}
+\parinterval Meteor方法也是目前使用最广泛的自动评价方法之一，它的创新点之一在于引入了词干匹配和同义词匹配，扩大了词汇匹配的范围。Meteor方法被提出后，很多人尝试对其进行了改进，使其评价结果与人工评价结果更相近。例如Meteor-next在Meteor的基础上增加{\small\sffamily\bfseries{释义匹配器}}\index{释义匹配器}（Paraphrase Matcher）\index{Paraphrase Matcher}，利用该匹配器能够捕获机器译文中与参考答案意思相近的短语，从而在短语层面进行匹配。此外这种方法还引入了{\small\sffamily\bfseries{可调权值向量}}\index{可调权值向量}（Tunable Weight Vector）\index{Tunable Weight Vector}，用于调节每个匹配类型的相应贡献\upcite{DBLP:conf/wmt/DenkowskiL10}；Meteor 1.3在Meteor的基础上增加了改进的{\small\sffamily\bfseries{文本规范器}}\index{文本规范器}（Meteor Normalizer）\index{Meteor Normalizer}、更高精度的释义匹配以及区分内容词和功能词等指标，其中文本规范器能够根据一些规范化规则，将机器译文中意义等价的标点减少到通用的形式。而区分内容词和功能词则能够得到更为准确的词汇对应关系\upcite{DBLP:conf/wmt/DenkowskiL11}；Meteor Universal则通过机器学习方法学习不同语言的可调权值，在对低资源语言进行评价时可对其进行复用，从而实现对低资源语言的译文更准确的评价\upcite{DBLP:conf/wmt/DenkowskiL14}。
+\parinterval 由于召回率反映参考答案在何种程度上覆盖目标译文的全部内容，而Meteor在评价过程中显式引入召回率，所以Meteor的评价与人工评价更为接近。但Meteor方法需要借助同义词表、功能词表等外部数据，当外部数据中的目标词对应不正确或缺失相应的目标词时，评价水准就会降低。不仅如此，超参数的设置和使用，对于评分影响较大。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{基于检测点的方法}
+\parinterval 基于词串比对和词对齐的自动评价方法中提出的BLEU、TER 等评价指标可以对译文的整体质量进行评估，但是缺乏对具体问题的细致评价。很多情况下，研究人员需要知道系统是否能够处理特定类型的翻译问题，而不是得到一个笼统的评价结果。基于检测点的方法正是基于此想法\upcite{DBLP:journals/mt/Shiwen93}。基于检测点的评价的优点在于对机器翻译系统给出一个总体评价的同时针对系统在各个具体问题上的翻译能力进行评估，方便比较不同翻译模型的性能。这种方法也被多次用于机器翻译比赛的质量评测。
+\parinterval 基于检测点的评价根据事先定义好的语言学检测点对译文的相应部分进行打分。如下是几个英中翻译中的检测点实例：
+\begin{example}
+They got up at six this morning.
+\qquad\ \ \ 他们今天早晨六点钟起床。
+\qquad\ \ \ 检测点：时间词的顺序
+\label{eg:4-3}
+\end{example}
+\begin{example}
+There are nine cows on the farm.
+\qquad\ \ \ 农场里有九头牛。
+\qquad\ \ \ 检测点：量词``头''
+\label{eg:4-4}
+\end{example}
+\begin{example}
+His house is on the south bank of the river.
+\qquad\ \ \ 他的房子在河的南岸。
+\qquad\ \ \ We keep our money in a bank.
+\qquad\ \ \ 我们在一家银行存钱。
+\qquad\ \ \ 检测点：bank 的多义翻译
+\label{eg:4-5}
+\end{example}
+\parinterval 该方法的关键在于检测点的获取，有工作曾提出一种从平行双语句子中自动提取检测点的方法\upcite{DBLP:conf/coling/ZhouWLLZZ08}，借助大量的双语词对齐平行语料，利用自然语言处理工具对其进行词性标注、依存分析、成分分析等处理，利用预先构建的人工词典和人为定义的规则，识别语料中不同类别的检测点，从而构建检测点数据库。其中，将检测点分别设计为单词级（如介词、歧义词等）、短语级（如固定搭配）、句子级（特殊句型、复合句型等）三个层面，在对机器翻译系统进行评价时，在检测点数据库中分别选取不同类别检测点对应的测试数据进行测试，从而了解机器翻译系统在各种重要语言现象方面的翻译能力。除此之外，这种方法也能应用于机器翻译系统之间的性能比较中，通过为各个检测点分配合理的权重，用翻译系统在各个检测点得分的加权平均作为系统得分，从而对机器翻译系统的整体水平作出评价。
+\parinterval 基于检测点的评价方法的意义在于，它并不是简单给出一个分数，反而更像是一种诊断型评估方法，能够帮助系统研发人员定位系统问题。因此这类方法更多地使用在对机器翻译系统的翻译能力进行分析上，是对BLEU 等整体评价指标的一种很好的补充。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{多策略融合的评价方法}\label{Evaluation method of Multi Strategy fusion}
+\parinterval 前面介绍的几种自动评价方法中，大多是从某个单一的角度比对机器译文与参考答案之间的相似度，例如BLEU更关注$n$-gram是否命中、Meteor更关注机器译文与参考答案之间的词对齐信息、WER、PER与TER等方法只关注机器译文与参考译文之间的编辑距离，此外还有一些并不常见的自动评价方法比较关注机器译文和参考译文在语法、句法方面的相似度。但无一例外的是，每种自动评价的关注点都是单一的，无法对译文质量进行全面、综合的评价。为了克服这种限制，研究人员们提出了一些基于多策略融合的译文质量评估方法，以期提高自动评价方法与人工评价方法的结果一致性。
+\parinterval 基于策略融合的自动评价方法往往会将多个基于词汇、句法和语义的自动评价方法融合在内，其中比较核心的问题是如何将多个评价方法进行合理地组合。目前提出的方法中颇具代表性的是使用参数化方式和非参数化方式对多种自动评价方法进行筛选和组合。
+\parinterval 参数化组合方法的实现主要有两种方式：一种方式是广泛使用不同的译文质量评价作为特征，借助回归算法实现多种评价策略的融合\upcite{DBLP:conf/acl/AlbrechtH07a,DBLP:conf/acl/AlbrechtH07}；另一种方式则是对各种译文质量评价方法的结果进行加权求和，并借助机器学习算法更新内部的权重参数，从而实现多种评价策略的融合\upcite{DBLP:conf/naacl/LiuG07}。
+\parinterval 非参数化组合方法的思想与贪心算法异曲同工：将多个自动评价方法以与人工评价的相关度为标准进行降序排列，依次尝试将其加入最优策略集合中，如果能提高最优策略集合的``性能''，则将该自动评价方法加入最优策略集合中，否则不加入。其中最优策略集合的``性能''用QUEEN定义\upcite{DBLP:conf/ijcnlp/GimenezM08}。该方法是首次尝试使用非参数的组合方式将多种自动评价方法进行融合，也不可避免的存在一些瑕疵。一方面在评价最优策略集合性能时，对于一个源文需要至少三个参考答案；另一方面，这种``贪心''的组合策略很有可能会得到局部最优的组合。
+\parinterval 与单一的译文评价方法相比，多策略融合的评价方法能够对机器译文从多角度进行综合评价，这显然是一个模拟人工评价的过程，因而多策略融合的评价结果也与人工评价结果更加接近。但是对于不同的语言，多策略融合的评价方法需要不断调整最优策略集合或是调整组合方法内部的参数才能达到最佳的评价效果，这个过程势必要比单一的自动评价方法更繁琐些。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{译文多样性}
+\parinterval 在自然语言中，由于句子的灵活排序和大量同义词的存在，导致同一个源语言句子可能对应几百个合理的目标语言译文，甚至更多。然而上文提到的几种自动评价仅仅比较机器译文与有限数量的参考答案之间的差距，得出的评价结果往往会低估机器译文的质量。为了改变这种窘况，比较直观的想法是增大参考答案集或是直接比较机器译文与参考答案在词法、句法和语义等方面的差距。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{1.增大参考答案集}
+\parinterval BLEU、Meteor、TER等自动评价方法的结果往往与人工评价结果存在差距，其主要原因是这些自动评价方法往往通过直接比对机器译文与有限的参考答案之间的``外在差异''进行评价，由于参考答案集可覆盖的人类译文数量过少，当机器译文本来十分合理但却未被包含在参考答案集中时，就会将其质量过分低估。
+\parinterval HyTER是2012年被提出的一种自动评价方法，它致力于得到所有可能译文的紧凑编码，从而实现自动评价过程中访问所有合理译文\upcite{DBLP:conf/naacl/DreyerM12}。这种评价方法的原理非常简单直观：
+\begin{itemize}
+\vspace{0.5em}
+\item 通过注释工具标记出一个短语的所有备选含义（同义词）并存储在一起作为一个同义单元。可以认为每个同义单元表达了一个语义概念。在生成参考答案时，可以通过对某参考答案中的短语用同义单元进行替换生成一个新的参考答案。例如，将中文句子``对国足的支持率接近于0''翻译为英文，同义单元有以下几种：
+\noindent [THE-SUPPORT-RATE]：
+\parinterval <the level of approval; the approval level; the approval rate ; the support rate>
+\noindent [CLOSE-TO]：
+\parinterval <close to; about equal to; practically>
+\vspace{0.5em}
+\item 通过已有同义单元和附加单词的组合用于覆盖更大的语言片段。在生成参考答案时就是采用这种方式不断覆盖更大的语言片段，直到将所有可能的参考答案覆盖进去。例如可以将短语[THE-SUPPORT-RATE]与``national football team''组合为``[THE-SUPPORT-RATE] for national football team''。
+\vspace{0.5em}
+\item 利用同义单元的组合将所有合理的人类译文都编码出来。中文句子``对国足的支持率接近于0''翻译为英文，其可能的参考答案被编码成图\ref{fig:4-7}所示的形式。
+\vspace{0.5em}
+\end{itemize}
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+	\input{./Chapter4/Figures/representation-of-reference-answer-set-in-hyter}
+   \caption{HyTER中参考答案集的表示方式}
+   \label{fig:4-7}
+\end{figure}
+%----------------------------------------------
+\parinterval 从上面的例子中可以看出，HyTER方法通过构造同义单元的方式，可以列举出译文中每个片段的所有可能的表达方式，从而增大参考答案的数量，上例中的每一条路径都代表一个参考答案。但是这种对参考答案集的编码方式存在问题，同义单元之间的组合往往存在一定的限制关系\upcite{DBLP:conf/tsd/BojarMTZ13}，使用HyTER方法会导致参考答案集中包含有错误的参考答案。
+\begin{example}
+将中文``市政府批准了一项新规定''分别翻译为英语和捷克语，使用HyTER构造的参考答案集如下图所示：
+\label{eg:4-6}
+\end{example}
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+\subfigure[英语参考答案集表示]{\input{./Chapter4/Figures/representation-of-english-reference-answer-set}}
+\subfigure[捷克语参考答案集表示]{\input{./Chapter4/Figures/representation-of-czech-reference-answer-set}}
+   \caption{使用HyTER构造的参考答案集}
+   \label{fig:4-8}
+\end{figure}
+%----------------------------------------------
+\parinterval 但是在捷克语中主语``městská rada''或是``zastupitelstvo města''的性别必须由动词来反映，那么上述捷克语的参考答案集中有部分存在语法错误。为了避免此类现象的出现，研究人员在同义单元中加入了将同义单元组合在一起必须满足的限制条件\upcite{DBLP:conf/tsd/BojarMTZ13}，从而在增大参考答案集的同时确保了每个参考答案的准确性。
+\parinterval 将参考答案集扩大后，可以继续沿用BLEU或NIST等基于$n$元语法的方法进行自动评价，但是传统方法往往会忽略多重参考答案中的重复信息，于是对每个$n$元语法进行加权的自动评价方法被提出\upcite{DBLP:conf/eamt/QinS15}。该方法根据每个$n$元语法单元的长度、在参考答案集中出现的频率、被虚词（如``the''、``by''、``a''等）分开后的分散度等方面，确定其在计算最终分数时所占的权重。以BLEU方法为例，原分数计算方式如公式\ref{eq:4-13}所示：
+\begin{eqnarray}
+\mathrm {BLEU} &=& \mathrm {BP} \cdot {\rm{exp}}(\sum\limits_{n = 1}^N {{w_n} \cdot {{{\mathop{\rm log}\nolimits} }\mathrm{P}_n}} )
+\label{eq:4-13}\\
+\mathrm{BLEU} &=& \mathrm {BP} \cdot {\rm{exp}}(\sum\limits_{n = 1}^N {{w_n} \cdot \log (\mathrm{S}_n \times \mathrm{P}_n} ))
+\label{eq:4-14}\\
+\mathrm{S}_n &=& \mathrm{Ngram_{diver}} \times \log (n + \frac{M}{\rm{Count_{ref}}})
+\label{eq:4-15}
+\end{eqnarray}
+\parinterval 本方法分数的计算方法见公式\ref{eq:4-14}，其中$\mathrm{S}_n$是为某个$n$元语法单元分配的权重，计算方式见公式\ref{eq:4-15}，公式中$n$为语法单元的长度，$M$为参考答案集中出现该$n$元语法单元的参考答案数量，$\rm{Count_{ref}}$为参考答案集大小。$\mathrm{Ngram_{diver}}$为该$n$元语法单元的分散度，用$n$元语法单元种类数量与语法单元总数的比值计算。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{2.利用分布式表示进行质量评价}
+\parinterval 2003年，在自然语言处理的神经语言建模任务中引入了词嵌入技术，其思想是把每个单词映射为多维实数空间中的一个点（具体表现为一个实数向量），这种技术也被称作单词的分布式表示。在这项技术中，研究人员们发现单词之间的关系可以通过空间的几何性质进行刻画，意义相近的单词之间的欧氏距离也比较小。（单词分布式表示的具体内容，将在书的{\chapternine}详细介绍，在此不再赘述。）
+\parinterval 受词嵌入技术的启发，研究人员尝试借助参考答案和机器译文的分布式表示来进行译文质量评价，为译文质量评价提供了新思路。在自然语言的上下文中，表示是与每个单词、句子或文档相关联的数学对象。这个对象通常是一个向量，其中每个元素的值在某种程度上描述了相关单词、句子或文档的语义或句法属性。{\small\sffamily\bfseries{分布式表示评价度量}}\index{分布式表示评价度量}（Distributed Representations Evaluation Metrics，DREEM）\index{Distributed Representations Evaluation Metrics}将单词或句子的分布式表示映射到连续的低维空间，发现在该空间中，具有相似句法和语义属性的单词彼此接近\upcite{DBLP:journals/jmlr/BengioDVJ03,DBLP:conf/emnlp/SocherPHNM11,DBLP:conf/emnlp/SocherPWCMNP13}，证明了利用分布式表示实现译文质量评估的可行性。
+\parinterval 在该类方法中，分布式表示的选取是一个十分关键的问题，理想的情况下，分布式表示应该涵盖句子在词汇、句法、语法、语义、依存关系等各个方面的信息。目前常见的分布式表示方式如表\ref{tab:4-2}所示。除此之外，还可以通过词袋模型、循环神经网络、卷积神经网络、深层平均网络\upcite{iyyer-etal-2015-deep}、Quick-Thought模型\upcite{DBLP:conf/iclr/LogeswaranL18}等将词向量表示转换为句子向量表示。
+\begin{table}[htp]{
+\begin{center}
+\caption{常见的单词及句子分布表示}
+{
+\begin{tabular}{l|l}
+单词分布表示 & 句子分布表示 \\
+\hline
+\rule{0pt}{10pt} One-hot词向量 & RAE编码\upcite{DBLP:conf/emnlp/SocherPHNM11} \\
+\rule{0pt}{10pt} Word2Vec词向量\upcite{DBLP:journals/corr/abs-1301-3781} & Doc2Vec向量\upcite{DBLP:conf/icml/LeM14}  \\
+\rule{0pt}{10pt} Prob-fasttext词向量\upcite{DBLP:conf/acl/AthiwaratkunW17} & ELMO预训练句子表示\upcite{DBLP:conf/naacl/PetersNIGCLZ18} \\
+\rule{0pt}{10pt} GloVe词向量\upcite{DBLP:conf/emnlp/PenningtonSM14} & GPT句子表示\upcite{radford2018improving} \\
+\rule{0pt}{10pt} ELMO预训练词向量\upcite{DBLP:conf/naacl/PetersNIGCLZ18} & BERT预训练句子表示\upcite{DBLP:conf/naacl/DevlinCLT19} \\
+\rule{0pt}{10pt} BERT预训练词向量\upcite{DBLP:conf/naacl/DevlinCLT19} & Skip-thought向量\upcite{DBLP:conf/nips/KirosZSZUTF15} \\
+\end{tabular}
+\label{tab:4-2}
+}
+\end{center}
+}\end{table}
+\parinterval DREEM方法中选取了能够反映句子中使用的特定词汇的One-hot向量、能够反映词汇信息的词嵌入向量\upcite{DBLP:journals/jmlr/BengioDVJ03}、能够反映句子的合成语义信息的{\small\sffamily\bfseries{递归自动编码}}\index{递归自动编码}（Recursive Autoencoder Embedding, RAE）\index{Recursive Autoencoder Embedding}，这三种表示级联在一起，最终形成句子的向量表示。在得到机器译文和参考答案的上述分布式表示后，利用余弦相似度和长度惩罚对机器译文质量进行评价。机器译文$t$和参考答案$r$之间的相似度如公式\ref{eq:4-16}所示，其中${v_i}(t)$和${v_i}(r)$分别是机器译文和参考答案的向量表示中的第$i$个元素，$N$是向量表示的维度大小。
+\begin{eqnarray}
+\mathrm {cos}(t,r) = \frac{{\sum\limits_{i = 1}^N {{v_i}(t) \cdot {v_i}(r)} }}{{\sqrt {\sum\limits_{i = 1}^N {v_i^2(t)} } \sqrt {\sum\limits_{i = 1}^N {v_i^2(r)} } }}
+\label{eq:4-16}
+\end{eqnarray}
+\parinterval 在此基础上，DREEM方法还引入了长度惩罚项，对与参考答案长度相差太多的机器译文进行惩罚，长度惩罚项如公式\ref{eq:4-17}所示，其中${l_t}$和${l_r}$分别是机器译文和参考答案长度：
+\begin{eqnarray}
+\mathrm{BP} = \left\{ \begin{array}{l}
+\exp (1 - {{{l_r}} \mathord{\left/
+ {\vphantom {{{l_r}} {{l_t}}}} \right.
+ \kern-\nulldelimiterspace} {{l_t}}})\quad {l_t} < {l_r}\\
+\exp (1 - {{{l_t}} \mathord{\left/
+ {\vphantom {{{l_t}} {{l_r}}}} \right.
+ \kern-\nulldelimiterspace} {{l_r}}})\quad {l_t} \ge {l_r}
+\end{array} \right.
+\label{eq:4-17}
+\end{eqnarray}
+\parinterval 机器译文的最终得分如下，其中$\alpha$是一个需要手动设置的参数：
+\begin{eqnarray}
+\mathrm{score}(t,r) = \mathrm{cos}{^\alpha }(t,r) \times \mathrm{BP}
+\label{eq:4-18}
+\end{eqnarray}
+\parinterval 与传统自动评价方法中对机器译文与参考答案的外在的词汇或是$n$元语法单元进行比较不同，该方法观察到的不只是单词的多余、缺少、乱序等问题，还可以从句法、语义等更深层的内容对两者进行相似度对比。此方法在译文质量评价方面的成功，也鼓励了更多研究人员利用分布式表示方法进行译文质量评价。
+\parinterval 在DREEM方法取得成功后，基于词嵌入的词对齐自动评价方法被提出\upcite{DBLP:journals/corr/MatsuoKS17}，该方法先得到机器译文与参考答案的词对齐关系，再通过平均对齐关系$x_i$和$y_j$中两者的词嵌入相似度来计算机器译文与参考答案的相似度，具体见公式\ref{eq:4-19}，其中$x$是机器译文，$y$是参考答案，函数$\varphi(\cdot)$用来计算对齐关系$x_i$和$y_j$的相似度。
+\begin{eqnarray}
+\mathrm{ASS}(x,y) = \frac{1}{{\left| x \right|\left| y \right|}}\sum\limits_{i = 1}^{\left| x \right|} {\sum\limits_{j = 1}^{\left| y \right|} {\varphi ({x_i},{y_j})} }
+\label{eq:4-19}
+\end{eqnarray}
+\parinterval 此外，将分布式表示与相对排序融合也是一个很有趣的想法\upcite{DBLP:journals/csl/GuzmanJMN17}，在这个尝试中，研究人员利用分布式表示提取参考答案和多个机器译文中的句法信息和语义信息，利用神经网络模型对多个机器译文进行排序。
+\parinterval 在基于分布式表示的这类译文质量评价方法中，译文和参考答案的所有词汇信息和句法语义信息都被包含在句子的分布式表示中，克服了单一参考答案的限制。但是同时也带来了新的问题，一方面将句子转化成分布式表示使评价过程变得不太直观，另一方面该类评价方法的优劣与分布式表示的选取息息相关，为了获得与人工评价更相关的评价效果，分布式表示的选取和组合方式还需要进一步的研究。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{自动评价与人工评价的相关性}
+\parinterval {\small\sffamily\bfseries{相关性}}\index{相关性}（Correlation）\index{Correlation}是统计学中的概念，当两个变量之间存在密切的依赖或制约关系，但却无法确切地表示时，可以认为两个变量之间存在``相关关系''，并往往用``相关性''作为衡量关系密切程度的标准\upcite{pearson1920notes}。对于相关关系，虽然无法求解两个变量之间确定的函数关系，但是通过大量的观测数据，能够发现变量之间存在的统计规律性，而``相关性''也同样可以利用统计手段获取。
+\parinterval 在机器译文质量评价工作中，相比人工评价，有参考答案的自动评价具有效率高、成本低的优点，因而广受机器翻译系统研发人员青睐。在这种情况下，自动评价结果的可信度一般取决于它们与可靠的人工评价之间的相关性。随着越来越多有参考答案的自动评价方法的提出，``与人工评价之间的相关性''也被视为衡量一种新的自动评价方法是否可靠的标准。
+\parinterval 很多研究工作中都曾对BLEU、NIST等有参考答案的自动评价与人工评价的相关性进行研究和讨论，其中也有很多工作对``相关性''的统计过程作过比较详细的阐述。在``相关性''的统计过程中，一般是分别利用人工评价方法和某种有参考答案的自动评价方法对若干个机器翻译系统的输出进行等级评价\upcite{coughlin2003correlating}或是相对排序\upcite{popescu2003experiment}，从而对比两种评价手段的评价结果是否一致。该过程中的几个关键问题可能会对最终结果产生影响。
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{源语言句子的选择}}。由于机器翻译系统一般以单句作为翻译单元，因而评价过程中涉及的源语言句子是脱离上下文语境的单句\upcite{coughlin2003correlating}。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{人工评估结果的产生}}。人工评价过程中采用只提供标准高质量参考答案的单语评价方法，由多位评委对译文质量做出评价后进行平均作为最终的人工评价结果\upcite{coughlin2003correlating}。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{自动评价中参考答案的数量}}。在有参考答案的自动评价过程中，为了使评价结果更加准确，一般会设置多个参考答案。参考答案数量的设置会对自动评价与人工评价的相关性产生影响，也有很多工作对此进行了研究。例如人们发现有参考答案的自动评价方法在区分人类翻译和机器翻译时，设置4个参考答案的区分效果远远高于2个参考答案\upcite{culy2003limits}；也有人曾专注于研究怎样设置参考答案数量能够产生最高的相关性\upcite{finch2004using}。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{自动评价中参考答案的质量}}。从直觉上，自动评价中参考答案的质量一般会影响最终的评价结果，从而对相关性的计算产生影响。然而，有相关实验表明，只要参考答案的质量不是过分低劣，很多情况下自动评价都能得到相同的评价结果\upcite{DBLP:conf/coling/HamonM08}。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 目前在机器译文质量评价的领域中，有很多研究工作尝试比较各种有参考答案的自动评价方法（主要以BLEU、NIST等基于$n$元语法的方法为主）与人工评价方法的相关性。整体来看，这些方法与人工评价具有一定的相关性，自动评价结果能够较好地反映译文质量\upcite{coughlin2003correlating}\upcite{doddington2002automatic}。
+\parinterval 但是也有相关研究指出，不应该对有参考答案的自动评价方法过于乐观，而应该存谨慎态度，因为目前的自动评价方法对于流利度的评价并不可靠，同时参考答案的体裁和风格往往会对自动评价结果产生很大影响\upcite{culy2003limits}。同时，有研究者提出，在机器翻译研究过程中，忽略实际的示例翻译而仅仅通过BLEU等自动评价方式得分的提高来表明机器翻译质量的提高是不可取的，因为BLEU的提高并不足以反映翻译质量的真正提高，而在另一些情况下，为了实现翻译质量的显著提高，并不需要提高BLEU\upcite{callison2006re}。
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
+\sectionnewpage
+\section{无参考答案的自动评价}
+\parinterval 无参考答案自动评价在机器翻译领域又被称作{\small\sffamily\bfseries{质量评估}}\index{质量评估}（Quality Estimation，\\QE）\index{Quality Estimation，QE}。与传统的译文质量评价方法不同，质量评估旨在不参照标准译文的情况下，对机器翻译系统的输出在单词、短语、句子、文档等各个层次进行评价，于是在质量评估这个任务的基础上衍生出了单词级质量评估、短语级质量评估、句子级质量评估和文档级质量评估几种相关任务。
+\parinterval 人们对于无参考答案自动评价的需求大多来源于机器翻译的实际应用。例如，在机器翻译的译后编辑过程中，译员不仅仅希望了解机器翻译系统的整体翻译质量，还需要了解该系统在某个句子上的表现如何：该机器译文的质量是否很差？需要修改的内容有多少？是否值得进行后编辑？这时，译员更加关注系统在单个数据点上（比如一段话）的可信度而非系统在测试数据集上的平均质量。这时，太多的人工介入就无法保证使用机器翻译所带来的高效性，因此在机器翻译输出译文的同时，需要质量评估系统给出对译文质量的预估结果。这些需求也促使研究人员在质量评估问题上投入了更多的研究力量。包括WMT、CCMT等知名机器翻译评测中也都设置了相关任务，受到了业界的认可。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{质量评估任务}
+\parinterval 质量评估任务本质上是通过预测一个能够反映评价单元的质量标签，在各个层次上对译文进行质量评价。在上文中已经提到，质量评估任务通常被划分为单词级、短语级、句子级和文档级，在接下来的内容中，将对各个级别的任务进行更加详细的介绍。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{1.单词级质量评估}
+\parinterval 机器翻译系统在翻译某个句子时，会出现各种类型的错误，这些错误多是一些单词翻译问题，例如单词出现歧义、单词漏译、单词错译、词形转化错误等等。单词级质量评价以单词为评估单元，目的是确定译文句子中每个单词的所在位置是否存在翻译错误和单词漏译现象。
+\parinterval 单词级质量评估任务可以被定义为：参照源语言句子，以单词为评价单位，自动标记出机器译文中的错误。其中的``错误''包括单词错译、单词词形错误、单词漏译等。在单词级质量评估任务中，输入是机器译文和源语言句子，输出是一系列标签序列，即图\ref{fig:4-11}中的Source tags、MT tags、Gap tags，标签序列中的每个标签对应翻译中的每个单词（或其间隙），并表明该位置是否出现错误。
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+	\input{./Chapter4/Figures/schematic-diagram-of-word-level-quality-assessment-task}
+   \caption{单词级质量评估任务示意图}
+   \label{fig:4-11}
+\end{figure}
+%----------------------------------------------
+\parinterval 下面以实例\ref{eg:4-7}为例介绍该任务的具体内容，在实例\ref{eg:4-7}中加入后编辑结果是方便读者理解任务内容，实际上质量评估任务在预测质量标签时并不依赖后编辑结果：
+\begin{example}
+单词级质量评估任务
+源句：The Sharpen tool sharpens areas in an image.（英语）
+机器译文：Mit dem Scharfzeichner können Sie einzelne Bereiche in einem Bild 
+scharfzeichnen.（德语）
+后编辑结果：Der Schärfen-Werkezug Bereiche in einem Bild Schärfer erscheint.
+（德语）
+\label{eg:4-7}
+\end{example}
+\parinterval 单词级质量评估主要通过以下三类错误评价译文好坏：
+\begin{itemize}
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{找出译文中翻译错误的单词}}。单词级质量评估任务要求预测一个与译文等长的质量标签序列，该标签序列反映译文端的每个单词是否能够准确表达出其对应的源端单词的含义，若是可以，则标签为``OK''，反之则为``BAD''。图\ref{fig:4-11}中的连线表示单词之间的对齐关系，图\ref{fig:4-11}中的MT tags即为该过程中需要预测的质量标签序列。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{找出源文中导致翻译错误的单词}}。单词级质量评估任务还要求预测一个与源文等长的质量标签序列，该标签序列反映源文端的每个单词是否会导致本次翻译出现错误，若是不会，则标签为``OK''，反之则为``BAD''。图\ref{fig:4-11}中的Source tags即为该过程中的质量标签序列。在实际实现中，质量评估系统往往先预测译文端的质量标签序列，并根据源文与译文之间的对齐关系，推测源端的质量标签序列。
+\vspace{0.5em}
+\item {\small\sffamily\bfseries{找出在翻译句子时出现漏译现象的位置}}。单词级质量评估任务同时也要求预测一个能够捕捉到漏译现象的质量标签序列，在译文端单词的两侧位置进行预测，若某位置未出现漏译，则该位置的质量标签为``OK''，否则为``BAD''。图\ref{fig:4-11}中的Gap tags即为该过程中的质量标签序列。为了检测句子翻译中的漏译现象，需要在译文中标记缺口，即译文中的每个单词两边都各有一个``GAP''标记，如图\ref{fig:4-11}所示。
+\vspace{0.5em}
+\end{itemize}
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{2.短语级质量评估}
+\parinterval 短语级质量评估可以看做是单词级质量评估任务的扩展：机器翻译系统引发的错误往往都是相互关联的，解码过程中某个单词出错会导致更多的错误，特别是在其局部上下文当中，以单词的``局部上下文''为基本单元进行质量评估即为短语级质量评估。
+\parinterval 短语级质量评估与单词级质量评估类似，其目标是找出短语中翻译错误、短语内部语序问题及漏译问题。短语级质量评估任务可以被定义为：以若干个连续单词组成的短语为基本评估单位，参照源语言句子，自动标记出短语内部的错误以及短语之间是否存在漏译。其中的短语错误包括短语内部单词的错译和漏译、短语内部单词的语序错误，而漏译问题则特指短语之间的漏译错误。在短语级质量评估任务中，输入是机器译文和源语言句子，输出是一系列标签序列，即图\ref{fig:4-12}中的Phrase-target tags、Gap tags，标签序列中的每个标签对应翻译中的每个短语（或其间隙），并表明该位置是否出现错误。
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+	\input{./Chapter4/Figures/schematic-diagram-of-phrase-level-quality assessment-task}
+   \caption{短语级质量评估任务示意图}
+   \label{fig:4-12}
+\end{figure}
+%----------------------------------------------
+\parinterval 下面以实例\ref{eg:4-8}为例介绍该任务的具体内容：
+\begin{example}
+短语级质量评估任务
+源句：Bei Patienten mit || eingeschränkter Nierenfunktion kann || Insulinabbaus || 
+\hspace{3em}der Insulinbedarf || infolge des || verminderten || verringert sein . （德语）
+机器译文：In patients with || renal impairment , insulin requirements may || be || 
+\hspace{5em}diminished || due to || reduced || . （英语）
+\label{eg:4-8}
+\end{example}
+\parinterval 短语级质量评估任务主要通过以下两类错误评价译文好坏：
+\begin{itemize}
+\vspace{0.5em}
+\item 找出译文中翻译错误的短语。要求预测出一个能够捕捉短语内部单词翻译错误、单词漏译以及单词顺序错误的标签序列。该序列中每个标签都对应着一个短语，若是短语不存在任何错误，则标签为``OK''；若是短语内部存在单词翻译错误和单词漏译，则标签为``BAD''；若短语内部的单词顺序存在问题，则标签为``BAD\_word\_order''。图\ref{fig:4-12}中的连线表示单词之间的对齐关系，蓝色虚线框标出了每个短语的范围，图\ref{fig:4-12}中的Phrase-target tags即为该过程中需要预测的质量标签序列。
+\vspace{0.5em}
+\item 找出译文中短语之间漏译错误。短语级质量评估任务同时也要求预测一个能够捕捉到短语间的漏译现象的质量标签序列，在译文端短语的两侧位置进行预测，若某位置未出现漏译，则该位置的质量标签为``OK''，否则为``BAD\_omission''。图\ref{fig:4-12}中的Gap tags即为该过程中的质量标签序列。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 为了检测句子翻译中的漏译现象，参与者也被要求在译文中短语之间标记缺口，即译文中的每对短语之间都有两个``GAP''标记，一个在短语前面，一个在短语后面，与单词级类似。
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{3.句子级质量评估}
+\parinterval 迄今为止，质量评估的大部分工作都集中在句子层次的预测上，这是因为多数情况下机器翻译系统的处理都是逐句进行，系统用户也总是每次翻译一个句子或是以句子为单位组成的文本块（段落、文档等），因此以句子作为质量评估的基本单元是相当自然的。
+\parinterval 句子级质量评估的目标是生成能够反映译文句子整体质量的质量标签——可以是离散型的表示某种质量等级的标签，也可以是连续型的基于评分的标签。虽然以不同的标准进行评估，同一个译文句子的质量标签可能有所不同，但可以肯定的是句子的最终质量绝不是句子中单词质量的简单累加。因为与词级的质量评估相比，句子级质量评估也会关注是否保留源句的语义、译文的语义是否连贯、译文中的单词顺序是否合理等因素。
+\parinterval 句子级质量评估，顾名思义就是根据某种评价标准，通过建立模型来预测一个反映句子质量的标签。人们可以根据句子翻译的目的、后编辑的工作难度、是否达到发表要求或是是否能让非母语者读懂等各个角度、各个标准去设定句子级质量评估的标准。句子级质量评估任务的发展经历过下面几个阶段：
+\begin{itemize}
+\vspace{0.5em}
+\item 区分``人工翻译''和``机器翻译''。在最初的工作中，研究人员试图训练一个能够区分人工翻译和机器翻译的二分类器完成句子级的质量评估\upcite{gamon2005sentence}，将被分类器判断为``人工翻译''的机器译文视为优秀的译文，将被分类器判断为``机器翻译''的机器译文视为较差的译文。一方面，这种评估方式不够直观，另一方面，这种评估方式并不十分合理，因为通过人工比对发现很多被判定为``机器翻译''的译文具有与人们期望的人类翻译相同的质量水平。
+\vspace{0.5em}
+\item 预测反映译文句子质量的``质量标签''。此后，研究人员们试图使用人工为机器译文分配能够反映译文质量的标签\upcite{DBLP:conf/lrec/Quirk04}，例如``不可接受''``一定程度上可接受''``可接受''``理想''等，同时将获取机器译文的质量标签作为句子级质量评估的任务目标。
+\vspace{0.5em}
+\item 预测译文句子的相对排名。当相对排序（详见4.2节）的译文评价方法被引入后，给出机器译文的相对排名成为句子级质量评估的任务目标。
+\vspace{0.5em}
+\item 预测译文句子的后编辑工作量。在最近的研究中，句子级的质量评估一直在探索各种类型的离散或连续的后编辑标签。例如，通过测量以秒为单位的后编辑时间对译文句子进行评分；通过测量预测后编辑过程所需的击键数对译文句子进行评分；通过计算{\small\sffamily\bfseries{人工译后编辑距离}}\index{人工译后编辑距离}（Human-targeted Translation Edit Rate，HTER）\index{Human-targeted Translation Edit Rate，HTER}，即在后编辑过程中编辑（插入/删除/替换）数量与参考翻译长度的占比率对译文句子进行评分。HTER的计算公式为：
+\vspace{0.5em}
+\begin{eqnarray}
+\mathrm{HTER} = \frac{\mbox{编辑操作数目}}{\mbox{翻译后编辑结果长度}}
+\label{eq:4-20}
+\end{eqnarray}
+\parinterval 这种质量评估方式往往以单词级质量评估为基础，在其结果的基础上进行计算。以实例\ref{eg:4-7}中词级质量评估结果为例，与编辑后结果相比较，机器翻译译文中有四处漏译（``Mit''``können''``Sie''``einzelne''）、三处误译（``dem''``Scharfzeichner''\\``scharfzeichnen''分别被误译为``Der''``Schärfen-Werkezug''``Schärfer''）、一处多译（``erscheint''），因而需要进行4次插入操作、3次替换操作和1次删除操作，而最终译文长度为12，则有$\mathrm{HTER}=(4+3+1)/12=0.667$。需要注意的是，即便这种评估方式以单词级质量评估为基础，也不意味着句子级质量评估只是在单词级质量评估的结果上通过简单的计算来获得其得分，在实际研究中，常将其视为一个回归问题，利用大量数据学习其评分规则。
+\vspace{0.5em}
+\end{itemize}
+%----------------------------------------------------------------------------------------
+%    NEW SUBSUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsubsection{4.文档级质量评估}
+\parinterval 文档级质量评估的主要目的就是对机器翻译得到的译文文档进行打分。文档级质量评估中，``文档''这个术语很多时候并不单单指一整篇文档，而是指包含多个句子的文本，例如包含3到5个句子的段落或是像新闻文章一样的长文本。
+\parinterval 传统的机器翻译任务中，往往以一个句子作为输入和翻译的单元，而忽略了文档中句子之间的联系，这可能会使文档的论述要素受到影响，最终导致整个文档的语义不连贯。如实例\ref{eg:4-9}所示，在第二句中``he''原本指代第一句中的``housewife''，这里出现了错误，但这种错误在句子级的质量评估中并不能被发现。
+\begin{example}
+文档级质量评估任务
+Candidate： A {\red housewife} won the first prize in the supermarket's anniversary 
+\hspace{5em}celebration.
+Reference： A few days ago, {\red he} contacted the News Channel and said that the 
+\hspace{5em}supermarket owner refused to give {\red him} the prize.
+\label{eg:4-9}
+\end{example}
+\parinterval 在文档级质量评估中，有两种衡量文档译文的质量的方式：
+\begin{itemize}
+\vspace{0.5em}
+\item 阅读理解测试得分情况。以往衡量文档译文质量的主要方法是采用理解测试\upcite{DBLP:conf/icassp/JonesGSGHRW05}，即利用提前设计好的与文档相关的阅读理解题目（包括多项选择题类型和问答题类型）对母语为目标语言的多个测试者进行测试，将代表测试者在给定文档上的问卷中的所有问题所得到的分数作为质量标签。
+\vspace{0.5em}
+\item 两阶段后编辑工作量。 最近的研究工作中，多是采用对文档译文进行后编辑的工作量作为评价指标评估文档译文的质量，为了准确获取文档后编辑的工作量，两阶段后编辑方法被提出\upcite{DBLP:conf/eamt/ScartonZVGS15}，即第一阶段对文档中的句子单独在无语境情况下进行后编辑，第二阶段将所有句子重新合并成文档后再进行后编辑。两阶段中后编辑工作量的总和越多，意味着文档译文质量越差。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 在文档级质量评估任务中，需要对译文文档做一些更细粒度的注释，注释内容包括错误、错误类型和错误的严重程度，最终在注释的基础上对译文文档质量进行评估。
+\parinterval 文档级质量评估与更细粒度的词级和句子级的质量评价相比更加复杂、更加难以实现。其难点之一在于文档级的质量评估过程中需要根据一些主观的质量标准去对文档进行评分，例如在注释的过程中，对于错误的严重程度并没有严格的界限和规定，只能靠评测人员主观判断，这就意味着随着出现主观偏差的注释的增多，文档级质量评估的参考价值会大打折扣。另一方面，根据所有注释（错误、错误类型及其严重程度）对整个文档进行评分本身就具有不合理性，因为译文中有些在抛开上下文环境的情况下可以被判定为``翻译的不错的''单词和句子，一旦被放在文档中的语境后就可能变得不合理，而某些在无语境条件下看起来翻译得``糟糕透了''的单词和句子，一旦被放在文档中的语境中可能会变得恰到好处。此外，构建一个质量评测模型势必需要大量的标注数据，而文档级质量评测所需要的带有注释的数据的获取代价相当高。
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{怎样构建质量评估模型}
+\parinterval 不同于有参考答案的自动评价，质量评估任务中对译文质量进行评价需要更加``复杂''的计算方式。质量评估本质上是一个统计推断问题，即：如何根据以往得到的经验对从未见过的机器译文的质量做出预测。从这个角度说，质量评估和机器翻译问题一样，都需要设计模型进行求解，而不是像BLEU计算一样直接使用一两个指标性的公式就能得到结果。
+\parinterval 实际上，质量评估的灵感最初来源于语音识别中的置信度评价，所以最初研究人员也尝试通过翻译模型中的后验概率来直接评价翻译质量\upcite{DBLP:conf/interspeech/FetterDR96}，然而仅仅依靠概率值作为评价标准显然是远远不够的，其效果也让人大失所望。直到2003年，质量评估被定义为一个有监督的预测类机器学习问题，此后，``使用机器学习算法从许多通过特征（或学习表示）描述的句子翻译示例中归纳模型''成为了处理质量评估问题的基本思路。
+\parinterval 研究人员将质量评估模型的基本框架设计为两部分：
+\parinterval（1）特征提取模块：用于在数据中提取能够反映翻译结果``质量''的特征。
+\parinterval（2）质量评估模块：基于提取的特征，利用机器学习算法预测翻译结果``质量''。
+\parinterval 特征提取模块主要提取四个方面的特征：从源文中提取复杂度特征、从机器译文中提取流畅度特征、借助机器翻译系统提取翻译置信度特征、比照源文和机器译文提取充分度特征。
+\begin{itemize}
+\vspace{0.5em}
+\item 复杂度特征：反映了翻译一个源文的难易程度，翻译难度越大，译文质量低的可能性就越大。源文的形态句法信息最能反映源文的复杂度，例如源文的长度越长，源文往往越复杂；源文的句法树越宽、越深，源文往往越复杂，因为源句的句法树越深代表句子结构越复杂，源句的句法树越宽代表句子中各成分相互联系越多，正如图\ref{fig:4-13}所示；源文中定语从句的数量越多，源文往往更复杂。
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+	\subfigure[较浅较窄的句法树意味着较简单的句子结构]{\input{./Chapter4/Figures/a-shallow-and-narrow-grammar-means-a-simpler-sentence-structure}}
+	\subfigure[较深较宽的句法树意味着更复杂的句子结构]{\input{./Chapter4/Figures/A-deeper-and-wider-grammar-tree-means-more-complex-sentence-structures}}
+   \caption{句法树隐含着复杂度特征}
+   \label{fig:4-13}
+\end{figure}
+%----------------------------------------------
+\vspace{0.5em}
+\item 流畅度特征：反映了译文的自然度、流畅度、语法合理程度。为了衡量译文的流畅度，往往需要借助大型目标语言语料库、语言模型和语法检查工具等。例如借助大型目标语料库和统计语言模型获取的译文3-gram语言模型概率、利用语法检查工具获取的译文语法正确性等等，这些数学性指标均可用来衡量译文的流畅度。
+\vspace{0.5em}
+\item 置信度特征：反映了机器翻译系统对输出的译文的置信程度。翻译系统解码过程中对应的译文的全局概率、最终$n$-best清单中翻译假设的数量、译文中的词语在$n$-best输出中的出现频率等指标都可以作为机器翻译提供的置信度特征用于质量评估。
+\item 充分度特征：反映了源文和机器译文在不同语言层次上的密切程度或关联程度。比较常用的充分度特征包括源文和译文的长度比、源文和译文的词对齐信息、源文和译文表层结构（例如括号、数字、标点符号等）数量的绝对差异、源文和译文句法树的深度和宽度差异、源文和译文中命名实体数量的差异、源文和译文之间$n$元语法单元的匹配比例，此外，还可以用源文和译文的分布式表示衡量其间的相似性。由于源文和译文之间语言的不同，充分度特征是最难可靠提取的特征类型。
+\vspace{0.5em}
+\end{itemize}
+\parinterval 随着深度学习技术的发展，目前比较常用的特征提取手段还包括利用神经网络模型自动提取质量特征，但由于这种方法的可解释性比较差，研究人员无法对该方法提取到的质量特征类型进行判断。
+\parinterval 提取到质量特征之后，使用质量评估模块对译文质量进行预测。质量评估模型通常由分类算法或回归算法实现：
+\begin{itemize}
+\vspace{0.5em}
+\item 句子级和文档级质量评估多由回归算法实现。由于在句子级和文档级的质量评估中，标签是使用连续数字表示的，因此回归算法是最合适的选择，其中最常用的算法有朴素贝叶斯、线性回归、支持向量机、脊回归、偏最小二乘法、随机森林算法等。
+\vspace{0.5em}
+\item 单词级和短语级质量评估多由分类算法实现。对于单词级质量评估任务中标记``OK''或``BAD''，这对应了经典的二分类问题，因此经常使用分类算法对其进行预测，自动分类算法在{\chapterthree}已经涉及，质量评估中直接使用成熟的分类器即可。
+\vspace{0.5em}
+\end{itemize}
+%----------------------------------------------------------------------------------------
+%    NEW SUB-SECTION
+%----------------------------------------------------------------------------------------
+\subsection{质量评估的应用场景}
+\parinterval 质量评估在过去十年中越来越受欢迎，这个现象在一些机器翻译领域之外的人看来非常令人费解，无论是人工评价方法还是自动评价方法都能够对机器译文质量进行评价从而衡量机器翻译系统的整体性能，无参考答案评价方法看起来似乎是没有存在的必要，但事实却并非如此。
+\parinterval 传统的有参考答案的评价方式可以通过计算机器译文与参考答案之间的相似性用来评测机器翻译系统的整体性能，从而加快系统研发进程。与其相比，无需参考答案的质量评估是在无参考答案的情况下，直接对机器译文质量作出预测，这个课题的提出，其目的并不在于其``评价''功能而在于其``预测''功能。
+\parinterval 大多数情况下参考答案的获取具有很大难度，因而质量评估比传统的有参考答案的自动评价方法更接近生产生活的实际情况，更适合被应用到生产实践和实时机器翻译场景中，为社会创造更多的实用价值和商业价值。下面将列举几个质量评估合理的应用场景：
+\begin{itemize}
+\vspace{0.5em}
+\item 判断人工后编辑的工作量，辅助人工后编辑过程。人工后编辑工作中有两个不可避免的问题：待编辑的机器译文是否值得改、待编辑的机器译文需要修改哪里。对于一些质量较差的机器译文来说，人工重译远远比修改译文的效率高，后编辑人员可以借助质量评估模型提供的句子分数或是编辑距离这两个指标筛选出值得进行后编辑的机器译文，另一方面，质量评估模型可以为每条机器译文提供{错误内容、错误类型、错误严重程度}的注释，这些内容将帮助后编辑人员准确定位到需要修改的位置，同时在一定程度上提示后编辑人员采取何种修改策略，势必能大大减少后编辑的工作内容。
+\vspace{0.5em}
+\item 自动识别翻译错误，助力机器翻译后编辑工作完全自动化。质量评估模型和{\small\sffamily\bfseries{自动后编辑模型}}\index{自动后编辑模型}（Automatic Post-Editing，APE）\index{Automatic Post-Editing，APE}协同工作能够实现后编辑工作的自动化和流水化，提高日常工作效率，创造更多的经济效益：将机器翻译系统输出作为质量评估模型的输入，质量评估模型能够自动识别出机器译文中不准确、不流畅的现象，完成三方面内容：锁定错误出现的位置、识别错误类型、描述错误的严重程度。此后，自动后编辑模型将根据质量评估模型提供的错误提示，自动对译文中的错误进行修改，并生成$n$个最优的后编辑译文。质量评估模型此后将充当``评委''的角色，在最优后编辑译文列表中筛选出后编辑工作的最终输出。比较遗憾的是，目前自动后编辑模型的输出结果与人工后编辑结果相比仍存在一定的差距，在一些对译文准确性要求较高的场合仍需要人工后编辑的参与，但相信随着质量评估技术和自动后编辑技术的发展，后编辑工作的完全自动化在不远的将来必然可以实现。
+\vspace{0.5em}
+\item 多语言场景下，参与人机协同过程实现无语言障碍交流。在跨国性质的服务网站和社交网站这种典型的多语言场景下，质量评估模型将鼓励用户使用非母语语言进行交流。例如在某社交网站上，当一名英国用户尝试用自己并不熟练的德语对某个德国用户的言论发表评价时，质量评估模型可以提示该用户评论内容中存在的用词、语法等问题。或者该用户选择借助机器翻译系统将英文评论内容翻译为德文，质量评估模型可以对翻译内容进行评分，由用户根据评分的高低决定使用机器翻译系统输出的德文进行评论还是使用原本的英文进行评论；例如在某国际酒店的预定网站上，酒店经营者希望使用机器翻译系统将某些服务评价内容译为多种语言供顾客参考，使用质量评估模型后，可以筛选出更加准确流畅的评价译文对顾客进行公示。在大型国际会议现场，{\small\sffamily\bfseries{自动语音识别系统}}\index{自动语音识别系统}（Automatic Speech Recognition，ASR）\index{Automatic Speech Recognition，ASR}、{\small\sffamily\bfseries{机器翻译系统}}\index{机器翻译系统}（Machine Translation，MT）\index{Machine Translation，MT}和质量评估工具的相互配合，有望在未来取代随身翻译和国际会议现场的同声传译专业人员：ASR系统给出较为准确的语音识别结果，由几个高性能的MT系统对其进行翻译，产生若干翻译结果，使用质量评估工具对翻译结果进行质量评估，评分最高的译文作为整体输出。比较遗憾的是，目前语音识别和机器翻译的发展水平都并未达到会议级别的要求，所以以机器代替专业人员还需要很长一段时间。
+\end{itemize}
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
 %----------------------------------------------------------------------------------------
-\section{}
+\sectionnewpage
+\section{小结及深入阅读}
+\parinterval 译文的质量评价是机器翻译研究中不可或缺的环节。与其他任务不同，由于自然语言高度的歧义性和表达方式的多样性，机器翻译的参考答案本身就不唯一。此外，对译文准确、全面的评价准则很难制定，导致译文质量的自动评价变得异常艰难，因此其也成为了广受关注的研究课题。本章系统阐述了译文质量评估的研究现状和主要挑战。从人类参与程度和标注类型两个角度对译文质量评价中的经典方法进行介绍，其中对广受学界关注的无参考译文的质量评估问题从方法、模型、应用各个角度进行着重介绍，力求读者对领域内的热点内容有更加全面的了解。比较遗憾的是，由于篇幅限制笔者无法对译文评价的相关工作讲述得面面俱到，除了章节中的内容，还有很多研究问题值得关注：
+\begin{itemize}
+\vspace{0.5em}
+\item 基于句法和语义的机器译文质量自动评价方法。本章内容中介绍的自动评价多是基于表面字符串形式判定机器翻译结果和参考译文之间的相似度，而忽略了更抽象的语言层次的信息。基于句法和语义的机器译文质量自动评价方法在评价度量标准中加入能反映句法信息\upcite{DBLP:conf/acl/LiuG05}和语义信息\upcite{DBLP:conf/wmt/GimenezM07a}的相关内容，通过比较机器译文与参考答案之间的句法相似度和语义等价性\upcite{DBLP:journals/mt/PadoCGJM09}，能够大大提高自动评价与人工评价之间的相关性。其中句法信息往往能够对机器译文流利度方面的评价起到促进作用\upcite{DBLP:conf/acl/LiuG05}，常见的句法信息包括语法成分\upcite{DBLP:conf/acl/LiuG05}、依赖关系\upcite{DBLP:conf/ssst/OwczarzakGW07}\upcite{DBLP:conf/wmt/OwczarzakGW07}\upcite{DBLP:conf/coling/YuWXJLL14}、句法结构\upcite{DBLP:conf/wmt/PopovicN09}等。语义信息则对机器翻译的充分性评价更有帮助\upcite{DBLP:conf/acl/BanchsL11}\upcite{reeder2006measuring}，近年来也有很多用于机器译文质量评估的语义框架被提出，如AM-FM\upcite{DBLP:conf/acl/BanchsL11}、XMEANT\upcite{DBLP:conf/acl/LoBSW14}等。
+\vspace{0.5em}
+\item 对机器译文中的错误分析和错误分类。无论是人工评价还是自动评价手段，其评价结果只能反映机器翻译系统性能，而无法确切表明机器翻译系统的强项或弱点是什么、系统最常犯什么类型的错误、一个特定的修改是否改善了系统的某一方面、排名较差的系统是否在任何方面都优于排名较好的系统等等。对机器译文进行错误分析和错误分类有助于找出机器翻译系统中存在的主要问题，以便集中精力进行研究改进\upcite{DBLP:conf/lrec/VilarXDN06}。相关的研究工作中，一些致力于错误分类方法的设计，如手动的机器译文错误分类框架\upcite{DBLP:conf/lrec/VilarXDN06}、自动的机器译文错误分类框架\upcite{popovic2011human}、基于语言学的错误分类方法\upcite{DBLP:journals/mt/CostaLLCC15}以及目前被用作篇章级质量评估注释标准的MQM错误分类框架\upcite{lommel2014using}；其他的研究工作则致力于对机器译文进行错误分析，如引入形态句法信息的自动错误分析框架\upcite{DBLP:conf/wmt/PopovicGGLNMFB06}、引入词错误率(WER)和位置无关词错误率(PER)的错误分析框架\upcite{DBLP:conf/wmt/PopovicN07}、基于检索的错误分析工具tSEARCH\upcite{DBLP:conf/acl/GonzalezMM13}等等。
+\vspace{0.5em}
+\item 译文质量的多角度评价。章节内主要介绍的几种经典方法如BLEU、TER、METEOR等，大都是从某个单一的角度计算机器译文和参考答案的相似性，如何对译文从多个角度进行综合评价是需要进一步思考的问题，\ref{Evaluation method of Multi Strategy fusion}节中介绍的多策略融合评价方法就可以看作是一种多角度评价方法，其思想是将各种评价方法下的译文得分通过某种方式进行组合，从而实现对译文的综合评价。译文质量多角度评价的另一种思路则是直接将BLEU、TER、Meteor等多种指标看做是某种特征，使用分类、回归、排序等机器学习手段形成一种综合度量。此外，也有相关工作专注于多等级的译文质量评价，使用聚类算法大致将译文按其质量分为不同等级，并对不同质量等级的译文按照不同权重组合几种不同的评价方法。
+\vspace{0.5em}
+\item 不同评价方法的应用场景有明显不同：人工评价主要用于需要对机器翻译系统进行准确的评估的场合。例如，在系统对比中利用人工评价方法对不同系统进行人工评价、给出最终排名，或上线机器翻译服务时对翻译品质进行详细的测试；有参考答案的自动评价则可以为机器翻译系统提供快速、相对可靠的评价。在机器翻译系统的快速研发过程中，一般都使用有参考答案的自动评价方法对最终模型的性能进行评估。有相关研究工作专注在机器翻译模型的训练过程中充分利用评价信息进行参数调优（如BLEU分数），其中比较有代表性的工作包括最小错误率训练\upcite{DBLP:conf/acl/Och03}、最小风险训练等\upcite{DBLP:conf/acl/ShenCHHWSL16}。这部分内容可以参考{\chapterseven}和{\chapterthirteen}进行进一步阅读；无参考答案的质量评估主要用来对译文质量做出预测，经常被应用在一些无法提供参考译文的实时翻译场景中，例如人机交互过程、自动纠错、后编辑等\upcite{DBLP:conf/wmt/FreitagCR19}。
+\vspace{0.5em}
+\item 质量评估领域比较值得关注的一个研究问题是如何使模型更加鲁棒，因为通常情况下，一个质量评估模型会受语种、评价等级等问题的约束，设计一个能应用于任何语种，同时从单词、短语、句子等各个等级对译文质量进行评估的模型是很有难度的。Bicici等人最先关注质量评估的鲁棒性问题，并设计开发了一种与语言无关的机器翻译性能预测器\upcite{DBLP:journals/mt/BiciciGG13}，此后又在该工作的基础上研究如何利用外在的、与语言无关的特征对译文进行句子级别的质量评估\upcite{DBLP:conf/wmt/BiciciW14}，该项研究的最终成果是一个与语言无关，可以从各个等级对译文质量进行评估的模型——RTMs（Referential Translation Machines）\upcite{DBLP:conf/wmt/BiciciLW15a}。
+\vspace{0.5em}
+\end{itemize}
\ No newline at end of file
--- a/Chapter6/Figures/figure-zh-en-sentence-alignment.tex
+++ b/Chapter6/Figures/figure-zh-en-sentence-alignment.tex
@@ -22,11 +22,11 @@
 \end{scope}
-\draw [-,thick,ublue,dashed] (s1.south) -- (t1.north);
+\draw [-,thick,dashed] (s1.south) -- (t1.north);
-\draw [-,thick,ublue,dashed] (s4.south) -- ([yshift=0.3em]t2.north);
+\draw [-,thick,dashed] (s4.south) -- ([yshift=0.3em]t2.north);
-\draw [-,thick,ublue,dashed] (s2.south) ..controls +(south:1em) and +(north:1em).. (t4.north);
+\draw [-,thick,dashed] (s2.south) ..controls +(south:1em) and +(north:1em).. (t4.north);
-\draw [-,thick,ublue,dashed] (s3.south) ..controls +(south:0.5em) and +(north:1.5em).. (t5.north);
+\draw [-,thick,dashed] (s3.south) ..controls +(south:0.5em) and +(north:1.5em).. (t5.north);
-\draw [-,thick,ublue,dashed] (s5.south) -- (t3.north);
+\draw [-,thick,dashed] (s5.south) -- (t3.north);
 \end{tikzpicture}

--- a/Chapter6/chapter6.tex
+++ b/Chapter6/chapter6.tex
@@ -23,7 +23,7 @@
 \chapter{基于扭曲度和繁衍率的模型}
-第五章展示了一种简单的基于单词的翻译模型。这种模型的形式非常简单，而且其隐含的词对齐信息具有较好的可解释性。不过，语言翻译的复杂性远远超出人们想象。这里仍然面临两方面挑战\ \dash\ 如何对`` 调序''问题进行建模以及如何对``一对多翻译''问题进行建模。调序是翻译问题中所特有的现象，比如，汉语到日语的翻译中，需要对谓词进行调序。另一方面，一个单词在另一种语言中可能会被翻译为多个连续的词，比如，汉语`` 联合国''翻译到英语会对应三个单词``The United Nations''。这种现象也被称作一对多翻译。
+{\chapterfive}展示了一种基于单词的翻译模型。这种模型的形式非常简单，而且其隐含的词对齐信息具有较好的可解释性。不过，语言翻译的复杂性远远超出人们的想象。有两方面挑战\ \dash\ 如何对``调序''问题进行建模以及如何对``一对多翻译''问题进行建模。调序是翻译问题中所特有的现象，比如，汉语到日语的翻译中，需要对谓词进行调序。另一方面，一个单词在另一种语言中可能会被翻译为多个连续的词，比如，汉语``联合国''翻译到英语会对应三个单词``The United Nations''。这种现象也被称作一对多翻译，它与句子长度预测有着密切的联系。
 无论是调序还是一对多翻译，简单的翻译模型（如IBM模型1）都无法对其进行很好的处理。因此，需要考虑对这两个问题单独进行建模。本章将会对机器翻译中两个常用的概念进行介绍\ \dash\ 扭曲度（Distortion）和繁衍率（Fertility）。它们可以被看做是对调序和一对多翻译现象的一种统计描述。基于此，本章会进一步介绍基于扭曲度和繁衍率的翻译模型，建立相对完整的基于单词的统计建模体系。相关的技术和概念在后续章节也会被进一步应用。
@@ -34,7 +34,7 @@
 \sectionnewpage
 \section{基于扭曲度的翻译模型}
-下面将介绍扭曲度在机器翻译中的定义及使用方法。这也带来了两个新的翻译模型\ \dash\ IBM模型2和HMM翻译模型。
+下面将介绍扭曲度在机器翻译中的定义及使用方法。这也带来了两个新的翻译模型\ \dash\ IBM模型2\cite{Peter1993The}和HMM翻译模型\cite{vogel1996hmm}。
 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
@@ -51,9 +51,9 @@
 \end{figure}
 %----------------------------------------------
-\parinterval 既然调序时翻译中的基本现象，机器翻译自然就需要一种方式对其进行描述。其中，最常见的是基于“调序距离”的方法。这里，可以假设完全进行顺序翻译时，调序的“代价”是最低的。当调序出现时，可以用调序相对于顺序翻译产生的位置偏移来度量调序的程度，也被称为“调序距离”。图\ref{fig:6-2}展示了翻译时两种语言中词的对齐矩阵。比如，在图\ref{fig:6-2}(a)中，系统需要跳过“对”和“你”来翻译“感到”和“满意”，之后再回过头翻译“对”和“你”，这样就完成了对单词的调序。这时可以简单的把调序时需要跳过的单词数看作一种距离。
+\parinterval 在对调序问题进行建模的方法中，最基本的是使用调序距离方法。这里，可以假设完全进行顺序翻译时，调序的“代价”是最低的。当调序出现时，可以用调序相对于顺序翻译产生的位置偏移来度量调序的程度，也被称为调序距离。图\ref{fig:6-2}展示了翻译时两种语言中词的对齐矩阵。比如，在图\ref{fig:6-2}(a)中，系统需要跳过“对”和“你”来翻译“感到”和“满意”，之后再回过头翻译“对”和“你”，这样就完成了对单词的调序。这时可以简单的把需要跳过的单词数看作一种距离。
-\parinterval 可以看到，调序距离实际上是在度量译文词序相对于源文词序的一种扭曲程度。因此，也常常把这种调序距离称作{\small\sffamily\bfseries{扭曲度}}（Distortion）。调序距离越大对应的扭曲度也越大。比如，可以明显看出图\ref{fig:6-2}（b）中调序的扭曲度要比图\ref{fig:6-2}（a）中调序的扭曲度大，因此\ref{fig:6-2}（b）实例的调序代价也更大。
+\parinterval 可以看到，调序距离实际上是在度量目标语言词序相对于源语言词序的一种扭曲程度。因此，也常常把这种调序距离称作{\small\sffamily\bfseries{扭曲度}}（Distortion）。调序距离越大对应的扭曲度也越大。比如，可以明显看出图\ref{fig:6-2}（b）中调序的扭曲度要比图\ref{fig:6-2}（a）中调序的扭曲度大，因此\ref{fig:6-2}（b）实例的调序代价也更大。
 \parinterval 在机器翻译中使用扭曲度进行翻译建模是一种十分自然的想法。接下来，会介绍两个基于扭曲度的翻译模型，分别是IBM模型2和隐马尔可夫模型。不同于IBM模型1，它们利用了单词的位置信息定义了扭曲度，并将扭曲度融入翻译模型中，使得对翻译问题的建模更加合理。
@@ -71,46 +71,45 @@
 %----------------------------------------------------------------------------------------
 \subsection{IBM模型2}
-\parinterval IBM模型1很好地化简了问题，但是由于使用了很强的假设，导致模型和实际情况有较大差异。其中一个比较严重的问题是假设词对齐的生成概率服从均匀分布。图\ref{fig:6-3}展示了一个简单的实例。尽管译文$\mathbf{t}$比$\mathbf{t}'$的质量更好，但对于IBM模型1来说它们对应的翻译概率相同。这是因为当词对齐服从均匀分布时，模型会忽略目标语言单词的位置信息。因此当单词翻译相同但顺序不同时，翻译概率一样。同时，由于源语言单词是由错误位置的目标语单词生成的，不合理的对齐也会导致不合理的词汇翻译概率。
+\parinterval 对于建模来说，IBM模型1很好地化简了翻译问题，但是由于使用了很强的假设，导致模型和实际情况有较大差异。其中一个比较严重的问题是假设词对齐的生成概率服从均匀分布。IBM模型2抛弃了这个假设\cite{Peter1993The}。它认为词对齐是有倾向性的，它与源语言单词的位置和目标语言单词的位置有关。具体来说，对齐位置$a_j$的生成概率与位置$j$、源语言句子长度$m$和目标语言句子长度$l$有关，形式化表述为：
-%----------------------------------------------
-\begin{figure}[htp]
-    \centering
-\input{./Chapter6/Figures/figure-different-translation-result-in-different-score-ibm1}
-    \caption{不同的译文导致不同IBM模型1得分的情况}
-    \label{fig:6-3}
-\end{figure}
-%----------------------------------------------
-\parinterval 因此，IBM模型2抛弃了对$\textrm{P}(a_j|a_1^{j-1},s_1^{j-1},m,\mathbf{t})$服从均匀分布的假设。IBM模型2认为词对齐是有倾向性的，它要与源语单词的位置和目标语单词的位置有关。具体来说，对齐位置$a_j$的生成概率与位置$j$、源语句子长度$m$和译文长度$l$有关，形式化表述为：
 \begin{eqnarray}
 \textrm{P}(a_j|a_1^{j-1},s_1^{j-1},m,\mathbf{t}) \equiv a(a_j|j,m,l)
 \label{eq:6-1}
 \end{eqnarray}
+\parinterval 这里还用{\chapterthree}中的例子（图\ref{fig:6-3}）来进行说明。在IBM模型1中，``桌子''对齐到目标语言四个位置的概率是一样的。但在IBM模型2中，``桌子''对齐到``table''被形式化为$a(a_j |j,m,l)=a(3|2,3,3)$，意思是对于源语言位置2（$j=2$）的词，如果它的源语言和目标语言都是3个词（$l=3,m=3$），对齐到目标语言位置3（$a_j=3$）的概率是多少？因为$a(a_j|j,m,l)$也是模型需要学习的参数，因此``桌子''对齐到不同目标语言单词的概率也是不一样的。理想的情况下，通过$a(a_j|j,m,l)$，``桌子''对齐到``table''应该得到更高的概率。
 %----------------------------------------------
 \begin{figure}[htp]
    \centering
 \input{./Chapter6/Figures/figure-zh-en-bilingual-sentence-pairs}
-    \caption{汉译英双语句对及词对齐}
+    \caption{汉译英句对及词对齐}
-    \label{fig:6-4-a}
+    \label{fig:6-3}
 \end{figure}
 %----------------------------------------------
-\parinterval 这里还用{\chapterthree}中的例子（图\ref{fig:6-4-a}）来进行说明j。在模型1中，``桌子''对齐到译文四个位置上的单词的概率是一样的。但在模型2中，``桌子''对齐到``table''被形式化为$a(a_j |j,m,l)=a(3|2,3,3)$，意思是对于源文位置2（$j=2$）的词，如果它的源语言和译文都是3个词（$l=3,m=3$），对齐到目标语译文位置3（$a_j=3$）的概率是多少？因为$a(a_j|j,m,l)$也是模型需要学习的参数，因此``桌子''对齐到不同目标语单词的概率也是不一样的。理想的情况下，通过$a(a_j|j,m,l)$，``桌子''对齐到``table''应该得到更高的概率。
+\parinterval IBM模型2的其他假设均与模型1相同，即源语言长度预测概率及源语言单词生成概率被定义为：
-\parinterval IBM模型2的其他假设均与模型1相同。把公式$\textrm{P}(m|\mathbf{t})\equiv\varepsilon$、$\textrm{P}(s_j|a_1^{j},s_1^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})$和\ref{eq:6-1}重新带入公式$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})=\textrm{P}(m|\mathbf{t})\prod_{j=1}^{m}{\textrm{P}(a_j|a_1^{j-1},s_1^{j-1},m,\mathbf{t})\textrm{P}(s_j|a_1^{j},s_1^{j-1},}$\\${m,\mathbf{t})}$和$\textrm{P}(\mathbf{s}|\mathbf{t})= \sum_{\mathbf{a}}\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$，可以得到IBM模型2的数学描述：
+\begin{eqnarray}
+\textrm{P}(m|\mathbf{t}) & \equiv & \varepsilon \label{eq:s-len-gen-prob} \\
+\textrm{P}(s_j|a_1^{j},s_1^{j-1},m,\mathbf{t}) & \equiv & f(s_j|t_{a_j}) 
+\label{eq:s-word-gen-prob}
+\end{eqnarray}
+把公式\ref{eq:s-len-gen-prob}、\ref{eq:s-word-gen-prob}和\ref{eq:6-1} 重新带入公式$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})=\textrm{P}(m|\mathbf{t})\prod_{j=1}^{m}{\textrm{P}(a_j|a_1^{j-1},s_1^{j-1},m,\mathbf{t})}$\\${\textrm{P}(s_j|a_1^{j},s_1^{j-1},m,\mathbf{t})}$ 和$\textrm{P}(\mathbf{s}|\mathbf{t})= \sum_{\mathbf{a}}\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$，可以得到IBM模型2的数学描述：
 \begin{eqnarray}
 \textrm{P}(\mathbf{s}| \mathbf{t}) & = &  \sum_{\mathbf{a}}{\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})} \nonumber \\
                       & = & \sum_{a_1=0}^{l}{\cdots}\sum _{a_m=0}^{l}{\varepsilon}\prod_{j=1}^{m}{a(a_j|j,m,l)f(s_j|t_{a_j})}
-\label{eq:6-2}
+\label{eq:6-4}
 \end{eqnarray}
-\parinterval 类似于模型1，模型2的表达式\ref{{eq:6-2}}也能被拆分为两部分进行理解。第一部分：遍历所有的$\mathbf{a}$；第二部分：对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$，即计算对齐概率$a(a_j|j,m,l)$和词汇翻译概率$f(s_j|t_{a_j})$对于所有源语言位置的乘积。
+\parinterval 类似于模型1，模型2的表达式\ref{eq:6-4}也能被拆分为两部分进行理解。第一部分：遍历所有的$\mathbf{a}$；第二部分：对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$，即计算对齐概率$a(a_j|j,m,l)$和词汇翻译概率$f(s_j|t_{a_j})$对于所有源语言位置的乘积。
-\parinterval 同样的，模型2的解码及训练优化和模型1的十分相似，在此不再赘述，详细推导过程可以参看{\chapterthree}解码及计算优化这一小节，这里给出IBM模型2的最终表达式：
+\parinterval 同样的，模型2的解码及训练优化和模型1的十分相似，在此不再赘述，详细推导过程可以参看{\chapterfive}解码及计算优化部分。这里直接给出IBM模型2的最终表达式：
 \begin{eqnarray}
-\textrm{IBM模型2：\ \ \ \ }\textrm{P}(\mathbf{s}| \mathbf{t}) & = & \varepsilon \prod\limits_{j=1}^{m} \sum\limits_{i=0}^{l} a(i|j,m,l) f(s_j|t_i)
+\textrm{P}(\mathbf{s}| \mathbf{t}) & = & \varepsilon \prod\limits_{j=1}^{m} \sum\limits_{i=0}^{l} a(i|j,m,l) f(s_j|t_i)
-\label{eq:6-3}
+\label{eq:6-5}
 \end{eqnarray}
@@ -120,39 +119,41 @@
 \subsection{隐马尔可夫模型}
-\parinterval IBM模型把翻译问题定义为对译文和词对齐同时进行生成的问题，模型翻译质量的好坏与词对齐有着非常紧密的联系。IBM模型1假设对齐概率仅依赖于译文长度，即对齐概率服从均匀分布；IBM模型2假设对齐概率与源语言、目标语言的句子长度以及源语言位置和目标语言位置相关。IBM模型2已经覆盖到了大部分的词对齐问题，但是该模型只考虑到了词语的绝对位置，并未考虑到相邻词语间的关系。图\ref{fig:6-5}展示了一个简单的实例，可以看到的是，汉语的每个词都被分配给了英语句子中的每一个单词，但是词语并不是任意分布在各个位置上的，而是倾向于生成簇。也就是说，如果源语言的两个词位置越近，它们的目标词在目标语言句子的位置也越近。
+\parinterval IBM模型把翻译问题定义为生成词对齐的问题，模型翻译质量的好坏与词对齐有着非常紧密的联系。IBM模型1假设对齐概率仅依赖于目标语言句子长度，即对齐概率服从均匀分布；IBM模型2假设对齐概率与源语言、目标语言的句子长度以及源语言位置和目标语言位置相关。虽然IBM模型2已经覆盖了一部分词对齐问题，但是该模型只考虑到了单词的绝对位置，并未考虑到相邻单词间的关系。图\ref{fig:6-4} 展示了一个简单的实例，可以看到的是，汉语的每个单词都被分配给了英语句子中的每一个单词，但是单词并不是任意分布在各个位置上的，而是倾向于生成簇。也就是说，如果源语言的两个单词位置越近，它们的译文在目标语言句子中的位置也越近。
 %----------------------------------------------
 \begin{figure}[htp]
    \centering
 \input{./Chapter6/Figures/figure-zh-en-sentence-alignment}
    \caption{汉译英句对及对齐}
-    \label{fig:6-5}
+    \label{fig:6-4}
 \end{figure}
 %----------------------------------------------
-\parinterval 因此，基于HMM的词对齐模型抛弃了IBM模型1-2的绝对位置假设，将一阶隐马尔可夫模型用于单词对齐问题。HMM词对齐模型认为，词语与词语之间并不是毫无联系的，对齐概率应该取决于对齐位置的差异而不是本身词语所在的位置。具体来说，位置$j$的对齐概率$a_j$与前一个位置$j-1$的对齐位置$a_{j-1}$和译文长度$l$有关，形式化的表述为：
+\parinterval 针对此问题，基于HMM的词对齐模型抛弃了IBM模型1-2的绝对位置假设，将一阶隐马尔可夫模型用于词对齐问题\cite{vogel1996hmm}。HMM词对齐模型认为，单词与单词之间并不是毫无联系的，对齐概率应该取决于对齐位置的差异而不是本身单词所在的位置。具体来说，位置$j$的对齐概率$a_j$与前一个位置$j-1$的对齐位置$a_{j-1}$和译文长度$l$有关，形式化的表述为：
 \begin{eqnarray}
-\textrm{P}(a_{j}|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})=\textrm{P}(a_{j}|a_{j-1},l)
+\textrm{P}(a_{j}|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})\equiv\textrm{P}(a_{j}|a_{j-1},l)
-\label{eq:6-4}
+\label{eq:6-6}
 \end{eqnarray}
-\parinterval 这里用图\ref{fig:6-5}的例子对公式进行说明。在IBM模型1-2中，词语的对齐都是与单词所在的绝对位置有关。但在HMM词对齐模型中，``你''对齐到``you''被形式化为$\textrm{P}(a_{j}|a_{j-1},l)= P(5|4,5)$，意思是对于源文位置$3(j=3)$的词，如果它的目标译文是5个词，上一个对齐位置是$4(a_{2}=4)$，对齐到目标语译文位置$5(a_{j}=5)$的概率是多少？理想的情况下，通过$\textrm{P}(a_{j}|a_{j-1},l)$，``你''对齐到``you''应该得到更高的概率，并且由于源语词``对''和``你''距离很近，因此其对应的对齐位置``with''和``you''的距离也应该很近。
+\parinterval 这里用图\ref{fig:6-4}的例子对公式进行说明。在IBM模型1-2中，单词的对齐都是与单词所在的绝对位置有关。但在HMM词对齐模型中，``你''对齐到``you''被形式化为$\textrm{P}(a_{j}|a_{j-1},l)= \textrm{P}(5|4,5)$，意思是对于源语言位置$3(j=3)$上的单词，如果目标语言句子包含5个单词（$l=5$），上一个对齐位置是$4(a_{2}=4)$，对齐到目标语言位置$5(a_{j}=5)$的概率是多少？理想的情况下，通过$\textrm{P}(a_{j}|a_{j-1},l)$，``你''对齐到``you''应该得到更高的概率，并且由于源语言单词``对''和``你''距离很近，因此其对应的对齐位置``with''和``you''的距离也应该很近。
-\parinterval 因此，把公式$\textrm{P}(s_j|a_1^{j},s_1^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})$和\ref{eq:6-4}重新带入公式$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})=\textrm{P}(m|\mathbf{t})$\\$\prod_{j=1}^{m}{\textrm{P}(a_j|a_1^{j-1},s_1^{j-1},m,\mathbf{t})\textrm{P}(s_j|a_1^{j},s_1^{j-1},m,\mathbf{t})}$和$\textrm{P}(\mathbf{s}|\mathbf{t})= \sum_{\mathbf{a}}\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$,可得HMM词对齐模型的数学描述：
+\parinterval 把公式$\textrm{P}(s_j|a_1^{j},s_1^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})$和\ref{eq:6-6}重新带入公式$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})=\textrm{P}(m|\mathbf{t})$\\$\prod_{j=1}^{m}{\textrm{P}(a_j|a_1^{j-1},s_1^{j-1},m,\mathbf{t})\textrm{P}(s_j|a_1^{j},s_1^{j-1},m,\mathbf{t})}$和$\textrm{P}(\mathbf{s}|\mathbf{t})= \sum_{\mathbf{a}}\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$,可得HMM词对齐模型的数学描述：
 \begin{eqnarray}
 \textrm{P}(\mathbf{s}| \mathbf{t})=\sum_{\mathbf{a}}{\textrm{P}(m|\mathbf{t})}\prod_{j=1}^{m}{\textrm{P}(a_{j}|a_{j-1},l)f(s_{j}|t_{a_j})}
-\label{eq:6-5}
+\label{eq:6-7}
 \end{eqnarray}
 \parinterval 此外，为了使得HMM的对齐概率$\textrm{P}(a_{j}|a_{j-1},l)$满足归一化的条件，这里还假设其对齐概率只取决于$a_{j}-a_{j-1}$，即：
 \begin{eqnarray}
 \textrm{P}(a_{j}|a_{j-1},l)=\frac{\mu(a_{j}-a_{j-1})}{\sum_{i=1}^{l}{\mu(i-a_{j-1})}}
-\label{eq:6-6}
+\label{eq:6-8}
 \end{eqnarray}
 \noindent 其中，$\mu( \cdot )$是隐马尔可夫模型的参数，可以通过训练得到。
+\parinterval 需要注意的是，公式\ref{eq:6-7}之所以被看作是一种隐马尔可夫模型，是由于其形式与标准的一阶隐马尔可夫模型无异。$\textrm{P}(a_{j}|a_{j-1},l)$可以被看作是一种状态转移概率，$f(s_{j}|t_{a_j})$可以被看作是一种发射概率。关于隐马尔可夫模型具体的数学描述也可参考{\chapterthree}中的相关内容。
 %----------------------------------------------------------------------------------------
@@ -162,62 +163,85 @@
 \sectionnewpage
 \section{基于繁衍率的翻译模型}
+下面介绍翻译中的一对多问题，以及这个问题所带来的句子长度预测问题。
 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
 %----------------------------------------------------------------------------------------
 \subsection{什么是繁衍率}
-{\color{red}{扣后面非自回归解码的问题，SMT和NMT中都有应用。从另一个角度说，繁衍率也是对翻译长度的一种建模，它本质在描述：一个词(或序列)翻译到另一种语言有多长？因此，在需要对翻译长度建模的问题中，繁衍率都可以被使用}}
-\parinterval 从前面的介绍可知，IBM模型1和模型2把不同的源语言单词看作相互独立的单元来进行词对齐和翻译。换句话说，即使某个源语言短语中的两个单词都对齐到同一个目标语单词，它们之间也是相互独立的。这样模型1和模型2对于多个源语言单词对齐到同一个目标语单词的情况并不能很好地进行描述。
+\parinterval 从前面的介绍可知，IBM模型1和模型2把不同的源语言单词看作相互独立的单元来进行词对齐和翻译。换句话说，即使某个源语言短语中的两个单词都对齐到同一个目标语单词，它们之间也是相互独立的。这样IBM模型1和模型2对于多个源语言单词对齐到同一个目标语单词的情况并不能很好地进行描述。
-\parinterval 这里将会给出另一个翻译模型，能在一定程度上解决上面提到的问题。该模型把译文生成源文的过程分解为如下几个步骤：首先，确定每个目标语言单词生成源语言单词的个数，这里把它称为{\small\sffamily\bfseries{产出率}}\index{繁衍率}或{\small\sffamily\bfseries{产出率}}\index{产出率}（Fertility）\index{Fertility}；其次，决定译文中每个单词生成的源语言单词都是什么，即决定生成的第一个源语言单词是什么，生成的第二个源语言单词是什么，以此类推。这样每个目标语单词就对应了一个源语言单词列表；最后把各组源语言单词列表中的每个单词都放置到合适的位置上，完成目标语言译文到源语言句子的生成。
+\parinterval 这里将会给出另一个翻译模型，能在一定程度上解决上面提到的问题\cite{Peter1993The,och2003systematic}。该模型把目标语言生成源语言的过程分解为如下几个步骤：首先，确定每个目标语言单词生成源语言单词的个数，这里把它称为{\small\sffamily\bfseries{繁衍率}}\index{繁衍率}或{\small\sffamily\bfseries{产出率}}\index{产出率}（Fertility）\index{Fertility}；其次，决定目标语言句子中每个单词生成的源语言单词都是什么，即决定生成的第一个源语言单词是什么，生成的第二个源语言单词是什么，以此类推。这样每个目标语言单词就对应了一个源语言单词列表；最后把各组源语言单词列表中的每个单词都放置到合适的位置上，完成目标语言译文到源语言句子的生成。
-\parinterval 对于句对$(\mathbf{s},\mathbf{t})$，令$\varphi$表示产出率，同时令${\tau}$表示每个目标语单词对应的源语言单词列表。图{\ref{fig:6-6}}描述了一个英文句子生成中文句子的过程。首先，对于每个英语单词$t_i$决定它的产出率$\varphi_{i}$。比如``Scientists''的产出率是2，可表示为${\varphi}_{1}=2$。这表明它会生成2个中文单词；其次，确定英文句子中每个单词生成的中文单词列表。比如``Scientists''生成``科学家''和``们''两个中文单词，可表示为${\tau}_1=\{{\tau}_{11}=\textrm{``科学家''},{\tau}_{12}=\textrm{``们''}$。这里用特殊的空标记NULL表示翻译对空的情况；最后，把生成的所有中文单词放在合适的位置。比如``科学家''和``们''分别放在$\mathbf{s}$的位置1和位置2。可以用符号$\pi$记录生成的单词在源语言句子$\mathbf{s}$中的位置。比如``Scientists''生成的中文单词在$\mathbf{s}$ 中的位置表示为${\pi}_{1}=\{{\pi}_{11}=1,{\pi}_{12}=2\}$。
+\parinterval 对于句对$(\mathbf{s},\mathbf{t})$，令$\varphi$表示产出率，同时令${\tau}$表示每个目标语言单词对应的源语言单词列表。图{\ref{fig:6-5}}描述了一个英语句子生成汉语句子的过程。
+\begin{itemize}
+\vspace{0.3em}
+\item 首先，对于每个英语单词$t_i$决定它的产出率$\varphi_{i}$。比如``Scientists''的产出率是2，可表示为${\varphi}_{1}=2$。这表明它会生成2个汉语单词；
+\vspace{0.3em}
+\item 其次，确定英语句子中每个单词生成的汉语单词列表。比如``Scientists''生成``科学家''和``们''两个汉语单词，可表示为${\tau}_1=\{{\tau}_{11}=\textrm{``科学家''},{\tau}_{12}=\textrm{``们''}\}$。 这里用特殊的空标记NULL表示翻译对空的情况；
+\vspace{0.3em}
+\item 最后，把生成的所有汉语单词放在合适的位置。比如``科学家''和``们''分别放在$\mathbf{s}$的位置1和位置2。可以用符号$\pi$记录生成的单词在源语言句子$\mathbf{s}$中的位置。比如``Scientists'' 生成的汉语单词在$\mathbf{s}$ 中的位置表示为${\pi}_{1}=\{{\pi}_{11}=1,{\pi}_{12}=2\}$。
+\vspace{0.3em}
+\end{itemize}
 %----------------------------------------------
 \begin{figure}[htp]
    \centering
 \input{./Chapter6/Figures/figure-probability-translation-process}
   \caption{基于产出率的翻译模型执行过程}
-   \label{fig:6-6}
+   \label{fig:6-5}
 \end{figure}
 %----------------------------------------------
-\parinterval 为了表述清晰，这里重新说明每个符号的含义。$\mathbf{s}$、$\mathbf{t}$、$m$和$l$分别表示源语言句子、目标语译文、源语言单词数量以及译文单词数量。$\mathbf{\varphi}$、$\mathbf{\tau}$ 和$\mathbf{\pi}$分别记录产出率、生成的源语言单词以及它们在源文中的位置。${\varphi}_{i}$表示第$i$个译文单词$t_i$的产出率。${\tau}_{i}$和${\pi}_i$分别表示$t_i$生成的源语言单词列表及其在源语言句子$\mathbf{s}$中的位置列表。
+\parinterval 为了表述清晰，这里重新说明每个符号的含义。$\mathbf{s}$、$\mathbf{t}$、$m$和$l$分别表示源语言句子、目标语言译文、源语言单词数量以及译文单词数量。$\mathbf{\varphi}$、$\mathbf{\tau}$ 和$\mathbf{\pi}$分别表示产出率、生成的源语言单词以及它们在源语言句子中的位置。${\varphi}_{i}$表示第$i$个目标语言单词$t_i$的产出率。${\tau}_{i}$和${\pi}_i$ 分别表示$t_i$生成的源语言单词列表及其在源语言句子$\mathbf{s}$中的位置列表。
 \parinterval 可以看出，一组$\tau$和$\pi$（记为$<\tau,\pi>$）可以决定一个对齐$\mathbf{a}$和一个源语句子$\mathbf{s}$。
-\noindent 相反的，一个对齐$\mathbf{a}$和一个源语句子$\mathbf{s}$可以对应多组$<\tau,\pi>$。如图\ref{fig:6-7}所示，不同的$<\tau,\pi>$对应同一个源语言句子和词对齐。它们的区别在于目标语单词``Scientists''生成的源语言单词``科学家''和``们''的顺序不同。这里把不同的$<\tau,\pi>$对应到的相同的源语句子$\mathbf{s}$和对齐$\mathbf{a}$记为$<\mathbf{s},\mathbf{a}>$。因此计算$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$时需要把每个可能结果的概率加起来，如下：
+\noindent 相反的，一个对齐$\mathbf{a}$和一个源语句子$\mathbf{s}$可以对应多组$<\tau,\pi>$。如图\ref{fig:6-6}所示，不同的$<\tau,\pi>$对应同一个源语言句子和词对齐。它们的区别在于目标语单词``Scientists''生成的源语言单词``科学家''和`` 们''的顺序不同。这里把不同的$<\tau,\pi>$对应到的相同的源语句子$\mathbf{s}$和对齐$\mathbf{a}$记为$<\mathbf{s},\mathbf{a}>$。因此计算$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$时需要把每个可能结果的概率加起来，如下：
 \begin{equation}
 \textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})=\sum_{{<\tau,\pi>}\in{<\mathbf{s},\mathbf{a}>}}{\textrm{P}(\tau,\pi|\mathbf{t}) }
-\label{eq:6-7}
+\label{eq:6-9}
 \end{equation}
-\parinterval 不过$<\mathbf{s},\mathbf{a}>$中有多少个元素呢？通过图\ref{fig:6-6}中的例子，可以推出$<\mathbf{s},\mathbf{a}>$应该包含$\prod_{i=0}^{l}{\varphi_i !}$个不同的二元组$<\tau,\pi>$。 这是因为在给定源语言句子和词对齐时，对于每一个$\tau_i$都有$\varphi_{i}!$种排列。
 %----------------------------------------------
 \begin{figure}[htp]
    \centering
 \input{./Chapter6/Figures/figure-example-of-t-s-generate}
   \caption{不同$\tau$和$\pi$对应相同的源语言句子和词对齐的情况}
-   \label{fig:6-7}
+   \label{fig:6-6}
 \end{figure}
 %----------------------------------------------
-\parinterval 进一步，$\textrm{P}(\tau,\pi|\mathbf{t})$可以被表示如图\ref{fig:6-8}的形式。其中$\tau_{i1}^{k-1}$表示$\tau_{i1}\tau_{i2}\cdots \tau_{i(k-1)}$，$\pi_{i1}^{ k-1}$表示$\pi_{i1}\pi_{i2}\cdots \pi_{i(k-1)}$。可以把图\ref{fig:6-8}中的公式分为5个部分，并用不同的序号和颜色进行标注。每部分的具体含义是：
+\parinterval 不过$<\mathbf{s},\mathbf{a}>$中有多少组$<\tau,\pi>$呢？通过图\ref{fig:6-5}中的例子，可以推出$<\mathbf{s},\mathbf{a}>$应该包含$\prod_{i=0}^{l}{\varphi_i !}$个不同的二元组$<\tau,\pi>$。 这是因为在给定源语言句子和词对齐时，对于每一个$\tau_i$都有$\varphi_{i}!$种排列。
+\parinterval 进一步，$\textrm{P}(\tau,\pi|\mathbf{t})$可以被表示如图\ref{fig:6-7}的形式。其中$\tau_{i1}^{k-1}$表示$\tau_{i1}\tau_{i2}\cdots \tau_{i(k-1)}$，$\pi_{i1}^{ k-1}$表示$\pi_{i1}\pi_{i2}\cdots \pi_{i(k-1)}$。可以把图\ref{fig:6-7}中的公式分为5个部分，并用不同的序号和颜色进行标注。每部分的具体含义是：
+%----------------------------------------------
+\begin{figure}[htp]
+    \centering
+\input{./Chapter6/Figures/figure-expression}
+   \caption{{$\textrm{P}(\tau,\pi|t)$}的详细表达式}
+\setlength{\belowcaptionskip}{-0.5em}
+   \label{fig:6-7}
+\end{figure}
+%----------------------------------------------
 \begin{itemize}
 \vspace{0.5em}
-\item 对每个$i\in[1,l]$的目标语单词的产出率建模（{\color{red!70} 红色}），即$\varphi_i$的概率。它依赖于$\mathbf{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^{i-1}$。\footnote{这里约定，当$i=1$ 时，$\varphi_1^0$ 表示空。}
+\item 第一部分：每个$i\in[1,l]$的目标语单词的产出率建模（{\color{red!70} 红色}），即$\varphi_i$的生成概率。它依赖于$\mathbf{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^{i-1}$。\footnote{这里约定，当$i=1$ 时，$\varphi_1^0$ 表示空。}
 \vspace{0.5em}
-\item $i=0$时的产出率建模（{\color{blue!70} 蓝色}），即空标记$t_0$的产出率的概率。它依赖于$\mathbf{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^l$。
+\item 第二部分：$i=0$时的产出率建模（{\color{blue!70} 蓝色}），即空标记$t_0$的产出率生成概率。它依赖于$\mathbf{t}$和区间$[1,l]$的目标语单词的产出率$\varphi_1^l$。
 \vspace{0.5em}
-\item 词汇翻译建模（{\color{green!70} 绿色}），目标语言单词$t_i$生成第$k$个源语言单词$\tau_{ik}$时的概率，依赖于$\mathbf{t}$、所有目标语言单词的产出率$\varphi_0^l$、区间$i\in[1,l]$的目标语言单词生成的源语言单词$\tau_1^{i-1}$和目标语单词$t_i$生成的前$k$个源语言单词$\tau_{i1}^{k-1}$。
+\item 第三部分：词汇翻译建模（{\color{green!70} 绿色}），目标语言单词$t_i$生成第$k$个源语言单词$\tau_{ik}$时的概率，依赖于$\mathbf{t}$、所有目标语言单词的产出率$\varphi_0^l$、区间$[1,i-1]$的目标语言单词生成的源语言单词$\tau_1^{i-1}$和目标语单词$t_i$生成的前$k-1$个源语言单词$\tau_{i1}^{k-1}$。
 \vspace{0.5em}
-\item 对于每个$i\in[1,l]$的目标语言单词生成的源语言单词的{\small\bfnew{扭曲度}}\index{扭曲度}（Distortion）\index{Distortion}建模（{\color{yellow!70!black} 黄色}），即第$i$个译文单词生成的第$k$个源语言单词在源文中的位置$\pi_{ik}$ 的概率。其中$\pi_1^{i-1}$ 和$\pi_{i1}^{k-1}$分别表示区间$[1,i-1]$的目标语言单词生成的源语言单词的扭曲度和第$i$译文单词生成的前$k$个源语言单词的扭曲度。
+\item 第四部分：对于每个$i\in[1,l]$的目标语言单词生成的源语言单词的扭曲度建模（{\color{yellow!70!black} 黄色}），即第$i$个目标语言单词生成的第$k$个源语言单词在源文中的位置$\pi_{ik}$ 的概率。其中$\pi_1^{i-1}$ 表示区间$[1,i-1]$的目标语言单词生成的源语言单词的扭曲度，$\pi_{i1}^{k-1}$表示第$i$个目标语言单词生成的前$k-1$个源语言单词的扭曲度。
 \vspace{0.5em}
-\item $i=0$时的扭曲度建模（{\color{gray!70} 灰色}），即空标记$t_0$生成的源语言单词在源语言句子中位置的概率。
+\item 第五部分：$i=0$时的扭曲度建模（{\color{gray!70} 灰色}），即空标记$t_0$生成源语言位置的概率。
 \end{itemize}
 %----------------------------------------------------------------------------------------
@@ -226,31 +250,35 @@
 \subsection{IBM 模型3}
-\parinterval IBM模型3通过一些假设对图\ref{fig:6-8}所表示的基本模型进行了化简。具体来说，对于每个$i\in[1,l]$，假设$\textrm{P}(\varphi_i |\varphi_1^{i-1},\mathbf{t})$仅依赖于$\varphi_i$和$t_i$，$\textrm{P}(\pi_{ik}|\pi_{i1}^{k-1},\pi_1^{i-1},\tau_0^l,\varphi_0^l,\mathbf{t})$仅依赖于$\pi_{ik}$、$i$、$m$和$l$。而对于所有的$i\in[0,l]$，假设$\textrm{P}(\tau_{ik}|\tau_{i1}^{k-1},\tau_1^{i-1},\phi_0^l,\mathbf{t})$仅依赖于$\tau_{ik}$和$t_i$。形式化这些假设，可以得到：
+\parinterval IBM模型3通过一些假设对图\ref{fig:6-7}所表示的基本模型进行了化简。具体来说，对于每个$i\in[1,l]$，假设$\textrm{P}(\varphi_i |\varphi_1^{i-1},\mathbf{t})$仅依赖于$\varphi_i$和$t_i$，$\textrm{P}(\pi_{ik}|\pi_{i1}^{k-1},\pi_1^{i-1},\tau_0^l,\varphi_0^l,\mathbf{t})$仅依赖于$\pi_{ik}$、$i$、$m$和$l$。而对于所有的$i\in[0,l]$，假设$\textrm{P}(\tau_{ik}|\tau_{i1}^{k-1},\tau_1^{i-1},\varphi_0^l,\mathbf{t})$仅依赖于$\tau_{ik}$和$t_i$。这些假设的形式化描述为：
-%----------------------------------------------
-\begin{figure}[htp]
-    \centering
-\input{./Chapter6/Figures/figure-expression}
-   \caption{{$\textrm{P}(\tau,\pi|t)$}的详细表达式}
-\setlength{\belowcaptionskip}{-0.5em}
-   \label{fig:6-8}
-\end{figure}
-%----------------------------------------------
 \begin{eqnarray}
-\textrm{P}(\varphi_i|\varphi_1^{i-1},\mathbf{t})                                                              & = &{\textrm{P}(\varphi_i|t_i)} \label{eq:6-8} \\
+\textrm{P}(\varphi_i|\varphi_1^{i-1},\mathbf{t})                                                              & = &{\textrm{P}(\varphi_i|t_i)} \label{eq:6-10} \\
-\textrm{P}(\tau_{ik} = s_j |\tau_{i1}^{k-1},\tau_{1}^{i-1},\varphi_0^t,\mathbf{t})             & = & t(s_j|t_i) \label{eq:6-9} \\
+\textrm{P}(\tau_{ik} = s_j |\tau_{i1}^{k-1},\tau_{1}^{i-1},\varphi_0^l,\mathbf{t})             & = & t(s_j|t_i) \label{eq:6-11} \\
-\textrm{P}(\pi_{ik} = j |\pi_{i1}^{k-1},\pi_{1}^{i-1},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t}) & = & d(j|i,m,l) \label{eq:6-10}
+\textrm{P}(\pi_{ik} = j |\pi_{i1}^{k-1},\pi_{1}^{i-1},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t}) & = & d(j|i,m,l) \label{eq:6-12}
 \end{eqnarray}
-\parinterval 通常把$d(j|i,m,l)$称为扭曲度函数。这里$\textrm{P}(\varphi_i|\varphi_1^{i-1},\mathbf{t})={\textrm{P}(\varphi_i|t_i)}$和${\textrm{P}(\pi_{ik}=j|\pi_{i1}^{k-1},}$ $\pi_{1}^{i-1},\tau_0^l,\varphi_0^l,\mathbf{t})=d(j|i,m,l)$仅对$1 \le i \le l$成立。这样就完成了图\ref{fig:6-8}中第1、 3和4部分的建模。
+\parinterval 通常把$d(j|i,m,l)$称为扭曲度函数。这里$\textrm{P}(\varphi_i|\varphi_1^{i-1},\mathbf{t})={\textrm{P}(\varphi_i|t_i)}$和${\textrm{P}(\pi_{ik}=j|\pi_{i1}^{k-1},}$ $\pi_{1}^{i-1},\tau_0^l,\varphi_0^l,\mathbf{t})=d(j|i,m,l)$仅对$1 \le i \le l$成立。这样就完成了图\ref{fig:6-7}中第1、 3和4部分的建模。
+\parinterval 对于$i=0$的情况需要单独进行考虑。实际上，$t_0$只是一个虚拟的单词。它要对应$\mathbf{s}$中原本为空对齐的单词。这里假设：要等其他非空对应单词都被生成（放置）后，才考虑这些空对齐单词的生成（放置）。即非空对单词都被生成后，在那些还有空的位置上放置这些空对的源语言单词。此外，在任何的空位置上放置空对的源语言单词都是等概率的，即放置空对齐源语言单词服从均匀分布。这样在已经放置了$k$个空对齐源语言单词的时候，应该还有$\varphi_0-k$个空位置。如果第$j$个源语言位置为空，那么
-\parinterval 对于$i=0$的情况需要单独进行考虑。实际上，$t_0$只是一个虚拟的单词。它要对应$\mathbf{s}$中原本为空对齐的单词。这里假设要等其他非空对应单词都被生成（放置）后，才考虑这些空对齐单词的生成（放置）。即非空对单词都被生成后，在那些还有空的位置上放置这些空对的源语单词。此外，在任何的空位置上放置空对的源语单词都是等概率的，即放置空对齐源语言单词服从均匀分布。这样在已经放置了$k$个空对齐源语言单词的时候，应该还有$\varphi_0-k$个空位置。如果第$i$个位置为空，那么$\textrm{P}(\pi_{0k}=i|\pi_{01}^{k-1},\pi_1^l,\tau_0^l,\varphi_0^l,\mathbf{t})=\frac{1}{\varphi_0-k}$，否则$\textrm{P}(\pi_{0k}=i|\pi_{01}^{k-1},\pi_1^l,\tau_0^l,\varphi_0^l,\mathbf{t})=0$。这样对于$t_0$所对应的$\tau_0$，就有
+\begin{equation}
+\textrm{P}(\pi_{0k}=j|\pi_{01}^{k-1},\pi_1^l,\tau_0^l,\varphi_0^l,\mathbf{t})=\frac{1}{\varphi_0-k+1}
+\label{eq:6-13}
+\end{equation}
+否则
+\begin{equation}
+\textrm{P}(\pi_{0k}=j|\pi_{01}^{k-1},\pi_1^l,\tau_0^l,\varphi_0^l,\mathbf{t})=0
+\label{eq:6-14}
+\end{equation}
+这样对于$t_0$所对应的$\tau_0$，就有
 {
 \begin{eqnarray}
 \prod_{k=1}^{\varphi_0}{\textrm{P}(\pi_{0k}|\pi_{01}^{k-1},\pi_{1}^{l},\tau_{0}^{l},\varphi_{0}^{l},\mathbf{t})         }=\frac{1}{\varphi_{0}!}
-\label{eq:6-11}
+\label{eq:6-15}
 \end{eqnarray}
 }
 \parinterval 而上面提到的$t_0$所对应的这些空位置是如何生成的呢？即如何确定哪些位置是要放置空对齐的源语言单词。在IBM模型3中，假设在所有的非空对齐源语言单词都被生成出来后（共$\varphi_1+\varphi_2+\cdots {\varphi}_l$个非空对源语单词），这些单词后面都以$p_1$概率随机地产生一个``槽''用来放置空对齐单词。这样，${\varphi}_0$就服从了一个二项分布。于是得到
@@ -260,10 +288,10 @@
 \varphi_1+\varphi_2+\cdots \varphi_l\\
 \varphi_0\\
 \end{array}\big)p_0^{\varphi_1+\varphi_2+\cdots \varphi_l-\varphi_0}p_1^{\varphi_0}
-\label{eq:6-12}
+\label{eq:6-16}
 \end{eqnarray}
 }
-\noindent 其中，$p_0+p_1=1$。到此为止，我们完成了图\ref{fig:6-8}中第2和5部分的建模。最终根据这些假设可以得到$\textrm{P}(\mathbf{s}| \mathbf{t})$的形式：
+\noindent 其中，$p_0+p_1=1$。到此为止，已经完成了图\ref{fig:6-7}中第2和5部分的建模。最终根据这些假设可以得到$\textrm{P}(\mathbf{s}| \mathbf{t})$的形式为：
 {
 \begin{eqnarray}
 {\textrm{P}(\mathbf{s}| \mathbf{t})}&= &{\sum_{a_1=0}^{l}{\cdots}\sum_{a_m=0}^{l}{\Big[\big(\begin{array}{c}
@@ -271,16 +299,16 @@ m-\varphi_0\\
 \varphi_0\\
 \end{array}\big)}p_0^{m-2\varphi_0}p_1^{\varphi_0}\prod_{i=1}^{l}{{\varphi_i}!n(\varphi_i|t_i)    }} \nonumber \\
 & & \times{\prod_{j=1}^{m}{t(s_j|t_{a_j})} \times \prod_{j=1,a_j\neq 0}^{m}{d(j|a_j,m,l)}} \Big]
-\label{eq:6-13}
+\label{eq:6-17}
 \end{eqnarray}
 }
 \noindent 其中，$n(\varphi_i |t_i)={\textrm{P}(\varphi_i|t_i)}$表示产出率的分布。这里的约束条件为，
 {
 \begin{eqnarray}
-\sum_{s}t(s|t)                     & = &1 \label{eq:6-14} \\
+\sum_{s_x}t(s_x|t_y)                     & = &1 \label{eq:6-18} \\
-\sum_{j}d(j|i,m,l)                & = & 1 \label{eq:6-15} \\
+\sum_{j}d(j|i,m,l)                & = & 1 \label{eq:6-19} \\
-\sum_{\varphi} n(\varphi|t) & = &1 \label{eq:6-16} \\
+\sum_{\varphi} n(\varphi|t_y) & = &1 \label{eq:6-20} \\
-p_0+p_1                            & = & 1 \label{eq:6-17}
+p_0+p_1                            & = & 1 \label{eq:6-21}
 \end{eqnarray}
 }
@@ -292,34 +320,35 @@ p_0+p_1                            & = & 1 \label{eq:6-17}
 \parinterval IBM模型3仍然存在问题，比如，它不能很好地处理一个目标语言单词生成多个源语言单词的情况。这个问题在模型1和模型2中也存在。如果一个目标语言单词对应多个源语言单词，往往这些源语言单词构成短语或搭配。但是模型1-3把这些源语言单词看成独立的单元，而实际上它们是一个整体。这就造成了在模型1-3中这些源语言单词可能会``分散''开。为了解决这个问题，模型4对模型3进行了进一步修正。
+\parinterval 为了更清楚地阐述，这里引入新的术语\ \dash \ {\small\bfnew{概念单元}}\index{概念单元}或{\small\bfnew{概念}}\index{概念}（Concept）\index{Concept}。词对齐可以被看作概念之间的对应。这里的概念是指具有独立语法或语义功能的一组单词。依照Brown等人的表示方法\cite{Peter1993The}，可以把概念记为cept.。每个句子都可以被表示成一系列的cept.。这里要注意的是，源语言句子中的cept.数量不一定等于目标语言句子中的cept.数量。因为有些cept. 可以为空，因此可以把那些空对的单词看作空cept.。比如，在图\ref{fig:6-8}的实例中，``了''就对应一个空cept.。
 %----------------------------------------------
 \begin{figure}[htp]
    \centering
 \input{./Chapter6/Figures/figure-word-alignment}
-   \caption{词对齐的汉译英句对及独立单词cept.的位置}
+   \caption{词对齐的汉译英句对及独立单词cept.的位置（记为$[i]$）}
-   \label{fig:6-9}
+   \label{fig:6-8}
 \end{figure}
 %----------------------------------------------
-\parinterval 为了更清楚的阐述，这里引入新的术语\ \dash \ {\small\bfnew{概念单元}}\index{概念单元}或{\small\bfnew{概念}}\index{概念}（Concept）\index{Concept}。词对齐可以被看作概念之间的对应。这里的概念是指具有独立语法或语义功能的一组单词。依照Brown等人的表示方法\cite{Peter1993The}，可以把概念记为cept.。每个句子都可以被表示成一系列的cept.。这里要注意的是，源语言句子中的cept.数量不一定等于目标句子中的cept.数量。因为有些cept. 可以为空，因此可以把那些空对的单词看作空cept.。比如，在图\ref{fig:6-9}的实例中，``了''就对应一个空cept.。
+\parinterval 在IBM模型的词对齐框架下，目标语的cept.只能是那些非空对齐的目标语单词，而且每个cept.只能由一个目标语言单词组成（通常把这类由一个单词组成的cept.称为独立单词cept.）。这里用$[i]$表示第$i$ 个独立单词cept.在目标语言句子中的位置。换句话说，$[i]$表示第$i$个非空对的目标语单词的位置。比如在本例中``mind''在$\mathbf{t}$中的位置表示为$[3]$。
-\parinterval 在IBM模型的词对齐框架下，目标语的cept.只能是那些非空对齐的目标语单词，而且每个cept.只能由一个目标语单词组成（通常把这类由一个单词组成的cept.称为独立单词cept.）。这里用$[i]$表示第$i$ 个独立单词cept.在目标语言句子中的位置。换句话说，$[i]$表示第$i$个非空对的目标语单词的位置。比如在本例中``mind''在$\mathbf{t}$中的位置表示为$[3]$。
+\parinterval 另外，可以用$\odot_{i}$表示位置为$[i]$的目标语言单词对应的那些源语言单词位置的平均值，如果这个平均值不是整数则对它向上取整。比如在本例中，目标语句中第4个cept. （``.''）对应在源语言句子中的第5个单词。可表示为${\odot}_{4}=5$。
-\parinterval 另外，可以用$\odot_{i}$表示位置为$[i]$的目标语言单词对应的那些源语言单词位置的平均值，如果这个平均值不是整数则对它向上取整。比如在本例中，目标语句中第4个cept. （``.''）对应在源语言句子中的第5个输出值。可表示为${\odot}_{4}=5$。
 \parinterval 利用这些新引进的概念，模型4对模型3的扭曲度进行了修改。主要是把扭曲度分解为两类参数。对于$[i]$对应的源语言单词列表($\tau_{[i]}$)中的第一个单词($\tau_{[i]1}$），它的扭曲度用如下公式计算：
 \begin{equation}
 \textrm{P}(\pi_{[i]1}=j|{\pi}_1^{[i]-1},{\tau}_0^l,{\varphi}_0^l,\mathbf{t})=d_{1}(j-{\odot}_{i-1}|A(t_{[i-1]}),B(s_j))
-\label{eq:6-18}
+\label{eq:6-22}
 \end{equation}
-\noindent 其中，译文的第$i$个单词生成的第$k$个源语单词在源语言句子中的位置用变量$\pi_{ik}$表示。而对于列表($\tau_{[i]}$)中的其他的单词($\tau_{[i]k},1 < k \le \varphi[i]$)的扭曲度计算，进行如下计算
+\noindent 其中，第$i$个目标语言单词生成的第$k$个源语言单词的位置用变量$\pi_{ik}$表示。而对于列表($\tau_{[i]}$)中的其他的单词($\tau_{[i]k},1 < k \le \varphi_{[i]}$)的扭曲度，用如下公式计算：
 \begin{equation}
 \textrm{P}(\pi_{[i]k}=j|{\pi}_{[i]1}^{k-1},\pi_1^{[i]-1},\tau_0^l,\varphi_0^l,\mathbf{t})=d_{>1}(j-\pi_{[i]k-1}|B(s_j))
-\label{eq:6-19}
+\label{eq:6-23}
 \end{equation}
-\parinterval 这里的函数$A(\cdot)$和函数$B(\cdot)$分别把目标语言和源语言的单词影射到单词的词类。这么做的目的一方面要减小参数空间的大小，另一方面是要减小数据的稀疏程度。词类信息通常可以通过外部工具得到，比如Brown聚类等。另一种简单的方法是把单词直接映射为它的词性。这样可以直接用现在已经非常成熟的词性标注工具解决问题。
+\parinterval 这里的函数$A(\cdot)$和函数$B(\cdot)$分别把目标语言和源语言的单词映射到单词的词类。这么做的目的是要减小参数空间的大小。词类信息通常可以通过外部工具得到，比如Brown聚类等。另一种简单的方法是把单词直接映射为它的词性。这样可以直接用现在已经非常成熟的词性标注工具解决问题。
 \parinterval 从上面改进的扭曲度模型可以看出，对于$t_{[i]}$生成的第一个源语言单词，要考虑中心$\odot_{[i]}$和这个源语言单词之间的绝对距离。实际上也就要把$t_{[i]}$生成的所有源语言单词看成一个整体并把它放置在合适的位置。这个过程要依据第一个源语言单词的词类和对应源语中心位置，和前一个非空对目标语言单词$t_{[i-1]}$的词类。而对于$t_{[i]}$生成的其他源语言单词，只需要考虑它与前一个刚放置完的源语言单词的相对位置和这个源语言单词的词类。
@@ -331,32 +360,32 @@ p_0+p_1                            & = & 1 \label{eq:6-17}
 \subsection{ IBM 模型5}
-\parinterval 模型3和模型4并不是``准确''的模型。这两个模型会把一部分概率分配给一些根本就不存在的句子。这个问题被称作IBM模型3和模型4的{\small\bfnew{缺陷}}\index{缺陷}（Deficiency）\index{Deficiency}。说的具体一些，模型3和模型4 中并没有这样的约束：如果已经放置了某个源语言单词的位置不能再放置其他单词，也就是说句子的任何位置只能放置一个词，不能多也不能少。由于缺乏这个约束，模型3和模型4中在所有合法的词对齐上概率和不等于1。 这部分缺失的概率被分配到其他不合法的词对齐上。举例来说，如图\ref{fig:6-10}所示，``吃 早饭''和``have breakfast''之间的合法词对齐用直线表示 。但是在模型3和模型4中， 在它们上的概率和为$0.9<1$。 损失掉的概率被分配到像5和6这样的对齐上了（红色）。虽然IBM模型并不支持一对多的对齐，但是模型3和模型4把概率分配给这些``不合法''的词对齐上，因此也就产生所谓的Deficiency问题。
+\parinterval 模型3和模型4并不是``准确''的模型。这两个模型会把一部分概率分配给一些根本就不存在的句子。这个问题被称作IBM模型3和模型4的{\small\bfnew{缺陷}}\index{缺陷}（Deficiency）\index{Deficiency}。说的具体一些，模型3和模型4 中并没有这样的约束：如果已经放置了某个源语言单词的位置不能再放置其他单词，也就是说句子的任何位置只能放置一个词，不能多也不能少。由于缺乏这个约束，模型3和模型4中在所有合法的词对齐上概率和不等于1。 这部分缺失的概率被分配到其他不合法的词对齐上。举例来说，如图\ref{fig:6-9}所示，``吃/早饭''和``have breakfast''之间的合法词对齐用直线表示 。但是在模型3和模型4中， 它们的概率和为$0.9<1$。 损失掉的概率被分配到像5和6这样的对齐上了（红色）。虽然IBM模型并不支持一对多的对齐，但是模型3和模型4把概率分配给这些`` 不合法''的词对齐上，因此也就产生所谓的缺陷。
 %----------------------------------------------
 \begin{figure}[htp]
    \centering
 \input{./Chapter6/Figures/figure-word-alignment&probability-distribution-in-ibm-model-3}
    \caption{IBM模型3的词对齐及概率分配}
-    \label{fig:6-10}
+    \label{fig:6-9}
 \end{figure}
 %----------------------------------------------
-\parinterval 为了解决这个问题，模型5在模型中增加了额外的约束。基本想法是，在放置一个源语言单词的时候检查这个位置是否已经放置了单词，如果可以则把这个放置过程赋予一定的概率，否则把它作为不可能事件。依据这个想法，就需要在逐个放置源语言单词的时候判断源语言句子的哪些位置为空。这里引入一个变量$v(j, {\tau_1}^{[i]-1}, \tau_{[i]1}^{k-1})$，它表示在放置$\tau_{[i]k}$之前（$\tau_1^{[i]-1}$和$\tau_{[i]1}^{k-1}$已经被放置完了），从源语言句子的第一个位置到位置$j$（包含$j$）为止还有多少个空位置。这里，把这个变量简写为$v_j$。于是，对于$[i]$所对应的源语言单词列表（$\tau_{[i]}$）中的第一个单词（$\tau_{[i]1}$），有：
+\parinterval 为了解决这个问题，模型5在模型中增加了额外的约束。基本想法是，在放置一个源语言单词的时候检查这个位置是否已经放置了单词，如果可以则把这个放置过程赋予一定的概率，否则把它作为不可能事件。基于这个想法，就需要在逐个放置源语言单词的时候判断源语言句子的哪些位置为空。这里引入一个变量$v(j, {\tau_1}^{[i]-1}, \tau_{[i]1}^{k-1})$，它表示在放置$\tau_{[i]k}$之前（$\tau_1^{[i]-1}$ 和$\tau_{[i]1}^{k-1}$已经被放置完了），从源语言句子的第一个位置到位置$j$（包含$j$）为止还有多少个空位置。这里，把这个变量简写为$v_j$。于是，对于$[i]$所对应的源语言单词列表（$\tau_{[i]}$）中的第一个单词（$\tau_{[i]1}$），有：
 \begin{eqnarray}
 \textrm{P}(\pi_{[i]1} = j | \pi_1^{[i]-1}, \tau_0^l, \varphi_0^l, \mathbf{t}) & = & d_1(v_j|B(s_j), v_{\odot_{i-1}}, v_m-(\varphi_{[i]}-1)) \cdot \nonumber \\
                                                                                                   &     & (1-\delta(v_j,v_{j-1}))
-\label{eq:6-20}
+\label{eq:6-24}
 \end{eqnarray}
 \parinterval 对于其他单词（$\tau_{[i]k}$, $1 < k\le\varphi_{[i]}$），有：
 \begin{eqnarray}
 &   & \textrm{P}(\pi_{[i]k}=j|\pi_{[i]1}^{k-1}, \pi_1^{[i]-1}, \tau_0^l, \varphi_0^l,\mathbf{t}) \nonumber \\
 &= & d_{>1}(v_j-v_{\pi_{[i]k-1}}|B(s_j), v_m-v_{\pi_{[i]k-1}}-\varphi_{[i]}+k) \cdot (1-\delta(v_j,v_{j-1}))
-\label{eq:6-20}
+\label{eq:6-25}
 \end{eqnarray}
-\noindent 这里，因子$1-\delta(v_i, v_{i-1})$是用来判断第$i$个位置是不是为空。如果第$i$个位置为空则$v_i = v_{i-1}$，这样$\textrm{P}(\pi_{[i]1}=i|\pi_1^{[i]-1}, \tau_0^l, \varphi_0^l, \mathbf{t}) = 0$。这样就从模型上避免了模型3和模型4中生成不存在的字符串的问题。这里还要注意的是，对于放置第一个单词的情况，影响放置的因素有$v_i$，$B(s_i)$和$v_{i-1}$。此外还要考虑在$i$位置放置了第一个单词以后它的右边是不是还有足够的位置留给剩下的$k-1$个单词。参数$v_m-(\varphi_{[i]}-1)$正是为了考虑这个因素，这里$v_m$表示整个源语言句子中还有多少空位置，$\varphi_{[i]}-1$表示$i$位置右边至少还要留出的空格数。对于放置非第一个单词的情况，主要是要考虑它和前一个放置位置的相对位置。这主要体现在参数$v_i-v_{\varphi_{[i]}k-1}$上。式\ref{eq:6-15}的其他部分都可以用上面的理论解释，这里不再赘述。
+\noindent 这里，因子$1-\delta(v_j, v_{j-1})$是用来判断第$j$个位置是不是为空。如果第$j$个位置不为空（已经被占用），则$v_j = v_{j-1}$，这样$\textrm{P}(\pi_{[i]1}=j|\pi_1^{[i]-1}, \tau_0^l, \varphi_0^l, \mathbf{t}) = 0$。这样就从模型上避免了模型3和模型4中生成不存在的字符串的问题。这里还要注意的是，对于放置第一个单词的情况，影响放置的因素有$v_j$，$B(s_j)$和$v_{\odot_{i-1}}$。此外还要考虑位置$j$放置了第一个源语言单词以后它的右边是不是还有足够的位置留给剩下的$\varphi_{[i]}-1$个源语言单词。参数$v_m-(\varphi_{[i]}-1)$正是为了考虑这个因素，这里$v_m$表示整个源语言句子中还有多少空位置，$\varphi_{[i]}-1$ 表示源语言位置$j$右边至少还要留出的空格数。对于放置非第一个单词的情况，主要是要考虑它和前一个放置位置的相对位置。这主要体现在参数$v_j-v_{\pi_{[i]k-1}}$上。式\ref{eq:6-25} 的其他部分都可以用上面的理论解释，这里不再赘述。
 \parinterval 实际上，模型5和模型4的思想基本一致，即，先确定$\tau_{[i]1}$的绝对位置，然后再确定$\tau_{[i]}$中剩余单词的相对位置。模型5消除了产生不存在的句子的可能性，不过模型5的复杂性也大大增加了。
 %----------------------------------------------------------------------------------------
@@ -366,9 +395,9 @@ p_0+p_1                            & = & 1 \label{eq:6-17}
 \sectionnewpage
 \section{解码和训练}
-\parinterval 和IBM模型1-2一样，IBM模型3-5和隐马尔可夫模型的解码可以直接使用\ref{sec:sentence-level-translation}\\节所描述的方法。基本思路是对译文自左向右生成，每次扩展一个源语言单词的翻译，即把源语言单词的译文放到已经生成的译文的右侧。每次扩展可以选择不同的源语言单词或者同一个源语言单词的不同翻译候选，这样就可以得到多个不同的扩展译文。在这个过程中，同时计算翻译模型和语言模型的得分，对每个得到译文候选打分。最终，保留一个或者多个译文。这个过程重复执行直至所有源语言单词被翻译完。
+\parinterval 与IBM模型1一样，IBM模型2-5和隐马尔可夫模型的解码可以直接使用{\chapterfive}所描述的方法。基本思路与{\chaptertwo}所描述的自左向右搜索方法一致，即：对译文自左向右生成，每次扩展一个源语言单词的翻译，即把源语言单词的译文放到已经生成的译文的右侧。每次扩展可以选择不同的源语言单词或者同一个源语言单词的不同翻译候选，这样就可以得到多个不同的扩展译文。在这个过程中，同时计算翻译模型和语言模型的得分，对每个得到译文候选打分。最终，保留一个或者多个译文。这个过程重复执行直至所有源语言单词被翻译完。
-\parinterval 类似的，IBM模型3-5和隐马尔可夫模型也都可以使用期望最大化（EM）方法进行模型训练。相关数学推导可参考附录\ref{appendix-B}的内容。通常，可以使用这些模型获得双语句子间的词对齐结果，比如著名的GIZA++工具。这时，往往会使用多个模型，把简单的模型训练后的参数作为初始值送给后面更加复杂的模型。比如，先用IBM模型1训练，之后把参数送给IBM模型2，再训练，之后把参数送给隐马尔可夫模型等。值得注意的是，并不是所有的模型使用EM算法都能找到全局最优解。特别是IBM模型3-5的训练中使用一些剪枝和近似的方法，优化的真实目标函数会更加复杂。不过，IBM模型1是一个{\small\bfnew{凸函数}}\index{凸函数}（Convex Function）\index{Convex function}，因此理论上使用EM方法是能找到全局最优解的。更实际的好处是，IBM模型1训练的最终结果与参数的初始化过程无关。这也是为什么在使用IBM系列模型时，往往会使用IBM模型1作为起始模型的原因。
+\parinterval 类似的，IBM模型2-5和隐马尔可夫模型也都可以使用期望最大化（EM）方法进行模型训练。相关数学推导可参考附录\ref{appendix-B}的内容。通常，可以使用这些模型获得双语句子间的词对齐结果，比如使用GIZA++工具。这时，往往会使用多个模型，把简单的模型训练后的参数作为初始值送给后面更加复杂的模型。比如，先用IBM模型1训练，之后把参数送给IBM模型2，再训练，之后把参数送给隐马尔可夫模型等。值得注意的是，并不是所有的模型使用EM算法都能找到全局最优解。特别是IBM模型3-5的训练中使用一些剪枝和近似的方法，优化的真实目标函数会更加复杂。不过，IBM模型1是一个{\small\bfnew{凸函数}}\index{凸函数}（Convex Function）\index{Convex function}，因此理论上使用EM方法能够找到全局最优解。更实际的好处是，IBM 模型1训练的最终结果与参数的初始化过程无关。这也是为什么在使用IBM 系列模型时，往往会使用IBM模型1作为起始模型的原因。
 %----------------------------------------------------------------------------------------
 %    NEW SECTION
@@ -385,11 +414,11 @@ p_0+p_1                            & = & 1 \label{eq:6-17}
 \subsection{词对齐及对称化}
-\parinterval IBM的五个模型都是基于一个词对齐的假设\ \dash \ 一个源语言单词最多只能对齐到一个目标语言单词。这个约束大大化简了IBM模型的建模。最初，Brown等人提出这个假设可能是因为在法英翻译中一对多的对齐情况并不多见，这个假设带来的问题也不是那么严重。但是，在像汉英翻译这样的任务中，一个汉语单词对应多个英语单词的翻译很常见，这时IBM模型的词对齐假设就表现出了明显的问题。比如在翻译``我\ \ 会\ \ 试一试 。''\ $\to$ \ ``I will have a try .''时，IBM模型根本不能把单词``试一试''对齐到三个单词``have a try''，因而可能无法得到正确的翻译结果。
+\parinterval IBM五个模型都是基于一个词对齐的假设\ \dash \ 一个源语言单词最多只能对齐到一个目标语言单词。这个约束大大降低了建模的难度。在法英翻译中一对多的对齐情况并不多见，这个假设带来的问题也不是那么严重。但是，在像汉英翻译这样的任务中，一个汉语单词对应多个英语单词的翻译很常见，这时IBM模型的词对齐假设就表现出了明显的问题。比如在翻译`` 我/会/试一试/。''\ $\to$ \ ``I will have a try .''时，IBM模型根本不能把单词``试一试''对齐到三个单词``have a try''，因而可能无法得到正确的翻译结果。
-\parinterval 本质上说，IBM模型词对齐的``不完整''问题是IBM模型本身的缺陷。解决这个问题有很多思路，第一种方法就是，反向训练后，合并源语言单词，然后再正向训练。这里用汉英翻译为例来解释这个方法。首先反向训练，就是把英语当作待翻译语言，而把汉语当作目标语言进行训练（参数估计）。这样可以得到一个词对齐结果（参数估计的中间结果）。在这个词对齐结果里面，一个汉语单词可对应多个英语单词。之后，扫描每个英语句子，如果有多个英语单词对应同一个汉语单词，就把这些英语单词合并成一个英语单词。处理完之后，再把汉语当作源语言而把英语当作目标语言进行训练。这样就可以把一个汉语单词对应到合并的英语单词上。虽然从模型上看，还是一个汉语单词对应一个英语``单词''，但实质上已经把这个汉语单词对应到多个英语单词上了。训练完之后，再利用这些参数进行翻译（解码）时，就能把一个中文单词翻译成多个英文单词了。但是反向训练后再训练也存在一些问题。首先，合并英语单词会使数据变得更稀疏，训练不充分。其次，由于IBM模型的词对齐结果并不是高精度的，利用它的词对齐结果来合并一些英文单词可能造成严重的错误，比如：把本来很独立的几个单词合在了一起。因此，此方法也并不完美。具体使用时还要考虑实际需要和问题的严重程度来决定是否使用这个方法。
+\parinterval 本质上，IBM模型词对齐的``不完整''问题是IBM模型本身的缺陷。解决这个问题有很多思路。一种思路是，反向训练后，合并源语言单词，然后再正向训练。这里用汉英翻译为例来解释这个方法。首先反向训练，就是把英语当作待翻译语言，而把汉语当作目标语言进行训练（参数估计）。这样可以得到一个词对齐结果（参数估计的中间结果）。在这个词对齐结果里面，一个汉语单词可对应多个英语单词。之后，扫描每个英语句子，如果有多个英语单词对应同一个汉语单词，就把这些英语单词合并成一个英语单词。处理完之后，再把汉语当作源语言而把英语当作目标语言进行训练。这样就可以把一个汉语单词对应到合并的英语单词上。虽然从模型上看，还是一个汉语单词对应一个英语``单词''，但实质上已经把这个汉语单词对应到多个英语单词上了。训练完之后，再利用这些参数进行翻译（解码）时，就能把一个中文单词翻译成多个英文单词了。但是反向训练后再训练也存在一些问题。首先，合并英语单词会使数据变得更稀疏，训练不充分。其次，由于IBM模型的词对齐结果并不是高精度的，利用它的词对齐结果来合并一些英文单词可能造成严重的错误，比如：把本来很独立的几个单词合在了一起。因此，还要考虑实际需要和问题的严重程度来决定是否使用该方法。
-\parinterval 另一种方法是双向对齐之后进行词对齐{\small\sffamily\bfseries{对称化}}\index{对称化}（Symmetrization）\index{Symmetrization}。这个方法可以在IBM词对齐的基础上获得对称的词对齐结果。思路很简单，用正向（汉语为源语言，英语为目标语言）和反向（汉语为目标语言，英语为源语言）同时训练。这样可以得到两个词对齐结果。然后利用一些启发性方法用这两个词对齐生成对称的结果（比如，取``并集''、``交集''等），这样就可以得到包含一对多和多对多的词对齐结果。比如，在基于短语的统计机器翻译中已经很成功地使用了这种词对齐信息进行短语的获取。直到今天，对称化仍然是很多自然语言处理系统中的一个关键步骤。
+\parinterval 另一种思路是双向对齐之后进行词对齐{\small\sffamily\bfseries{对称化}}\index{对称化}（Symmetrization）\index{Symmetrization}。这个方法可以在IBM词对齐的基础上获得对称的词对齐结果。思路很简单，用正向（汉语为源语言，英语为目标语言）和反向（汉语为目标语言，英语为源语言）同时训练。这样可以得到两个词对齐结果。然后利用一些启发性方法用这两个词对齐生成对称的结果（比如，取`` 并集''、``交集''等），这样就可以得到包含一对多和多对多的词对齐结果\cite{och2003systematic}。比如，在基于短语的统计机器翻译中已经很成功地使用了这种词对齐信息进行短语的获取。直到今天，对称化仍然是很多自然语言处理系统中的一个关键步骤。
 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
@@ -397,23 +426,23 @@ p_0+p_1                            & = & 1 \label{eq:6-17}
 \subsection{``缺陷''问题}
-\parinterval ``缺陷''问题是指翻译模型会把一部分概率分配给一些根本不存在的源语言字符串。如果用$\textrm{P}(\textrm{well}|\mathbf{t})$表示$\textrm{P}(\mathbf{s}| \mathbf{t})$在所有的正确的（可以理解为语法上正确的）$\mathbf{s}$上的和，即
+\parinterval IBM模型的缺陷是指翻译模型会把一部分概率分配给一些根本不存在的源语言字符串。如果用$\textrm{P}(\textrm{well}|\mathbf{t})$表示$\textrm{P}(\mathbf{s}| \mathbf{t})$在所有的正确的（可以理解为语法上正确的）$\mathbf{s}$上的和，即
 \begin{eqnarray}
 \textrm{P}(\textrm{well}|\mathbf{t})=\sum_{\mathbf{s}\textrm{\;is\;well\;formed}}{\textrm{P}(\mathbf{s}| \mathbf{t})}
-\label{eq:6-22}
+\label{eq:6-26}
 \end{eqnarray}
 \parinterval 类似地，用$\textrm{P}(\textrm{ill}|\mathbf{t})$表示$\textrm{P}(\mathbf{s}| \mathbf{t})$在所有的错误的（可以理解为语法上错误的）$\mathbf{s}$上的和。如果$\textrm{P}(\textrm{well}|\mathbf{t})+ \textrm{P}(\textrm{ill}|\mathbf{t})<1$，就把剩余的部分定义为$\textrm{P}(\textrm{failure}|\mathbf{t})$。它的形式化定义为，
 \begin{eqnarray}
 \textrm{P}({\textrm{failure}|\mathbf{t}})  = 1 - \textrm{P}({\textrm{well}|\mathbf{t}}) - \textrm{P}({\textrm{ill}|\mathbf{t}})
-\label{eq:6-23}
+\label{eq:6-27}
 \end{eqnarray}
-\parinterval 本质上，模型3和模型4就是对应$\textrm{P}({\textrm{failure}|\mathbf{t}})>0$的情况。这部分概率是模型损失掉的。有时候也把这类``缺陷''问题称为Technical Deficiency。还有一种``缺陷''问题被称作Spiritually Deficiency，它是指$\textrm{P}({\textrm{well}|\mathbf{t}}) + \textrm{P}({\textrm{ill}|\mathbf{t}}) = 1$且$\textrm{P}({\textrm{ill}|\mathbf{t}}) > 0$的情况。模型1和模型2就有Spiritually Deficiency的问题。可以注意到，Technical Deficiency只存在于模型3和模型4中，模型1和模型2并没有Technical Deficiency问题。根本原因是模型1和模型2的词对齐是从源语言出发对应到目标语言，$\mathbf{t}$到$\mathbf{s}$的翻译过程实际上是从单词$s_1$开始到单词$s_m$结束，依次把每个源语言单词$s_j$对应到唯一一个目标语言位置。显然，这个过程能够保证每个源语言单词仅对应一个目标语言单词。但是，模型3和模型4中对齐是从目标语言出发对应到源语言，$\mathbf{t}$到$\mathbf{s}$的翻译过程从$t_1$开始$t_l$结束，依次把目标语言单词$t_i$生成的单词对应到某个源语言位置上。但是这个过程不能保证$t_i$中生成的单词所对应的位置没有被其他已经完成对齐的目标语单词所生成的某个源语言单词对应过，因此也就产生了``缺陷''问题。
+\parinterval 本质上，模型3和模型4就是对应$\textrm{P}({\textrm{failure}|\mathbf{t}})>0$的情况。这部分概率是模型损失掉的。有时候也把这类缺陷称为{\small\bfnew{物理缺陷}}\index{物理缺陷}（Physical Deficiency\index{Physical Deficiency}）或{\small\bfnew{技术缺陷}}\index{技术缺陷}（Technical Deficiency\index{Technical Deficiency}）。还有一种缺陷被称作{\small\bfnew{精神缺陷}}\index{精神缺陷}（Spiritual Deficiency\index{Spiritual Deficiency}）或{\small\bfnew{逻辑缺陷}}\index{逻辑缺陷}（Logical Deficiency\index{Logical Deficiency}），它是指$\textrm{P}({\textrm{well}|\mathbf{t}}) + \textrm{P}({\textrm{ill}|\mathbf{t}}) = 1$ 且$\textrm{P}({\textrm{ill}|\mathbf{t}}) > 0$的情况。模型1 和模型2 就有逻辑缺陷。可以注意到，技术缺陷只存在于模型3 和模型4 中，模型1和模型2并没有技术缺陷问题。根本原因在于模型1和模型2的词对齐是从源语言出发对应到目标语言，$\mathbf{t}$到$\mathbf{s}$ 的翻译过程实际上是从单词$s_1$开始到单词$s_m$ 结束，依次把每个源语言单词$s_j$对应到唯一一个目标语言位置。显然，这个过程能够保证每个源语言单词仅对应一个目标语言单词。但是，模型3 和模型4中对齐是从目标语言出发对应到源语言，$\mathbf{t}$到$\mathbf{s}$的翻译过程从$t_1$开始到$t_l$结束，依次把目标语言单词$t_i$生成的单词对应到某个源语言位置上。但是这个过程不能保证$t_i$中生成的单词所对应的位置没有被其他单词占用，因此也就产生了缺陷。
-\parinterval 这里还要强调的是，Technical Deficiency是模型3和模型4是模型本身的缺陷造成的，如果有一个``更好''的模型就可以完全避免这个问题。而Spiritually Deficiency几乎是不能从模型上根本解决的，因为对于任意一种语言都不能枚举所有的句子（$\textrm{P}({\textrm{ill}|\mathbf{t}})$实际上是得不到的）。
+\parinterval 这里还要强调的是，技术缺陷是由模型3和模型4本身的缺陷造成的，如果有一个``更好''的模型就可以完全避免这个问题。而逻辑缺陷几乎是不能从模型上根本解决的，因为对于任意一种语言都不能枚举所有的句子（$\textrm{P}({\textrm{ill}|\mathbf{t}})$实际上是得不到的）。
-\parinterval IBM的模型5已经解决了Technical Deficiency问题。不过模型5过于复杂。实际上Technical Deficiency问题是不是需要解决，这一点在本节随后的内容中还要进行讨论。Spiritually Deficiency的解决很困难，因为即使对于人来说也很难判断一个句子是不是``良好''的句子。当然可以考虑用语言模型来缓解这个问题，不过由于在翻译的时候源语言句子都是定义``良好''的句子，$\textrm{P}({\textrm{ill}|\mathbf{t}})$对$\textrm{P}(\mathbf{s}| \mathbf{t})$的影响并不大。但用输入的源语言句子$\mathbf{s}$的``良好性''并不能解决Technical Deficiency，因为Technical Deficiency是模型的问题或者模型参数估计方法的问题。无论输入什么样的$\mathbf{s}$，模型3和模型4的Technical Deficiency问题都存在。
+\parinterval IBM的模型5已经解决了技术缺陷问题。但逻辑缺陷的解决很困难，因为即使对于人来说也很难判断一个句子是不是``良好''的句子。当然可以考虑用语言模型来缓解这个问题，不过由于在翻译的时候源语言句子都是定义``良好''的句子，$\textrm{P}({\textrm{ill}|\mathbf{t}})$对$\textrm{P}(\mathbf{s}| \mathbf{t})$的影响并不大。但用输入的源语言句子$\mathbf{s}$的``良好性''并不能解决技术缺陷，因为技术缺陷是模型的问题或者模型参数估计方法的问题。无论输入什么样的$\mathbf{s}$，模型3和模型4的技术缺陷问题都存在。
 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
@@ -421,9 +450,9 @@ p_0+p_1                            & = & 1 \label{eq:6-17}
 \subsection{句子长度}
-\parinterval 在IBM模型中，$\textrm{P}(\mathbf{t})\textrm{P}(\mathbf{s}| \mathbf{t})$会随着目标语言句子长度的增加而减少，因为这种生成模型有多个概率化的因素组成，一般乘积项越多结果的值越小。这也就是说，IBM模型会更倾向选择长度短一些的目标语言句子。显然这种对短句子的偏向性并不是我们所期望的。
+\parinterval 在IBM模型中，$\textrm{P}(\mathbf{t})\textrm{P}(\mathbf{s}| \mathbf{t})$会随着目标语言句子长度的增加而减少，因为这种模型由多个概率化的因素组成，乘积项越多结果的值越小。这也就是说，IBM模型会更倾向选择长度短一些的目标语言句子。显然这种对短句子的偏向性并不是机器翻译所期望的。
-\parinterval 这个问题在很多统计机器翻译系统中都存在，实际上也是一种{\small\bfnew{系统偏置}}\index{系统偏置}（System Bias）\index{System Bias}的体现。为了消除这种偏置，可以通过在模型中增加一个短句子惩罚引子来抵消掉模型对短句子的倾向性。比如，可以定义一个惩罚引子，它的值随着长度的减少而增加。不过，简单引入这样的惩罚因子会导致模型并不符合一个严格的噪声信道模型。它对应一个判别式框架的翻译模型，这部分内容会在下一章进行介绍。
+\parinterval 这个问题在很多机器翻译系统中都存在。它实际上也是一种{\small\bfnew{系统偏置}}\index{系统偏置}（System Bias）\index{System Bias}的体现。为了消除这种偏置，可以通过在模型中增加一个短句子惩罚因子来抵消掉模型对短句子的倾向性。比如，可以定义一个惩罚因子，它的值随着长度的减少而增加。不过，简单引入这样的惩罚因子会导致模型并不符合一个严格的噪声信道模型。它对应一个基于判别式框架的翻译模型，这部分内容会在{\chapterseven}进行介绍。
 %----------------------------------------------------------------------------------------
 %    NEW SUB-SECTION
@@ -431,7 +460,7 @@ p_0+p_1                            & = & 1 \label{eq:6-17}
 \subsection{其他问题}
-\parinterval 模型5的意义是什么？模型5的提出是为了消除模型3和模型4的Deficiency问题。Deficiency问题的本质是，$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$在所有合理的对齐上概率和不为1。 但是，在统计机器翻译中更关心是哪个对齐$\mathbf{a}$使$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$达到最大，即使$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$不符合概率分布的定义，也并不影响我们寻找理想的对齐$\mathbf{a}$。从工程的角度说，$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$不归一并不是一个十分严重的问题。遗憾的是，实际上到现在为止有太多对IBM模型3和模型4中的Deficiency 问题进行过系统的实验和分析，但对于这个问题到底有多严重并没有定论。当然用模型5是可以解决这个问题。但是如果用一个非常复杂的模型去解决了一个并不产生严重后果的问题，那这个模型也就没有太大意义了（从实践的角度）。
+\parinterval 模型5的意义是什么？模型5的提出是为了消除模型3和模型4的缺陷。缺陷的本质是，$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$在所有合理的对齐上概率和不为1。 但是，在这里更关心的是哪个对齐$\mathbf{a}$使$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$达到最大，即使$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$不符合概率分布的定义，也并不影响我们寻找理想的对齐$\mathbf{a}$。从工程的角度说，$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$不归一并不是一个十分严重的问题。遗憾的是，实际上到现在为止有很多工作对IBM模型3和模型4中的缺陷进行过系统的实验和分析，但对于这个问题到底有多严重并没有定论。当然用模型5是可以解决这个问题。但是如果用一个非常复杂的模型去解决了一个并不产生严重后果的问题，那这个模型也就没有太大意义了（从实践的角度）。
 \parinterval 概念（cept.）的意义是什么？经过前面的分析可知，IBM模型的词对齐模型使用了cept.这个概念。但是，在IBM模型中使用的cept.最多只能对应一个目标语言单词（模型并没有用到源语言cept. 的概念）。因此可以直接用单词代替cept.。这样，即使不引入cept.的概念，也并不影响IBM模型的建模。实际上，cept.的引入确实可以帮助我们从语法和语义的角度解释词对齐过程。不过，这个方法在IBM 模型中的效果究竟如何还没有定论。

--- a/ChapterAppend/chapterappend.tex
+++ b/ChapterAppend/chapterappend.tex
@@ -172,26 +172,56 @@
 %    NEW SECTION
 %----------------------------------------------------------------------------------------
+\section{IBM模型2训练方法}
+IBM模型2与模型1的训练过程完全一样，本质上都是EM方法，因此可以直接复用{\chapterfive}中训练模型1的流程。对于句对$(\mathbf{s},\mathbf{t})$，$m=|\mathbf{s}|$，$l=|\mathbf{t}|$，E-Step的计算公式如下，其中参数$f(s_j|t_i)$与IBM模型1 一样：
+\begin{eqnarray}
+c(s_u|t_v;\mathbf{s},\mathbf{t}) &=&\sum\limits_{j=1}^{m} \sum\limits_{i=0}^{l} \frac{f(s_u|t_v)a(i|j,m,l) \delta(s_j,s_u)\delta (t_i,t_v) }   {\sum_{k=0}^{l} f(s_u|t_k)a(k|j,m,l)} \\
+c(i|j,m,l;\mathbf{s},\mathbf{t}) &=&\frac{f(s_j|t_i)a(i|j,m,l)}   {\sum_{k=0}^{l} f(s_j|t_k)a(k|j,m,l)}
+\label{eq:append-1}
+\end{eqnarray}
+\parinterval M-Step的计算公式如下，其中参数$a(i|j,m,l)$表示调序概率：
+\begin{eqnarray}
+f(s_u|t_v) &=&\frac{c(s_u|t_v;\mathbf{s},\mathbf{t}) }    {\sum_{s_u} c(s_u|t_v;\mathbf{s},\mathbf{t})} \\
+a(i|j,m,l) &=&\frac{c(i|j,m,l;\mathbf{s},\mathbf{t})}  {\sum_{i}c(i|j,m,l;\mathbf{s},\mathbf{t})}
+\label{eq:append-2}
+\end{eqnarray}
+对于由$K$个样本组成的训练集$\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[K]},\mathbf{t}^{[K]})\}$，可以将M-Step的计算调整为：
+\begin{eqnarray}
+f(s_u|t_v) &=&\frac{\sum_{k=1}^{K}c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]}) }    {\sum_{s_u} \sum_{k=1}^{K} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})} \\
+a(i|j,m,l) &=&\frac{\sum_{k=1}^{K}c_{\mathbb{E}}(i|j,m,l;\mathbf{s}^{[k]},\mathbf{t}^{[k]})}  {\sum_{i}\sum_{k=1}^{K}c_{\mathbb{E}}(i|j,m,l;\mathbf{s}^{[k]},\mathbf{t}^{[k]})}
+\label{eq:append-3}
+\end{eqnarray}
+%----------------------------------------------------------------------------------------
+%    NEW SECTION
+%----------------------------------------------------------------------------------------
 \section{IBM模型3训练方法}
-\parinterval 模型3的参数估计与模型1和模型2采用相同的方法。这里直接给出辅助函数。
+\parinterval IBM模型3的参数估计与模型1和模型2采用相同的方法。这里直接给出辅助函数。
 \begin{eqnarray}
-h(t,d,n,p, \lambda,\mu, \nu, \zeta) & = &  \textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})-\sum_{t}\lambda_{t}\big(\sum_{s}t(s|t)-1\big)  \nonumber \\
+h(t,d,n,p, \lambda,\mu, \nu, \zeta) & = &  \funp{P}_{\theta}(\mathbf{s}|\mathbf{t})-\sum_{t}\lambda_{t}\big(\sum_{s}t(s|t)-1\big)  \nonumber \\
 & & -\sum_{i}\mu_{iml}\big(\sum_{j}d(j|i,m,l)-1\big) \nonumber \\
 & & -\sum_{t}\nu_{t}\big(\sum_{\varphi}n(\varphi|t)-1\big)-\zeta(p^0+p^1-1)
 \label{eq:1.1}
 \end{eqnarray}
-\parinterval 由于篇幅所限这里略去了推导步骤直接给出一些用于参数估计的等式。
+\parinterval 由于篇幅所限这里略去了推导步骤直接给出具体公式。
 \begin{eqnarray}
-c(s|t,\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\textrm{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \sum_{j=1}^{m} (\delta(s_j,s) \cdot \delta(t_{a_{j}},t))\big] \label{eq:1.2} \\
+c(s|t,\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \sum_{j=1}^{m} (\delta(s_j,s) \cdot \delta(t_{a_{j}},t))\big] \label{eq:1.2} \\
-c(j|i,m,l;\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\textrm{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \delta(i,a_j)\big] \label{eq:1.3} \\
+c(j|i,m,l;\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \delta(i,a_j)\big] \label{eq:1.3} \\
-c(\varphi|t;\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\textrm{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \sum_{i=1}^{l}\delta(\varphi,\varphi_{i})\delta(t,t_i)\big]
+c(\varphi|t;\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \sum_{i=1}^{l}\delta(\varphi,\varphi_{i})\delta(t,t_i)\big]
 \label{eq:1.4}
 \end{eqnarray}
 \begin{eqnarray}
-c(0|\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\textrm{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t})  \times (m-2\varphi_0) \big] \label{eq:1.5} \\
+c(0|\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t})  \times (m-2\varphi_0) \big] \label{eq:1.5} \\
-c(1|\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\textrm{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \varphi_0 \big] \label{eq:1.6}
+c(1|\mathbf{s},\mathbf{t}) & = & \sum_{\mathbf{a}}\big[\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \varphi_0 \big] \label{eq:1.6}
 \end{eqnarray}
 \parinterval 进一步，对于由$K$个样本组成的训练集，有：
@@ -202,9 +232,9 @@ n(\varphi|t) & = & \nu_{t}^{-1} \times \sum_{s=1}^{K}c(\varphi |t;\mathbf{s}^{[k
 p_x & = & \zeta^{-1} \sum_{k=1}^{K}c(x;\mathbf{s}^{[k]},\mathbf{t}^{[k]}) \label{eq:1.10}
 \end{eqnarray}
-\parinterval 在模型3中，因为产出率的引入，并不能像模型1和模型2那样，在保证正确性的情况下加速参数估计的过程。这就使得每次迭代过程中，都不得不面对大小为$(l+1)^m$的词对齐空间。遍历所有$(l+1)^m$个词对齐所带来的高时间复杂度显然是不能被接受的。因此就要考虑能否仅利用词对齐空间中的部分词对齐对这些参数进行估计。比较简单且直接的方法就是仅利用Viterbi对齐来进行参数估计\footnote{Viterbi词对齐可以被简单的看作搜索到的最好词对齐。}。 遗憾的是，在模型3中并没有方法直接获得Viterbi对齐。这样只能采用一种折中的策略，即仅考虑那些使得$\textrm{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t})$达到较高值的词对齐。这里把这部分词对齐组成的集合记为$S$。式\ref{eq:1.2}可以被修改为：
+\parinterval 在模型3中，因为繁衍率的引入，并不能像模型1和模型2那样，在保证正确性的情况下加速参数估计的过程。这就使得每次迭代过程中，都不得不面对大小为$(l+1)^m$的词对齐空间。遍历所有$(l+1)^m$个词对齐所带来的高时间复杂度显然是不能被接受的。因此就要考虑能否仅利用词对齐空间中的部分词对齐对这些参数进行估计。比较简单的方法是仅使用Viterbi对齐来进行参数估计，这里Viterbi 词对齐可以被简单地看作搜索到的最好词对齐。遗憾的是，在模型3中并没有方法直接获得Viterbi对齐。这样只能采用一种折中的策略，即仅考虑那些使得$\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t})$ 达到较高值的词对齐。这里把这部分词对齐组成的集合记为$S$。式\ref{eq:1.2}可以被修改为：
 \begin{eqnarray}
-c(s|t,\mathbf{s},\mathbf{t}) \approx \sum_{\mathbf{a} \in \mathbf{S}}\big[\textrm{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \sum_{j=1}^{m}(\delta(s_j,\mathbf{s}) \cdot \delta(t_{a_{j}},\mathbf{t})) \big]
+c(s|t,\mathbf{s},\mathbf{t}) \approx \sum_{\mathbf{a} \in S}\big[\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times \sum_{j=1}^{m}(\delta(s_j,s) \cdot \delta(t_{a_{j}},t)) \big]
 \label{eq:1.11}
 \end{eqnarray}
@@ -217,15 +247,15 @@ S = N(b^{\infty}(V(\mathbf{s}|\mathbf{t};2))) \cup (\mathop{\cup}\limits_{ij} N(
 \parinterval 为了理解这个公式，先介绍几个概念。
 \begin{itemize}
 \item $V(\mathbf{s}|\mathbf{t})$表示Viterbi词对齐，$V(\mathbf{s}|\mathbf{t},1)$、$V(\mathbf{s}|\mathbf{t},2)$和$V(\mathbf{s}|\mathbf{t},3)$就分别对应了模型1、2 和3 的Viterbi 词对齐；
-\item 把那些满足第$j$个源语言单词对应第$i$个目标语言单词（$a_j=i$）的词对齐构成的集合记为$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$。通常称这些对齐中$j$和$i$被``钉''在了一起。在$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$中使$\textrm{P}(\mathbf{a}|\mathbf{s},\mathbf{t})$达到最大的那个词对齐被记为$V_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$；
+\item 把那些满足第$j$个源语言单词对应第$i$个目标语言单词（$a_j=i$）的词对齐构成的集合记为$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$。通常称这些对齐中$j$和$i$被``钉''在了一起。在$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$中使$\funp{P}(\mathbf{a}|\mathbf{s},\mathbf{t})$达到最大的那个词对齐被记为$V_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$；
 \item 如果两个词对齐，通过交换两个词对齐连接就能互相转化，则称它们为邻居。一个词对齐$\mathbf{a}$的所有邻居记为$N(\mathbf{a})$。
 \end{itemize}
 \vspace{0.5em}
-\parinterval 公式\ref{eq:1.12}中，$b^{\infty}(V(\mathbf{s}|\mathbf{t};2))$ 和 $b_{i \leftrightarrow j}^{\infty}(V_{i \leftrightarrow j}(\mathbf{s}|\mathbf{t},2))$ 分别是对 $V(\mathbf{s}|\mathbf{t};3)$ 和 $V_{i \leftrightarrow j}(\mathbf{s}|\mathbf{t},3)$ 的估计。在计算$S$的过程中，需要知道一个对齐$\bf{a}$的邻居$\bf{a}^{'}$的概率，即通过$\textrm{P}_{\theta}(\mathbf{a},\mathbf{s}|\mathbf{t})$计算$\textrm{p}_{\theta}(\mathbf{a}',\mathbf{s}|\mathbf{t})$。在模型3中，如果$\bf{a}$和$\bf{a}'$仅区别于某个源语单词对齐到的目标位置上（$a_j \neq a_{j}'$），那么
+\parinterval 公式\ref{eq:1.12}中，$b^{\infty}(V(\mathbf{s}|\mathbf{t};2))$ 和 $b_{i \leftrightarrow j}^{\infty}(V_{i \leftrightarrow j}(\mathbf{s}|\mathbf{t},2))$ 分别是对 $V(\mathbf{s}|\mathbf{t};3)$ 和 $V_{i \leftrightarrow j}(\mathbf{s}|\mathbf{t},3)$ 的估计。在计算$S$的过程中，需要知道一个对齐$\mathbf{a}$的邻居$\mathbf{a}'$的概率，即通过$\funp{P}_{\theta}(\mathbf{a},\mathbf{s}|\mathbf{t})$计算$\funp{P}_{\theta}(\mathbf{a}',\mathbf{s}|\mathbf{t})$。在模型3中，如果$\mathbf{a}$和$\mathbf{a}'$仅区别于某个源语单词对齐到的目标位置上（$a_j \neq a_{j}'$），那么
 \begin{eqnarray}
-\textrm{P}_{\theta}(\mathbf{a}',\mathbf{s}|\mathbf{t}) & = & \textrm{P}_{\theta}(\mathbf{a},\mathbf{s}|\mathbf{t}) \cdot  \nonumber \\
+\funp{P}_{\theta}(\mathbf{a}',\mathbf{s}|\mathbf{t}) & = & \funp{P}_{\theta}(\mathbf{a},\mathbf{s}|\mathbf{t}) \cdot  \nonumber \\
                                                                                   &     & \frac{\varphi_{i'}+1}{\varphi_i} \cdot \frac{n(\varphi_{i'}+1|t_{i'})}{n(\varphi_{i'}|t_{i'})} \cdot \frac{n(\varphi_{i}-1|t_{i})}{n(\varphi_{i}|t_{i})} \cdot \nonumber \\
                                                                                   &     & \frac{t(s_j|t_{i'})}{t(s_{j}|t_{i})} \cdot \frac{d(j|i',m,l)}{d(j|i,m,l)}
 \label{eq:1.13}
@@ -233,7 +263,7 @@ S = N(b^{\infty}(V(\mathbf{s}|\mathbf{t};2))) \cup (\mathop{\cup}\limits_{ij} N(
 \parinterval 如果$\mathbf{a}$和$\mathbf{a}'$区别于两个位置$j_1$和$j_2$的对齐上，$a_{j_{1}}=a'_{j_{2}}$且$a_{j_{2}}=a'_{j_{1}}$，那么
 \begin{eqnarray}
-\textrm{P}_{\theta}(\mathbf{a'},\mathbf{s}|\mathbf{t}) = \textrm{P}_{\theta}(\mathbf{a},\mathbf{s}|\mathbf{t}) \cdot \frac{t(s_{j_{2}}|t_{a_{j_{2}}})}{t(s_{j_{1}}|t_{a_{j_{1}}})} \cdot \frac{d(j_{2}|a_{j_{2}},m,l)}{d(j_{1}|a_{j_{1}},m,l)}
+\funp{P}_{\theta}(\mathbf{a'},\mathbf{s}|\mathbf{t}) = \funp{P}_{\theta}(\mathbf{a},\mathbf{s}|\mathbf{t}) \cdot \frac{t(s_{j_{2}}|t_{a_{j_{2}}})}{t(s_{j_{1}}|t_{a_{j_{1}}})} \cdot \frac{d(j_{2}|a_{j_{2}},m,l)}{d(j_{1}|a_{j_{1}},m,l)}
 \label{eq:1.14}
 \end{eqnarray}
@@ -247,15 +277,15 @@ S = N(b^{\infty}(V(\mathbf{s}|\mathbf{t};2))) \cup (\mathop{\cup}\limits_{ij} N(
 \parinterval 模型4的参数估计基本与模型3一致。需要修改的是扭曲度的估计公式，对于目标语第$i$个cept.生成的第一个单词，可以得到（假设有$K$个训练样本）：
 \begin{eqnarray}
-d_1(\Delta_j|ca,cb;\mathbf{s},\mathbf{t}) = \mu_{1cacb}^{-1} \times \sum_{k=1}^{K}c_1(\Delta_j|ca,cb;\mathbf{s}^{[k]},\mathbf{t}^{[k]})
+d_1(\Delta_j|ca,cb) = \mu_{1cacb}^{-1} \times \sum_{k=1}^{K}c_1(\Delta_j|ca,cb;\mathbf{s}^{[k]},\mathbf{t}^{[k]})
 \label{eq:1.15}
 \end{eqnarray}
 其中，
 \begin{eqnarray}
-c_1(\Delta_j|ca,cb;\mathbf{s},\mathbf{t})           & = & \sum_{\mathbf{a}}\big[\textrm{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times s_1(\Delta_j|ca,cb;\mathbf{a},\mathbf{s},\mathbf{t})\big] \label{eq:1.16} \\
+c_1(\Delta_j|ca,cb;\mathbf{s},\mathbf{t})           & = & \sum_{\mathbf{a}}\big[\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times s_1(\Delta_j|ca,cb;\mathbf{a},\mathbf{s},\mathbf{t})\big] \label{eq:1.16} \\
-s_1(\Delta_j|ca,cb;\rm{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \big[\varepsilon(\phi_i) \cdot \delta(\pi_{i1}-\odot _{i},\Delta_j) \cdot \nonumber \\
+s_1(\Delta_j|ca,cb;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \big[\varepsilon(\varphi_i) \cdot \delta(\pi_{i1}-\odot _{i},\Delta_j) \cdot \nonumber \\
                                                                           &     & \delta(A(t_{i-1}),ca) \cdot \delta(B(\tau_{i1}),cb) \big] \label{eq:1.17}
 \end{eqnarray}
@@ -272,7 +302,7 @@ s_1(\Delta_j|ca,cb;\rm{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \big[\vareps
 对于目标语第$i$个cept.生成的其他单词（非第一个单词），可以得到：
 \begin{eqnarray}
-d_{>1}(\Delta_j|cb;\mathbf{s},\mathbf{t}) = \mu_{>1cb}^{-1} \times \sum_{k=1}^{K}c_{>1}(\Delta_j|cb;\mathbf{s}^{[k]},\mathbf{t}^{[k]})
+d_{>1}(\Delta_j|cb) = \mu_{>1cb}^{-1} \times \sum_{k=1}^{K}c_{>1}(\Delta_j|cb;\mathbf{s}^{[k]},\mathbf{t}^{[k]})
 \label{eq:1.18}
 \end{eqnarray}
@@ -280,7 +310,7 @@ d_{>1}(\Delta_j|cb;\mathbf{s},\mathbf{t}) = \mu_{>1cb}^{-1} \times \sum_{k=1}^{K
 \begin{eqnarray}
 c_{>1}(\Delta_j|cb;\mathbf{s},\mathbf{t})                  & = & \sum_{\mathbf{a}}\big[\funp{P}_{\theta}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times s_{>1}(\Delta_j|cb;\mathbf{a},\mathbf{s},\mathbf{t}) \big] \label{eq:1.19} \\
-s_{>1}(\Delta_j|cb;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \big[\varepsilon(\phi_i-1)\sum_{k=2}^{\phi_i}\delta(\pi_{[i]k}-\pi_{[i]k-1},\Delta_j) \cdot \nonumber ß\\
+s_{>1}(\Delta_j|cb;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \big[\varepsilon(\varphi_i-1)\sum_{k=2}^{\varphi_i}\delta(\pi_{[i]k}-\pi_{[i]k-1},\Delta_j) \cdot \nonumber \\
                                                                                  &    & \delta(B(\tau_{[i]k}),cb) \big] \label{eq:1.20}
 \end{eqnarray}
@@ -291,7 +321,7 @@ s_{>1}(\Delta_j|cb;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \big[\va
 \label{eq:1.22}
 \end{eqnarray}
-\parinterval 对于一个对齐$\mathbf{a}$，可用模型3对它的邻居进行排名，即按$\textrm{P}_{\theta}(b(\mathbf{a})|\mathbf{s},\mathbf{t};3)$排序，其中$b(\mathbf{a})$表示$\mathbf{a}$的邻居。$\tilde{b}(\mathbf{a})$ 表示这个排名表中满足$\textrm{P}_{\theta}(\mathbf{a}'|\mathbf{s},\mathbf{t};4) > \textrm{P}_{\theta}⁡(\mathbf{a}|\mathbf{s},\mathbf{t};4)$的最高排名的$\mathbf{a}'$。同理可知$\tilde{b}_{i \leftrightarrow j}^{\infty}(\mathbf{a})$ 的意义。这里之所以不用模型3中采用的方法直接利用$b^{\infty}(\mathbf{a})$得到模型4中高概率的对齐，是因为模型4中，要想获得某个对齐$\mathbf{a}$的邻居$\mathbf{a}'$，必须做很大调整，比如：调整$\tau_{[i]1}$和$\odot_{i}$等等。这个过程要比模型3的相应过程复杂得多。因此在模型4中只能借助于模型3的中间步骤来进行参数估计。
+\parinterval 对于一个对齐$\mathbf{a}$，可用模型3对它的邻居进行排名，即按$\funp{P}_{\theta}(b(\mathbf{a})|\mathbf{s},\mathbf{t};3)$排序，其中$b(\mathbf{a})$表示$\mathbf{a}$的邻居。$\tilde{b}(\mathbf{a})$ 表示这个排名表中满足$\funp{P}_{\theta}(\mathbf{a}'|\mathbf{s},\mathbf{t};4) > \funp{P}_{\theta}(\mathbf{a}|\mathbf{s},\mathbf{t};4)$的最高排名的$\mathbf{a}'$。 同理可知$\tilde{b}_{i \leftrightarrow j}^{\infty}(\mathbf{a})$ 的意义。这里之所以不用模型3中采用的方法直接利用$b^{\infty}(\mathbf{a})$得到模型4中高概率的对齐，是因为模型4中要想获得某个对齐$\mathbf{a}$的邻居$\mathbf{a}'$必须做很大调整，比如：调整$\tau_{[i]1}$和$\odot_{i}$等等。这个过程要比模型3的相应过程复杂得多。因此在模型4中只能借助于模型3的中间步骤来进行参数估计。
 \setlength{\belowdisplayskip}{3pt}%调整空白大小
 %----------------------------------------------------------------------------------------
@@ -299,45 +329,45 @@ s_{>1}(\Delta_j|cb;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \big[\va
 %----------------------------------------------------------------------------------------
 \section{IBM模型5训练方法}
-\parinterval 模型5的参数估计过程也与模型3的过程基本一致，二者的区别在于扭曲度的估计公式。在模型5中，对于目标语第$i$个cept.生成的第一单词，可以得到（假设有$K$个训练样本）：
+\parinterval 模型5的参数估计过程也与模型4的过程基本一致，二者的区别在于扭曲度的估计公式。在模型5中，对于目标语第$i$个cept.生成的第一个单词，可以得到（假设有$K$个训练样本）：
 \begin{eqnarray}
-d_1(\Delta_j|cb;\mathbf{s},\mathbf{t}) = \mu_{1cb}^{-1} \times \sum_{k=1}^{K}c_1(\Delta_j|cb;\mathbf{s}^{[k]},\mathbf{t}^{[k]})
+d_1(\Delta_j|cb,v_x,v_y) = \mu_{1cb}^{-1} \times \sum_{k=1}^{K}c_1(\Delta_j|cb,v_x,v_y;\mathbf{s}^{[k]},\mathbf{t}^{[k]})
 \label{eq:1.23}
 \end{eqnarray}
 其中，
 \begin{eqnarray}
-c_1(\Delta_j|cb,v_x,v_y;\mathbf{s},\mathbf{t})                   & = & \sum_{\mathbf{a}}\Big[ \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times s_1(\Delta_j|cb,v_x,v_y;\mathbf{a},\mathbf{s},\mathbf{t}) \Big] \label{eq:1.24} \\
+c_1(\Delta_j|cb,v_x,v_y;\mathbf{s},\mathbf{t})                   & = & \sum_{\mathbf{a}}\Big[ \funp{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) \times s_1(\Delta_j|cb,v_x,v_y;\mathbf{a},\mathbf{s},\mathbf{t}) \Big] \label{eq:1.24} \\
-s_1(\Delta_j|cb,v_x,v_y;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \Big [ \varepsilon(\phi_i) \cdot \delta(v_{\pi_{i1}},\Delta_j) \cdot \delta(v_{\odot _{i-1}},v_x) \nonumber \\
+s_1(\Delta_j|cb,v_x,v_y;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l \Big [ \varepsilon(\varphi_i) \cdot \delta(v_{\pi_{i1}},\Delta_j) \cdot \delta(v_{\odot _{i-1}},v_x) \nonumber \\
-                                                                                          &    & \cdot \delta(v_m-\phi_i+1,v_y) \cdot \delta(v_{\pi_{i1}},v_{\pi_{i1}-1} )\Big] \label{eq:1.25}
+                                                                                          &    & \cdot \delta(v_m-\varphi_i+1,v_y) \cdot \delta(v_{\pi_{i1}},v_{\pi_{i1}-1} )\Big] \label{eq:1.25}
 \end{eqnarray}
 对于目标语第$i$个cept.生成的其他单词（非第一个单词），可以得到：
 \begin{eqnarray}
-d_{>1}(\Delta_j|cb,v;\mathbf{s},\mathbf{t}) = \mu_{>1cb}^{-1} \times \sum_{k=1}^{K}c_{>1}(\Delta_j|cb,v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})
+d_{>1}(\Delta_j|cb,v) = \mu_{>1cb}^{-1} \times \sum_{k=1}^{K}c_{>1}(\Delta_j|cb,v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})
 \label{eq:1.26}
 \end{eqnarray}
 其中，
 \begin{eqnarray}
-c_{>1}(\Delta_j|cb,v;\mathbf{s},\mathbf{t})                   & =  & \sum_{\mathbf{a}}\Big[\textrm{P}(\mathbf{a},\mathbf{s}|\mathbf{t}) \times s_{>1}(\Delta_j|cb,v;\mathbf{a},\mathbf{s},\mathbf{t}) \Big] \label{eq:1.27} \\
+c_{>1}(\Delta_j|cb,v;\mathbf{s},\mathbf{t})                   & =  & \sum_{\mathbf{a}}\Big[\funp{P}(\mathbf{a},\mathbf{s}|\mathbf{t}) \times s_{>1}(\Delta_j|cb,v;\mathbf{a},\mathbf{s},\mathbf{t}) \Big] \label{eq:1.27} \\
-s_{>1}(\Delta_j|cb,v;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l\Big[\varepsilon(\phi_i-1)\sum_{k=2}^{\phi_i} \big[\delta(v_{\pi_{ik}}-v_{\pi_{[i]k}-1},\Delta_j)  \nonumber \\
+s_{>1}(\Delta_j|cb,v;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l\Big[\varepsilon(\varphi_i-1)\sum_{k=2}^{\varphi_i} \big[\delta(v_{\pi_{ik}}-v_{\pi_{[i]k}-1},\Delta_j)  \nonumber \\
-                                                                                    &     & \cdot \delta(B(\tau_{[i]k}) ,cb) \cdot \delta(v_m-v_{\pi_{i(k-1)}}-\phi_i+k,v) \nonumber \\
+                                                                                    &     & \cdot \delta(B(\tau_{[i]k}) ,cb) \cdot \delta(v_m-v_{\pi_{i(k-1)}}-\varphi_i+k,v) \nonumber \\
                                                                                    &     & \cdot \delta(v_{\pi_{i1}},v_{\pi_{i1}-1}) \big] \Big] \label{eq:1.28}
 \end{eqnarray}
 \vspace{0.5em}
-\parinterval 从式(\ref{eq:1.24})中可以看出因子$\delta(v_{\pi_{i1}},v_{\pi_{i1}-1})$保证了，即使对齐$\mathbf{a}$不合理（一个源语位置对应多个目标语位置）也可以避免在这个不合理的对齐上计算结果。需要注意的是因子$\delta(v_{\pi_{p1}},v_{\pi_{p1-1}})$，确保了$\mathbf{a}$中不合理的部分不产生坏的影响，而$\mathbf{a}$中其他正确的部分仍会参与迭代。
+\parinterval 从式\ref{eq:1.24}中可以看出因子$\delta(v_{\pi_{i1}},v_{\pi_{i1}-1})$保证了，即使对齐$\mathbf{a}$不合理（一个源语言位置对应多个目标语言位置）也可以避免在这个不合理的对齐上计算结果。需要注意的是因子$\delta(v_{\pi_{p1}},v_{\pi_{p1-1}})$，确保了$\mathbf{a}$中不合理的部分不产生坏的影响，而$\mathbf{a}$中其他正确的部分仍会参与迭代。
 \parinterval 不过上面的参数估计过程与IBM前4个模型的参数估计过程并不完全一样。IBM前4个模型在每次迭代中，可以在给定$\mathbf{s}$、$\mathbf{t}$和一个对齐$\mathbf{a}$的情况下直接计算并更新参数。但是在模型5的参数估计过程中（如公式\ref{eq:1.24}），需要模拟出由$\mathbf{t}$生成$\mathbf{s}$的过程才能得到正确的结果，因为从$\mathbf{t}$、$\mathbf{s}$和$\mathbf{a}$中是不能直接得到正确结果的。具体说，就是要从目标语言句子的第一个单词开始到最后一个单词结束，依次生成每个目标语言单词对应的源语言单词，每处理完一个目标语言单词就要暂停，然后才能计算式\ref{eq:1.24}中求和符号里面的内容。这也就是说即使给定了$\mathbf{s}$、$\mathbf{t}$和一个对齐$\mathbf{a}$，也不能直接在它们上进行计算，必须重新模拟$\mathbf{t}$到$\mathbf{s}$的生成过程。
-\parinterval 从前面的分析可以看出，虽然模型5比模型4更精确，但是模型5过于复杂以至于给参数估计增加了计算量（对于每组$\mathbf{t}$、$\mathbf{s}$和$\mathbf{a}$都要模拟$\mathbf{t}$生成$\mathbf{s}$的翻译过程）。因此模型5的开发对于系统实现是一个挑战。
+\parinterval 从前面的分析可以看出，虽然模型5比模型4更精确，但是模型5过于复杂以至于给参数估计增加了计算量（对于每组$\mathbf{t}$、$\mathbf{s}$和$\mathbf{a}$都要模拟$\mathbf{t}$生成$\mathbf{s}$的翻译过程）。因此模型5的系统实现是一个挑战。
 \parinterval 在模型5中同样需要定义一个词对齐集合$S$，使得每次迭代都在$S$上进行。可以对$S$进行如下定义
 \begin{eqnarray}
@@ -346,7 +376,7 @@ s_{>1}(\Delta_j|cb,v;\mathbf{a},\mathbf{s},\mathbf{t}) & = & \sum_{i=1}^l\Big[\v
 \end{eqnarray}
 \vspace{0.5em}
-\parinterval 这里$\tilde{\tilde{b}}(\mathbf{a})$借用了模型4中$\tilde{b}(\mathbf{a})$的概念。不过$\tilde{\tilde{b}}(\mathbf{a})$表示在利用模型3进行排名的列表中满足$\textrm{P}_{\theta}(\mathbf{a}'|\mathbf{s},\mathbf{t};5)$的最高排名的词对齐。
+\noindent 其中，$\tilde{\tilde{b}}(\mathbf{a})$借用了模型4中$\tilde{b}(\mathbf{a})$的概念。不过$\tilde{\tilde{b}}(\mathbf{a})$表示在利用模型3进行排名的列表中满足$\funp{P}_{\theta}(\mathbf{a}'|\mathbf{s},\mathbf{t};5) > \funp{P}_{\theta}(\mathbf{a}|\mathbf{s},\mathbf{t};5)$的最高排名的词对齐，这里$\mathbf{a}'$表示$\mathbf{a}$的邻居。
 \end{appendices}

--- a/ChapterPreface/chapterpreface.tex
+++ b/ChapterPreface/chapterpreface.tex
@@ -38,7 +38,7 @@
 最初，笔者的想法仅仅是将机器翻译的技术内容做成资料供人阅读。但是，朋友、同事们一直鼓励将内容正式出版。虽然担心书的内容不够精致，无法给同行作为参考，但是最终还是下定决心重构内容。所幸，得到电子工业出版社的支持，形成新版，共十八章。
-写作中，每当笔者翻起以前的资料，都会想起当年的一些故事。与其说这部书是写给读者，还不如说这本书是写给笔者自己，写给所有同笔者一样，经历过或正在经历机器翻译蓬勃发展年代的人。希望本书可以作为一个时代的记录，但是这个时代未并结束，还将继续，并更加美好。
+写作中，每当笔者翻起以前的资料，都会想起当年的一些故事。与其说这部书是写给读者，还不如说这本书是写给笔者自己，写给所有同笔者一样，经历过或正在经历机器翻译蓬勃发展年代的人。希望本书可以作为一个时代的记录，但是这个时代并未结束，还将继续，并更加美好。
 \vspace{1.0em}

--- a/bibliography.bib
+++ b/bibliography.bib
@@ -937,18 +937,6 @@
 }
-@article{shannon1948mathematical,
-  title ={A mathematical theory of communication},
-  author ={Shannon, Claude E},
-  journal ={Bell system technical journal},
-  volume ={27},
-  number ={3},
-  pages ={379--423},
-  year ={1948},
-  publisher ={Wiley Online Library}
-}
 @inproceedings{ng2002discriminative,
  title ={On discriminative vs. generative classifiers: A comparison of logistic regression and naive bayes},
  author ={Ng, Andrew Y and Jordan, Michael I},
@@ -1042,13 +1030,6 @@
  year={2001}
 }
-@article{周志华2016《机器学习》,
-  title={《机器学习》},
-  author={周志华},
-  journal={中国民商},
-  number={3},
-  year={2016},
-}
 @book{李航2012统计学习方法,
  title={统计学习方法},
@@ -1065,39 +1046,39 @@
 }
 @inproceedings{brants-2000-tnt,
-    title = "{T}n{T} {--} A Statistical Part-of-Speech Tagger",
+    title = {{T}n{T} {--} A Statistical Part-of-Speech Tagger},
-    author = "Brants, Thorsten",
+    author = {Brants, Thorsten},
    month = apr,
-    year = "2000",
+    year = {2000},
-    address = "Seattle, Washington, USA",
+    address = {Seattle, Washington, USA},
-    publisher = "Association for Computational Linguistics",
+    publisher = {Association for Computational Linguistics},
-    doi = "10.3115/974147.974178",
+    doi = {10.3115/974147.974178},
-    pages = "224--231",
+    pages = {224--231},
 }
 @inproceedings{tsuruoka-tsujii-2005-chunk,
-    title = "Chunk Parsing Revisited",
+    title = {Chunk Parsing Revisited},
-    author = "Tsuruoka, Yoshimasa  and
+    author = {Tsuruoka, Yoshimasa  and
-      Tsujii, Jun{'}ichi",
+      Tsujii, Jun{'}ichi},
    month = oct,
-    year = "2005",
+    year = {2005},
-    address = "Vancouver, British Columbia",
+    address = {Vancouver, British Columbia},
-    publisher = "Association for Computational Linguistics",
+    publisher = {Association for Computational Linguistics},
-    pages = "133--140",
+    pages = {133--140},
 }
 @inproceedings{li-etal-2003-news-oriented,
-    title = "News-Oriented Automatic {C}hinese Keyword Indexing",
+    title = {News-Oriented Automatic {C}hinese Keyword Indexing},
-    author = "Li, Sujian  and
+    author = {Li, Sujian  and
      Wang, Houfeng  and
      Yu, Shiwen  and
-      Xin, Chengsheng",
+      Xin, Chengsheng},
    month = jul,
-    year = "2003",
+    year = {2003},
-    address = "Sapporo, Japan",
+    address = {Sapporo, Japan},
-    publisher = "Association for Computational Linguistics",
+    publisher = {Association for Computational Linguistics},
-    doi = "10.3115/1119250.1119263",
+    doi = {10.3115/1119250.1119263},
-    pages = "92--97",
+    pages = {92--97},
 }
 @article{2015Bidirectional,
@@ -1141,11 +1122,6 @@
  year={2018}
 }
-@misc{radford2018improving,
-  title={Improving language understanding by generative pre-training},
-  author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
-  year={2018}
-}
 @article{conneau2019unsupervised,
  title={Unsupervised cross-lingual representation learning at scale},
@@ -1187,20 +1163,7 @@
  pages     = {239--258},
  year      = {1993}
 }
-@inproceedings{callison-burch-etal-2012-findings,
-    title = {Findings of the 2012 Workshop on Statistical Machine Translation},
-    author = {Callison-Burch, Chris  and
-      Koehn, Philipp  and
-      Monz, Christof  and
-      Post, Matt  and
-      Soricut, Radu  and
-      Specia, Lucia},
-    booktitle = {Proceedings of the Seventh Workshop on Statistical Machine Translation},
-    month = jun,
-    year = {2012},
-    publisher = {Association for Computational Linguistics},
-    pages = {10--51}
-}
 @inproceedings{DBLP:conf/coling/SuWC92,
  author    = {Keh{-}Yih Su and
               Ming{-}Wen Wu and
@@ -1234,12 +1197,6 @@
  publisher = {{ACL}},
  year      = {1992}
 }
-@inproceedings{doddington2002automatic,
-  title={Automatic evaluation of machine translation quality using n-gram co-occurrence statistics},
-  author={Doddington, George},
-  pages={138--145},
-  year={2002}
-}
 @inproceedings{DBLP:conf/emnlp/ChiangDCN08,
  author    = {David Chiang and
               Steve DeNeefe and
@@ -1368,17 +1325,6 @@
  publisher = {European Association for Machine Translation},
  year      = {2015}
 }
-@article{DBLP:journals/jmlr/BengioDVJ03,
-  author    = {Yoshua Bengio and
-               R{\'{e}}jean Ducharme and
-               Pascal Vincent and
-               Christian Janvin},
-  title     = {A Neural Probabilistic Language Model},
-  journal   = {J. Mach. Learn. Res.},
-  volume    = {3},
-  pages     = {1137--1155},
-  year      = {2003}
-}
 @inproceedings{DBLP:conf/emnlp/SocherPHNM11,
  author    = {Richard Socher and
               Jeffrey Pennington and
@@ -1437,13 +1383,6 @@
  publisher = {European Language Resources Association},
  year      = {2004}
 }
-@article{article,
-author = {Jones, Douglas and Shen, Wade and Granoien, Neil and Herzog, Martha and Weinstein, Clifford},
-year = {2005},
-month = {05},
-pages = {7},
-title = {Measuring Translation Quality by Testing English Speakers with a New Defense Language Proficiency Test for Arabic}
-}
 @inproceedings{DBLP:conf/icassp/JonesGSGHRW05,
  author    = {Douglas A. Jones and
               Edward Gibson and
@@ -1571,19 +1510,6 @@ title = {Measuring Translation Quality by Testing English Speakers with a New De
  publisher = {Association for Computational Linguistics},
  year      = {2017}
 }
-@inproceedings{DBLP:conf/naacl/PetersNIGCLZ18,
-  author    = {Matthew E. Peters and
-               Mark Neumann and
-               Mohit Iyyer and
-               Matt Gardner and
-               Christopher Clark and
-               Kenton Lee and
-               Luke Zettlemoyer},
-  title     = {Deep Contextualized Word Representations},
-  pages     = {2227--2237},
-  publisher = {Association for Computational Linguistics},
-  year      = {2018}
-}
 @inproceedings{DBLP:conf/emnlp/PenningtonSM14,
  author    = {Jeffrey Pennington and
               Richard Socher and
@@ -1593,17 +1519,6 @@ title = {Measuring Translation Quality by Testing English Speakers with a New De
  publisher = {{ACL}},
  year      = {2014}
 }
-@inproceedings{DBLP:conf/naacl/DevlinCLT19,
-  author    = {Jacob Devlin and
-               Ming{-}Wei Chang and
-               Kenton Lee and
-               Kristina Toutanova},
-  title     = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language
-               Understanding},
-  pages     = {4171--4186},
-  publisher = {Association for Computational Linguistics},
-  year      = {2019}
-}
 @inproceedings{DBLP:conf/nips/KirosZSZUTF15,
  author    = {Ryan Kiros and
               Yukun Zhu and
@@ -1657,14 +1572,6 @@ title = {Measuring Translation Quality by Testing English Speakers with a New De
  pages={224--231},
  year={2003}
 }
-@inproceedings{DBLP:conf/amta/Whi94teOO,
-  author    = {John S. White and
-               Theresa A. O'Connell and
-               Francis E. O'Mara},
-  title     = {The {ARPA} {MT} Evaluation Methodologies: Evolution, Lessons, and
-               Future Approaches},
-  year      = {1994}
-}
 @article{DBLP:journals/mt/PrzybockiPBS09,
  author    = {Mark A. Przybocki and
               Kay Peterson and
@@ -1956,9 +1863,7 @@ title = {Measuring Translation Quality by Testing English Speakers with a New De
 }
 @InProceedings{Miller:2005:MTS,
  author = {Keith J. Miller and Michelle Vanni},
-  title = {Inter-rater Agreement Measures, and the Refinement of Metrics in the {PLATO} {MT} Evaluation             Paradigm},
+  title = {Inter-rater Agreement Measures, and the Refinement of Metrics in the {PLATO} {MT} Evaluation Paradigm},
-  url = {http://www.dtic.mil/cgi-bin/GetTRDoc?AD=ADA456393},
-  googlescholar = {4922588981188199265},
  month = {September},
  year = 2005
 }
@@ -2177,4 +2082,4 @@ title = {Measuring Translation Quality by Testing English Speakers with a New De
 %%%%% chapter 18------------------------------------------------------
 %%%%% chapter 18------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ No newline at end of file
--- a/mt-book-xelatex.tex
+++ b/mt-book-xelatex.tex
@@ -135,7 +135,7 @@
 %\include{Chapter2/chapter2}
 %\include{Chapter3/chapter3}
 %\include{Chapter4/chapter4}
-\include{Chapter5/chapter5}
+%\include{Chapter5/chapter5}
 %\include{Chapter6/chapter6}
 %\include{Chapter7/chapter7}
 %\include{Chapter8/chapter8}
@@ -149,7 +149,7 @@
 %\include{Chapter16/chapter16}
 %\include{Chapter17/chapter17}
 %\include{Chapter18/chapter18}
-%\include{ChapterAppend/chapterappend}
+\include{ChapterAppend/chapterappend}
 %----------------------------------------------------------------------------------------

--- a/structure.tex
+++ b/structure.tex
@@ -76,7 +76,7 @@
 %	BIBLIOGRAPHY AND INDEX
 %----------------------------------------------------------------------------------------
-\usepackage[style=numeric,citestyle=numeric,sorting=nyt,sortcites=true,maxbibnames=40,minbibnames=30,autopunct=true,babel=hyphen,hyperref=true,abbreviate=false,backref=true,backend=biber,autocite=plain]{biblatex}
+\usepackage[style=numeric,citestyle=numeric,sorting=none,sortcites=true,maxbibnames=40,minbibnames=30,autopunct=true,babel=hyphen,hyperref=true,abbreviate=false,backref=true,backend=biber,autocite=plain]{biblatex}
 %maxbibnames 设置参考文献最多显示作者数目
 %minbibnames 如果作者数目超过maxbibnames，则只显示minbibnames个作者
 \addbibresource{bibliography.bib} % BibTeX bibliography file