Commit ec27b648 by zhoutao

modified chapter3 and bib

parent cd443b76
......@@ -8,8 +8,8 @@
\node[anchor=west,hide](yn-1)at([xshift=2em]dots.east){$y_{m-1}$};
\node[anchor=west,hide](yn)at([xshift=2em]yn-1.east){$y_m$};
\node[anchor=north,draw,line width=1pt,inner sep=2pt,fill=red!30,minimum height=2em,minimum width=12em](see)at ([yshift=-3em,xshift=2em]y3.south){$\mathbf{X}=(x_1,x_2,\ldots,x_{m-1},x_m)$};
\node[anchor=south,font=\footnotesize] at ([yshift=1em,xshift=2em]y3.north){hidden state sequence to be predicted};
\node[anchor=north,draw,line width=1pt,inner sep=2pt,fill=red!30,minimum height=2em,minimum width=12em](see)at ([yshift=-3em,xshift=2em]y3.south){${X}=(x_1,x_2,\ldots,x_{m-1},x_m)$};
\node[anchor=south,font=\footnotesize] at ([yshift=1em,xshift=2em]y3.north){hidden state sequence to be predicted};
\node[anchor=north,font=\footnotesize] at ([yshift=-1em]see.south){observable state sequence};
\draw[line width=1pt] (y1.east) -- (y2.west);
......
......@@ -57,17 +57,17 @@
\end{pgfonlayer}
\node [anchor=north west] (math1) at ([xshift=2em]treebank.north east) {$\funp{P}$(VP $\to$ VV NN)};
\node [anchor=north west] (math1part2) at ([xshift=-1em,yshift=0.2em]math1.south west) {$=\frac{\textrm{times “VP” and “VV NN” co-occur}=1}{\textrm{times “VP” occurs}=4}$};
\node [anchor=north west] (math1part2) at ([xshift=-1em,yshift=0.2em]math1.south west) {$=\frac{\textrm{times VP and VV NN co-occur}=1}{\textrm{times VP occurs}=4}$};
\node [anchor=north west] (math1part3) at ([yshift=0.2em]math1part2.south west){$=\frac{1}{4}$};
\node [anchor=north west] (math2) at ([yshift=-6em]math1.north west) {$\funp{P}$(NP $\to$ NN)};
\node [anchor=north west] (math2part2) at ([xshift=-1em,yshift=0.2em]math2.south west) {$=\frac{\textrm{times “NP” and “NN” co-occur}=2}{\textrm{times “NP” occurs}=3}$};
\node [anchor=north west] (math2part2) at ([xshift=-1em,yshift=0.2em]math2.south west) {$=\frac{\textrm{times NP and NN co-occur}=2}{\textrm{times NP occurs}=3}$};
\node [anchor=north west] (math2part3) at ([yshift=0.2em]math2part2.south west){$=\frac{2}{3}$};
\node [anchor=north west] (math3) at ([yshift=-6em]math2.north west) {$\funp{P}$(IP $\to$ NP NP)};
\node [anchor=north west] (math3part2) at ([xshift=-1em,yshift=0.2em]math3.south west) {$=\frac{\textrm{times “IP” and “NP NP” co-occur}=0}{\textrm{times “IP” occurs}=3}$};
\node [anchor=north west] (math3part2) at ([xshift=-1em,yshift=0.2em]math3.south west) {$=\frac{\textrm{times IP and NP NP co-occur}=0}{\textrm{times IP occurs}=3}$};
\node [anchor=north west] (math3part3) at ([yshift=0.2em]math3part2.south west){$=\frac{0}{3}$};
\begin{pgfonlayer}{background}
......
......@@ -2,15 +2,15 @@
\tikzstyle{unit} = [draw,circle,line width=0.8pt,align=center,fill=green!30,minimum size=1em]
\node[minimum width=3em,minimum height=1.8em] (o) at (0,0){};
\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_A) at ([xshift=-0em,yshift=-1em]o.south){hidden state A};
\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_B) at ([yshift=-1.6em]state_A.south){hidden state B};
\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_C) at ([yshift=-1.6em]state_B.south){hidden state C};
\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_D) at ([yshift=-1.6em]state_C.south){hidden state D};
\node[anchor=west,inner sep=1pt,font=\footnotesize] (c1) at ([yshift=0.2em,xshift=2em]o.east){T};
\node[anchor=west,inner sep=1pt,font=\footnotesize] (c2) at ([xshift=5em]c1.east){F};
\node[anchor=west,inner sep=1pt,font=\footnotesize] (c3) at ([xshift=5em]c2.east){F};
\node[anchor=west,inner sep=1pt,font=\footnotesize] (c4) at ([xshift=5em]c3.east){T};
\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_A) at ([xshift=-0em,yshift=-1em]o.south){hidden state $A$};
\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_B) at ([yshift=-1.6em]state_A.south){hidden state $B$};
\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_C) at ([yshift=-1.6em]state_B.south){hidden state $C$};
\node[anchor=north,inner sep=1pt,font=\footnotesize] (state_D) at ([yshift=-1.6em]state_C.south){hidden state $D$};
\node[anchor=west,inner sep=1pt,font=\footnotesize] (c1) at ([yshift=0.2em,xshift=2em]o.east){$T$};
\node[anchor=west,inner sep=1pt,font=\footnotesize] (c2) at ([xshift=5em]c1.east){$F$};
\node[anchor=west,inner sep=1pt,font=\footnotesize] (c3) at ([xshift=5em]c2.east){$F$};
\node[anchor=west,inner sep=1pt,font=\footnotesize] (c4) at ([xshift=5em]c3.east){$T$};
\node[anchor=south,font=\scriptsize] (cl1) at (c1.north) {time step 1};
\node[anchor=south,font=\scriptsize] (cl2) at (c2.north) {time step 2};
\node[anchor=south,font=\scriptsize] (cl3) at (c3.north) {time step 3};
......
......@@ -74,7 +74,7 @@
Linguist: & wrong && wrong \\
Us: & seems right & fairly sure & unlikely \\
Parser: & $\textrm{P}=0.2$ & $\textrm{P}=0.6$ & $\textrm{P}=0.1$
Parser: & $\funp{P}=0.2$ & $\funp{P}=0.6$ & $\funp{P}=0.1$
\end{tabular}
%---------------------------------------------------------------------
......
......@@ -2,13 +2,13 @@
\begin{scope}
\node[minimum width=3em,minimum height=1.5em] (o) at (0,0){};
\node[anchor=west,inner sep=0pt] (ca) at ([yshift=0.2em,xshift=1.4em]o.east){\scriptsize\bfnew{Coin A}};
\node[anchor=west,inner sep=0pt] (cb) at ([xshift=1.4em]ca.east){\scriptsize\bfnew{Coin B}};
\node[anchor=west,inner sep=0pt] (cc) at ([xshift=1.4em]cb.east){\scriptsize\bfnew{Coin C}};
\node[anchor=west,inner sep=0pt] (ca) at ([yshift=0.2em,xshift=1.4em]o.east){\scriptsize\bfnew{Coin $\boldsymbol A$}};
\node[anchor=west,inner sep=0pt] (cb) at ([xshift=1.4em]ca.east){\scriptsize\bfnew{Coin $\boldsymbol B$}};
\node[anchor=west,inner sep=0pt] (cc) at ([xshift=1.4em]cb.east){\scriptsize\bfnew{Coin $\boldsymbol C$}};
\node[anchor=north,inner sep=0pt] (ra) at ([yshift=-0.6em,xshift=-0.4em]o.south){\scriptsize\bfnew{Coin A}};
\node[anchor=north,inner sep=0pt] (rb) at ([yshift=-1.4em]ra.south){\scriptsize\bfnew{Coin B}};
\node[anchor=north,inner sep=0pt] (rc) at ([yshift=-1.4em]rb.south){\scriptsize\bfnew{Coin C}};
\node[anchor=north,inner sep=0pt] (ra) at ([yshift=-0.6em,xshift=-0.4em]o.south){\scriptsize\bfnew{Coin $\boldsymbol A$}};
\node[anchor=north,inner sep=0pt] (rb) at ([yshift=-1.4em]ra.south){\scriptsize\bfnew{Coin $\boldsymbol B$}};
\node[anchor=north,inner sep=0pt] (rc) at ([yshift=-1.4em]rb.south){\scriptsize\bfnew{Coin $\boldsymbol C$}};
\node[anchor=north,inner sep=0pt] (n11) at ([yshift=-0.9em]ca.south){\small{$\frac{1}{3}$}};
\node[anchor=north,inner sep=0pt] (n21) at ([yshift=-1em]n11.south){\small{$\frac{1}{3}$}};
......@@ -38,9 +38,9 @@
\node[anchor=west,inner sep=0pt] (ca) at ([yshift=0.2em,xshift=1.4em]o.east){\scriptsize\bfnew{Heads}};
\node[anchor=west,inner sep=0pt] (cb) at ([xshift=1.4em]ca.east){\scriptsize\bfnew{Tails}};
\node[anchor=north,inner sep=0pt] (ra) at ([yshift=-0.6em,xshift=-0.4em]o.south){\scriptsize\bfnew{Coin A}};
\node[anchor=north,inner sep=0pt] (rb) at ([yshift=-1.5em]ra.south){\scriptsize\bfnew{Coin B}};
\node[anchor=north,inner sep=0pt] (rc) at ([yshift=-1.5em]rb.south){\scriptsize\bfnew{Coin C}};
\node[anchor=north,inner sep=0pt] (ra) at ([yshift=-0.6em,xshift=-0.4em]o.south){\scriptsize\bfnew{Coin $\boldsymbol A$}};
\node[anchor=north,inner sep=0pt] (rb) at ([yshift=-1.5em]ra.south){\scriptsize\bfnew{Coin $\boldsymbol B$}};
\node[anchor=north,inner sep=0pt] (rc) at ([yshift=-1.5em]rb.south){\scriptsize\bfnew{Coin $\boldsymbol C$}};
\node[anchor=north,inner sep=0pt] (n11) at ([yshift=-1.2em]ca.south){\footnotesize{$0.3$}};
\node[anchor=north,inner sep=0pt] (n21) at ([yshift=-1.7em]n11.south){\footnotesize{$0.5$}};
......@@ -57,6 +57,6 @@
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.5em,rounded corners=2pt,fill=red!10] [fit = (o)(n32)(rc)(cb) ] (box1) {};
\end{pgfonlayer}
\node[anchor=south] at (box1.north){\scriptsize{emission probability $\funp{P}$(observable state|hidden state)}};
\node[anchor=south] at (box1.north){\scriptsize{emission probability $\funp{P}$(observable state|hidden state)}};
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\relax
\providecommand\zref@newlabel[2]{}
\providecommand\hyper@newdestlabel[2]{}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {chapter}{\numberline {1}词法分析和语法分析基础}{11}{chapter.1}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\defcounter {refsection}{0}\relax }\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1.1}问题概述}{11}{section.1.1}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces 机器翻译系统的结构\relax }}{12}{figure.caption.3}\protected@file@percent }
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:3.1-1}{{1.1}{12}{机器翻译系统的结构\relax }{figure.caption.3}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {机器翻译系统被看作一个黑盒}}}{12}{figure.caption.3}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {机器翻译系统 = 前/后处理 + 翻译引擎}}}{12}{figure.caption.3}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.2}{\ignorespaces 汉语句子“猫喜欢吃鱼”的分析结果(分词和句法分析)\relax }}{12}{figure.caption.4}\protected@file@percent }
\newlabel{fig:3.1-2}{{1.2}{12}{汉语句子“猫喜欢吃鱼”的分析结果(分词和句法分析)\relax }{figure.caption.4}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1.2}中文分词}{13}{section.1.2}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.3}{\ignorespaces 一个简单的预处理流程\relax }}{13}{figure.caption.5}\protected@file@percent }
\newlabel{fig:3.2-1}{{1.3}{13}{一个简单的预处理流程\relax }{figure.caption.5}{}}
\zref@newlabel{mdf@pagelabel-1}{\default{1.2}\page{14}\abspage{14}\mdf@pagevalue{14}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.2.1}基于词典的分词方法}{14}{subsection.1.2.1}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.4}{\ignorespaces 基于词典进行分词的实例\relax }}{15}{figure.caption.6}\protected@file@percent }
\newlabel{fig:3.2-2}{{1.4}{15}{基于词典进行分词的实例\relax }{figure.caption.6}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.5}{\ignorespaces 交叉型分词歧义\relax }}{15}{figure.caption.7}\protected@file@percent }
\newlabel{fig:3.2-3}{{1.5}{15}{交叉型分词歧义\relax }{figure.caption.7}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.2.2}基于统计的分词方法}{16}{subsection.1.2.2}\protected@file@percent }
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{1. 统计模型的学习与推断}{16}{section*.8}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.6}{\ignorespaces 基于统计的自动分词流程\relax }}{16}{figure.caption.9}\protected@file@percent }
\newlabel{fig:3.2-4}{{1.6}{16}{基于统计的自动分词流程\relax }{figure.caption.9}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{2. 全概率分词方法}{16}{section*.10}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.7}{\ignorespaces 基于1-gram语言模型的中文分词实例\relax }}{17}{figure.caption.11}\protected@file@percent }
\newlabel{fig:3.2-5}{{1.7}{17}{基于1-gram语言模型的中文分词实例\relax }{figure.caption.11}{}}
\newlabel{eq:3.2-1}{{1.1}{17}{2. 全概率分词方法}{equation.1.2.1}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1.3}命名实体识别}{18}{section.1.3}\protected@file@percent }
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}序列标注任务}{18}{subsection.1.3.1}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.8}{\ignorespaces BIO和BIOES格式对比\relax }}{19}{figure.caption.12}\protected@file@percent }
\newlabel{fig:3.3-1}{{1.8}{19}{BIO和BIOES格式对比\relax }{figure.caption.12}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {BIO格式标注命名实体}}}{19}{figure.caption.12}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {BIOES格式标注命名实体}}}{19}{figure.caption.12}\protected@file@percent }
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}基于特征的统计学习}{19}{subsection.1.3.2}\protected@file@percent }
\newlabel{sec3:feature}{{1.3.2}{19}{基于特征的统计学习}{subsection.1.3.2}{}}
\@writefile{lot}{\defcounter {refsection}{0}\relax }\@writefile{lot}{\contentsline {table}{\numberline {1.1}{\ignorespaces 命名实体识别中常用的特征\relax }}{20}{table.caption.13}\protected@file@percent }
\newlabel{tab:3.3-1}{{1.1}{20}{命名实体识别中常用的特征\relax }{table.caption.13}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}基于概率图模型的方法}{21}{subsection.1.3.3}\protected@file@percent }
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{1. 隐马尔可夫模型}{21}{section*.14}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.9}{\ignorespaces “抛硬币”游戏中的转移概率和发射概率\relax }}{22}{figure.caption.15}\protected@file@percent }
\newlabel{fig:3.3-2}{{1.9}{22}{“抛硬币”游戏中的转移概率和发射概率\relax }{figure.caption.15}{}}
\newlabel{eq:joint-prob-xy}{{1.2}{22}{1. 隐马尔可夫模型}{equation.1.3.2}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.10}{\ignorespaces 抛硬币的隐马尔可夫模型实例\relax }}{23}{figure.caption.16}\protected@file@percent }
\newlabel{fig:3.3-3}{{1.10}{23}{抛硬币的隐马尔可夫模型实例\relax }{figure.caption.16}{}}
\newlabel{eq:3.3-1}{{1.3}{23}{1. 隐马尔可夫模型}{equation.1.3.3}{}}
\newlabel{eq:3.3-2}{{1.4}{23}{1. 隐马尔可夫模型}{equation.1.3.4}{}}
\newlabel{eq:3.3-3}{{1.5}{24}{1. 隐马尔可夫模型}{equation.1.3.5}{}}
\newlabel{eq:markov-sequence-argmax}{{1.6}{24}{1. 隐马尔可夫模型}{equation.1.3.6}{}}
\newlabel{eq:3.3-4}{{1.6}{24}{1. 隐马尔可夫模型}{equation.1.3.6}{}}
\newlabel{eq:3.3-5}{{1.7}{24}{1. 隐马尔可夫模型}{equation.1.3.7}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.11}{\ignorespaces 基于隐马尔可夫模型的命名实体识别(解码过程)\relax }}{24}{figure.caption.17}\protected@file@percent }
\newlabel{fig:3.3-4}{{1.11}{24}{基于隐马尔可夫模型的命名实体识别(解码过程)\relax }{figure.caption.17}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{2. 条件随机场}{24}{section*.18}\protected@file@percent }
\newlabel{eq:3.3-6}{{1.8}{24}{2. 条件随机场}{equation.1.3.8}{}}
\newlabel{eq:3.3-7}{{1.9}{24}{2. 条件随机场}{equation.1.3.8}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.12}{\ignorespaces 隐马尔可夫实例\relax }}{25}{figure.caption.19}\protected@file@percent }
\newlabel{fig:3.3-5}{{1.12}{25}{隐马尔可夫实例\relax }{figure.caption.19}{}}
\newlabel{eq:3.3-8}{{1.10}{25}{2. 条件随机场}{equation.1.3.10}{}}
\newlabel{eq:3.3-9}{{1.11}{26}{2. 条件随机场}{equation.1.3.11}{}}
\newlabel{eq:3.3-10}{{1.12}{26}{2. 条件随机场}{equation.1.3.12}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.13}{\ignorespaces 条件随机场模型处理序列问题\relax }}{26}{figure.caption.20}\protected@file@percent }
\newlabel{fig:3.3-6}{{1.13}{26}{条件随机场模型处理序列问题\relax }{figure.caption.20}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.4}基于分类器的方法}{27}{subsection.1.3.4}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.14}{\ignorespaces HMM、CRF、分类算法三种方法对比\relax }}{27}{figure.caption.21}\protected@file@percent }
\newlabel{fig:3.3-7}{{1.14}{27}{HMM、CRF、分类算法三种方法对比\relax }{figure.caption.21}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {HMM处理序列标注}}}{27}{figure.caption.21}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {CRF处理序列标注}}}{27}{figure.caption.21}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {分类模型处理序列标注}}}{27}{figure.caption.21}\protected@file@percent }
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{1. 分类任务与分类器}{27}{section*.22}\protected@file@percent }
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{2. 经典的分类模型}{28}{section*.23}\protected@file@percent }
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1.4}句法分析(短语结构分析)}{29}{section.1.4}\protected@file@percent }
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.4.1}句法树}{29}{subsection.1.4.1}\protected@file@percent }
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.15}{\ignorespaces 短语结构树(左)和依存树(右)\relax }}{30}{figure.caption.24}\protected@file@percent }
\newlabel{fig:3.4-1}{{1.15}{30}{短语结构树(左)和依存树(右)\relax }{figure.caption.24}{}}
\zref@newlabel{mdf@pagelabel-2}{\default{1.4.1}\page{30}\abspage{30}\mdf@pagevalue{30}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.4.2}上下文无关文法}{31}{subsection.1.4.2}\protected@file@percent }
\zref@newlabel{mdf@pagelabel-3}{\default{1.4.2}\page{31}\abspage{31}\mdf@pagevalue{31}}
\newlabel{eq:3.4-1}{{1.13}{31}{上下文无关文法}{equation.1.4.13}{}}
\newlabel{eq:3.4-2}{{1.13}{31}{上下文无关文法}{equation.1.4.13}{}}
\newlabel{eq:3.4-3}{{1.13}{31}{上下文无关文法}{equation.1.4.13}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.16}{\ignorespaces 一个示例文法的规则集\relax }}{32}{figure.caption.25}\protected@file@percent }
\newlabel{fig:3.4-2}{{1.16}{32}{一个示例文法的规则集\relax }{figure.caption.25}{}}
\zref@newlabel{mdf@pagelabel-4}{\default{1.4.2}\page{32}\abspage{32}\mdf@pagevalue{32}}
\zref@newlabel{mdf@pagelabel-5}{\default{1.4.2}\page{32}\abspage{32}\mdf@pagevalue{32}}
\zref@newlabel{mdf@pagelabel-6}{\default{1.4.2}\page{33}\abspage{33}\mdf@pagevalue{33}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.17}{\ignorespaces 上下文无关文法推导实例\relax }}{33}{figure.caption.26}\protected@file@percent }
\newlabel{fig:3.4-3}{{1.17}{33}{上下文无关文法推导实例\relax }{figure.caption.26}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.18}{\ignorespaces 同一棵句法树对应的不同规则推导\relax }}{34}{figure.caption.27}\protected@file@percent }
\newlabel{fig:3.4-4}{{1.18}{34}{同一棵句法树对应的不同规则推导\relax }{figure.caption.27}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.19}{\ignorespaces 如何选择最佳的句法分析结果 - 专家、普通人和句法分析器的视角\relax }}{34}{figure.caption.28}\protected@file@percent }
\newlabel{fig:3.4-5}{{1.19}{34}{如何选择最佳的句法分析结果 - 专家、普通人和句法分析器的视角\relax }{figure.caption.28}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.20}{\ignorespaces 不同推导(句法树)对应的概率值\relax }}{35}{figure.caption.29}\protected@file@percent }
\newlabel{fig:3.4-6}{{1.20}{35}{不同推导(句法树)对应的概率值\relax }{figure.caption.29}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {1.4.3}规则和推导的概率}{35}{subsection.1.4.3}\protected@file@percent }
\zref@newlabel{mdf@pagelabel-7}{\default{1.4.3}\page{35}\abspage{35}\mdf@pagevalue{35}}
\newlabel{eq:3.4-4}{{1.13}{35}{规则和推导的概率}{equation.1.4.13}{}}
\newlabel{eq:3.4-5}{{1.14}{36}{规则和推导的概率}{equation.1.4.14}{}}
\newlabel{eq:3.4-6}{{1.15}{36}{规则和推导的概率}{equation.1.4.15}{}}
\newlabel{eq:3.4-7}{{1.15}{36}{规则和推导的概率}{equation.1.4.15}{}}
\newlabel{eq:3.4-8}{{1.16}{36}{规则和推导的概率}{equation.1.4.16}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.21}{\ignorespaces 上下文无关文法规则概率估计\relax }}{37}{figure.caption.30}\protected@file@percent }
\newlabel{fig:3.4-7}{{1.21}{37}{上下文无关文法规则概率估计\relax }{figure.caption.30}{}}
\@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1.22}{\ignorespaces 统计句法分析的流程\relax }}{37}{figure.caption.31}\protected@file@percent }
\newlabel{fig:3.4-8}{{1.22}{37}{统计句法分析的流程\relax }{figure.caption.31}{}}
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1.5}小结及深入阅读}{37}{section.1.5}\protected@file@percent }
\newlabel{sec3:summary}{{1.5}{37}{小结及深入阅读}{section.1.5}{}}
\@setckpt{Chapter3/chapter3}{
\setcounter{page}{39}
\setcounter{equation}{17}
\setcounter{enumi}{0}
\setcounter{enumii}{0}
\setcounter{enumiii}{0}
\setcounter{enumiv}{0}
\setcounter{footnote}{9}
\setcounter{mpfootnote}{0}
\setcounter{part}{0}
\setcounter{chapter}{1}
\setcounter{section}{5}
\setcounter{subsection}{0}
\setcounter{subsubsection}{0}
\setcounter{paragraph}{0}
\setcounter{subparagraph}{0}
\setcounter{figure}{22}
\setcounter{table}{1}
\setcounter{tabx@nest}{0}
\setcounter{listtotal}{0}
\setcounter{listcount}{0}
\setcounter{liststart}{0}
\setcounter{liststop}{0}
\setcounter{citecount}{0}
\setcounter{citetotal}{0}
\setcounter{multicitecount}{0}
\setcounter{multicitetotal}{0}
\setcounter{instcount}{45}
\setcounter{maxnames}{3}
\setcounter{minnames}{1}
\setcounter{maxitems}{3}
\setcounter{minitems}{1}
\setcounter{citecounter}{0}
\setcounter{maxcitecounter}{0}
\setcounter{savedcitecounter}{0}
\setcounter{uniquelist}{0}
\setcounter{uniquename}{0}
\setcounter{refsection}{0}
\setcounter{refsegment}{0}
\setcounter{maxextratitle}{0}
\setcounter{maxextratitleyear}{0}
\setcounter{maxextraname}{2}
\setcounter{maxextradate}{0}
\setcounter{maxextraalpha}{0}
\setcounter{abbrvpenalty}{50}
\setcounter{highnamepenalty}{50}
\setcounter{lownamepenalty}{25}
\setcounter{maxparens}{3}
\setcounter{parenlevel}{0}
\setcounter{mincomprange}{10}
\setcounter{maxcomprange}{100000}
\setcounter{mincompwidth}{1}
\setcounter{afterword}{0}
\setcounter{savedafterword}{0}
\setcounter{annotator}{0}
\setcounter{savedannotator}{0}
\setcounter{author}{0}
\setcounter{savedauthor}{0}
\setcounter{bookauthor}{0}
\setcounter{savedbookauthor}{0}
\setcounter{commentator}{0}
\setcounter{savedcommentator}{0}
\setcounter{editor}{0}
\setcounter{savededitor}{0}
\setcounter{editora}{0}
\setcounter{savededitora}{0}
\setcounter{editorb}{0}
\setcounter{savededitorb}{0}
\setcounter{editorc}{0}
\setcounter{savededitorc}{0}
\setcounter{foreword}{0}
\setcounter{savedforeword}{0}
\setcounter{holder}{0}
\setcounter{savedholder}{0}
\setcounter{introduction}{0}
\setcounter{savedintroduction}{0}
\setcounter{namea}{0}
\setcounter{savednamea}{0}
\setcounter{nameb}{0}
\setcounter{savednameb}{0}
\setcounter{namec}{0}
\setcounter{savednamec}{0}
\setcounter{translator}{0}
\setcounter{savedtranslator}{0}
\setcounter{shortauthor}{0}
\setcounter{savedshortauthor}{0}
\setcounter{shorteditor}{0}
\setcounter{savedshorteditor}{0}
\setcounter{labelname}{0}
\setcounter{savedlabelname}{0}
\setcounter{institution}{0}
\setcounter{savedinstitution}{0}
\setcounter{lista}{0}
\setcounter{savedlista}{0}
\setcounter{listb}{0}
\setcounter{savedlistb}{0}
\setcounter{listc}{0}
\setcounter{savedlistc}{0}
\setcounter{listd}{0}
\setcounter{savedlistd}{0}
\setcounter{liste}{0}
\setcounter{savedliste}{0}
\setcounter{listf}{0}
\setcounter{savedlistf}{0}
\setcounter{location}{0}
\setcounter{savedlocation}{0}
\setcounter{organization}{0}
\setcounter{savedorganization}{0}
\setcounter{origlocation}{0}
\setcounter{savedoriglocation}{0}
\setcounter{origpublisher}{0}
\setcounter{savedorigpublisher}{0}
\setcounter{publisher}{0}
\setcounter{savedpublisher}{0}
\setcounter{language}{0}
\setcounter{savedlanguage}{0}
\setcounter{origlanguage}{0}
\setcounter{savedoriglanguage}{0}
\setcounter{pageref}{0}
\setcounter{savedpageref}{0}
\setcounter{textcitecount}{0}
\setcounter{textcitetotal}{0}
\setcounter{textcitemaxnames}{0}
\setcounter{biburlbigbreakpenalty}{100}
\setcounter{biburlbreakpenalty}{200}
\setcounter{biburlnumpenalty}{0}
\setcounter{biburlucpenalty}{0}
\setcounter{biburllcpenalty}{0}
\setcounter{smartand}{1}
\setcounter{bbx:relatedcount}{0}
\setcounter{bbx:relatedtotal}{0}
\setcounter{parentequation}{0}
\setcounter{notation}{0}
\setcounter{dummy}{0}
\setcounter{problem}{0}
\setcounter{exerciseT}{0}
\setcounter{exampleT}{0}
\setcounter{vocabulary}{0}
\setcounter{definitionT}{0}
\setcounter{mdf@globalstyle@cnt}{0}
\setcounter{mdfcountframes}{0}
\setcounter{mdf@env@i}{0}
\setcounter{mdf@env@ii}{0}
\setcounter{mdf@zref@counter}{7}
\setcounter{Item}{0}
\setcounter{Hfootnote}{9}
\setcounter{Hy@AnnotLevel}{0}
\setcounter{bookmark@seq@number}{0}
\setcounter{caption@flags}{0}
\setcounter{continuedfloat}{0}
\setcounter{cp@cnt}{0}
\setcounter{cp@tempcnt}{0}
\setcounter{subfigure}{0}
\setcounter{lofdepth}{1}
\setcounter{subtable}{0}
\setcounter{lotdepth}{1}
\setcounter{@pps}{0}
\setcounter{@ppsavesec}{0}
\setcounter{@ppsaveapp}{0}
\setcounter{tcbbreakpart}{0}
\setcounter{tcblayer}{0}
\setcounter{tcolorbox@number}{0}
\setcounter{section@level}{1}
}
......@@ -60,7 +60,7 @@
\end{figure}
%-------------------------------------------
\parinterval Similarly, the output of a machine translation system can carry the same information. There may even be an extra step after the system outputs the English translation to restore the casing of some English words; for example, the sentence-initial word “Cats” in the example above needs its first letter capitalized.
\parinterval Similarly, the output of a machine translation system can carry the same information. There may even be an extra step after the system outputs the English translation to restore the casing of some English words; for example, the first letter of a sentence-initial word needs to be capitalized.
\parinterval Generally, a text sequence needs to be processed before being fed into a machine translation system; this step is called {\small\bfnew{preprocessing}}\index{预处理}(Preprocessing)\index{Preprocessing}. Likewise, the processing applied after the translation model outputs a translation is called {\small\bfnew{postprocessing}}\index{后处理}(Postprocessing)\index{Postprocessing}. Both steps have a large impact on machine translation performance; for a neural machine translation system, for instance, different segmentation strategies can make a world of difference in translation quality.
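To make the pre-/post-processing idea concrete, here is a minimal sketch in Python. It is illustrative only (whitespace tokenization, lowercasing, and a naive sentence-initial recaser), not the pipeline of any real system:

```python
# Minimal pre-/post-processing sketch; assumptions: whitespace tokenization,
# lowercasing as preprocessing, naive recasing as postprocessing.

def preprocess(sentence: str) -> list:
    """Lowercase and tokenize the input before translation."""
    return sentence.lower().split()

def postprocess(tokens: list) -> str:
    """Detokenize and restore sentence-initial capitalization."""
    text = " ".join(tokens)
    return text[:1].upper() + text[1:]

print(preprocess("Cats like eating fish"))              # ['cats', 'like', ...]
print(postprocess(["cats", "like", "eating", "fish"]))  # Cats like eating fish
```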
......@@ -220,7 +220,7 @@ $计算这种切分的概率值。
\parinterval A fully trained statistical model $\funp{P}(\cdot)$ is what we call the segmentation model. For a new input sentence $S$, this model is used to find and output the best segmentation. Suppose the input sentence $S$ is “确实现在数据很多”; the probabilities of the different segmentations can be obtained by enumeration, and the segmentation with the highest probability is the system's target output.
\parinterval This segmentation method is also called segmentation based on a 1-gram language model, or full-probability segmentation\upcite{刘挺1998最大概率分词问题及其解法,丁洁2010基于最大概率分词算法的中文分词方法研究}. Its greatest advantages are simplicity and efficiency, so it is widely used in industrial systems. Since it is essentially a 1-gram language model, the training methods and unknown-word handling of $n$-gram language models can be reused directly. Slightly different from a traditional $n$-gram language model, the prediction step of segmentation must find, among all possible segmentations of the given string, the one with the highest 1-gram language model score. This prediction can be implemented with the search algorithms described in {\chaptertwo}, or the optimal segmentation can be found quickly with dynamic programming. Since the focus of this section is the basic methods and statistical modeling ideas of Chinese word segmentation, the relevant search algorithms are not discussed further; interested readers can refer to {\chaptertwo} and the references in Section~\ref{sec3:summary} of this chapter for deeper study.
\parinterval This segmentation method is also called segmentation based on a 1-gram language model, or full-probability segmentation\upcite{刘挺1998最大概率分词问题及其解法,丁洁2010基于最大概率分词算法的中文分词方法研究}. Its greatest advantages are simplicity and efficiency, so it is widely used in industrial systems. Since it is essentially a 1-gram language model, the training methods and unknown-word handling of $n$-gram language models can be reused directly. Slightly different from a traditional $n$-gram language model, the prediction step of segmentation must find, among all possible segmentations of the given string, the one with the highest 1-gram language model score. This prediction can be implemented with the search algorithms described in {\chaptertwo}, or the optimal segmentation can be found quickly with dynamic programming\upcite{bellman1966dynamic}. Since the focus of this section is the basic methods and statistical modeling ideas of Chinese word segmentation, the relevant search algorithms are not discussed further; interested readers can refer to {\chaptertwo} and the references in Section~\ref{sec3:summary} of this chapter for deeper study.
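As a note for this hunk, the dynamic-programming prediction step mentioned above fits in a few lines of Python. The word probabilities below are invented toy values rather than corpus estimates, and unknown-word smoothing is omitted:

```python
import math

# Toy 1-gram probabilities P(w); a real model is estimated from a corpus
# and needs unknown-word handling. Values are purely illustrative.
P = {"确实": 0.1, "现在": 0.1, "数据": 0.05, "很": 0.2, "多": 0.1, "实现": 0.05}

def segment(s, max_len=4):
    """Return the segmentation maximizing the product of P(w) (log-space)."""
    n = len(s)
    best = [0.0] + [-math.inf] * n   # best[i]: best log-prob of s[:i]
    back = [0] * (n + 1)             # back[i]: start of the last word
    for i in range(1, n + 1):
        for j in range(max(0, i - max_len), i):
            w = s[j:i]
            if w in P and best[j] + math.log(P[w]) > best[i]:
                best[i], back[i] = best[j] + math.log(P[w]), j
    words, i = [], n
    while i > 0:
        words.append(s[back[i]:i])
        i = back[i]
    return words[::-1]

print(segment("确实现在数据很多"))  # ['确实', '现在', '数据', '很', '多']
```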
%----------------------------------------------------------------------------------------
% NEW SECTION
......@@ -260,7 +260,7 @@ $计算这种切分的概率值。
\end{figure}
%-------------------------------------------
%
\parinterval\ref{fig:3.3-1}给出了不同标注格式所对应的标注结果。可以看出文本序列中的非命名实体直接被标注为“O”,而命名实体的标注则被分为了两部分:位置和命名实体类别,图中的“B”、“I”、“E”等标注出了位置信息,而“CIT”和“CNT”则标注出了命名实体类别(“CIT”表示城市,“CNT”表示国家)。可以看到,命名实体的识别结果可以通过BIO、BIOES这类序列标注结果归纳出来:例如在BIOES格式中,标签“B-CNT”后面的标签只会是“I-CNT”或“E-CNT”,而不会是其他的标签。同时,在命名实体识别任务中涉及到实体边界的确定,而“BIO”或“BIOES”的标注格式本身就暗含着边界问题:在“BIO”格式下,实体左边界只能在“B”的左边,右边界只能在“B”或“I”的右边;在“BIOES”格式下,实体左边界只能在“B”或“S”的左边,右边界只能在“E”和“S”的右边
\parinterval\ref{fig:3.3-1}给出了不同标注格式所对应的标注结果。可以看出文本序列中的非命名实体直接被标注为“O”,而命名实体的标注则被分为了两部分:位置和命名实体类别,图中的“B”、“I”、“E”等标注出了位置信息,而“CIT”和“CNT”则标注出了命名实体类别(“CIT”表示城市,“CNT”表示国家)。可以看到,命名实体的识别结果可以通过BIO、BIOES这类序列标注结果归纳出来:例如在BIOES格式中,标签“B-CNT”后面的标签只会是“I-CNT”或“E-CNT”,而不会是其他的标签。同时,在命名实体识别任务中涉及到实体边界的确定,而“BIO”或“BIOES”的标注格式本身就暗含着边界问题:在“BIO”格式下,实体左边界只能在“B”的左侧,右边界只能在“B”或“I”的右侧;在“BIOES”格式下,实体左边界只能在“B”或“S”的左侧,右边界只能在“E”和“S”的右侧
\parinterval 需要注意的是,虽然图\ref{fig:3.3-1}中的命名实体识别以单词为基本单位进行标注,但真实系统中也可以在字序列上进行命名实体识别,其方法与基于词序列的命名实体识别是一样的。因此,这里仍然以基于词序列的方法为例进行介绍。
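The claim that entity spans can be read off the tag sequence is easy to verify in code. A minimal sketch for the BIO scheme (the sentence is invented; tag names follow the figure):

```python
def bio_to_spans(tokens, tags):
    """Recover (start, end, type) entity spans from BIO tags.
    A 'B-X' opens an entity; following 'I-' tags extend it; anything
    else (an 'O' or a new 'B-') closes it."""
    spans, start, etype = [], None, None
    for i, tag in enumerate(tags + ["O"]):        # sentinel closes last span
        if start is not None and not tag.startswith("I-"):
            spans.append((start, i, etype))       # right boundary found
            start = None
        if tag.startswith("B-"):
            start, etype = i, tag[2:]             # left boundary found
    return spans

tokens = ["he", "lives", "in", "New", "York", "City"]
tags   = ["O", "O", "O", "B-CIT", "I-CIT", "I-CIT"]
print(bio_to_spans(tokens, tags))  # [(3, 6, 'CIT')]
```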
......@@ -328,11 +328,11 @@ $计算这种切分的概率值。
\parinterval The hidden Markov model is a classic sequence model\upcite{Baum1966Statistical,baum1970maximization,1996Hidden}. It has been widely applied in speech recognition and in many areas of natural language processing. In essence, a hidden Markov model is a probabilistic Markov process, one that implicitly carries the probabilities of transitions between states and of generating observable states.
\parinterval A simple “coin-tossing” game illustrates these concepts. Suppose there are three coins of different weights, A, B, and C, whose probabilities of landing heads are known to be 0.3, 0.5, and 0.7. In the game, the host picks one of the three coins and tosses it; the probability of each coin being picked may be influenced by which coin was picked last time, and each coin has a different probability of landing heads. Repeating the pick-and-toss process yields a sequence of heads and tails; for example, tossing 6 times might give: heads heads tails tails heads tails. By observing the heads/tails sequence of the 6 tosses, the challenger guesses which coin was picked each time.
\parinterval A simple “coin-tossing” game illustrates these concepts. Suppose there are three coins of different weights, $A$, $B$, and $C$, whose probabilities of landing heads are known to be 0.3, 0.5, and 0.7. In the game, the host picks one of the three coins and tosses it; the probability of each coin being picked may be influenced by which coin was picked last time, and each coin has a different probability of landing heads. Repeating the pick-and-toss process yields a sequence of heads and tails; for example, tossing 6 times might give: heads heads tails tails heads tails. By observing the heads/tails sequence of the 6 tosses, the challenger guesses which coin was picked each time.
\parinterval In the example above, the “heads” or “tails” obtained after each pick-and-toss is an “observable state”; picking and tossing again produces a new “observable state”, and this is a “state transition”. The heads/tails sequence obtained after 6 rounds of picking and tossing is called the observable state sequence, made up of the observable state of each round. In addition, the game implicitly contains a “hidden state sequence” that influences the final “observable state sequence”\ \dash \ the sequence of coins picked each time, for example CBABCA.
\parinterval In the example above, the “heads” or “tails” obtained after each pick-and-toss is an “observable state”; picking and tossing again produces a new “observable state”, and this is a “state transition”. The heads/tails sequence obtained after 6 rounds of picking and tossing is called the observable state sequence, made up of the observable state of each round. In addition, the game implicitly contains a “hidden state sequence” that influences the final “observable state sequence”\ \dash \ the sequence of coins picked each time, for example $CBABCA$.
\parinterval In fact, the key to how hidden Markov models handle sequence problems lies in two crucial probabilistic relations, and both run through the “coin-tossing” game. On the one hand, a hidden Markov model uses the {\small\bfnew{emission probability}}\index{发射概率}(Emission Probability)\index{Emission Probability} to describe the output probability between hidden states and observable states (i.e., the probabilities that A, B, C produce heads are 0.3, 0.5, 0.7); on the other hand, the hidden Markov model also describes the {\small\bfnew{transition probability}}\index{转移概率}(Transition Probability)\index{Transition Probability} between the system's hidden states: in this example, the probability that the state after A is A, B, or C is 1/3 in each case, and the transition probabilities from B or C to A, B, or C are likewise all 1/3. Figure~\ref{fig:3.3-2} shows the transition and emission probabilities in the “coin-tossing” game; both can be viewed as conditional probability matrices.
\parinterval In fact, the key to how hidden Markov models handle sequence problems lies in two crucial probabilistic relations, and both run through the “coin-tossing” game. On the one hand, a hidden Markov model uses the {\small\bfnew{emission probability}}\index{发射概率}(Emission Probability)\index{Emission Probability} to describe the output probability between hidden states and observable states (i.e., the probabilities that $A$, $B$, $C$ produce heads are 0.3, 0.5, 0.7); on the other hand, the hidden Markov model also describes the {\small\bfnew{transition probability}}\index{转移概率}(Transition Probability)\index{Transition Probability} between the system's hidden states: in this example, the probability that the state after $A$ is $A$, $B$, or $C$ is 1/3 in each case, and the transition probabilities from $B$ or $C$ to $A$, $B$, or $C$ are likewise all 1/3. Figure~\ref{fig:3.3-2} shows the transition and emission probabilities in the “coin-tossing” game; both can be viewed as conditional probability matrices.
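For this hunk, the two probability “matrices” of the coin game can be written out directly. A minimal sketch that scores one hidden/observable sequence pair; the uniform initial distribution is an added assumption, since the text only fixes the transition and emission probabilities:

```python
# Transition P(next coin | coin) and emission P(outcome | coin) for the game;
# H = heads, T = tails. The uniform initial distribution is assumed.
coins = "ABC"
trans = {a: {b: 1 / 3 for b in coins} for a in coins}  # all transitions 1/3
emit = {"A": {"H": 0.3, "T": 0.7},
        "B": {"H": 0.5, "T": 0.5},
        "C": {"H": 0.7, "T": 0.3}}
init = {c: 1 / 3 for c in coins}

def joint_prob(hidden, visible):
    """P(X,Y) = P(y_1)P(x_1|y_1) * prod_i P(y_i|y_{i-1})P(x_i|y_i)."""
    p = init[hidden[0]] * emit[hidden[0]][visible[0]]
    for prev, cur, x in zip(hidden, hidden[1:], visible[1:]):
        p *= trans[prev][cur] * emit[cur][x]
    return p

# Coins C B A B C A producing heads heads tails tails heads tails:
print(joint_prob("CBABCA", "HHTTHT"))
```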
%----------------------------------------------
\begin{figure}[htp]
......@@ -356,7 +356,6 @@ $计算这种切分的概率值。
\end{itemize}
The joint probability $\funp{P}(\seq{X},\seq{Y})$ can then be defined as:
\begin{eqnarray}
\funp{P}(\seq{X},\seq{Y}) & = & \funp{P}(\seq{X}|\seq{Y})\funp{P}(\seq{Y}) \nonumber \\
& = & \funp{P}(x_1,...,x_m|y_1,...,y_m) \funp{P}(y_1,...,y_m) \nonumber \\
......@@ -435,7 +434,7 @@ $计算这种切分的概率值。
\begin{figure}[htp]
\centering
\input{./Chapter3/Figures/figure-ner-based-on-hmm}
\caption{Named entity recognition with hidden Markov models (the decoding process)}
\caption{Named entity recognition with hidden Markov models}
\label{fig:3.3-4}
\end{figure}
%-------------------------------------------
......@@ -446,7 +445,7 @@ $计算这种切分的概率值。
\subsubsection{2. Conditional Random Fields}
\parinterval The hidden Markov model makes a strong assumption: the probability of a hidden state occurring is determined solely by the previous hidden state. This assumption can also cause problems. For example, suppose that in some hidden Markov model the hidden state set is \{$A, B, C, D$\} and the observable state set is \{$T, F$\}, where the possible successor hidden states of hidden state A are \{$A, B$\} and the possible successor hidden states of hidden state B are \{$A, B, C, D$\}; then:
\parinterval The hidden Markov model makes a strong assumption: the probability of a hidden state occurring is determined solely by the previous hidden state. This assumption can also cause problems. For example, suppose that in some hidden Markov model the hidden state set is \{$A, B, C, D$\} and the observable state set is \{$T, F$\}, where the possible successor hidden states of hidden state $A$ are \{$A, B$\} and the possible successor hidden states of hidden state $B$ are \{$A, B, C, D$\}; then:
\begin{eqnarray}
\funp{P}(A|A)+\funp{P}(B|A) & = & 1 \label{eq:3.3-6} \\
......@@ -455,7 +454,7 @@ $计算这种切分的概率值。
\noindent where $\funp{P}(b|a)$ denotes the probability of transitioning from state $a$ to state $b$. Since Equation~(\ref{eq:3.3-6}) contains fewer summands than Equation~(\ref{eq:3.3-7}), the values of $\funp{P}(A|A)$ and $\funp{P}(B|A)$ obtained from statistics are very likely to be larger than $\funp{P}(A|B)$, $\funp{P}(B|B)$, $\funp{P}(C|B)$, and $\funp{P}(D|B)$.
\parinterval\ref{fig:3.3-5}展示了一个具体的例子,有一个可见状态序列T F F T,假设初始隐含状态是A,图中线上的概率值是对应的转移概率与发射概率的乘积,比如图中隐含状态A开始,下一个隐含状态是A 且可见状态是F 的概率是0.45,下一个隐含状态是B 且可见状态是F的概率是0.55。图中可以看出,由于有较大的值,当可见状态序列为T F F T时,隐马尔可夫计算出的最有可能的隐含状态序列为A A A A。但是如果对训练集进行统计可能会发现,当可见序列为T F F T 时,对应的隐含状态是A A A A的概率可能是比较大的,但也可能是比较小的。这个例子中出现预测偏差的主要原因是:由于比其他状态转移概率要大得多,隐含状态的预测一直停留在状态A
\parinterval\ref{fig:3.3-5}展示了一个具体的例子,有一个可见状态序列$T F F T$,假设初始隐含状态是$A$,图中线上的概率值是对应的转移概率与发射概率的乘积,比如图中隐含状态$A$开始,下一个隐含状态是$A$且可见状态是$F$的概率是0.45,下一个隐含状态是$B$且可见状态是$F$的概率是0.55。图中可以看出,由于有较大的值,当可见状态序列为$T F F T$时,隐马尔可夫计算出的最有可能的隐含状态序列为$A A A A$。但是如果对训练集进行统计可能会发现,当可见序列为$T F F T$ 时,对应的隐含状态是$A A A A$的概率可能是比较大的,但也可能是比较小的。这个例子中出现预测偏差的主要原因是:由于比其他状态转移概率要大得多,隐含状态的预测一直停留在状态$A$
%----------------------------------------------
\begin{figure}[htp]
......@@ -532,7 +531,7 @@ Z(\seq{X})=\sum_{\seq{Y}}\exp(\sum_{i=1}^m\sum_{j=1}^k\lambda_{j}F_{j}(y_{i-1},y
\parinterval Classification problems of all kinds arise both in everyday life and in research work: picking watermelons requires telling “good melons” from “bad melons”, and an editor receiving a news article must file it under the right category. In fact, in machine learning the definition of a “classification task” is broader and not restricted to the notion of “category”: when predicting on a sample, as long as the set of prediction labels is finite and the predicted labels are discrete, the task can be regarded as classification.
\parinterval Concretely, the goal of a classification task is to train a {\small\bfnew{classifier}}\index{分类器}(Classifier\index{Classifier}), also called a classification model, that predicts a discrete label from the input data. In supervised classification\footnote{Correspondingly, there are also unsupervised and semi-supervised classification tasks, but these are not the focus of this book; readers can consult the references\upcite{周志华2016机器学习,李航2019统计学习方法} for the relevant concepts.}, the training data usually consist of labeled examples of the form $(\seq{x}_i,y_i)$, where $\seq{x}_i=(x_i^1,x_i^2,\ldots,x_i^k)$ is the classifier's input (usually called a training sample) and $x_i^j$ denotes the $j$-th feature of sample $\seq{x}_i$; $y_i$ is the {\small\bfnew{label}}\index{标签}(Label)\index{Label} of the input, reflecting the “category” it belongs to. If the label set has size $n$, the essence of the classification task is to learn from the training data a mapping from the $k$-dimensional sample space to the $n$-dimensional label space. More precisely, the ultimate goal of a classification task is to learn a conditional probability distribution $\funp{P}(y|\seq{x})$, so that for an input $\seq{x}$ the $y$ with the highest probability can be output as the classification result.
\parinterval Concretely, the goal of a classification task is to train a {\small\bfnew{classifier}}\index{分类器}(Classifier\index{Classifier}), also called a classification model, that predicts a discrete label from the input data. In supervised classification\footnote{Correspondingly, there are also unsupervised and semi-supervised classification tasks, but these are not the focus of this book; readers can consult the references\upcite{周志华2016机器学习,李航2019统计学习方法} for the relevant concepts.}, the training data usually consist of labeled examples of the form $(\boldsymbol{x_i},y_i)$, where $\boldsymbol{x_i}=(x_{i1},x_{i2},\ldots,x_{ik})$ is the classifier's input (usually called a training sample) and $x_{ij}$ denotes the $j$-th feature of sample $\boldsymbol{x_i}$; $y_i$ is the {\small\bfnew{label}}\index{标签}(Label)\index{Label} of the input, reflecting the “category” it belongs to. If the label set has size $n$, the essence of the classification task is to learn from the training data a mapping from the $k$-dimensional sample space to the $n$-dimensional label space. More precisely, the ultimate goal of a classification task is to learn a conditional probability distribution $\funp{P}(y|\boldsymbol{x})$, so that for an input $\boldsymbol{x}$ the $y$ with the highest probability can be output as the classification result.
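A toy illustration of such a conditional distribution $\funp{P}(y|\boldsymbol{x})$, using a softmax over linear scores; the weights below are invented, not trained:

```python
import math

def softmax_classify(x, W, labels):
    """Return argmax_y P(y|x) for a linear model: P(y|x) ~ exp(w_y . x)."""
    scores = [sum(w_j * x_j for w_j, x_j in zip(W[y], x)) for y in labels]
    z = sum(math.exp(s) for s in scores)                 # normalizer
    probs = {y: math.exp(s) / z for y, s in zip(labels, scores)}
    return max(probs, key=probs.get), probs

labels = ["B", "I", "O"]                  # n = 3 classes, k = 2 features
W = {"B": [1.5, -0.5], "I": [0.2, 0.8], "O": [-1.0, 0.3]}  # invented weights
x = [1.0, 0.0]                            # one k-dimensional sample
print(softmax_classify(x, W, labels))     # ('B', {...})
```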
\parinterval Like probabilistic graphical models, classification models also rely on feature definitions, in the same form as described in Section~\ref{sec3:feature}, which is not repeated here. Classification tasks are generally divided by the number of classes into binary and multi-class classification. Binary classification is the most classic task and only requires a zero-or-one prediction of the output. Multi-class classification can be handled in several ways, for example by “decomposing” it into several binary classification problems, or by letting the model directly output one of the classes. Named entity recognition usually uses multi-class classification models; under BIO tagging, for instance, there are three classes (B, I, and O). In general, the more classes there are, the harder the classification: BIOES tagging has 5 classes, so with the same classifier it is a harder problem than classification under BIO tagging. On the other hand, more classes help characterize the target problem more precisely, so in practice a balance must be found between the number of classes and the difficulty of classification.
......@@ -626,7 +625,7 @@ Z(\seq{X})=\sum_{\seq{Y}}\exp(\sum_{i=1}^m\sum_{j=1}^k\lambda_{j}F_{j}(y_{i-1},y
\parinterval A syntax tree is an abstraction of a sentence. The tree structure expresses a process of generalizing over the sentence's structure: starting from the leaves of the tree, each tree node can be viewed as one step of abstraction, until finally a root node is formed. How can this process be realized on a computer? This is where formal grammars are needed.

\parinterval Formal grammars are an important tool for analyzing natural language. According to Chomsky's definition\upcite{chomsky2002syntactic}, formal grammars fall into four types: unrestricted grammars (type 0), context-sensitive grammars (type 1), context-free grammars (type 2), and regular grammars (type 3). Different types of grammars have different applications; for example, regular grammars can describe finite-state automata and are therefore also used in systems such as language models. For phrase-structure parsing, the commonly used formalism is the {\small\bfnew{context-free grammar}}\index{上下文无关文法}(Context-Free Grammar)\index{Context-Free Grammar}. The concrete form of a context-free grammar is as follows:
\parinterval Formal grammars are an important tool for analyzing natural language. According to Chomsky's definition\upcite{chomsky1957syntactic}, formal grammars fall into four types: unrestricted grammars (type 0), context-sensitive grammars (type 1), context-free grammars (type 2), and regular grammars (type 3). Different types of grammars have different applications; for example, regular grammars can describe finite-state automata and are therefore also used in systems such as language models. For phrase-structure parsing, the commonly used formalism is the {\small\bfnew{context-free grammar}}\index{上下文无关文法}(Context-Free Grammar)\index{Context-Free Grammar}. The concrete form of a context-free grammar is as follows:
%-------------------------------------------
\vspace{0.5em}
......
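Looping back to the rule-probability figure earlier in this commit, the maximum-likelihood estimation of context-free rule probabilities is a one-liner over treebank counts. A sketch using the counts from that figure:

```python
from collections import Counter

# Rule and left-hand-side counts taken from the treebank figure above;
# P(lhs -> rhs) = count(lhs -> rhs) / count(lhs).
rule_counts = Counter({("VP", ("VV", "NN")): 1,
                       ("NP", ("NN",)): 2,
                       ("IP", ("NP", "NP")): 0})
lhs_counts = {"VP": 4, "NP": 3, "IP": 3}

def rule_prob(lhs, rhs):
    """Maximum-likelihood estimate of a context-free rule probability."""
    return rule_counts[(lhs, tuple(rhs))] / lhs_counts[lhs]

print(rule_prob("VP", ["VV", "NN"]))   # 1/4 = 0.25
print(rule_prob("NP", ["NN"]))         # 2/3
print(rule_prob("IP", ["NP", "NP"]))   # 0/3 = 0.0
```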
......@@ -847,25 +847,28 @@
%%%%% chapter 3------------------------------------------------------
@inproceedings{ng2002discriminative,
title ={On discriminative vs. generative classifiers: A comparison of logistic regression and naive bayes},
author ={Ng, Andrew Y and Jordan, Michael I},
pages ={841--848},
year ={2002}
author = {Andrew Y. Ng and
Michael I. Jordan},
title = {On Discriminative vs. Generative Classifiers: {A} comparison of logistic
regression and naive Bayes},
pages = {841--848},
publisher = {{MIT} Press},
year = {2001},
}
@proceedings{huang2008advanced,
title = {Coling 2008: Advanced Dynamic Programming in Computational Linguistics: Theory, Algorithms and Applications - Tutorial notes},
@inproceedings{huang2008coling,
author = {Huang, Liang},
title = {Coling 2008: Advanced Dynamic Programming in Computational Linguistics: Theory, Algorithms and Applications - Tutorial notes},
year = {2008},
address = {Manchester, UK},
publisher = {Coling 2008 Organizing Committee},
publisher = {International Conference on Computational Linguistics},
}
@book{aho1972theory,
author = {Alfred V. Aho and
Jeffrey D. Ullman},
title = {The theory of parsing, translation, and compiling. 2: Compiling},
publisher = {Prentice-Hall},
author = {Aho, Alfred V and
Ullman, Jeffrey D},
title = {The theory of parsing, translation, and compiling},
publisher = {Prentice-Hall},
year = {1973},
}
......@@ -874,7 +877,7 @@
author={Rau, Lisa F},
pages={29--30},
year={1991},
organization={IEEE Computer Society}
publisher={IEEE Conference on Artificial Intelligence Application},
}
@article{张小衡1997中文机构名称的识别与分析,
......@@ -887,11 +890,16 @@
year={1997},
}
@article{lample2016neural,
title={Neural architectures for named entity recognition},
author={Lample, Guillaume and Ballesteros, Miguel and Subramanian, Sandeep and Kawakami, Kazuya and Dyer, Chris},
journal={arXiv preprint arXiv:1603.01360},
year={2016}
@inproceedings{lample2016neural,
author = {Guillaume Lample and
Miguel Ballesteros and
Sandeep Subramanian and
Kazuya Kawakami and
Chris Dyer},
title = {Neural Architectures for Named Entity Recognition},
pages = {260--270},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2016},
}
@article{Baum1966Statistical,
......@@ -912,7 +920,6 @@
number={1},
pages={164--171},
year={1970},
publisher={JSTOR}
}
@article{1977Maximum,
......@@ -926,19 +933,13 @@
@article{1967Error,
title={Error bounds for convolutional codes and an asymptotically optimum decoding algorithm},
author={ Viterbi, Andrew J. },
journal={IEEE Trans.informat.theory},
journal={IEEE Transactions on Information Theory},
volume={13},
number={2},
pages={260--269},
year={1967},
}
@article{lafferty2001conditional,
title={Conditional random fields: Probabilistic models for segmenting and labeling sequence data},
author={Lafferty, John and McCallum, Andrew and Pereira, Fernando CN},
year={2001}
}
@article{harrington2013机器学习实战,
title={机器学习实战},
author={Harrington, Peter},
......@@ -951,41 +952,36 @@
author = {Brants, Thorsten},
month = apr,
year = {2000},
address = {Seattle, Washington, USA},
publisher = {Association for Computational Linguistics},
doi = {10.3115/974147.974178},
pages = {224--231},
}
@inproceedings{tsuruoka-tsujii-2005-chunk,
title = {Chunk Parsing Revisited},
author = {Tsuruoka, Yoshimasa and
Tsujii, Jun{'}ichi},
author = {Yoshimasa Tsuruoka and
Jun'ichi Tsujii},
month = oct,
year = {2005},
address = {Vancouver, British Columbia},
publisher = {Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
pages = {133--140},
}
@inproceedings{li-etal-2003-news-oriented,
title = {News-Oriented Automatic {C}hinese Keyword Indexing},
title = {News-Oriented Automatic Chinese Keyword Indexing},
author = {Li, Sujian and
Wang, Houfeng and
Yu, Shiwen and
Xin, Chengsheng},
month = jul,
year = {2003},
address = {Sapporo, Japan},
publisher = {Association for Computational Linguistics},
doi = {10.3115/1119250.1119263},
publisher = {Annual Meeting of the Association for Computational Linguistics},
pages = {92--97},
}
@article{2015Bidirectional,
title={Bidirectional LSTM-CRF Models for Sequence Tagging},
author={ Huang, Zhiheng and Xu, Wei and Yu, Kai },
journal={Computer ence},
journal={CoRR},
year={2015},
}
......@@ -1000,10 +996,13 @@
}
@inproceedings{vzukov2018named,
title={Named entity recognition with parallel recurrent neural networks},
author={{\v{Z}}ukov-Gregori{\v{c}}, Andrej and Bachrach, Yoram and Coope, Sam},
pages={69--74},
year={2018}
author = {Andrej Zukov Gregoric and
Yoram Bachrach and
Sam Coope},
title = {Named Entity Recognition With Parallel Recurrent Neural Networks},
pages = {69--74},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2018},
}
@article{Li2020A,
......@@ -1016,29 +1015,196 @@
year={2020},
}
@article{devlin2018bert,
@article{devlin2019bert,
title={{BERT}: Pre-training of deep bidirectional transformers for language understanding},
author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
journal={arXiv preprint arXiv:1810.04805},
year={2018}
}
@article{conneau2019unsupervised,
title={Unsupervised cross-lingual representation learning at scale},
author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
journal={arXiv preprint arXiv:1911.02116},
year={2019}
year={2019},
pages = {4171--4186},
journal = {Annual Meeting of the Association for Computational Linguistics},
}
@inproceedings{conneau2019unsupervised,
author = {Alexis Conneau and
Kartikay Khandelwal and
Naman Goyal and
Vishrav Chaudhary and
Guillaume Wenzek and
Francisco Guzm{\'{a}}n and
Edouard Grave and
Myle Ott and
Luke Zettlemoyer and
Veselin Stoyanov},
title = {Unsupervised Cross-lingual Representation Learning at Scale},
pages = {8440--8451},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2020},
}
@book{chomsky1993lectures,
title={Lectures on government and binding: The Pisa lectures},
author={Chomsky, Noam},
number={9},
year={1993},
publisher={Walter de Gruyter}
}
@inproceedings{DBLP:conf/acl/SennrichHB16a,
author = {Rico Sennrich and
Barry Haddow and
Alexandra Birch},
title = {Neural Machine Translation of Rare Words with Subword Units},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2016},
}
@article{刘挺1998最大概率分词问题及其解法,
title={最大概率分词问题及其解法},
author={刘挺 and 吴岩 and 王开铸},
journal={哈尔滨工业大学学报},
number={06},
pages={37-41},
year={1998},
}
@article{丁洁2010基于最大概率分词算法的中文分词方法研究,
title={基于最大概率分词算法的中文分词方法研究},
author={丁洁},
journal={科技信息},
number={21},
pages={I0075--I0075},
year={2010}
}
@inproceedings{1995University,
  title = {University of Sheffield: Description of the LaSIE-II system as used for MUC-7},
  author = {Kevin Humphreys and
            Robert J. Gaizauskas and
            Saliha Azzam and
            Charles Huyck and
            Brian Mitchell and
            Hamish Cunningham and
            Yorick Wilks},
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year = {1998},
}
@inproceedings{krupka1998isoquest,
title={IsoQuest Inc.: Description of the NetOwl™ Extractor System as Used for MUC-7},
author={Krupka, George and Hausman, Kevin},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year={1998}
}
@inproceedings{DBLP:conf/muc/BlackRM98,
author = {William J. Black and
Fabio Rinaldi and
David Mowatt},
title = {{FACILE:} Description of the {NE} System Used for {MUC-7}},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {1998},
}
@article{1996Hidden,
  title={Hidden Markov models},
  author={Eddy, Sean R},
  journal={Current Opinion in Structural Biology},
  volume={6},
  number={3},
  pages={361--365},
  year={1996},
}
@inproceedings{lafferty2001conditional,
author = {John D. Lafferty and
Andrew McCallum and
Fernando C. N. Pereira},
title = {Conditional Random Fields: Probabilistic Models for Segmenting and
Labeling Sequence Data},
pages = {282--289},
publisher = {International Conference on Machine Learning},
year = {2001},
}
@book{kapur1989maximum,
title={Maximum-entropy models in science and engineering},
author={Kapur, Jagat Narain},
year={1989},
publisher={John Wiley \& Sons}
}
@article{1998Support,
title={Support vector machines},
author={Hearst, Marti A. and Dumais, Susan T and Osuna, Edgar and Platt, John and Scholkopf, Bernhard},
journal={IEEE Intelligent Systems \& Their Applications},
volume={13},
number={4},
pages={18--28},
year={1998},
}
@article{2011Natural,
title={Natural Language Processing (almost) from Scratch},
author={ Collobert, Ronan and Weston, Jason and Bottou, Léon and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel },
journal={Journal of Machine Learning Research},
volume={12},
number={1},
pages={2493--2537},
year={2011},
}
@book{manning2008introduction,
title={Introduction to information retrieval},
author={Manning, Christopher D and Sch{\"u}tze, Hinrich and Raghavan, Prabhakar},
year={2008},
publisher={Cambridge University Press}
}
@article{berger1996maximum,
title={A maximum entropy approach to natural language processing},
author={Berger, Adam and Della Pietra, Stephen A and Della Pietra, Vincent J},
journal={Computational Linguistics},
volume={22},
number={1},
pages={39--71},
year={1996}
}
@book{mitchell1996m,
  title={Machine Learning},
  author={Mitchell, Tom},
  publisher={McGraw-Hill},
  year={1997}
}
@inproceedings{DBLP:conf/acl/OchN02,
author = {Franz Josef Och and
Hermann Ney},
title = {Discriminative Training and Maximum Entropy Models for Statistical
Machine Translation},
pages = {295--302},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2002},
}
@incollection{mohri2008speech,
title={Speech recognition with weighted finite-state transducers},
author={Mohri, Mehryar and Pereira, Fernando and Riley, Michael},
pages={559--584},
year={2008},
publisher={Springer}
}
@article{bellman1966dynamic,
title={Dynamic programming},
author={Bellman, Richard},
journal={Science},
volume={153},
number={3731},
pages={34--37},
year={1966},
publisher={American Association for the Advancement of Science}
}
%%%%% chapter 3------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -1830,10 +1996,10 @@
}
@article{肖桐1991面向统计机器翻译的重对齐方法研究,
title={面向统计机器翻译的重对齐方法研究},
author={肖桐 and
李天宁 and
陈如山 and
朱靖波 and
author={肖桐 and
李天宁 and
陈如山 and
朱靖波 and
王会珍},
journal={中文信息学报},
volume={24},
......@@ -1841,15 +2007,20 @@
year={2010},
}
@article{2005Improving,
title={Improving Statistical Word Alignment with Ensemble Methods},
author={ Wu Hua and Wang Haifeng },
year={2005},
@inproceedings{2005Improving,
author = {Hua Wu and
Haifeng Wang},
title = {Improving Statistical Word Alignment with Ensemble Methods},
volume = {3651},
pages = {462--473},
publisher = {International Joint Conference on Natural Language Processing},
year = {2005}
}
@article{1998Grammar,
title={Grammar Inference and Statistical Machine Translation},
author={Ye-Yi Wang and Jaime Carbonell},
year={1998},
publisher={Carnegie Mellon University}
}
@inproceedings{DBLP:conf/acl-vlc/DaganCG93,
......@@ -1857,8 +2028,10 @@
Kenneth Ward Church and
Willian Gale},
title = {Robust Bilingual Word Alignment for Machine Aided Translation},
publisher = {Very Large Corpora},
year = {1993}
}
@inproceedings{DBLP:conf/naacl/GaleC91,
author = {William A. Gale and
Kenneth Ward Church},
......@@ -1871,7 +2044,7 @@
Benjamin Taskar and
Dan Klein},
title = {Alignment by Agreement},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2006}
}
@inproceedings{DBLP:conf/naacl/DyerCS13,
......@@ -1880,14 +2053,14 @@
Noah A. Smith},
title = {A Simple, Fast, and Effective Reparameterization of {IBM} Model 2},
pages = {644--648},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2013}
}
@article{DBLP:journals/coling/FraserM07,
author = {Alexander M. Fraser and
Daniel Marcu},
title = {Measuring Word Alignment Quality for Statistical Machine Translation},
journal = {Comput. Linguistics},
journal = {Computational Linguistics},
volume = {33},
number = {3},
pages = {293--303},
......@@ -1897,24 +2070,24 @@
author = {John DeNero and
Dan Klein},
title = {Tailoring Word Alignments to Syntactic Machine Translation},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2007}
}
@inproceedings{paul2007all,
author = {Paul C Davis,Zhuli Xie and
author = {Paul C. Davis and Zhuli Xie and
Kevin Small},
publisher={not yet found},
publisher={Machine Translation Summit XI},
title = {All Links are not the Same: Evaluating Word Alignments for Statistical Machine Translation},
year = {2007}
}
@article{黄书剑2009一种错误敏感的词对齐评价方法,
title={一种错误敏感的词对齐评价方法},
author={黄书剑 and
奚宁 and
赵迎功 and
戴新宇 and
author={黄书剑 and
奚宁 and
赵迎功 and
戴新宇 and
陈家骏},
journal={中文信息学报},
volume={23},
......@@ -1925,7 +2098,7 @@
author = {Alexander M. Fraser and
Daniel Marcu},
title = {Measuring Word Alignment Quality for Statistical Machine Translation},
journal = {Comput. Linguistics},
journal = {Computational Linguistics},
volume = {33},
number = {3},
pages = {293--303},
......@@ -1949,7 +2122,7 @@
title = {An Algorithmic Framework for Solving the Decoding Problem in Statistical
Machine Translation},
year = {2004},
publisher = {International Conference on Computational Linguistics}
publisher = {International Conference on Computational Linguistics}
}
@inproceedings{DBLP:conf/naacl/RiedelC09,
author = {Sebastian Riedel and
......@@ -1979,7 +2152,7 @@
Antonina Kolokolova and
Renesa Nizamee},
title = {Complexity of alignment and decoding problems: restrictions and approximations},
journal = {Mach. Transl.},
journal = {Machine Translation},
volume = {29},
number = {3-4},
pages = {163--187},
......@@ -1995,7 +2168,7 @@
author = {Abraham Ittycheriah and
Salim Roukos},
title = {A Maximum Entropy Word Aligner for Arabic-English Machine Translation},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2005}
}
@inproceedings{koehn2003statistical,
......@@ -2003,7 +2176,7 @@
Franz Josef Och and
Daniel Marcu},
title = {Statistical Phrase-Based Translation},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2003}
}
@book{manning1999foundations,
......@@ -2037,6 +2210,7 @@
Hermann Ney and
Christoph Tillmann},
title = {HMM-Based Word Alignment in Statistical Translation},
publisher = {International Conference on Computational Linguistics},
pages = {836--841},
year = {1996}
}
......@@ -2048,14 +2222,6 @@
pages ={733--782},
year ={2013}
}
@inproceedings{vogel1996hmm,
author = {Stephan Vogel and
Hermann Ney and
Christoph Tillmann},
title = {HMM-Based Word Alignment in Statistical Translation},
pages = {836--841},
year = {1996},
}
@inproceedings{1966Decentering,
author = {Brown, Duane C.},
title = {Decentering Distortion of Lenses},
......@@ -2142,7 +2308,7 @@
}
@inproceedings{taskar2005a,
author = {Benjamin Taskar and
Simon Lacoste{-}Julien and
Simon Lacoste-Julien and
Dan Klein},
title = {A Discriminative Matching Approach to Word Alignment},
pages = {73--80},
......@@ -2260,7 +2426,7 @@
year = {2002}
}
@inproceedings{DBLP:conf/acl/WangW98,
author = {Ye{-}Yi Wang and
author = {Ye-Yi Wang and
Alex Waibel},
title = {Modeling with Structures in Statistical Machine Translation},
pages = {1357--1363},
......@@ -2401,21 +2567,21 @@
}
@inproceedings{matthias2012discriminative,
author = {Matthias Huck and
Stephan Peitz and
Markus Freitag and
author = {Matthias Huck and
Stephan Peitz and
Markus Freitag and
Hermann Ney},
title = {Discriminative Reordering Extensions for Hierarchical Phrase-Based Machine Translation },
publisher = {International Conference on Material Engineering and Advanced Manufacturing Technology},
year = {2012}
}
@inproceedings{vinh2009improving,
author = {Vinh Van Nguyen and
Akira Shimazu and
Minh Le Nguyen and
author = {Vinh Van Nguyen and
Akira Shimazu and
Minh Le Nguyen and
Thai Phuong Nguyen},
title = {Improving a Lexicalized Hierarchical Reordering Model Using Maximum Entropy},
publisher = {MT summit XII},
publisher = {Machine Translation Summit XII},
year = {2009}
}
@article{DBLP:journals/coling/BisazzaF16,
......@@ -2489,7 +2655,7 @@
author = {Robert C Moore and
Chris Quirk},
title = {Faster Beam-Search Decoding for Phrasal Statistical Machine Translation},
publisher = {MT Summit XI},
publisher = {Machine Translation Summit XI},
year = {2007}
}
@inproceedings{DBLP:conf/acl/HeafieldKM14,
......@@ -2638,11 +2804,12 @@
pages = {2069--2083},
year = {2016}
}
@article{marcu2006practical,
title={Practical structured learning techniques for natural language processing},
author={Daniel Marcu and Harold Charles Daume},
journal={Ph.D. thesis, University of Southern California, Los Angeles, CA},
year={2006}
@book{marcu2006practical,
title={Practical structured learning techniques for natural language processing},
author={Daum{\'e} III, Harold Charles},
publisher={University of Southern California},
year={2006},
}
@inproceedings{DBLP:conf/iwslt/ZensN08,
author = {Richard Zens and
......@@ -2655,7 +2822,7 @@
}
@inproceedings{DBLP:conf/emnlp/SchwenkCF07,
author = {Holger Schwenk and
Marta R. Costa{-}juss{\`{a}} and
Marta R. Costa-juss{\`{a}} and
Jos{\'{e}} A. R. Fonollosa},
title = {Smooth Bilingual N-Gram Translation},
pages = {430--438},
......@@ -2668,7 +2835,7 @@
George Foster and
Howard Johnson},
title = {Unpacking and Transforming Feature Functions: New Ways to Smooth Phrase Tables},
publisher = {MT Summit},
publisher = {Machine Translation Summit},
year = {2011}
}
@inproceedings{DBLP:conf/coling/DuanSZ10,
......@@ -2677,7 +2844,7 @@
Ming Zhou},
title = {Translation Model Generalization using Probability Averaging for Machine
Translation},
publisher = {Tsinghua University Press},
publisher = {International Conference on Computational Linguistics},
year = {2010}
}
@inproceedings{DBLP:conf/naacl/QuirkM06,
......@@ -2695,7 +2862,7 @@
Adri{\`{a}} de Gispert and
Patrik Lambert and
Jos{\'{e}} A. R. Fonollosa and
Marta R. Costa{-}juss{\`{a}}},
Marta R. Costa-juss{\`{a}}},
title = {\emph{N}-gram-based Machine Translation},
journal = {Computational Linguistics},
volume = {32},
......@@ -2752,7 +2919,7 @@
year = {2007},
}
@inproceedings{DBLP:conf/acl/Callison-BurchBS05,
author = {Chris Callison{-}Burch and
author = {Chris Callison-Burch and
Colin J. Bannard and
Josh Schroeder},
title = {Scaling Phrase-Based Statistical Machine Translation to Larger Corpora
......@@ -2813,7 +2980,7 @@
author ={Chiang David},
journal ={Computational Linguistics},
volume ={33},
number ={2},
number ={2},
pages ={201--228},
year ={2007}
}
......@@ -2860,11 +3027,11 @@
@inproceedings{huang2006statistical,
title ={Statistical syntax-directed translation with extended domain of locality},
author ={Huang, Liang and Knight, Kevin and Joshi, Aravind},
booktitle ={Proceedings of AMTA},
pages ={66--73},
year ={2006},
organization ={Cambridge, MA}
publisher ={Association for Machine Translation in the Americas}
}
@inproceedings{galley2006scalable,
author = {Michel Galley and
Jonathan Graehl and
......@@ -2993,7 +3160,7 @@
title = {Discriminative Induction of Sub-Tree Alignment using Limited Labeled
Data},
pages = {1047--1055},
publisher = {Tsinghua University Press},
publisher = {International Conference on Computational Linguistics},
year = {2010}
}
@inproceedings{liu2009weighted,
......@@ -3123,7 +3290,7 @@
Daniel Marcu},
title = {What's in a translation rule?},
pages = {273--280},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2004}
}
@inproceedings{DBLP:conf/naacl/HuangK06,
......@@ -3131,7 +3298,7 @@
Kevin Knight},
title = {Relabeling Syntax Trees to Improve Syntax-Based Machine Translation
Quality},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2006}
}
@inproceedings{DBLP:conf/emnlp/DeNeefeKWM07,
......@@ -3141,7 +3308,7 @@
Daniel Marcu},
title = {What Can Syntax-Based {MT} Learn from Phrase-Based MT?},
pages = {755--763},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2007}
}
@inproceedings{DBLP:conf/wmt/LiuG08,
......@@ -3149,7 +3316,7 @@
Daniel Gildea},
title = {Improved Tree-to-String Transducer for Machine Translation},
pages = {62--69},
publisher = {The Association for Computational Linguistics},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2008}
}
@inproceedings{DBLP:conf/acl/LiuLL06,
......@@ -3186,7 +3353,7 @@
author = {Min Zhang and Hongfei Jiang and Ai Ti Aw and Jun Sun and Sheng Li and Chew Lim Tan},
title = {A Tree-to-Tree Alignment-based Model for Statistical Machine Translation},
year = {2007},
publisher = {MT-Summit}
publisher = {Machine Translation Summit}
}
@inproceedings{DBLP:conf/acl/LiuLL09,
author = {Yang Liu and
......@@ -3290,7 +3457,7 @@
Yu Zhou and
Chengqing Zong},
title = {Unsupervised Tree Induction for Tree-based Translation},
journal = {Trans. Assoc. Comput. Linguistics},
journal = {Transactions of the Association for Computational Linguistics},
volume = {1},
pages = {243--254},
year = {2013}
......@@ -3386,7 +3553,7 @@
@inproceedings{DBLP:conf/coling/TuLHLL10,
author = {Zhaopeng Tu and
Yang Liu and
Young{-}Sook Hwang and
Young-Sook Hwang and
Qun Liu and
Shouxun Lin},
title = {Dependency Forest for Statistical Machine Translation},
......@@ -3402,7 +3569,7 @@
organization ={The Institute of Electrical and Electronics Engineers}
}
@inproceedings{rosti2007combining,
author = {Antti{-}Veikko I. Rosti and
author = {Antti-Veikko I. Rosti and
Necip Fazil Ayan and
Bing Xiang and
Spyridon Matsoukas and
......@@ -3445,7 +3612,7 @@
year = {2008}
}
@inproceedings{Li2009Incremental,
author = {Chi{-}Ho Li and
author = {Chi-Ho Li and
Xiaodong He and
Yupeng Liu and
Ning Xi},
......@@ -3468,7 +3635,7 @@
author = {Mu Li and
Nan Duan and
Dongdong Zhang and
Chi{-}Ho Li and
Chi-Ho Li and
Ming Zhou},
title = {Collaborative Decoding: Partial Hypothesis Re-ranking Using Translation
Consensus between Decoders},
......@@ -3520,14 +3687,14 @@
@article{brown1992class,
title={Class-based n-gram models of natural language},
author={Peter F. Brown and
Vincent J. Della Pietra and
Peter V. deSouza and
Jenifer C. Lai and
Robert L. Mercer},
journal={Computational Linguistics},
volume={18},
......@@ -3551,10 +3718,10 @@
@article{zaremba2014recurrent,
title={Recurrent Neural Network Regularization},
author={Wojciech Zaremba and
Ilya Sutskever and
Oriol Vinyals},
journal={arXiv: Neural and Evolutionary Computing},
......@@ -3568,7 +3735,7 @@
Srivastava, Rupesh Kumar and
Koutnik, Jan and
Schmidhuber, Jurgen},
journal={arXiv: Learning},
......@@ -3860,7 +4027,7 @@
@inproceedings{perozzi2014deepwalk,
author = {Bryan Perozzi and
Rami Al{-}Rfou and
Rami Al-Rfou and
Steven Skiena},
//editor = {Sofus A. Macskassy and
Claudia Perlich and
......@@ -4076,11 +4243,11 @@ pages ={157-166},
Jonathan Clark and
Christian Federmann and
Xuedong Huang and
Marcin Junczys{-}Dowmunt and
Marcin Junczys-Dowmunt and
William Lewis and
Mu Li and
Shujie Liu and
Tie{-}Yan Liu and
Tie-Yan Liu and
Renqian Luo and
Arul Menezes and
Tao Qin and
......@@ -4281,7 +4448,7 @@ pages ={157-166},
Alexandra Birch and
Barry Haddow and
Julian Hitschler and
Marcin Junczys{-}Dowmunt and
Marcin Junczys-Dowmunt and
Samuel L{\"{a}}ubli and
Antonio Valerio Miceli Barone and
Jozef Mokry and
......@@ -4626,28 +4793,7 @@ pages ={157-166},
//biburl = {https://dblp.org/rec/conf/acl/LiLLMS19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Zhang2017PriorKI,
author = {Jiacheng Zhang and
Yang Liu and
Huanbo Luan and
Jingfang Xu and
Maosong Sun},
//editor = {Regina Barzilay and
Min{-}Yen Kan},
title = {Prior Knowledge Integration for Neural Machine Translation using Posterior
Regularization},
publisher = {Proceedings of the 55th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2017, Vancouver, Canada, July 30 - August 4, Volume
1: Long Papers},
pages = {1514--1523},
//publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/P17-1139},
//doi = {10.18653/v1/P17-1139},
//timestamp = {Tue, 20 Aug 2019 11:59:06 +0200},
//biburl = {https://dblp.org/rec/conf/acl/ZhangLLXS17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Werlen2018DocumentLevelNM,
author = {Lesly Miculicich Werlen and
Dhananjay Ram and
......@@ -4710,21 +4856,7 @@ pages ={157-166},
//biburl = {https://dblp.org/rec/journals/corr/abs-1906-00532.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/CourbariauxB16,
author = {Matthieu Courbariaux and
Yoshua Bengio},
title = {BinaryNet: Training Deep Neural Networks with Weights and Activations
Constrained to +1 or -1},
journal = {CoRR},
volume = {abs/1602.02830},
year = {2016},
//url = {http://arxiv.org/abs/1602.02830},
//archivePrefix = {arXiv},
//eprint = {1602.02830},
//timestamp = {Mon, 13 Aug 2018 16:46:57 +0200},
//biburl = {https://dblp.org/rec/journals/corr/CourbariauxB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Zhang2018SpeedingUN,
author = {Wen Zhang and
Liang Huang and
......@@ -4748,7 +4880,7 @@ pages ={157-166},
}
@inproceedings{DBLP:journals/corr/SeeLM16,
author = {Abigail See and
Minh{-}Thang Luong and
Minh-Thang Luong and
Christopher D. Manning},
//editor = {Yoav Goldberg and
Stefan Riezler},
......@@ -4770,7 +4902,7 @@ pages ={157-166},
Yong Cheng and
Victor O. K. Li},
//editor = {Regina Barzilay and
Min{-}Yen Kan},
Min-Yen Kan},
title = {A Teacher-Student Framework for Zero-Resource Neural Machine Translation},
publisher = {Proceedings of the 55th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2017, Vancouver, Canada, July 30 - August 4, Volume
......@@ -4799,28 +4931,31 @@ pages ={157-166},
//biburl = {https://dblp.org/rec/journals/corr/HintonVD15.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Sun2019PatientKD,
author = {Siqi Sun and
Yu Cheng and
Zhe Gan and
Jingjing Liu},
//editor = {Kentaro Inui and
Jing Jiang and
Vincent Ng and
Xiaojun Wan},
title = {Patient Knowledge Distillation for {BERT} Model Compression},
publisher = {Proceedings of the 2019 Conference on Empirical Methods in Natural
Language Processing and the 9th International Joint Conference on
Natural Language Processing, {EMNLP-IJCNLP} 2019, Hong Kong, China,
November 3-7, 2019},
pages = {4322--4331},
//publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/D19-1441},
//doi = {10.18653/v1/D19-1441},
//timestamp = {Mon, 06 Apr 2020 14:36:31 +0200},
//biburl = {https://dblp.org/rec/conf/emnlp/SunCGL19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Ott2018ScalingNM,
title={Scaling Neural Machine Translation},
author={Myle Ott and Sergey Edunov and David Grangier and Michael Auli},
publisher={Workshop on Machine Translation},
year={2018}
}
@inproceedings{Lin2020TowardsF8,
title={Towards Fully 8-bit Integer Inference for the Transformer Model},
author={Ye Lin and Yanyang Li and Tengbo Liu and Tong Xiao and Tongran Liu and Jingbo Zhu},
publisher={International Joint Conference on Artificial Intelligence},
year={2020}
}
@inproceedings{kim-rush-2016-sequence,
title = {Sequence-Level Knowledge Distillation},
author = {Kim, Yoon and
Rush, Alexander M.},
publisher = {Conference on Empirical Methods in Natural Language Processing},
month = nov,
year = {2016},
//address = {Austin, Texas},
//publisher = {Association for Computational Linguistics},
//url = {https://www.aclweb.org/anthology/D16-1139},
//doi = {10.18653/v1/D16-1139},
pages = {1317--1327},
}
%%%%% chapter 10------------------------------------------------------
......@@ -4834,6 +4969,138 @@ pages ={157-166},
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 12------------------------------------------------------
@inproceedings{DBLP:journals/corr/LinFSYXZB17,
author = {Zhouhan Lin and
Minwei Feng and
C{\'{\i}}cero Nogueira dos Santos and
Mo Yu and
Bing Xiang and
Bowen Zhou and
Yoshua Bengio},
title = {A Structured Self-Attentive Sentence Embedding},
publisher = {5th International Conference on Learning Representations, {ICLR} 2017,
Toulon, France, April 24-26, 2017, Conference Track Proceedings},
//publisher = {OpenReview.net},
year = {2017},
//url = {https://openreview.net/forum?id=BJC\_jUqxe},
//timestamp = {Thu, 25 Jul 2019 14:25:44 +0200},
//biburl = {https://dblp.org/rec/conf/iclr/LinFSYXZB17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Shaw2018SelfAttentionWR,
author = {Peter Shaw and
Jakob Uszkoreit and
Ashish Vaswani},
//editor = {Marilyn A. Walker and
Heng Ji and
Amanda Stent},
title = {Self-Attention with Relative Position Representations},
publisher = {Proceedings of the 2018 Conference of the North American Chapter of
the Association for Computational Linguistics: Human Language Technologies,
NAACL-HLT, New Orleans, Louisiana, USA, June 1-6, 2018, Volume 2 (Short
Papers)},
pages = {464--468},
//publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://doi.org/10.18653/v1/n18-2074},
//doi = {10.18653/v1/n18-2074},
//timestamp = {Tue, 28 Jan 2020 10:30:17 +0100},
//biburl = {https://dblp.org/rec/conf/naacl/ShawUV18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:journals/corr/HeZRS15,
author = {Kaiming He and
Xiangyu Zhang and
Shaoqing Ren and
Jian Sun},
title = {Deep Residual Learning for Image Recognition},
publisher = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
pages = {770--778},
//publisher = {{IEEE} Computer Society},
year = {2016},
//url = {https://doi.org/10.1109/CVPR.2016.90},
//doi = {10.1109/CVPR.2016.90},
//timestamp = {Wed, 16 Oct 2019 14:14:50 +0200},
//biburl = {https://dblp.org/rec/conf/cvpr/HeZRS16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Ba2016LayerN,
author = {Lei Jimmy Ba and
Jamie Ryan Kiros and
Geoffrey E. Hinton},
title = {Layer Normalization},
journal = {CoRR},
volume = {abs/1607.06450},
year = {2016},
//url = {http://arxiv.org/abs/1607.06450},
//archivePrefix = {arXiv},
//eprint = {1607.06450},
//timestamp = {Tue, 23 Jul 2019 17:33:23 +0200},
//biburl = {https://dblp.org/rec/journals/corr/BaKH16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{JMLR:v15:srivastava14a,
author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
journal = {Journal of Machine Learning Research},
year = {2014},
volume = {15},
pages = {1929--1958},
//url = {http://jmlr.org/papers/v15/srivastava14a.html}
}
@inproceedings{Szegedy_2016_CVPR,
author = {Christian Szegedy and
Vincent Vanhoucke and
Sergey Ioffe and
Jonathon Shlens and
Zbigniew Wojna},
title = {Rethinking the Inception Architecture for Computer Vision},
publisher = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
pages = {2818--2826},
//publisher = {{IEEE} Computer Society},
year = {2016},
//url = {https://doi.org/10.1109/CVPR.2016.308},
//doi = {10.1109/CVPR.2016.308},
//timestamp = {Wed, 16 Oct 2019 14:14:50 +0200},
//biburl = {https://dblp.org/rec/conf/cvpr/SzegedyVISW16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:journals/corr/abs-1805-00631,
author = {Biao Zhang and
Deyi Xiong and
Jinsong Su},
//editor = {Iryna Gurevych and
Yusuke Miyao},
title = {Accelerating Neural Transformer via an Average Attention Network},
publisher = {Proceedings of the 56th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2018, Melbourne, Australia, July 15-20, 2018, Volume
1: Long Papers},
pages = {1789--1798},
//publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://www.aclweb.org/anthology/P18-1166/},
//doi = {10.18653/v1/P18-1166},
//timestamp = {Mon, 16 Sep 2019 13:46:41 +0200},
//biburl = {https://dblp.org/rec/conf/acl/XiongZS18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/CourbariauxB16,
author = {Matthieu Courbariaux and
Yoshua Bengio},
title = {BinaryNet: Training Deep Neural Networks with Weights and Activations
Constrained to +1 or -1},
journal = {CoRR},
volume = {abs/1602.02830},
year = {2016},
//url = {http://arxiv.org/abs/1602.02830},
//archivePrefix = {arXiv},
//eprint = {1602.02830},
//timestamp = {Mon, 13 Aug 2018 16:46:57 +0200},
//biburl = {https://dblp.org/rec/journals/corr/CourbariauxB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
%%%%% chapter 12------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......