copyright info

02f515c5 · xiaotong · a961ca47 · 02f515c5 · 02f515c5 · 02f515c5
Commit 02f515c5 authored May 15, 2020 by xiaotong
--- a/Book/Chapter1/chapter1.tex
+++ b/Book/Chapter1/chapter1.tex
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode

 %----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
+%----------------------------------------------------------------------------------------
 %    CONFIGURATIONS
 %----------------------------------------------------------------------------------------


--- a/Book/Chapter2/chapter2.tex
+++ b/Book/Chapter2/chapter2.tex
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode

 %----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
+%----------------------------------------------------------------------------------------
 %    CONFIGURATIONS
 %----------------------------------------------------------------------------------------

@@ -939,9 +947,9 @@ I cannot see without my reading \underline{\ \ \ \ \ \ \ \ }
 \end{eqnarray}
 \begin{eqnarray}
 c_{\textrm{KN}}(\cdot) = \left\{\begin{array}{ll}
-\textrm{count}(\cdot) & \textrm{for\ highest\ order}  \\ 
-\textrm{catcount}(\cdot) & \textrm{for\ lower\ order} 
-\end{array}\right. 
+\textrm{count}(\cdot) & \textrm{for\ highest\ order}  \\
+\textrm{catcount}(\cdot) & \textrm{for\ lower\ order}
+\end{array}\right.
 \label{eq:2-41}
 \end{eqnarray}
 \noindent 其中catcount$(\cdot)$表示的是基于某个单个词作为第$n$个词的$n$-gram的种类数目。

--- a/Book/Chapter3/Chapter3.tex
+++ b/Book/Chapter3/Chapter3.tex
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode

 %----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
+%----------------------------------------------------------------------------------------
 %    CONFIGURATIONS
 %----------------------------------------------------------------------------------------


--- a/Book/Chapter4/chapter4.tex
+++ b/Book/Chapter4/chapter4.tex
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode

 %----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
+%----------------------------------------------------------------------------------------
 %    CONFIGURATIONS configurations
 %----------------------------------------------------------------------------------------
 \renewcommand\figurename{图}%将figure改为图
@@ -780,7 +788,7 @@ dr = start_i-end_{i-1}-1

 \subsubsection{翻译候选匹配}

-\parinterval 在解码时，首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语，每个可能的译文也被称作{\small\bfnew{翻译候选}}\index{翻译候选}（Translation Candidate）\index{Translation Candidate}。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语，之后在短语表中找到相应的翻译即可。比如，图\ref{fig:4-27}展示了句子``桌子\ 上\ 有\ 一个\ 苹果''的翻译候选匹配结果。可以看到，不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的跨度中。比如，``upon the table''是短语``桌子 上 有''的翻译候选，即对应源语言跨度[0,3]。\\ \\ \\ 
+\parinterval 在解码时，首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语，每个可能的译文也被称作{\small\bfnew{翻译候选}}\index{翻译候选}（Translation Candidate）\index{Translation Candidate}。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语，之后在短语表中找到相应的翻译即可。比如，图\ref{fig:4-27}展示了句子``桌子\ 上\ 有\ 一个\ 苹果''的翻译候选匹配结果。可以看到，不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的跨度中。比如，``upon the table''是短语``桌子\ 上\ 有''的翻译候选，即对应源语言跨度[0,3]。\\ \\ \\

 %----------------------------------------------
 \begin{figure}[htp]

--- a/Book/Chapter5/chapter5.tex
+++ b/Book/Chapter5/chapter5.tex
 % !Mode:: "TeX:UTF-8"
 % !TEX encoding = UTF-8 Unicode

+%----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
 \part{神经机器翻译}
 %----------------------------------------------------------------------------------------
 %    CONFIGURATIONS
@@ -393,7 +401,7 @@

 \subsubsection{线性映射}

-\parinterval {\small\sffamily\bfseries{线性映射}}\index{线性映射}（ Linear Mapping）\index{Linear Mapping}或{\small\sffamily\bfseries{线性变换}}\index{线性变换}（Linear Transformation）\index{Linear Transformation}是从一个向量空间V到另一个向量空间W的映射函数$ f:v\rightarrow w$，且该映射函数保持加法运算和数量乘法运算，即对于空间V中任何两个向量$ \mathbf u $和$ \mathbf v $以及任何标量$ c $，有：
+\parinterval {\small\sffamily\bfseries{线性映射}}\index{线性映射}（Linear Mapping）\index{Linear Mapping}或{\small\sffamily\bfseries{线性变换}}\index{线性变换}（Linear Transformation）\index{Linear Transformation}是从一个向量空间$V$到另一个向量空间$W$的映射函数$ f:V\rightarrow W$，且该映射函数保持加法运算和数量乘法运算，即对于空间$V$中任何两个向量$ \mathbf u $ 和$ \mathbf v $以及任何标量$ c $，有：
 \begin{eqnarray}
 f(\mathbf u+\mathbf v)&=&f(\mathbf u)+f(\mathbf v)\label{eq:5-9}\\
 f(c\mathbf v)&=&cf(\mathbf v)
@@ -499,7 +507,7 @@ l_p(\mathbf x) & = & {\Vert{\mathbf x}\Vert}_p \nonumber \\
 \end{figure}
 %-------------------------------------------

-\parinterval 同样，人工神经元是人工神经网络的基本单元。在人们的想象中，人工神经元应该与生物神经元类似。但事实上，二者在形态上是有明显差别的。如图\ref{fig:5-4} 是一个典型的人工神经元，其本质是一个形似$ y=f(\mathbf x\cdot \mathbf w+b) $的函数。显而易见，一个神经元主要由$ \mathbf x $，$ \mathbf w $，$ b $，$ f $四个部分构成。其中$ \mathbf x $是一个形如$ (x_0,x_1,\dots,x_n) $的实数向量，在一个神经元中担任``输入''的角色。$ \mathbf w $是一个权重矩阵，其中的每一个元素都对应着一个输入和一个输出，代表着``某输入对某输出的贡献程度''，通常也被理解为神经元连接的{\small\sffamily\bfseries{权重}}\index{权重}（weight）\index{weight}。$ b $被称作偏置，是一个实数。$ f $被称作激活函数，其本质是一个非线性函数。可见，一个人工神经元的功能是将输入向量与权重矩阵右乘（做内积）后，加上偏置量，经过一个非线性激活函数得到一个标量结果。
+\parinterval 同样，人工神经元是人工神经网络的基本单元。在人们的想象中，人工神经元应该与生物神经元类似。但事实上，二者在形态上是有明显差别的。如图\ref{fig:5-4} 是一个典型的人工神经元，其本质是一个形似$ y=f(\mathbf x\cdot \mathbf w+b) $的函数。显而易见，一个神经元主要由$ \mathbf x $，$ \mathbf w $，$ b $，$ f $四个部分构成。其中$ \mathbf x $是一个形如$ (x_0,x_1,\dots,x_n) $ 的实数向量，在一个神经元中担任``输入''的角色。$ \mathbf w $是一个权重矩阵，其中的每一个元素都对应着一个输入和一个输出，代表着``某输入对某输出的贡献程度''，通常也被理解为神经元连接的{\small\sffamily\bfseries{权重}}\index{权重}（weight）\index{weight}。$ b $被称作偏置，是一个实数。$ f $被称作激活函数，其本质是一个非线性函数。可见，一个人工神经元的功能是将输入向量与权重矩阵右乘（做内积）后，加上偏置量，经过一个非线性激活函数得到一个标量结果。

 %----------------------------------------------
 \begin{figure}[htp]
@@ -1249,7 +1257,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c

 \parinterval 下面用几个实例来说明搭建神经网络的过程。搭建神经网络的过程本质上就是定义前向计算的过程。首先构造一个单层神经网络。如图\ref{fig:5-39}(a)所示，简单地定义输入、权重和偏置后，定义激活函数为Sigmoid函数，输入$ \mathbf x $经过线性变换和激活函数，得到输出$ \mathbf y $。

-\parinterval 图\ref{fig:5-39}(b)是一个构造三层神经网络的程序示例。在第一层中，$ \mathbf x $作为输入，$ \mathbf h1 $作为输出，其中$ \mathbf h1={\rm{Sigmoid}}(\mathbf x\cdot \mathbf w1+\mathbf b1) $。在第二层中，$ \mathbf h1 $作为输入，$ \mathbf h2 $作为输出，其中$ \mathbf h2={\rm{Tanh}}(\mathbf h1\cdot \mathbf w2) $。在第三层中，$ \mathbf h2 $作为输入，$ \mathbf y $作为输出，其中$ \mathbf y={\rm{ReLU}}(\mathbf h2\cdot \mathbf w3) $。$ \mathbf y $也会作为整个神经网络的输出。
+\parinterval 图\ref{fig:5-39}(b)是一个构造三层神经网络的程序示例。在第一层中，$ \mathbf x $作为输入，$ \mathbf h1 $作为输出，其中$ \mathbf h1={\rm{Sigmoid}}(\mathbf x\cdot \mathbf w1+\mathbf b1) $。在第二层中，$ \mathbf h1 $作为输入，$ \mathbf h2 $作为输出，其中$ \mathbf h2={\rm{Tanh}}(\mathbf h1\cdot \mathbf w2) $。在第三层中，$ \mathbf h2 $作为输入，$ \mathbf y $ 作为输出，其中$ \mathbf y={\rm{ReLU}}(\mathbf h2\cdot \mathbf w3) $。$ \mathbf y $也会作为整个神经网络的输出。

 %----------------------------------------------
 \begin{figure}[htp]
@@ -1380,7 +1388,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c
 \parinterval 从优化的角度看，梯度下降是一种典型的 {\small\bfnew{基于梯度的方法}}\index{基于梯度的方法}（Gradient-based Method）\index{Gradient-based Method}，属于基于一阶导数的方法。其他类似的方法还有牛顿法、共轭方向法、拟牛顿法等。在具体实现时，公式\ref{eq:5-29}可以有以下不同的形式。\\

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}
@@ -1398,7 +1406,7 @@ J(\mathbf w)&=&\frac{1}{n}\sum_{i=1}^{n}{L(\mathbf x_i,\mathbf {\widetilde y}_i;
 \parinterval 不过，这种方法的缺点也十分明显，因为要在全部训练数据上最小化损失，每一次参数更新都需要计算在所有样本上的损失。在使用海量数据进行训练的情况下，这种计算是非常消耗时间的。当训练数据规模很大时，很少使用这种方法。

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}
@@ -1416,7 +1424,7 @@ J(\mathbf w)&=&L(\mathbf x_i,\mathbf {\widetilde y}_i;\mathbf w)
 \parinterval 因为随机梯度下降算法每次优化的只是某一个样本上的损失，所以它的问题也非常明显：单个样本上的损失无法代表在全部样本上的损失，因此参数更新的效率低，方法收敛速度极慢。即使在目标函数为强凸函数的情况下，SGD仍旧无法做到线性收敛。

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}
@@ -1440,7 +1448,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y
 \parinterval 梯度下降算法的一个核心是要得到目标函数相对于参数的梯度。下面将介绍三种常见的求梯度方法：数值微分、符号微分和自动微分，深度学习实现过程中多是采用自动微分方法计算梯度\cite{baydin2017automatic}。

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}
@@ -1464,7 +1472,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y
 \parinterval 尽管数值微分不适用于大模型中的梯度求解，但是由于其非常简单，因此经常被用于检验其他梯度计算方法的正确性。比如在实现反向传播的时候（详见\ref{sec:5.4.6}节），可以检验求导是否正确（Gradient Check），这个过程就是利用数值微分实现的。\\ \\

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \noindent {\small\sffamily\bfseries{ 符号微分\index{符号微分}（Symbolic Differentiation）\index{Symbolic Differentiation}}}
@@ -1499,7 +1507,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 %--------------------------------------------------------------------

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}
@@ -1570,7 +1578,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 %-------------------------------------------

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}
@@ -1589,7 +1597,7 @@ w_{t+1}&=&w_t-\alpha v_t
 \parinterval  这里的``梯度''不再只是现在的损失函数的梯度，而是之前的梯度的加权和。在原始的梯度下降算法中，如果在某个参数状态下，梯度方向变化特别大，甚至与上一次参数更新中梯度方向成90度夹角，下一次参数更新中梯度方向可能又是一次90度的改变，这时参数优化路径将会成``锯齿''状（如图\ref{fig:5-46}所示），优化效率极低。而Momentum梯度下降算法不会让梯度发生90度的变化，而是让梯度慢慢发生改变：如果当前的梯度方向与之前的梯度方向相同，在原梯度方向上加速更新参数；如果当前的梯度方向与之前的梯度方向相反，并不会产生一个急转弯，而是尽量把优化路径平滑地进行改变。这样做的优点也非常明显，一方面杜绝了``锯齿''状优化路径的出现，另一方面将优化幅度变得更加平滑，不会导致频频跳过最优点。

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}
@@ -1608,7 +1616,7 @@ w_{t+1}&=&w_t-\eta \frac{1}{\sqrt{z_t}}\cdot \frac{\partial L}{\partial w_t}
 \parinterval  这里新出现了变量$ z $，它保存了以前的所有梯度值的平方和，在更新参数时，通过乘以$ \frac{1}{\sqrt{z_t}} $ ，就可以调整学习的尺度。这意味着，变动较大（被大幅度更新）的参数的学习率将变小。也就是说，可以按参数的元素进行学习率衰减，使变动大的参数的学习率逐渐减小。

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}
@@ -1629,7 +1637,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}}\cdot \frac{\partial L}{\partial w
 \parinterval  RMSProp与AdaGrad相比，学习率的分母部分（即两种梯度下降算法迭代公式中的$ z $）的计算由累积方式变成了指数衰减移动平均。于是，每个参数的学习率并不是呈衰减趋势，而是既可以变小也可以变大，从而避免AdaGrad算法中学习率不断单调下降以至于过早衰减的缺点。

 %----------------------------------------------------------------------------------------
-%  
+%
 %----------------------------------------------------------------------------------------

 \vspace{0.5em}

--- a/Book/Chapter6/Chapter6.tex
+++ b/Book/Chapter6/Chapter6.tex
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode

 %----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
+%----------------------------------------------------------------------------------------
 %    CONFIGURATIONS
 %----------------------------------------------------------------------------------------


--- a/Book/Chapter7/Chapter7.tex
+++ b/Book/Chapter7/Chapter7.tex
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode

 %----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
+%----------------------------------------------------------------------------------------
 %    CONFIGURATIONS
 %----------------------------------------------------------------------------------------


--- a/Book/ChapterAppend/ChapterAppend.tex
+++ b/Book/ChapterAppend/ChapterAppend.tex
@@ -2,6 +2,14 @@
 % !TEX encoding = UTF-8 Unicode

 %----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
+%----------------------------------------------------------------------------------------
 %    CONFIGURATIONS
 %----------------------------------------------------------------------------------------

@@ -208,7 +216,7 @@ S = N(b^{\infty}(V(\mathbf{s}|\mathbf{t};2))) \cup (\mathop{\cup}\limits_{ij} N(

 \parinterval 为了理解这个公式，先介绍几个概念。
 \begin{itemize}
-\item $V(\mathbf{s}|\mathbf{t})$表示Viterbi词对齐，$V(\mathbf{s}|\mathbf{t},1)$、$V(\mathbf{s}|\mathbf{t},2)$和$V(\mathbf{s}|\mathbf{t},3)$就分别对应了模型1、2 和3 的Viterbi 词对齐； 
+\item $V(\mathbf{s}|\mathbf{t})$表示Viterbi词对齐，$V(\mathbf{s}|\mathbf{t};1)$、$V(\mathbf{s}|\mathbf{t};2)$和$V(\mathbf{s}|\mathbf{t};3)$就分别对应了模型1、2 和3 的Viterbi 词对齐；
 \item 把那些满足第$j$个源语言单词对应第$i$个目标语言单词（$a_j=i$）的词对齐构成的集合记为$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$。通常称这些对齐中$j$和$i$被``钉''在了一起。在$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$中使$\textrm{P}(\mathbf{a}|\mathbf{s},\mathbf{t})$达到最大的那个词对齐被记为$V_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$；
 \item 如果两个词对齐，通过交换两个词对齐连接就能互相转化，则称它们为邻居。一个词对齐$\mathbf{a}$的所有邻居记为$N(\mathbf{a})$。
 \end{itemize}

--- a/Book/ChapterPreface/ChapterPreface.tex
+++ b/Book/ChapterPreface/ChapterPreface.tex
 % !Mode:: "TeX:UTF-8"
 % !TEX encoding = UTF-8 Unicode

+%----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------
+
 \renewcommand\figurename{图}

 %----------------------------------------------------------------------------------------

--- a/Book/mt-book-xelatex.tex
+++ b/Book/mt-book-xelatex.tex
 % !Mode:: "TeX:UTF-8"
 % !TEX encoding = UTF-8 Unicode

+%----------------------------------------------------------------------------------------
+% 机器翻译：统计建模与深度学习方法
+% Machine Translation: Statistical Modeling and Deep Learning Methods
+%
+% Copyright 2020
+% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
+%----------------------------------------------------------------------------------------

 %----------------------------------------------------------------------------------------
 %	BASIC CONFIGURATIONS