Merge branch 'master' of http://47.105.50.196/NiuTrans/mtbookv2

b133ca17 · xiaotong · 37309b46 · 06cc270b · b133ca17 · b133ca17
Commit b133ca17 authored Sep 20, 2020 by xiaotong
--- a/Chapter10/Figures/mt-history.png
+++ b/Chapter10/Figures/mt-history.png
--- a/Chapter10/chapter10.tex
+++ b/Chapter10/chapter10.tex
@@ -42,8 +42,8 @@
 %----------------------------------------------
 \begin{figure}[htp]
 \centering
-\includegraphics[scale=0.36]{./Chapter10/Figures/mt-history.png}
-\caption{机器翻译发展简史（{\color{red} 图需要到2020}）}
+\includegraphics[scale=0.4]{./Chapter10/Figures/mt-history.png}
+\caption{机器翻译发展简史}
 \label{fig:10-1}
 \end{figure}
 %----------------------------------------------
@@ -201,7 +201,7 @@ NMT                     & 21.7          & 18.7           & -13.7      \\
 \label{tab:10-3}
 \begin{tabular}{ l | l l l}
   模型         		 &作者	& 年份	& BLEU[\%] \\ \hline
-   ConvS2S \upcite{DBLP:journals/corr/GehringAGYD17}                			&Gehring等 		&2017 			&25.2 \\
+   \rule{0pt}{13pt}ConvS2S \upcite{DBLP:journals/corr/GehringAGYD17}                			&Gehring等 		&2017 			&25.2 \\
   Transformer-Base \upcite{vaswani2017attention}			&Vaswani等 		&2017 			&27.3 \\
   Transformer-Big  \upcite{vaswani2017attention} 			&Vaswani等 		&2017 			&28.4 \\
   RNMT+		 \upcite{Chen2018TheBO}			&Chen等 	  	&2018  			&28.5 \\
@@ -662,7 +662,7 @@ $\funp{P}({y_j | \vectorn{\emph{s}}_{j-1} ,y_{j-1},\vectorn{\emph{C}}})$由Softm

 \noindent 之所以能想到在横线处填“吃饭”、“吃东西”很有可能是因为看到了“没/吃饭”、 “很/饿”等关键信息。也就是这些关键的片段对预测缺失的单词起着关键性作用。而预测“吃饭”与前文中的“ 中午”、“又”之间的联系似乎不那么紧密。也就是说，在形成 “吃饭”的逻辑时，在潜意识里会更注意“没/吃饭”、“很饿”等关键信息。也就是我们的关注度并不是均匀地分布在整个句子上的。

-\parinterval 这个现象可以用注意力机制进行解释。注意力机制的概念来源于生物学的一些现象：当待接收的信息过多时，人类会选择性地关注部分信息而忽略其他信息。它在人类的视觉、听觉、嗅觉等方面均有体现，当我们在感受事物时，大脑会自动过滤或衰减部分信息，仅关注其中少数几个部分。例如，当看到图\ref{fig:12-20}时，往往不是“均匀地”看图像中的所有区域，可能最先注意到的是小狗头上戴的帽子，然后才会关注图片中其他的部分。那注意力机制是如何解决神经机器翻译的问题呢？下面就一起来看一看。
+\parinterval 这个现象可以用注意力机制进行解释。注意力机制的概念来源于生物学的一些现象：当待接收的信息过多时，人类会选择性地关注部分信息而忽略其他信息。它在人类的视觉、听觉、嗅觉等方面均有体现，当我们在感受事物时，大脑会自动过滤或衰减部分信息，仅关注其中少数几个部分。例如，当看到图\ref{fig:12-20}时，往往不是“均匀地”看图像中的所有区域，可能最先注意到的是小狗的嘴，然后才会关注图片中其他的部分。那注意力机制是如何解决神经机器翻译的问题呢？下面就一起来看一看。

 %----------------------------------------------
 \begin{figure}[htp]
@@ -718,7 +718,7 @@ $\funp{P}({y_j | \vectorn{\emph{s}}_{j-1} ,y_{j-1},\vectorn{\emph{C}}})$由Softm
 \subsection{上下文向量的计算}
 \label{sec:10.1.3}

-\parinterval 神经机器翻译中，注意力机制的核心是：针对不同目标语言单词生成不同的上下文向量呢？这里，可以将注意力机制看做是一种对接收到的信息的加权处理。对于更重要的信息赋予更高的权重即更高的关注度，对于贡献度较低的信息分配较低的权重，弱化其对结果的影响。这样，$\vectorn{\emph{C}}_j$可以包含更多对当前目标语言位置有贡献的源语言片段的信息。
+\parinterval 神经机器翻译中，注意力机制的核心是：针对不同目标语言单词生成不同的上下文向量。这里，可以将注意力机制看做是一种对接收到的信息的加权处理。对于更重要的信息赋予更高的权重即更高的关注度，对于贡献度较低的信息分配较低的权重，弱化其对结果的影响。这样，$\vectorn{\emph{C}}_j$可以包含更多对当前目标语言位置有贡献的源语言片段的信息。

 \parinterval 根据这种思想，上下文向量$\vectorn{\emph{C}}_j$被定义为对不同时间步编码器输出的状态序列$\{ \vectorn{\emph{h}}_1, \vectorn{\emph{h}}_2,...,\vectorn{\emph{h}}_m \}$进行加权求和，如下：
 \begin{eqnarray}
@@ -956,7 +956,7 @@ L(\vectorn{\emph{Y}},\widehat{\vectorn{\emph{Y}}}) = \sum_{j=1}^n L_{\textrm{ce}

 \parinterval 公式\ref{eq:10-31}是一种非常通用的损失函数形式，除了交叉熵，也可以使用其他的损失函数，这时只需要替换$L_{ce} (\cdot)$即可。这里使用交叉熵损失函数的好处在于，它非常容易优化，特别是与Softmax组合，其反向传播的实现非常高效。此外，交叉熵损失（在一定条件下）也对应了极大似然的思想，这种方法在自然语言处理中已经被证明是非常有效的。

-\parinterval 除了交叉熵，很多系统也使用了面向评价的损失函数，比如，直接利用评价指标BLEU定义损失函数。不过这类损失函数往往不可微分，因此无法直接获取梯度。这时可以引入强化学习技术，通过策略梯度等方法进行优化。不过这类方法需要采样等手段，这里不做重点讨论，相关内容会在后面技术部分进行介绍。
+\parinterval 除了交叉熵，很多系统也使用了面向评价的损失函数，比如，直接利用评价指标BLEU定义损失函数\upcite{DBLP:conf/acl/ShenCHHWSL16}。不过这类损失函数往往不可微分，因此无法直接获取梯度。这时可以引入强化学习技术，通过策略梯度等方法进行优化。不过这类方法需要采样等手段，这里不做重点讨论，相关内容会在后面技术部分进行介绍。

 %----------------------------------------------------------------------------------------
 %    NEW SUBSUB-SECTION

--- a/Chapter11/chapter11.tex
+++ b/Chapter11/chapter11.tex
--- a/Chapter12/Figures/figure-matrix-representation-of-attention-weights-between-chinese-english-sentence-pairs.tex
+++ b/Chapter12/Figures/figure-matrix-representation-of-attention-weights-between-chinese-english-sentence-pairs.tex

-
-
-
-
 %-------------------------------------------
 \begin{tikzpicture}

@@ -17,7 +13,7 @@
 \tikzstyle{labelnode} = [above]

 % alignment matrix
-\begin{scope}[scale=0.9,yshift=0.12in]
+\begin{scope}[scale=1.2,yshift=0.12in]
 \foreach \i / \j / \c in
    {0/7/0.2, 1/7/0.45, 2/7/0.15, 3/7/0.15, 4/7/0.15, 5/7/0.15,
    0/6/0.35, 1/6/0.45, 2/6/0.15, 3/6/0.15, 4/6/0.15, 5/6/0.15,
@@ -27,7 +23,7 @@
    0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.25, 5/2/0.3,
    0/1/0.15, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.8, 5/1/0.15,
    0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.60}
-    \node[elementnode,minimum size=0.6*1.2cm*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*1.2cm*\i-5.4*0.5*1.2cm,0.5*1.2cm*\j-1.05*1.2cm) {};
+    \node[elementnode,minimum size=0.8*1.2cm*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*1.2cm*\i-5.4*0.5*1.2cm,0.5*1.2cm*\j-1.05*1.2cm) {};

 %attention score labels
 \node[align=center] (l17) at (a17) {\scriptsize{{\color{white} .4}}};
@@ -40,22 +36,22 @@
 \node[align=center] (l17) at (a50) {\small{{\color{white} .7}}};

 % source
-\node[srcnode] (src1) at (-5.4*0.5*1.2cm,-1.05*1.2cm+7.5*0.5*1.2cm) {\scriptsize{Have}};
-\node[srcnode] (src2) at ([xshift=0.6cm]src1.south west) {\scriptsize{you}};
-\node[srcnode] (src3) at ([xshift=0.6cm]src2.south west) {\scriptsize{learned}};
-\node[srcnode] (src4) at ([xshift=0.6cm]src3.south west) {\scriptsize{nothing}};
-\node[srcnode] (src5) at ([xshift=0.6cm]src4.south west) {\scriptsize{?}};
-\node[srcnode] (src6) at ([xshift=0.6cm]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
+\node[srcnode] (src1) at (-5.4*0.5*1.2cm,-1.05*1.2cm+7.5*0.5*1.2cm) {\small{Have}};
+\node[srcnode] (src2) at ([xshift=0.6cm]src1.south west) {\small{you}};
+\node[srcnode] (src3) at ([xshift=0.6cm]src2.south west) {\small{learned}};
+\node[srcnode] (src4) at ([xshift=0.6cm]src3.south west) {\small{nothing}};
+\node[srcnode] (src5) at ([xshift=0.6cm]src4.south west) {\small{?}};
+\node[srcnode] (src6) at ([xshift=0.6cm]src5.south west) {\small{$\langle$eos$\rangle$}};

 % target
-\node[tgtnode] (tgt1) at (-6.0*0.5*1.2cm,-1.05*1.2cm+7.5*0.5*1.2cm) {\scriptsize{你}};
-\node[tgtnode] (tgt2) at ([yshift=-0.6cm]tgt1.north east) {\scriptsize{什么}};
-\node[tgtnode] (tgt3) at ([yshift=-0.6cm]tgt2.north east) {\scriptsize{都}};
-\node[tgtnode] (tgt4) at ([yshift=-0.6cm]tgt3.north east) {\scriptsize{没}};
-\node[tgtnode] (tgt5) at ([yshift=-0.6cm]tgt4.north east) {\scriptsize{学}};
-\node[tgtnode] (tgt6) at ([yshift=-0.6cm]tgt5.north east) {\scriptsize{到}};
-\node[tgtnode] (tgt7) at ([yshift=-0.6cm]tgt6.north east) {\scriptsize{?}};
-\node[tgtnode] (tgt8) at ([yshift=-0.6cm]tgt7.north east) {\scriptsize{$\langle$eos$\rangle$}};
+\node[tgtnode] (tgt1) at (-6.0*0.5*1.2cm,-1.05*1.2cm+7.5*0.5*1.2cm) {\small{你}};
+\node[tgtnode] (tgt2) at ([yshift=-0.6cm]tgt1.north east) {\small{什么}};
+\node[tgtnode] (tgt3) at ([yshift=-0.6cm]tgt2.north east) {\small{都}};
+\node[tgtnode] (tgt4) at ([yshift=-0.6cm]tgt3.north east) {\small{没}};
+\node[tgtnode] (tgt5) at ([yshift=-0.6cm]tgt4.north east) {\small{学}};
+\node[tgtnode] (tgt6) at ([yshift=-0.6cm]tgt5.north east) {\small{到}};
+\node[tgtnode] (tgt7) at ([yshift=-0.6cm]tgt6.north east) {\small{?}};
+\node[tgtnode] (tgt8) at ([yshift=-0.6cm]tgt7.north east) {\small{$\langle$eos$\rangle$}};

 \end{scope}


--- a/Chapter12/Figures/figure-query-model-corresponding-to-attention-mechanism.tex
+++ b/Chapter12/Figures/figure-query-model-corresponding-to-attention-mechanism.tex
@@ -12,17 +12,17 @@

 \tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]

-\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“你”})$}};
-\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“什么”})$}};
-\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“也”})$}};
-\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“没”})$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{${{h}}(\textrm{你})$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{${{h}}(\textrm{什么})$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{${{h}}(\textrm{也})$}};
+\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{${{h}}(\textrm{没})$}};

-\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“你”})$}};
-\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“什么”})$}};
-\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“也”})$}};
-\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\vectorn{\emph{h}}(\textrm{“没”})$}};
+\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{${{h}}(\textrm{你})$}};
+\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{${{h}}(\textrm{什么})$}};
+\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{${{h}}(\textrm{也})$}};
+\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{${{h}}(\textrm{没})$}};

-\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\vectorn{\emph{s}}(\textrm{“you”})$}};
+\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{${{s}}(\textrm{you})$}};
 \node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};

 \draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);