Merge branch 'master' of 47.105.50.196:NiuTrans/mtbookv2

5ecdd492 · xiaotong · 35ddb471 · 0e897136 · 5ecdd492 · 5ecdd492
Commit 5ecdd492 authored Jan 02, 2021 by xiaotong
--- a/Chapter13/Figures/figure-exposure-bias.tex
+++ b/Chapter13/Figures/figure-exposure-bias.tex
--- a/Chapter13/Figures/figure-of-scheduling-sampling-method.tex
+++ b/Chapter13/Figures/figure-of-scheduling-sampling-method.tex
@@ -18,21 +18,26 @@
 \node [anchor=south,snode] (n7) at ([xshift=0em,yshift=1em]n4.north) {Softmax};
 \node [anchor=south,ynode] (n8) at ([xshift=0em,yshift=1em]n7.north) {$\tilde{{y}}_{j}$};

-\node [anchor=north] (x1) at ([xshift=0em,yshift=-1em]n1.south) {$\seq{x}$};
-\node [anchor=north,font=\small,align=left] (x2) at ([xshift=-4em,yshift=-1.7em]n3.south) {采样出\\的$\tilde{{y}}_{j-2}$};
-\node [anchor=north,font=\small,align=left] (x3) at ([xshift=2em,yshift=-2.5em]n3.south) {真实答\\案${y}_{j-2}$};
-\node [anchor=north,font=\small,align=left] (x4) at ([xshift=2em,yshift=-2.5em]n4.south) {真实答\\案${y}_{j-1}$};
+\node [anchor=south,snode,font=\footnotesize] (n13) at ([xshift=0em,yshift=1em]n1.north) {Softmax};
+\node [anchor=south,ynode] (n14) at ([xshift=0em,yshift=1em]n13.north) {$\tilde{{y}}_{1}$};
+
+\node [anchor=north] (x1) at ([xshift=0em,yshift=-1em]n1.south) {$\langle$sos$\rangle$};
+\node [anchor=north,font=\small] (x2) at ([xshift=-1.3em,yshift=-2.3em]n3.south) {$\tilde{{y}}_{j-2}$};
+\node [anchor=north,font=\small] (x3) at ([xshift=1.3em,yshift=-2.5em]n3.south) {${y}_{j-2}$};
+\node [anchor=north,font=\small] (x4) at ([xshift=1.3em,yshift=-2.5em]n4.south) {${y}_{j-1}$};
+\node [anchor=north,font=\small] (x5) at ([xshift=-1.3em,yshift=-2.3em]n4.south) {$\tilde{{y}}_{j-1}$};

 \node [anchor=south,inner sep=2pt] (st1) at (n6.north) {\scriptsize{\textbf{[step $j-1$]}}};
 \node [anchor=south,inner sep=2pt] (st2) at (n8.north) {\scriptsize{\textbf{[step $j$]}}};
+\node [anchor=south,inner sep=2pt] (st3) at (n14.north) {\scriptsize{\textbf{[step $1$]}}};

-\node [anchor=north,font=\scriptsize] (e1) at ([xshift=-3em,yshift=-0em]n3.south) {$\funp{P}={(1-\epsilon_i)}^2$};
-\node [anchor=north,font=\scriptsize] (e2) at ([xshift=2em,yshift=-0.1em]n3.south) {$\funp{P}=\epsilon_i$};
-\node [anchor=north,font=\scriptsize] (e3) at ([xshift=-2em,yshift=-1em]n4.south) {$\funp{P}={(1-\epsilon_i)}^2$};
-\node [anchor=north,font=\scriptsize] (e4) at ([xshift=2em,yshift=-0.1em]n4.south) {$\funp{P}=\epsilon_i$};
+\node [anchor=north,font=\tiny,rotate=90] (e1) at ([xshift=-2.7em,yshift=-1.1em]n3.south) {${(1-\epsilon_i)}^2$};
+%\node [anchor=north,font=\scriptsize] (e2) at ([xshift=2em,yshift=-0.1em]n3.south) {$\funp{P}=\epsilon_i$};
+%\node [anchor=north,font=\scriptsize] (e3) at ([xshift=-2em,yshift=-1em]n4.south) {$\funp{P}={(1-\epsilon_i)}^2$};
+\node [anchor=north,font=\tiny,rotate=90] (e4) at ([xshift=1.5em,yshift=-1.2em]n4.south) {$\epsilon_i$};

-\node [anchor=south east,font=\small] (l1) at ([xshift=-1em,yshift=0.5em]n5.north west) {Loss};
-\node [anchor=south west,font=\small] (l2) at ([xshift=1em,yshift=0.5em]n7.north east) {Loss};
+%\node [anchor=south east,font=\small] (l1) at ([xshift=-1em,yshift=0.5em]n5.north west) {Loss};
+%\node [anchor=south west,font=\small] (l2) at ([xshift=1em,yshift=0.5em]n7.north east) {Loss};

 \draw [->,thick] ([xshift=0em,yshift=0em]x1.north)--([xshift=0em,yshift=0em]n1.south);
 \draw [->,thick] ([xshift=0em,yshift=0em]n1.east)--([xshift=0em,yshift=0em]n2.west);
@@ -45,13 +50,31 @@
 \draw [->,thick] ([xshift=0em,yshift=0em]n5.north)--([xshift=0em,yshift=0em]n6.south);
 \draw [->,thick] ([xshift=0em,yshift=0em]n4.north)--([xshift=0em,yshift=0em]n7.south);
 \draw [->,thick] ([xshift=0em,yshift=0em]n7.north)--([xshift=0em,yshift=0em]n8.south);
+\draw [->,thick] ([xshift=0em,yshift=0em]n1.north)--([xshift=0em,yshift=0em]n13.south);
+\draw [->,thick] ([xshift=0em,yshift=0em]n13.north)--([xshift=0em,yshift=0em]n14.south);
+
+%\draw [->,thick] ([xshift=0em,yshift=0em]l1.south) .. controls +(south:1em) and +(west:0.1em) .. ([xshift=0em,yshift=0em]n5.west);
+%\draw [->,thick] ([xshift=0em,yshift=0em]l2.south) .. controls +(south:1em) and +(east:0.1em) .. ([xshift=0em,yshift=0em]n7.east);
+
+\node [circle,fill=black,minimum size=1pt,inner sep=1.2pt] (c1) at ([xshift=0em,yshift=0.6em]x2.north) {};
+\node [circle,fill=black,minimum size=1pt,inner sep=1.2pt] (c2) at ([xshift=0em,yshift=0.8em]x3.north) {};
+\node [circle,fill=black,minimum size=1pt,inner sep=1.2pt] (c3) at ([xshift=0em,yshift=0.8em]x4.north) {};
+\node [circle,fill=black,minimum size=1pt,inner sep=1.2pt] (c4) at ([xshift=0em,yshift=0.6em]x5.north) {};
+
+\draw [-,thick] ([xshift=0em,yshift=-0em]x2.north)-- ([xshift=-1.3em,yshift=0em]n3.south);
+
+\draw [-,thick] ([xshift=0em,yshift=0.2em]x3.north)-- ([xshift=0em,yshift=0em]c2.south);
+\draw [-,thick] ([xshift=0em,yshift=0em]c2.north)-- ([xshift=0.3em,yshift=0.6em]c2.north);
+\draw [-,thick] ([xshift=0em,yshift=0.6em]c2.north)-- ([xshift=1.3em,yshift=0em]n3.south);
+
+\draw [-,thick] ([xshift=-0em,yshift=-0em]x5.north)-- ([xshift=0em,yshift=0em]c4.south);
+\draw [-,thick] ([xshift=-0em,yshift=0em]c4.north)-- ([xshift=0.3em,yshift=0.6em]c4.north);
+\draw [-,thick] ([xshift=-0em,yshift=0.6em]c4.north)-- ([xshift=-1.3em,yshift=0em]n4.south);

-\draw [->,thick] ([xshift=0em,yshift=0em]l1.south) .. controls +(south:1em) and +(west:0.1em) .. ([xshift=0em,yshift=0em]n5.west);
-\draw [->,thick] ([xshift=0em,yshift=0em]l2.south) .. controls +(south:1em) and +(east:0.1em) .. ([xshift=0em,yshift=0em]n7.east);
+\draw [-,thick] ([xshift=0em,yshift=0.2em]x4.north)-- ([xshift=1.3em,yshift=0em]n4.south);

-\draw [->,thick,dotted] ([xshift=0em,yshift=-0.5em]x2.north east) .. controls +(east:1.5em) and +(south:0.2em) .. ([xshift=-0.5em,yshift=0em]n3.south);
-\draw [->,thick] ([xshift=0em,yshift=0em]x3.north) .. controls +(north:1em) and +(south:2em) .. ([xshift=0em,yshift=0em]n3.south);
-\draw [->,thick,dotted] ([xshift=0em,yshift=0em]n6.east) .. controls ([xshift=2em,yshift=1em]n6.east) and ([xshift=-2em,yshift=-5em]n4.south west) .. ([xshift=-0.5em,yshift=-0em]n4.south);
-\draw [->,thick] ([xshift=0em,yshift=0em]x4.north) .. controls +(north:1em) and +(south:2em) .. ([xshift=0em,yshift=0em]n4.south);
+\draw [->,thick,dotted] ([xshift=-2.5em,yshift=1em]x2.north) .. controls +(south:2em) and +(west:0.1em) .. ([xshift=0.2em,yshift=0em]x2.west);
+\draw [->,thick,dotted] ([xshift=0em,yshift=0em]n6.east) .. controls ([xshift=2em,yshift=1em]n6.east) and ([xshift=-2.5em,yshift=-4em]n4.south west) .. ([xshift=-0em,yshift=-0em]x5.west);
+\draw [->,thick,dotted] ([xshift=0em,yshift=0em]n14.east) .. controls +(east:0.3em) and +(north:2em) .. ([xshift=3em,yshift=-0.5em]n14.south);

 \end{tikzpicture}
--- a/Chapter13/chapter13.tex
+++ b/Chapter13/chapter13.tex
--- a/Chapter14/chapter14.tex
+++ b/Chapter14/chapter14.tex
--- a/Chapter16/chapter16.tex
+++ b/Chapter16/chapter16.tex
@@ -803,7 +803,7 @@

 \parinterval 在真实场景中，由于每个领域的数据量有限，同时领域数量较多，针对每个领域单独训练一个机器翻译模型是不现实的。所以，通常的策略是混合多领域的数据，来训练一个能够支持多领域翻译的机器翻译模型。虽然混合多个领域的数据可以有效增加训练数据规模，但正如前面所说，由于各个领域训练数据量之间的不平衡，在训练数据过少的领域上，模型表现往往不尽如人意。一种观点认为，数据量较少的领域数据应该在训练过程中获得更大的权重，从而使这些更有价值的数据发挥出更大的作用\upcite{DBLP:conf/emnlp/MatsoukasRZ09,DBLP:conf/emnlp/FosterGK10}。

-\parinterval 实际上，基于数据加权的方法与{\chapterthirteen}中基于样本价值的学习方法是一致的，只是描述的场景略有不同。这类方法本质上在解决{\small\bfnew{类别不均衡问题}}\index{类别不均衡问题}（Class Imbalance Problem\index{Class Imbalance Problem}）\upcite{DBLP:conf/emnlp/ZhuH07}。数据加权可以通过修改损失函数，将其缩放$\alpha$ 倍来实现（$\alpha$  是样本的权重）。在具体实践中，也可以直接将低资源的领域数据进行复制\footnote{相当于对数据进行重采样}达到与其相同的效果。
+\parinterval 实际上，基于数据加权的方法与{\chapterthirteen}中基于样本价值的学习方法是一致的，只是描述的场景略有不同。这类方法本质上在解决{\small\bfnew{类别不均衡问题}}\index{类别不均衡问题}（Class Imbalance Problem\index{Class Imbalance Problem}）\upcite{DBLP:conf/emnlp/ZhuH07}。数据加权可以通过修改损失函数，将其缩放$\alpha$ 倍来实现（$\alpha$  是样本的权重）。在具体实践中，也可以直接将低资源的领域数据进行复制\footnote{相当于对数据进行重采样}达到与其相同的效果\upcite{DBLP:conf/wmt/ShahBS10}。

 \parinterval 数据选择是数据加权的一种特殊情况，它可以被看做是样本权重非零即一的情况。具体来说，可以直接选择与领域相关的数据参与训练\upcite{DBLP:conf/acl/DuhNST13}。由于这种方法并不需要使用全量数据进行训练，因此模型的训练成本较低。由于{\chapterthirteen}已经对数据加权和数据选择方法进行了详细介绍，这里不再赘述。


--- a/Chapter4/chapter4.tex
+++ b/Chapter4/chapter4.tex
@@ -532,9 +532,9 @@ His house is on the south bank of the river.
 \hline
 \rule{0pt}{10pt} One-hot词向量 & RAE编码\upcite{DBLP:conf/emnlp/SocherPHNM11} \\
 \rule{0pt}{10pt} Word2Vec词向量\upcite{DBLP:journals/corr/abs-1301-3781} & Doc2Vec向量\upcite{DBLP:conf/icml/LeM14}  \\
-\rule{0pt}{10pt} Prob-fasttext词向量\upcite{DBLP:conf/acl/AthiwaratkunW17} & ELMO预训练句子表示\upcite{DBLP:conf/naacl/PetersNIGCLZ18} \\
+\rule{0pt}{10pt} Prob-fasttext词向量\upcite{DBLP:conf/acl/AthiwaratkunW17} & ELMO预训练句子表示\upcite{Peters2018DeepCW} \\
 \rule{0pt}{10pt} GloVe词向量\upcite{DBLP:conf/emnlp/PenningtonSM14} & GPT句子表示\upcite{radford2018improving} \\
-\rule{0pt}{10pt} ELMO预训练词向量\upcite{DBLP:conf/naacl/PetersNIGCLZ18} & BERT预训练句子表示\upcite{devlin2019bert} \\
+\rule{0pt}{10pt} ELMO预训练词向量\upcite{Peters2018DeepCW} & BERT预训练句子表示\upcite{devlin2019bert} \\
 \rule{0pt}{10pt} BERT预训练词向量\upcite{devlin2019bert} & Skip-thought向量\upcite{DBLP:conf/nips/KirosZSZUTF15} \\
 \end{tabular}
 \label{tab:4-2}
@@ -874,7 +874,7 @@ d&=&t \frac{s}{\sqrt{n}}
 \vspace{0.5em}
 \end{itemize}

-\parinterval 随着深度学习技术的发展，另一种思路是使用表示学习技术生成句子的分布式表示，并在此基础上利用神经网络自动提取高度抽象的句子特征\upcite{DBLP:conf/wmt/KreutzerSR15,DBLP:conf/wmt/MartinsAHK16,DBLP:conf/wmt/ChenTZXZLW17}，这样就避免了人工设计特征所带来的时间以及人工代价，同时表示学习所得到的分布式表示可以涵盖更多人工设计难以捕获到的特征，更加全面地反映句子的特点，因此在质量评估任务上也取得了很好的效果\upcite{kreutzer2015quality,DBLP:conf/wmt/ShahLPBBBS15,DBLP:conf/wmt/ScartonBSSS16,DBLP:conf/wmt/AbdelsalamBE16,DBLP:conf/wmt/BasuPN18,DBLP:conf/wmt/Lo19,DBLP:conf/wmt/YankovskayaTF19}。比如，最近的一些工作中大量使用了神经机器翻译模型来获得双语句子的表示结果，并用于质量评估\upcite{DBLP:conf/wmt/Qi19,DBLP:conf/wmt/ZhouZH19,DBLP:conf/wmt/Hokamp17,wang2019niutrans}。这样做的好处在于，质量评估可以直接复用机器翻译的模型，从某种意义上降低了质量评估系统开发的代价。此外，随着近几年各种预训练模型的出现，使用预训练模型来获取用于质量评估的句子表示也成为一大流行趋势，这种方法大大减少了质量评估模型自身的训练时间，在该领域内的表现也十分亮眼\upcite{kepler2019unbabel,DBLP:conf/wmt/YankovskayaTF19,DBLP:conf/wmt/KimLKN19}。关于表示学习、神经机器翻译、预训练模型的内容在第九章和第十章会有进一步介绍。
+\parinterval 随着深度学习技术的发展，另一种思路是使用表示学习技术生成句子的分布式表示，并在此基础上利用神经网络自动提取高度抽象的句子特征\upcite{DBLP:conf/wmt/KreutzerSR15,DBLP:conf/wmt/MartinsAHK16,DBLP:conf/wmt/ChenTZXZLW17}，这样就避免了人工设计特征所带来的时间以及人工代价，同时表示学习所得到的分布式表示可以涵盖更多人工设计难以捕获到的特征，更加全面地反映句子的特点，因此在质量评估任务上也取得了很好的效果\upcite{kreutzer2015quality,DBLP:conf/wmt/ShahLPBBBS15,DBLP:conf/wmt/ScartonBSSS16,DBLP:conf/wmt/AbdelsalamBE16,DBLP:conf/wmt/BasuPN18}。比如，最近的一些工作中大量使用了神经机器翻译模型来获得双语句子的表示结果，并用于质量评估\upcite{DBLP:conf/wmt/Qi19,DBLP:conf/wmt/ZhouZH19,DBLP:conf/wmt/Hokamp17,wang2019niutrans}。这样做的好处在于，质量评估可以直接复用机器翻译的模型，从某种意义上降低了质量评估系统开发的代价。此外，随着近几年各种预训练模型的出现，使用预训练模型来获取用于质量评估的句子表示也成为一大流行趋势，这种方法大大减少了质量评估模型自身的训练时间，在该领域内的表现也十分亮眼\upcite{kepler2019unbabel,DBLP:conf/wmt/YankovskayaTF19,DBLP:conf/wmt/KimLKN19}。关于表示学习、神经机器翻译、预训练模型的内容在第九章和第十章会有进一步介绍。

 \parinterval 在得到句子表示之后，可以使用质量评估模块对译文质量进行预测。质量评估模型通常由回归算法或分类算法实现：