Commit 9e54756f by 曹润柘

update chapter16

parent 208e4099
......@@ -272,7 +272,7 @@ g_{t} = \sigma (w^{T}s_{t}^{TM} + b)
\label{eq:16-7-xc}
\end{eqnarray}
\parinterval Equation~\ref{eq:16-7-xc} naturally ties together the two translation models $\textrm{P}(\mathbf t|\mathbf s)$ and $\textrm{P}(\mathbf s|\mathbf t)$ and the two language models $\textrm{P}(\mathbf s)$ and $\textrm{P}(\mathbf t)$: $\textrm{P}(\mathbf s)\textrm{P}(\mathbf t|\mathbf s)$ should be close to $\textrm{P}(\mathbf t)\textrm{P}(\mathbf s|\mathbf t)$, because both express the same joint distribution $\textrm{P}(\mathbf s,\mathbf t)$. Therefore, when constructing the training objective for the two translation directions, besides the maximum likelihood objectives each direction uses when trained on its own, an additional term can be added to encourage the two translation models to satisfy Equation~\ref{eq:7-32}:
\parinterval Equation~\ref{eq:16-7-xc} naturally ties together the two translation models $\textrm{P}(\mathbf t|\mathbf s)$ and $\textrm{P}(\mathbf s|\mathbf t)$ and the two language models $\textrm{P}(\mathbf s)$ and $\textrm{P}(\mathbf t)$: $\textrm{P}(\mathbf s)\textrm{P}(\mathbf t|\mathbf s)$ should be close to $\textrm{P}(\mathbf t)\textrm{P}(\mathbf s|\mathbf t)$, because both express the same joint distribution $\textrm{P}(\mathbf s,\mathbf t)$. Therefore, when constructing the training objective for the two translation directions, besides the maximum likelihood objectives each direction uses when trained on its own, an additional term can be added to encourage the two translation models to satisfy Equation~\ref{eq:16-8-xc}:
\begin{eqnarray}
\mathcal{L} = (\textrm{log P}(\mathbf s) + \textrm{log P}(\mathbf t|\mathbf s) - \textrm{log P}(\mathbf t) - \textrm{log P}(\mathbf s|\mathbf t))^{2}
\label{eq:16-8-xc}
......@@ -294,7 +294,7 @@ g_{t} = \sigma (w^{T}s_{t}^{TM} + b)
\label{eq:16-9-xc}
\end{eqnarray}
\noindent Equation~\ref{eq:16-9-xc} assumes that $\textrm{P}(\mathbf s|\mathbf t)=\textrm{P}(\mathbf s|\mathbf s,\mathbf t)$. This assumption clearly holds, because once the translation of a sentence is known, the source sentence itself is not needed in order to translate it back. Directly optimizing (maximizing) the right-hand side of Equation~\ref{eq:16-9-xc} amounts to imposing a {\small\sffamily\bfnew{cycle consistency}}\index{循环一致性}\index{Cycle Consistency} constraint\cite{DBLP:conf/iccv/ZhuPIE17} on $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$: for a sentence $\mathbf s$, after it is translated into $\mathbf t$ by $\textrm{P}(\mathbf t|\mathbf s)$, it should be possible to recover $\mathbf s$ from $\mathbf t$ according to $\textrm{P}(\mathbf s|\mathbf t)$, as illustrated in Figure~\ref{fig:7-43}. Equation~\ref{eq:16-9-xc} thus gives one form of objective function for optimizing $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$ jointly. An additional benefit of this objective is that it essentially learns a language model $\textrm{P}(\mathbf s)$ built from $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$; since learning $\textrm{P}(\mathbf s)$ relies only on monolingual data, this objective can naturally use large amounts of monolingual data to train both translation models at the same time. The same conclusion extends to $\textrm{P}(\mathbf t)$\cite{DBLP:conf/nips/HeXQWYLM16}.
\noindent Equation~\ref{eq:16-9-xc} assumes that $\textrm{P}(\mathbf s|\mathbf t)=\textrm{P}(\mathbf s|\mathbf s,\mathbf t)$. This assumption clearly holds, because once the translation of a sentence is known, the source sentence itself is not needed in order to translate it back. Directly optimizing (maximizing) the right-hand side of Equation~\ref{eq:16-9-xc} amounts to imposing a {\small\sffamily\bfnew{cycle consistency}}\index{循环一致性}\index{Cycle Consistency} constraint\cite{DBLP:conf/iccv/ZhuPIE17} on $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$: for a sentence $\mathbf s$, after it is translated into $\mathbf t$ by $\textrm{P}(\mathbf t|\mathbf s)$, it should be possible to recover $\mathbf s$ from $\mathbf t$ according to $\textrm{P}(\mathbf s|\mathbf t)$, as illustrated in Figure~\ref{fig:16-7-xc}. Equation~\ref{eq:16-9-xc} thus gives one form of objective function for optimizing $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$ jointly. An additional benefit of this objective is that it essentially learns a language model $\textrm{P}(\mathbf s)$ built from $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$; since learning $\textrm{P}(\mathbf s)$ relies only on monolingual data, this objective can naturally use large amounts of monolingual data to train both translation models at the same time. The same conclusion extends to $\textrm{P}(\mathbf t)$\cite{DBLP:conf/nips/HeXQWYLM16}.
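\parinterval To make the two objectives above more concrete, the following sketch is given; it is not part of the book, and every helper name (\texttt{log\_p\_s}, \texttt{log\_p\_t}, \texttt{log\_p\_t\_given\_s}, \texttt{log\_p\_s\_given\_t}, \texttt{sample\_t\_given\_s}) as well as the weights \texttt{alpha} and \texttt{beta} are hypothetical stand-ins for the two language models and the two translation models. It only illustrates how the agreement penalty of Equation~\ref{eq:16-8-xc} and a round-trip objective in the spirit of Equation~\ref{eq:16-9-xc} could be computed from model log-probabilities.

```python
# A minimal illustrative sketch, not from the book: every helper below is a
# hypothetical callable.  log_p_s / log_p_t stand for monolingual language
# models, log_p_t_given_s / log_p_s_given_t for the two translation models,
# and sample_t_given_s draws a candidate translation t for a monolingual s.

def agreement_penalty(s, t, log_p_s, log_p_t, log_p_t_given_s, log_p_s_given_t):
    """Squared gap between the two factorizations of log P(s, t), i.e. the
    extra term added to the two maximum likelihood objectives."""
    gap = (log_p_s(s) + log_p_t_given_s(t, s)
           - log_p_t(t) - log_p_s_given_t(s, t))
    return gap ** 2


def round_trip_loss(s, sample_t_given_s, log_p_s_given_t, log_p_t, beta=0.5):
    """Cycle-consistency objective for a monolingual sentence s: translate s
    into t, then reward target-side fluency and reconstruction of s from t."""
    t = sample_t_given_s(s)  # forward translation of the monolingual sentence
    reward = beta * log_p_t(t) + (1.0 - beta) * log_p_s_given_t(s, t)
    return -reward           # minimizing this maximizes the round-trip reward
```

Note that the sampling step makes the round-trip objective non-differentiable with respect to the forward model $\textrm{P}(\mathbf t|\mathbf s)$, which is why dual learning typically resorts to policy-gradient style training\cite{DBLP:conf/nips/SuttonMSM99,DBLP:conf/nips/HeXQWYLM16}.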
%----------------------------------------------
\begin{figure}[htp]
......
......@@ -5427,6 +5427,88 @@ pages ={157-166},
journal = {CoRR},
year = {2020}
}
@inproceedings{song2019mass,
author = {Kaitao Song and
Xu Tan and
Tao Qin and
Jianfeng Lu and
Tie{-}Yan Liu},
title = {{MASS:} Masked Sequence to Sequence Pre-training for Language Generation},
volume = {97},
pages = {5926--5936},
publisher = {{PMLR}},
year = {2019}
}
@article{DBLP:journals/corr/Ruder17a,
author = {Sebastian Ruder},
title = {An Overview of Multi-Task Learning in Deep Neural Networks},
journal = {CoRR},
volume = {abs/1706.05098},
year = {2017}
}
@inproceedings{DBLP:conf/emnlp/DomhanH17,
author = {Tobias Domhan and
Felix Hieber},
title = {Using Target-side Monolingual Data for Neural Machine Translation
through Multi-task Learning},
pages = {1500--1505},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2017}
}
@inproceedings{DBLP:conf/icml/XiaQCBYL17,
author = {Yingce Xia and
Tao Qin and
Wei Chen and
Jiang Bian and
Nenghai Yu and
Tie{-}Yan Liu},
title = {Dual Supervised Learning},
volume = {70},
pages = {3789--3798},
publisher = {{PMLR}},
year = {2017}
}
@inproceedings{DBLP:conf/iccv/ZhuPIE17,
author = {Jun{-}Yan Zhu and
Taesung Park and
Phillip Isola and
Alexei A. Efros},
title = {Unpaired Image-to-Image Translation Using Cycle-Consistent Adversarial
Networks},
pages = {2242--2251},
publisher = {{IEEE} Computer Society},
year = {2017}
}
@inproceedings{DBLP:conf/nips/HeXQWYLM16,
author = {Di He and
Yingce Xia and
Tao Qin and
Liwei Wang and
Nenghai Yu and
Tie{-}Yan Liu and
Wei{-}Ying Ma},
title = {Dual Learning for Machine Translation},
  pages = {820--828},
  publisher = {Annual Conference on Neural Information Processing Systems},
  year = {2016}
}
@inproceedings{DBLP:conf/nips/SuttonMSM99,
author = {Richard S. Sutton and
David A. McAllester and
Satinder P. Singh and
Yishay Mansour},
title = {Policy Gradient Methods for Reinforcement Learning with Function Approximation},
pages = {1057--1063},
publisher = {The {MIT} Press},
year = {1999}
}
@inproceedings{lample2019cross,
author = {Alexis Conneau and
Guillaume Lample},
title = {Cross-lingual Language Model Pretraining},
  pages = {7057--7067},
  publisher = {Annual Conference on Neural Information Processing Systems},
  year = {2019}
}
%%%%% chapter 16------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......