Commit 9e54756f by 曹润柘

update chapter16

parent 208e4099
......@@ -272,7 +272,7 @@ g_{t} = \sigma (w^{T}s_{t}^{TM} + b)
\label{eq:16-7-xc}
\end{eqnarray}
\parinterval Equation~\ref{eq:16-7-xc} naturally ties together the two translation models $\textrm{P}(\mathbf t|\mathbf s)$ and $\textrm{P}(\mathbf s|\mathbf t)$ and the two language models $\textrm{P}(\mathbf s)$ and $\textrm{P}(\mathbf t)$: $\textrm{P}(\mathbf s)\textrm{P}(\mathbf t|\mathbf s)$ should be close to $\textrm{P}(\mathbf t)\textrm{P}(\mathbf s|\mathbf t)$, because both express the same joint distribution $\textrm{P}(\mathbf s,\mathbf t)$. Therefore, when constructing the training objective for the two translation directions, besides the maximum likelihood objectives each direction uses when trained on its own, an additional term can be added to encourage the two translation models to satisfy Equation~\ref{eq:7-32}:
\parinterval Equation~\ref{eq:16-7-xc} naturally ties together the two translation models $\textrm{P}(\mathbf t|\mathbf s)$ and $\textrm{P}(\mathbf s|\mathbf t)$ and the two language models $\textrm{P}(\mathbf s)$ and $\textrm{P}(\mathbf t)$: $\textrm{P}(\mathbf s)\textrm{P}(\mathbf t|\mathbf s)$ should be close to $\textrm{P}(\mathbf t)\textrm{P}(\mathbf s|\mathbf t)$, because both express the same joint distribution $\textrm{P}(\mathbf s,\mathbf t)$. Therefore, when constructing the training objective for the two translation directions, besides the maximum likelihood objectives each direction uses when trained on its own, an additional term can be added to encourage the two translation models to satisfy Equation~\ref{eq:16-8-xc}:
\begin{eqnarray}
\mathcal{L} = (\textrm{log P}(\mathbf s) + \textrm{log P}(\mathbf t|\mathbf s) - \textrm{log P}(\mathbf t) - \textrm{log P}(\mathbf s|\mathbf t))^{2}
\label{eq:16-8-xc}
......@@ -294,7 +294,7 @@ g_{t} = \sigma (w^{T}s_{t}^{TM} + b)
\label{eq:16-9-xc}
\end{eqnarray}
\noindent Equation~\ref{eq:16-9-xc} assumes that $\textrm{P}(\mathbf s|\mathbf t)=\textrm{P}(\mathbf s|\mathbf s,\mathbf t)$. This assumption clearly holds, because once the translation of a sentence is known, the source sentence itself is not needed in order to translate it back. Directly optimizing (maximizing) the right-hand side of Equation~\ref{eq:16-9-xc} amounts to imposing a {\small\sffamily\bfnew{cycle consistency}}\index{循环一致性}\index{Cycle Consistency} constraint\cite{DBLP:conf/iccv/ZhuPIE17} on $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$: for a sentence $\mathbf s$, after it is translated into $\mathbf t$ by $\textrm{P}(\mathbf t|\mathbf s)$, it should be possible to recover $\mathbf s$ from $\mathbf t$ according to $\textrm{P}(\mathbf s|\mathbf t)$, as illustrated in Figure~\ref{fig:7-43}. Equation~\ref{eq:16-9-xc} thus gives one form of objective function for optimizing $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$ jointly. An additional benefit of this objective is that it essentially learns a language model $\textrm{P}(\mathbf s)$ built from $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$; since learning $\textrm{P}(\mathbf s)$ relies only on monolingual data, this objective can naturally use large amounts of monolingual data to train both translation models at the same time. The same conclusion extends to $\textrm{P}(\mathbf t)$\cite{DBLP:conf/nips/HeXQWYLM16}.
\noindent Equation~\ref{eq:16-9-xc} assumes that $\textrm{P}(\mathbf s|\mathbf t)=\textrm{P}(\mathbf s|\mathbf s,\mathbf t)$. This assumption clearly holds, because once the translation of a sentence is known, the source sentence itself is not needed in order to translate it back. Directly optimizing (maximizing) the right-hand side of Equation~\ref{eq:16-9-xc} amounts to imposing a {\small\sffamily\bfnew{cycle consistency}}\index{循环一致性}\index{Cycle Consistency} constraint\cite{DBLP:conf/iccv/ZhuPIE17} on $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$: for a sentence $\mathbf s$, after it is translated into $\mathbf t$ by $\textrm{P}(\mathbf t|\mathbf s)$, it should be possible to recover $\mathbf s$ from $\mathbf t$ according to $\textrm{P}(\mathbf s|\mathbf t)$, as illustrated in Figure~\ref{fig:16-7-xc}. Equation~\ref{eq:16-9-xc} thus gives one form of objective function for optimizing $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$ jointly. An additional benefit of this objective is that it essentially learns a language model $\textrm{P}(\mathbf s)$ built from $\textrm{P}(\mathbf s|\mathbf t)$ and $\textrm{P}(\mathbf t|\mathbf s)$; since learning $\textrm{P}(\mathbf s)$ relies only on monolingual data, this objective can naturally use large amounts of monolingual data to train both translation models at the same time. The same conclusion extends to $\textrm{P}(\mathbf t)$\cite{DBLP:conf/nips/HeXQWYLM16}.
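\parinterval To make the two objectives above more concrete, the following sketch is given; it is not part of the book, and every helper name (\texttt{log\_p\_s}, \texttt{log\_p\_t}, \texttt{log\_p\_t\_given\_s}, \texttt{log\_p\_s\_given\_t}, \texttt{sample\_t\_given\_s}) as well as the weights \texttt{alpha} and \texttt{beta} are hypothetical stand-ins for the two language models and the two translation models. It only illustrates how the agreement penalty of Equation~\ref{eq:16-8-xc} and a round-trip objective in the spirit of Equation~\ref{eq:16-9-xc} could be computed from model log-probabilities.

```python
# A minimal illustrative sketch, not from the book: every helper below is a
# hypothetical callable.  log_p_s / log_p_t stand for monolingual language
# models, log_p_t_given_s / log_p_s_given_t for the two translation models,
# and sample_t_given_s draws a candidate translation t for a monolingual s.

def agreement_penalty(s, t, log_p_s, log_p_t, log_p_t_given_s, log_p_s_given_t):
    """Squared gap between the two factorizations of log P(s, t), i.e. the
    extra term added to the two maximum likelihood objectives."""
    gap = (log_p_s(s) + log_p_t_given_s(t, s)
           - log_p_t(t) - log_p_s_given_t(s, t))
    return gap ** 2


def round_trip_loss(s, sample_t_given_s, log_p_s_given_t, log_p_t, beta=0.5):
    """Cycle-consistency objective for a monolingual sentence s: translate s
    into t, then reward target-side fluency and reconstruction of s from t."""
    t = sample_t_given_s(s)  # forward translation of the monolingual sentence
    reward = beta * log_p_t(t) + (1.0 - beta) * log_p_s_given_t(s, t)
    return -reward           # minimizing this maximizes the round-trip reward
```

Note that the sampling step makes the round-trip objective non-differentiable with respect to the forward model $\textrm{P}(\mathbf t|\mathbf s)$, which is why dual learning typically resorts to policy-gradient style training\cite{DBLP:conf/nips/SuttonMSM99,DBLP:conf/nips/HeXQWYLM16}.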
%----------------------------------------------
\begin{figure}[htp]
......
......@@ -5427,6 +5427,88 @@ pages ={157-166},
journal = {CoRR},
year = {2020}
}
@inproceedings{song2019mass,
author = {Kaitao Song and
Xu Tan and
Tao Qin and
Jianfeng Lu and
Tie{-}Yan Liu},
title = {{MASS:} Masked Sequence to Sequence Pre-training for Language Generation},
volume = {97},
pages = {5926--5936},
publisher = {{PMLR}},
year = {2019}
}
@article{DBLP:journals/corr/Ruder17a,
author = {Sebastian Ruder},
title = {An Overview of Multi-Task Learning in Deep Neural Networks},
journal = {CoRR},
volume = {abs/1706.05098},
year = {2017}
}
@inproceedings{DBLP:conf/emnlp/DomhanH17,
author = {Tobias Domhan and
Felix Hieber},
title = {Using Target-side Monolingual Data for Neural Machine Translation
through Multi-task Learning},
pages = {1500--1505},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2017}
}
@inproceedings{DBLP:conf/icml/XiaQCBYL17,
author = {Yingce Xia and
Tao Qin and
Wei Chen and
Jiang Bian and
Nenghai Yu and
Tie{-}Yan Liu},
title = {Dual Supervised Learning},
volume = {70},
pages = {3789--3798},
publisher = {{PMLR}},
year = {2017}
}
@inproceedings{DBLP:conf/iccv/ZhuPIE17,
author = {Jun{-}Yan Zhu and
Taesung Park and
Phillip Isola and
Alexei A. Efros},
title = {Unpaired Image-to-Image Translation Using Cycle-Consistent Adversarial
Networks},
pages = {2242--2251},
publisher = {{IEEE} Computer Society},
year = {2017}
}
@inproceedings{DBLP:conf/nips/HeXQWYLM16,
author = {Di He and
Yingce Xia and
Tao Qin and
Liwei Wang and
Nenghai Yu and
Tie{-}Yan Liu and
Wei{-}Ying Ma},
title = {Dual Learning for Machine Translation},
  pages = {820--828},
  publisher = {Annual Conference on Neural Information Processing Systems},
  year = {2016}
}
@inproceedings{DBLP:conf/nips/SuttonMSM99,
author = {Richard S. Sutton and
David A. McAllester and
Satinder P. Singh and
Yishay Mansour},
title = {Policy Gradient Methods for Reinforcement Learning with Function Approximation},
pages = {1057--1063},
publisher = {The {MIT} Press},
year = {1999}
}
@inproceedings{lample2019cross,
author = {Alexis Conneau and
Guillaume Lample},
title = {Cross-lingual Language Model Pretraining},
  pages = {7057--7067},
  publisher = {Annual Conference on Neural Information Processing Systems},
  year = {2019}
}
%%%%% chapter 16------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......