Commit 5ce55d6b by 曹润柘

合并分支 'caorunzhe' 到 'master'

Caorunzhe

查看合并请求 !637
parents fb3ffad9 e552711b
......@@ -744,14 +744,16 @@ P(\mathbi{y}|\mathbi{x}) & = & \frac{\mathrm{cos}(\mathbi{x},\mathbi{y})/\tau}{\
\label{fig:16-19}
\end{figure}
%----------------------------------------------
\begin{itemize}
\vspace{0.5em}
\item {\small\bfnew{模型参数初始化}}。无监督神经机器翻译的关键在于如何提供最开始的监督信号,从而启动后续的迭代流程。无监督词典归纳已经可以提供一些可靠的监督信号,那么如何在模型初始化中融入这些信息?既然神经机器翻译模型都使用词嵌入层作为输入,而无监督词典归纳总是先把两个语言各自的单语词嵌入映射到一个空间后才归纳双语词典,那么可以使用这些映射后的词嵌入来初始化模型的词嵌入层,然后在这个基础上训练模型,因为这些映射后的词嵌入天然就包含了大量的监督信号,比如,两个语言里意思相近的词对应的词嵌入会比其他词更靠近对方\upcite{DBLP:journals/ipm/FarhanTAJATT20}。 为了防止训练过程中模型参数的更新会破坏词嵌入当中的词对齐信息,通常初始化后会固定模型的词嵌入层不让其更新\upcite{DBLP:conf/emnlp/ArtetxeLA18}
\noindent{\small\bfnew{1)模型参数初始化}}
\parinterval 无监督神经机器翻译的关键在于如何提供最开始的监督信号,从而启动后续的迭代流程。无监督词典归纳已经可以提供一些可靠的监督信号,那么如何在模型初始化中融入这些信息?既然神经机器翻译模型都使用词嵌入层作为输入,而无监督词典归纳总是先把两个语言各自的单语词嵌入映射到一个空间后才归纳双语词典,那么可以使用这些映射后的词嵌入来初始化模型的词嵌入层,然后在这个基础上训练模型,因为这些映射后的词嵌入天然就包含了大量的监督信号,比如,两个语言里意思相近的词对应的词嵌入会比其他词更靠近对方\upcite{DBLP:journals/ipm/FarhanTAJATT20}。 为了防止训练过程中模型参数的更新会破坏词嵌入当中的词对齐信息,通常初始化后会固定模型的词嵌入层不让其更新\upcite{DBLP:conf/emnlp/ArtetxeLA18}。
\parinterval 进一步,无监督神经机器翻译能在提供更少监督信号的情况下启动,也就是可以去除无监督词典归纳这一步骤\upcite{DBLP:conf/nips/ConneauL19}。这时候模型的初始化直接使用共享词表的预训练模型的参数作为起始点。这个预训练模型直接使用前面提到的预训练方法(如MASS)进行训练,区别在于模型的大小如宽度和深度需要严格匹配翻译模型。此外,这个模型不只在一个语言的单语数据上进行训练,而是同时在两个语言的单语数据上进行训练,并且两个语言的词表进行共享。前面提到,在共享词表特别是共享子词词表的情况下,已经隐式地告诉模型源语言和目标语言里一样的(子)词互为翻译,相当于模型使用了少量的监督信号。在这个基础上使用两个语言的单语数据进行预训练,则通过模型共享进一步挖掘了语言之间共通的部分。因此,使用预训练模型进行初始化后,无监督神经机器翻译模型已经得到大量的监督信号,从而得以不断通过优化来提升模型性能。
\vspace{0.5em}
\item {\small\bfnew{语言模型的使用}}。无监督神经机器翻译的一个重要部分就是来自语言模型的目标函数。因为翻译模型本质上是在完成文本生成任务,所以只有文本生成类型的语言模型建模方法才可以应用到无监督神经机器翻译里。比如,经典的给定前文预测下一词就是一个典型的自回归生成任务(见{\chaptertwo}),因此可以运用到无监督神经机器翻译里。但是,目前在预训练里流行的BERT等模型是掩码语言模型\upcite{devlin2019bert},就不能直接在无监督神经翻译里使用。
\noindent{\small\bfnew{2)语言模型的使用}}
\parinterval 无监督神经机器翻译的一个重要部分就是来自语言模型的目标函数。因为翻译模型本质上是在完成文本生成任务,所以只有文本生成类型的语言模型建模方法才可以应用到无监督神经机器翻译里。比如,经典的给定前文预测下一词就是一个典型的自回归生成任务(见{\chaptertwo}),因此可以运用到无监督神经机器翻译里。但是,目前在预训练里流行的BERT等模型是掩码语言模型\upcite{devlin2019bert},就不能直接在无监督神经机器翻译里使用。
\parinterval 另外一个在无监督神经机器翻译中比较常见的语言模型目标函数则是降噪自编码器。它也是文本生成类型的语言模型建模方法。对于一个句子$\seq{x}$,首先使用一个噪声函数$\seq{x}^{'}=\mathrm{noise}(\seq{x})$ 来对$\seq{x}$注入噪声,产生一个质量较差的句子$\seq{x}^{'}$。然后,让模型学习如何从$\seq{x}^{'}$还原出$\seq{x}$。这样一个目标函数比预测下一词更贴近翻译任务,因为它是一个序列到序列的映射,并且输入、输出两个序列在语义上是等价的。这里之所以采用$\seq{x}^{'}$而不是$\seq{x}$自己来预测$\seq{x}$,是因为模型可以通过简单的复制输入作为输出来完成从$\seq{x}$预测$\seq{x}$的任务,并且在输入中注入噪声会让模型更加健壮,因此模型可以通过训练集数据学会如何利用句子中噪声以外的信息来忽略其中噪声并得到正确的输出。通常来说,噪声函数$\mathrm{noise}$有三种形式,如表\ref{tab:16-1}所示。
%----------------------------------------------
......@@ -770,8 +772,7 @@ P(\mathbi{y}|\mathbi{x}) & = & \frac{\mathrm{cos}(\mathbi{x},\mathbi{y})/\tau}{\
%----------------------------------------------
\parinterval 实际当中三种形式的噪声函数都会被使用到,其中在交换方法中越相近的词越容易被交换,并且保证被交换的词的对数有限,而删除和空白方法里词的删除和替换概率通常都非常低,如$0.1$等。
\vspace{0.5em}
\end{itemize}
%----------------------------------------------------------------------------------------
% NEW SECTION 16.5
%----------------------------------------------------------------------------------------
......
......@@ -5936,6 +5936,275 @@ author = {Yoshua Bengio and
year = {2012}
}
@article{JMLR:v15:srivastava14a,
author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
journal = {Journal of Machine Learning Research},
year = {2014},
volume = {15},
pages = {1929--1958},
}
@inproceedings{DBLP:conf/amta/MullerRS20,
author = {Mathias M{\"{u}}ller and
Annette Rios and
Rico Sennrich},
title = {Domain Robustness in Neural Machine Translation},
pages = {151--164},
publisher = {Association for Machine Translation in the Americas},
year = {2020}
}
@inproceedings{DBLP:conf/sp/Carlini017,
author = {Nicholas Carlini and
David A. Wagner},
title = {Towards Evaluating the Robustness of Neural Networks},
pages = {39--57},
publisher = {IEEE Symposium on Security and Privacy},
year = {2017}
}
@inproceedings{DBLP:conf/cvpr/Moosavi-Dezfooli16,
author = {Seyed-Mohsen Moosavi-Dezfooli and
Alhussein Fawzi and
Pascal Frossard},
title = {DeepFool: {A} Simple and Accurate Method to Fool Deep Neural Networks},
pages = {2574--2582},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2016}
}
@inproceedings{DBLP:conf/acl/ChengJM19,
author = {Yong Cheng and
Lu Jiang and
Wolfgang Macherey},
title = {Robust Neural Machine Translation with Doubly Adversarial Inputs},
pages = {4324--4333},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/cvpr/NguyenYC15,
author = {Anh Mai Nguyen and
Jason Yosinski and
Jeff Clune},
title = {Deep neural networks are easily fooled: High confidence predictions
for unrecognizable images},
pages = {427--436},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2015}
}
@inproceedings{DBLP:journals/corr/SzegedyZSBEGF13,
author = {Christian Szegedy and
Wojciech Zaremba and
Ilya Sutskever and
Joan Bruna and
Dumitru Erhan and
Ian J. Goodfellow and
Rob Fergus},
title = {Intriguing properties of neural networks},
publisher = {International Conference on Learning Representations},
year = {2014}
}
@inproceedings{DBLP:journals/corr/GoodfellowSS14,
author = {Ian J. Goodfellow and
Jonathon Shlens and
Christian Szegedy},
title = {Explaining and Harnessing Adversarial Examples},
publisher = {International Conference on Learning Representations},
year = {2015}
}
@inproceedings{DBLP:conf/emnlp/JiaL17,
author = {Robin Jia and
Percy Liang},
title = {Adversarial Examples for Evaluating Reading Comprehension Systems},
pages = {2021--2031},
publisher = {Conference on Empirical Methods in Natural Language Processing},
year = {2017}
}
@inproceedings{DBLP:conf/emnlp/BekoulisDDD18,
author = {Giannis Bekoulis and
Johannes Deleu and
Thomas Demeester and
Chris Develder},
title = {Adversarial training for multi-context joint entity and relation extraction},
pages = {2830--2836},
publisher = {Conference on Empirical Methods in Natural Language Processing},
year = {2018}
}
@inproceedings{DBLP:conf/naacl/YasunagaKR18,
author = {Michihiro Yasunaga and
Jungo Kasai and
Dragomir R. Radev},
title = {Robust Multilingual Part-of-Speech Tagging via Adversarial Training},
pages = {976--986},
publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
year = {2018}
}
@inproceedings{DBLP:conf/iclr/BelinkovB18,
author = {Yonatan Belinkov and
Yonatan Bisk},
title = {Synthetic and Natural Noise Both Break Neural Machine Translation},
publisher = {International Conference on Learning Representations},
year = {2018}
}
@inproceedings{DBLP:conf/naacl/MichelLNP19,
author = {Paul Michel and
Xian Li and
Graham Neubig and
Juan Miguel Pino},
title = {On Evaluation of Adversarial Perturbations for Sequence-to-Sequence
Models},
pages = {3103--3114},
publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
year = {2019}
}
@article{Gong2018AdversarialTW,
title={Adversarial Texts with Gradient Methods},
author={Zhitao Gong and Wenlu Wang and Bo Li and Dawn Song and Wei-Shinn Ku},
journal={ArXiv},
year={2018},
volume={abs/1801.07175}
}
@inproceedings{DBLP:conf/naacl/VaibhavSSN19,
author = {Vaibhav and
Sumeet Singh and
Craig Stewart and
Graham Neubig},
title = {Improving Robustness of Machine Translation with Synthetic Noise},
pages = {1916--1920},
publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/naacl/AnastasopoulosL19,
author = {Antonios Anastasopoulos and
Alison Lui and
Toan Q. Nguyen and
David Chiang},
title = {Neural Machine Translation of Text from Non-Native Speakers},
pages = {3070--3080},
publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/acl/SinghGR18,
author = {Marco T{\'{u}}lio Ribeiro and
Sameer Singh and
Carlos Guestrin},
title = {Semantically Equivalent Adversarial Rules for Debugging {NLP} models},
pages = {856--865},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2018}
}
@article{DBLP:journals/corr/SamantaM17,
author = {Suranjana Samanta and
Sameep Mehta},
title = {Towards Crafting Text Adversarial Samples},
journal = {CoRR},
volume = {abs/1707.02812},
year = {2017}
}
@inproceedings{DBLP:conf/ijcai/0002LSBLS18,
author = {Bin Liang and
Hongcheng Li and
Miaoqiang Su and
Pan Bian and
Xirong Li and
Wenchang Shi},
title = {Deep Text Classification Can be Fooled},
pages = {4208--4215},
publisher = {International Joint Conference on Artificial Intelligence},
year = {2018}
}
@inproceedings{DBLP:conf/coling/EbrahimiLD18,
author = {Javid Ebrahimi and
Daniel Lowd and
Dejing Dou},
title = {On Adversarial Examples for Character-Level Neural Machine Translation},
pages = {653--663},
publisher = {International Conference on Computational Linguistics},
year = {2018}
}
@inproceedings{DBLP:conf/iclr/ZhaoDS18,
author = {Zhengli Zhao and
Dheeru Dua and
Sameer Singh},
title = {Generating Natural Adversarial Examples},
publisher = {International Conference on Learning Representations},
year = {2018}
}
@inproceedings{DBLP:conf/acl/LiuTMCZ18,
author = {Yong Cheng and
Zhaopeng Tu and
Fandong Meng and
Junjie Zhai and
Yang Liu},
title = {Towards Robust Neural Machine Translation},
pages = {1756--1766},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2018}
}
@inproceedings{DBLP:conf/acl/LiuMHXH19,
author = {Hairong Liu and
Mingbo Ma and
Liang Huang and
Hao Xiong and
Zhongjun He},
title = {Robust Neural Machine Translation with Joint Textual and Phonetic
Embedding},
pages = {3044--3049},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2019}
}
@inproceedings{DBLP:conf/acl/LiLWJXZLL20,
author = {Bei Li and
Hui Liu and
Ziyang Wang and
Yufan Jiang and
Tong Xiao and
Jingbo Zhu and
Tongran Liu and
Changliang Li},
title = {Does Multi-Encoder Help? {A} Case Study on Context-Aware Neural Machine
Translation},
pages = {3512--3518},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2020}
}
@techreport{chen1999gaussian,
title={A Gaussian prior for smoothing maximum entropy models},
author={Chen, Stanley F and Rosenfeld, Ronald},
year={1999},
institution={Carnegie Mellon University, School of Computer Science}
}
@inproceedings{DBLP:conf/emnlp/MichelN18,
author = {Paul Michel and
Graham Neubig},
title = {{MTNT:} {A} Testbed for Machine Translation of Noisy Text},
pages = {543--553},
publisher = {Conference on Empirical Methods in Natural Language Processing},
year = {2018}
}
%%%%% chapter 13------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论