Commit d9c7e8d9 by zengxin


parent 2b0aa7b3
@@ -1257,7 +1257,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\vspace{0.5em}
\item The use of attention mechanisms is one of the key factors behind the recent success of machine translation, and of natural language processing as a whole\upcite{bahdanau2014neural,DBLP:journals/corr/LuongPM15}. Early on, researchers tried to unify attention with the word alignment used in statistical machine translation\upcite{WangNeural,He2016ImprovedNM,li-etal-2019-word}. More recently, a large body of work has improved the attention mechanism itself, for example by building translation models on self-attention\upcite{vaswani2017attention}, and improving attention models has become one of the hot topics in natural language processing. {\chapterfifteen} discusses different attention models for machine translation in more detail.
\vspace{0.5em}
\item In general, the computation performed by neural machine translation involves no human intervention, and the translation process cannot be directly explained with human knowledge. An interesting direction is therefore to introduce prior knowledge into neural machine translation so that the system behaves more ``like'' a human. For example, syntax trees can be used to inject human linguistic knowledge\upcite{Yang2017TowardsBH,Wang2019TreeTI}, and syntax-based neural machine translation also involves extensive modeling with tree-structured neural networks\upcite{DBLP:journals/corr/abs-1809-01854,DBLP:journals/corr/abs-1808-09374}. In addition, user-defined dictionaries or translation memories can be incorporated into the translation process\upcite{DBLP:journals/corr/ZhangZ16c,zhang-etal-2017-prior,duan-etal-2020-bilingual,cao-xiong-2018-encoding}, so that user constraints are reflected directly in the translation output. Many other kinds of prior knowledge, such as word alignments\upcite{li-etal-2019-word,DBLP:conf/emnlp/MiWI16,DBLP:conf/coling/LiuUFS16} and document-level information\upcite{Werlen2018DocumentLevelNM,DBLP:journals/corr/abs-1805-10163,DBLP:conf/acl/LiLWJXZLL20}, can also be exploited in neural machine translation.
\end{itemize}
@@ -587,7 +587,7 @@ Transformer Deep(48 layers) & 30.2 & 43.1 & 194$\times 10^
\begin{itemize}
\vspace{0.5em}
\item In the past two years, studies have found that attention mechanisms can capture certain linguistic phenomena\upcite{DBLP:journals/corr/abs-1905-09418}. In Transformer multi-head attention, for instance, different heads tend to capture different kinds of information: some heads are more sensitive to low-frequency words, some are better suited to word sense disambiguation, and some even capture syntactic information. Moreover, since attention adds to model complexity and deeper networks introduce considerable redundancy into neural machine translation, developing lightweight attention models is also a direction of practical value\upcite{Xiao2019SharingAW,DBLP:journals/corr/abs-1805-00631,Lin2020WeightDT,DBLP:conf/iclr/WuLLLH20,Kitaev2020ReformerTE,DBLP:journals/corr/abs-2005-00743,dai-etal-2019-transformer,DBLP:journals/corr/abs-2004-05150,DBLP:conf/iclr/RaePJHL20}
\vspace{0.5em}
\item Neural machine translation relies on expensive GPU hardware, so model pruning and acceleration are of great interest to system developers. From an engineering perspective, one can reduce computational intensity, for example by computing with low-precision floating-point numbers\upcite{Ott2018ScalingNM} or integers\upcite{DBLP:journals/corr/abs-1906-00532,Lin2020TowardsF8}, or by introducing caching to speed up model inference\upcite{Vaswani2018Tensor2TensorFN}; the overall model size can also be reduced by pruning the parameter matrices\upcite{DBLP:journals/corr/SeeLM16}; another approach is knowledge distillation\upcite{Hinton2015Distilling,kim-rush-2016-sequence}, in which a large model is used to train a small one, often yielding better results than training the small model alone\upcite{DBLP:journals/corr/ChenLCL17} (see the sketch after this excerpt)
\vspace{0.5em}
......
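The knowledge distillation mentioned in the last item above can be made concrete with a small example. Below is a minimal PyTorch sketch of the word-level variant, assuming teacher and student models that return per-token logits of shape [batch, length, vocab]; the function name and the hyper-parameters alpha and temperature are illustrative assumptions, not code from the cited papers or from this repository.

# Minimal sketch of word-level knowledge distillation for NMT (illustrative only).
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, gold_ids, pad_id,
                      alpha=0.5, temperature=2.0):
    """student_logits, teacher_logits: [batch, length, vocab]; gold_ids: [batch, length]."""
    vocab = student_logits.size(-1)
    # Standard cross-entropy against the reference translation (padding ignored).
    ce = F.cross_entropy(student_logits.reshape(-1, vocab),
                         gold_ids.reshape(-1),
                         ignore_index=pad_id)
    # KL divergence between temperature-softened student and teacher distributions.
    t = temperature
    kl = F.kl_div(F.log_softmax(student_logits / t, dim=-1),
                  F.softmax(teacher_logits / t, dim=-1),
                  reduction='none').sum(-1)
    mask = gold_ids.ne(pad_id).float()
    kl = (kl * mask).sum() / mask.sum() * (t * t)
    # Interpolate the two signals; alpha = 1 recovers ordinary training.
    return alpha * ce + (1.0 - alpha) * kl

Sequence-level distillation\upcite{kim-rush-2016-sequence} takes a different route: the student is trained directly on the teacher's beam-search outputs, so no KL term against the teacher distribution is needed.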
@@ -4337,6 +4337,43 @@ year = {2012}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 10------------------------------------------------------
@inproceedings{DBLP:conf/acl/LiLWJXZLL20,
author = {Bei Li and
Hui Liu and
Ziyang Wang and
Yufan Jiang and
Tong Xiao and
Jingbo Zhu and
Tongran Liu and
Changliang Li},
title = {Does Multi-Encoder Help? {A} Case Study on Context-Aware Neural Machine
Translation},
pages = {3512--3518},
publisher = {Association for Computational Linguistics},
year = {2020}
}
@inproceedings{DBLP:conf/emnlp/MiWI16,
author = {Haitao Mi and
Zhiguo Wang and
Abe Ittycheriah},
title = {Supervised Attentions for Neural Machine Translation},
pages = {2283--2288},
publisher = {The Association for Computational Linguistics},
year = {2016}
}
@inproceedings{DBLP:conf/coling/LiuUFS16,
author = {Lemao Liu and
Masao Utiyama and
Andrew M. Finch and
Eiichiro Sumita},
title = {Neural Machine Translation with Supervised Attention},
pages = {3093--3102},
publisher = {The Association for Computational Linguistics},
year = {2016}
}
@inproceedings{devlin-etal-2014-fast,
  author = {Jacob Devlin and
            Rabih Zbib and
@@ -4378,13 +4415,15 @@ year = {2012}
  year = {1998},
}
@article{BENGIO1994Learning,
  author = {Yoshua Bengio and
            Patrice Y. Simard and
            Paolo Frasconi},
  title = {Learning long-term dependencies with gradient descent is difficult},
  journal = {Institute of Electrical and Electronics Engineers},
  volume = {5},
  number = {2},
  pages = {157--166},
  year = {1994}
}
@inproceedings{NIPS2017_7181,
  author = {Ashish Vaswani and
@@ -4460,7 +4499,7 @@ pages ={157-166},
  title = {Learning Deep Transformer Models for Machine Translation},
  pages = {1810--1822},
  publisher = {Association for Computational Linguistics},
  year = {2019}
}
@article{Li2020NeuralMT,
  author = {Yanyang Li and
@@ -4860,21 +4899,24 @@ pages ={157-166},
  year={2018}
}
@inproceedings{Lin2020TowardsF8,
  author = {Ye Lin and
            Yanyang Li and
            Tengbo Liu and
            Tong Xiao and
            Tongran Liu and
            Jingbo Zhu},
  title = {Towards Fully 8-bit Integer Inference for the Transformer Model},
  pages = {3759--3765},
  publisher = {International Joint Conference on Artificial Intelligence},
  year = {2020}
}
@inproceedings{kim-rush-2016-sequence,
  author = {Yoon Kim and
            Alexander M. Rush},
  title = {Sequence-Level Knowledge Distillation},
  pages = {1317--1327},
  publisher = {The Association for Computational Linguistics},
  year = {2016}
}
@article{Akaike1969autoregressive,
  author = {Hirotugu Akaike},
@@ -4914,16 +4956,14 @@ pages ={157-166},
  year={2018}
}
@inproceedings{cho-etal-2014-properties,
  author = {Kyunghyun Cho and
            Bart van Merrienboer and
            Dzmitry Bahdanau and
            Yoshua Bengio},
  title = {On the Properties of Neural Machine Translation: Encoder-Decoder Approaches},
  pages = {103--111},
  publisher = {Association for Computational Linguistics},
  year = {2014}
}
@inproceedings{DBLP:conf/acl/JeanCMB15,
@@ -4948,10 +4988,14 @@ pages ={157-166},
  year = {2015}
}
@inproceedings{He2016ImprovedNM,
  author = {Wei He and
            Zhongjun He and
            Hua Wu and
            Haifeng Wang},
  title = {Improved Neural Machine Translation with {SMT} Features},
  pages = {151--157},
  publisher = {the Association for the Advance of Artificial Intelligence},
  year = {2016}
}
@inproceedings{zhang-etal-2017-prior,
  title = {Prior Knowledge Integration for Neural Machine Translation using Posterior Regularization},
@@ -4966,45 +5010,40 @@ pages ={157-166},
}
@inproceedings{duan-etal-2020-bilingual,
  author = {Xiangyu Duan and
            Baijun Ji and
            Hao Jia and
            Min Tan and
            Min Zhang and
            Boxing Chen and
            Weihua Luo and
            Yue Zhang},
  title = {Bilingual Dictionary Based Neural Machine Translation without Using Parallel Sentences},
  pages = {1570--1579},
  publisher = {Association for Computational Linguistics},
  year = {2020}
}
@inproceedings{cao-xiong-2018-encoding,
  author = {Qian Cao and
            Deyi Xiong},
  title = {Encoding Gated Translation Memory into Neural Machine Translation},
  pages = {3042--3047},
  publisher = {Association for Computational Linguistics},
  year = {2018}
}
@inproceedings{yang-etal-2016-hierarchical,
  author = {Zichao Yang and
            Diyi Yang and
            Chris Dyer and
            Xiaodong He and
            Alexander J. Smola and
            Eduard H. Hovy},
  title = {Hierarchical Attention Networks for Document Classification},
  pages = {1480--1489},
  publisher = {The Association for Computational Linguistics},
  year = {2016}
}
%%%%% chapter 10------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -5014,9 +5053,6 @@ pages ={157-166},
@inproceedings{DBLP:conf/naacl/Johnson015,
  author = {Rie Johnson and
            Tong Zhang},
  title = {Effective Use of Word Order for Text Categorization with Convolutional
           Neural Networks},
  pages = {103--112},
@@ -5027,10 +5063,6 @@ pages ={157-166},
@inproceedings{DBLP:conf/naacl/NguyenG15,
  author = {Thien Huu Nguyen and
            Ralph Grishman},
  title = {Relation Extraction: Perspective from Convolutional Neural Networks},
  pages = {39--48},
  publisher = {The Association for Computational Linguistics},
@@ -5411,6 +5443,51 @@ pages ={157-166},
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 12------------------------------------------------------
@inproceedings{DBLP:conf/iclr/RaePJHL20,
author = {Jack W. Rae and
Anna Potapenko and
Siddhant M. Jayakumar and
Chloe Hillier and
Timothy P. Lillicrap},
title = {Compressive Transformers for Long-Range Sequence Modelling},
publisher = {OpenReview.net},
year = {2020}
}
@article{DBLP:journals/corr/abs-2004-05150,
author = {Iz Beltagy and
Matthew E. Peters and
Arman Cohan},
title = {Longformer: The Long-Document Transformer},
journal = {CoRR},
volume = {abs/2004.05150},
year = {2020}
}
@article{DBLP:journals/corr/abs-2005-00743,
author = {Yi Tay and
Dara Bahri and
Donald Metzler and
Da-Cheng Juan and
Zhe Zhao and
Che Zheng},
title = {Synthesizer: Rethinking Self-Attention in Transformer Models},
journal = {CoRR},
volume = {abs/2005.00743},
year = {2020}
}
@inproceedings{DBLP:conf/iclr/WuLLLH20,
author = {Zhanghao Wu and
Zhijian Liu and
Ji Lin and
Yujun Lin and
Song Han},
title = {Lite Transformer with Long-Short Range Attention},
publisher = {OpenReview.net},
year = {2020}
}
@inproceedings{DBLP:journals/corr/abs-1905-09418,
  author = {Elena Voita and
            David Talbot and
@@ -5506,18 +5583,16 @@ pages ={157-166},
}
@inproceedings{dai-etal-2019-transformer,
  author = {Zihang Dai and
            Zhilin Yang and
            Yiming Yang and
            Jaime G. Carbonell and
            Quoc Viet Le and
            Ruslan Salakhutdinov},
  title = {Transformer-XL: Attentive Language Models beyond a Fixed-Length Context},
  pages = {2978--2988},
  publisher = {Association for Computational Linguistics},
  year = {2019}
}
@article{Liu2020LearningTE,
  title={Learning to Encode Position for Transformer with Continuous Dynamical Model},
@@ -5563,10 +5638,15 @@ pages ={157-166},
  year={2018}
}
@inproceedings{Dou2018ExploitingDR,
  author = {Zi-Yi Dou and
            Zhaopeng Tu and
            Xing Wang and
            Shuming Shi and
            Tong Zhang},
  title = {Exploiting Deep Representations for Neural Machine Translation},
  pages = {4253--4262},
  publisher = {Association for Computational Linguistics},
  year = {2018}
}
@inproceedings{Wang2019ExploitingSC,
  title={Exploiting Sentential Context for Neural Machine Translation},
@@ -5576,10 +5656,16 @@ pages ={157-166},
}
@inproceedings{Dou2019DynamicLA,
  author = {Zi-Yi Dou and
            Zhaopeng Tu and
            Xing Wang and
            Longyue Wang and
            Shuming Shi and
            Tong Zhang},
  title = {Dynamic Layer Aggregation for Neural Machine Translation with Routing-by-Agreement},
  pages = {86--93},
  publisher = {the Association for the Advance of Artificial Intelligence},
  year = {2019}
}
@inproceedings{Wei2020MultiscaleCD,
  title={Multiscale Collaborative Deep Models for Neural Machine Translation},
@@ -5614,7 +5700,7 @@ pages ={157-166},
@article{li2020shallow,
  title={Shallow-to-Deep Training for Neural Machine Translation},
  author={Li, Bei and Wang, Ziyang and Liu, Hui and Jiang, Yufan and Du, Quan and Xiao, Tong and Wang, Huizhen and Zhu, Jingbo},
  publisher={Conference on Empirical Methods in Natural Language Processing},
  year={2020}
}
%%%%% chapter 12------------------------------------------------------
......