\item As a core component of the Transformer model, the self-attention network has drawn wide attention from researchers in recent years, and many attempts have been made to design more efficient operations to replace it. For example, replacing the self-attention networks on both the encoder and the decoder side with dynamic convolutions preserves inference efficiency while achieving translation quality comparable to, or even slightly better than, the Transformer \upcite{Wu2019PayLA}; to speed up the Transformer on long input texts, the Reformer model, which replaces self-attention with locality-sensitive hashing, has also attracted considerable interest \upcite{Kitaev2020ReformerTE}. In addition, injecting extra encoding information into the self-attention network can further strengthen the model's representational power. For instance, relative position encodings with a fixed window size can be introduced \upcite{Shaw2018SelfAttentionWR,dai-etal-2019-transformer}, or position encodings can be learned from data using ideas from dynamical systems, which generalizes better \upcite{Liu2020LearningTE}. By visualizing the per-layer outputs of the Transformer, researchers have found that, from bottom to top, the layers successively focus on lexical, syntactic, and semantic representations \upcite{Jawahar2019WhatDB}; injecting local encoding information into the lower self-attention layers therefore helps the model abstract local features \upcite{Yang2018ModelingLF,DBLP:journals/corr/abs-1904-03107}. A sketch of the relative position formulation is given below.
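As a concrete illustration of the fixed-window relative position encoding mentioned above, the formulation of Shaw et al. \upcite{Shaw2018SelfAttentionWR} can be sketched as follows, in the notation of the cited paper: $x_i$ is the $i$-th input vector, $W^{Q}$, $W^{K}$, $W^{V}$ are the usual projection matrices, $d_z$ is the per-head dimension, $k$ is the clipping window, and $w^{K}$, $w^{V}$ are learned relative-position embeddings:
\begin{eqnarray}
e_{ij} & = & \frac{x_i W^{Q} \left( x_j W^{K} + a_{ij}^{K} \right)^{\top}}{\sqrt{d_z}} \nonumber \\
z_i & = & \sum_{j} \alpha_{ij} \left( x_j W^{V} + a_{ij}^{V} \right) \nonumber \\
a_{ij}^{K} & = & w^{K}_{\mathrm{clip}(j-i,k)}, \qquad a_{ij}^{V} = w^{V}_{\mathrm{clip}(j-i,k)}, \qquad \mathrm{clip}(x,k) = \max(-k, \min(k,x)) \nonumber
\end{eqnarray}
where $\alpha_{ij}$ is the softmax normalization of $e_{ij}$ over $j$. Positions whose distance exceeds $k$ share the same relative-position embedding, so the number of extra parameters stays small.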
title = {A Structured Self-Attentive Sentence Embedding},
publisher = {5th International Conference on Learning Representations},
year = {2017},
}
@inproceedings{Shaw2018SelfAttentionWR,
...
...
Ashish Vaswani},
title = {Self-Attention with Relative Position Representations},
publisher = {Proceedings of the 2018 Conference of the North American Chapter of
the Association for Computational Linguistics: Human Language Technologies},
pages = {464--468},
year = {2018},
}
@inproceedings{DBLP:journals/corr/HeZRS15,
...
...
Shaoqing Ren and
Jian Sun},
title = {Deep Residual Learning for Image Recognition},
publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
pages = {770--778},
year = {2016},
}
@article{JMLR:v15:srivastava14a,
...
...
Jonathon Shlens and
Zbigniew Wojna},
title = {Rethinking the Inception Architecture for Computer Vision},
publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
pages = {2818--2826},
year = {2016},
}
@inproceedings{DBLP:journals/corr/abs-1805-00631,
...
...
Jinsong Su},
title = {Accelerating Neural Transformer via an Average Attention Network},
publisher = {Proceedings of the 56th Annual Meeting of the Association for Computational
Linguistics},
pages = {1789--1798},
year = {2018},
}
@article{DBLP:journals/corr/CourbariauxB16,
...
...
volume = {abs/1602.02830},
year = {2016},
}
@inproceedings{Wu2019PayLA,
author = {Felix Wu and
Angela Fan and
Alexei Baevski and
Yann N. Dauphin and
Michael Auli},
title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
publisher = {7th International Conference on Learning Representations},
year = {2019},
}
@inproceedings{dai-etal-2019-transformer,
title = "Transformer-{XL}: Attentive Language Models beyond a Fixed-Length Context",
author = "Dai, Zihang and
Yang, Zhilin and
Yang, Yiming and
Carbonell, Jaime and
Le, Quoc and
Salakhutdinov, Ruslan",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
pages = "2978--2988",
}
@article{Liu2020LearningTE,
title={Learning to Encode Position for Transformer with Continuous Dynamical Model},
author={Xuanqing Liu and Hsiang-Fu Yu and I. Dhillon and Cho-Jui Hsieh},
journal={ArXiv},
year={2020},
volume={abs/2003.09229}
}
@inproceedings{Jawahar2019WhatDB,
title={What Does BERT Learn about the Structure of Language?},
author={Ganesh Jawahar and B. Sagot and Djam{\'e} Seddah},
publisher={Annual Meeting of the Association for Computational Linguistics},
year={2019}
}
@inproceedings{Yang2018ModelingLF,
title={Modeling Localness for Self-Attention Networks},
author={Baosong Yang and Zhaopeng Tu and Derek F. Wong and Fandong Meng and Lidia S. Chao and T. Zhang},
publisher={Conference on Empirical Methods in Natural Language Processing},
year={2018}
}
@inproceedings{DBLP:journals/corr/abs-1904-03107,
author = {Baosong Yang and
Longyue Wang and
Derek F. Wong and
Lidia S. Chao and
Zhaopeng Tu},
title = {Convolutional Self-Attention Networks},
pages = {4040--4045},
publisher = {Association for Computational Linguistics},
year = {2019},
}
@article{Wang2018MultilayerRF,
title={Multi-layer Representation Fusion for Neural Machine Translation},
author={Qiang Wang and Fuxue Li and Tong Xiao and Yanyang Li and Yinqiao Li and Jingbo Zhu},
journal={ArXiv},
year={2018},
volume={abs/2002.06714}
}
@inproceedings{Bapna2018TrainingDN,
title={Training Deeper Neural Machine Translation Models with Transparent Attention},
author={Ankur Bapna and M. Chen and Orhan Firat and Yuan Cao and Y. Wu},
publisher={Conference on Empirical Methods in Natural Language Processing},
year={2018}
}
@inproceedings{Dou2018ExploitingDR,
title={Exploiting Deep Representations for Neural Machine Translation},
author={Zi-Yi Dou and Zhaopeng Tu and Xing Wang and Shuming Shi and T. Zhang},
publisher={Conference on Empirical Methods in Natural Language Processing},
year={2018}
}
@inproceedings{Wang2019ExploitingSC,
title={Exploiting Sentential Context for Neural Machine Translation},
author={Xing Wang and Zhaopeng Tu and Longyue Wang and Shuming Shi},
publisher={Annual Meeting of the Association for Computational Linguistics},
year={2019}
}
@inproceedings{Wang2019LearningDT,
title = "Learning Deep Transformer Models for Machine Translation",
author = "Wang, Qiang and
Li, Bei and
Xiao, Tong and
Zhu, Jingbo and
Li, Changliang and
Wong, Derek F. and
Chao, Lidia S.",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
pages = "1810--1822"
}
@inproceedings{Dou2019DynamicLA,
title={Dynamic Layer Aggregation for Neural Machine Translation},
author={Zi-Yi Dou and Zhaopeng Tu and Xing Wang and Longyue Wang and Shuming Shi and T. Zhang},
publisher={AAAI Conference on Artificial Intelligence},
year={2019}
}
@inproceedings{Wei2020MultiscaleCD,
title={Multiscale Collaborative Deep Models for Neural Machine Translation},
author={Xiangpeng Wei and Heng Yu and Yue Hu and Yue Zhang and Rongxiang Weng and Weihua Luo},
  publisher={Annual Meeting of the Association for Computational Linguistics},
year={2020}
}
@inproceedings{Vaswani2018Tensor2TensorFN,
title={Tensor2Tensor for Neural Machine Translation},
author={Ashish Vaswani and S. Bengio and E. Brevdo and F. Chollet and Aidan N. Gomez and S. Gouws and Llion Jones and L. Kaiser and Nal Kalchbrenner and Niki Parmar and Ryan Sepassi and Noam Shazeer and Jakob Uszkoreit},
  publisher={Association for Machine Translation in the Americas},
year={2018}
}
@article{Kitaev2020ReformerTE,
title={Reformer: The Efficient Transformer},
author={Nikita Kitaev and L. Kaiser and Anselm Levskaya},