Commit 2863fdf0 by 曹润柘

Merge branch 'caorunzhe' into 'master'

Caorunzhe

See merge request !309
parents 2ba9e0ab 8def89dd
......@@ -207,7 +207,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
RNMT+ \upcite{Chen2018TheBO} &Chen et al. &2018 &28.5 \\
Layer-Wise Coordination \upcite{He2018LayerWiseCB} &He et al. &2018 &29.0 \\
Transformer-RPR \upcite{Shaw2018SelfAttentionWR} &Shaw et al. &2018 &29.2 \\
- Transformer-DLCL \upcite{Wang2019LearningDT} &Wang et al. &2019 &29.3 \\
+ Transformer-DLCL \upcite{WangLearning} &Wang et al. &2019 &29.3 \\
Msc \upcite{Wei2020MultiscaleCD} &Wei et al. &2020 &30.56 \\
(Bei's paper) & &2020 & \\
\end{tabular}
......@@ -392,7 +392,7 @@ NMT & 21.7 & 18.7 & -13.7 \\
\rule{0pt}{16pt} 2014 & Bahdanau et al. & Neural Machine Translation by Jointly Learning to Align and Translate \upcite{bahdanau2014neural} \\
\rule{0pt}{16pt} 2014 & Cho et al. & On the Properties of Neural Machine Translation \upcite{cho-etal-2014-properties} \\
\rule{0pt}{16pt} 2015 & Jean et al. & On Using Very Large Target Vocabulary for Neural Machine Translation \upcite{DBLP:conf/acl/JeanCMB15} \\
- \rule{0pt}{16pt} 2015 & Luong et al. & Effective Approaches to Attention-based Neural Machine Translation \upcite{luong-etal-2015-effective}
+ \rule{0pt}{16pt} 2015 & Luong et al. & Effective Approaches to Attention-based Neural Machine Translation \upcite{DBLP:journals/corr/LuongPM15}
\end{tabular}
\end{table}
%----------------------------------------------
......
......@@ -581,11 +581,11 @@ Transformer Deep (48 layers) & 30.2 & 43.1 & 194$\times 10^
\begin{itemize}
\vspace{0.5em}
- \item In the past two years, studies have found that the attention mechanism can capture certain linguistic phenomena\upcite{DBLP:journals/corr/abs-1905-09418}. For example, in the Transformer's multi-head attention, different heads tend to capture different kinds of information: some heads are more sensitive to low-frequency words, some are better suited to word sense disambiguation, and some can even capture syntactic information. In addition, since the attention mechanism adds complexity to the model, and neural machine translation accumulates considerable redundancy as the number of layers grows, developing lightweight attention models is also a direction of practical value\upcite{Xiao2019SharingAW,zhang-etal-2018-accelerating,Lin2020WeightDT}
+ \item In the past two years, studies have found that the attention mechanism can capture certain linguistic phenomena\upcite{DBLP:journals/corr/abs-1905-09418}. For example, in the Transformer's multi-head attention, different heads tend to capture different kinds of information: some heads are more sensitive to low-frequency words, some are better suited to word sense disambiguation, and some can even capture syntactic information. In addition, since the attention mechanism adds complexity to the model, and neural machine translation accumulates considerable redundancy as the number of layers grows, developing lightweight attention models is also a direction of practical value\upcite{Xiao2019SharingAW,DBLP:journals/corr/abs-1805-00631,Lin2020WeightDT} (see the average-attention sketch after this list)
\vspace{0.5em}
\item Neural machine translation relies on costly GPU devices, so model pruning and acceleration are also directions that many system developers care about. For example, from an engineering perspective, one can reduce the computational intensity by computing with low-precision floating-point numbers\upcite{Ott2018ScalingNM} or with integers\upcite{DBLP:journals/corr/abs-1906-00532,Lin2020TowardsF8}, or introduce caching mechanisms to speed up model inference\upcite{Vaswani2018Tensor2TensorFN}; the overall model size can also be reduced by pruning the parameter matrices\upcite{DBLP:journals/corr/SeeLM16}. Another approach is knowledge distillation\upcite{Hinton2015Distilling,kim-rush-2016-sequence}, in which a large model is used to train a small model, which often works better than training the small model on its own\upcite{DBLP:journals/corr/ChenLCL17} (see the distillation sketch after this list)
\vspace{0.5em}
\item As an important component of the Transformer model, the self-attention network has received wide attention from researchers in recent years, who have tried to design more efficient operations to replace it. For example, replacing the self-attention networks on the encoder and decoder sides with dynamic convolutional networks achieves translation performance comparable to, or even slightly better than, the Transformer while preserving inference efficiency\upcite{Wu2019PayLA}; to make the Transformer faster on long input text, the Reformer model, which replaces the self-attention mechanism with locality-sensitive hashing, has also attracted broad interest\upcite{Kitaev2020ReformerTE}. Moreover, introducing additional encoded information into the self-attention network can further improve the model's representational power, for example by adding relative position encodings within a fixed window\upcite{Shaw2018SelfAttentionWR,dai-etal-2019-transformer} (see the relative-position sketch after this list), or by learning position representations from data using ideas from dynamical systems, which generalizes better\upcite{Liu2020LearningTE}. By visualizing the per-layer outputs of the Transformer, researchers have found that, from bottom to top, the layers successively focus on word-level, syntax-level, and semantic-level representations\upcite{Jawahar2019WhatDB} (Shallow-to-Deep Training for Neural Machine Translation; my EMNLP paper, to be posted on arXiv in a few days), so introducing local encoding information into the lower self-attention layers helps the model abstract local features\upcite{Yang2018ModelingLF,DBLP:journals/corr/abs-1904-03107}
\vspace{0.5em}
- \item Beyond optimizing the sublayers of the Transformer, the way the layers are connected also affects the model's representational power to some extent. Recent work on optimizing layer connections includes: fusing the outputs of all encoder layers at the top of the encoder, for example by average pooling or weighted accumulation, to obtain a global encoder representation\upcite{Wang2018MultilayerRF,Bapna2018TrainingDN,Dou2018ExploitingDR,Wang2019ExploitingSC}, and using the representations of all previous layers to generate the input representation of the current layer\upcite{Wang2019LearningDT,Dou2019DynamicLA,Wei2020MultiscaleCD}
+ \item Beyond optimizing the sublayers of the Transformer, the way the layers are connected also affects the model's representational power to some extent. Recent work on optimizing layer connections includes: fusing the outputs of all encoder layers at the top of the encoder, for example by average pooling or weighted accumulation, to obtain a global encoder representation\upcite{Wang2018MultilayerRF,Bapna2018TrainingDN,Dou2018ExploitingDR,Wang2019ExploitingSC}, and using the representations of all previous layers to generate the input representation of the current layer\upcite{WangLearning,Dou2019DynamicLA,Wei2020MultiscaleCD} (see the layer-aggregation sketch after this list)
\end{itemize}
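
As a concrete illustration of the lightweight-attention direction in the first item above, the following is a minimal NumPy sketch (an editorial illustration, not code from the cited work) of the cumulative-average idea behind the average attention network: each target position uses the plain average of all preceding positions instead of full dot-product self-attention, so incremental decoding only needs a running sum. The gating and feed-forward parts of the real model are omitted, and all names and shapes are hypothetical.

import numpy as np

def average_attention(Y):
    """Y: (seq_len, d_model) decoder-side inputs -> (seq_len, d_model) averaged contexts."""
    seq_len = Y.shape[0]
    # Row j of the weight matrix puts uniform mass 1/(j+1) on positions 0..j (0-indexed).
    weights = np.tril(np.ones((seq_len, seq_len))) / np.arange(1, seq_len + 1)[:, None]
    return weights @ Y  # a masked running mean; no softmax or query-key products needed

Y = np.random.rand(5, 8)
print(average_attention(Y).shape)  # (5, 8)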
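
For the knowledge distillation approach mentioned in the second item, here is a minimal NumPy sketch (made-up logits, hypothetical names) of the word-level variant: the student is pushed towards the teacher's softened output distribution over the target vocabulary, shown here as the cross-entropy for a single target position; the temperature and the tiny vocabulary are illustrative assumptions only.

import numpy as np

def softmax(logits, T=1.0):
    z = logits / T
    z = z - z.max()                  # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()

def distillation_loss(student_logits, teacher_logits, T=2.0):
    p_teacher = softmax(teacher_logits, T)              # soft targets from the large model
    log_p_student = np.log(softmax(student_logits, T))
    return -np.sum(p_teacher * log_p_student)           # cross-entropy against the soft targets

teacher = np.array([4.0, 1.0, 0.5, 0.1])  # hypothetical teacher logits for one position
student = np.array([2.0, 1.5, 0.3, 0.2])  # hypothetical student logits
print(distillation_loss(student, teacher))

In practice this term is usually interpolated with the ordinary cross-entropy against the reference translation when training the small model.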
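
The relative position encodings mentioned in the third item can be sketched as follows (NumPy, single head, random parameters, no batching; function and variable names are hypothetical): the attention score between positions i and j receives an extra term that depends only on the offset j - i, clipped to a fixed window.

import numpy as np

def relative_attention_scores(Q, K, rel_emb, max_rel=4):
    """Q, K: (seq_len, d_k); rel_emb: (2*max_rel+1, d_k) embeddings of clipped offsets."""
    seq_len, d_k = Q.shape
    idx = np.arange(seq_len)
    offsets = np.clip(idx[None, :] - idx[:, None], -max_rel, max_rel) + max_rel  # (L, L)
    content = Q @ K.T                                          # content-content term
    position = np.einsum('id,ijd->ij', Q, rel_emb[offsets])    # content-position term
    return (content + position) / np.sqrt(d_k)                 # logits fed to the softmax

L, d_k, max_rel = 6, 8, 4
Q, K = np.random.rand(L, d_k), np.random.rand(L, d_k)
rel_emb = np.random.rand(2 * max_rel + 1, d_k)
print(relative_attention_scores(Q, K, rel_emb, max_rel).shape)  # (6, 6)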
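
Finally, the layer-connection ideas in the last item can be illustrated with a minimal NumPy sketch of dense layer aggregation (a toy layer, uniform aggregation weights, and hypothetical shapes stand in for the cited approaches, which learn the weights and use real Transformer layers): the input of each new layer is a normalized weighted combination of the outputs of all previous layers, including the embedding layer, rather than only the directly preceding one.

import numpy as np

def toy_layer(x, W):
    return np.tanh(x @ W)  # stand-in for a full Transformer layer

def densely_connected_stack(x0, layer_params, agg_weights):
    """agg_weights[l] holds l+1 scalars over the outputs of layers 0..l."""
    outputs = [x0]                                       # layer 0: the embeddings
    for l, W in enumerate(layer_params):
        w = np.asarray(agg_weights[l], dtype=float)
        w = w / w.sum()                                  # normalized aggregation weights
        agg_input = sum(wi * yi for wi, yi in zip(w, outputs))
        outputs.append(toy_layer(agg_input, W))
    return outputs[-1]

d = 8
x0 = np.random.rand(5, d)                                # hypothetical embedding outputs
layer_params = [np.random.rand(d, d) for _ in range(3)]
agg_weights = [np.ones(l + 1) for l in range(3)]         # learned in practice, uniform here
print(densely_connected_stack(x0, layer_params, agg_weights).shape)  # (5, 8)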
......@@ -4752,16 +4752,15 @@ pages ={157-166},
year = {2015}
}
- @inproceedings{luong-etal-2015-effective,
- title = "Effective Approaches to Attention-based Neural Machine Translation",
- author = "Luong, Thang and
- Pham, Hieu and
- Manning, Christopher D.",
- month = sep,
- year = "2015",
- address = "Lisbon, Portugal",
- publisher = "Association for Computational Linguistics",
- pages = "1412--1421",
+ @inproceedings{DBLP:journals/corr/LuongPM15,
+ author = {Thang Luong and
+ Hieu Pham and
+ Christopher D. Manning},
+ title = {Effective Approaches to Attention-based Neural Machine Translation},
+ publisher = {Conference on Empirical Methods in Natural
+ Language Processing},
+ pages = {1412--1421},
+ year = {2015}
}
@inproceedings{He2016ImprovedNM,
title={Improved Neural Machine Translation with SMT Features},
......@@ -4915,13 +4914,7 @@ pages ={157-166},
publisher = {7th International Conference on Learning Representations},
year = {2019},
}
- @article{Shaw2018SelfAttentionWR,
- title={Self-Attention with Relative Position Representations},
- author={Peter Shaw and Jakob Uszkoreit and Ashish Vaswani},
- journal={ArXiv},
- year={2018},
- volume={abs/1803.02155}
- }
@inproceedings{dai-etal-2019-transformer,
title = "Transformer-{XL}: Attentive Language Models beyond a Fixed-Length Context",
author = "Dai, Zihang and
......@@ -4992,22 +4985,6 @@ pages ={157-166},
year={2019}
}
- @inproceedings{Wang2019LearningDT,
- title = "Learning Deep Transformer Models for Machine Translation",
- author = "Wang, Qiang and
- Li, Bei and
- Xiao, Tong and
- Zhu, Jingbo and
- Li, Changliang and
- Wong, Derek F. and
- Chao, Lidia S.",
- month = jul,
- year = "2019",
- address = "Florence, Italy",
- publisher = "Association for Computational Linguistics",
- pages = "1810--1822"
- }
@inproceedings{Dou2019DynamicLA,
title={Dynamic Layer Aggregation for Neural Machine Translation},
author={Zi-Yi Dou and Zhaopeng Tu and Xing Wang and Longyue Wang and Shuming Shi and T. Zhang},
......@@ -5035,17 +5012,7 @@ pages ={157-166},
year={2020},
volume={abs/2001.04451}
}
- @inproceedings{zhang-etal-2018-accelerating,
- title = "Accelerating Neural Transformer via an Average Attention Network",
- author = "Zhang, Biao and
- Xiong, Deyi and
- Su, Jinsong",
- month = jul,
- year = "2018",
- address = "Melbourne, Australia",
- publisher = "Association for Computational Linguistics",
- pages = "1789--1798",
- }
@article{Lin2020WeightDT,
title={Weight Distillation: Transferring the Knowledge in Neural Network Parameters},
author={Ye Lin and Yanyang Li and Ziyang Wang and Bei Li and Quan Du and Tong Xiao and Jingbo Zhu},
......