Commit d9c7e8d9 by zengxin


parent 2b0aa7b3
@@ -1257,7 +1257,7 @@ L(\mathbi{Y},\widehat{\mathbi{Y}}) = \sum_{j=1}^n L_{\textrm{ce}}(\mathbi{y}_j,\
\vspace{0.5em}
\item The use of attention mechanisms is one of the key factors behind the recent success of machine translation, and of natural language processing as a whole\upcite{bahdanau2014neural,DBLP:journals/corr/LuongPM15}. Early on, researchers tried to unify attention with the word alignment used in statistical machine translation\upcite{WangNeural,He2016ImprovedNM,li-etal-2019-word}. More recently, a large body of work has improved the attention mechanism itself, for example by building translation models on self-attention\upcite{vaswani2017attention}, and improving attention models has become one of the hot topics in natural language processing. {\chapterfifteen} discusses different attention models for machine translation in more detail.
\vspace{0.5em}
\item In general, the computation performed by neural machine translation involves no human intervention, and the translation process cannot be directly explained with human knowledge. An interesting direction is therefore to introduce prior knowledge into neural machine translation so that the system behaves more ``like'' a human. For example, syntax trees can be used to inject human linguistic knowledge\upcite{Yang2017TowardsBH,Wang2019TreeTI}, and syntax-based neural machine translation also involves extensive modeling with tree-structured neural networks\upcite{DBLP:journals/corr/abs-1809-01854,DBLP:journals/corr/abs-1808-09374}. In addition, user-defined dictionaries or translation memories can be incorporated into the translation process\upcite{DBLP:journals/corr/ZhangZ16c,zhang-etal-2017-prior,duan-etal-2020-bilingual,cao-xiong-2018-encoding}, so that user constraints are reflected directly in the translation output. Many other kinds of prior knowledge, such as word alignments\upcite{li-etal-2019-word,DBLP:conf/emnlp/MiWI16,DBLP:conf/coling/LiuUFS16} and document-level information\upcite{Werlen2018DocumentLevelNM,DBLP:journals/corr/abs-1805-10163,DBLP:conf/acl/LiLWJXZLL20}, can also be exploited in neural machine translation.
\end{itemize}
@@ -587,7 +587,7 @@ Transformer Deep(48 layers) & 30.2 & 43.1 & 194$\times 10^
\begin{itemize}
\vspace{0.5em}
\item In the past two years, studies have found that attention mechanisms can capture certain linguistic phenomena\upcite{DBLP:journals/corr/abs-1905-09418}. In Transformer multi-head attention, for instance, different heads tend to capture different kinds of information: some heads are more sensitive to low-frequency words, some are better suited to word sense disambiguation, and some even capture syntactic information. Moreover, since attention adds to model complexity and deeper networks introduce considerable redundancy into neural machine translation, developing lightweight attention models is also a direction of practical value\upcite{Xiao2019SharingAW,DBLP:journals/corr/abs-1805-00631,Lin2020WeightDT,DBLP:conf/iclr/WuLLLH20,Kitaev2020ReformerTE,DBLP:journals/corr/abs-2005-00743,dai-etal-2019-transformer,DBLP:journals/corr/abs-2004-05150,DBLP:conf/iclr/RaePJHL20}
\vspace{0.5em}
\item Neural machine translation relies on expensive GPU hardware, so model pruning and acceleration are of great interest to system developers. From an engineering perspective, one can reduce computational intensity, for example by computing with low-precision floating-point numbers\upcite{Ott2018ScalingNM} or integers\upcite{DBLP:journals/corr/abs-1906-00532,Lin2020TowardsF8}, or by introducing caching to speed up model inference\upcite{Vaswani2018Tensor2TensorFN}; the overall model size can also be reduced by pruning the parameter matrices\upcite{DBLP:journals/corr/SeeLM16}; another approach is knowledge distillation\upcite{Hinton2015Distilling,kim-rush-2016-sequence}, in which a large model is used to train a small one, often yielding better results than training the small model alone\upcite{DBLP:journals/corr/ChenLCL17} (see the sketch after this excerpt)
\vspace{0.5em}
......
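The knowledge distillation mentioned in the last item above can be made concrete with a small example. Below is a minimal PyTorch sketch of the word-level variant, assuming teacher and student models that return per-token logits of shape [batch, length, vocab]; the function name and the hyper-parameters alpha and temperature are illustrative assumptions, not code from the cited papers or from this repository.

# Minimal sketch of word-level knowledge distillation for NMT (illustrative only).
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, gold_ids, pad_id,
                      alpha=0.5, temperature=2.0):
    """student_logits, teacher_logits: [batch, length, vocab]; gold_ids: [batch, length]."""
    vocab = student_logits.size(-1)
    # Standard cross-entropy against the reference translation (padding ignored).
    ce = F.cross_entropy(student_logits.reshape(-1, vocab),
                         gold_ids.reshape(-1),
                         ignore_index=pad_id)
    # KL divergence between temperature-softened student and teacher distributions.
    t = temperature
    kl = F.kl_div(F.log_softmax(student_logits / t, dim=-1),
                  F.softmax(teacher_logits / t, dim=-1),
                  reduction='none').sum(-1)
    mask = gold_ids.ne(pad_id).float()
    kl = (kl * mask).sum() / mask.sum() * (t * t)
    # Interpolate the two signals; alpha = 1 recovers ordinary training.
    return alpha * ce + (1.0 - alpha) * kl

Sequence-level distillation\upcite{kim-rush-2016-sequence} takes a different route: the student is trained directly on the teacher's beam-search outputs, so no KL term against the teacher distribution is needed.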
@@ -4337,6 +4337,43 @@ year = {2012}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 10------------------------------------------------------
@inproceedings{DBLP:conf/acl/LiLWJXZLL20,
author = {Bei Li and
Hui Liu and
Ziyang Wang and
Yufan Jiang and
Tong Xiao and
Jingbo Zhu and
Tongran Liu and
Changliang Li},
title = {Does Multi-Encoder Help? {A} Case Study on Context-Aware Neural Machine
Translation},
pages = {3512--3518},
publisher = {Association for Computational Linguistics},
year = {2020}
}
@inproceedings{DBLP:conf/emnlp/MiWI16,
author = {Haitao Mi and
Zhiguo Wang and
Abe Ittycheriah},
title = {Supervised Attentions for Neural Machine Translation},
pages = {2283--2288},
publisher = {The Association for Computational Linguistics},
year = {2016}
}
@inproceedings{DBLP:conf/coling/LiuUFS16,
author = {Lemao Liu and
Masao Utiyama and
Andrew M. Finch and
Eiichiro Sumita},
title = {Neural Machine Translation with Supervised Attention},
pages = {3093--3102},
publisher = {The Association for Computational Linguistics},
year = {2016}
}
@inproceedings{devlin-etal-2014-fast,
  author = {Jacob Devlin and
            Rabih Zbib and
@@ -4378,13 +4415,15 @@ year = {2012}
  year = {1998},
}
@article{BENGIO1994Learning,
  author = {Yoshua Bengio and
            Patrice Y. Simard and
            Paolo Frasconi},
  title = {Learning long-term dependencies with gradient descent is difficult},
  journal = {Institute of Electrical and Electronics Engineers},
  volume = {5},
  number = {2},
  pages = {157--166},
  year = {1994}
}
@inproceedings{NIPS2017_7181,
  author = {Ashish Vaswani and
@@ -4460,7 +4499,7 @@ pages ={157-166},
  title = {Learning Deep Transformer Models for Machine Translation},
  pages = {1810--1822},
  publisher = {Association for Computational Linguistics},
  year = {2019}
}
@article{Li2020NeuralMT,
  author = {Yanyang Li and
@@ -4860,21 +4899,24 @@ pages ={157-166},
  year={2018}
}
@inproceedings{Lin2020TowardsF8,
  author = {Ye Lin and
            Yanyang Li and
            Tengbo Liu and
            Tong Xiao and
            Tongran Liu and
            Jingbo Zhu},
  title = {Towards Fully 8-bit Integer Inference for the Transformer Model},
  pages = {3759--3765},
  publisher = {International Joint Conference on Artificial Intelligence},
  year = {2020}
}
@inproceedings{kim-rush-2016-sequence,
  author = {Yoon Kim and
            Alexander M. Rush},
  title = {Sequence-Level Knowledge Distillation},
  pages = {1317--1327},
  publisher = {The Association for Computational Linguistics},
  year = {2016}
}
@article{Akaike1969autoregressive,
  author = {Hirotugu Akaike},
@@ -4914,16 +4956,14 @@ pages ={157-166},
  year={2018}
}
@inproceedings{cho-etal-2014-properties,
  author = {Kyunghyun Cho and
            Bart van Merrienboer and
            Dzmitry Bahdanau and
            Yoshua Bengio},
  title = {On the Properties of Neural Machine Translation: Encoder-Decoder Approaches},
  pages = {103--111},
  publisher = {Association for Computational Linguistics},
  year = {2014}
}
@inproceedings{DBLP:conf/acl/JeanCMB15,
@@ -4948,10 +4988,14 @@ pages ={157-166},
  year = {2015}
}
@inproceedings{He2016ImprovedNM,
  author = {Wei He and
            Zhongjun He and
            Hua Wu and
            Haifeng Wang},
  title = {Improved Neural Machine Translation with {SMT} Features},
  pages = {151--157},
  publisher = {the Association for the Advance of Artificial Intelligence},
  year = {2016}
}
@inproceedings{zhang-etal-2017-prior,
  title = {Prior Knowledge Integration for Neural Machine Translation using Posterior Regularization},
@@ -4966,45 +5010,40 @@ pages ={157-166},
}
@inproceedings{duan-etal-2020-bilingual,
  author = {Xiangyu Duan and
            Baijun Ji and
            Hao Jia and
            Min Tan and
            Min Zhang and
            Boxing Chen and
            Weihua Luo and
            Yue Zhang},
  title = {Bilingual Dictionary Based Neural Machine Translation without Using Parallel Sentences},
  pages = {1570--1579},
  publisher = {Association for Computational Linguistics},
  year = {2020}
}
@inproceedings{cao-xiong-2018-encoding,
  author = {Qian Cao and
            Deyi Xiong},
  title = {Encoding Gated Translation Memory into Neural Machine Translation},
  pages = {3042--3047},
  publisher = {Association for Computational Linguistics},
  year = {2018}
}
@inproceedings{yang-etal-2016-hierarchical,
  author = {Zichao Yang and
            Diyi Yang and
            Chris Dyer and
            Xiaodong He and
            Alexander J. Smola and
            Eduard H. Hovy},
  title = {Hierarchical Attention Networks for Document Classification},
  pages = {1480--1489},
  publisher = {The Association for Computational Linguistics},
  year = {2016}
}
%%%%% chapter 10------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -5014,9 +5053,6 @@ pages ={157-166},
@inproceedings{DBLP:conf/naacl/Johnson015,
  author = {Rie Johnson and
            Tong Zhang},
  title = {Effective Use of Word Order for Text Categorization with Convolutional
           Neural Networks},
  pages = {103--112},
@@ -5027,10 +5063,6 @@ pages ={157-166},
@inproceedings{DBLP:conf/naacl/NguyenG15,
  author = {Thien Huu Nguyen and
            Ralph Grishman},
  title = {Relation Extraction: Perspective from Convolutional Neural Networks},
  pages = {39--48},
  publisher = {The Association for Computational Linguistics},
@@ -5411,6 +5443,51 @@ pages ={157-166},
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 12------------------------------------------------------
@inproceedings{DBLP:conf/iclr/RaePJHL20,
author = {Jack W. Rae and
Anna Potapenko and
Siddhant M. Jayakumar and
Chloe Hillier and
Timothy P. Lillicrap},
title = {Compressive Transformers for Long-Range Sequence Modelling},
publisher = {OpenReview.net},
year = {2020}
}
@article{DBLP:journals/corr/abs-2004-05150,
author = {Iz Beltagy and
Matthew E. Peters and
Arman Cohan},
title = {Longformer: The Long-Document Transformer},
journal = {CoRR},
volume = {abs/2004.05150},
year = {2020}
}
@article{DBLP:journals/corr/abs-2005-00743,
author = {Yi Tay and
Dara Bahri and
Donald Metzler and
Da-Cheng Juan and
Zhe Zhao and
Che Zheng},
title = {Synthesizer: Rethinking Self-Attention in Transformer Models},
journal = {CoRR},
volume = {abs/2005.00743},
year = {2020}
}
@inproceedings{DBLP:conf/iclr/WuLLLH20,
author = {Zhanghao Wu and
Zhijian Liu and
Ji Lin and
Yujun Lin and
Song Han},
title = {Lite Transformer with Long-Short Range Attention},
publisher = {OpenReview.net},
year = {2020}
}
@inproceedings{DBLP:journals/corr/abs-1905-09418,
  author = {Elena Voita and
            David Talbot and
@@ -5506,18 +5583,16 @@ pages ={157-166},
}
@inproceedings{dai-etal-2019-transformer,
  author = {Zihang Dai and
            Zhilin Yang and
            Yiming Yang and
            Jaime G. Carbonell and
            Quoc Viet Le and
            Ruslan Salakhutdinov},
  title = {Transformer-XL: Attentive Language Models beyond a Fixed-Length Context},
  pages = {2978--2988},
  publisher = {Association for Computational Linguistics},
  year = {2019}
}
@article{Liu2020LearningTE,
  title={Learning to Encode Position for Transformer with Continuous Dynamical Model},
@@ -5563,10 +5638,15 @@ pages ={157-166},
  year={2018}
}
@inproceedings{Dou2018ExploitingDR,
  author = {Zi-Yi Dou and
            Zhaopeng Tu and
            Xing Wang and
            Shuming Shi and
            Tong Zhang},
  title = {Exploiting Deep Representations for Neural Machine Translation},
  pages = {4253--4262},
  publisher = {Association for Computational Linguistics},
  year = {2018}
}
@inproceedings{Wang2019ExploitingSC,
  title={Exploiting Sentential Context for Neural Machine Translation},
@@ -5576,10 +5656,16 @@ pages ={157-166},
}
@inproceedings{Dou2019DynamicLA,
  author = {Zi-Yi Dou and
            Zhaopeng Tu and
            Xing Wang and
            Longyue Wang and
            Shuming Shi and
            Tong Zhang},
  title = {Dynamic Layer Aggregation for Neural Machine Translation with Routing-by-Agreement},
  pages = {86--93},
  publisher = {the Association for the Advance of Artificial Intelligence},
  year = {2019}
}
@inproceedings{Wei2020MultiscaleCD,
  title={Multiscale Collaborative Deep Models for Neural Machine Translation},
@@ -5614,7 +5700,7 @@ pages ={157-166},
@article{li2020shallow,
  title={Shallow-to-Deep Training for Neural Machine Translation},
  author={Li, Bei and Wang, Ziyang and Liu, Hui and Jiang, Yufan and Du, Quan and Xiao, Tong and Wang, Huizhen and Zhu, Jingbo},
  publisher={Conference on Empirical Methods in Natural Language Processing},
  year={2020}
}
%%%%% chapter 12------------------------------------------------------
......