12

d3ac2c17 · zengxin · f9f2f74d · d3ac2c17 · d3ac2c17
Commit d3ac2c17 authored Sep 28, 2020 by zengxin
--- a/Chapter12/chapter12.tex
+++ b/Chapter12/chapter12.tex
@@ -573,7 +573,7 @@ Transformer Deep（48层） & 30.2            & 43.1            & 194$\times 10^
 %----------------------------------------------------------------------------------------
 %    NEW SECTION  12.3
 %----------------------------------------------------------------------------------------
-\section{小结及深入阅读}
+\section{小结及拓展阅读}

 \parinterval 编码器-解码器框架提供了一个非常灵活的机制，因为开发者只需要设计编码器和解码器的结构就能完成机器翻译。但是，架构的设计是深度学习中最具挑战的工
 作，优秀的架构往往需要长时间的探索和大量的实验验证，而且还需要一点点 “灵感”。前面介绍的基于循环神经网络的翻译模型和注意力机制就是研究人员通过长期
@@ -581,7 +581,7 @@ Transformer Deep（48层） & 30.2            & 43.1            & 194$\times 10^

 \begin{itemize}
 \vspace{0.5em}
-\item 近两年，有研究已经发现注意力机制可以捕捉一些语言现象\upcite{DBLP:journals/corr/abs-1905-09418}，比如，在Transformer 的多头注意力中，不同头往往会捕捉到不同的信息，比如，有些头对低频词更加敏感，有些头更适合词意消歧，甚至有些头可以捕捉句法信息。此外，由于注意力机制增加了模型的复杂性，而且随着网络层数的增多，神经机器翻译中也存在大量的冗余，因此研发轻量的注意力模型也是具有实践意义的方向\upcite{Xiao2019SharingAW,zhang-etal-2018-accelerating}（{\color{red} Weight Distillation: Transferring the Knowledge in Neural Network Parameters}）。
+\item 近两年，有研究已经发现注意力机制可以捕捉一些语言现象\upcite{DBLP:journals/corr/abs-1905-09418}，比如，在Transformer 的多头注意力中，不同头往往会捕捉到不同的信息，比如，有些头对低频词更加敏感，有些头更适合词意消歧，甚至有些头可以捕捉句法信息。此外，由于注意力机制增加了模型的复杂性，而且随着网络层数的增多，神经机器翻译中也存在大量的冗余，因此研发轻量的注意力模型也是具有实践意义的方向\upcite{Xiao2019SharingAW,zhang-etal-2018-accelerating,Lin2020WeightDT}。
 \vspace{0.5em}
 \item 神经机器翻译依赖成本较高的GPU设备，因此对模型的裁剪和加速也是很多系统研发人员所感兴趣的方向。比如，从工程上，可以考虑减少运算强度，比如使用低精度浮点数\upcite{Ott2018ScalingNM} 或者整数\upcite{DBLP:journals/corr/abs-1906-00532,Lin2020TowardsF8}进行计算，或者引入缓存机制来加速模型的推断\upcite{Vaswani2018Tensor2TensorFN}；也可以通过对模型参数矩阵的剪枝来减小整个模型的体积\upcite{DBLP:journals/corr/SeeLM16}；另一种方法是知识精炼\upcite{Hinton2015Distilling,kim-rush-2016-sequence}。 利用大模型训练小模型，这样往往可以得到比单独训练小模型更好的效果\upcite{DBLP:journals/corr/ChenLCL17}。
 \vspace{0.5em}

--- a/bibliography.bib
+++ b/bibliography.bib
@@ -252,12 +252,11 @@
  year      = {2010}
 }

-@article{DBLP:journals/corr/abs-1709-07809,
+@book{DBLP:journals/corr/abs-1709-07809,
  author    = {Philipp Koehn},
  title     = {Neural Machine Translation},
-  journal   = {CoRR},
-  volume    = {abs/1709.07809},
-  year      = {2017}
+  publisher = {Cambridge University Press},
+  year      = {2020}
 }

 @book{宗成庆2013统计自然语言处理,
@@ -304,9 +303,7 @@
 @book{邱锡鹏2020神经网络与深度学习,
  title ={神经网络与深度学习},
  author ={邱锡鹏},
-  journal ={中文信息学报},
-  volume ={34},
-  pages ={4},
+  publisher ={机械工业出版社},
  year ={2020}
 }

@@ -843,8 +840,7 @@
 %%%%% chapter 3------------------------------------------------------

 @inproceedings{ng2002discriminative,
-  author    = {Andrew Y. Ng and
-               Michael I. Jordan},
+  author    = {Ng, Andrew Y and Jordan, Michael I},
  title     = {On Discriminative vs. Generative Classifiers: {A} comparison of logistic
               regression and naive Bayes},
  pages     = {841--848},
@@ -852,7 +848,6 @@
  year      = {2001},
 }

-
 @inproceedings{huang2008coling,
 	author = {Huang, Liang},
    title = {Coling 2008: Advanced Dynamic Programming in Computational Linguistics: Theory, Algorithms and Applications-Tutorial notes},
@@ -900,7 +895,7 @@

 @article{Baum1966Statistical,
  title={Statistical Inference for Probabilistic Functions of Finite State Markov Chains},
-  author={Baum, Leonard E. and Petrie, Ted},
+  author={Baum, Leonard E and Petrie, Ted},
  journal={Annals of Mathematical Stats},
  volume={37},
  number={6},
@@ -911,7 +906,7 @@
 @article{baum1970maximization,
  title={A maximization technique occurring in the statistical analysis of probabilistic functions of Markov chains},
  author={Baum, Leonard E and Petrie, Ted and Soules, George and Weiss, Norman},
-  journal={The annals of mathematical statistics},
+  journal={Annals of Mathematical Stats},
  volume={41},
  number={1},
  pages={164--171},
@@ -920,15 +915,17 @@

 @article{1977Maximum,
  title={Maximum likelihood from incomplete data via the EM algorithm},
-  author={ Dempster, A. P. },
-  journal={Journal of the Royal Statal Society},
+  author={Dempster, Arthur P and Laird, Nan M and Rubin, Donald B},
+  journal={Journal of the Royal Statistical Society: Series B (Methodological)},
  volume={39},
-  year={1977},
+  number={1},
+  pages={1--22},
+  year={1977}
 }

 @article{1967Error,
  title={Error bounds for convolutional codes and an asymptotically optimum decoding algorithm},
-  author={ Viterbi, Andrew J. },
+  author={Viterbi, Andrew},
  journal={IEEE Transactions on Information Theory},
  volume={13},
  number={2},
@@ -944,11 +941,10 @@
 }

 @inproceedings{brants-2000-tnt,
-    title = {{T}n{T} {--} A Statistical Part-of-Speech Tagger},
+    title = {{TnT} -- {A} Statistical Part-of-Speech Tagger},
    author = {Brants, Thorsten},
-    month = apr,
    year = {2000},
-    publisher = {Association for Computational Linguistics},
+    publisher = {Annual Meeting of the Association for Computational Linguistics},
    pages = {224--231},
 }

@@ -956,7 +952,6 @@
    title = {Chunk Parsing Revisited},
    author = {Yoshimasa Tsuruoka and
               Jun'ichi Tsujii},
-    month = oct,
    year = {2005},
    publisher = {Annual Meeting of the Association for Computational Linguistics},
    pages = {133--140},
@@ -968,7 +963,6 @@
      Wang, Houfeng  and
      Yu, Shiwen  and
      Xin, Chengsheng},
-    month = jul,
    year = {2003},
    publisher = {Annual Meeting of the Association for Computational Linguistics},
    pages = {92--97},
@@ -1091,9 +1085,7 @@
 }

 @inproceedings{DBLP:conf/muc/BlackRM98,
-  author    = {William J. Black and
-               Fabio Rinaldi and
-               David Mowatt},
+  author    = {Black, William J and Rinaldi, Fabio and Mowatt, David},
  title     = {{FACILE:} Description of the {NE} System Used for {MUC-7}},
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {1998},
@@ -1165,7 +1157,7 @@
  year={1996}
 }

-@article{mitchell1996m,
+@book{mitchell1996m,
  title={Machine Learning},
  author={Mitchell, Tom},
  journal={McCraw Hill},
@@ -1197,8 +1189,7 @@
  volume={153},
  number={3731},
  pages={34--37},
-  year={1966},
-  publisher={American Association for the Advancement of Science}
+  year={1966}
 }

 %%%%% chapter 3------------------------------------------------------
@@ -1424,8 +1415,7 @@
  journal={Computer Speech \& Language},
  volume={45},
  pages={180--200},
-  year={2017},
-  publisher={Elsevier}
+  year={2017}
 }
 @inproceedings{gamon2005sentence,
  title={Sentence-level MT evaluation without reference translations: Beyond language modeling},
@@ -1501,8 +1491,7 @@
  volume={27},
  number={3-4},
  pages={171--192},
-  year={2013},
-  publisher={Springer}
+  year={2013}
 }
 @inproceedings{DBLP:conf/wmt/BiciciW14,
  author    = {Ergun Bi{\c{c}}ici and
@@ -1803,7 +1792,7 @@
 @inproceedings{popovic2011human,
  title={From human to automatic error classification for machine translation output},
  author={Popovic, Maja and Burchardt, Aljoscha and others},
-  booktitle={European Association for Machine Translation},
+  publisher={European Association for Machine Translation},
  year={2011}
 }
 @article{DBLP:journals/mt/CostaLLCC15,
@@ -2221,6 +2210,7 @@ year = {2012}
 }
 @article{kepler2019unbabel,
  title={Unbabel's Participation in the WMT19 Translation Quality Estimation Shared Task},
+  pages={78--84},
  author={Kepler, F{\'a}bio and Tr{\'e}nous, Jonay and Treviso, Marcos and Vera, Miguel and G{\'o}is, Ant{\'o}nio and Farajian, M Amin and Lopes, Ant{\'o}nio V and Martins, Andr{\'e} FT},
  year={2019}
 }
@@ -2273,15 +2263,8 @@ year = {2012}
  year={2000},
  publisher={Pearson Education India}
 }
-@article{devlin2018bert,
-  title={Bert: Pre-training of deep bidirectional transformers for language understanding},
-  author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
-  journal={arXiv preprint arXiv:1810.04805},
-  year={2018}
-}
 %%%%% chapter 4------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%% chapter 5------------------------------------------------------
 @article{brown1990statistical,
@@ -2312,13 +2295,12 @@ year = {2012}
 }
 @article{shannon1949communication,
  title ={Communication theory of secrecy systems},
-  author ={Shannon, Claude E},
+  author ={Claude Elwood Shannon},
  journal ={Bell system technical journal},
  volume ={28},
  number ={4},
  pages ={656--715},
-  year ={1949},
-  publisher ={Wiley Online Library}
+  year ={1949}
 }
 @inproceedings{DBLP:conf/acl/Moore04,
  author    = {Robert C. Moore},
@@ -2352,8 +2334,8 @@ year = {2012}
 }
 @article{1998Grammar,
  title={Grammar Inference and Statistical Machine Translation},
-  author={Ye-Yi Wang and Jaime Carbonell},
-  year={1998},
+  author={Ye-Yi Wang and Wayne Ward},
+  year={1999},
  publisher={Carnegie Mellon University}
 }

@@ -2419,7 +2401,7 @@ year = {2012}
  year={2009}
 }
 @article{DBLP:journals/coling/FraserM07,
-  author    = {Alexander M. Fraser and
+  author    = {Alexander Fraser and
               Daniel Marcu},
  title     = {Measuring Word Alignment Quality for Statistical Machine Translation},
  journal   = {Computational Linguistics},
@@ -2540,7 +2522,7 @@ year = {2012}
 }
 @article{xiao2013unsupervised,
  title ={Unsupervised sub-tree alignment for tree-to-tree translation},
-  author ={Xiao, Tong and Zhu, Jingbo},
+  author ={Tong Xiao and Jingbo Zhu},
  journal ={Journal of Artificial Intelligence Research},
  volume ={48},
  pages ={733--782},
@@ -2576,7 +2558,6 @@ year = {2012}
  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2005},
 }
-
 %%%%% chapter 6------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@@ -2950,7 +2931,7 @@ year = {2012}
 }

 @inproceedings{robert2007faster,
-  author    = {Robert C Moore and
+  author    = {Robert C. Moore and
               Chris Quirk},
  title     = {Faster Beam-Search Decoding for Phrasal Statistical Machine Translation},
  publisher = {Machine Translation Summit XI},
@@ -3180,7 +3161,7 @@ year = {2012}
 }
 @inproceedings{DBLP:conf/naacl/ZettlemoyerM07,
  author    = {Luke S. Zettlemoyer and
-               Robert Moore},
+               Robert C. Moore},
  title     = {Selective Phrase Pair Extraction for Improved Statistical Machine
               Translation},
  pages     = {209--212},
@@ -3227,12 +3208,10 @@ year = {2012}

 @inproceedings{2014Dynamic,
  title={Dynamic Phrase Tables for Machine Translation in an Interactive Post-editing Scenario},
-  author={Germann, Ulrich},
+  author={Ulrich Germann},
  publisher = {Association for Machine Translation in the Americas},
  year={2014},
 }
-
-
 %%%%% chapter 7------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@@ -3249,7 +3228,7 @@ year = {2012}
 }
 @article{chiang2007hierarchical,
    title={Hierarchical Phrase-Based Translation},
-    author ={Chiang David},
+    author ={David Chiang},
    journal ={Computational Linguistics},
    volume ={33},
    number ={2},
@@ -3258,8 +3237,7 @@ year = {2012}
 }
 @book{cocke1969programming,
  title ={Programming Languages and Their Compilers: Preliminary Notes},
-  author ={Cocke, J. and Schwartz, J.T.},
-  lccn ={76374279},
+  author ={Cocke, John and Schwartz, Jacob T.},
  year ={1970},
  publisher ={Courant Institute of Mathematical Sciences, New York University}
 }
@@ -3273,7 +3251,7 @@ year = {2012}
  year      = {1967}
 }
 @article{kasami1966efficient,
-  author ={Kasami, Tadao},
+  author ={Tadao Kasami},
  title ={An efficient recognition and syntax-analysis algorithm for context-free languages},
  journal ={Coordinated Science Laboratory Report no. R-257},
  year ={1966}
@@ -3298,14 +3276,14 @@ year = {2012}
 }
 @inproceedings{huang2006statistical,
  title ={Statistical syntax-directed translation with extended domain of locality},
-  author ={Huang, Liang and Knight, Kevin and Joshi, Aravind},
+  author ={Liang Huang and Kevin Knight and Aravind Joshi},
  pages ={66--73},
  year ={2006},
  publisher ={Computationally Hard Problems \& Joint Inference in Speech \& Language Processing}
 }
 @inproceedings{galley2004s,
  title ={What’s in a translation rule?},
-  author ={Galley, Michel and Hopkins, Mark and Knight, Kevin and Marcu, Daniel},
+  author ={Michel Galley and Mark Hopkins and Kevin Knight and Daniel Marcu},
  publisher={Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics},
  pages ={273--280},
  year ={2004}
@@ -3399,7 +3377,7 @@ year = {2012}
 @inproceedings{charniak2006multilevel,
 	title={Multilevel Coarse-to-Fine PCFG Parsing},
 	author={Eugene {Charniak} and Mark {Johnson} and Micha {Elsner} and Joseph {Austerweil} and David {Ellis} and Isaac {Haxton} and Catherine {Hill} and R. {Shrivaths} and Jeremy {Moore} and Michael {Pozar} and Theresa {Vu}},
-	booktitle={Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics},
+	publisher={Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics},
 	pages={168--175},
 	year={2006}
 }
@@ -3655,7 +3633,10 @@ year = {2012}
 }
 @article{Zhai2012Treebased,
  title={Treebased translation without using parse trees},
-  author={Zhai, Feifei and Zhang, Jiajun and Zhou, Yu and Zong, Chengqing},
+  author    = {Feifei Zhai and
+               Jiajun Zhang and
+               Yu Zhou and
+               Chengqing Zong},
  publisher = {International Conference on Computational Linguistics},
  year={2012},
 }
@@ -3771,10 +3752,10 @@ year = {2012}
 }
 @inproceedings{bangalore2001computing,
  title ={Computing consensus translation from multiple machine translation systems},
-  author ={Bangalore, B and Bordel, German and Riccardi, Giuseppe},
+  author ={Srinivas Bangalore and German Bordel and Giuseppe Riccardi},
+  publisher = {IEEE Workshop on Automatic Speech Recognition and Understanding},
  pages ={351--354},
-  year ={2001},
-  organization ={The Institute of Electrical and Electronics Engineers}
+  year ={2001}
 }
 @inproceedings{rosti2007combining,
  author    = {Antti-Veikko I. Rosti and
@@ -3790,7 +3771,7 @@ year = {2012}
 }
 @article{xiao2013bagging,
  title ={Bagging and boosting statistical machine translation systems},
-  author ={Xiao, Tong and Zhu, Jingbo and Liu, Tongran},
+  author ={Tong Xiao and Jingbo Zhu and Tongran Liu},
  publisher ={Artificial Intelligence},
  volume ={195},
  pages ={496--527},
@@ -3812,7 +3793,7 @@ year = {2012}
               Mei Yang and
               Jianfeng Gao and
               Patrick Nguyen and
-               Robert Moore},
+               Robert C. Moore},
  title     = {Indirect-HMM-based Hypothesis Alignment for Combining Outputs from
               Machine Translation Systems},
  pages     = {98--107},
@@ -5087,6 +5068,13 @@ pages ={157-166},
    publisher = "Association for Computational Linguistics",
    pages = "1789--1798",
 }
+@article{Lin2020WeightDT,
+  title={Weight Distillation: Transferring the Knowledge in Neural Network Parameters},
+  author={Ye Lin and Yanyang Li and Ziyang Wang and Bei Li and Quan Du and Tong Xiao and Jingbo Zhu},
+  journal={ArXiv},
+  year={2020},
+  volume={abs/2009.09152}
+}
 %%%%% chapter 12------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@@ -5105,6 +5093,102 @@ pages ={157-166},
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%% chapter 15------------------------------------------------------

+@inproceedings{DBLP:conf/cvpr/YuYR18,
+  author    = {Xin Yu and
+               Zhiding Yu and
+               Srikumar Ramalingam},
+  title     = {Learning Strict Identity Mappings in Deep Residual Networks},
+  pages     = {4432--4440},
+  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
+  year      = {2018}
+}
+
+@inproceedings{DBLP:conf/emnlp/ZhangTS19,
+  author    = {Biao Zhang and
+               Ivan Titov and
+               Rico Sennrich},
+  title     = {Improving Deep Transformer with Depth-Scaled Initialization and Merged
+               Attention},
+  pages     = {898--909},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2019}
+}
+
+@inproceedings{DBLP:conf/eccv/HeZRS16,
+  author    = {Kaiming He and
+               Xiangyu Zhang and
+               Shaoqing Ren and
+               Jian Sun},
+  title     = {Identity Mappings in Deep Residual Networks},
+  volume    = {9908},
+  pages     = {630--645},
+  publisher = {European Conference on Computer Vision},
+  year      = {2016}
+}
+
+@inproceedings{Ottfairseq,
+  author    = {Myle Ott and
+               Sergey Edunov and
+               Alexei Baevski and
+               Angela Fan and
+               Sam Gross and
+               Nathan Ng and
+               David Grangier and
+               Michael Auli},
+  title     = {fairseq: {A} Fast, Extensible Toolkit for Sequence Modeling},
+  pages     = {48--53},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2019}
+}
+
+@inproceedings{KleinOpenNMT,
+  author    = {Guillaume Klein and
+               Yoon Kim and
+               Yuntian Deng and
+               Jean Senellart and
+               Alexander M. Rush},
+  title     = {OpenNMT: Open-Source Toolkit for Neural Machine Translation},
+  pages     = {67--72},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2017}
+}
+
+@inproceedings{DBLP:conf/acl/WuWXTGQLL19,
+  author    = {Lijun Wu and
+               Yiren Wang and
+               Yingce Xia and
+               Fei Tian and
+               Fei Gao and
+               Tao Qin and
+               Jianhuang Lai and
+               Tie{-}Yan Liu},
+  title     = {Depth Growing for Neural Machine Translation},
+  pages     = {5558--5563},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2019}
+}
+
+@inproceedings{DBLP:conf/cvpr/HuangLMW17,
+  author    = {Gao Huang and
+               Zhuang Liu and
+               Laurens van der Maaten and
+               Kilian Q. Weinberger},
+  title     = {Densely Connected Convolutional Networks},
+  pages     = {2261--2269},
+  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
+  year      = {2017}
+}
+
+@inproceedings{DBLP:journals/corr/GreffSS16,
+  author    = {Klaus Greff and
+               Rupesh Kumar Srivastava and
+               J{\"{u}}rgen Schmidhuber},
+  title     = {Highway and Residual Networks learn Unrolled Iterative Estimation},
+  publisher = {International Conference on Learning Representations},
+  year      = {2017}
+}
+
+
 %%%%% chapter 15------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%