bib update

4027b354 · 曹润柘 · 3994e2fd · 4027b354 · 4027b354 · 4027b354
Commit 4027b354 authored Nov 17, 2020 by 曹润柘
--- a/Chapter11/Figures/figure-convolution-kernel.tex
+++ b/Chapter11/Figures/figure-convolution-kernel.tex
@@ -52,7 +52,7 @@
 %\node[minimum width = 1.8cm] (sub) at ([xshift=-5.5cm,yshift=2cm]num9_9.east) {};
 \draw[decorate,decoration={brace,mirror,raise=0pt,amplitude=0.3cm},black,thick] ([yshift=0.4cm,xshift=-0.1cm]num1_1.west) -- node[att,xshift=-0.5cm]{$Q$} ([yshift=-0.4cm,xshift=-0.1cm]num3_3.west);
-\draw[decorate,decoration={brace,raise=0pt,amplitude=0.3cm},black,thick] ([xshift=-0.4cm,yshift=0.1cm]num1.north) -- node[att,yshift=0.5cm]{$U$}([xshift=0.4cm,yshift=0.1cm]num7.north);
+\draw[decorate,decoration={brace,raise=0pt,amplitude=0.3cm},black,thick] ([xshift=-0.4cm,yshift=0.1cm]num1.north) -- node[att,yshift=0.5cm]{$K$}([xshift=0.4cm,yshift=0.1cm]num7.north);
 \draw[decorate,decoration={brace,mirror,raise=0pt,amplitude=0.3cm},black,thick] ([xshift=0.5cm,yshift=0.00cm]num9_9.south) -- node[att,xshift=0.5cm,yshift=-0.3cm]{$O$}([xshift=0.5cm,yshift=0.00cm]num9.south);

--- a/Chapter11/Figures/figure-deep-vs-light.tex
+++ b/Chapter11/Figures/figure-deep-vs-light.tex
@@ -21,10 +21,10 @@
 	\draw[line width=0.9pt, gray!80, -latex] (l\point_3.east) -- (r2_3.west);
 	}
-	\node[vuale] at (-1.5em, 1.9em) {$\mathbi{x}_2$};
+	\node[vuale] at (-1.5em, 1.9em) {$x_2$};
-	\node[vuale] at (-1.5em, 9.9em) {$\mathbi{x}_1$};
+	\node[vuale] at (-1.5em, 9.9em) {$x_1$};
-	\node[vuale] at (6.5em, 1.9em) {$\mathbi{z}_2$};
+	\node[vuale] at (6.5em, 1.9em) {$y_2$};
-	\node[vuale] at (6.5em, 9.9em) {$\mathbi{z}_1$};
+	\node[vuale] at (6.5em, 9.9em) {$y_1$};
 	\node (t2) at (2.5em, -1em) {\large{$\cdots$}};
 	\node [anchor=north,font=\tiny] at ([yshift=-0.2em]t2.south) {深度卷积};
@@ -46,10 +46,10 @@
 	\draw[line width=0.9pt, cyan!80, -latex] (l\point_3.east) -- (r2_3.west);
 	}
-	\node[vuale] at (-1.5em, 1.9em) {$\mathbi{x}_2$};
+	\node[vuale] at (-1.5em, 1.9em) {$x_2$};
-	\node[vuale] at (-1.5em, 9.9em) {$\mathbi{x}_1$};
+	\node[vuale] at (-1.5em, 9.9em) {$x_1$};
-	\node[vuale] at (6.5em, 1.9em) {$\mathbi{z}_2$};
+	\node[vuale] at (6.5em, 1.9em) {$y_2$};
-	\node[vuale] at (6.5em, 9.9em) {$\mathbi{z}_1$};
+	\node[vuale] at (6.5em, 9.9em) {$y_1$};
 	\node (t2) at (2.5em, -1em) {\large{$\cdots$}};
 	\node [anchor=north,font=\tiny] at ([yshift=-0.2em]t2.south) {轻量卷积};

--- a/Chapter11/Figures/figure-standard.tex
+++ b/Chapter11/Figures/figure-standard.tex
@@ -32,12 +32,12 @@
 	\draw[line width=0.5pt, cyan!80, -latex] (l3_\point.east) -- ([xshift=0em,yshift=0.1em]r3_2.west);
 	}
-	\node[vuale] at ([xshift=-0.9em]l1_1.west) {$\mathbi{x}_3$};
+	\node[vuale] at ([xshift=-0.9em]l1_1.west) {$x_3$};
-	\node[vuale] at ([xshift=-0.9em]l2_1.west) {$\mathbi{x}_2$};
+	\node[vuale] at ([xshift=-0.9em]l2_1.west) {$x_2$};
-	\node[vuale] at ([xshift=-0.9em]l3_1.west) {$\mathbi{x}_1$};
+	\node[vuale] at ([xshift=-0.9em]l3_1.west) {$x_1$};
-	\node[vuale] at ([xshift=0.9em]r1_1.east) {$\mathbi{z}_3$};
+	\node[vuale] at ([xshift=0.9em]r1_1.east) {$y_3$};
-	\node[vuale] at ([xshift=0.9em]r2_1.east) {$\mathbi{z}_2$};
+	\node[vuale] at ([xshift=0.9em]r2_1.east) {$y_2$};
-	\node[vuale] at ([xshift=0.9em]r3_1.east) {$\mathbi{z}_1$};
+	\node[vuale] at ([xshift=0.9em]r3_1.east) {$y_1$};
 	\node (t1) at (2.5em, -1em) {\large{$\cdots$}};
 	\node [anchor=north,font=\tiny] at ([yshift=-0.2em]t1.south) {传统卷积};
@@ -66,12 +66,12 @@
 	\draw[line width=0.5pt, cyan!80, -latex] (l\point_2.east) -- (r3_2.west);
 	}
-	\node[vuale] at ([xshift=-0.9em]l1_1.west) {$\mathbi{x}_3$};
+	\node[vuale] at ([xshift=-0.9em]l1_1.west) {$x_3$};
-	\node[vuale] at ([xshift=-0.9em]l2_1.west) {$\mathbi{x}_2$};
+	\node[vuale] at ([xshift=-0.9em]l2_1.west) {$x_2$};
-	\node[vuale] at ([xshift=-0.9em]l3_1.west) {$\mathbi{x}_1$};
+	\node[vuale] at ([xshift=-0.9em]l3_1.west) {$x_1$};
-	\node[vuale] at ([xshift=0.9em]r1_1.east) {$\mathbi{z}_3$};
+	\node[vuale] at ([xshift=0.9em]r1_1.east) {$y_3$};
-	\node[vuale] at ([xshift=0.9em]r2_1.east) {$\mathbi{z}_2$};
+	\node[vuale] at ([xshift=0.9em]r2_1.east) {$y_2$};
-	\node[vuale] at ([xshift=0.9em]r3_1.east) {$\mathbi{z}_1$};
+	\node[vuale] at ([xshift=0.9em]r3_1.east) {$y_1$};
 	\node (t2) at (2.5em, -1em) {\large{$\cdots$}};
 	\node [anchor=north,font=\tiny] at ([yshift=-0.2em]t2.south) {深度卷积};
@@ -102,12 +102,12 @@
 	\draw[line width=0.5pt, cyan!80, -latex] (l3_\point.east) -- (r3_2.west);
 	}
-	\node[vuale] at ([xshift=-0.9em]l1_1.west) {$\mathbi{x}_3$};
+	\node[vuale] at ([xshift=-0.9em]l1_1.west) {$x_3$};
-	\node[vuale] at ([xshift=-0.9em]l2_1.west) {$\mathbi{x}_2$};
+	\node[vuale] at ([xshift=-0.9em]l2_1.west) {$x_2$};
-	\node[vuale] at ([xshift=-0.9em]l3_1.west) {$\mathbi{x}_1$};
+	\node[vuale] at ([xshift=-0.9em]l3_1.west) {$x_1$};
-	\node[vuale] at ([xshift=0.9em]r1_1.east) {$\mathbi{z}_3$};
+	\node[vuale] at ([xshift=0.9em]r1_1.east) {$y_3$};
-	\node[vuale] at ([xshift=0.9em]r2_1.east) {$\mathbi{z}_2$};
+	\node[vuale] at ([xshift=0.9em]r2_1.east) {$y_2$};
-	\node[vuale] at ([xshift=0.9em]r3_1.east) {$\mathbi{z}_1$};
+	\node[vuale] at ([xshift=0.9em]r3_1.east) {$y_1$};
 	\node (t3) at (2.5em, -1em) {\large{$\cdots$}};
 	\node [anchor=north,font=\tiny] at ([yshift=-0.2em]t3.south) {逐点卷积};

--- a/Chapter11/chapter11.tex
+++ b/Chapter11/chapter11.tex
@@ -83,7 +83,7 @@
 \end{figure}
 %----------------------------------------------
-\parinterval 在图像卷积中，卷积核是一组$Q \times U \times O$的参数（如图\ref{fig:11-3}）。其中$Q$和$U$表示卷积核窗口的长度与宽度，分别对应图像中的长和宽两个维度，$Q \times U$决定了该卷积核窗口的大小。$O$是该卷积核的深度，它的取值和输入数据通道数保持一致。在这里，通道可以看作图像不同的特征，比如灰色图像只有灰度信息，通道数为1；而RGB格式的图像有3个通道，分别对应红绿蓝三种颜色信息。
+\parinterval 在图像卷积中，卷积核是一组$Q \times K \times O$的参数（如图\ref{fig:11-3}）。其中$Q$和$K$表示卷积核窗口的长度与宽度，分别对应图像中的长和宽两个维度，$Q \times K$决定了该卷积核窗口的大小。$O$是该卷积核的深度，它的取值和输入数据通道数保持一致。在这里，通道可以看作图像不同的特征，比如灰色图像只有灰度信息，通道数为1；而RGB格式的图像有3个通道，分别对应红绿蓝三种颜色信息。
 %----------------------------------------------
 % 图4.
@@ -567,7 +567,7 @@
 \parinterval 卷积是一种高效处理网格数据的计算方式，在图像、语音等领域取得了令人瞩目的成绩。本章介绍了卷积的概念及其特性，并对池化、填充等操作进行了详细的讨论。前面介绍的基于循环神经网络的翻译模型在引入注意力机制后已经大幅度超越了基于统计的机器翻译模型，但由于循环神经网络的计算方式导致网络整体的并行能力差，训练耗时。本章介绍了具有高并行计算的能力的模型范式，即基于卷积神经网络的编码器-解码器框架。其在机器翻译任务上取得了与基于循环神经网络的GNMT模型相当的性能，并大幅度缩短了模型的训练周期。除了基础部分，本章还针对卷积计算进行了延伸，包括逐通道卷积、逐点卷积、轻量卷积和动态卷积等。除了上述提及的内容，卷积神经网络及其变种在文本分类、命名实体识别等其他自然语言处理任务上也有许多应用。
-\parinterval 和机器翻译任务不同的是，文本分类任务侧重于对序列特征的提取，然后通过压缩后的特征表示做出类别预测。卷积神经网络可以对序列中一些$n$-gram特征进行提取，也可以用在文本分类任务中，其基本结构包括输入层、卷积层、池化层和全连接层。除了在本章介绍过的TextCNN模型\upcite{Kim2014ConvolutionalNN}，不少研究工作在此基础上对其进行改进。比如，通过改变输入层来引入更多特征\upcite{DBLP:conf/acl/NguyenG15,DBLP:conf/aaai/LaiXLZ15}，对卷积层的改进\upcite{DBLP:conf/acl/ChenXLZ015,DBLP:conf/emnlp/LeiBJ15}以及对池化层的改进\upcite{Kalchbrenner2014ACN,DBLP:conf/acl/ChenXLZ015}。在命名实体识别任务中，同样可以使用卷积神经网络来进行特征提取\upcite{DBLP:journals/jmlr/CollobertWBKKK11,DBLP:conf/cncl/ZhouZXQBX17}，或者使用更高效的空洞卷积对更长的上下文进行建模\upcite{DBLP:conf/emnlp/StrubellVBM17}。此外，也有一些研究工作尝试使用卷积神经网络来提取字符级特征\upcite{DBLP:conf/acl/MaH16,DBLP:conf/emnlp/LiDWCM17,DBLP:conf/acl-codeswitch/WangCK18}。
+\parinterval 和机器翻译任务不同的是，文本分类任务侧重于对序列特征的提取，然后通过压缩后的特征表示做出类别预测。卷积神经网络可以对序列中一些$n$-gram特征进行提取，也可以用在文本分类任务中，其基本结构包括输入层、卷积层、池化层和全连接层。除了在本章介绍过的TextCNN模型\upcite{Kim2014ConvolutionalNN}，不少研究工作在此基础上对其进行改进。比如，通过改变输入层来引入更多特征\upcite{DBLP:conf/acl/NguyenG15,DBLP:conf/aaai/LaiXLZ15}，对卷积层的改进\upcite{DBLP:conf/acl/ChenXLZ015,DBLP:conf/emnlp/LeiBJ15}以及对池化层的改进\upcite{Kalchbrenner2014ACN,DBLP:conf/acl/ChenXLZ015}。在命名实体识别任务中，同样可以使用卷积神经网络来进行特征提取\upcite{2011Natural,DBLP:conf/cncl/ZhouZXQBX17}，或者使用更高效的空洞卷积对更长的上下文进行建模\upcite{DBLP:conf/emnlp/StrubellVBM17}。此外，也有一些研究工作尝试使用卷积神经网络来提取字符级特征\upcite{DBLP:conf/acl/MaH16,DBLP:conf/emnlp/LiDWCM17,DBLP:conf/acl-codeswitch/WangCK18}。

--- a/Chapter9/chapter9.tex
+++ b/Chapter9/chapter9.tex
@@ -2166,6 +2166,6 @@ Jobs was the CEO of {\red{\underline{apple}}}.
 \vspace{0.5em}
 \item 为了进一步提高神经语言模型性能，除了改进模型，还可以在模型中引入新的结构或是其他有效信息，该领域也有很多典型工作值得关注。例如在神经语言模型中引入除了词嵌入以外的单词特征，如语言特征（形态、语法、语义特征等）\upcite{Wu2012FactoredLM,Adel2015SyntacticAS}、上下文信息\upcite{mikolov2012context,Wang2015LargerContextLM}、知识图谱等外部知识\upcite{Ahn2016ANK}；或是在神经语言模型中引入字符级信息，将其作为字符特征单独\upcite{Kim2016CharacterAwareNL,Hwang2017CharacterlevelLM}或与单词特征一起\upcite{Onoe2016GatedWR,Verwimp2017CharacterWordLL}送入模型中；在神经语言模型中引入双向模型也是一种十分有效的尝试，在单词预测时可以同时利用来自过去和未来的文本信息\upcite{Graves2013HybridSR,bahdanau2014neural,Peters2018DeepCW}。
 \vspace{0.5em}
-\item 词嵌入是自然语言处理近些年的重要进展。所谓“嵌入”是一类方法，理论上，把一个事物进行分布式表示的过程都可以被看作是广义上的“嵌入”。基于这种思想的表示学习也成为了自然语言处理中的前沿方法。比如，如何对树结构，甚至图结构进行分布式表示成为了分析自然语言的重要方法\upcite{DBLP:journals/corr/abs-1809-01854,Yin2018StructVAETL,Aharoni2017TowardsSN,Bastings2017GraphCE,KoncelKedziorski2019TextGF}。此外，除了语言建模，还有很多方式可以进行词嵌入的学习，比如，SENNA\upcite{collobert2011natural}、word2vec\upcite{DBLP:journals/corr/abs-1301-3781,mikolov2013distributed}、Glove\upcite{DBLP:conf/emnlp/PenningtonSM14}、CoVe\upcite{mccann2017learned} 等。
+\item 词嵌入是自然语言处理近些年的重要进展。所谓“嵌入”是一类方法，理论上，把一个事物进行分布式表示的过程都可以被看作是广义上的“嵌入”。基于这种思想的表示学习也成为了自然语言处理中的前沿方法。比如，如何对树结构，甚至图结构进行分布式表示成为了分析自然语言的重要方法\upcite{DBLP:journals/corr/abs-1809-01854,Yin2018StructVAETL,Aharoni2017TowardsSN,Bastings2017GraphCE,KoncelKedziorski2019TextGF}。此外，除了语言建模，还有很多方式可以进行词嵌入的学习，比如，SENNA\upcite{2011Natural}、word2vec\upcite{DBLP:journals/corr/abs-1301-3781,mikolov2013distributed}、Glove\upcite{DBLP:conf/emnlp/PenningtonSM14}、CoVe\upcite{mccann2017learned} 等。
 \vspace{0.5em}
 \end{itemize}
--- a/bibliography.bib
+++ b/bibliography.bib
@@ -3867,8 +3867,7 @@ year = {2012}
  volume={18},
  number={4},
  pages={467--479},
-  year={1992},
+  year={1992}
-  publisher={MIT Press}
 }
 @inproceedings{mikolov2012context,
@@ -3877,10 +3876,9 @@ year = {2012}
            Tomas and
            Zweig and
            Geoffrey},
-  booktitle={2012 IEEE Spoken Language Technology Workshop (SLT)},
+  publisher={IEEE Spoken Language Technology Workshop},
  pages={234--239},
-  year={2012},
+  year={2012}
-  organization={IEEE}
 }
 @article{zaremba2014recurrent,
@@ -3905,7 +3903,7 @@ year = {2012}
            Jan and
            Schmidhuber and
            Jurgen},
-  journal={arXiv: Learning},
+  journal={International Conference on Machine Learning},
  year={2016}
 }
@@ -3917,7 +3915,7 @@ year = {2012}
             Nitish Shirish and
             Socher and
             Richard},
-  journal={arXiv: Computation and Language},
+  journal={International Conference on Learning Representations},
  year={2017}
 }
@@ -3934,12 +3932,11 @@ year = {2012}
 @article{baydin2017automatic,
  title ={Automatic differentiation in machine learning: a survey},
  author ={Baydin, At{\i}l{\i}m G{\"u}nes and Pearlmutter, Barak A and Radul, Alexey Andreyevich and Siskind, Jeffrey Mark},
-  journal ={The Journal of Machine Learning Research},
+  journal ={Journal of Machine Learning Research},
  volume ={18},
  number ={1},
  pages ={5595--5637},
-  year ={2017},
+  year ={2017}
-  publisher ={JMLR. org}
 }
 @article{qian1999momentum,
@@ -3977,9 +3974,8 @@ year = {2012}
  author    = {Diederik P. Kingma and
               Jimmy Ba},
  title     = {Adam: {A} Method for Stochastic Optimization},
-  booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
+  publisher = {International Conference on Learning Representations},
-               San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
+  year      = {2015}
-  year      = {2015},
 }
 @inproceedings{ioffe2015batch,
@@ -3987,13 +3983,10 @@ year = {2012}
               Christian Szegedy},
  title     = {Batch Normalization: Accelerating Deep Network Training by Reducing
               Internal Covariate Shift},
-  booktitle = {Proceedings of the 32nd International Conference on Machine Learning,
+  publisher = {International Conference on Machine Learning},
-               {ICML} 2015, Lille, France, 6-11 July 2015},
-  series    = {{JMLR} Workshop and Conference Proceedings},
  volume    = {37},
  pages     = {448--456},
-  publisher = {JMLR.org},
+  year      = {2015}
-  year      = {2015},
 }
 @article{Ba2016LayerN,
@@ -4003,7 +3996,7 @@ year = {2012}
  title     = {Layer Normalization},
  journal   = {CoRR},
  volume    = {abs/1607.06450},
-  year      = {2016},
+  year      = {2016}
 }
 @inproceedings{mikolov2013distributed,
@@ -4013,11 +4006,9 @@ year = {2012}
               Gregory S. Corrado and
               Jeffrey Dean},
  title     = {Distributed Representations of Words and Phrases and their Compositionality},
-  booktitle = {Advances in Neural Information Processing Systems 26: 27th Annual
+  publisher = {Conference on Neural Information Processing Systems},
-               Conference on Neural Information Processing Systems 2013. Proceedings
-               of a meeting held December 5-8, 2013, Lake Tahoe, Nevada, United States},
  pages     = {3111--3119},
-  year      = {2013},
+  year      = {2013}
 }
 @inproceedings{arthur2016incorporating,
@@ -4025,12 +4016,9 @@ year = {2012}
               Graham Neubig and
               Satoshi Nakamura},
  title     = {Incorporating Discrete Translation Lexicons into Neural Machine Translation},
-  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural
-               Language Processing, {EMNLP} 2016, Austin, Texas, USA, November 1-4,
-               2016},
  pages     = {1557--1567},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-  year      = {2016},
+  year      = {2016}
 }
 @inproceedings{stahlberg2016syntactically,
@@ -4039,10 +4027,7 @@ year = {2012}
               Aurelien Waite and
               Bill Byrne},
  title     = {Syntactically Guided Neural Machine Translation},
-  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-               Linguistics, {ACL} 2016, August 7-12, 2016, Berlin, Germany, Volume
-               2: Short Papers},
-  publisher = {The Association for Computer Linguistics},
  year      = {2016},
 }
@@ -4051,12 +4036,9 @@ year = {2012}
               Alessandro Moschitti},
  title     = {Embedding Semantic Similarity in Tree Kernels for Domain Adaptation
               of Relation Extraction},
-  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational
-               Linguistics, {ACL} 2013, 4-9 August 2013, Sofia, Bulgaria, Volume
-               1: Long Papers},
  pages     = {1498--1507},
-  publisher = {The Association for Computer Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2013},
+  year      = {2013}
 }
 @inproceedings{perozzi2014deepwalk,
@@ -4064,42 +4046,32 @@ year = {2012}
               Rami Al-Rfou and
               Steven Skiena},
  title     = {DeepWalk: online learning of social representations},
-  booktitle = {The 20th {ACM} {SIGKDD} International Conference on Knowledge Discovery
+  publisher = {ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
-               and Data Mining, {KDD} '14, New York, NY, {USA} - August 24 - 27,
-               2014},
  pages     = {701--710},
-  publisher = {{ACM}},
+  year      = {2014}
-  year      = {2014},
 }
-@article{collobert2011natural,
+@article{2011Natural,
-  author    = {Ronan Collobert and
+  title={Natural Language Processing (almost) from Scratch},
-               Jason Weston and
+  author={ Collobert, Ronan  and  Weston, Jason  and Bottou, Léon and  Karlen, Michael  and  Kavukcuoglu, Koray  and  Kuksa, Pavel },
-			   L{\'{e}}on Bottou and
+  journal={Journal of Machine Learning Research},
-               Michael Karlen and
+  volume={12},
-               Koray Kavukcuoglu and
+  number={1},
-               Pavel P. Kuksa},
+  pages={2493--2537},
-  title     = {Natural Language Processing (Almost) from Scratch},
+  year={2011}
-  journal   = {Journal of Machine Learning Research},
-  volume    = {12},
-  pages     = {2493--2537},
-  year      = {2011},
 }
 @inproceedings{mccann2017learned,
  author    = {Bryan McCann and
               James Bradbury and
               Caiming Xiong and
               Richard Socher},
  title     = {Learned in Translation: Contextualized Word Vectors},
-  booktitle = {Advances in Neural Information Processing Systems 30: Annual Conference
+  booktitle = {Conference on Neural Information Processing Systems},
-               on Neural Information Processing Systems 2017, 4-9 December 2017,
-               Long Beach, CA, {USA}},
  pages     = {6294--6305},
-  year      = {2017},
+  year      = {2017}
 }
-%%%%%%%%%%%%%%%%%%%%%%%神经语言模型，待检查修改%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%神经语言模型，已检查修改%%%%%%%%%%%%%%%%%%%%%%%%%
 @inproceedings{Peters2018DeepCW,
  title={Deep contextualized word representations},
  author={Matthew E. Peters and 
@@ -4135,13 +4107,13 @@ year = {2012}
 }
 @inproceedings{Onoe2016GatedWR,
-  title={Gated Word-Character Recurrent Language Model},
+  author    = {Yasumasa Miyamoto and
-  author={Yasumasa Miyamoto and 
+               Kyunghyun Cho},
-          Kyunghyun Cho},
+  title     = {Gated Word-Character Recurrent Language Model},
-  publisher={arXiv preprint arXiv:1606.01700},
+  pages     = {1992--1997},
-  year={2016}
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
+  year      = {2016}
 }
 @inproceedings{Hwang2017CharacterlevelLM,
  title={Character-level language modeling with hierarchical recurrent neural networks},
  author={Kyuyeon Hwang and 
@@ -4216,12 +4188,11 @@ year = {2012}
 		  Ruocheng Guo and 
 		  Adrienne Raglin and 
 		  Huan Liu},
-  journal={ACM SIGKDD Explorations Newsletter},
+  journal={ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
  volume={22},
  number={1},
  pages={18--33},
-  year={2020},
+  year={2020}
-  publisher={ACM New York, NY, USA}
 }
 @incollection{nguyen2019understanding,
@@ -4231,7 +4202,7 @@ year = {2012}
 		  Jeff Clune},
  pages={55--76},
  year={2019},
-  publisher={Explainable AI}
+  publisher={Springer}
 }
 @inproceedings{yang2017improving,
  title={Improving adversarial neural machine translation with prior knowledge},
@@ -4250,15 +4221,16 @@ year = {2012}
  title={Incorporating source syntax into transformer-based neural machine translation},
  author={Anna Currey and 
          Kenneth Heafield},
-  publisher={Proceedings of the Fourth Conference on Machine Translation},
+  publisher={Annual Meeting of the Association for Computational Linguistics},
  pages={24--33},
  year={2019}
 }
 @article{currey2018multi,
  title={Multi-source syntactic neural machine translation},
  author={Anna Currey and 
          Kenneth Heafield},
-  journal={arXiv preprint arXiv:1808.10267},
+  journal={Conference on Empirical Methods in Natural Language Processing},
  year={2018}
 }
 @inproceedings{marevcek2018extracting,
@@ -4272,7 +4244,7 @@ year = {2012}
 @article{blevins2018deep,
  title={Deep rnns encode soft hierarchical syntax},
  author={Blevins, Terra and Levy, Omer and Zettlemoyer, Luke},
-  journal={arXiv preprint arXiv:1805.04218},
+  journal={Annual Meeting of the Association for Computational Linguistics},
  year={2018}
 }
 @inproceedings{Yin2018StructVAETL,
@@ -4288,7 +4260,7 @@ year = {2012}
  title={Towards String-To-Tree Neural Machine Translation},
  author={Roee Aharoni and 
          Yoav Goldberg},
-  journal={arXiv preprint arXiv:1704.04743},
+  journal={Annual Meeting of the Association for Computational Linguistics},
  year={2017}
 }
@@ -4308,9 +4280,8 @@ year = {2012}
          Dhanush Bekal and Yi Luan and 
 		  Mirella Lapata and 
 		  Hannaneh Hajishirzi},
-  journal={ArXiv},
+  journal={Annual Conference of the North American Chapter of the Association for Computational Linguistics},
-  year={2019},
+  year={2019}
-  volume={abs/1904.02342}
 }
 @article{Kovalerchuk2020SurveyOE,
@@ -4327,7 +4298,7 @@ year = {2012}
  title={Towards A Rigorous Science of Interpretable Machine Learning},
  author={Finale Doshi-Velez and 
          Been Kim},
-  journal={arXiv: Machine Learning},
+  journal={arXiv preprint arXiv:1702.08608},
  year={2017}
 }
@@ -4349,7 +4320,7 @@ year = {2012}
  title     = {Does Multi-Encoder Help? {A} Case Study on Context-Aware Neural Machine
               Translation},
  pages     = {3512--3518},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
@@ -4359,7 +4330,7 @@ year = {2012}
               Abe Ittycheriah},
  title     = {Supervised Attentions for Neural Machine Translation},
  pages     = {2283--2288},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
@@ -4370,7 +4341,7 @@ year = {2012}
               Eiichiro Sumita},
  title     = {Neural Machine Translation with Supervised Attention},
  pages     = {3093--3102},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
@@ -4384,16 +4355,16 @@ year = {2012}
  title     = {Fast and Robust Neural Network Joint Models for Statistical Machine
               Translation},
  pages     = {1370--1380},
-  publisher = {The Association for Computer Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2014},
+  year      = {2014}
 }
 @inproceedings{Schwenk_continuousspace,
  author    = {Holger Schwenk},
  title     = {Continuous Space Translation Models for Phrase-Based Statistical Machine
               Translation},
  pages     = {1071--1080},
-  publisher = {Indian Institute of Technology Bombay},
+  publisher = {International Conference on Computational Linguistics},
-  year      = {2012},
+  year      = {2012}
 }
 @inproceedings{kalchbrenner-blunsom-2013-recurrent,
  author    = {Nal Kalchbrenner and
@@ -4401,25 +4372,24 @@ year = {2012}
  title     = {Recurrent Continuous Translation Models},
  pages     = {1700--1709},
  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2013},
+  year      = {2013}
 }
 @article{HochreiterThe,
  author    = {Sepp Hochreiter},
  title     = {The Vanishing Gradient Problem During Learning Recurrent Neural Nets
               and Problem Solutions},
-  journal   = {International Journal of Uncertainty, Fuzziness and Knowledge-Based
+  journal   = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems},
-               Systems},
  volume    = {6},
  number    = {2},
  pages     = {107--116},
-  year      = {1998},
+  year      = {1998}
 }
 @article{BENGIO1994Learning,
 author    = {Yoshua Bengio and
               Patrice Y. Simard and
               Paolo Frasconi},
  title     = {Learning long-term dependencies with gradient descent is difficult},
-  journal   = {Institute of Electrical and Electronics Engineers},
+  journal   = {IEEE Transactions on Neural Networks},
  volume    = {5},
  number    = {2},
  pages     = {157--166},
@@ -4435,15 +4405,14 @@ author    = {Yoshua Bengio and
               Lukasz Kaiser and
               Illia Polosukhin},
  title     = {Attention is All you Need},
-  publisher = {Advances in Neural Information Processing Systems 30: Annual Conference
+  publisher = {Conference on Neural Information Processing Systems},
-               on Neural Information Processing Systems},
  pages     = {5998--6008},
-  year      = {2017},
+  year      = {2017}
 }
 @article{StahlbergNeural,
  title={Neural Machine Translation: A Review},
  author={Felix Stahlberg},
-  journal={journal of artificial intelligence research},
+  journal={Journal of Artificial Intelligence Research},
  year={2020},
  volume={69},
  pages={343-418}
@@ -4455,8 +4424,8 @@ author    = {Yoshua Bengio and
               Marcello Federico},
  title     = {Neural versus Phrase-Based Machine Translation Quality: a Case Study},
  pages     = {257--267},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016},
+  year      = {2016}
 }
 @article{Hassan2018AchievingHP,
  author    = {Hany Hassan and
@@ -4498,19 +4467,19 @@ author    = {Yoshua Bengio and
               Lidia S. Chao},
  title     = {Learning Deep Transformer Models for Machine Translation},
  pages     = {1810--1822},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@article{Li2020NeuralMT,
+@inproceedings{Li2020NeuralMT,
  author    = {Yanyang Li and
               Qiang Wang and
               Tong Xiao and
               Tongran Liu and
               Jingbo Zhu},
  title     = {Neural Machine Translation with Joint Representation},
-  journal   = {CoRR},
+  pages     = {8285--8292},
-  volume    = {abs/2002.06546},
+  publisher = {AAAI Conference on Artificial Intelligence},
-  year      = {2020},
+  year      = {2020}
 }
 @article{HochreiterLong,
  author = {Hochreiter, Sepp and Schmidhuber, Jürgen},
@@ -4519,7 +4488,7 @@ author    = {Yoshua Bengio and
  pages = {1735-80},
  title = {Long Short-term Memory},
  volume = {9},
-  journal = {Neural computation},
+  journal = {Neural Computation}
 }
 @inproceedings{Cho2014Learning,
  author    = {Kyunghyun Cho and
@@ -4531,24 +4500,18 @@ author    = {Yoshua Bengio and
               Yoshua Bengio},
  title     = {Learning Phrase Representations using {RNN} Encoder-Decoder for Statistical
               Machine Translation},
-  publisher = {Proceedings of the 2014 Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing, {EMNLP} 2014, October 25-29, 2014, Doha, Qatar,
-               {A} meeting of SIGDAT, a Special Interest Group of the {ACL}},
  pages     = {1724--1734},
-  //publisher = {{ACL}},
+  year      = {2014}
-  year      = {2014},
 }
 @inproceedings{pmlr-v9-glorot10a,
  author    = {Xavier Glorot and
               Yoshua Bengio},
  title     = {Understanding the difficulty of training deep feedforward neural networks},
-  publisher = {Proceedings of the Thirteenth International Conference on Artificial
+  publisher = {International Conference on Artificial Intelligence and Statistics},
-               Intelligence and Statistics, {AISTATS} 2010, Chia Laguna Resort, Sardinia,
-               Italy, May 13-15, 2010},
  volume    = {9},
  pages     = {249--256},
-  //publisher = {JMLR.org},
+  year      = {2010}
-  year      = {2010},
 }
 @inproceedings{xiao2017fast,
  author    = {Tong Xiao and
@@ -4556,12 +4519,9 @@ author    = {Yoshua Bengio and
               Tongran Liu and
               Chunliang Zhang},
  title     = {Fast Parallel Training of Neural Language Models},
-  publisher = {Proceedings of the Twenty-Sixth International Joint Conference on
+  publisher = {International Joint Conference on Artificial Intelligence},
-               Artificial Intelligence, {IJCAI} 2017, Melbourne, Australia, August
-               19-25, 2017},
  pages     = {4193--4199},
-  //publisher = {ijcai.org},
+  year      = {2017}
-  year      = {2017},
 }
 @inproceedings{Gu2017NonAutoregressiveNM,
  author    = {Jiatao Gu and
@@ -4571,7 +4531,7 @@ author    = {Yoshua Bengio and
               Richard Socher},
  title     = {Non-Autoregressive Neural Machine Translation},
  publisher = {International Conference on Learning Representations},
-  year      = {2018},
+  year      = {2018}
 }
 @inproceedings{li-etal-2018-simple,
  author    = {Yanyang Li and
@@ -4581,12 +4541,9 @@ author    = {Yoshua Bengio and
               Changming Xu and
               Jingbo Zhu},
  title     = {A Simple and Effective Approach to Coverage-Aware Neural Machine Translation},
-  publisher = {Proceedings of the 56th Annual Meeting of the Association for Computational
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-               Linguistics, {ACL} 2018, Melbourne, Australia, July 15-20, 2018, Volume
-               2: Short Papers},
  pages     = {292--297},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2018}
-  year      = {2018},
 }
 @inproceedings{TuModeling,
  author    = {Zhaopeng Tu and
@@ -4595,11 +4552,8 @@ author    = {Yoshua Bengio and
               Xiaohua Liu and
               Hang Li},
  title     = {Modeling Coverage for Neural Machine Translation},
-  publisher = {Proceedings of the 54th Annual Meeting of the Association for Computational
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-               Linguistics, {ACL} 2016, August 7-12, 2016, Berlin, Germany, Volume
+  year      = {2016}
-               1: Long Papers},
-  //publisher = {The Association for Computer Linguistics},
-  year      = {2016},
 }
 @inproceedings{DBLP:journals/corr/SennrichFCBHHJL17,
  author    = {Rico Sennrich and
@@ -4614,23 +4568,17 @@ author    = {Yoshua Bengio and
               Jozef Mokry and
               Maria Nadejde},
  title     = {Nematus: a Toolkit for Neural Machine Translation},
-  publisher = {Proceedings of the 15th Conference of the European Chapter of the
+  publisher = {Conference of the European Chapter of the Association for Computational Linguistics},
-               Association for Computational Linguistics, {EACL} 2017, Valencia,
-               Spain, April 3-7, 2017, Software Demonstrations},
  pages     = {65--68},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2017}
-  year      = {2017},
 }
 @inproceedings{DBLP:journals/corr/abs-1905-13324,
  author    = {Biao Zhang and
               Rico Sennrich},
  title     = {A Lightweight Recurrent Network for Sequence Modeling},
-  publisher = {Proceedings of the 57th Conference of the Association for Computational
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-               Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,
-               Volume 1: Long Papers},
  pages     = {1538--1548},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2019}
-  year      = {2019},
 }
 @article{Lei2017TrainingRA,
  author    = {Tao Lei and
@@ -4639,7 +4587,7 @@ author    = {Yoshua Bengio and
  title     = {Training RNNs as Fast as CNNs},
  journal   = {CoRR},
  volume    = {abs/1709.02755},
-  year      = {2017},
+  year      = {2017}
 }
 @inproceedings{Zhang2018SimplifyingNM,
  author    = {Biao Zhang and
@@ -4649,22 +4597,18 @@ author    = {Yoshua Bengio and
               Huiji Zhang},
  title     = {Simplifying Neural Machine Translation with Addition-Subtraction Twin-Gated
               Recurrent Networks},
-  publisher = {Proceedings of the 2018 Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
  pages     = {4273--4283},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2018}
-  year      = {2018},
 }
 @inproceedings{Liu_2019_CVPR,
  author    = {Shikun Liu and
               Edward Johns and
               Andrew J. Davison},
  title     = {End-To-End Multi-Task Learning With Attention},
-  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
+  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
-               2019, Long Beach, CA, USA, June 16-20, 2019},
  pages     = {1871--1880},
-  //publisher = {Computer Vision Foundation / {IEEE}},
+  year      = {2019}
-  year      = {2019},
 }
 @inproceedings{DBLP:journals/corr/abs-1811-00498,
  author    = {Ra{\'{u}}l V{\'{a}}zquez and
@@ -4672,11 +4616,9 @@ author    = {Yoshua Bengio and
               J{\"{o}}rg Tiedemann and
               Mathias Creutz},
  title     = {Multilingual {NMT} with a Language-Independent Attention Bridge},
-  publisher = {Proceedings of the 4th Workshop on Representation Learning for NLP,
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-               RepL4NLP@ACL 2019, Florence, Italy, August 2, 2019},
  pages     = {33--39},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2019}
-  year      = {2019},
 }
 @inproceedings{MoradiInterrogating,
  author    = {Pooya Moradi and
@@ -4684,11 +4626,9 @@ author    = {Yoshua Bengio and
               Anoop Sarkar},
  title     = {Interrogating the Explanatory Power of Attention in Neural Machine
               Translation},
-  publisher = {Proceedings of the 3rd Workshop on Neural Generation and Translation@EMNLP-IJCNLP
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               2019, Hong Kong, November 4, 2019},
  pages     = {221--230},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2019}
-  year      = {2019},
 }
 @inproceedings{WangNeural,
  author    = {Xing Wang and
@@ -4698,11 +4638,9 @@ author    = {Yoshua Bengio and
               Deyi Xiong and
               Min Zhang},
  title     = {Neural Machine Translation Advised by Statistical Machine Translation},
-  publisher = {Proceedings of the Thirty-First {AAAI} Conference on Artificial Intelligence,
+  publisher = {AAAI Conference on Artificial Intelligence},
-               February 4-9, 2017, San Francisco, California, {USA}},
  pages     = {3330--3336},
-  //publisher = {{AAAI} Press},
+  year      = {2017}
-  year      = {2017},
 }
 @inproceedings{Xiao2019SharingAW,
  author    = {Tong Xiao and
@@ -4711,12 +4649,9 @@ author    = {Yoshua Bengio and
               Zhengtao Yu and
               Tongran Liu},
  title     = {Sharing Attention Weights for Fast Transformer},
-  publisher = {Proceedings of the Twenty-Eighth International Joint Conference on
+  publisher = {International Joint Conference on Artificial Intelligence},
-               Artificial Intelligence, {IJCAI} 2019, Macao, China, August 10-16,
-               2019},
  pages     = {5292--5298},
-  //publisher = {ijcai.org},
+  year      = {2019}
-  year      = {2019},
 }
 @inproceedings{Yang2017TowardsBH,
  author    = {Baosong Yang and
@@ -4726,36 +4661,27 @@ author    = {Yoshua Bengio and
               Jingbo Zhu},
  title     = {Towards Bidirectional Hierarchical Representations for Attention-based
               Neural Machine Translation},
-  publisher = {Proceedings of the 2017 Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing, {EMNLP} 2017, Copenhagen, Denmark, September
-               9-11, 2017},
  pages     = {1432--1441},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2017}
-  year      = {2017},
 }
 @inproceedings{Wang2019TreeTI,
  author    = {Yau-Shian Wang and
               Hung-yi Lee and
               Yun-Nung Chen},
  title     = {Tree Transformer: Integrating Tree Structures into Self-Attention},
-  publisher = {Proceedings of the 2019 Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing and the 9th International Joint Conference on
-               Natural Language Processing, {EMNLP-IJCNLP} 2019, Hong Kong, China,
-               November 3-7, 2019},
-  //publisher = {Association for Computational Linguistics},
  pages     = {1061--1070},
-  year      = {2019},
+  year      = {2019}
 }
 @inproceedings{DBLP:journals/corr/abs-1809-01854,
  author    = {Jetic Gu and
               Hassan S. Shavarani and
               Anoop Sarkar},
  title     = {Top-down Tree Structured Decoding with Syntactic Connections for Neural Machine Translation and Parsing},
-  publisher = {Proceedings of the 2018 Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
  pages     = {401--413},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2018}
-  year      = {2018},
 }
 @inproceedings{DBLP:journals/corr/abs-1808-09374,
  author    = {Xinyi Wang and
@@ -4763,11 +4689,9 @@ author    = {Yoshua Bengio and
               Pengcheng Yin and
               Graham Neubig},
  title     = {A Tree-based Decoder for Neural Machine Translation},
-  publisher = {Proceedings of the 2018 Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
  pages     = {4772--4777},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2018}
-  year      = {2018},
 }
 @article{DBLP:journals/corr/ZhangZ16c,
  author    = {Jiajun Zhang and
@@ -4775,7 +4699,7 @@ author    = {Yoshua Bengio and
  title     = {Bridging Neural Machine Translation and Bilingual Dictionaries},
  journal   = {CoRR},
  volume    = {abs/1610.07272},
-  year      = {2016},
+  year      = {2016}
 }
 @article{Dai2019TransformerXLAL,
  author    = {Zihang Dai and
@@ -4787,7 +4711,7 @@ author    = {Yoshua Bengio and
  title     = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context},
  journal   = {CoRR},
  volume    = {abs/1901.02860},
-  year      = {2019},
+  year      = {2019}
 }
 @inproceedings{li-etal-2019-word,
  author    = {Xintong Li and
@@ -4796,12 +4720,9 @@ author    = {Yoshua Bengio and
               Max Meng and
               Shuming Shi},
  title     = {On the Word Alignment from Neural Machine Translation},
-  publisher = {Proceedings of the 57th Conference of the Association for Computational
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-               Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,
-               Volume 1: Long Papers},
  pages     = {1293--1303},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2019}
-  year      = {2019},
 }
 @inproceedings{Werlen2018DocumentLevelNM,
@@ -4811,11 +4732,9 @@ author    = {Yoshua Bengio and
               James Henderson},
  title     = {Document-Level Neural Machine Translation with Hierarchical Attention
               Networks},
-  publisher = {Proceedings of the 2018 Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
  pages     = {2947--2954},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2018}
-  year      = {2018},
 }
 @inproceedings{DBLP:journals/corr/abs-1805-10163,
  author    = {Elena Voita and
@@ -4823,12 +4742,9 @@ author    = {Yoshua Bengio and
               Rico Sennrich and
               Ivan Titov},
  title     = {Context-Aware Neural Machine Translation Learns Anaphora Resolution},
-  publisher = {Proceedings of the 56th Annual Meeting of the Association for Computational
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-               Linguistics, {ACL} 2018, Melbourne, Australia, July 15-20, 2018, Volume
-               1: Long Papers},
  pages     = {1264--1274},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2018}
-  year      = {2018},
 }
 @article{DBLP:journals/corr/abs-1906-00532,
  author    = {Aishwarya Bhandare and
@@ -4842,7 +4758,7 @@ author    = {Yoshua Bengio and
               Translation Model},
  journal   = {CoRR},
  volume    = {abs/1906.00532},
-  year      = {2019},
+  year      = {2019}
 }
 @inproceedings{Zhang2018SpeedingUN,
@@ -4852,22 +4768,18 @@ author    = {Yoshua Bengio and
               Lei Shen and
               Qun Liu},
  title     = {Speeding Up Neural Machine Translation Decoding by Cube Pruning},
-  publisher = {Proceedings of the 2018 Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
  pages     = {4284--4294},
-  //publisher = {Association for Computational Linguistics},
+  year      = {2018}
-  year      = {2018},
 }
 @inproceedings{DBLP:journals/corr/SeeLM16,
  author    = {Abigail See and
               Minh-Thang Luong and
               Christopher D. Manning},
  title     = {Compression of Neural Machine Translation Models via Pruning},
-  publisher = {Proceedings of the 20th {SIGNLL} Conference on Computational Natural
+  publisher = {Conference on Computational Natural Language Learning},
-               Language Learning, CoNLL 2016, Berlin, Germany, August 11-12, 2016},
  pages     = {291--301},
-  //publisher = {{ACL}},
+  year      = {2016}
-  year      = {2016},
 }
 @inproceedings{DBLP:journals/corr/ChenLCL17,
  author    = {Yun Chen and
@@ -4875,12 +4787,9 @@ author    = {Yoshua Bengio and
               Yong Cheng and
               Victor O. K. Li},
  title     = {A Teacher-Student Framework for Zero-Resource Neural Machine Translation},
-  publisher = {Proceedings of the 55th Annual Meeting of the Association for Computational
-               Linguistics, {ACL} 2017, Vancouver, Canada, July 30 - August 4, Volume
-               1: Long Papers},
  pages     = {1925--1935},
-  //publisher = {Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2017},
+  year      = {2017}
 }
 @article{Hinton2015Distilling,
  author    = {Geoffrey E. Hinton and
@@ -4889,13 +4798,13 @@ author    = {Yoshua Bengio and
  title     = {Distilling the Knowledge in a Neural Network},
  journal   = {CoRR},
  volume    = {abs/1503.02531},
-  year      = {2015},
+  year      = {2015}
 }
 @inproceedings{Ott2018ScalingNM,
  title={Scaling Neural Machine Translation},
  author={Myle Ott and Sergey Edunov and David Grangier and M. Auli},
-  publisher={Workshop on Machine Translation},
+  publisher={Conference on Machine Translation},
  year={2018}
 }
 @inproceedings{Lin2020TowardsF8,
@@ -4915,7 +4824,7 @@ author    = {Yoshua Bengio and
               Alexander M. Rush},
  title     = {Sequence-Level Knowledge Distillation},
  pages     = {1317--1327},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2016}
 }
 @article{Akaike1969autoregressive,
@@ -4946,13 +4855,13 @@ author    = {Yoshua Bengio and
  title     = {The Best of Both Worlds: Combining Recent Advances in Neural Machine
               Translation},
  pages     = {76--86},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
 @inproceedings{He2018LayerWiseCB,
  title={Layer-Wise Coordination between Encoder and Decoder for Neural Machine Translation},
  author={Tianyu He and X. Tan and Yingce Xia and D. He and T. Qin and Zhibo Chen and T. Liu},
-  publisher={Conference and Workshop on Neural Information Processing Systems},
+  publisher={Conference on Neural Information Processing Systems},
  year={2018}
 }
 @inproceedings{cho-etal-2014-properties,
@@ -4962,7 +4871,7 @@ author    = {Yoshua Bengio and
               Yoshua Bengio},
  title     = {On the Properties of Neural Machine Translation: Encoder-Decoder Approaches},
  pages     = {103--111},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2014}
 }
@@ -4973,7 +4882,7 @@ author    = {Yoshua Bengio and
               Yoshua Bengio},
  title     = {On Using Very Large Target Vocabulary for Neural Machine Translation},
  pages     = {1--10},
-  publisher = {The Association for Computer Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2015}
 }
@@ -4982,8 +4891,7 @@ author    = {Yoshua Bengio and
               Hieu Pham and
               Christopher D. Manning},
  title     = {Effective Approaches to Attention-based Neural Machine Translation},
-  publisher = {Conference on Empirical Methods in Natural
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
-               Language Processing},
  pages     = {1412--1421},
  year      = {2015}
 }
@@ -4994,7 +4902,7 @@ author    = {Yoshua Bengio and
               Haifeng Wang},
  title     = {Improved Neural Machine Translation with {SMT} Features},
  pages     = {151--157},
-  publisher = {the Association for the Advance of Artificial Intelligence},
+  publisher = {AAAI Conference on Artificial Intelligence},
  year      = {2016}
 }
 @inproceedings{zhang-etal-2017-prior,
@@ -5005,7 +4913,7 @@ author    = {Yoshua Bengio and
      Xu, Jingfang  and
      Sun, Maosong},
    year = {2017},
-    publisher = {Association for Computational Linguistics},
+    publisher = {Annual Meeting of the Association for Computational Linguistics},
    pages = {1514--1523},
 }
@@ -5021,7 +4929,7 @@ author    = {Yoshua Bengio and
  title     = {Bilingual Dictionary Based Neural Machine Translation without Using
               Parallel Sentences},
  pages     = {1570--1579},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2020}
 }
@@ -5030,7 +4938,7 @@ author    = {Yoshua Bengio and
               Deyi Xiong},
  title     = {Encoding Gated Translation Memory into Neural Machine Translation},
  pages     = {3042--3047},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2018}
 }
 @inproceedings{yang-etal-2016-hierarchical,
@@ -5042,7 +4950,7 @@ author    = {Yoshua Bengio and
               Eduard H. Hovy},
  title     = {Hierarchical Attention Networks for Document Classification},
  pages     = {1480--1489},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2016}
 }
 %%%%% chapter 10------------------------------------------------------
@@ -5056,7 +4964,7 @@ author    = {Yoshua Bengio and
               Douwe Kiela},
  title     = {Code-Switched Named Entity Recognition with Embedding Attention},
  pages     = {154--158},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2018}
 }
@@ -5069,7 +4977,7 @@ author    = {Yoshua Bengio and
  title     = {Leveraging Linguistic Structures for Named Entity Recognition with
               Bidirectional Recursive Neural Networks},
  pages     = {2664--2669},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2017}
 }
@@ -5077,7 +4985,7 @@ author    = {Yoshua Bengio and
  author    = {Xuezhe Ma and
               Eduard H. Hovy},
  title     = {End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF},
-  publisher = {The Association for Computer Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2016}
 }
@@ -5088,7 +4996,7 @@ author    = {Yoshua Bengio and
               Andrew McCallum},
  title     = {Fast and Accurate Entity Recognition with Iterated Dilated Convolutions},
  pages     = {2670--2680},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2017}
 }
@@ -5107,26 +5015,21 @@ author    = {Yoshua Bengio and
  year      = {2017}
 }
-@article{DBLP:journals/jmlr/CollobertWBKKK11,
+@article{2011Natural,
-  author    = {Ronan Collobert and
+  title={Natural Language Processing (almost) from Scratch},
-               Jason Weston and
+  author={ Collobert, Ronan  and  Weston, Jason  and Bottou, Léon and  Karlen, Michael  and  Kavukcuoglu, Koray  and  Kuksa, Pavel },
-               L{\'{e}}on Bottou and
+  journal={Journal of Machine Learning Research},
-               Michael Karlen and
+  volume={12},
-               Koray Kavukcuoglu and
+  number={1},
-               Pavel P. Kuksa},
+  pages={2493-2537},
-  title     = {Natural Language Processing (Almost) from Scratch},
+  year={2011},
-  journal   = {J. Mach. Learn. Res.},
-  volume    = {12},
-  pages     = {2493--2537},
-  year      = {2011}
 }
 @inproceedings{DBLP:conf/acl/NguyenG15,
  author    = {Thien Huu Nguyen and
               Ralph Grishman},
  title     = {Event Detection and Domain Adaptation with Convolutional Neural Networks},
  pages     = {365--371},
-  publisher = {The Association for Computer Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2015}
 }
@@ -5137,7 +5040,7 @@ author    = {Yoshua Bengio and
               Jun Zhao},
  title     = {Recurrent Convolutional Neural Networks for Text Classification},
  pages     = {2267--2273},
-  publisher = {the Association for the Advance of Artificial Intelligence},
+  publisher = {AAAI Conference on Artificial Intelligence},
  year      = {2015}
 }
@@ -5149,7 +5052,7 @@ author    = {Yoshua Bengio and
               Jun Zhao},
  title     = {Event Extraction via Dynamic Multi-Pooling Convolutional Neural Networks},
  pages     = {167--176},
-  publisher = {The Association for Computer Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2015}
 }
@@ -5159,7 +5062,7 @@ author    = {Yoshua Bengio and
               Tommi S. Jaakkola},
  title     = {Molding CNNs for text: non-linear, non-consecutive convolutions},
  pages     = {1565--1575},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2015}
 }
@@ -5169,7 +5072,7 @@ author    = {Yoshua Bengio and
  title     = {Effective Use of Word Order for Text Categorization with Convolutional
               Neural Networks},
  pages     = {103--112},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2015}
 }
@@ -5178,14 +5081,14 @@ author    = {Yoshua Bengio and
               Ralph Grishman},
  title     = {Relation Extraction: Perspective from Convolutional Neural Networks},
  pages     = {39--48},
-  publisher = {The Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2015}
 }
 @article{StahlbergNeural,
  title={Neural Machine Translation: A Review},
  author={Felix Stahlberg},
-  journal={journal of artificial intelligence research},
+  journal={Journal of Artificial Intelligence Research},
  year={2020},
  volume={69},
  pages={343-418}
@@ -5211,7 +5114,7 @@ author    = {Yoshua Bengio and
 @article{Waibel1989PhonemeRU,
  title={Phoneme recognition using time-delay neural networks},
  author={Alexander H. Waibel and Toshiyuki Hanazawa and Geoffrey E. Hinton and K. Shikano and K. Lang},
-  journal={IEEE Trans. Acoust. Speech Signal Process.},
+  journal={IEEE Transactions on Acoustics, Speech, and Signal Processing},
  year={1989},
  volume={37},
  pages={328-339}
@@ -5226,7 +5129,7 @@ author    = {Yoshua Bengio and
  pages={541-551}
 }
-@ARTICLE{726791,
+@article{726791,
  author={Y. {Lecun} and L. {Bottou} and Y. {Bengio} and P. {Haffner}},
  journal={Proceedings of the IEEE}, 
  title={Gradient-based learning applied to document recognition}, 
@@ -5234,7 +5137,6 @@ author    = {Yoshua Bengio and
  volume={86},
  number={11},
  pages={2278-2324},
-  //doi={10.1109/5.726791}
 }
 @inproceedings{DBLP:journals/corr/HeZRS15,
@@ -5262,7 +5164,7 @@ author    = {Yoshua Bengio and
 @article{Girshick2015FastR,
  title={Fast R-CNN},
  author={Ross B. Girshick},
-  journal={2015 IEEE International Conference on Computer Vision (ICCV)},
+  journal={International Conference on Computer Vision},
  year={2015},
  pages={1440-1448}
 }
@@ -5279,7 +5181,7 @@ author    = {Yoshua Bengio and
 @inproceedings{Kalchbrenner2014ACN,
  title={A Convolutional Neural Network for Modelling Sentences},
  author={Nal Kalchbrenner and Edward Grefenstette and P. Blunsom},
-  booktitle={ACL},
+  publisher={Annual Meeting of the Association for Computational Linguistics},
  pages={655--665},
  year={2014}
 }
@@ -5287,7 +5189,7 @@ author    = {Yoshua Bengio and
 @inproceedings{Kim2014ConvolutionalNN,
  title={Convolutional Neural Networks for Sentence Classification},
  author={Yoon Kim},
-  booktitle={Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing},
+  publisher={Conference on Empirical Methods in Natural Language Processing},
  pages = {1746--1751},
  year={2014}
 }
@@ -5299,7 +5201,7 @@ author    = {Yoshua Bengio and
               Bowen Zhou and
               Bing Xiang},
  pages = {174--179},
-  booktitle={The Association for Computer Linguistics},
+  publisher={Annual Meeting of the Association for Computational Linguistics},
  year={2015}
 }
@@ -5308,7 +5210,7 @@ author    = {Yoshua Bengio and
  author    = {C{\'{\i}}cero Nogueira dos Santos and
               Maira Gatti},
  pages     = {69--78},
-  publisher = {The Association for Computer Linguistics},
+  publisher = {International Conference on Computational Linguistics},
  year={2014}
 }
@@ -5318,7 +5220,7 @@ author    = {Yoshua Bengio and
               Angela Fan and
               Michael Auli and
               David Grangier},
-  booktitle={Proceedings of the 34th International Conference on Machine Learning},
+  publisher={International Conference on Machine Learning},
  volume    = {70},
  pages     = {933--941},
  year={2017}
@@ -5330,7 +5232,7 @@ author    = {Yoshua Bengio and
               Michael Auli and
               David Grangier and
               Yann N. Dauphin},
-  booktitle={The Association for Computer Linguistics},
+  publisher={Annual Meeting of the Association for Computational Linguistics},
  pages     = {123--135},
  year={2017}
 }
@@ -5353,7 +5255,7 @@ author    = {Yoshua Bengio and
  author    = {Lukasz Kaiser and
               Aidan N. Gomez and
               Fran{\c{c}}ois Chollet},
-  publisher = {OpenReview.net},
+  journal = {International Conference on Learning Representations},
  year={2018},
 }
@@ -5364,7 +5266,7 @@ author    = {Yoshua Bengio and
 		 Yann N. Dauphin and
 		 Michael Auli},
 title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
- publisher = {7th International Conference on Learning Representations},
+ publisher = {International Conference on Learning Representations},
 year = {2019},
 }
@@ -5421,7 +5323,7 @@ author    = {Yoshua Bengio and
               Shaoqing Ren and
               Jian Sun},
  title     = {Deep Residual Learning for Image Recognition},
-  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
+  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
 }
@@ -5432,26 +5334,26 @@ author    = {Yoshua Bengio and
               Arthur Szlam and
               Jason Weston and
               Rob Fergus},
-  booktitle={Conference and Workshop on Neural Information Processing Systems},
+  publisher={Conference on Neural Information Processing Systems},
  pages     = {2440--2448},
  year={2015}
 }
-@article{Islam2020HowMP,
+@inproceedings{Islam2020HowMP,
-  title={How Much Position Information Do Convolutional Neural Networks Encode?},
+  author    = {Md. Amirul Islam and
-  author={Md. Amirul Islam and Sen Jia and Neil D. B. Bruce},
+               Sen Jia and
-  journal={ArXiv},
+               Neil D. B. Bruce},
-  year={2020},
+  title     = {How much Position Information Do Convolutional Neural Networks Encode?},
-  volume={abs/2001.08248}
+  publisher = {International Conference on Learning Representations},
+  year      = {2020},
 }
 @inproceedings{Sutskever2013OnTI,
  title={On the importance of initialization and momentum in deep learning},
  author    = {Ilya Sutskever and
               James Martens and
               George E. Dahl and
               Geoffrey E. Hinton},
-  booktitle={International Conference on Machine Learning},
+  publisher = {International Conference on Machine Learning},
  pages     = {1139--1147},
  year={2013}
 }
@@ -5459,7 +5361,7 @@ author    = {Yoshua Bengio and
 @article{Bengio2013AdvancesIO,
  title={Advances in optimizing recurrent networks},
  author={Yoshua Bengio and Nicolas Boulanger-Lewandowski and Razvan Pascanu},
-  journal={2013 IEEE International Conference on Acoustics, Speech and Signal Processing},
+  journal={IEEE International Conference on Acoustics, Speech and Signal Processing},
  year={2013},
  pages={8624-8628}
 }
@@ -5476,7 +5378,7 @@ author    = {Yoshua Bengio and
 @article{Chollet2017XceptionDL,
  title={Xception: Deep Learning with Depthwise Separable Convolutions},
  author    = {Fran{\c{c}}ois Chollet},
-  journal={2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  journal={IEEE Conference on Computer Vision and Pattern Recognition},
  year={2017},
  pages={1800-1807}
 }
@@ -5512,7 +5414,7 @@ author    = {Yoshua Bengio and
  title={Rotation, Scaling and Deformation Invariant Scattering for Texture Discrimination},
  author    = {Laurent Sifre and
               St{\'{e}}phane Mallat},
-  journal={2013 IEEE Conference on Computer Vision and Pattern Recognition},
+  journal={IEEE Conference on Computer Vision and Pattern Recognition},
  year={2013},
  pages={1233-1240}
 }
@@ -5520,7 +5422,7 @@ author    = {Yoshua Bengio and
 @article{Taigman2014DeepFaceCT,
  title={DeepFace: Closing the Gap to Human-Level Performance in Face Verification},
  author={Yaniv Taigman and Ming Yang and Marc'Aurelio Ranzato and Lior Wolf},
-  journal={2014 IEEE Conference on Computer Vision and Pattern Recognition},
+  journal={IEEE Conference on Computer Vision and Pattern Recognition},
  year={2014},
  pages={1701-1708}
 }
@@ -5533,7 +5435,7 @@ author    = {Yoshua Bengio and
               Mirk{\'{o}} Visontai and
               Raziel Alvarez and
               Carolina Parada},
-  booktitle={the International Speech Communication Association},
+  publisher={Conference of the International Speech Communication Association},
  pages     = {1136--1140},
  year={2015}
 }
@@ -5546,7 +5448,7 @@ author    = {Yoshua Bengio and
               Dongdong Chen and
               Lu Yuan and
               Zicheng Liu},
-  publisher = {Institute of Electrical and Electronics Engineers},
+  journal = {IEEE Conference on Computer Vision and Pattern Recognition},
  year={2020},
  pages={11027-11036}
 }
@@ -5563,7 +5465,7 @@ author    = {Yoshua Bengio and
               Chloe Hillier and
               Timothy P. Lillicrap},
  title     = {Compressive Transformers for Long-Range Sequence Modelling},
-  publisher = {OpenReview.net},
+  publisher = {International Conference on Learning Representations},
  year      = {2020}
 }
@@ -5597,7 +5499,7 @@ author    = {Yoshua Bengio and
               Yujun Lin and
               Song Han},
  title     = {Lite Transformer with Long-Short Range Attention},
-  publisher = {OpenReview.net},
+  publisher = {International Conference on Learning Representations},
  year      = {2020}
 }
@@ -5610,7 +5512,7 @@ author    = {Yoshua Bengio and
  title     = {Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy
               Lifting, the Rest Can Be Pruned},
  pages     = {5797--5808},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019},
 }
@@ -5623,7 +5525,7 @@ author    = {Yoshua Bengio and
               Bowen Zhou and
               Yoshua Bengio},
  title     = {A Structured Self-Attentive Sentence Embedding},
-  publisher = {5th International Conference on Learning Representations},
+  publisher = {International Conference on Learning Representations},
  year      = {2017},
 }
 @inproceedings{Shaw2018SelfAttentionWR,
@@ -5631,8 +5533,8 @@ author    = {Yoshua Bengio and
               Jakob Uszkoreit and
               Ashish Vaswani},
  title     = {Self-Attention with Relative Position Representations},
-  publisher = {Proceedings of the 2018 Conference of the North American Chapter of
+  publisher = {Proceedings of the Human Language Technology Conference of 
-               the Association for Computational Linguistics: Human Language Technologies},
+               the North American Chapter of the Association for Computational Linguistics},
  pages     = {464--468},
  year      = {2018},
 }
@@ -5642,7 +5544,7 @@ author    = {Yoshua Bengio and
               Shaoqing Ren and
               Jian Sun},
  title     = {Deep Residual Learning for Image Recognition},
-  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
+  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
 }
@@ -5661,7 +5563,7 @@ author    = {Yoshua Bengio and
               Jonathon Shlens and
               Zbigniew Wojna},
  title     = {Rethinking the Inception Architecture for Computer Vision},
-  publisher = {{IEEE} Conference on Computer Vision and Pattern Recognition},
+  publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {2818--2826},
  year      = {2016},
 }
@@ -5670,8 +5572,7 @@ author    = {Yoshua Bengio and
               Deyi Xiong and
               Jinsong Su},
  title     = {Accelerating Neural Transformer via an Average Attention Network},
-  publisher = {Proceedings of the 56th Annual Meeting of the Association for Computational
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
-               Linguistics},
  pages     = {1789--1798},
  year      = {2018},
 }
@@ -5691,7 +5592,7 @@ author    = {Yoshua Bengio and
 		 Yann N. Dauphin and
 		 Michael Auli},
 title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
- publisher = {7th International Conference on Learning Representations},
+ publisher = {International Conference on Learning Representations},
 year = {2019},
 }
@@ -5704,7 +5605,7 @@ author    = {Yoshua Bengio and
               Ruslan Salakhutdinov},
  title     = {Transformer-XL: Attentive Language Models beyond a Fixed-Length Context},
  pages     = {2978--2988},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
 @article{Liu2020LearningTE,
@@ -5729,7 +5630,7 @@ author    = {Yoshua Bengio and
               Tong Zhang},
  title     = {Modeling Localness for Self-Attention Networks},
  pages     = {4449--4458},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2018}
 }
 @inproceedings{DBLP:journals/corr/abs-1904-03107,
@@ -5740,7 +5641,7 @@ author    = {Yoshua Bengio and
 			Zhaopeng Tu},
 	title = {Convolutional Self-Attention Networks},
 	pages = {4040--4045},
-	publisher = {Association for Computational Linguistics},
+	publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
 	year = {2019},
 }
 @article{Wang2018MultilayerRF,
@@ -5759,7 +5660,7 @@ author    = {Yoshua Bengio and
  title     = {Training Deeper Neural Machine Translation Models with Transparent
               Attention},
  pages     = {3028--3033},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2018}
 }
 @inproceedings{Dou2018ExploitingDR,
@@ -5770,7 +5671,7 @@ author    = {Yoshua Bengio and
               Tong Zhang},
  title     = {Exploiting Deep Representations for Neural Machine Translation},
  pages     = {4253--4262},
-  publisher = {Association for Computational Linguistics},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2018}
 }
 @inproceedings{Wang2019ExploitingSC,
@@ -5789,13 +5690,13 @@ author    = {Yoshua Bengio and
               Tong Zhang},
  title     = {Dynamic Layer Aggregation for Neural Machine Translation with Routing-by-Agreement},
  pages     = {86--93},
-  publisher = {the Association for the Advance of Artificial Intelligence},
+  publisher = {AAAI Conference on Artificial Intelligence},
  year      = {2019}
 }
 @inproceedings{Wei2020MultiscaleCD,
  title={Multiscale Collaborative Deep Models for Neural Machine Translation},
  author={Xiangpeng Wei and Heng Yu and Yue Hu and Yue Zhang and Rongxiang Weng and Weihua Luo},
-  booktitle={Annual Meeting of the Association for Computational Linguistics},
+  publisher={Annual Meeting of the Association for Computational Linguistics},
  year={2020}
 }
@@ -5824,7 +5725,7 @@ author    = {Yoshua Bengio and
               Lukasz Kaiser and
               Anselm Levskaya},
  title     = {Reformer: The Efficient Transformer},
-  publisher = {OpenReview.net},
+  journal = {International Conference on Learning Representations},
  year      = {2020}
 }
@@ -5839,7 +5740,7 @@ author    = {Yoshua Bengio and
 @article{li2020shallow,
  title={Shallow-to-Deep Training for Neural Machine Translation},
  author={Li, Bei and Wang, Ziyang and Liu, Hui and Jiang, Yufan and Du, Quan and Xiao, Tong and Wang, Huizhen and Zhu, Jingbo},
-  publisher={Conference on Empirical Methods in Natural Language Processing},
+  journal={Conference on Empirical Methods in Natural Language Processing},
  year={2020}
 }
 %%%%% chapter 12------------------------------------------------------
@@ -6673,15 +6574,7 @@ author    = {Yoshua Bengio and
 publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@inproceedings{DBLP:conf/naacl/MohiuddinJ19,
-  author    = {Tasnim Mohiuddin and
-               Shafiq R. Joty},
-  title     = {Revisiting Adversarial Autoencoder for Unsupervised Word Translation
-               with Cycle Consistency and Improved Training},
-  pages     = {3857--3867},
-  publisher = {Annual Meeting of the Annual Meeting of the Association for Computational Linguistics},
-  year      = {2019}
-}
 @article{DBLP:journals/corr/abs-1811-01124,
  author    = {Jean Alaux and
               Edouard Grave and
@@ -6896,394 +6789,6 @@ author    = {Yoshua Bengio and
 publisher = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2019}
 }
-@article{2019ADabre,
-  title={A Survey of Multilingual Neural Machine Translation},
-  author={Dabre, Raj  and  Chu, Chenhui  and  Kunchukuttan, Anoop },
-  year={2019},
-}
-@inproceedings{DBLP:conf/naacl/ZophK16,
-  author    = {Barret Zoph and
-               Kevin Knight},
-  title     = {Multi-Source Neural Translation},
-  pages     = {30--34},
-  publisher = {Annual Conference of the North American Chapter of the Association for Computational Linguistics},
-  year      = {2016}
-}
-@inproceedings{DBLP:conf/naacl/FiratCB16,
-  author    = {Orhan Firat and
-               Kyunghyun Cho and
-               Yoshua Bengio},
-  title     = {Multi-Way, Multilingual Neural Machine Translation with a Shared Attention
-               Mechanism},
-  pages     = {866--875},
-  publisher = {Annual Meeting of the Association for Computational Linguistics},
-  year      = {2016}
-}
-@article{DBLP:journals/tacl/JohnsonSLKWCTVW17,
-  author    = {Melvin Johnson and
-               Mike Schuster and
-               Quoc V. Le and
-               Maxim Krikun and
-               Yonghui Wu and
-               Zhifeng Chen and
-               Nikhil Thorat and
-               Fernanda B. Vi{\'{e}}gas and
-               Martin Wattenberg and
-               Greg Corrado and
-               Macduff Hughes and
-               Jeffrey Dean},
-  title     = {Google's Multilingual Neural Machine Translation System: Enabling
-               Zero-Shot Translation},
-  journal   = {Trans. Assoc. Comput. Linguistics},
-  volume    = {5},
-  pages     = {339--351},
-  year      = {2017}
-}
-@inproceedings{DBLP:conf/emnlp/KimPPKN19,
-  author    = {Yunsu Kim and
-               Petre Petrov and
-               Pavel Petrushkov and
-               Shahram Khadivi and
-               Hermann Ney},
-  title     = {Pivot-based Transfer Learning for Neural Machine Translation between
-               Non-English Languages},
-  pages     = {866--876},
-  publisher = {Association for Computational Linguistics},
-  year      = {2019}
-}
-@inproceedings{DBLP:conf/acl/ChenLCL17,
-  author    = {Yun Chen and
-               Yang Liu and
-               Yong Cheng and
-               Victor O. K. Li},
-  title     = {A Teacher-Student Framework for Zero-Resource Neural Machine Translation},
-  pages     = {1925--1935},
-  publisher = {Association for Computational Linguistics},
-  year      = {2017}
-}
-@article{DBLP:journals/mt/WuW07,
-  author    = {Hua Wu and
-               Haifeng Wang},
-  title     = {Pivot language approach for phrase-based statistical machine translation},
-  journal   = {Mach. Transl.},
-  volume    = {21},
-  number    = {3},
-  pages     = {165--181},
-  year      = {2007}
-}
-@article{Farsi2010somayeh,
-  author    = {Somayeh Bakhshaei and Shahram Khadivi and Noushin Riahi },
-  title     = {Farsi-german statistical machine translation through bridge language},
-  publisher   = {International Telecommunications Symposium},
-  pages     = {165--181},
-  year      = {2010}
-}
-@inproceedings{DBLP:conf/acl/ZahabiBK13,
-  author    = {Samira Tofighi Zahabi and
-               Somayeh Bakhshaei and
-               Shahram Khadivi},
-  title     = {Using Context Vectors in Improving a Machine Translation System with
-               Bridge Language},
-  pages     = {318--322},
-  publisher = {The Association for Computer Linguistics},
-  year      = {2013}
-}
-@inproceedings{DBLP:conf/emnlp/ZhuHWZWZ14,
-  author    = {Xiaoning Zhu and
-               Zhongjun He and
-               Hua Wu and
-               Conghui Zhu and
-               Haifeng Wang and
-               Tiejun Zhao},
-  title     = {Improving Pivot-Based Statistical Machine Translation by Pivoting
-               the Co-occurrence Count of Phrase Pairs},
-  pages     = {1665--1675},
-  publisher = {{ACL}},
-  year      = {2014}
-}
-@inproceedings{DBLP:conf/acl/MiuraNSTN15,
-  author    = {Akiva Miura and
-               Graham Neubig and
-               Sakriani Sakti and
-               Tomoki Toda and
-               Satoshi Nakamura},
-  title     = {Improving Pivot Translation by Remembering the Pivot},
-  pages     = {573--577},
-  publisher = {The Association for Computer Linguistics},
-  year      = {2015}
-}
-@inproceedings{DBLP:conf/acl/CohnL07,
-  author    = {Trevor Cohn and
-               Mirella Lapata},
-  title     = {Machine Translation by Triangulation: Making Effective Use of Multi-Parallel
-               Corpora},
-  publisher = {The Association for Computational Linguistics},
-  year      = {2007}
-}
-@article{DBLP:journals/mt/WuW07,
-  author    = {Hua Wu and
-               Haifeng Wang},
-  title     = {Pivot language approach for phrase-based statistical machine translation},
-  journal   = {Mach. Transl.},
-  volume    = {21},
-  number    = {3},
-  pages     = {165--181},
-  year      = {2007}
-}
-@inproceedings{DBLP:conf/acl/WuW09,
-  author    = {Hua Wu and
-               Haifeng Wang},
-  title     = {Revisiting Pivot Language Approach for Machine Translation},
-  pages     = {154--162},
-  publisher = {The Association for Computer Linguistics},
-  year      = {2009}
-}
-@article{DBLP:journals/corr/ChengLYSX16,
-  author    = {Yong Cheng and
-               Yang Liu and
-               Qian Yang and
-               Maosong Sun and
-               Wei Xu},
-  title     = {Neural Machine Translation with Pivot Languages},
-  journal   = {CoRR},
-  volume    = {abs/1611.04928},
-  year      = {2016}
-}
-@inproceedings{DBLP:conf/interspeech/KauersVFW02,
-  author    = {Manuel Kauers and
-               Stephan Vogel and
-               Christian F{\"{u}}gen and
-               Alex Waibel},
-  title     = {Interlingua based statistical machine translation},
-  publisher = {International Symposium on Computer Architecture},
-  year      = {2002}
-}
-@inproceedings{de2006catalan,
-  title={Catalan-English statistical machine translation without parallel corpus: bridging through Spanish},
-  author={De Gispert, Adri{\`a} and Marino, Jose B},
-  booktitle={Proc. of 5th International Conference on Language Resources and Evaluation (LREC)},
-  pages={65--68},
-  year={2006}
-}
-@inproceedings{DBLP:conf/naacl/UtiyamaI07,
-  author    = {Masao Utiyama and
-               Hitoshi Isahara},
-  title     = {A Comparison of Pivot Methods for Phrase-Based Statistical Machine
-               Translation},
-  pages     = {484--491},
-  publisher = {The Association for Computational Linguistics},
-  year      = {2007}
-}
-@inproceedings{DBLP:conf/ijcnlp/Costa-JussaHB11,
-  author    = {Marta R. Costa-juss{\`{a}} and
-               Carlos A. Henr{\'{\i}}quez Q. and
-               Rafael E. Banchs},
-  title     = {Enhancing scarce-resource language translation through pivot combinations},
-  pages     = {1361--1365},
-  publisher = {The Association for Computer Linguistics},
-  year      = {2011}
-}
-@article{DBLP:journals/corr/HintonVD15,
-  author    = {Geoffrey E. Hinton and
-               Oriol Vinyals and
-               Jeffrey Dean},
-  title     = {Distilling the Knowledge in a Neural Network},
-  journal   = {CoRR},
-  volume    = {abs/1503.02531},
-  year      = {2015}
-}
-@article{gu2018meta,
-  title={Meta-learning for low-resource neural machine translation},
-  author={Gu, Jiatao and Wang, Yong and Chen, Yun and Cho, Kyunghyun and Li, Victor OK},
-  journal={arXiv preprint arXiv:1808.08437},
-  year={2018}
-}
-@inproceedings{DBLP:conf/naacl/GuHDL18,
-  author    = {Jiatao Gu and
-               Hany Hassan and
-               Jacob Devlin and
-               Victor O. K. Li},
-  title     = {Universal Neural Machine Translation for Extremely Low Resource Languages},
-  pages     = {344--354},
-  publisher = {Association for Computational Linguistics},
-  year      = {2018}
-}
-@inproceedings{DBLP:conf/icml/FinnAL17,
-  author    = {Chelsea Finn and
-               Pieter Abbeel and
-               Sergey Levine},
-  title     = {Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks},
-  series    = {Proceedings of Machine Learning Research},
-  volume    = {70},
-  pages     = {1126--1135},
-  publisher = {International Conference on Machine Learning},
-  year      = {2017}
-}
-@inproceedings{DBLP:conf/acl/DongWHYW15,
-  author    = {Daxiang Dong and
-               Hua Wu and
-               Wei He and
-               Dianhai Yu and
-               Haifeng Wang},
-  title     = {Multi-Task Learning for Multiple Language Translation},
-  pages     = {1723--1732},
-  publisher = {The Association for Computer Linguistics},
-  year      = {2015}
-}
-@article{DBLP:journals/tacl/LeeCH17,
-  author    = {Jason Lee and
-               Kyunghyun Cho and
-               Thomas Hofmann},
-  title     = {Fully Character-Level Neural Machine Translation without Explicit
-               Segmentation},
-  journal   = {Trans. Assoc. Comput. Linguistics},
-  volume    = {5},
-  pages     = {365--378},
-  year      = {2017}
-}
-@inproceedings{DBLP:conf/lrec/RiktersPK18,
-  author    = {Matiss Rikters and
-               Marcis Pinnis and
-               Rihards Krislauks},
-  title     = {Training and Adapting Multilingual {NMT} for Less-resourced and Morphologically
-               Rich Languages},
-  publisher = {European Language Resources Association},
-  year      = {2018}
-}
-@article{DBLP:journals/tkde/PanY10,
-  author    = {Sinno Jialin Pan and
-               Qiang Yang},
-  title     = {A Survey on Transfer Learning},
-  journal   = {{IEEE} Trans. Knowl. Data Eng.},
-  volume    = {22},
-  number    = {10},
-  pages     = {1345--1359},
-  year      = {2010}
-}
-@article{DBLP:journals/tacl/JohnsonSLKWCTVW17,
-  author    = {Melvin Johnson and
-               Mike Schuster and
-               Quoc V. Le and
-               Maxim Krikun and
-               Yonghui Wu and
-               Zhifeng Chen and
-               Nikhil Thorat and
-               Fernanda B. Vi{\'{e}}gas and
-               Martin Wattenberg and
-               Greg Corrado and
-               Macduff Hughes and
-               Jeffrey Dean},
-  title     = {Google's Multilingual Neural Machine Translation System: Enabling
-               Zero-Shot Translation},
-  journal   = {Trans. Assoc. Comput. Linguistics},
-  volume    = {5},
-  pages     = {339--351},
-  year      = {2017}
-}
-@book{2009Handbook,
-  title={Handbook Of Research On Machine Learning Applications and Trends: Algorithms, Methods and Techniques - 2 Volumes},
-  author={ Olivas, Emilio Soria  and  Guerrero, Jose David Martin  and  Sober, Marcelino Martinez  and  Benedito, Jose Rafael Magdalena  and  Lopez, Antonio Jose Serrano },
-  publisher={Information Science Reference - Imprint of: IGI Publishing},
-  year={2009},
-}
-@incollection{DBLP:books/crc/aggarwal14/Pan14,
-  author    = {Sinno Jialin Pan},
-  title     = {Transfer Learning},
-  booktitle = {Data Classification: Algorithms and Applications},
-  pages     = {537--570},
-  publisher = {{CRC} Press},
-  year      = {2014}
-}
-@inproceedings{DBLP:conf/iclr/TanRHQZL19,
-  author    = {Xu Tan and
-               Yi Ren and
-               Di He and
-               Tao Qin and
-               Zhou Zhao and
-               Tie-Yan Liu},
-  title     = {Multilingual Neural Machine Translation with Knowledge Distillation},
-  publisher = {OpenReview.net},
-  year      = {2019}
-}
-@article{platanios2018contextual,
-  title={Contextual parameter generation for universal neural machine translation},
-  author={Platanios, Emmanouil Antonios and Sachan, Mrinmaya and Neubig, Graham and Mitchell, Tom},
-  journal={arXiv preprint arXiv:1808.08493},
-  year={2018}
-}
-@inproceedings{ji2020cross,
-  title={Cross-Lingual Pre-Training Based Transfer for Zero-Shot Neural Machine Translation},
-  author={Ji, Baijun and Zhang, Zhirui and Duan, Xiangyu and Zhang, Min and Chen, Boxing and Luo, Weihua},
-  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
-  volume={34},
-  number={01},
-  pages={115--122},
-  year={2020}
-}
-@inproceedings{DBLP:conf/wmt/KocmiB18,
-  author    = {Tom Kocmi and
-               Ondrej Bojar},
-  title     = {Trivial Transfer Learning for Low-Resource Neural Machine Translation},
-  pages     = {244--252},
-  publisher = {Association for Computational Linguistics},
-  year      = {2018}
-}
-@inproceedings{DBLP:conf/acl/ZhangWTS20,
-  author    = {Biao Zhang and
-               Philip Williams and
-               Ivan Titov and
-               Rico Sennrich},
-  title     = {Improving Massively Multilingual Neural Machine Translation and Zero-Shot
-               Translation},
-  pages     = {1628--1639},
-  publisher = {Association for Computational Linguistics},
-  year      = {2020}
-}
-@inproceedings{DBLP:conf/naacl/PaulYSN09,
-  author    = {Michael Paul and
-               Hirofumi Yamamoto and
-               Eiichiro Sumita and
-               Satoshi Nakamura},
-  title     = {On the Importance of Pivot Language Selection for Statistical Machine
-               Translation},
-  pages     = {221--224},
-  publisher = {The Association for Computational Linguistics},
-  year      = {2009}
-}
-@article{dabre2019brief,
-  title={A Brief Survey of Multilingual Neural Machine Translation},
-  author={Dabre, Raj and Chu, Chenhui and Kunchukuttan, Anoop},
-  journal={arXiv preprint arXiv:1905.05395},
-  year={2019}
-}
-@article{dabre2020survey,
-  title={A survey of multilingual neural machine translation},
-  author={Dabre, Raj and Chu, Chenhui and Kunchukuttan, Anoop},
-  journal={ACM Computing Surveys (CSUR)},
-  volume={53},
-  number={5},
-  pages={1--38},
-  year={2020}
-}
-@inproceedings{DBLP:conf/emnlp/VulicGRK19,
-  author    = {Ivan Vulic and
-               Goran Glavas and
-               Roi Reichart and
-               Anna Korhonen},
-  title     = {Do We Really Need Fully Unsupervised Cross-Lingual Embeddings?},
-  pages     = {4406--4417},
-  publisher = {Association for Computational Linguistics},
-  year      = {2019}
-}
-@article{DBLP:journals/corr/MikolovLS13,
-  author    = {Tomas Mikolov and
-               Quoc V. Le and
-               Ilya Sutskever},
-  title     = {Exploiting Similarities among Languages for Machine Translation},
-  journal   = {CoRR},
-  volume    = {abs/1309.4168},
-  year      = {2013}
-}
 %%%%% chapter 16------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%