单韦乔 / Toy-MT-Introduction · Commits · 96c091cf

Commit 96c091cf, authored Mar 07, 2020 by xiaotong
Parent: 3ef421f4

    add dashes in figure names

The commit replaces the spaces in the Chapter 6 figure file names with dashes and updates the corresponding \input{} paths in Book/Chapter6/Chapter6.tex.

Showing 54 changed files with 53 additions and 51 deletions (+53, -51):
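A bulk rename like this is usually scripted rather than done by hand. The following is only an illustrative sketch, not code from this repository: it assumes the figures live under Book/Chapter6/Figures/ and that Book/Chapter6/Chapter6.tex holds the \input references, and it only covers the plain space-to-dash case (the one file that also gained a "figure-" prefix, process of 5.tex, would still need a manual rename).

# Illustrative sketch only (not part of Toy-MT-Introduction): replace spaces
# with dashes in the figure file names and update the \input paths to match.
from pathlib import Path

FIG_DIR = Path("Book/Chapter6/Figures")       # assumed figure directory
CHAPTER = Path("Book/Chapter6/Chapter6.tex")  # assumed file holding the \input lines

# 1. Rename every figure file whose name still contains spaces.
renames = {}
for tex in sorted(FIG_DIR.glob("*.tex")):
    if " " in tex.name:
        target = tex.with_name(tex.name.replace(" ", "-"))
        tex.rename(target)                    # plain rename; `git mv` in a checkout
        renames[tex.stem] = target.stem

# 2. Point the \input{./Chapter6/Figures/...} references at the new names.
text = CHAPTER.read_text(encoding="utf-8")
for old, new in renames.items():
    text = text.replace(f"./Chapter6/Figures/{old}", f"./Chapter6/Figures/{new}")
CHAPTER.write_text(text, encoding="utf-8")

In an actual checkout one would use git mv (or stage the renames afterwards) so that git records the moves, which is why the renamed figure files appear below as "File moved" with +0 -0 line changes.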
.gitignore  +2  -0
Book/Chapter6/Chapter6.tex  +51  -51
Book/Chapter6/Figures/figure-3-base-problom-of-P.tex  +0  -0
Book/Chapter6/Figures/figure-A-combination-of-position-encoding-and-word-encoding.tex  +0  -0
Book/Chapter6/Figures/figure-A-working-example-of-neural-machine-translation.tex  +0  -0
Book/Chapter6/Figures/figure-Attention-of-source-and-target-words.tex  +0  -0
Book/Chapter6/Figures/figure-Automatic-generation-of-ancient-poems-based-on-encoder-decoder-framework.tex  +0  -0
Book/Chapter6/Figures/figure-Automatically-generate-instances-of-couplets.tex  +0  -0
Book/Chapter6/Figures/figure-Beam-search-process.tex  +0  -0
Book/Chapter6/Figures/figure-Calculation-of-context-vector-C.tex  +0  -0
Book/Chapter6/Figures/figure-Calculation-process-of-context-vector-C.tex  +0  -0
Book/Chapter6/Figures/figure-Comparison-of-the-number-of-padding-in-batch.tex  +0  -0
Book/Chapter6/Figures/figure-Data-parallel-process.tex  +0  -0
Book/Chapter6/Figures/figure-Decode-the-word-probability-distribution-at-the-first-position.tex  +0  -0
Book/Chapter6/Figures/figure-Decoding-process-based-on-greedy-method.tex  +0  -0
Book/Chapter6/Figures/figure-Dependencies-between-words-in-a-recurrent-neural-network.tex  +0  -0
Book/Chapter6/Figures/figure-Dependencies-between-words-of-Attention.tex  +0  -0
Book/Chapter6/Figures/figure-Different-regularization-methods.tex  +0  -0
Book/Chapter6/Figures/figure-Double-layer-RNN.tex  +0  -0
Book/Chapter6/Figures/figure-Example-of-automatic-translation-of-classical-Chinese.tex  +0  -0
Book/Chapter6/Figures/figure-Example-of-context-vector-calculation-process.tex  +0  -0
Book/Chapter6/Figures/figure-Example-of-self-attention-mechanism-calculation.tex  +0  -0
Book/Chapter6/Figures/figure-Generate-summary.tex  +0  -0
Book/Chapter6/Figures/figure-Mask-instance-for-future-positions-in-Transformer.tex  +0  -0
Book/Chapter6/Figures/figure-Matrix-Representation-of-Attention-Weights-Between-Chinese-English-Sentence-Pairs.tex  +0  -0
Book/Chapter6/Figures/figure-Model-structure-based-on-recurrent-neural-network-translation.tex  +0  -0
Book/Chapter6/Figures/figure-Multi-Head-Attention-Model.tex  +0  -0
Book/Chapter6/Figures/figure-Output-layer-structur.tex  +0  -0
Book/Chapter6/Figures/figure-Point-product-attention-model.tex  +0  -0
Book/Chapter6/Figures/figure-Position-of-difference-and-layer-regularization-in-the-model.tex  +0  -0
Book/Chapter6/Figures/figure-Position-of-feedforward-neural-network-in-the-model.tex  +0  -0
Book/Chapter6/Figures/figure-Position-of-self-attention-mechanism-in-the-model.tex  +0  -0
Book/Chapter6/Figures/figure-Presentation-space.tex  +0  -0
Book/Chapter6/Figures/figure-Query-model-corresponding-to-attention-mechanism.tex  +0  -0
Book/Chapter6/Figures/figure-Query-model-corresponding-to-traditional-query-model-vs-attention-mechanism.tex  +0  -0
Book/Chapter6/Figures/figure-Query-model-corresponding-to-traditional-query-model-vs-attention-mechanism02.tex  +0  -0
Book/Chapter6/Figures/figure-Relationship-between-learning-rate-and-number-of-updates.tex  +0  -0
Book/Chapter6/Figures/figure-Residual-network-structure.tex  +0  -0
Book/Chapter6/Figures/figure-Structure-of-a-recurrent-network-model.tex  +0  -0
Book/Chapter6/Figures/figure-Structure-of-the-network-during-Transformer-training.tex  +0  -0
Book/Chapter6/Figures/figure-Transformer-input-and-position-encoding.tex  +0  -0
Book/Chapter6/Figures/figure-Word-embedding-structure.tex  +0  -0
Book/Chapter6/Figures/figure-a-simple-example-for-tl.tex  +0  -0
Book/Chapter6/Figures/figure-decode-of-transformer.tex  +0  -0
Book/Chapter6/Figures/figure-encoder-decoder-process.tex  +0  -0
Book/Chapter6/Figures/figure-encoder-decoder-with-Attention.tex  +0  -0
Book/Chapter6/Figures/figure-example-of-mt.tex  +0  -0
Book/Chapter6/Figures/figure-lrate-of-transformer.tex  +0  -0
Book/Chapter6/Figures/figure-numbers-of-WMT-systems.tex  +0  -0
Book/Chapter6/Figures/figure-process-of-5.tex  +0  -0
Book/Chapter6/Figures/figure-process-test-(2).tex  +0  -0
Book/Chapter6/Figures/figure-self-att-vs-enco-deco-att.tex  +0  -0
Book/Chapter6/Figures/figure-structure-of-GNMT.tex  +0  -0
Book/Chapter6/Figures/figure-the-whole-of-LSTM.tex  +0  -0

.gitignore

@@ -25,3 +25,5 @@ Book/mt-book.pdf
 Section07-Making-A-Strong-NMT-System/section07.pdf
 Section07-Towards-Strong-NMT-Systems/section07.pdf
 Book/mt-book.run.xml
+Book/mt-book-xelatex.bcf
+Book/mt-book-xelatex.idx

Book/Chapter6/Chapter6.tex

@@ -13,7 +13,7 @@
 %公式1.7之后往后串一个
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter{神经机器翻译}
+\chapter{神经机器翻译模型}
 \parinterval 神经机器翻译(Neural Machine Translation)是机器翻译的前沿方法。近几年,随着深度学习技术的发展和在各领域中的深入应用,基于端到端表示学习的方法正在改变着我们处理自然语言的方式,神经机器翻译在这种趋势下应运而生。一方面,神经机器翻译仍然延续着统计建模和基于数据驱动的思想,因此在基本问题的定义上与前人的研究是一致的;另一方面,神经机器翻译脱离了统计机器翻译中对隐含翻译结构的假设,同时使用分布式表示来对文字序列进行建模,这使得它可以从一个全新的视角看待翻译问题。现在,神经机器翻译已经成为了机器翻译研究及应用的热点,译文质量得到了巨大的提升。在本章中,我们将会对神经机器翻译的基础模型和方法进行介绍。
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -80,7 +80,7 @@
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-example of mt}
+\input{./Chapter6/Figures/figure-example-of-mt}
 \label{fig:6-61}
 \end{figure}
 %----------------------------------------------
@@ -92,7 +92,7 @@
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-numbers of WMT systems}
+\input{./Chapter6/Figures/figure-numbers-of-WMT-systems}
 \caption{WMT冠军系统的数量}
 \label{fig:6-2}
 \end{figure}
@@ -243,7 +243,7 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure- encoder-decoder-process}
+\input{./Chapter6/Figures/figure-encoder-decoder-process}
 \caption{encoder-decoder过程}
 \label{fig:6-4}
 \end{figure}
@@ -264,7 +264,7 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
 % 图3.6
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Presentation space}
+\input{./Chapter6/Figures/figure-Presentation-space}
 \caption{统计机器翻译和神经机器翻译的表示空间}
 \label{fig:6-5}
 \end{figure}
@@ -281,7 +281,7 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-a simple example for tl}
+\input{./Chapter6/Figures/figure-a-simple-example-for-tl}
 \label{fig:6-62}
 \end{figure}
 %----------------------------------------------
@@ -292,7 +292,7 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-A working example of neural machine translation}
+\input{./Chapter6/Figures/figure-A-working-example-of-neural-machine-translation}
 \caption{神经机器翻译的运行实例}
 \label{fig:6-6}
 \end{figure}
@@ -386,7 +386,7 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Structure of a recurrent network model}
+\input{./Chapter6/Figures/figure-Structure-of-a-recurrent-network-model}
 \caption{循环网络模型的结构}
 \label{fig:6-8}
 \end{figure}
@@ -398,7 +398,7 @@ NMT & $ 21.7^{\ast}$ & $18.7^{\ast}$ & -1
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Model structure based on recurrent neural network translation}
+\input{./Chapter6/Figures/figure-Model-structure-based-on-recurrent-neural-network-translation}
 \caption{基于循环神经网络翻译的模型结构}
 \label{fig:6-9}
 \end{figure}
@@ -454,7 +454,7 @@ $\textrm{P}({y_j | \mathbf{s}_{j-1} ,y_{j-1},C})$由Softmax实现,Softmax的
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-3 base problom of P}
+\input{./Chapter6/Figures/figure-3-base-problom-of-P}
 \caption{求解$\textrm{P}(y_j|\mathbf{y}_{<j},\mathbf{x})$的三个基本问题}
 \label{fig:6-10}
 \end{figure}
@@ -481,7 +481,7 @@ $\textrm{P}({y_j | \mathbf{s}_{j-1} ,y_{j-1},C})$由Softmax实现,Softmax的
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Word embedding structure}
+\input{./Chapter6/Figures/figure-Word-embedding-structure}
 \caption{词嵌入层结构}
 \label{fig:6-11}
 \end{figure}
@@ -493,7 +493,7 @@ $\textrm{P}({y_j | \mathbf{s}_{j-1} ,y_{j-1},C})$由Softmax实现,Softmax的
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Output layer structur}
+\input{./Chapter6/Figures/figure-Output-layer-structur}
 \caption{输出层结构}
 \label{fig:6-12}
 \end{figure}
@@ -628,7 +628,7 @@ $\textrm{P}({y_j | \mathbf{s}_{j-1} ,y_{j-1},C})$由Softmax实现,Softmax的
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-the whole of LSTM}
+\input{./Chapter6/Figures/figure-the-whole-of-LSTM}
 \caption{LSTM的整体结构}
 \label{fig:6-15}
 \end{figure}
@@ -703,7 +703,7 @@ $\textrm{P}({y_j | \mathbf{s}_{j-1} ,y_{j-1},C})$由Softmax实现,Softmax的
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Double layer RNN}\hspace{10em}
+\input{./Chapter6/Figures/figure-Double-layer-RNN}\hspace{10em}
 \caption{双层循环神经网络}
 \label{fig:6-18}
 \end{figure}
@@ -750,7 +750,7 @@ $\textrm{P}({y_j | \mathbf{s}_{j-1} ,y_{j-1},C})$由Softmax实现,Softmax的
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Attention of source and target words}
+\input{./Chapter6/Figures/figure-Attention-of-source-and-target-words}
 \caption{源语词和目标语词的关注度}
 \label{fig:6-20}
 \end{figure}
@@ -764,7 +764,7 @@ $\textrm{P}({y_j | \mathbf{s}_{j-1} ,y_{j-1},C})$由Softmax实现,Softmax的
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-encoder-decoder with Attention}
+\input{./Chapter6/Figures/figure-encoder-decoder-with-Attention}
 \caption{不使用(a)和使用(b)注意力机制的翻译模型对比}
 \label{fig:6-21}
 \end{figure}
@@ -786,7 +786,7 @@ $\textrm{P}({y_j | \mathbf{s}_{j-1} ,y_{j-1},C})$由Softmax实现,Softmax的
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Calculation process of context vector C}
+\input{./Chapter6/Figures/figure-Calculation-process-of-context-vector-C}
 \caption{上下文向量$\mathbf{C}_j$的计算过程}
 \label{fig:6-22}
 \end{figure}
@@ -828,7 +828,7 @@ $\textrm{a}(\cdot)$可以被看作是目标语表示和源语言表示的一种'
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Matrix Representation of Attention Weights Between Chinese-English Sentence Pairs}
+\input{./Chapter6/Figures/figure-Matrix-Representation-of-Attention-Weights-Between-Chinese-English-Sentence-Pairs}
 \caption{一个汉英句对之间的注意力权重{$\alpha_{i,j}$}的矩阵表示}
 \label{fig:6-23}
 \end{figure}
@@ -841,7 +841,7 @@ $\textrm{a}(\cdot)$可以被看作是目标语表示和源语言表示的一种'
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Example of context vector calculation process}
+\input{./Chapter6/Figures/figure-Example-of-context-vector-calculation-process}
 \caption{上下文向量计算过程实例}
 \label{fig:6-24}
 \end{figure}
@@ -882,7 +882,7 @@ $\textrm{a}(\cdot)$可以被看作是目标语表示和源语言表示的一种'
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Query model corresponding to traditional query model vs attention mechanism}
+\input{./Chapter6/Figures/figure-Query-model-corresponding-to-traditional-query-model-vs-attention-mechanism}
 \caption{传统查询模型(a) vs 注意力机制(b)所对应的查询模型}
 \label{fig:6-25}
 \end{figure}
@@ -902,7 +902,7 @@ $\textrm{a}(\cdot)$可以被看作是目标语表示和源语言表示的一种'
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Query model corresponding to attention mechanism}
+\input{./Chapter6/Figures/figure-Query-model-corresponding-to-attention-mechanism}
 \caption{注意力机制所对应的查询模型}
 \label{fig:6-26}
 \end{figure}
@@ -1009,7 +1009,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Relationship between learning rate and number of updates}
+\input{./Chapter6/Figures/figure-Relationship-between-learning-rate-and-number-of-updates}
 \caption{学习率与更新次数的变化关系}
 \label{fig:6-28}
 \end{figure}
@@ -1052,7 +1052,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Data parallel process}
+\input{./Chapter6/Figures/figure-Data-parallel-process}
 \caption{数据并行过程}
 \label{fig:6-29}
 \end{figure}
@@ -1111,7 +1111,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Decoding process based on greedy method}
+\input{./Chapter6/Figures/figure-Decoding-process-based-on-greedy-method}
 \caption{基于贪婪方法的解码过程}
 \label{fig:6-31}
 \end{figure}
@@ -1123,7 +1123,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Decode the word probability distribution at the first position}
+\input{./Chapter6/Figures/figure-Decode-the-word-probability-distribution-at-the-first-position}
 \caption{解码第一个位置输出的单词概率分布(''Have''的概率最高)}
 \label{fig:6-32}
 \end{figure}
@@ -1148,7 +1148,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Beam search process}
+\input{./Chapter6/Figures/figure-Beam-search-process}
 \caption{束搜索过程}
 \label{fig:6-33}
 \end{figure}
@@ -1201,7 +1201,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-structure of GNMT}
+\input{./Chapter6/Figures/figure-structure-of-GNMT}
 \caption{GNMT结构}
 \label{fig:6-59}
 \end{figure}
@@ -1283,7 +1283,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Dependencies between words in a recurrent neural network}
+\input{./Chapter6/Figures/figure-Dependencies-between-words-in-a-recurrent-neural-network}
 \caption{循环神经网络中单词之间的依赖关系}
 \label{fig:6-34}
 \end{figure}
@@ -1295,7 +1295,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Dependencies between words in a recurrent neural network}
+\input{./Chapter6/Figures/figure-Dependencies-between-words-in-a-recurrent-neural-network}
 \caption{注意力机制中单词之间的依赖关系}
 \label{fig:6-35}
 \end{figure}
@@ -1307,7 +1307,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Example of self-attention mechanism calculation}
+\input{./Chapter6/Figures/figure-Example-of-self-attention-mechanism-calculation}
 \caption{自注意力计算实例}
 \label{fig:6-36}
 \end{figure}
@@ -1350,7 +1350,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.30
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-self-att vs enco-deco att}
+\input{./Chapter6/Figures/figure-self-att-vs-enco-deco-att}
 \caption{注意力模型的输入(自注意力子层 vs 编码-解码注意力子层)}
 \label{fig:6-37}
 \end{figure}
@@ -1381,7 +1381,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Calculation of context vector C}
+\input{./Chapter6/Figures/figure-Calculation-of-context-vector-C}
 \caption{上下文向量C的计算}
 \label{fig:6-39}
 \end{figure}
@@ -1393,7 +1393,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Transformer input and position encoding}
+\input{./Chapter6/Figures/figure-Transformer-input-and-position-encoding}
 \caption{Transformer输入与位置编码}
 \label{fig:6-40}
 \end{figure}
@@ -1416,7 +1416,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-A combination of position encoding and word encoding}
+\input{./Chapter6/Figures/figure-A-combination-of-position-encoding-and-word-encoding}
 \caption{位置编码与词编码的组合}
 \label{fig:6-41}
 \end{figure}
@@ -1450,7 +1450,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Position of self-attention mechanism in the model}
+\input{./Chapter6/Figures/figure-Position-of-self-attention-mechanism-in-the-model}
 \caption{自注意力机制在模型中的位置}
 \label{fig:6-42}
 \end{figure}
@@ -1477,7 +1477,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Point product attention model}
+\input{./Chapter6/Figures/figure-Point-product-attention-model}
 \caption{点乘注意力力模型}
 \label{fig:6-43}
 \end{figure}
@@ -1490,7 +1490,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 \begin{figure}[htp]
 \centering
 %\includegraphics[scale=0.4]{./Chapter6/Figures/process of 5.png}
-\input{./Chapter6/Figures/process of 5}
+\input{./Chapter6/Figures/figure-process-of-5}
 \caption{公式(5)的执行过程示例}
 \label{fig:6-44}
 \end{figure}
@@ -1509,7 +1509,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Mask instance for future positions in Transformer}
+\input{./Chapter6/Figures/figure-Mask-instance-for-future-positions-in-Transformer}
 \caption{Transformer中对于未来位置进行的屏蔽的Mask实例}
 \label{fig:6-45}
 \end{figure}
@@ -1533,7 +1533,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Multi-Head Attention Model}
+\input{./Chapter6/Figures/figure-Multi-Head-Attention-Model}
 \caption{多头注意力模型}
 \label{fig:6-46}
 \end{figure}
@@ -1562,7 +1562,7 @@ L(\mathbf{Y},\hat{\mathbf{Y}}) = \sum_{j=1}^n L_{ce}(\mathbf{y}_j,\hat{\mathbf{y
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Residual network structure}
+\input{./Chapter6/Figures/figure-Residual-network-structure}
 \caption{残差网络结构}
 \label{fig:6-47}
 \end{figure}
@@ -1581,7 +1581,7 @@ x_{l+1} = x_l + \digamma (x_l)
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Position of difference and layer regularization in the model}
+\input{./Chapter6/Figures/figure-Position-of-difference-and-layer-regularization-in-the-model}
 \caption{残差和层正则化在模型中的位置}
 \label{fig:6-48}
 \end{figure}
@@ -1602,7 +1602,7 @@ x_{l+1} = x_l + \digamma (x_l)
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Different regularization methods}
+\input{./Chapter6/Figures/figure-Different-regularization-methods}
 \caption{不同正则化方式}
 \label{fig:6-49}
 \end{figure}
@@ -1615,7 +1615,7 @@ x_{l+1} = x_l + \digamma (x_l)
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Position of feedforward neural network in the model}
+\input{./Chapter6/Figures/figure-Position-of-feedforward-neural-network-in-the-model}
 \caption{前馈神经网络在模型中的位置}
 \label{fig:6-50}
 \end{figure}
@@ -1637,7 +1637,7 @@ x_{l+1} = x_l + \digamma (x_l)
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Structure of the network during Transformer training}
+\input{./Chapter6/Figures/figure-Structure-of-the-network-during-Transformer-training}
 \caption{Transformer训练时网络的结构}
 \label{fig:6-51}
 \end{figure}
@@ -1662,7 +1662,7 @@ lrate = d_{model}^{-0.5} \cdot \textrm{min} (step^{-0.5} , step \cdot warmup\_st
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-lrate of transformer}
+\input{./Chapter6/Figures/figure-lrate-of-transformer}
 \caption{Transformer模型的学习率调整曲线}
 \label{fig:6-52}
 \end{figure}
@@ -1677,7 +1677,7 @@ lrate = d_{model}^{-0.5} \cdot \textrm{min} (step^{-0.5} , step \cdot warmup\_st
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Comparison of the number of padding in batch}
+\input{./Chapter6/Figures/figure-Comparison-of-the-number-of-padding-in-batch}
 \caption{batch中padding数量对比(白色部分为padding)}
 \label{fig:6-53}
 \end{figure}
@@ -1729,7 +1729,7 @@ Transformer Deep(48层) & 30.2 & 43.1 & 194$\times 10^{6}$
 % 图3.10
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-decode of transformer}
+\input{./Chapter6/Figures/figure-decode-of-transformer}
 \caption{Transformer推断过程示例}
 \label{fig:6-54}
 \end{figure}
@@ -1752,7 +1752,7 @@ Transformer Deep(48层) & 30.2 & 43.1 & 194$\times 10^{6}$
 % 图3.6.1
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Generate summary}
+\input{./Chapter6/Figures/figure-Generate-summary}
 \label{fig:6-64}
 \end{figure}
 %---------------------------
@@ -1763,7 +1763,7 @@ Transformer Deep(48层) & 30.2 & 43.1 & 194$\times 10^{6}$
 % 图3.6.1
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Example of automatic translation of classical Chinese}
+\input{./Chapter6/Figures/figure-Example-of-automatic-translation-of-classical-Chinese}
 \caption{文言文自动翻译实例}
 \label{fig:6-56}
 \end{figure}
@@ -1779,7 +1779,7 @@ Transformer Deep(48层) & 30.2 & 43.1 & 194$\times 10^{6}$
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Automatically generate instances of couplets}
+\input{./Chapter6/Figures/figure-Automatically-generate-instances-of-couplets}
 \caption{对联自动生成实例(人工给定上联)}
 \label{fig:6-57}
 \end{figure}
@@ -1795,7 +1795,7 @@ Transformer Deep(48层) & 30.2 & 43.1 & 194$\times 10^{6}$
 \begin{figure}[htp]
 \centering
-\input{./Chapter6/Figures/figure-Automatic generation of ancient poems based on encoder-decoder framework}
+\input{./Chapter6/Figures/figure-Automatic-generation-of-ancient-poems-based-on-encoder-decoder-framework}
 \caption{基于编码器-解码器框架的古诗自动生成}
 \label{fig:6-58}
 \end{figure}

Renamed files (file moved, content unchanged):

Book/Chapter6/Figures/figure-3 base problom of P.tex → Book/Chapter6/Figures/figure-3-base-problom-of-P.tex
Book/Chapter6/Figures/figure-A combination of position encoding and word encoding.tex → Book/Chapter6/Figures/figure-A-combination-of-position-encoding-and-word-encoding.tex
Book/Chapter6/Figures/figure-A working example of neural machine translation.tex → Book/Chapter6/Figures/figure-A-working-example-of-neural-machine-translation.tex
Book/Chapter6/Figures/figure-Attention of source and target words.tex → Book/Chapter6/Figures/figure-Attention-of-source-and-target-words.tex
Book/Chapter6/Figures/figure-Automatic generation of ancient poems based on encoder-decoder framework.tex → Book/Chapter6/Figures/figure-Automatic-generation-of-ancient-poems-based-on-encoder-decoder-framework.tex
Book/Chapter6/Figures/figure-Automatically generate instances of couplets.tex → Book/Chapter6/Figures/figure-Automatically-generate-instances-of-couplets.tex
Book/Chapter6/Figures/figure-Beam search process.tex → Book/Chapter6/Figures/figure-Beam-search-process.tex
Book/Chapter6/Figures/figure-Calculation of context vector C.tex → Book/Chapter6/Figures/figure-Calculation-of-context-vector-C.tex
Book/Chapter6/Figures/figure-Calculation process of context vector C.tex → Book/Chapter6/Figures/figure-Calculation-process-of-context-vector-C.tex
Book/Chapter6/Figures/figure-Comparison of the number of padding in batch.tex → Book/Chapter6/Figures/figure-Comparison-of-the-number-of-padding-in-batch.tex
Book/Chapter6/Figures/figure-Data parallel process.tex → Book/Chapter6/Figures/figure-Data-parallel-process.tex
Book/Chapter6/Figures/figure-Decode the word probability distribution at the first position.tex → Book/Chapter6/Figures/figure-Decode-the-word-probability-distribution-at-the-first-position.tex
Book/Chapter6/Figures/figure-Decoding process based on greedy method.tex → Book/Chapter6/Figures/figure-Decoding-process-based-on-greedy-method.tex
Book/Chapter6/Figures/figure-Dependencies between words in a recurrent neural network.tex → Book/Chapter6/Figures/figure-Dependencies-between-words-in-a-recurrent-neural-network.tex
Book/Chapter6/Figures/figure-Dependencies between words of Attention.tex → Book/Chapter6/Figures/figure-Dependencies-between-words-of-Attention.tex
Book/Chapter6/Figures/figure-Different regularization methods.tex → Book/Chapter6/Figures/figure-Different-regularization-methods.tex
Book/Chapter6/Figures/figure-Double layer RNN.tex → Book/Chapter6/Figures/figure-Double-layer-RNN.tex
Book/Chapter6/Figures/figure-Example of automatic translation of classical Chinese.tex → Book/Chapter6/Figures/figure-Example-of-automatic-translation-of-classical-Chinese.tex
Book/Chapter6/Figures/figure-Example of context vector calculation process.tex → Book/Chapter6/Figures/figure-Example-of-context-vector-calculation-process.tex
Book/Chapter6/Figures/figure-Example of self-attention mechanism calculation.tex → Book/Chapter6/Figures/figure-Example-of-self-attention-mechanism-calculation.tex
Book/Chapter6/Figures/figure-Generate summary.tex → Book/Chapter6/Figures/figure-Generate-summary.tex
Book/Chapter6/Figures/figure-Mask instance for future positions in Transformer.tex → Book/Chapter6/Figures/figure-Mask-instance-for-future-positions-in-Transformer.tex
Book/Chapter6/Figures/figure-Matrix Representation of Attention Weights Between Chinese-English Sentence Pairs.tex → Book/Chapter6/Figures/figure-Matrix-Representation-of-Attention-Weights-Between-Chinese-English-Sentence-Pairs.tex
Book/Chapter6/Figures/figure-Model structure based on recurrent neural network translation.tex → Book/Chapter6/Figures/figure-Model-structure-based-on-recurrent-neural-network-translation.tex
Book/Chapter6/Figures/figure-Multi-Head Attention Model.tex → Book/Chapter6/Figures/figure-Multi-Head-Attention-Model.tex
Book/Chapter6/Figures/figure-Output layer structur.tex → Book/Chapter6/Figures/figure-Output-layer-structur.tex
Book/Chapter6/Figures/figure-Point product attention model.tex → Book/Chapter6/Figures/figure-Point-product-attention-model.tex
Book/Chapter6/Figures/figure-Position of difference and layer regularization in the model.tex → Book/Chapter6/Figures/figure-Position-of-difference-and-layer-regularization-in-the-model.tex
Book/Chapter6/Figures/figure-Position of feedforward neural network in the model.tex → Book/Chapter6/Figures/figure-Position-of-feedforward-neural-network-in-the-model.tex
Book/Chapter6/Figures/figure-Position of self-attention mechanism in the model.tex → Book/Chapter6/Figures/figure-Position-of-self-attention-mechanism-in-the-model.tex
Book/Chapter6/Figures/figure-Presentation space.tex → Book/Chapter6/Figures/figure-Presentation-space.tex
Book/Chapter6/Figures/figure-Query model corresponding to attention mechanism.tex → Book/Chapter6/Figures/figure-Query-model-corresponding-to-attention-mechanism.tex
Book/Chapter6/Figures/figure-Query model corresponding to traditional query model vs attention mechanism.tex → Book/Chapter6/Figures/figure-Query-model-corresponding-to-traditional-query-model-vs-attention-mechanism.tex
Book/Chapter6/Figures/figure-Query model corresponding to traditional query model vs attention mechanism02.tex → Book/Chapter6/Figures/figure-Query-model-corresponding-to-traditional-query-model-vs-attention-mechanism02.tex
Book/Chapter6/Figures/figure-Relationship between learning rate and number of updates.tex → Book/Chapter6/Figures/figure-Relationship-between-learning-rate-and-number-of-updates.tex
Book/Chapter6/Figures/figure-Residual network structure.tex → Book/Chapter6/Figures/figure-Residual-network-structure.tex
Book/Chapter6/Figures/figure-Structure of a recurrent network model.tex → Book/Chapter6/Figures/figure-Structure-of-a-recurrent-network-model.tex
Book/Chapter6/Figures/figure-Structure of the network during Transformer training.tex → Book/Chapter6/Figures/figure-Structure-of-the-network-during-Transformer-training.tex
Book/Chapter6/Figures/figure-Transformer input and position encoding.tex → Book/Chapter6/Figures/figure-Transformer-input-and-position-encoding.tex
Book/Chapter6/Figures/figure-Word embedding structure.tex → Book/Chapter6/Figures/figure-Word-embedding-structure.tex
Book/Chapter6/Figures/figure-a simple example for tl.tex → Book/Chapter6/Figures/figure-a-simple-example-for-tl.tex
Book/Chapter6/Figures/figure-decode of transformer.tex → Book/Chapter6/Figures/figure-decode-of-transformer.tex
Book/Chapter6/Figures/figure- encoder-decoder-process.tex → Book/Chapter6/Figures/figure-encoder-decoder-process.tex
Book/Chapter6/Figures/figure-encoder-decoder with Attention.tex → Book/Chapter6/Figures/figure-encoder-decoder-with-Attention.tex
Book/Chapter6/Figures/figure-example of mt.tex → Book/Chapter6/Figures/figure-example-of-mt.tex
Book/Chapter6/Figures/figure-lrate of transformer.tex → Book/Chapter6/Figures/figure-lrate-of-transformer.tex
Book/Chapter6/Figures/figure-numbers of WMT systems.tex → Book/Chapter6/Figures/figure-numbers-of-WMT-systems.tex
Book/Chapter6/Figures/process of 5.tex → Book/Chapter6/Figures/figure-process-of-5.tex
Book/Chapter6/Figures/figure-process test (2).tex → Book/Chapter6/Figures/figure-process-test-(2).tex
Book/Chapter6/Figures/figure-self-att vs enco-deco att.tex → Book/Chapter6/Figures/figure-self-att-vs-enco-deco-att.tex
Book/Chapter6/Figures/figure-structure of GNMT.tex → Book/Chapter6/Figures/figure-structure-of-GNMT.tex
Book/Chapter6/Figures/figure-the whole of LSTM.tex → Book/Chapter6/Figures/figure-the-whole-of-LSTM.tex