合并分支 'shanweiqiao' 到 'caorunzhe'

13章

查看合并请求 !722

合并分支 'shanweiqiao' 到 'caorunzhe'
13章
查看合并请求 !722
afc9b12a · 单韦乔 · 8c14e5a8 · 87981c60 · afc9b12a · afc9b12a
Commit afc9b12a authored Dec 28, 2020 by 单韦乔
--- a/Chapter13/Figures/figure-bpe.tex
+++ b/Chapter13/Figures/figure-bpe.tex
@@ -2,11 +2,11 @@
 	\tikzstyle{node} =[font=\scriptsize]
 	\tikzstyle{sentence} =[font=\scriptsize,fill=blue!5!white]
-	\node[sentence] (node1) at (0,0) {[`low', `lower', `newest', `widest']};
+	\node[sentence] (node1) at (0,0) {['low', 'lower', 'newest', 'widest']};
-	\node[sentence,anchor = north] (node2) at ([yshift = -1em]node1.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w e s t $<$e$>$':6, `w i d e s t $<$e$>$':3]};	
+	\node[sentence,anchor = north] (node2) at ([yshift = -1em]node1.south) {['l o w $<$e$>$':5, 'l o w e r $<$e$>$':2, 'n e w e s t $<$e$>$':6, 'w i d e s t $<$e$>$':3]};	
-	\node[sentence,anchor = north] (node3) at ([yshift = -1.5em]node2.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red es} t $<$e$>$':6, `w i d {\red es} t $<$e$>$':3]};
+	\node[sentence,anchor = north] (node3) at ([yshift = -1.5em]node2.south) {['l o w $<$e$>$':5, 'l o w e r $<$e$>$':2, 'n e w {\red es} t $<$e$>$':6, 'w i d {\red es} t $<$e$>$':3]};
-	\node[sentence,anchor = north] (node4) at ([yshift = -1em]node3.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red est} $<$e$>$':6, `w i d {\red est} $<$e$>$':3]};
+	\node[sentence,anchor = north] (node4) at ([yshift = -1em]node3.south) {['l o w $<$e$>$':5, 'l o w e r $<$e$>$':2, 'n e w {\red est} $<$e$>$':6, 'w i d {\red est} $<$e$>$':3]};
-	\node[sentence,anchor = north] (node5) at ([yshift = -1em]node4.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red est$<$e$>$}':6, `w i d {\red est$<$e$>$}':3]};
+	\node[sentence,anchor = north] (node5) at ([yshift = -1em]node4.south) {['l o w $<$e$>$':5, 'l o w e r $<$e$>$':2, 'n e w {\red est$<$e$>$}':6, 'w i d {\red est$<$e$>$}':3]};
 	\node[sentence,anchor = north] (node6) at ([yshift = -1em]node5.south) {$\cdots$};
 	\node[node,anchor = north] (node7) at ([yshift = -1.6em]node6.south) {直到达到预设的子词词表大小或下一个最高频的字节对出现频率为1。};

--- a/Chapter13/Figures/figure-computation-of-dropout.tex
+++ b/Chapter13/Figures/figure-computation-of-dropout.tex
@@ -14,7 +14,7 @@
 \node [neuronnode] (neuron_z) at (1.2 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$z_{i}^{l+1}$}};
 \node [neuronnode] (neuron_y') at (2.4 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$x_{i}^{l+1}$}};
-\node [anchor=north,ublue] (standard) at ([yshift=-4em]neuron_z.south) {\scriptsize{standard}};
+\node [anchor=north,ublue] (standard) at ([yshift=-4em]neuron_z.south) {\scriptsize{标准网络}};
 \node [ublue] (standard) at ([xshift=-1em]neuron_z.west) {\scriptsize{$\mathbf{w}_{i}^{l}$}};
 \node [ublue] (standard) at ([xshift=0.6em,yshift=0.3em]neuron_z.east) {\scriptsize{$f$}};
@@ -40,7 +40,7 @@
 \node [neuronnode] (drop_neuron_r2) at (4.4*\nodespace,-1.5*\neuronsep) {\scriptsize{$r_{2}^{l}$}};
 \node [neuronnode] (drop_neuron_r1) at (4.4*\nodespace,-2.5*\neuronsep) {\scriptsize{$r_{1}^{l}$}};
-\node [anchor=north,ublue] (standard) at ([yshift=-4em]drop_neuron_z.south) {\scriptsize{dropout}};
+\node [anchor=north,ublue] (standard) at ([xshift=2em,yshift=-4em]drop_neuron_z.south) {\scriptsize{应用Dropout后的网络}};
 \node [ublue] (standard) at ([xshift=-1em]drop_neuron_z.west) {\scriptsize{$\mathbf{w}_{i}^{l}$}};
 \node [ublue] (standard) at ([xshift=0.6em,yshift=0.3em]drop_neuron_z.east) {\scriptsize{$f$}};
 %structure
@@ -59,10 +59,10 @@
 \draw [-,line width=0.3mm] (drop_neuron_r1.south) -- ([yshift=-1em]drop_neuron_r1.south);
 %equ
-\node [anchor=west,inner sep = 2pt] (line1) at (9*\nodespace,0) {未应用dropout：};
+\node [anchor=west,inner sep = 2pt] (line1) at (9*\nodespace,0) {未应用Dropout：};
 \node [anchor=north west,inner sep = 2pt] (line2) at (line1.south west) {$z_{i}^{l+1}=\mathbf{w}_{i}^{l} \mathbf{x}+b_{i}^{l}$};
 \node [anchor=north west,inner sep = 2pt] (line3) at (line2.south west) {$x_{i}^{l+1}=f\left(x_{i}^{l}\right)$};
-\node [anchor=north west,inner sep = 2pt] (line4) at (line3.south west) {应用dropout：};
+\node [anchor=north west,inner sep = 2pt] (line4) at (line3.south west) {应用Dropout：};
 \node [anchor=north west,inner sep = 2pt] (line5) at (line4.south west) {$r_{j}^{l} \sim$ Bernoulli $(1-p)$};
 \node [anchor=north west,inner sep = 2pt] (line6) at (line5.south west) {$\tilde{\mathbf{x}}=\mathbf{r} * \mathbf{x}$};
 \node [anchor=north west,inner sep = 2pt] (line7) at (line6.south west) {$z_{i}^{l+1}=\mathbf{w}_{i}^{l} \widetilde{\mathbf{x}}+b_{i}^{l}$};

--- a/Chapter13/Figures/figure-ensemble-knowledge-distillation.tex
+++ b/Chapter13/Figures/figure-ensemble-knowledge-distillation.tex
@@ -6,7 +6,7 @@
    \setlength{\YShift}{0.8\base}
    \setlength{\XShift}{0.8\base}
-    \tikzstyle{modelnode} = [rectangle,draw,rounded corners=2pt,inner sep=0pt,minimum height=4.2em,minimum width=2em,font=\small,anchor=north]
+    \tikzstyle{modelnode} = [rectangle,draw,rounded corners=2pt,inner sep=0pt,minimum height=4.5em,minimum width=2em,font=\small,anchor=north]
    \coordinate (stu01) at (0,0);
    \coordinate (stu02) at ([xshift=3em]stu01);
@@ -20,27 +20,27 @@
    \foreach \curr / \prev in {1/0,2/1,3/2}
    {
      % models
-      \node[modelnode,fill=yellow!20] (stu\curr1) at ([yshift=-2em]stu\prev1.south) {\rotatebox{90}{Student $1$}};
+      \node[modelnode,fill=yellow!20] (stu\curr1) at ([yshift=-2em]stu\prev1.south) {\rotatebox{90}{学生模型 $1$}};
-      \node[modelnode,fill=yellow!20] (stu\curr2) at ([yshift=-2em]stu\prev2.south) {\rotatebox{90}{Student $2$}};
+      \node[modelnode,fill=yellow!20] (stu\curr2) at ([yshift=-2em]stu\prev2.south) {\rotatebox{90}{学生模型 $2$}};
-      \node[modelnode,fill=yellow!20] (stu\curr3) at ([yshift=-2em]stu\prev3.south) {\rotatebox{90}{Student $3$}};
+      \node[modelnode,fill=yellow!20] (stu\curr3) at ([yshift=-2em]stu\prev3.south) {\rotatebox{90}{学生模型 $3$}};
-      \node[modelnode,fill=yellow!20] (stu\curr4) at ([yshift=-2em]stu\prev4.south) {\rotatebox{90}{Student $4$}};
+      \node[modelnode,fill=yellow!20] (stu\curr4) at ([yshift=-2em]stu\prev4.south) {\rotatebox{90}{学生模型 $4$}};
-      \node[modelnode,fill=yellow!20] (stu\curr5) at ([yshift=-2em]stu\prev5.south) {\rotatebox{90}{Student $5$}};
+      \node[modelnode,fill=yellow!20] (stu\curr5) at ([yshift=-2em]stu\prev5.south) {\rotatebox{90}{学生模型 $5$}};
-      \node[modelnode] (tea\curr1) at ([yshift=-2em]tea\prev1.south) {\rotatebox{90}{\color{red!60} Teacher $1$}};
+      \node[modelnode] (tea\curr1) at ([yshift=-2em]tea\prev1.south) {\rotatebox{90}{\color{red!60} 教师模型 $1$}};
-      \node[modelnode] (tea\curr2) at ([yshift=-2em]tea\prev2.south) {\rotatebox{90}{\color{blue!60} Teacher $2$}};
+      \node[modelnode] (tea\curr2) at ([yshift=-2em]tea\prev2.south) {\rotatebox{90}{\color{blue!60} 教师模型 $2$}};
-      % ensemble labels
+      % ensemble labels
-      \draw[-latex'] ([xshift=2pt]stu\curr5.east) to node [auto] {\small Ensemble} ([xshift=-2pt]tea\curr1.west);
+      \draw[-latex'] ([xshift=2pt]stu\curr5.east) to node [auto] {\small 集成} ([xshift=-2pt]tea\curr1.west);
    }
    % iteration labels
-    \node[font=\small,anchor=east,purple!80] (iterate1) at ([xshift=-1em]stu21.west) {\rotatebox{90}{Iteration $1$}};
+    \node[font=\small,anchor=east,purple!80] (iterate1) at ([xshift=-1em]stu21.west) {\rotatebox{90}{轮数 $1$}};
-    \node[font=\small,anchor=east,purple!80] (iterate2) at ([xshift=-1em]stu31.west) {\rotatebox{90}{Iteration $2$}};
+    \node[font=\small,anchor=east,purple!80] (iterate2) at ([xshift=-1em]stu31.west) {\rotatebox{90}{轮数 $2$}};
    % distillation labels
-    \node[font=\small,anchor=south west] (distill1) at ([yshift=0.2em]iterate1.north west) {Distillation};
+    \node[font=\small,anchor=south west] (distill1) at ([yshift=0.8em]iterate1.north west) {知识蒸馏};
-    \node[font=\small,anchor=south west] (distill2) at ([yshift=0.2em]iterate2.north west) {Distillation};
+    \node[font=\small,anchor=south west] (distill2) at ([yshift=0.8em]iterate2.north west) {知识蒸馏};
-    % student groups
+    % student groups
    \begin{pgfonlayer}{background}
      \node[rectangle,draw,very thick,red!60,densely dotted,inner sep=2pt,rounded corners=2pt,fill=red!20] [fit = (stu21) (stu22) (stu23) ] (group21) {};
      \node[rectangle,draw,very thick,blue!60,densely dotted,inner sep=2pt,rounded corners=2pt,fill=blue!20] [fit = (stu24) (stu25) ] (group22) {};

--- a/Chapter13/Figures/figure-label-smoothing.tex
+++ b/Chapter13/Figures/figure-label-smoothing.tex
 \begin{tikzpicture}
-	\node[font=\scriptsize] (model) at (0,0) {Model Output:};
+	\node[font=\scriptsize,align=left] (model) at (0,0) {模型输出:\\（未使用\\标签平滑）};
-	\node[anchor=north west,font=\scriptsize] (label_smooth) at ([yshift=-1.8em]model.south west) {Label Smoothing:};
+	\node[anchor=north west,font=\scriptsize,align=left] (label_smooth) at ([yshift=-0.3em]model.south west) {模型输出:\\（使用标\\签平滑）};
-	\node[anchor=south west,font=\scriptsize] (one-hot) at ([yshift=2em]model.north west) {One-hot:};
+	\node[anchor=south west,font=\scriptsize] (one-hot) at ([yshift=1em]model.north west) {One-hot分布:};
 	%model out
-	\node [anchor=west,minimum width=1.2em,minimum height=0.2em,fill=ublue!80,inner sep=0pt] (model_label1) at ([xshift=1.5em,yshift=-0.5em]model.east) {};
+	\node [anchor=west,minimum width=1.2em,minimum height=0.2em,fill=ublue!80,inner sep=0pt] (model_label1) at ([xshift=1.5em,yshift=-0.8em]model.east) {};
    \node [anchor=south,font=\scriptsize] (model_w1) at (model_label1.north) {$p_{1}$};
    \node [anchor=south west,minimum width=1.2em,minimum height=0.1em,fill=ublue!80,inner sep=0pt] (model_label2) at (model_label1.south east) {};
    \node [anchor=south,font=\scriptsize] (model_w2) at (model_label2.north) {$p_{2}$};
@@ -21,7 +21,7 @@
    \node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=ublue!80,inner sep=0pt] (model_label7) at (model_label6.south east) {};
    \node [anchor=south,font=\scriptsize] (model_w8) at (model_label7.north) {$p_{7}$};
    %no label smooth
-    \node [anchor=west,minimum width=1.2em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label1) at ([xshift=1.5em,yshift=3em]model.east) {};
+    \node [anchor=west,minimum width=1.2em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label1) at ([xshift=1.5em,yshift=2.5em]model.east) {};
    \node [anchor=south,font=\scriptsize] (one_hot_w1) at (one_hot_label1.north) {$0$};
    \node [anchor=south west,minimum width=1.2em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label2) at (one_hot_label1.south east) {};
    \node [anchor=south,font=\scriptsize] (one_hot_w2) at (one_hot_label2.north) {$0$};
@@ -38,7 +38,7 @@
    \node [anchor=south,font=\scriptsize] (one_hot_w7) at (one_hot_label7.north) {$0$};
    %label smoothing
-	\node [anchor=west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label1) at ([xshift=1.5em,yshift=-3.2em]model.east) {};
+	\node [anchor=west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label1) at ([xshift=1.5em,yshift=-4.4em]model.east) {};
    \node [anchor=south,font=\scriptsize] (w1) at (label1.north) {$0.1$};
    \node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label2) at (label1.south east) {};
    \node [anchor=south,font=\scriptsize] (w2) at (label2.north) {$0.1$};
@@ -55,18 +55,18 @@
    \node [anchor=south,font=\scriptsize] (w8) at (label7.north) {$0.1$};
-    \node[font=\scriptsize] (line1) at ([xshift=9em,yshift=-1.5em]model_label7.east) {$loss =-0.3 \log p_{3}-\sum_{i=1}^{7} 0.1 \log p_{i}$};
+    % \mathit typesets "Loss" as one word; a bare multi-letter name in math mode is spaced as a product of variables
+    \node[font=\scriptsize] (line1) at ([xshift=9em,yshift=-1.5em]model_label7.east) {$\mathit{Loss} =-0.3 \log p_{3}-\sum_{i=1}^{7} 0.1 \log p_{i}$};
-    \node[font=\scriptsize] (line2) at ([xshift=5.9em,yshift=3.5em]model_label7.east) {$loss =-\log p_{3}$};
+    \node[font=\scriptsize] (line2) at ([xshift=5.9em,yshift=3em]model_label7.east) {$\mathit{Loss} =-\log p_{3}$};
    \begin{pgfonlayer}{background}
-        \node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=red] [fit = (one_hot_label1) (one_hot_w3) (one_hot_label7) (model_label1) (model_label7)] (box1) {};
+        \node [rectangle,inner sep=0.5em,rounded corners=1pt,very thick,dotted,draw=red] [fit =(model_w3) (model_label1) (model_label7)] (box1) {};
        \node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!10,drop shadow,draw=purple] [fit = (line2)] (box3) {}; 
-        \draw [->,dotted,very thick,red] ([yshift=-1em]box1.east) .. controls +(east:1) and +(west:1) .. (box3.west);
+        \draw [->,dotted,very thick,red] ([yshift=-0.5em]box1.east) .. controls +(east:1) and +(west:1) .. (box3.west);
-        \node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (label1) (label7) (model_label1) (model_label7) (model_w3)] (box2) {};
+        \node [rectangle,inner sep=0.5em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit =(w3) (label1) (label7) ] (box2) {};
        \node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line1)] (box4) {};
-        \draw [->,dotted,very thick,ugreen] ([yshift=1em]box2.east) .. controls +(east:1) and +(west:1) .. (box4.west);
+        \draw [->,dotted,very thick,ugreen] ([yshift=-0.5em]box2.east) .. controls +(east:1) and +(west:1) .. (box4.west);
    \end{pgfonlayer}

--- a/Chapter13/chapter13.tex
+++ b/Chapter13/chapter13.tex
--- a/Chapter15/Figures/figure-encoder-structure-of-transformer-model-optimized-by-nas.tex
+++ b/Chapter15/Figures/figure-encoder-structure-of-transformer-model-optimized-by-nas.tex
@@ -43,8 +43,6 @@
 \foreach \x/\d in {1/2em, 2/8em, 3/14em}
 	\node[unit,fill=yellow!20] at (0,\d) (ln_\x) {层正则化};
-\node[unit,fill=green!20] at (0,24em) (sa_1) {8头自注意力：512};
 \foreach \x/\d in {1/6em, 2/12em, 3/22em}
 	\node[draw,circle,minimum size=1em,inner sep=1pt] at (0,\d) (add_\x) {\scriptsize\bfnew{+}};
@@ -76,7 +74,7 @@
 \node[font=\scriptsize,align=center] at (0em, -1.5em){(b) 使用结构搜索方法优化后的 \\ Transformer编码器中若干块的结构};
-\node[minimum size=0.8em,inner sep=0pt,rounded corners=1pt,draw,fill=blue!20] (act) at (5.5em, 24em){};
+\node[minimum size=0.8em,inner sep=0pt,rounded corners=1pt,draw,fill=blue!20] (act) at (5.5em, 20em){};
 \node[anchor=west,font=\footnotesize] at ([xshift=0.1em]act.east){激活函数};
 \node[anchor=north,minimum size=0.8em,inner sep=0pt,rounded corners=1pt,draw,fill=yellow!20] (nor) at ([yshift=-0.6em]act.south){};
 \node[anchor=west,font=\footnotesize] at ([xshift=0.1em]nor.east){层正则化};

--- a/bibliography.bib
+++ b/bibliography.bib
@@ -6829,6 +6829,108 @@ year={2012}
  publisher = {International Conference on Learning Representations},
  year      = {2017}
 }
+% Journal and arXiv (CoRR) items below use the article entry type with a journal field; conference papers keep the venue name in publisher.
+@article{DBLP:journals/nature/LeCunBH15,
+  author    = {Yann LeCun and
+               Yoshua Bengio and
+               Geoffrey E. Hinton},
+  title     = {Deep learning},
+  journal   = {Nature},
+  volume    = {521},
+  number    = {7553},
+  pages     = {436--444},
+  year      = {2015}
+}
+@article{DBLP:journals/corr/abs-1207-0580,
+  author    = {Geoffrey E. Hinton and
+               Nitish Srivastava and
+               Alex Krizhevsky and
+               Ilya Sutskever and
+               Ruslan Salakhutdinov},
+  title     = {Improving neural networks by preventing co-adaptation of feature detectors},
+  journal   = {CoRR},
+  volume    = {abs/1207.0580},
+  year      = {2012}
+}
+@article{DBLP:journals/tslp/ZhuM12,
+  author    = {Jingbo Zhu and
+               Matthew Y. Ma},
+  title     = {Uncertainty-based active learning with instability estimation for
+               text classification},
+  journal   = {ACM Transactions on Speech and Language Processing},
+  volume    = {8},
+  number    = {4},
+  pages     = {5:1--5:21},
+  year      = {2012}
+}
+@inproceedings{DBLP:conf/coling/ZhuWYT08,
+  author    = {Jingbo Zhu and
+               Huizhen Wang and
+               Tianshun Yao and
+               Benjamin K. Tsou},
+  title     = {Active Learning with Sampling by Uncertainty and Density for Word
+               Sense Disambiguation and Text Classification},
+  publisher = {International Conference on Computational Linguistics},
+  pages     = {1137--1144},
+  year      = {2008}
+}
+@inproceedings{DBLP:conf/medprai/SurendranathJ18,
+  author    = {Ajay Surendranath and
+               Dinesh Babu Jayagopi},
+  title     = {Curriculum Learning for Depth Estimation with Deep Convolutional Neural
+               Networks},
+  publisher = {Mediterranean Conference on Pattern Recognition and Artificial Intelligence},
+  pages     = {95--100},
+  year      = {2018}
+}
+@inproceedings{DBLP:conf/icml/BengioLCW09,
+  author    = {Yoshua Bengio and
+               J{\'{e}}r{\^{o}}me Louradour and
+               Ronan Collobert and
+               Jason Weston},
+  title     = {Curriculum learning},
+  series    = {{ACM} International Conference Proceeding Series},
+  volume    = {382},
+  pages     = {41--48},
+  publisher = {International Conference on Machine Learning},
+  year      = {2009}
+}
+@article{DBLP:journals/corr/abs-2002-11794,
+  author    = {Zhuohan Li and
+               Eric Wallace and
+               Sheng Shen and
+               Kevin Lin and
+               Kurt Keutzer and
+               Dan Klein and
+               Joseph E. Gonzalez},
+  title     = {Train Large, Then Compress: Rethinking Model Size for Efficient Training
+               and Inference of Transformers},
+  journal   = {CoRR},
+  volume    = {abs/2002.11794},
+  year      = {2020}
+}
+@inproceedings{kim-rush-2016-sequence,
+  author    = {Yoon Kim and
+               Alexander M. Rush},
+  title     = {Sequence-Level Knowledge Distillation},
+  pages     = {1317--1327},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
+  year      = {2016}
+}
+@inproceedings{Jiao2020TinyBERTDB,
+  author    = {Xiaoqi Jiao and
+               Yichun Yin and
+               Lifeng Shang and
+               Xin Jiang and
+               Xiao Chen and
+               Linlin Li and
+               Fang Wang and
+               Qun Liu},
+  title     = {TinyBERT: Distilling {BERT} for Natural Language Understanding},
+  pages     = {4163--4174},
+  publisher = {Conference on Empirical Methods in Natural Language Processing},
+  year      = {2020}
+}
 %%%%% chapter 13------------------------------------------------------
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%