% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode

\def\CTeXPreproc{Created by ctex v0.2.13, don't edit!}
\documentclass[cjk,t,compress,12pt]{beamer}
\usepackage{pstricks}
\usepackage{etex}
\usepackage{eso-pic,graphicx}
\usepackage{fancybox}
\usepackage{amsmath,amssymb}
\usepackage{setspace}
\usepackage{xcolor}
\usepackage{array,multirow}
\usepackage{CJK}
\usepackage{tikz}
\usepackage{tikz-qtree}
\usepackage{hyperref}
\usepackage{changepage}
\usepackage{pgfplots}
\usepackage{subfigure}
\usepackage{tikz-3dplot}
\usepackage{esvect}

\usepackage{tcolorbox}
\tcbuselibrary{skins}

\usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix}
\usetikzlibrary{arrows,decorations.pathreplacing}
\usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{shadows.blur}

\usepgflibrary{arrows} % LATEX and plain TEX and pure pgf
\usetikzlibrary{arrows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{decorations}
\usetikzlibrary{arrows,shapes}

\usetikzlibrary{positioning,fit,calc}

\usetikzlibrary{mindmap,backgrounds} % mind map

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\setbeamertemplate{items}[ball]
\usefonttheme[onlymath]{serif}  % serif font for math

\definecolor{ugreen}{rgb}{0,0.5,0}
\definecolor{lgreen}{rgb}{0.9,1,0.8}
\definecolor{xtgreen1}{rgb}{0.824,0.898,0.8}
\definecolor{xtgreen}{rgb}{0.914,0.945,0.902}
\definecolor{lightgray}{gray}{0.85}

\setbeamercolor{uppercol}{fg=white,bg=ugreen}
\setbeamercolor{lowercol}{fg=black,bg=xtgreen}

\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\setbeamercolor{uppercolblue}{fg=white,bg=ublue}
\setbeamercolor{lowercolblue}{fg=black,bg=blue!10}


%\usetheme{default}
%\usetheme{Darmstadt}
%\usetheme{Madrid}
%\usetheme{Frankfurt}
%\usetheme{Dresden}
%\usetheme{Boadilla}
%\usecolortheme{dolphin}

\newcounter{mycount1}
\newcounter{mycount2}
\newcounter{mycount3}
\newcounter{mycount4}

\usefonttheme[onlylarge]{structurebold}

\IfFileExists{C:/WINDOWS/win.ini}
{\newcommand{\mycfont}{you}}
{\newcommand{\mycfont}{gbsn}}

\begin{CJK}{UTF8}{\mycfont}
\end{CJK}

\setbeamerfont*{frametitle}{size=\large,series=\bfseries}
\setbeamertemplate{navigation symbols}{Chapter 5: Neural Networks and Language Models \hspace*{2em} Tong Xiao \& Jingbo Zhu \hspace*{2em} \today \hspace*{2em} \insertframenumber{}/\inserttotalframenumber}

\setbeamertemplate{itemize items}[circle] % if you want a circle
\setbeamertemplate{itemize subitem}[triangle] % if you want a triangle
\setbeamertemplate{itemize subsubitem}[ball] % if you want a ball

\begin{document}

\begin{CJK}{UTF8}{\mycfont}

\title{\Large{Neural Networks and Language Models}}
\author{\large{\textbf{Tong Xiao \ \ Jingbo Zhu}}}
\institute{
\blue{\url{xiaotong@mail.neu.edu.cn}} \black{} \\
\blue{\url{zhujingbo@mail.neu.edu.cn}} \black{} \\
\vspace{1.0em}
Natural Language Processing Lab, Northeastern University \\
\blue{\underline{\url{http://www.nlplab.com}}} \black{} \\
\vspace{0.2cm}
\hspace{0.1cm} \includegraphics[scale=0.1]{../Figures/logo.pdf}
}
\date{}

\maketitle

\setlength{\leftmargini}{1em}
\setlength{\leftmarginii}{1em}

%%%------------------------------------------------------------------------------------------------------------
\section{Why Neural Networks}

%%%------------------------------------------------------------------------------------------------------------
\subsection{History}

%%%------------------------------------------------------------------------------------------------------------
%%% Why neural networks
\begin{frame}{Why Neural Networks}
\begin{itemize}
\item In recent years, \textbf{deep learning} has shown enormous potential
    \begin{itemize}
    \item It has swept many NLP tasks, including machine translation
    \item It has become a new paradigm for NLP methods
    \item It has spawned a new generation of approaches such as \textbf{neural machine translation} (the topic of the next chapter)
    \end{itemize}
\vspace{0.2em}
\begin{center}
\includegraphics[scale=0.45]{./Figures/deeplearning.jpg}
\end{center}
\vspace{0.5em}

\item<2-> \textbf{Artificial neural networks} are the practical foundation of deep learning

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% A brief history
\begin{frame}{The Concept of Neural Networks and Deep Learning (1940s-1970s)}
\begin{itemize}
\item \textbf{Neural networks} first appeared in cybernetics and were later discussed more often in connectionism
    \begin{itemize}
    \item \textbf{The original idea}: model the brain's biological learning mechanism computationally
    \item<2-> For example, use a linear weighted function to describe the relation between the input $\textbf{x}$ and the result $\textbf{y}$
    \vspace{-0.5em}
    \begin{displaymath}
    f(\textbf{x},\textbf{w})=x_1 \cdot w_1 + ... + x_n \cdot w_n
    \end{displaymath}\\
    \vspace{-0.5em}
    where $\textbf{w}$ are the weights. Models of this kind also influenced the development of modern machine learning methods such as stochastic gradient descent.
    \item<3-> The limitation of such methods is also obvious: they cannot describe non-linear problems, such as the famous XOR learning problem
    \end{itemize}

\end{itemize}

\vspace{-0.5em}
\begin{center}
\includegraphics[scale=0.21]{./Figures/concept-history.jpg}\\
\scriptsize{Figure from the book ``Deep Learning''}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% The development of deep learning
\begin{frame}{The Development of Neural Networks and Deep Learning (1980s-1990s)}
\begin{itemize}
\item Today, biological plausibility is no longer the only source of inspiration for neural networks, and deep learning has entered a new stage of development. Two schools of thought have been hugely influential:
    \begin{itemize}
    \item<2-> \textbf{Connectionism}. In cognitive science, early symbolicism struggled to explain how the brain could reason with neurons. The core idea of connectionism is that ``intelligent behavior arises from a large number of simple computational units connected together''.\\
        This drove the adoption of methods for training multi-layer networks, such as back-propagation, and led to classic models including the long short-term memory network.
    \item<3-> \textbf{Distributed representation}: every input to any part of a complex system should be represented by many features jointly. For example, a word is not an atomic token; it is described by hundreds or thousands of features, each capturing ``some'' aspect of the word.
    \end{itemize}
\item<4-> \alert{Unfortunately}, in the late 1990s expectations for neural networks ran too high in many applications, and the results fell short. In particular, kernel methods, graphical models and other machine learning approaches worked very well, and neural network research entered another trough.

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% The third wave of deep learning
\begin{frame}{The Third Wave (2000s-now)}
\begin{itemize}
\item \textbf{The deep learning boom} began in 2006, when Hinton et al. successfully trained a deep belief network. The wave then gradually swept machine learning and AI applications, and it continues today. The success of modern deep learning has three causes:
    \begin{enumerate}
    \item The refinement and improvement of \textbf{models and algorithms}
    \item Advances in \textbf{parallel computing} that made large-scale practice possible
    \item The \textbf{persistence and sustained commitment} of researchers such as Hinton
    \end{enumerate}
\item<2-> \textbf{From the application side}, rapidly growing data volumes and increased model capacity also set the stage for deep learning's success
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\scriptsize{
\begin{semilogyaxis}[
    width=.95\textwidth,
    height=.38\textwidth,
    yticklabel style={/pgf/number format/precision=1,/pgf/number format/fixed zerofill},
    xticklabel style={/pgf/number format/1000 sep=},
    xlabel style={yshift=0.5em},
    xlabel={\footnotesize{Year}},ylabel={\footnotesize{\# of sents.}},
    ymin=1,ymax=1000000000000,
    xmin=1999,xmax=2020,xtick={2000,2005,2010,2015,2020},
    legend style={yshift=-5em,xshift=0em,legend cell align=left,legend plot pos=right}
]

\addplot[purple,mark=square,mark=star,very thick] coordinates {(2001,10000) (2005,2000000) (2008,8000000) (2009,9000000) (2011,10000000) (2012,12000000) (2014,20000000) (2016,30000000) (2018,40000000) };
\addlegendentry{\tiny{Bi-text used in MT papers}\ \ \ \ \ \ \ \ \ \ }
\only<3->{
\addplot[ublue,mark=otimes*,very thick] coordinates {(2005,10000000) (2008,100000000) (2012,3000000000) (2016,5000000000) (2019,10000000000) };
\addlegendentry{\tiny{Bi-text used in practical systems}}
}

\end{semilogyaxis}
}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{Advantages of Deep Learning}

%%%------------------------------------------------------------------------------------------------------------
%%% End-to-end learning
\begin{frame}{End-to-End Learning}
\begin{itemize}
\item Deep neural networks give us a mechanism for learning the mapping from input to output directly, known as \alert{end-to-end learning}
    \begin{itemize}
    \item<2-> \textbf{Feature-engineering-based methods}: require many hand-crafted features, a process that often builds implicit assumptions into the problem
    \item<3-> \textbf{End-to-end methods}: no hand-crafted features; the whole process is modeled by the neural network
    \end{itemize}
\end{itemize}
\vspace{-0.5em}
\begin{center}
\visible<2->{
\includegraphics[scale=0.31]{./Figures/end2end-learning-1.jpg}\\
}
\visible<3->{
\Large{\textbf{VS.}}\\
\vspace{0.3em}
\includegraphics[scale=0.31]{./Figures/end2end-learning-2.jpg}\\
}
\end{center}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Performance of deep learning
\begin{frame}{How Well Does Deep Learning Work? - Language Modeling as an Example}
\begin{itemize}
\item \textbf{For example}, on the language modeling (LM) task, methods based on neural networks and deep learning show a huge advantage: on the PTB data the perplexity (PPL) has dropped dramatically (lower PPL is better)
	\begin{itemize}
	\item Traditional $n$-gram models suffer from data sparseness and related problems
	\item<2-> Neural language models describe the sequence generation problem better
	\end{itemize}
\end{itemize}
\begin{tabular}{l | l | l | r}
Model & Authors & Year & PPL  \\ \hline
3-gram LM & Brown et al. & 1992 & 178.0 \pause \\ \hline
Feed-forward Neural LM & Bengio et al. & 2003 & 162.2 \\
Recurrent NN-based LM & Mikolov et al. & 2010 & 124.7 \\
Recurrent NN-LDA & Mikolov et al. & 2012 & 92.0 \\
LSTM & Zaremba et al. & 2014 & 78.4 \\
RHN & Zilly et al. & 2016 & 65.4 \\
AWD-LSTM & Merity et al. & 2018 & 58.8 \\
GPT-2 (Transformer) & Radford et al. & 2019 & \alert{35.7}
\end{tabular}
\end{frame}
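
%%%------------------------------------------------------------------------------------------------------------
%%% How PPL can be computed (an illustrative sketch added for this deck)
\begin{frame}[fragile]{Aside - How Can PPL Be Computed?}
A minimal sketch of corpus-level perplexity, assuming a model that assigns a probability to every token of the test data. The function name and the toy numbers are ours, purely for illustration:

{\scriptsize
\begin{verbatim}
import math

def perplexity(token_probs):
    # token_probs: the probability the model assigns to each
    # token of the test corpus, e.g. [0.1, 0.02, ...]
    n = len(token_probs)
    log_sum = sum(math.log(p) for p in token_probs)
    # PPL = exp(-(1/N) * sum_i log P(w_i | history))
    return math.exp(-log_sum / n)

print(perplexity([0.2, 0.1, 0.05]))  # better model -> higher
                                     # probs -> lower PPL
\end{verbatim}
}
\end{frame}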

%%%------------------------------------------------------------------------------------------------------------
\section{Neural Network Basics}

%%%------------------------------------------------------------------------------------------------------------
\subsection{A Simple Example}

%%%------------------------------------------------------------------------------------------------------------
%%% Neurons
\begin{frame}{The Basic Unit of Neural Networks - The Neuron}
\begin{itemize}
\item In biology, the neuron is the basic building block of the nervous system. Many people imagine a neural network to look like this\\
\begin{center}
\includegraphics[scale=0.25]{./Figures/neuron-real.jpg}\\
\end{center}
\item<2-> But what we discuss here is the \textbf{artificial neuron}, which actually looks like this :)
    \begin{itemize}
    \item The input $\textbf{x}$ is linearly transformed by $\textbf{w}$, the bias $\textbf{b}$ is added, the result then goes through the activation function $f$, and finally we get $\textbf{y}$ - what on earth is that???
    \end{itemize}
{\Large
\begin{displaymath}
\textbf{y} = f(\textbf{x} \cdot \textbf{w} + \textbf{b})
\end{displaymath}
}
\\
\vspace{-0.5em}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% The perceptron
\begin{frame}{The Simplest Artificial Neuron - The Perceptron}
\begin{itemize}
\item The perceptron is one instance of the artificial neuron. Proposed in the 1950s-60s, it had a profound influence on neural network research.
    \begin{itemize}
    \item<2-> The \textbf{inputs} are binary variables, $x_i=0$ or $1$
    \item<3-> Each input variable has a real-valued \textbf{weight} $w_i$
    \item<4-> The \textbf{output} is also binary, $y=0$ or $1$, decided by whether the weighted sum of the inputs is above (or below) a threshold $\sigma$
    \begin{displaymath}
    y = \left\{ \begin{array}{ll}
    0 & \sum_i {w_i \cdot x_i} < \sigma \\
    1 & \sum_i {w_i \cdot x_i} \ge \sigma
    \end{array} \right.
    \end{displaymath}
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,circle,draw,ublue,very thick,minimum size=3.5em,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] (neuron) at (0,0) {};
\node [anchor=east] (x1) at ([xshift=-6em]neuron.west) {\Large{$x_1$}};
\node [anchor=center] (x0) at ([yshift=3em]x1.center) {\Large{$x_0$}};
\node [anchor=center] (x2) at ([yshift=-3em]x1.center) {\Large{$x_2$}};
\node [anchor=west] (y) at ([xshift=6em]neuron.east) {\Large{$y$}};

\draw [->,thick] (x0.east) -- (neuron.150) node [pos=0.5,above] {$w_0$};
\draw [->,thick] (x1.east) -- (neuron.180) node [pos=0.5,above] {$w_1$};
\draw [->,thick] (x2.east) -- (neuron.210) node [pos=0.5,above] {$w_2$};
\draw [->,thick] (neuron.east) -- (y.west);

\visible<2>{
\draw [->,thick,red] (x0.east) -- (neuron.150) node [pos=0.5,above] {\black{$w_0$}};
\draw [->,thick,red] (x1.east) -- (neuron.180) node [pos=0.5,above] {\black{$w_1$}};
\draw [->,thick,red] (x2.east) -- (neuron.210) node [pos=0.5,above] {\black{$w_2$}};
}

\visible<3>{
\draw [->,thick] (x0.east) -- (neuron.150) node [pos=0.5,above] {\red{$w_0$}};
\draw [->,thick] (x1.east) -- (neuron.180) node [pos=0.5,above] {\red{$w_1$}};
\draw [->,thick] (x2.east) -- (neuron.210) node [pos=0.5,above] {\red{$w_2$}};
}

\visible<4->{
\node [anchor=center] (neuronmath) at (neuron.center) {\red{\small{$\sum \ge \sigma$}}};
}

\visible<5->{
\node [anchor=south] (prediction) at ([xshift=-2em,yshift=1em]y.north west) {\footnotesize{\red{$x_0 w_0 + x_1 w_1 + x_2 w_2 \ge \sigma$}}};
\draw [->,thick,red] (neuron.east) -- (y.west);
\node [anchor=west] (yvalue) at ([yshift=0.2em]y.east) {\Large{$=1$}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% The perceptron - an example
\begin{frame}{An Example}
\begin{itemize}
\item A very simple example: there is a concert and you cannot decide whether to go. Three factors influence your decision
    \begin{itemize}
    \item $x_0$: Is the theater close enough?
    \item $x_1$: Is the ticket under 300 RMB?
    \item $x_2$: Does your girlfriend like concerts?
    \end{itemize}
\item<2-> How to decide? Say your girlfriend really wants to go with you, but the theater is far away and the ticket costs 500 RMB. If all these factors matter equally to you, there is an overall score:
    \begin{displaymath}
    x_0 \cdot w_0 + x_1 \cdot w_1 + x_2 \cdot w_2 = 0 \cdot 1 + 0 \cdot 1 + 1 \cdot 1 = 1
    \end{displaymath}
\item<3-> If you are not too picky and can live with imperfection, you might have $\sigma=1$, so that
    \begin{displaymath}
    \sum_i x_i \cdot w_i \ge \sigma
    \end{displaymath}
    \textbf{then} you will go to the concert
\end{itemize}
\end{frame}
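
%%%------------------------------------------------------------------------------------------------------------
%%% The perceptron example as code (an illustrative sketch added for this deck)
\begin{frame}[fragile]{An Example - As Code}
A minimal Python sketch of the decision above (the names are ours, not a fixed API); it implements exactly the thresholded weighted sum from the previous slides:

{\scriptsize
\begin{verbatim}
def perceptron(x, w, sigma):
    # y = 1 if sum_i x_i * w_i >= sigma, else 0
    s = sum(xi * wi for xi, wi in zip(x, w))
    return 1 if s >= sigma else 0

# theater far away (x0=0), ticket over 300 RMB (x1=0),
# girlfriend wants to go (x2=1)
x = [0, 0, 1]
w = [1, 1, 1]                     # all factors equally important
print(perceptron(x, w, sigma=1))  # -> 1, go to the concert
\end{verbatim}
}
With the penny pincher's weights \texttt{w = [0.5, 2, 0.5]} from the next slide, the same call returns 0: you stay home.
\end{frame}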

%%%------------------------------------------------------------------------------------------------------------
%%% The perceptron - an example: weights
\begin{frame}{An Example - Weights}
\begin{itemize}
\item As you can see, this decision process is essentially a perceptron
\item<2-> But people are not perfect and care about some things more than others. If you are a \textbf{penny pincher}, you weigh the ticket price more heavily and score each factor with uneven weights, e.g., $w_0=0.5$, $w_1=2$, $w_2=0.5$
\item<3-> Your girlfriend really wants to go with you, but the theater is far away and the ticket costs 500 RMB, so you \alert{choose not to go} to the concert (giving up on your girlfriend - now what?)
    \begin{displaymath}
    \sum_i x_i \cdot w_i = 0 \cdot 0.5 + 0 \cdot 2 + 1 \cdot 0.5 = 0.5 < \sigma = 1
    \end{displaymath}
\end{itemize}

\vspace{-1.8em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,circle,draw,ublue,very thick,minimum size=3.5em,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] (neuron) at (0,0) {};
\node [anchor=east] (x1) at ([xshift=-6em]neuron.west) {$x_1$: cheap enough?};
\node [anchor=center] (x0) at ([yshift=3em]x1.center) {$x_0$: close enough?};
\node [anchor=center] (x2) at ([yshift=-3em]x1.center) {$x_2$: girlfriend likes it?};
\node [anchor=west] (y) at ([xshift=2em]neuron.east) {$y$: go or not?};

\visible<1>{
\draw [->,thick] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {\small{$w_0=1$}};
\draw [->,thick] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {\small{$w_1=1$}};
\draw [->,thick] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {\small{$w_2=1$}};
}
\draw [->,thick] (neuron.east) -- (y.west);

\node [anchor=center] (neuronmath) at (neuron.center) {\small{$\sum \ge \sigma$}};

\visible<2->{
\draw [->,thin,red] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {\small{$w_0=.5$}};
\draw [->,line width=0.8mm,red] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {\textbf{\small{$w_1=2$}}};
\draw [->,thin,red] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {\small{$w_2=.5$}};
}

\visible<3->{
\node [anchor=south] (ylabel) at (y.north) {\red{\textbf{Not going!}}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% The perceptron - an example: input representation
\begin{frame}{An Example - Input Representation}

\begin{itemize}
\item After taking ten thousand points of damage from your girlfriend, you realize that a decision should not rest only on all-or-nothing factors; ``degree'' should be taken into account:
    \begin{itemize}
    \item $x_0$: 10/distance
    \item $x_1$: 150/price
    \item $x_2$: Does your girlfriend like it? (you dare not change this one)
    \end{itemize}
\item<2-> In the new model, $x_0$ and $x_1$ are continuous variables, while $x_2$ is a discrete variable
\end{itemize}

\visible<2->{
\begin{tikzpicture}

\begin{scope}
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\draw [-,very thick,ublue,domain=0.6:2,samples=100] plot (\x,{ 1/\x - 0.2});
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_0$}};
\node [anchor=north] (xlabel) at (5em, 0em) {\scriptsize{distance (km)}};
\end{scope}

\begin{scope}[xshift=9em]
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\draw [-,very thick,ublue,domain=0.4:2,samples=100] plot (\x,{ 0.5/\x});
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_1$}};
\node [anchor=north] (xlabel) at (5em, 0em) {\scriptsize{price (RMB)}};
\end{scope}

\begin{scope}[xshift=18em]
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_2$}};
\node [anchor=south, fill=ublue, minimum width=1.5em, minimum height=0.1em, inner sep=0] (histogram1) at (1.5em, 0) {};
\node [anchor=south, fill=ublue, minimum width=1.5em, minimum height=3em, inner sep=0] (histogram2) at (4.0em, 0) {};
\node [anchor=north] (hlabel1) at (histogram1.south) {\tiny{GF: no}};
\node [anchor=north] (hlabel2) at (histogram2.south) {\tiny{GF: yes}};
\end{scope}

\end{tikzpicture}
}

\begin{itemize}
\item<3-> Your girlfriend really wants to go with you, but the theater is 20 km away and the ticket costs 500 RMB. Then $x_0 = 10/20 = 0.5$, $x_1 = 150/500 = 0.3$, $x_2 = 1$. Overall, $\sum_i x_i \cdot w_i \ge \sigma$, so you still {\color{red} go to the concert} :)
    \begin{displaymath}
    \sum_i x_i \cdot w_i = 0.5 \cdot 0.5 + 0.3 \cdot 2 + 1 \cdot 0.5 = 1.35 \ge \sigma = 1
    \end{displaymath}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% The perceptron - an example - learning
\begin{frame}{An Example - Learning}

\begin{itemize}
\item After one successful concert, you seem to have grasped the truth: as long as your girlfriend is happy, why not raise the weight of that factor. The simplest way is to set both $w_0$ and $w_1$ to 0 and let $w_2 > 0$
\item<3-> Soon there is another concert, 1000 km away, with a 3000 RMB ticket (travel not included); of course, your girlfriend still loves concerts. Under the new decision model, you resolutely \alert{decide to go}
\item<4-> \textbf{Afterwards}, your girlfriend deals you another ten thousand points of damage. Ouch!!!
    \begin{itemize}
    \item \alert{What you find out}: your girlfriend wants romance, but she also minds the money
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,circle,draw,ublue,very thick,minimum size=3.5em,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] (neuron) at (0,0) {};
\node [anchor=east] (x1) at ([xshift=-6em]neuron.west) {$x_1$: cheapness\ \ \ \ };
\node [anchor=center] (x0) at ([yshift=3em]x1.center) {$x_0$: closeness\ \ \ \ };
\node [anchor=center] (x2) at ([yshift=-3em]x1.center) {$x_2$: girlfriend likes it?};
\node [anchor=west] (y) at ([xshift=2em]neuron.east) {$y$: go or not?};

\draw [->,thick] (neuron.east) -- (y.west);

\node [anchor=center] (neuronmath) at (neuron.center) {\small{$\sum \ge \sigma$}};

\visible<1>{
\draw [->,thin] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {\small{$w_0=.5$}};
\draw [->,line width=0.8mm] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {\textbf{\small{$w_1=2$}}};
\draw [->,thin] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {\small{$w_2=.5$}};
}

\visible<2->{
\draw [->,dotted] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {\small{$w_0=0$}};
\draw [->,dotted] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {\textbf{\small{$w_1=0$}}};
\draw [->,line width=1mm] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {\small{$w_2=10$}};
}


\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% The perceptron - an example - learning (cont.)
\begin{frame}{An Example - Learning the Weights}
\begin{itemize}
\item \textbf{After painful reflection}, you realize that each factor's weight must be set accurately to achieve the best decisions
    \begin{itemize}
    \item How do we determine the best weights?
    \end{itemize}
\item<2-> \textbf{Of course}, you are someone who learns by doing
    \begin{itemize}
    \item The method is simple: keep trying and adjust the weights according to the outcomes (see the code sketch on the next slide)
    \item<10-> After running many trials, you find a relatively good set of weights
    \end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}

\begin{scope}[scale=0.6]
\visible<3->{
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\draw [-,very thick,ublue,domain=0.6:2,samples=100] plot (\x,{ 1/\x - 0.2});
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_0$}};
\node [anchor=north] (xlabel) at (5em, 0em) {\tiny{distance (km)}};
}

\visible<4->{
\draw [-,thick] (0.25,-1.5) -- (2.25,-1.5);
\node [anchor=east] (wlabel) at (0.25,-1.5) {\footnotesize{$w_0$}};
}

\visible<5>{\node [anchor=north,fill=ugreen,minimum height=0.5em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<6>{\node [anchor=north,fill=ugreen,minimum height=0.3em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<7>{\node [anchor=north,fill=ugreen,minimum height=1.8em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<8>{\node [anchor=north,fill=ugreen,minimum height=0.3em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<9>{\node [anchor=north,fill=ugreen,minimum height=0.3em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<10->{\node [anchor=north,fill=ugreen,minimum height=0.8em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}

\end{scope}

\begin{scope}[scale=0.6,xshift=12em]
\visible<3->{
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\draw [-,very thick,ublue,domain=0.4:2,samples=100] plot (\x,{ 0.5/\x});
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_1$}};
\node [anchor=north] (xlabel) at (5em, 0em) {\tiny{price (RMB)}};
}

\visible<4->{
\draw [-,thick] (0.25,-1.5) -- (2.25,-1.5);
\node [anchor=east] (wlabel) at (0.25,-1.5) {\footnotesize{$w_1$}};
}

\visible<5>{\node [anchor=north,fill=ugreen,minimum height=0.5em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<6>{\node [anchor=north,fill=ugreen,minimum height=1.5em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<7>{\node [anchor=north,fill=ugreen,minimum height=0.8em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<8>{\node [anchor=north,fill=ugreen,minimum height=0.1em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<9>{\node [anchor=north,fill=ugreen,minimum height=1.0em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<10->{\node [anchor=north,fill=ugreen,minimum height=0.3em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}

\end{scope}

\begin{scope}[scale=0.6,xshift=24em]
\visible<3->{
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_2$}};
\node [anchor=south, fill=ublue, minimum width=0.8em, minimum height=0.1em, inner sep=0] (histogram1) at (1.5em, 0) {};
\node [anchor=south, fill=ublue, minimum width=0.8em, minimum height=2em, inner sep=0] (histogram2) at (4.0em, 0) {};
\node [anchor=north,align=left] (hlabel1) at (histogram1.south) {\tiny{GF: no}};
\node [anchor=north,align=left] (hlabel2) at ([xshift=0.5em]histogram2.south) {\tiny{GF: yes}};
}

\visible<4->{
\draw [-,thick] (0.25,-1.5) -- (2.25,-1.5);
\node [anchor=east] (wlabel) at (0.25,-1.5) {\footnotesize{$w_2$}};
}

\visible<5>{\node [anchor=north,fill=ugreen,minimum height=0.5em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<6>{\node [anchor=north,fill=ugreen,minimum height=1.2em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<7>{\node [anchor=north,fill=ugreen,minimum height=0.8em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<8>{\node [anchor=north,fill=ugreen,minimum height=1.2em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<9>{\node [anchor=north,fill=ugreen,minimum height=1.5em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<10->{\node [anchor=north,fill=ugreen,minimum height=1.3em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}

\end{scope}

\end{tikzpicture}
\end{center}

}

\visible<5->{
\begin{center}
\begin{tabular}{c<{\onslide<5->}c<{\onslide<6->}c<{\onslide<7->}c<{\onslide<8->}c<{\onslide<9->}c<{\onslide<10->}c<{\onslide}}
Trial & 1 & 2 & 3 & 4 & ... & 10k \\
Result & fail & success & fail & fail & ... & success
\end{tabular}
\end{center}
}

\end{frame}
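
%%%------------------------------------------------------------------------------------------------------------
%%% Weight search as code (an illustrative sketch added for this deck)
\begin{frame}[fragile]{An Example - Weight Search as Code}
The trial-and-error procedure above is essentially a random search over the weights. A minimal sketch, with made-up past decisions standing in for the 10k experiments:

{\scriptsize
\begin{verbatim}
import random

# past cases: (inputs x, the decision that worked out)
cases = [([0.5, 0.3, 1], 1), ([0.9, 0.8, 0], 0), ([0.1, 0.1, 1], 0)]

def accuracy(w, sigma=1.0):
    ok = 0
    for x, y in cases:
        s = sum(xi * wi for xi, wi in zip(x, w))
        ok += int((1 if s >= sigma else 0) == y)
    return ok / len(cases)

best_w, best_acc = None, -1.0
for trial in range(10000):                 # "many experiments"
    w = [random.uniform(0, 2) for _ in range(3)]
    if accuracy(w) > best_acc:
        best_w, best_acc = w, accuracy(w)
print(best_w, best_acc)
\end{verbatim}
}
Gradient-based training, discussed later, replaces this blind search with directed updates.
\end{frame}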

%%%------------------------------------------------------------------------------------------------------------
%%% The perceptron - an example - summary
\begin{frame}{An Example - Summary}
\begin{itemize}
\item Even for a simple problem, it is not easy to design a sound method that makes accurate decisions. In the model above, several \alert{questions} still need answers
    \begin{itemize}
    \item<2-> Modeling the problem, i.e., defining the form of the inputs $\{x_i\}$
    \item<3-> Designing an effective decision model, i.e., defining $y$
    \item<4-> Determining the optimal values of the model parameters (e.g., the weights $\{w_i\}$)
    \end{itemize}
\end{itemize}

\vspace{-2em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,circle,draw,ublue,very thick,minimum size=3.5em,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] (neuron) at (0,0) {};

\visible<2->{
\node [anchor=east] (x1) at ([xshift=-6em]neuron.west) {$x_1$: cheapness\ \ \ \ };
\node [anchor=center] (x0) at ([yshift=3em]x1.center) {$x_0$: closeness\ \ \ \ };
\node [anchor=center] (x2) at ([yshift=-3em]x1.center) {$x_2$: girlfriend likes it?};
}

\visible<3->{
\node [anchor=west] (y) at ([xshift=2em]neuron.east) {$y$: go or not?};
\node [anchor=center] (neuronmath) at (neuron.center) {\small{$\sum \ge \sigma$}};
}

\draw [->,thick] (neuron.east) -- (y.west);

\draw [->,thick] (x0.east) -- (neuron.150);
\draw [->,thick] (x1.east) -- (neuron.180);
\draw [->,thick] (x2.east) -- (neuron.210);

\visible<4->{
\draw [->,thick] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {$w_0$};
\draw [->,thick] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {$w_1$};
\draw [->,thick] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {$w_2$};
}

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-0.5em}

\begin{itemize}
\item<5-> \textbf{Of course}, the coming sections will address these questions - and more :)
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{Artificial Neurons}

%%%------------------------------------------------------------------------------------------------------------
%%% outline
\begin{frame}{Three Basic Questions for Getting Started with Neural Networks (Deep Learning)}

\vspace{1em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{1. What is the basic unit of an artificial neural network,}

\vspace{0.4em}
\textbf{\hspace{0.9em} and how can it be combined into more powerful models?}
}
\end{tcolorbox}

\vspace{0.5em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{2. What is the mathematical description of an artificial neural network,}

\vspace{0.4em}
\textbf{\hspace{0.9em} and how can this mathematical model be implemented in code?}
}
\end{tcolorbox}

\vspace{0.5em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{3. How are the parameters of the model learned,}

\vspace{0.4em}
\textbf{\hspace{0.9em} and how is the learned model then used for inference?}
}
\end{tcolorbox}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% outline: problem 1
\begin{frame}{First}

\vspace{6em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{What is the basic unit of an artificial neural network,}

\vspace{0.4em}
\textbf{and how can it be combined into more powerful models?}
}
\end{tcolorbox}

\vspace{2em}
\begin{center}
\begin{tikzpicture}
\node [fill=blue!10] (label) at (0,0) {\Large{$\textbf{y} = ?(\textbf{x})$ }};
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Linear algebra basics
\begin{frame}{Warm-up - Linear Algebra Basics}
\begin{itemize}
\item \textbf{Matrices}: we write $a$ for a scalar (a single number) and bold $\textbf{a}$ for a matrix (or vector), where $a_{ij}$ is the element of $\textbf{a}$ in row $i$, column $j$\\
    \begin{displaymath}
    a = 5 \hspace{3em} \textbf{a} = \begin{pmatrix} a_{11} & a_{12} \\ a_{21} & a_{22} \end{pmatrix} = \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}
    \end{displaymath}
\item \textbf{Vectors}: a special matrix with only one row or one column; by default we use row vectors here, e.g., $\textbf{a} = (a_1,a_2,a_3) = (10, 20, 30)$, and the corresponding column vector is written $\textbf{a}^T$
\item<2-> \textbf{Algebraic operations}: matrices support elementwise operations such as + and -; for $\textbf{a} = \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}$ and $\textbf{b} = \begin{pmatrix} 1 & 1 \\ 1 & 1 \end{pmatrix}$, we have $\textbf{a} + \textbf{b} = \begin{pmatrix} 2 & 3 \\ 4 & 5 \end{pmatrix}$
\item<3-> \textbf{Matrix differentiation}: performed elementwise; for a matrix $\textbf{c}$ and a scalar $x$:
    \begin{displaymath}
    \frac{\partial \textbf{c}}{\partial x} = \begin{pmatrix} \frac{\partial c_{11}}{\partial x} & \frac{\partial c_{12}}{\partial x} \\ \frac{\partial c_{21}}{\partial x} & \frac{\partial c_{22}}{\partial x} \end{pmatrix} \hspace{2em} \frac{\partial x}{\partial \textbf{c}} = \begin{pmatrix} \frac{\partial x}{\partial c_{11}} & \frac{\partial x}{\partial c_{12}} \\ \frac{\partial x}{\partial c_{21}} & \frac{\partial x}{\partial c_{22}} \end{pmatrix}
    \end{displaymath}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Linear algebra basics (cont.)
\begin{frame}{Warm-up - Linear Algebra Basics (cont.)}
\begin{itemize}


\item \textbf{Matrix multiplication}: for $\textbf{a} \in \mathbb{R}^{n \times k}$ and $\textbf{b} \in \mathbb{R}^{k \times m}$, the matrix product of \textbf{a} and \textbf{b} is written $\textbf{c} = \textbf{a} \textbf{b} \in \mathbb{R}^{n \times m}$, where
    \begin{displaymath}
    c_{pq} = \sum_{i = 1}^k a_{pi} b_{iq}
    \end{displaymath}
    The system of equations $\left\{ \begin{array}{l} 5x_{1} + 2x_{2} = y_{1} \\ 3x_{1} + x_{2} = y_{2}\end{array} \right.$ can be written as $\textbf{a} \textbf{x}^T = \textbf{y}^T$, where $\textbf{a}=\begin{pmatrix} 5 & 2 \\ 3 & 1 \end{pmatrix}$, $\textbf{x}^T =\begin{pmatrix} x_1 \\ x_2 \end{pmatrix}$, and $\textbf{y}^T =\begin{pmatrix} y_1 \\ y_2 \end{pmatrix}$
\item<2-> \textbf{Other notions}
    \begin{itemize}
    \item \textbf{Identity matrix}: a square matrix $\textbf{I}$ with $I_{ij} = 1$ if and only if $i=j$, and $I_{ij} = 0$ otherwise
    \item \textbf{Transpose}: the transpose of $\textbf{a}$ is written $\textbf{a}^T$, with $a^T_{ji}=a_{ij}$
    \item \textbf{Inverse}: the inverse of a square matrix $\textbf{a}$ is written $\textbf{a}^{-1}$, with $\textbf{a} \textbf{a}^{-1} = \textbf{a}^{-1} \textbf{a} = \textbf{I}$
    \item \textbf{Norm of a vector (matrix)}: $||\textbf{a}||_p = \big( \sum_i |a_i|^p \big)^{\frac{1}{p}}$
    \end{itemize}

\end{itemize}
\end{frame}
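
%%%------------------------------------------------------------------------------------------------------------
%%% Linear algebra in code (an illustrative sketch added for this deck)
\begin{frame}[fragile]{Warm-up - The Same Operations in Code}
A minimal NumPy sketch of the operations from the last two slides (the choice of NumPy is ours; any linear algebra library works):

{\scriptsize
\begin{verbatim}
import numpy as np

a = np.array([[1, 2], [3, 4]])
b = np.array([[1, 1], [1, 1]])
print(a + b)                    # elementwise: [[2 3] [4 5]]
print(a.T)                      # transpose

A = np.array([[5, 2], [3, 1]])  # the system  a x^T = y^T
x = np.array([1, 2])
print(A @ x)                    # matrix product -> y = (9, 5)
print(np.linalg.inv(A))         # inverse: A @ inv(A) = I
print(np.linalg.norm(x, 2))     # L2 norm of a vector
\end{verbatim}
}
\end{frame}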

%%%------------------------------------------------------------------------------------------------------------
%%% The functional form of an artificial neuron
\begin{frame}{An Artificial Neuron Is a Function}

\begin{itemize}
\item A neuron:
\end{itemize}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}

\node [anchor=center] (y) at (0,0) {\LARGE{$\textbf{y}$}};
\node [anchor=west] (eq) at (y.east) {\LARGE{$=$}};
\node [anchor=west] (func) at (eq.east) {\LARGE{$f$}};
\node [anchor=west] (brace01) at (func.east) {\LARGE{$($}};
\node [anchor=west] (x) at (brace01.east) {\LARGE{$\textbf{x}$}};
\node [anchor=west] (dot) at (x.east) {\LARGE{$\cdot$}};
\node [anchor=west] (w) at (dot.east) {\LARGE{$\textbf{w}$}};
\node [anchor=west] (plus) at (w.east) {\LARGE{$+$}};
\node [anchor=west] (b) at (plus.east) {\LARGE{$\textbf{b}$}};
\node [anchor=west] (brace02) at (b.east) {\LARGE{$)$}};
821 822

\visible<2->{
\node [anchor=center,fill=yellow!30] (x2) at (x) {\LARGE{$\textbf{x}$}};
\node [anchor=south] (xlabel) at ([yshift=1.5em]x.north) {input};
\draw [<-] ([yshift=0.2em]x2.north) -- (xlabel.south);
}

\visible<3->{
\node [anchor=center,fill=green!20] (w2) at (w) {\LARGE{$\textbf{w}$}};
\node [anchor=north] (wlabel) at ([yshift=-1.5em]w.south) {parameters (weights)};
\draw [<-] ([yshift=-0.2em]w2.south) -- (wlabel.north);
}

\visible<4->{
\node [anchor=center,fill=purple!20] (b2) at (b) {\LARGE{$\textbf{b}$}};
\node [anchor=south] (blabel) at ([yshift=1.3em]b.north) {bias};
\draw [<-] ([yshift=0.2em]b2.north) -- (blabel.south);
}

\visible<5->{
\node [anchor=center,fill=blue!20] (func2) at (func) {\LARGE{$f$}};
\node [anchor=north] (funclabel) at ([yshift=-1.1em]func.south) {activation function};
\draw [<-] ([yshift=-0.2em]func2.south) -- (funclabel.north);
}

\visible<6->{
\node [anchor=center,fill=red!20] (y2) at (y) {\LARGE{$\textbf{y}$}};
\node [anchor=south] (ylabel) at ([yshift=1.3em]y.north) {output};
\draw [<-] ([yshift=0.2em]y2.north) -- (ylabel.south);
}

\end{tikzpicture}
\end{center}

\vspace{-1em}
\begin{itemize}
\item<7-> Take the perceptron as an example
	\begin{itemize}
	\item Input: $\textbf{x}=(x_0,...,x_n)$
	\item Weights: $\textbf{w}=(w_0,...,w_n)$
	\item Bias: $\textbf{b} = (-\sigma)$
	\item Activation function: $f(z)=1$ if $z \ge 0$, otherwise $f(z)=0$
	\item Output: $\textbf{y}=f(\textbf{x} \cdot \textbf{w} - \sigma)$
	\end{itemize}
\end{itemize}

\end{frame}
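
%%%------------------------------------------------------------------------------------------------------------
%%% The neuron as code (an illustrative sketch added for this deck)
\begin{frame}[fragile]{An Artificial Neuron Is a Function - In Code}
The whole neuron fits in one line of code. A minimal sketch (our own illustration), instantiated as the concert perceptron with $\textbf{b} = (-\sigma)$:

{\scriptsize
\begin{verbatim}
import numpy as np

def neuron(x, w, b, f):
    return f(np.dot(x, w) + b)           # y = f(x . w + b)

step = lambda z: 1.0 if z >= 0 else 0.0  # perceptron activation

x = np.array([0.5, 0.3, 1.0])            # closeness, cheapness, GF
w = np.array([0.5, 2.0, 0.5])
b = -1.0                                 # b = -sigma
print(neuron(x, w, b, step))             # 1.0 -> go to the concert
\end{verbatim}
}
\end{frame}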

%%%------------------------------------------------------------------------------------------------------------
%%% The concept of a layer
\begin{frame}{The Concept of a ``Layer''}
\begin{itemize}
\item For one problem (the same input) there may be several outputs; in that case we can \alert{put multiple identical neurons side by side} to form a \alert{``layer''}
    \begin{itemize}
    \item For example, a weather forecast may need to predict humidity and temperature at the same time
    \end{itemize}
\end{itemize}

\vspace{-2em}

\begin{center}
\begin{tikzpicture}
\begin{scope}

\tikzstyle{neuronnode} = [minimum size=1.5em,circle,draw,ublue,very thick,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}]

\node [anchor=center,neuronnode] (neuron00) at (0,0) {};
\visible<2->{
\node [anchor=center,neuronnode] (neuron01) at ([yshift=-3em]neuron00) {};
}
\visible<3->{
\node [anchor=center,neuronnode] (neuron02) at ([yshift=-3em]neuron01) {};
}

\node [anchor=east] (x0) at ([xshift=-6em]neuron00.west) {$x_0$};
\node [anchor=east] (x1) at ([xshift=-6em]neuron01.west) {$x_1$};
\node [anchor=east] (x2) at ([xshift=-6em]neuron02.west) {$b$};

\node [anchor=west] (y0) at ([xshift=4em]neuron00.east) {$y_0$};

\draw [->] (x0.east) -- (neuron00.180) node [pos=0.1,above] {\tiny{$w_{00}$}};
\draw [->] (x1.east) -- (neuron00.200) node [pos=0.1,above] {\tiny{$w_{10}$}};
\draw [->] (x2.east) -- (neuron00.220) node [pos=0.05,above,yshift=0.3em] {\tiny{$b_{0}$}};
\draw [->] (neuron00.east) -- (y0.west);

\visible<2->{
\node [anchor=west] (y1) at ([xshift=4em]neuron01.east) {$y_1$};
\draw [->] (x0.east) -- (neuron01.160) node [pos=0.4,above] {\tiny{$w_{01}$}};
\draw [->] (x1.east) -- (neuron01.180) node [pos=0.35,above,yshift=-0.2em] {\tiny{$w_{11}$}};
\draw [->] (x2.east) -- (neuron01.200) node [pos=0.3,below,yshift=0.2em] {\tiny{$b_{1}$}};
\draw [->] (neuron01.east) -- (y1.west);
}

\visible<3->{
\node [anchor=west] (y2) at ([xshift=4em]neuron02.east) {$y_2$};
\draw [->] (x0.east) -- (neuron02.140) node [pos=0.1,below,yshift=-0.2em] {\tiny{$w_{02}$}};
\draw [->] (x1.east) -- (neuron02.160) node [pos=0.1,below] {\tiny{$w_{12}$}};
\draw [->] (x2.east) -- (neuron02.180) node [pos=0.3,below] {\tiny{$b_{2}$}};
\draw [->] (neuron02.east) -- (y2.west);
}

\visible<4->{
\node [anchor=east,align=left] (inputlabel) at ([xshift=-0.1em]x1.west) {input vector:\\\small{$\textbf{x}=(x_0,x_1)$}};
}
\visible<5->{
\node [anchor=west,align=left] (outputlabel) at ([xshift=0.1em]y1.east) {output vector:\\\small{$\textbf{y}=(y_0,y_1,y_2)$}};
}

\begin{pgfonlayer}{background}
\visible<6->{
\node [rectangle,inner sep=0.4em,fill=red!20] [fit = (neuron00) (neuron01) (neuron02)] (layer) {};
\node [anchor=south] (layerlabel) at ([yshift=0.2em]layer.north) {a layer of neurons};
}

\visible<4->{
\node [rectangle,inner sep=0.1em,fill=ugreen!20] [fit = (x0) (x1)] (inputshadow) {};
}
\visible<5->{
\node [rectangle,inner sep=0.1em,fill=blue!20] [fit = (y0) (y1) (y2)] (outputshadow) {};
}
\end{pgfonlayer}

\visible<7->{
\node [anchor=north west] (wlabel) at ([yshift=-1em,xshift=-7em]x2.south) {parameters (matrix): $\textbf{w} = \Big( \begin{array}{lll} w_{00} & w_{01} & w_{02} \\ w_{10} & w_{11} & w_{12} \end{array} \Big)$};
}
\visible<8->{
\node [anchor=west] (blabel) at (wlabel.east) {parameters (vector): $\textbf{b} = (b_0, b_1, b_2)$};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}
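
%%%------------------------------------------------------------------------------------------------------------
%%% A layer as code (an illustrative sketch added for this deck)
\begin{frame}[fragile]{A Layer - In Code}
A layer is the same formula with a weight matrix in place of a weight vector. A minimal sketch with the shapes from the figure ($\textbf{x}\in\mathbb{R}^2$, three neurons; the numbers are made up):

{\scriptsize
\begin{verbatim}
import numpy as np

def layer(x, W, b, f):
    # x: (2,), W: (2, 3), b: (3,)  ->  y: (3,)
    return f(x @ W + b)

x = np.array([1.0, 2.0])
W = np.array([[0.1, 0.2, 0.3],
              [0.4, 0.5, 0.6]])
b = np.array([0.1, 0.1, 0.1])
print(layer(x, W, b, np.tanh))   # all three outputs at once
\end{verbatim}
}
\end{frame}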

%%%------------------------------------------------------------------------------------------------------------
%%% What a neural network does
\begin{frame}{Neural Networks: Linear Transformation + Activation Function}
\begin{itemize}
\item For a vector $\textbf{x} \in \mathbb{R}^m$, a network layer first maps it to $\mathbb{R}^n$ by a \textbf{\alert{linear transformation}}, and then an \textbf{{\color{blue}activation function}} turns the result into $\textbf{y} \in \mathbb{R}^n$
\end{itemize}

\vspace{1em}

\begin{center}
\begin{tikzpicture}

\node [anchor=center] (y) at (0,0) {\Large{$\textbf{y}$}};
\node [anchor=west] (eq) at (y.east) {\Large{$=$}};
\node [anchor=west] (func) at (eq.east) {\Large{$f$}};
\node [anchor=west] (brace01) at (func.east) {\Large{$($}};
\node [anchor=west] (x) at (brace01.east) {\Large{$\textbf{x}$}};
\node [anchor=west] (dot) at (x.east) {\Large{$\cdot$}};
\node [anchor=west] (w) at (dot.east) {\Large{$\textbf{w}$}};
\node [anchor=west] (plus) at (w.east) {\Large{$+$}};
\node [anchor=west] (b) at (plus.east) {\Large{$\textbf{b}$}};
\node [anchor=west] (brace02) at (b.east) {\Large{$)$}};

\node [anchor=center,fill=blue!20] (func2) at (func) {\LARGE{$f$}};
\node [anchor=north] (funclabel) at ([yshift=-1.1em]func.south) {\blue{activation function}};
\draw [<-] ([yshift=-0.2em]func2.south) -- (funclabel.north);

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=red!20] [fit = (x) (w) (b)] (linear) {};
\node [anchor=north] (linearlabel) at ([yshift=-1.1em]linear.south) {\alert{linear transformation}};
\draw [<-] ([yshift=-0.2em]linear.south) -- (linearlabel.north);
\end{pgfonlayer}

\end{tikzpicture}

\begin{figure}[htp!]

\includegraphics[scale=0.24]{./Figures/wf.png}
\end{figure}
\end{center}
\end{frame}


%%%------------------------------------------------------------------------------------------------------------
%%% Linear transformations
\begin{frame}{Linear Transformations}
\begin{itemize}
\item For a linear space $V$, any $\textbf{a}$, $\textbf{b} \in V$, and any $\alpha$ in the underlying field, a linear transformation $T(\cdot)$ must satisfy
\begin{eqnarray}
T(\textbf{a} + \textbf{b}) & = & T(\textbf{a}) + T(\textbf{b}) \nonumber \\
T(\alpha \textbf{a}) & = & \alpha T(\textbf{a}) \nonumber
\end{eqnarray}
\item<2-> A geometric interpretation of linear transformations:
\end{itemize}

\vspace{-0.5em}
\visible<2->{
\begin{center}
\begin{tikzpicture}

\node [anchor=west] (x) at (0,0) {\Large{$\textbf{x}$}};
\node [anchor=west] (dot) at (x.east) {\Large{$\cdot$}};
\node [anchor=west] (w) at (dot.east) {\Large{$\textbf{w}$}};
\node [anchor=west] (plus) at (w.east) {\Large{$+$}};
\node [anchor=west] (b) at (plus.east) {\Large{$\textbf{b}$}};

\tikzstyle{neuron} = [rectangle,draw,thick,fill=red!30,red!35,minimum height=2em,minimum width=2em,font=\small]
\node[neuron,anchor=north] (a1) at ([xshift=-6em,yshift=-4em]x.south) {};
\draw[->,thick] ([xshift=-2em,yshift=0em]a1.south) to ([xshift=3em,yshift=0em]a1.south);
\draw[->,thick] ([xshift=0em,yshift=-4em]a1.west) to ([xshift=0em,yshift=2em]a1.west);
\node[below] at ([xshift=0.5em,yshift=-1em]a1.west){0};
\node[below] at ([xshift=2em,yshift=-1em]a1.west){1};
\node[below] at ([xshift=-0.5em,yshift=2em]a1.west){1};
\node [anchor=west] (x) at ([xshift=-0.7em,yshift=1em]a1.south) {\Large{$\textbf{F}$}};

\visible<3->{
\node [anchor=center,fill=green!20] (w2) at (w) {\Large{$\textbf{w}$}};
\node [anchor=north,inner sep=1pt] (wlabel) at ([yshift=-0.7em]w.south) {\small{rotation}};
\draw [<-] ([yshift=-0.2em]w2.south) -- (wlabel.north);
xiaotong committed
1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058

\tikzstyle{neuron} = [rectangle,draw,thick,fill=red!30,red!35,minimum height=2em,minimum width=2em,font=\small]
\node[neuron,anchor=north] (a2) at ([xshift=10em,yshift=0em]a1.south) {};
\draw[->,thick] ([xshift=-2em,yshift=0em]a2.north) to ([xshift=3em,yshift=0em]a2.north);
\draw[->,thick] ([xshift=0em,yshift=-2em]a2.west) to ([xshift=0em,yshift=4em]a2.west);
\node[above] at ([xshift=0.5em,yshift=1em]a2.west){0};
\node[above] at ([xshift=2em,yshift=1em]a2.west){1};
\node[below] at ([xshift=-0.5em,yshift=0em]a2.west){-1};
\node [anchor=west] (x) at ([xshift=-3.5cm,yshift=2em]a2.north) {\scriptsize{
    $w=\begin{bmatrix}
    1&0&0\\
    0&-1&0\\
    0&0&1
    \end{bmatrix}$}
    };

\node [anchor=west,rotate = 180] (x) at ([xshift=0.7em,yshift=1em]a2.south) {\Large{$\textbf{F}$}};


\draw[-stealth, line width=2pt,dashed] ([xshift=4em,yshift=0em]a1.south) to ([xshift=-3em,yshift=0em]a2.north);
}

\visible<4->{
\node [anchor=center,fill=purple!20] (b2) at (b) {\Large{$\textbf{b}$}};
\node [anchor=west] (blabel) at ([xshift=1.5em]b2.east) {shift};
\draw [<-] ([xshift=0.2em]b2.east) -- (blabel.west);
xiaotong committed
1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083

\tikzstyle{neuron} = [rectangle,draw,thick,fill=red!30,red!35,minimum height=2em,minimum width=2em,font=\small]
\node[neuron,anchor=north] (a3) at ([xshift=11em,yshift=2.05em]a2.south) {};
\draw[->,thick] ([xshift=-3em,yshift=0em]a3.north) to ([xshift=2em,yshift=0em]a3.north);
\draw[->,thick] ([xshift=-1em,yshift=-2em]a3.west) to ([xshift=-1em,yshift=4em]a3.west);
\node[above] at ([xshift=-0.5em,yshift=1em]a3.west){0};
\node[above] at ([xshift=1em,yshift=1em]a3.west){1};
\node[left] at ([xshift=-0.75em,yshift=-0.5em]a3.west){-1};
\node [anchor=west,rotate = 180] (x) at ([xshift=0.7em,yshift=1em]a3.south) {\Large{$\textbf{F}$}};


\node [anchor=west] (x) at ([xshift=-4cm,yshift=2em]a3.north) {\scriptsize{
    $b=\begin{bmatrix}
    0.5&0&0\\
    0&0&0\\
    0&0&0
    \end{bmatrix}$}
    };
\draw[-stealth, line width=2pt,dashed] ([xshift=3em,yshift=1em]a2.east) to ([xshift=-3em,yshift=1em]a3.west);
}

\end{tikzpicture}
\end{center}
}

xiaotong committed
1090

xiaotong committed
1091 1092
\end{frame}

xiaotong committed
1093

xiaotong committed
1094 1095
%%%------------------------------------------------------------------------------------------------------------
%%% 线性变换:更复杂的实例
xiaotong committed
1096
\begin{frame}[fragile]{线性变换(续)}
xiaotong committed
1097 1098
\begin{itemize}
\item 线性变换也适用于更加复杂的情况,这也给神经网络提供了拟合不同数据分布的能力
xiaotong committed
1099 1100 1101 1102
    \begin{itemize}
    \item 比如,我们可以把三维图形投影到二维平面上
    \item 再比如,我们也可以把二维平面上的图形映射到三维平面
    \end{itemize}
xiaotong committed
1103
\end{itemize}
xiaotong committed
1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207
\begin{tiny}
$$
\begin{smallmatrix}  \underbrace{
    \left\{
        \begin{smallmatrix}
            \left[
            \begin{array}{cccc}
             1& 0 &0 \\
             0& 1 &0 \\
             0& 0 &1
            \end{array}
            \right ]
            \cdots
            \left[
            \begin{array}{cccc}
                1& 0 &0 \\
                0& 1 &0 \\
                0& 0 &1
            \end{array}
            \right]
        \end{smallmatrix}
        \right\}
     }\\5
\end{smallmatrix}
\times
\begin{smallmatrix}
\left[
    \begin{array}{cccc}
    1\\
    1\\
    1
    \end{array}
\right ]
\end{smallmatrix}
=
\begin{smallmatrix}  \underbrace{
    \left\{
        \begin{smallmatrix}
            \left[
            \begin{array}{cccc}
             1 \\
             1 \\
             1
            \end{array}
            \right ]
            \cdots
            \left[
            \begin{array}{cccc}
                1 \\
                1 \\
                1
            \end{array}
            \right]
        \end{smallmatrix}
        \right\}
     }\\5
\end{smallmatrix}
$$
\end{tiny}
%\vspace{1em}

\newcommand{\plane}[1]{
(-1.95, #1, 1.35) --
++(3.6, 0.6, 0.0) --
++(0.3, -1.8, -2.7) --
++(-3.6, -0.6, -0.0) --
cycle}
\newcommand{\nullspacepicture}{
% bottom part of the row space line
\draw (0,0,0) -- (0.3,-1.8,1.233);
% five planes
\draw[fill=gray!20]\plane{-0.2};
\draw[fill=gray!20]\plane{0.2};
\draw[fill=blue!70!gray]\plane{0.6};
\draw[fill=gray!20]\plane{1};
\draw[fill=gray!20]\plane{1.4};
% top part of the row space line
\draw (-.094,.562,-.385) -- (-0.3,1.8,-1.233);
}
\newcommand{\rangepicture}[1]{
% axes
\draw[help lines,->] (-2,0) -- (2,0);
\draw[help lines,->] (0,-2) -- (0,2);
% the line and circles
\draw (1,-2) -- (-1,2);
\draw[fill=#1] (0,0) circle (2.5pt);
\draw[fill=gray!50] (0.2,-0.4) circle (2.5pt);
\draw[fill=gray!50] (0.4,-0.8) circle (2.5pt);
\draw[fill=gray!50] (-0.2,0.4) circle (2.5pt);
\draw[fill=gray!50] (-0.4,0.8) circle (2.5pt);
}

\begin{tikzpicture}[scale=0.95]
\centering
\nullspacepicture
% the label
\node at (-2,1.8) {$\mathbb{R}^3$};
% arrow between diagrams
\path[->] (3,0) edge[bend left] node[above] {linear transformation} (4.5,0);
\begin{scope}[xshift=7cm]
\rangepicture{blue!70!gray}
\node at (1.8,1.8) {$\mathbb{R}^2$};
\end{scope}
\end{tikzpicture}
\end{frame}


%%%------------------------------------------------------------------------------------------------------------
%%% Activation functions
\begin{frame}{Activation Functions}
\begin{itemize}
\item What activation functions really bring is the \alert{non-linear} transformation needed in practical problems
    \begin{itemize}
    \item The non-linear part provides the ability to fit arbitrary functions (more on this shortly)
    \end{itemize}
\end{itemize}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\draw [line width=3pt,ublue,-](0,0) -- (-2.0,1);
\node [anchor=north] (linelabel) at (-1.0,-0.5) {\footnotesize{I am a chopstick}};
\end{scope}

\begin{scope}[xshift=10em]
\draw [line width=3pt,ublue,-,line cap=round](0,0) .. controls (-0.5,-0.25) and (-0.5,1).. (-1.3,0.3) .. controls (-2.3,-0.3) and (-1.1,1.8).. (-2.0,1);
\node [] at (-2,1) {\white{$\cdot$}};
\node [anchor=north] (linelabel) at (-1.0,-0.5) {\footnotesize{I am an earthworm}};
\end{scope}
\end{tikzpicture}
\end{center}

\begin{itemize}
\item<2-> Some simple non-linear functions
\end{itemize}

\vspace{-1em}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}[]
\draw [->,thick] (-1.5,0) -- (1.5,0);
\draw [->,thick] (0,-0.1) -- (0,1.5);
\draw [-,very thick,ublue,domain=-1.2:1.2,samples=100] plot (\x,{0.5 * (\x -0.3)^2 + 0.2});
\node [anchor=west] (ylabel) at (0,1.3) {$y$};
\node [anchor=north] (xlabel) at (1.3,0) {$x$};
\node [anchor=north] (func) at (0,-0.8) {\footnotesize{$y = \frac{1}{2}  (x - 0.3)^2 + 0.2$}};
\node [anchor=south west] (flabel) at (func.north west) {\footnotesize{Quadratic:}};
\end{scope}

\begin{scope}[xshift=9.5em]
\draw [->,thick] (-1.5,0) -- (1.5,0);
\draw [->,thick] (0,-0.1) -- (0,1.5);
\draw [-,very thick,ublue,domain=-1.2:1.2,samples=100] plot (\x, {0.5 * exp(\x)});
\node [anchor=west] (ylabel) at (0,1.3) {$y$};
\node [anchor=north] (xlabel) at (1.3,0) {$x$};
\node [anchor=north] (func) at (0,-0.8) {\footnotesize{$y = 0.5 \cdot  \exp(x)$}};
\node [anchor=south west] (flabel) at ([xshift=-1.8em]func.north west) {\footnotesize{Exponential:}};
\end{scope}

\begin{scope}[xshift=19em]
\draw [->,thick] (-1.5,0) -- (1.5,0);
\draw [->,thick] (0,-0.1) -- (0,1.5);
\draw [-,very thick,ublue,domain=-1.1:1.2,samples=100] plot (\x,{abs(\x -0.2) + 0.1});
\node [anchor=west] (ylabel) at (0,1.3) {$y$};
\node [anchor=north] (xlabel) at (1.3,0) {$x$};
\node [anchor=north] (func) at (0,-0.8) {\footnotesize{$y = |x - 0.2| + 0.1$}};
\node [anchor=south west] (flabel) at ([xshift=-0.4em]func.north west) {\footnotesize{Absolute:}};
\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Common activation functions
\begin{frame}{Common Activation Functions}
    \begin{itemize}
    \item Many, many of them - this list is far from complete ... (written out as code on the next slide)
    \end{itemize}
    \vspace{-1em}
    \begin{figure}
    \subfigure[softplus]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {1.0,0.5}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1.2:1]plot(\x,{ln(1+exp(\x))});
        \node[black,anchor=south] at (0,1.2) {\small $y = \ln(1+e^x)$};
        \end{tikzpicture}
    \end{minipage}%
    }
    \hfill
    \subfigure[sigmoid]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \draw[dashed](-1.2,1)--(1.2,1);
        \foreach \x in {-1,-0.5,0,0.5,1}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){
            \pgfmathparse{(\x)*5}
            \pgfmathresult};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red,domain=-1.2:1.2]plot(\x,{1/(1+(exp(-5*\x)))});
        \node[black,anchor=south] at (0,1.2) {\small $y = \frac{1}{1+e^{-x}}$};
        \end{tikzpicture}
    \end{minipage}%
    }
    \hfill
    \subfigure[tanh]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \draw[dashed](-1.2,1)--(1.2,1);
        \draw[dashed](-1.2,-1)--(1.2,-1);
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1.2:1.2]plot(\x,{tanh(\x)});
        \node[black,anchor=south] at (0,1.2) {\small $y = \frac{e^{x}-e^{-x}}{e^{x}+e^{-x}}$};
        \end{tikzpicture}
    \end{minipage}
    }
    \end{figure}
    \vspace{-1em}
    \begin{figure}
    \subfigure[relu]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \draw[dashed](-1.2,1)--(1.2,1);
        \draw[dashed](-1.2,-1)--(1.2,-1);
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1.2:1.2]plot(\x,{max(\x,0)});
        \node[black,anchor=south] at (0,1.2) {\small $y =\max (0, x)$};
        \end{tikzpicture}
    \end{minipage}%
    }
    \hfill
    \subfigure[gaussian]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \draw[dashed](-1.2,1)--(1.2,1);
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1.2:1.2]plot(\x,{exp(-1*((\x)^2))});
        \node[black,anchor=south] at (0,1.2) {\small $y =e^{-x^2}$};
        \end{tikzpicture}
    \end{minipage}%
    }
    \hfill
    \subfigure[identity]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1:1]plot(\x,\x);
        \node[black,anchor=south] at (0,1.2) {\small $y =x$};
        \end{tikzpicture}
    \end{minipage}
    }
    \end{figure}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{多层神经网络}

%%%------------------------------------------------------------------------------------------------------------
%%% 一层 -> 多层
\begin{frame}{更多的层}
\begin{itemize}
\item \textbf{单层神经网络}:线性变换 + 激活函数(非线性)
\item 我们可以重复上面的过程,构建\textbf{多层神经网络}
\end{itemize}

\vspace{-1.0em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[]

\def\neuronsep{1.6}

\tikzstyle{neuronnode} = [minimum size=1.7em,circle,draw,ublue,very thick,inner sep=1pt, fill=white,align=center,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}]

%%% layer 1
\foreach \n in {1,...,5}{
    \node [neuronnode] (neuron0\n) at (\n * \neuronsep,0) {\tiny{$f_1$}\\[-1ex] \tiny{$\sum$}};
    \draw [-,ublue] (neuron0\n.east) -- (neuron0\n.west);
}

\foreach \n in {1,...,5}{
    \foreach \m in {1,...,5}{
        \draw [<-] (neuron0\m.south) -- ([yshift=-2em]neuron0\n.south);
    }
    \node [anchor=north] (x\n) at ([yshift=-2em]neuron0\n.south) {$x_\n$};
    \visible<1>{
    \draw [<-,thick] ([yshift=1.5em]neuron0\n.north) -- (neuron0\n.north);
    \node [anchor=south] (y\n) at ([yshift=1.5em]neuron0\n.north) {$y_\n$};
    }
}

\node [anchor=west] (w1label) at ([xshift=-0.5em,yshift=0.5em]x5.north east) {$\textbf{w}_1$};

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=red!20] [fit = (neuron01) (neuron05)] (layer01) {};
\end{pgfonlayer}

\node [anchor=west] (layer00label) at ([xshift=1.25em]x5.east) {\alert{输入层}};

\visible<2->{
\node [anchor=west] (layer01label) at ([xshift=1em]layer01.east) {第二层};
}
\visible<4->{
\node [anchor=west] (layer01label2) at (layer01label.east) {(\alert{隐层})};
}

%%% layer 2
\visible<2->{
\foreach \n in {2,...,4}{
    \node [neuronnode] (neuron1\n) at (\n * \neuronsep,4em) {\tiny{$f_2$}\\[-1ex] \tiny{$\sum$}};
    \draw [-,ublue] (neuron1\n.east) -- (neuron1\n.west);
}

\foreach \n in {2,...,4}{
    \foreach \m in {1,...,5}{
        \draw [<-] (neuron1\n.south) -- (neuron0\m.north);
    }
    \visible<2>{
    \draw [<-,thick] ([yshift=1.5em]neuron1\n.north) -- (neuron1\n.north);
    \node [anchor=south] (y\n) at ([yshift=1.5em]neuron1\n.north) {$y_\n$};
    }
}

\node [anchor=west] (w2label) at ([xshift=-2.5em,yshift=5.0em]x5.north east) {$\textbf{w}_2$};

\begin{pgfonlayer}{background}
\visible<2->{
\node [rectangle,inner sep=0.2em,fill=ugreen!20] [fit = (neuron12) (neuron14)] (layer02) {};
}
\end{pgfonlayer}

\node [anchor=west] (layer02label) at ([xshift=4.9em]layer02.east) {第三层};
\visible<4->{
\node [anchor=west] (layer02label2) at (layer02label.east) {(\alert{隐层})};
}
}

%%% layer 3
\visible<3->{
\foreach \n in {1,...,5}{
    \node [neuronnode] (neuron2\n) at (\n * \neuronsep,8em) {\tiny{$f_3$}\\[-1ex] \tiny{$\sum$}};
    \draw [-,ublue] (neuron2\n.east) -- (neuron2\n.west);
}

\foreach \n in {1,...,5}{
    \foreach \m in {2,...,4}{
        \draw [<-] (neuron2\n.south) -- (neuron1\m.north);
    }

    \node [anchor=south] (y\n) at ([yshift=1.5em]neuron2\n.north) {$y_\n$};
    \draw [<-,thick] ([yshift=1.5em]neuron2\n.north) -- (neuron2\n.north);
}

\node [anchor=west] (w3label) at ([xshift=-2.5em,yshift=8.5em]x5.north east) {$\textbf{w}_3$};

\begin{pgfonlayer}{background}
\visible<3->{
\node [rectangle,inner sep=0.2em,fill=blue!20] [fit = (neuron21) (neuron25)] (layer03) {};
}
\end{pgfonlayer}

\node [anchor=west] (layer03label) at ([xshift=1em]layer03.east) {第四层};
\visible<4->{
\node [anchor=west] (layer03label2) at (layer03label.east) {(\alert{输出层})};
}
}

\end{scope}
\end{tikzpicture}
\end{center}
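\visible<3->{
\begin{itemize}
\item 整个网络等价于函数的逐层复合(示意,偏置项略去):
\vspace{-0.5em}
\begin{displaymath}
\textbf{y} = f_3 \Big( f_2 \big( f_1 ( \textbf{x} \cdot \textbf{w}_1 ) \cdot \textbf{w}_2 \big) \cdot \textbf{w}_3 \Big)
\end{displaymath}
\end{itemize}
}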

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 两层神经网络可以逼近任何函数
\begin{frame}{多层神经网络可以逼近任意函数}
\begin{itemize}
\item 以一个简单的三层网络为例(隐层激活函数:sigmoid)
\end{itemize}

\begin{center}
\begin{tikzpicture}

%% a two-layer neural network
\begin{scope}
\tikzstyle{neuronnode} = [minimum size=1.7em,circle,draw,ublue,very thick,inner sep=1pt, fill=white,align=center,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}]

%% input and hidden layers
\node [neuronnode] (n10) at (0,0) {\tiny{$f$}\\[-1ex] \tiny{$\sum$}};
\node [neuronnode] (n11) at (1.5,0) {\tiny{$f$}\\[-1ex] \tiny{$\sum$}};
\draw [-,ublue] (n10.west) -- (n10.east);
\draw [-,ublue] (n11.west) -- (n11.east);
\node [anchor=north] (x1) at ([yshift=-6em]n11.south) {$x_1$};
\node [anchor=north] (b) at ([yshift=-6em]n10.south) {$b$};
\visible<1-10>{
\draw [->,thick] (b.north) -- ([yshift=-0.1em]n10.south);
\draw [->,thick] (x1.north) -- ([yshift=-0.1em]n10.290);
}
\visible<1>{
\draw [->,thick] (b.north) -- ([yshift=-0.1em]n11.250);
\draw [->,thick] (x1.north) -- ([yshift=-0.1em]n11.south);
}

\visible<11->{
\draw [->,thick,red] (b.north) -- ([yshift=-0.1em]n10.south);
\draw [->,thick,ugreen] (x1.north) -- ([yshift=-0.1em]n10.290);
}

\visible<2->{
\draw [->,thick,blue] (b.north) -- ([yshift=-0.1em]n11.250);
\draw [->,thick,purple] (x1.north) -- ([yshift=-0.1em]n11.south);
}

\visible<15->{
\node [neuronnode] (n12) at (2.7,0) {\tiny{$f$}\\[-1ex] \tiny{$\sum$}};
\node [neuronnode] (n13) at (3.8,0) {\tiny{$f$}\\[-1ex] \tiny{$\sum$}};
\draw [-,ublue] (n12.west) -- (n12.east);
\draw [-,ublue] (n13.west) -- (n13.east);
\draw [->,thick] (b.north) -- ([yshift=-0.1em]n12.250);
\draw [->,thick] (x1.north) -- ([yshift=-0.1em]n12.270);
\draw [->,thick] (b.north) -- ([yshift=-0.1em]n13.230);
\draw [->,thick] (x1.north) -- ([yshift=-0.1em]n13.250);
}

\visible<16->{
\node [anchor=west] (morenodes) at (n13.east) {...};
}

%% output layers
\node [neuronnode] (n20) at (0.75,5em) {\scriptsize{$\sum$}};
\visible<1-10>{\draw [->,thick] ([yshift=0.1em]n10.north) -- ([yshift=-0.1em]n20.250);}
\visible<1-8>{\draw [->,thick] ([yshift=0.1em]n11.north) -- ([yshift=-0.1em]n20.290);}

\visible<11->{\draw [->,thick,brown] ([yshift=0.1em]n10.north) -- ([yshift=-0.1em]n20.250);}
\visible<9->{\draw [->,thick,orange] ([yshift=0.1em]n11.north) -- ([yshift=-0.1em]n20.290);}

\node [] (y) at ([yshift=3em]n20.north) {$y$};
\draw [->,thick] ([yshift=0.1em]n20.north) -- (y.south);

\visible<15->{
\draw [->,thick] ([yshift=0.1em]n12.north) -- ([yshift=-0.1em]n20.310);
\draw [->,thick] ([yshift=0.1em]n13.north) -- ([yshift=-0.1em]n20.330);
}

%% weight and bias
\visible<11->{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (b0) at ([yshift=3em,xshift=-0.5em]b.north) {\tiny{$b=-6$}};}
\visible<11->{\node [anchor=center,rotate=-59,fill=white,inner sep=1pt] (w2) at ([yshift=1.2em,xshift=-1.2em]x1.north) {\tiny{$w=100$}};}

\visible<2-6>{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=4.9em,xshift=2.2em]b.north) {\tiny{$b=0$}};}
\visible<7>{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=4.9em,xshift=2.2em]b.north) {\tiny{$b=-2$}};}
\visible<8->{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=4.9em,xshift=2.2em]b.north) {\tiny{$b=-4$}};}
\visible<2-4>{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w=1$}};}
\visible<5>{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w=10$}};}
\visible<6->{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w=100$}};}

\visible<11>{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=2em,xshift=0.5em]n10.north) {\tiny{$w'=0.7$}};}
\visible<12->{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=2em,xshift=0.5em]n10.north) {\tiny{$w'=-0.7$}};}

\visible<2-8>{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=2em,xshift=-0.5em]n11.north) {\tiny{$w'=1$}};}
\visible<9>{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=2em,xshift=-0.5em]n11.north) {\tiny{$w'=0.9$}};}
\visible<10->{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=2em,xshift=-0.5em]n11.north) {\tiny{$w'=0.7$}};}


%% sigmoid box
\begin{scope}
\visible<3->{
\node [anchor=west] (flabel) at ([xshift=1.2in]y.east) {\footnotesize{sigmoid:}};
\node [anchor=north east] (slabel) at ([xshift=0]flabel.south east) {\footnotesize{sum:}};

\node [anchor=west,inner sep=2pt] (flabel2) at (flabel.east) {\footnotesize{$f(s)=1/(1+e^{-s})$}};
\node [anchor=west,inner sep=2pt] (flabel3) at (slabel.east) {\footnotesize{$s=x_1 \cdot w + b$}};
\draw [->,thick,dotted] ([yshift=-0.3em,xshift=-0.1em]n11.60)  .. controls +(east:1) and +(west:2) ..  ([xshift=-0.2em]flabel.west) ;

\begin{pgfonlayer}{background}
\visible<3->{
\node [rectangle,inner sep=0.2em,fill=blue!20,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] [fit = (flabel) (flabel2) (flabel3)] (funcbox) {};
}
\end{pgfonlayer}
}
\end{scope}

%% output illustration
\begin{scope}[xshift=2.8in,yshift=0.1in]
\visible<4->{
\draw [->,thick] (-2.2,0) -- (2.2,0);
\draw [->,thick] (0,0) -- (0,2);
\draw [-] (-0.05,1) -- (0.05,1);
\node [anchor=east,inner sep=1pt] (label1) at (0,1) {\tiny{1}};
\node [anchor=south east,inner sep=1pt] (label2) at (0,0) {\tiny{0}};
}

\visible<4>{\draw [-,very thick,ublue,domain=-2:2,samples=100] plot (\x,{1/(1+exp(-2*\x))});}
\visible<5>{\draw [-,very thick,ublue,domain=-2:2,samples=100] plot (\x,{1/(1+exp(-4*\x))});}
\visible<6>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0,0) -- (0,1) -- (2,1);}
\visible<7>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.25,0) -- (0.25,1) -- (2,1);}
\visible<8>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,1) -- (2,1);}
\visible<9>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,0.9) -- (2,0.9);}
\visible<10>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,0.7) -- (2,0.7);}
\visible<11>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,0.7) -- (0.7,0.7) -- (0.7,1.4) -- (2,1.4);}
\visible<12->{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,0.7) -- (0.7,0.7) -- (0.7,0) -- (2,0);}
\visible<15->{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.7,0) -- (0.7,0.6) -- (0.9,0.6) -- (0.9,0) -- (2,0);}

\visible<14>{\draw [->,dashed] (0.6,-0.05) -- (0.6,-0.96in);}
\visible<15->{\draw [->,dashed] (0.8,-0.05) -- (0.8,-0.98in);}

\visible<4>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\scriptsize{$w_1=1$}\\[-0ex] \scriptsize{\ $b_1=0$}};}
\visible<5>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w_1=10$}}\\[-0ex] \scriptsize{\ $b_1=0$}};}
\visible<6>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w_1=100$}}\\[-0ex] \scriptsize{\ $b_1=0$}};}
\visible<7>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\scriptsize{$w_1=100$}\\[-0ex] \alert{\scriptsize{\ $b_1=-2$}}};}
\visible<8>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\scriptsize{$w_1=100$}\\[-0ex] \alert{\scriptsize{\ $b_1=-4$}}};}
\visible<9>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w'_1=0.9$}}};}
\visible<10>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w'_1=0.7$}}};}
\visible<11>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w_2=100$}}\\[-0ex] \alert{\scriptsize{\ $b_2=-6$}}\\[-0ex] \alert{\scriptsize{\ $w'_2=0.7$}}};}
\visible<12>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\scriptsize{$w_2=100$}\\[-0ex] \scriptsize{\ $b_2=-6$}\\[-0ex] \alert{\scriptsize{\ $w'_2=-0.7$}}};}
\visible<13->{\node [anchor=north west,align=left] (wblabel) at (-2.5,2) {\scriptsize{这是一个}\\[-1ex] \scriptsize{step function}};}
\end{scope}

\begin{scope}[xshift=2.8in,yshift=-1.2in]

\visible<13->{
\draw [->,thick] (-2.2,0) -- (2.2,0);
\draw [->,thick] (0,0) -- (0,2);
\draw [-,very thick,red,domain=-1.98:2,samples=100] plot (\x,{0.2 * (\x +0.4)^3 + 1.2 - 0.3 *(\x + 0.8)^2});
}

\visible<14->{
\foreach \n in {0.5}{
    \pgfmathsetmacro{\result}{0.2 * (\n + 0.1 + 0.4)^3 + 1.2 - 0.3 *(\n + 0.1 + 0.8)^2};
    \draw [-,ublue,thick] (\n,0) -- (\n, \result) -- (\n + 0.2, \result) -- (\n + 0.2, 0);
}
}

\visible<15->{
\foreach \n in {0.7}{
    \pgfmathsetmacro{\result}{0.2 * (\n + 0.1 + 0.4)^3 + 1.2 - 0.3 *(\n + 0.1 + 0.8)^2};
    \draw [-,ublue,thick] (\n,0) -- (\n, \result) -- (\n + 0.2, \result) -- (\n + 0.2, 0);
}
}

\visible<16->{
\foreach \n in {-1.9,-1.7,...,1.9}{
    \pgfmathsetmacro{\result}{0.2 * (\n + 0.1 + 0.4)^3 + 1.2 - 0.3 *(\n + 0.1 + 0.8)^2};
    \draw [-,ublue,thick] (\n,0) -- (\n, \result) -- (\n + 0.2, \result) -- (\n + 0.2, 0);
}
}

\visible<14>{\node [anchor=north west,align=left] (wblabel) at (-2.5,2.5) {\scriptsize{函数的每一段都可}\\[-1ex] \scriptsize{由step function}\\[-1ex] \scriptsize{近似}};}
\visible<15>{\node [anchor=north west,align=left] (wblabel) at (-2.5,2.5) {\scriptsize{增加隐层神经元}\\[-1ex] \scriptsize{可以拟合更多的}\\[-1ex] \scriptsize{部分}};}
\visible<16>{\node [anchor=north west,align=left] (wblabel) at (-2.5,2.5) {\scriptsize{理论上足够多的}\\[-1ex] \scriptsize{隐层神经元可以}\\[-1ex] \scriptsize{拟合\alert{任意函数}}};}

\end{scope}

\end{scope}

\end{tikzpicture}
\end{center}
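\visible<13->{
\begin{itemize}
\item ``台阶''的构造(示意):两个陡峭的sigmoid单元相减,如$w=100$、$b_1=-4$、$b_2=-6$时
\vspace{-0.5em}
\begin{displaymath}
y = w' \cdot \big[ f(w \cdot x + b_1) - f(w \cdot x + b_2) \big]
\end{displaymath}
仅在$x \in (-b_1/w, -b_2/w)$上输出约为$w'$,其余位置约为$0$
\end{itemize}
}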
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{神经网络的简单实现 - 张量计算}

%%%------------------------------------------------------------------------------------------------------------
%%% outline: problem 2
\begin{frame}{然后}

\vspace{6em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{人工神经网络的数学描述是什么,}

\vspace{0.4em}
\textbf{如何编程实现这种数学模型?}
}
\end{tcolorbox}

\vspace{1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[yshift=6.5em,xshift=1em]
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=orange,thick] (-1,-1) grid (0.5,0.5);
\foreach \y in {+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount1}};
    \addtocounter{mycount1}{1};
  }
\end{scope}

\begin{scope}[yshift=6em,xshift=0.5em]
\setcounter{mycount2}{2}
\draw[step=0.5cm,color=blue,thick] (-1,-1) grid (0.5,0.5);
\foreach \y in {+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25}{
    \node [fill=blue!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount2}};
    \addtocounter{mycount2}{1};
  }
\end{scope}

\begin{scope}[yshift=5.5em,xshift=0em]
\setcounter{mycount3}{3}
\draw[step=0.5cm,color=ugreen,thick] (-1,-1) grid (0.5,0.5);
\foreach \y in {+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount3}};
    \addtocounter{mycount3}{1};
  }
\end{scope}

\begin{scope}[yshift=5em,xshift=-0.5em]
\setcounter{mycount4}{4}
\draw[step=0.5cm,color=red,thick] (-1,-1) grid (0.5,0.5);
\foreach \y in {+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount4}};
    \addtocounter{mycount4}{1};
  }
\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量
\begin{frame}{如何描述神经网络 - 张量计算}
\begin{itemize}
\item 对于神经网络,输入$\textbf{x}$和输出$\textbf{y}$的形式并不仅仅是向量
\end{itemize}

\begin{center}
\begin{tikzpicture}

\node [anchor=center] (y) at (0,0) {\LARGE{$\textbf{y}$}};
\node [anchor=west] (eq) at (y.east) {\LARGE{$=$}};
\node [anchor=west] (func) at (eq.east) {\LARGE{$f$}};
\node [anchor=west] (brace01) at (func.east) {\LARGE{$($}};
\node [anchor=west] (x) at (brace01.east) {\LARGE{$\textbf{x}$}};
\node [anchor=west] (dot) at (x.east) {\LARGE{$\cdot$}};
\node [anchor=west] (w) at (dot.east) {\LARGE{$\textbf{w}$}};
\node [anchor=west] (plus) at (w.east) {\LARGE{$+$}};
\node [anchor=west] (b) at (plus.east) {\LARGE{$\textbf{b}$}};
\node [anchor=west] (brace02) at (b.east) {\LARGE{$)$}};

\visible<2->{
\node [anchor=center,fill=yellow!30] (x2) at (x) {\LARGE{$\textbf{x}$}};
\node [anchor=south] (xlabel) at ([xshift=-3em,yshift=1.5em]x.north) {\alert{向量?矩阵?...}};
\draw [<-] ([yshift=0.2em,xshift=-0.5em]x2.north) -- ([xshift=1em]xlabel.south);

\node [anchor=center,fill=red!20] (y2) at (y) {\LARGE{$\textbf{y}$}};
\draw [<-] ([yshift=0.2em,xshift=0.5em]y2.north) -- ([xshift=-1em]xlabel.south);

\node [anchor=center,fill=green!20] (w2) at (w) {\LARGE{$\textbf{w}$}};
\node [anchor=north] (wlabel) at ([yshift=-1.0em]w.south) {矩阵 e.g.,};
\draw [<-] ([yshift=-0.2em]w2.south) -- (wlabel.north);
\node [anchor=west] (wsample) at ([xshift=-0.5em]wlabel.east) {\footnotesize{$\left(\begin{array}{c c} 1 & 2 \\ 3 & 4 \end{array}\right)$}};

\node [anchor=center,fill=purple!20] (b2) at (b) {\LARGE{$\textbf{b}$}};
\node [anchor=south] (blabel) at ([yshift=1.3em]b.north) {向量 e.g.,};
\draw [<-] ([yshift=0.2em]b2.north) -- (blabel.south);
\node [anchor=west] (bsample) at ([xshift=-0.5em]blabel.east) {\footnotesize{$(1, 3)$}};
}

\end{tikzpicture}
\end{center}

\begin{itemize}
\item<3-> $\textbf{x}$和$\textbf{y}$实际上就是\textbf{张量}(tensor)。比如,
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\visible<4->{\node [anchor=west] (vector) at (0,0) {$\textbf{x} = (1,  3)$};}
\visible<5->{\node [anchor=west] (matrix) at ([xshift=0.1in]vector.east) {$\textbf{x} = \left(\begin{array}{c c} -1 & 3 \\ 0.2 & 2 \end{array}\right)$};}
\visible<6->{\node [anchor=west] (tensor3d) at ([xshift=0.1in]matrix.east) {啥?$\textbf{x} = \left(\begin{array}{c} \left(\begin{array}{c c} -1 & 3 \\ 0.2 & 2 \end{array}\right) \\ \left(\begin{array}{c c} -1 & 3 \\ 0.2 & 2 \end{array}\right) \end{array}\right)$};}
\end{scope}
\end{tikzpicture}
\end{center}
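\visible<4->{
\begin{itemize}
\item 用上面的示例值验证一下(取$\textbf{x}=(1,3)$,暂不考虑$f$):
\vspace{-0.5em}
\begin{displaymath}
\textbf{x} \cdot \textbf{w} + \textbf{b} = (1,3) \left(\begin{array}{c c} 1 & 2 \\ 3 & 4 \end{array}\right) + (1,3) = (10,14) + (1,3) = (11,17)
\end{displaymath}
\end{itemize}
}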

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量的简单定义
\begin{frame}{张量是什么}
\begin{itemize}
\item \textbf{深度学习}中,张量被``简单''地定义为\alert{多维数组}
    \begin{itemize}
    \item 张量的阶(rank)表示有多少个独立的方向,每个方向可以由多个维度表示
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}

\begin{scope}
\visible<2->{
\node [anchor=north] (label) at (0,0) {标量};
\node [anchor=center] (label2) at ([yshift=-0.7em]label.south) {scalar};
\node [anchor=center] (rank) at ([yshift=-1.5em]label2.center) {(rank=0)};
\node [anchor=center] (scalar) at ([yshift=5em]label.north) {\Huge{3}};
}
\end{scope}

\begin{scope}[xshift=1in]
\visible<3->{
\node [anchor=north] (label) at (0,0) {向量};
\node [anchor=center] (label2) at ([yshift=-0.7em]label.south) {vector};
\node [anchor=center] (rank) at ([yshift=-1.5em]label2.center) {(rank=1)};
\node [anchor=center] (scalar) at ([yshift=5em]label.north) {$\begin{pmatrix} 2 \\ .3 \\ -8 \\ .2\end{pmatrix}$};
}
\end{scope}

\begin{scope}[xshift=2in]
\visible<4->{
\node [anchor=north] (label) at (0,0) {矩阵};
\node [anchor=center] (label2) at ([yshift=-0.7em]label.south) {matrix};
\node [anchor=center] (rank) at ([yshift=-1.5em]label2.center) {(rank=2)};
\node [anchor=center] (scalar) at ([yshift=5em]label.north) {$\begin{pmatrix} 1 & 1 & 9 \\ 1 & 0 & 0 \\ 1 & -4 & 7 \end{pmatrix}$};
}
\end{scope}

\begin{scope}[xshift=3.2in]
\visible<5->{
\node [anchor=north] (label) at (0,0) {3阶张量};
\node [anchor=center] (label2) at ([yshift=-0.7em]label.south) {tensor};
\node [anchor=center] (rank) at ([yshift=-1.5em]label2.center) {(rank=3)};
}
\begin{scope}[yshift=6.5em,xshift=1em]
\visible<5->{
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=orange,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount1}};
    \addtocounter{mycount1}{1};
  }
}
\end{scope}

\begin{scope}[yshift=6em,xshift=0.5em]
\visible<5->{
\setcounter{mycount2}{1}
\draw[step=0.5cm,color=blue,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=blue!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount2}};
    \addtocounter{mycount2}{1};
  }
}
\end{scope}

\begin{scope}[yshift=5.5em,xshift=0em]
\visible<5->{
\setcounter{mycount3}{1}
\draw[step=0.5cm,color=ugreen,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount3}};
    \addtocounter{mycount3}{1};
  }
}
\end{scope}

\begin{scope}[yshift=5em,xshift=-0.5em]
\visible<5->{
\setcounter{mycount4}{1}
\draw[step=0.5cm,color=red,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount4}};
    \addtocounter{mycount4}{1};
  }
}
\end{scope}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量是一个多维线性函数
\begin{frame}{事实上,张量不是简单的多维数组 - 别慌,了解下 :)}
\begin{itemize}
\item \textbf{非常负责任地说},张量\alert{不是}向量和矩阵的简单扩展,甚至可以说,多维数组\alert{也不是}张量所必需的表达形式
\item<2-> 严格意义上,张量是:
    \begin{enumerate}
    \item<2-> \textbf{看不懂的定义}:在坐标系改变时,各分量满足一定坐标转化关系的抽象对象,它是一个不随参照系的坐标变换而变化的几何量(几何定义)
    \item<3-> \textbf{还是看不懂的定义}:若干向量和协向量通过张量乘法定义的量(代数定义)
    \item<4-> \textbf{还可以解释的定义}\alert{张量是多重线性函数},是定义在一些向量空间和笛卡儿积上的多重线性映射
        \begin{itemize}
        \item 张量记为$T(v_0,...,v_r)$,其中输入是$r+1$个向量$\{v_0,...,v_r\}$
        \item 多重线性是指,对于每个输入,函数都是线性的,比如,对于一个$v_i$,我们有
        \vspace{-0.3em}
        \begin{displaymath}
        T(v_0,...,v_i+c \cdot u,...,v_r) = T(v_0,...,v_i,...,v_r) + c \cdot T(v_0,...,u,...,v_r)
        \end{displaymath}
        其中,$c$为任意数。这个性质非常重要,它可以推导出前面的其它定义。
        \end{itemize}
    \end{enumerate}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 进一步解释一下张量的定义
\begin{frame}{张量是一个``量'',不是``矩阵''}
\begin{itemize}
\item 再理解一下,
    \begin{itemize}
    \item 如果一个物理量,在物体的某个位置上只是一个单值,它就是标量,比如密度
    \item 如果它在同一个位置、从多个方向上看有不同的值,而且这些值恰好可以用``矩阵乘以观察方向''计算出来,它就是张量(rank$>$1),比如应力张量
    \end{itemize}
\end{itemize}

\vspace{-0.8em}
\begin{center}
\tdplotsetmaincoords{50}{140}
\begin{tikzpicture}[scale=2,tdplot_main_coords]
\visible<3->{
\draw[thick,->] (0,0,0) -- (1,0,0) node[anchor=north east]{$a$};
\draw[thick,->] (0,0,0) -- (0,1,0) node[anchor=north west]{$b$};
\draw[thick,->] (0,0,0) -- (0,0,1) node[anchor=south]{$c$};
}
\pgfmathsetmacro{\ax}{2}
\pgfmathsetmacro{\ay}{2}
\pgfmathsetmacro{\az}{1}
\tdplotsetrotatedcoords{20}{40}{00}
\visible<4->{
\draw[thick,color=red,tdplot_rotated_coords,->] (0,0,0)
        -- (.7,0,0) node[anchor=east]{$a'$};
\draw[thick,color=green!50!black,tdplot_rotated_coords,->] (0,0,0)
        -- (0,.7,0) node[anchor=west]{$b'$};
\draw[thick,color=blue,tdplot_rotated_coords,->] (0,0,0)
        -- (0,0,.7) node[anchor=south]{$c'$};
}
\tdplottransformmainrot{\ax}{\ay}{\az}

\visible<3->{\node [anchor=west,inner sep=2pt] (coord1) at (-0.40in,-0.4in) {\footnotesize{方向$v=(a,b,c)$}};}
\visible<4->{\node [anchor=north west,inner sep=2pt] (coord2) at (coord1.south west) {\footnotesize{方向$u=(\red{a'}\black{,}{\color{ugreen} b'}\black{,}\blue{c'}\black{)}$}};}

\begin{scope}[xshift=0.4in,yshift=0.35in]
\visible<2->{
\node [anchor=west,inner sep = 2pt] (description) at (0,0) {\small{$T(v,u)$是一个三维空间$(x,y,z)$上的}};
\node [anchor=north west,inner sep = 2pt] (description2) at (description.south west) {\small{2阶张量,其中$v$$u$是两个向量}};
}

\visible<5->{
\node [anchor=north west,inner sep=2pt] (T) at ([yshift=-2em]description2.south west) {\small{$T(v,u)=$}};
\node [anchor=west,inner sep=1pt] (T2) at (T.east) {\footnotesize{$\begin{pmatrix} v_x \\ v_y \\ v_z \end{pmatrix}^T$}};
\node [anchor=west,inner sep=1pt] (T3) at ([xshift=2pt]T2.east) {\footnotesize{$\begin{pmatrix} T_{xx} & T_{xy} & T_{xz} \\ T_{yx} & T_{yy} & T_{yz} \\ T_{zx} & T_{zy} & T_{zz} \end{pmatrix}$}};
\node [anchor=west,inner sep=1pt] (T4) at ([xshift=2pt]T3.east) {\footnotesize{$\begin{pmatrix} u_x \\ u_y \\ u_z \end{pmatrix}$}};
}
\begin{pgfonlayer}{background}
\visible<7->{
\node [rectangle,inner sep=0pt,fill=red!20,minimum height=3.5em,minimum width=7em] [fit = (T3) ] (TBox) {};
}
\visible<6->{
\node [rectangle,inner sep=0pt,fill=green!20,minimum height=3.5em,minimum width=3em] [fit = (T2) ] (VBox) {};
\node [rectangle,inner sep=0pt,fill=blue!20,minimum height=3.5em,minimum width=2.5em] [fit = (T4) ] (UBox) {};
}
\end{pgfonlayer}

\visible<6->{
\draw [<-] (VBox.north) -- ([yshift=0.3em]VBox.north);
\node [anchor=south,align=left] (Vlabel) at ([yshift=0.3em]VBox.north) {\scriptsize{$v$在基向量上的投影}};
\draw [<-] (UBox.north) -- ([yshift=0.3em]UBox.north);
\node [anchor=south,align=left] (Ulabel) at ([yshift=0.3em,xshift=-1em]UBox.north) {\scriptsize{$u$在基向量上的投影}};
}
\visible<7->{
\draw [<-] (TBox.south) -- ([yshift=-0.3em]TBox.south);
\node [anchor=north,align=left] (Vlabel) at ([xshift=-0.5em,yshift=-0.3em]TBox.south) {\scriptsize{张量在$3 \times 3$个方向上的分量,恰巧用``矩阵''表示,}};
\node [anchor=north west,align=left] (Vlabel2) at ([yshift=0.2em]Vlabel.south west) {\scriptsize{记为$[T]$,想象一下坐标系的旋转}};
}
\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何在深度学习中定义一个张量
\begin{frame}{``张量''是``矩阵''的扩展:在神经网络中使用张量}
\begin{itemize}
\item 但是前面的可以忽略 - 在这里,``\alert{张量就是多维数组}''
    \begin{itemize}
    \item 向量、矩阵都可以看做这种``张量''的特殊形式(低阶张量)
    \end{itemize}

\item<2-> 张量$T(1:3)$表示一个向量,有三个元素\\
\vspace{0.5em}
\begin{tikzpicture}
\begin{scope}
\node [anchor=north east, inner sep=1pt] (label) at (0,0) {物理存储:};
\draw[step=0.5cm,thick] (0,-0.5) grid (1.5,0);
\setcounter{mycount1}{1}
\foreach \x in {0.25,0.75,1.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{1};
}
\end{scope}
\end{tikzpicture}

\item<3-> 张量$T(1:2,1:3)$表示一个$3 \times 2$的矩阵\\
\vspace{0.5em}
\begin{tikzpicture}
\begin{scope}
\node [anchor=north east, inner sep=1pt] (label) at (0,0) {物理存储:};
\draw[step=0.5cm,thick] (0,-0.5) grid (3.0,0);
\setcounter{mycount2}{1}
\foreach \x in {0.25,0.75,1.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount2}$};
    \addtocounter{mycount2}{1};
}
\foreach \x in {1.75,2.25,2.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount2}$};
    \addtocounter{mycount2}{1};
}
\end{scope}
\end{tikzpicture}

\item<4-> 张量$T(1:2,1:2,1:3)$表示一个三阶张量,大小是$3 \times 2 \times 2$\\
\vspace{0.5em}
\begin{tikzpicture}
\begin{scope}
\node [anchor=north east, inner sep=1pt] (label) at (0,0) {物理存储:};
\draw[step=0.5cm,thick] (0,-0.5) grid (6.0,0);
\setcounter{mycount3}{1}
\foreach \x in {0.25,0.75,1.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
}
\foreach \x in {1.75,2.25,2.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
}
\foreach \x in {3.25,3.75,4.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
}
\foreach \x in {4.75,5.25,5.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
}
\draw[decorate,thick,decoration={brace,mirror,raise=0.2em}] (0,-0.50) -- (2.95,-0.50);
\draw[decorate,thick,decoration={brace,mirror,raise=0.2em}] (3.05,-0.50) -- (6,-0.50);
\node [anchor=north] (subtensor1) at (1.5,-0.6) {\footnotesize{$3 \times 2$ sub-tensor}};
\node [anchor=north] (subtensor1) at (4.5,-0.6) {\footnotesize{$3 \times 2$ sub-tensor}};

\end{scope}
\end{tikzpicture}

\item<5-> 高阶张量:数组!数组!数组!
    \begin{itemize}
    \item 和C++、Python中的多维数组一模一样
    \end{itemize}
\end{itemize}
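\visible<4->{
\begin{itemize}
\item 一个存储下标的小示意(这里假设按行优先方式连续存储,即最后一维在内存中相邻):
\end{itemize}
\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{// 张量T(1:2,1:2,1:3)中元素(i,j,k)的物理下标(下标从0开始,示意)} \\
\texttt{int offset(int i, int j, int k) \{} \hspace{4em} \= \\
\texttt{\ \ \ \ return (i * 2 + j) * 3 + k;} \> // 行优先:最后一维连续 \\
\texttt{\}}
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}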
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量矩阵乘法
\begin{frame}{张量的矩阵乘法}
\begin{itemize}
\item 对于神经网络$\textbf{y}=f(\textbf{x}\cdot \textbf{w} + \textbf{b})$$\textbf{x} \cdot \textbf{w}$$\textbf{x} \times \textbf{w}$是线性变换,其中$\textbf{x}$是输入张量,$\textbf{w}$是一个矩阵
    \begin{itemize}
    \item $\textbf{x} \cdot \textbf{w}$表示的是矩阵乘法(或记为$\times$
    \item 注意,这里不是张量乘法,因为张量乘法还有其它定义
    \item $\textbf{w}$$n \times m$的矩阵,$\textbf{x}$的形状是$... \times n$,即$\textbf{x}$的第一维度需要和$\textbf{w}$的行数大小相等\\
    \vspace{0.5em}
    $\textbf{x}(1:4,1:4,\alert{1:4}) \times \textbf{w}(\alert{1:4},1:2) = \textbf{s}(1:4,1:4,1:2)$
    \end{itemize}
\end{itemize}
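\begin{itemize}
\item 也就是把$\textbf{x}$看作4个$4 \times 4$的矩阵切片,分别与$\textbf{w}$做矩阵乘(如下图):$\textbf{s}(i,:,:) = \textbf{x}(i,:,:) \cdot \textbf{w}$
\end{itemize}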

\begin{center}
\begin{tikzpicture}

\begin{scope}[yshift=6.5em,xshift=1em]
\visible<2->{
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=orange,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{1};
  }
}
\end{scope}

\begin{scope}[yshift=6em,xshift=0.5em]
\visible<2->{
\setcounter{mycount2}{2}
\draw[step=0.5cm,color=blue,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=blue!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount2}$};
    \addtocounter{mycount2}{1};
  }
}
\end{scope}

\begin{scope}[yshift=5.5em,xshift=0em]
\visible<2->{
\setcounter{mycount3}{3}
\draw[step=0.5cm,color=ugreen,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
  }
}
\end{scope}

\begin{scope}[yshift=5em,xshift=-0.5em]
\visible<2->{
\setcounter{mycount4}{4}
\draw[step=0.5cm,color=red,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount4}$};
    \addtocounter{mycount4}{1};
  }
\node [anchor=north] (xlabel) at (0,-1.2) {$\textbf{x}$};
}
\end{scope}

\begin{scope}[yshift=5em,xshift=1.5in]
\visible<2->{
\draw[step=0.5cm,thick] (-0.5,-1) grid (0.5,1.0);
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (-0.25,0.75) {\small{$-1$}};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (-0.25,0.25) {$0$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (-0.25,-0.25) {$1$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (-0.25,-0.75) {$0$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (0.25,0.75) {$0$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (0.25,0.25) {\small{$-1$}};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (0.25,-0.25) {$1$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (0.25,-0.75) {$0$};
\node [anchor=north] (xlabel) at (0,-1.2) {$\textbf{w}$};
}

\visible<3>{\draw [->,thick,dashed] (-1.5in+2em+1.5em,-0.3) .. controls +(east:2) and +(west:1) .. (-0.55,0.8) node [pos=0.5,left] {\scriptsize{\textbf{矩阵乘}}};}
\visible<4>{\draw [->,thick,dashed] (-1.5in+2em+1.0em,-0.5) .. controls +(east:2) and +(west:1) .. (-0.55,0.8) node [pos=0.5,left] {\scriptsize{\textbf{矩阵乘}}};}
\visible<5>{\draw [->,thick,dashed] (-1.5in+2em+0.5em,-0.7) .. controls +(east:2.5) and +(west:1) .. (-0.55,0.8) node [pos=0.5,left] {\scriptsize{\textbf{矩阵乘}}};}
\visible<6->{\draw [->,thick,dashed] (-1.5in+2em,-0.9) .. controls +(east:3) and +(west:1) .. (-0.55,0.8) node [pos=0.5,left] {\scriptsize{\textbf{矩阵乘}}};}
\end{scope}

\begin{scope}[yshift=6.5em,xshift=1em+3in]
\visible<3->{
\draw[step=0.5cm,color=orange,thick] (-0.5,-1) grid (0.5,1.0);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}{
  \setcounter{mycount1}{2}
  \foreach \x in {-0.25,0.25}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{-1};
  }
}
}
\end{scope}

\begin{scope}[yshift=6em,xshift=0.5em+3in]
\visible<4->{
\draw[step=0.5cm,color=blue,thick] (-0.5,-1) grid (0.5,1.0);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}{
  \setcounter{mycount1}{2}
  \foreach \x in {-0.25,0.25}{
    \node [fill=blue!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{-1};
  }
}
}
\end{scope}

\begin{scope}[yshift=5.5em,xshift=0em+3in]
\visible<5->{
\draw[step=0.5cm,color=ugreen,thick] (-0.5,-1) grid (0.5,1.0);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}{
  \setcounter{mycount1}{2}
  \foreach \x in {-0.25,0.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{-1};
  }
}
}
\end{scope}

\begin{scope}[yshift=5.0em,xshift=-0.5em+3in]
\visible<6->{
\draw[step=0.5cm,color=red,thick] (-0.5,-1) grid (0.5,1.0);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}{
  \setcounter{mycount1}{2}
  \foreach \x in {-0.25,0.25}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{-1};
  }
}
}

\visible<3->{
\node [anchor=north] (xlabel) at (0,-1.2) {$\textbf{x} \cdot \textbf{w}$};
\node [anchor=center] (elabel) at (-0.7in,0) {\Huge{$\textbf{=}$}};
}
\end{scope}

\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量的单元操作
\begin{frame}{张量的单元操作}
\begin{itemize}
\item 神经网络$\textbf{y}=f(\textbf{x}\cdot \textbf{w} + \textbf{b})$也包括一些张量的单元操作(element-wise operation)
	\begin{itemize}
	\item 加法:$\textbf{s}+\textbf{b}$,其中$\textbf{s} = \textbf{x}\cdot \textbf{w}$
	\item 激活函数:$f(\cdot)$
	\end{itemize}
\item<2-> \textbf{单元加}就是对张量中的每个位置都进行加法
	\begin{itemize}
	\item<3-> 扩展:加法的\textbf{广播}(broadcasting),即自动重复较小的张量,使之与另一个张量形状匹配,因此并不要求两个张量形状相同
	\end{itemize}
\end{itemize}

\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\visible<3->{
\begin{scope}
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=orange,thick] (-1,-0.5) grid (1,0.5);
\foreach \y in {+0.25,-0.25}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{1};
  }
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s}$};
\end{scope}
\begin{scope}[xshift=1.5in]
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=ugreen,thick] (-1,-0) grid (1,0.5);
\foreach \y in {+0.25}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$1$};
    \addtocounter{mycount1}{1};
  }
\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\textbf{+}$}};
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{b}$};
\end{scope}
\begin{scope}[xshift=3in]
\setcounter{mycount1}{2}
\draw[step=0.5cm,color=orange,thick] (-1,-0.5) grid (1,0.5);
\foreach \y in {+0.25,-0.25}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{1};
  }
\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\textbf{=}$}};
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s+b}$};
\end{scope}
}

\end{tikzpicture}
\end{center}

\vspace{-0.3em}

\begin{itemize}
\item<4-> 类似地,我们还可以做减法、乘法,也可以对张量整体应用激活函数。这也被称作函数的向量化(vectorization)
\end{itemize}

\vspace{-0.5em}
\visible<4->{
\begin{displaymath}
\textrm{Relu} \Big( \begin{pmatrix} 2 \\ -.3 \end{pmatrix} \Big) = \begin{pmatrix} 2 \\ 0 \end{pmatrix}
\end{displaymath}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 深度学习工具包
\begin{frame}{如何实现?- 开源张量计算框架}
\begin{itemize}
\item 实现神经网络的开源系统很多,一个简单好用的工具包是NumPy \url{https://numpy.org/}
    \begin{itemize}
    \item Python接口,多维数组的定义使用方便
    \item 提供了张量表示和使用的范式
    \end{itemize}
\item<2-> 近年来非常流行的两个框架:TensorFlow和PyTorch
    \begin{itemize}
    \item Google和Facebook出品,质量有保证
    \item 功能强大,接口丰富
    \item 可以进行大规模部署和应用
    \item 大量可参考的实例
    \end{itemize}

    \includegraphics[scale=0.13]{./Figures/tensorflowpytorch.jpg}
\item<3-> 还有其它仍在持续更新的优秀框架:CNTK、MXNet、PaddlePaddle、Keras、Chainer、dl4j、NiuTensor等
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% NiuTrans.Tensor工具包
\begin{frame}{NiuTensor}
\begin{itemize}
\item 这里使用我们自研的NiuTensor工具包进行教学 \url{http://www.niutrans.com/opensource/niutensor/index.html}
    \begin{itemize}
    \item 简单小巧,易于修改
    \item C++语言编写,代码高度优化
    \item 同时支持CPU和GPU设备
    \item 丰富的张量计算接口
    \end{itemize}
\end{itemize}

\includegraphics[scale=0.35]{./Figures/niutensor.jpg}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 使用NiuTensor
\begin{frame}{使用NiuTensor}
\begin{itemize}
\item NiuTensor的使用很简单,下面是一个C++例子
\end{itemize}

\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{\#include "source/tensor/XTensor.h"} \hspace{4em} \= // 引用XTensor定义的头文件 \\

\texttt{using namespace nts;} \> // 引用nts命名空间 \\
\ \\

\texttt{int main(int argc, const char ** argv)\{} \\
\ \ \ \ \texttt{XTensor tensor;} \> // 声明张量tensor \\

\ \ \ \ \texttt{InitTensor2D(\&tensor, 2, 2, X\_FLOAT);} \> // 定义张量为2*2的矩阵 \\

\ \ \ \ \texttt{tensor.SetDataRand();} \> // [0,1]均匀分布初始化张量 \\

\ \ \ \ \texttt{tensor.Dump(stdout);} \> // 输出张量内容 \\

\ \ \ \ \texttt{return 0;}\\
\texttt{\}}

\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}

\begin{itemize}
\item<2-> 运行这个程序会显示张量每个元素的值
\begin{itemize}
\item<2-> 二阶张量(order=2),形状是$2 \times 2$ (dimsize=2,2),数据类型是单精度浮点(dtype=X\_FLOAT),非稀疏(dense=1.00)
\end{itemize}
\end{itemize}

\vspace{-0em}
\visible<2->{
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=black!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{order=2 dimsize=2,2 dtype=X\_FLOAT dense=1.000000} \\
\texttt{3.605762e-001 2.992340e-001 1.393780e-001 7.301248e-001}
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 定义XTensor
\begin{frame}{定义XTensor}
\begin{itemize}
\item 张量由类XTensor表示,利用InitTensor定义,参数:
    \begin{itemize}
    \item 指向XTensor类型变量的指针
    \item 张量的阶
    \item 各个方向维度的大小(与传统多维数组约定一样)
    \item 张量的数据类型等(有缺省值)
    \end{itemize}
\end{itemize}

\vspace{-0.3em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor tensor;} \hspace{12em} \= // 声明张量tensor \\
\texttt{int sizes[6] = \{2,3,4,2,3,4\};} \> // 张量的形状为2*3*4*2*3*4 \\
\texttt{InitTensor(\&tensor, 6, sizes, X\_FLOAT);} \> // 定义形状为sizes的6阶张量
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}

\visible<2->{
\begin{itemize}
\item 更简便的定义方式
\end{itemize}

\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor a, b, c;} \hspace{11.5em} \= // 声明张量a, b, c \\
\texttt{InitTensor1D(\&a, 10, X\_INT);} \> // 10维的整数型向量\\
\texttt{InitTensor1D(\&b, 10);} \> // 10维的向量,缺省类型(浮点)\\
\texttt{InitTensor4D(\&c, 10, 20, 30, 40);} \> // 10*20*30*40的4阶张量(浮点)
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}

\visible<3->{
\begin{itemize}
\item 直接在GPU上定义张量
\end{itemize}

\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor tensorGPU;} \hspace{10.5em} \= // 声明张量tensorGPU \\
\texttt{InitTensor2D(\&tensorGPU, 10, 20,} $\backslash$ \> // 在编号为0的GPU上定义张量 \\
\hspace{6.7em} \texttt{X\_FLOAT, 0);}
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% XTensor的代数运算
\begin{frame}{代数运算}
\begin{itemize}
\item 各种单元算子(1阶运算)+、-、*、/、Log、Exp、Power、Absolute等,还有Sigmoid、Softmax等激活函数
\end{itemize}

\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor a, b, c, d, e;} \hspace{7em} \= // 声明5个张量 \\
\texttt{InitTensor3D(\&a, 2, 3, 4);} \> // a为2*3*4的3阶张量 \\
\texttt{InitTensor3D(\&b, 2, 3, 4);} \> // b为2*3*4的3阶张量 \\
\texttt{InitTensor3D(\&c, 2, 3, 4);} \> // c为2*3*4的3阶张量 \\
\texttt{a.SetDataRand();} \> // 随机初始化a \\
\texttt{b.SetDataRand();} \> // 随机初始化b \\
\texttt{c.SetDataRand();} \> // 随机初始化c \\
\texttt{d = a + b * c;} \> // d被赋值为 a + b * c \\
\texttt{d = ((a + b) * d - b / c ) * d;} \> // d可以被嵌套使用 \\
\texttt{e = Sigmoid(d);} \> // d经过激活函数Sigmoid赋值给e
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}

\visible<2->{
\begin{itemize}
\item 高阶运算,最常用的是矩阵乘法(MMul)
\end{itemize}

\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor a, b, c;} \hspace{10.0em} \= // 声明张量a, b, c \\
\texttt{InitTensor4D(\&a, 2, 2, 3, 4);} \> // a为2*2*3*4的4阶张量 \\
\texttt{InitTensor2D(\&b, 4, 5);} \> // b为4*5的矩阵 \\
\texttt{a.SetDataRand();} \> // 随机初始化a \\
\texttt{b.SetDataRand();} \> // 随机初始化b \\
\texttt{c = MMul(a, b);} \> // 矩阵乘的结果为2*2*3*5的4阶张量
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% XTensor的其它函数
\begin{frame}{其它常用函数}
\begin{itemize}
\item 其它常用函数(这里列举不全),更多内容可以参考网站上的详细说明
\end{itemize}

\footnotesize{
\begin{center}
\begin{tabular}{l|l}
函数 & 描述 \\ \hline
\texttt{a.Reshape(o, s)} & 把a变换为阶为o、形状为s的张量\\
\texttt{a.Get(pos)} & 取张量中位置为pos的元素 \\
\texttt{a.Set(v, pos)} & 把张量中位置为pos的元素的值设为v \\
\texttt{a.Dump(file)} & 把张量存到file中,file为文件句柄 \\
\texttt{a.Read(file)} & 从file中读取张量,file为文件句柄 \\ \hline
\texttt{Power(a, p)}  & 计算指数$\textrm{a}^{\textrm{p}}$ \\
\texttt{Linear(a, s, b)}  & 计算 a * s + b,s和b都是一个数 \\
\texttt{CopyValues(a)} & 构建a的一个拷贝 \\
\texttt{ReduceMax(a, d)} & 对a沿着方向d进行规约,得到最大值 \\
\texttt{ReduceSum(a, d)} & 对a沿着方向d进行规约,得到和 \\
\texttt{Concatenate(a, b, d)} & 把两个张量a和b沿d方向级联\\
\texttt{Merge(a, d)} & 对张量a沿d方向合并\\
\texttt{Split(a, d, n)} & 对张量a沿d方向分裂成n份\\ \hline
\texttt{Sigmoid(a)}  & 对a进行Sigmoid变换 \\
\texttt{Softmax(a)}  & 对a进行Softmax变换,沿最后一个方向 \\
\texttt{HardTanH(a)} & 对a进行hard tanh变换(双曲正切的近似)\\
\texttt{Relu(a)}     & 对a进行Relu变换\\
\end{tabular}
\end{center}
}
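\begin{itemize}
\item 示意用法(假设这些接口均返回新的张量,具体行为以网站说明为准):
\end{itemize}
\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor a, b, s, c;} \hspace{10em} \= // 声明张量 \\
\texttt{InitTensor2D(\&a, 2, 3);} \> // a为2*3的矩阵 \\
\texttt{InitTensor2D(\&b, 2, 3);} \> // b为2*3的矩阵 \\
\texttt{a.SetDataRand(); b.SetDataRand();} \> // 随机初始化a和b \\
\texttt{s = ReduceSum(a, 0);} \> // 沿方向0规约求和 \\
\texttt{c = Concatenate(a, b, 1);} \> // 沿方向1级联,得到2*6的矩阵
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}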

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 利用XTensor构建神经网络
\begin{frame}{构建神经网络}
\begin{itemize}
\item 可以很方便地构建一个单层网络
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x, y, w, b;} \\
\texttt{InitTensor3D(\&x, 3, 4, 5);} \\
\texttt{InitTensor2D(\&w, 5, 3);} \\
\texttt{InitTensor1D(\&b, 3);} \\
\texttt{...} \\
\texttt{y = Sigmoid(MMul(x, w) + b);}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [draw,circle,inner sep=2pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x) at (0,0) {\footnotesize{$\textrm{x}$}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=4em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (layer) at ([yshift=0.7em]x.north) {\scriptsize{layer}};
\node [anchor=south,draw,circle,inner sep=2pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y) at ([yshift=0.7em]layer.north) {\scriptsize{$\textrm{y}$}};
\draw [thick,->] (x.north) -- (layer.south);
\draw [thick,->] (layer.north) -- (y.south);
\node [anchor=west,align=left] (xshape) at (x.east) {\tiny{shape: 3*4*5}};
\node [anchor=west,align=left] (yshape) at (y.east) {\tiny{shape: 3*4*3}};
\end{tikzpicture}
\end{center}
\end{tcolorbox}

\visible<2->{
\begin{itemize}
\item 一个多层网络
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x, y, h1, h2;} \\
\texttt{XTensor w1, b1, w2, w3;} \\
\texttt{InitTensor3D(\&x, 3, 4, 5);} \\
\texttt{InitTensor2D(\&w1, 5, 3);} \\
\texttt{InitTensor1D(\&b1, 3);} \\
\texttt{InitTensor2D(\&w2, 3, 6);} \\
\texttt{InitTensor2D(\&w3, 6, 4);} \\
\texttt{...} \\
\texttt{h1 = Sigmoid(MMul(x, w1) + b1);} \\
\texttt{h2 = HardTanH(MMul(h1, w2));} \\
\texttt{y = Relu(MMul(h2, w3));}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [draw,circle,inner sep=2pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x) at (0,0) {\footnotesize{$\textrm{x}$}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=4em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (layer1) at ([yshift=0.7em]x.north) {\scriptsize{layer1}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=4em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (layer2) at ([yshift=1.0em]layer1.north) {\scriptsize{layer2}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=4em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (layer3) at ([yshift=1.0em]layer2.north) {\scriptsize{layer3}};
\node [anchor=south,draw,circle,inner sep=2pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y) at ([yshift=0.7em]layer3.north) {\scriptsize{$\textrm{y}$}};
\draw [thick,->] (x.north) -- (layer1.south);
\draw [thick,->] (layer1.north) -- (layer2.south);
\draw [thick,->] (layer2.north) -- (layer3.south);
\draw [thick,->] (layer3.north) -- (y.south);
\node [anchor=west,align=left] (xshape) at (x.east) {\tiny{shape: 3*4*5}};
\node [anchor=west,align=left] (yshape) at (y.east) {\tiny{shape: 3*4*4}};
\node [anchor=south west,align=left,inner sep=2pt] (l1shape) at (layer1.north) {\tiny{shape: 3*4*3}};
\node [anchor=south west,align=left,inner sep=2pt] (l2shape) at (layer2.north) {\tiny{shape: 3*4*6}};
\end{tikzpicture}
\end{center}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 利用XTensor构建更复杂的神经网络
\begin{frame}{更复杂一点的例子}
\begin{itemize}
\item 任何网络都可以构建,比如RNN、Transformer 等
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x[3], y[3], r, wh;} \\
\texttt{XTensor h1, h2, w1, b1, h3, h4;} \\
\texttt{XList splits;} \\
\texttt{...} \\
\texttt{for(unsigned i = 0; i < 3; i++)\{} \\
\texttt{\hspace{2em}r = Concatenate(x[i], r, 1) * wh;}\\
\texttt{\hspace{2em}splits.Add(\&r);}\\
\texttt{\}}\\
\visible<2->{
\texttt{} \\
\texttt{h1 = Merge(splits, 0);}\\
\texttt{h2 = Relu(h1 * w1 + b1);}\\
\texttt{h3 = h1 + h2;} \\
\texttt{h4 = Softmax(h3);} \\
}
\visible<3->{
\texttt{} \\
\texttt{Split(h4, splits, 0, 3);} \\
\texttt{} \\
\texttt{for(unsigned i = 0; i < 3; i++)\{} \\
\texttt{\hspace{2em}y[i] = *(XTensor*)splits.Get(i);}\\
\texttt{\hspace{2em}y[i].Dump(stdout);}\\
\texttt{\}}
}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x1) at (0,0) {\footnotesize{$\textrm{x}_1$}};
\node [anchor=west,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x2) at ([xshift=2em]x1.east) {\footnotesize{$\textrm{x}_2$}};
\node [anchor=west,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x3) at ([xshift=2em]x2.east) {\footnotesize{$\textrm{x}_3$}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=2.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (rlayer1) at ([yshift=1em]x1.north) {\tiny{rlayer}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=2.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (rlayer2) at ([yshift=1em]x2.north) {\tiny{rlayer}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=2.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (rlayer3) at ([yshift=1em]x3.north) {\tiny{rlayer}};
\draw [->,thick] (x1.north) -- (rlayer1.south);
\draw [->,thick] (x2.north) -- (rlayer2.south);
\draw [->,thick] (x3.north) -- (rlayer3.south);
\draw [->,thick] (rlayer1.east) -- (rlayer2.west);
\draw [->,thick] (rlayer2.east) -- (rlayer3.west);
\draw [->,thick] (rlayer1.north) -- ([yshift=1em]rlayer1.north);
\draw [->,thick] (rlayer2.north) -- ([yshift=1em]rlayer2.north);
\draw [->,thick] (rlayer3.north) -- ([yshift=1em]rlayer3.north);

\visible<2->{
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at ([yshift=1em]rlayer2.north) {\tiny{h1 = Merge($\cdot$)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1em]h1.north) {\tiny{h2 = Relu($\cdot$)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1em]h2.north) {\tiny{h3 = Sum($\cdot$)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1em]h3.north) {\tiny{h4 = Softmax($\cdot$)}};
\draw [->,thick] (h1.north) -- (h2.south);
\draw [->,thick] (h2.north) -- (h3.south);
\draw [->,thick] (h3.north) -- (h4.south);
\draw [->,thick,rounded corners] (h1.east) -- ([xshift=0.5em]h1.east) -- ([xshift=0.5em,yshift=0.5em]h2.north east) -- ([xshift=-2em,yshift=0.5em]h2.north east) -- ([xshift=-2em,yshift=1em]h2.north east);
}

\visible<3->{
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1em]h4.north) {\tiny{Split($\cdot$)}};
\node [anchor=south,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y2) at ([yshift=1em]slayer.north) {\footnotesize{$\textrm{y}_2$}};
\node [anchor=east,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y1) at ([xshift=-2em]y2.west) {\footnotesize{$\textrm{y}_1$}};
\node [anchor=west,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y3) at ([xshift=2em]y2.east) {\footnotesize{$\textrm{y}_3$}};
\draw [<-,thick] (y1.south) -- ([yshift=-1em]y1.south);
\draw [<-,thick] (y2.south) -- ([yshift=-1em]y2.south);
\draw [<-,thick] (y3.south) -- ([yshift=-1em]y3.south);
}

\visible<2->{
\draw [->,thick] (h4.north) -- (slayer.south);
}

\end{tikzpicture}
\end{center}
\end{tcolorbox}

\end{frame}


%%%------------------------------------------------------------------------------------------------------------
\subsection{参数学习 - 反向传播}

%%%------------------------------------------------------------------------------------------------------------
%%% outline: problem 3
\begin{frame}{还有一个问题}

\vspace{6em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{如何对模型中的参数进行学习,}

\vspace{0.4em}
\textbf{之后使用学习到的模型进行推断?}
}
\end{tcolorbox}

\vspace{2em}
\begin{center}
\begin{tikzpicture}
\node [fill=blue!10] (label) at (0,0) {\LARGE{$\frac{\partial L(\textbf{w})}{\partial \textbf{w}} = $ ? }};
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经网络 = 表达式
\begin{frame}{神经网络 = 函数表达式}
\begin{itemize}
\item 所有的神经网络都可以看做由变量和函数组成的表达式\\
\end{itemize}

\begin{center}
\begin{tikzpicture}
\node [anchor=north west] (eq1) at (0,0) {$\textbf{y} = \textbf{x} + \textbf{b}$};
\node [anchor=north west] (eq2) at (eq1.south west) {$\textbf{y} = \textrm{Relu}(\textbf{x} \cdot \textbf{w} + \textbf{b})$};
\node [anchor=north west] (eq3) at (eq2.south west) {$\textbf{y} = (\textrm{Relu}(\textbf{x} \cdot \textbf{w}_1 + \textbf{b}) + \textbf{x}) \cdot \textbf{w}_2$};
\node [anchor=north west] (eq4) at (eq3.south west) {$\textbf{y} = \textrm{Sigmoid}(\textrm{Relu}(\textbf{x} \cdot \textbf{w}_1 + \textbf{b}_1) + \textbf{x}) \cdot \textbf{w}_2 + \textbf{b}_2$};

\visible<2->{
\node [anchor=north west,minimum height=1.2em,minimum width=1.2em,fill=green!30!white] (xlabel) at ([yshift=-0.5em,xshift=0.3em]eq4.south west) {};
\node [anchor=west] (xlabel2) at (xlabel.east) {输入变量 - 由用户指定};
}

\begin{pgfonlayer}{background}
\visible<2->{
\node [anchor=south, minimum height=1.6em,minimum width=0.8em,fill=green!30!white] (x1) at ([xshift=-1.3em]eq4.south) {};
\node [anchor=south, minimum height=1.6em,minimum width=0.8em,fill=green!30!white] (x2) at ([xshift=4.9em]eq4.south) {};
}
\end{pgfonlayer}

\visible<3->{
\node [anchor=north west,minimum height=1.2em,minimum width=1.2em,fill=red!30!white] (wlabel) at ([yshift=-0.3em]xlabel.south west) {};
\node [anchor=west] (wlabel2) at (wlabel.east) {模型参数 - 怎么设置???};
}

\begin{pgfonlayer}{background}
\visible<3->{
\node [anchor=south, minimum height=1.6em,minimum width=1.2em,fill=red!30!white] (w1) at ([xshift=0.2em]eq4.south) {};
\node [anchor=south, minimum height=1.6em,minimum width=1.2em,fill=red!30!white] (b1) at ([xshift=2.5em]eq4.south) {};
\node [anchor=south, minimum height=1.6em,minimum width=1.2em,fill=red!30!white] (w2) at ([xshift=6.85em]eq4.south) {};
\node [anchor=south, minimum height=1.6em,minimum width=1.2em,fill=red!30!white] (b2) at ([xshift=9.2em]eq4.south) {};
}
\end{pgfonlayer}

\end{tikzpicture}
\end{center}

\visible<4->{
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black,drop fuzzy shadow]
{\Large
\textbf{问题来了:}

\vspace{0.4em}
\textbf{如何确定$\textbf{w}$和$\textbf{b}$,使$\textbf{x}$与$\textbf{y}$对应得更好?}
}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 学习的目标是什么
\begin{frame}{目标函数和损失函数}
\begin{itemize}
\item 这是一个典型的优化问题,需要回答两个基本问题\\
    \begin{enumerate}
    \item 优化的目标是什么?
    \item 如何调整参数$\textbf{w}$$\textbf{b}$达成目标?
    \end{enumerate}
\item<2-> \textbf{定义目标}:对于给定$\textbf{x}$,什么样的$\textbf{y}$是好的
    \begin{itemize}
    \item 假设:多个输入样本$\{\textbf{x}_1,...,\textbf{x}_n\}$,每个$\textbf{x}_i$都对应\alert{正确答案}$\tilde{\textbf{y}}_i$
    \item 对于一个神经网络$\textbf{y}=f(\textbf{x})$,每个$\textbf{x}_i$也会有一个输出$\textbf{y}_i$
    \item 如果可以度量答案$\tilde{\textbf{y}}_i$和网络输出$\textbf{y}_i$之间的偏差,进而调整网络参数减小这种偏差,就可以得到更好的模型
    \end{itemize}
\end{itemize}

\visible<3->{
\vspace{-0.7em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[yscale=0.2,xscale=0.8]
\draw[-,very thick,ublue,domain=-4.2:3.5,samples=100] plot (\x,{ - 1/14 * (\x + 4) * (\x + 1) * (\x - 1) * (\x - 3)});
\visible<4->{
\draw[-,very thick,ugreen,domain=-3.8:3.0,samples=100] plot (\x,{ - 1/14 * (4*\x*\x*\x + 3*\x*\x - 26*\x - 1)});
}
\draw[->,thick] (-6,0) -- (5,0);
\draw[->,thick] (-5,-4) -- (-5,5);

\draw [<-] (-2.5,4) -- (-2,5) node [pos=1,right,inner sep=2pt] {\footnotesize{答案$\tilde{\textbf{y}}_i$}};
\visible<4->{
\draw [<-] (-3,-3) -- (-2.5,-2) node [pos=0,left,inner sep=2pt] {\footnotesize{预测$\textbf{y}_i$}};}

\visible<5->{
\draw [<-] (2.3,1) -- (3.3,2) node [pos=1,right,inner sep=2pt] {\footnotesize{偏差$|\tilde{\textbf{y}}_i - \textbf{y}_i|$}};
\foreach \x in {-3.8,-3.7,...,3.0}{
    \pgfmathsetmacro{\p}{- 1/14 * (\x + 4) * (\x + 1) * (\x - 1) * (\x - 3)};
    \pgfmathsetmacro{\q}{- 1/14 * (4*\x*\x*\x + 3*\x*\x - 26*\x - 1)};
    \draw [-] (\x,\p) -- (\x, \q);
}
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\vspace{-0.3em}
\begin{itemize}
\item<6-> 这个过程就是\alert{参数优化/训练};$\tilde{\textbf{y}}_i$与$\textbf{y}_i$之间偏差的度量就是一种\alert{损失函数},也称作训练的\alert{目标函数};优化的目标就是\textbf{最小化损失函数}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 常见的目标函数
\begin{frame}{常见的损失函数}

\begin{itemize}
\item 损失函数记为$Loss(\tilde{\textbf{y}}_i,\textbf{y}_i)$,简记为$L$,以下是常用的定义
\end{itemize}

\vspace{0.5em}

\footnotesize{
\renewcommand{\arraystretch}{1.2}
\begin{tabular}{l | l | l | l}
名称 & 定义 & NiuTensor实现(\texttt{yh}表示$\tilde{\textbf{y}}_i$) & 应用 \\ \hline
0-1 & $L = \left\{ \begin{array}{ll} 0 & \tilde{\textbf{y}}_i = \textbf{y}_i \\ 1 & \tilde{\textbf{y}}_i \ne \textbf{y}_i \end{array} \right.$ & \scriptsize{\texttt{L = Sign(Absolute(yh - y))}} & 感知机 \\
Hinge & $L=\max(0,1-\tilde{\textbf{y}}_i \cdot \textbf{y}_i)$ & \scriptsize{\texttt{L = Max(0, 1 - yh * y)}} & SVM \\
绝对值 & $L=|\tilde{\textbf{y}}_i - \textbf{y}_i|$ & \scriptsize{\texttt{L = Absolute(yh - y)}} & 回归 \\
Logistic & $L=\log(1 + \exp(-\tilde{\textbf{y}}_i \cdot \textbf{y}_i))$ & \scriptsize{\texttt{L = Log(1 + Exp(Negate(yh * y)))}} & 回归 \\
平方 & $L=(\tilde{\textbf{y}}_i - \textbf{y}_i)^2$ & \scriptsize{\texttt{L = Power(yh - y, 2)}} & 回归 \\
指数 & $L=\exp(- \tilde{\textbf{y}}_i \cdot \textbf{y}_i) $ & \scriptsize{\texttt{L = Exp(Negate(yh * y))}} & \scriptsize{AdaBoost} \\
交叉熵 & $L=-\sum_k \tilde{\textbf{y}}_i^{[k]} \log \textbf{y}_i^{[k]} $ & \scriptsize{\texttt{L = CrossEntropy(y, yh)}} & 多分类 \\
       & \scriptsize{$\textbf{y}_i^{[k]}$: $\textbf{y}_i$的第$k$} & & \\
\end{tabular}
\renewcommand{\arraystretch}{1.0}
}

\vspace{-0.5em}
\begin{itemize}
\item 注意:
    \begin{itemize}
    \item 损失函数可以根据问题不同进行选择,没有固定要求
    \item 有些损失函数对网络输出有约束,比如交叉熵要求$\tilde{\textbf{y}}_i$$\textbf{y}_i$都是概率分布
    \end{itemize}
\end{itemize}
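
\begin{itemize}
\item 以交叉熵为例(数值仅为示意):若答案$\tilde{\textbf{y}}_i=(1,0,0)$,网络输出$\textbf{y}_i=(0.7,0.2,0.1)$,则$L = -\sum_k \tilde{\textbf{y}}_i^{[k]} \log \textbf{y}_i^{[k]} = -\log 0.7 \approx 0.36$
\end{itemize}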

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 优化目标函数
\begin{frame}{参数优化}

\begin{itemize}
\item 对于第$i$个输入样本($\textbf{x}_i$,$\tilde{\textbf{y}}_i$),如果把损失函数看做参数$\textbf{w}$的函数(把$\textbf{b}$也作为一种$\textbf{w}$),记为$L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w})$,则参数学习可以被描述为:\\

\begin{displaymath}
\hat{\textbf{w}} = \argmin_{\textbf{w}} \frac{1}{n} \sum_{i=1}^{n} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w})
\end{displaymath}

$\hat{\textbf{w}}$表示在训练集上使得损失的平均值达到最小的参数。$\frac{1}{n} \sum_{i=1}^{n} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w})$被称作代价函数(cost function),它是损失函数期望值的估计。

\vspace{0.5em}

\item<2-> 核心问题:\textbf{求解$\argmin$,即找到代价函数最小值点}
    \begin{itemize}
    \item 这是非常常见的问题,回忆一下第三章的IBM模型,当时使用的是EM算法
    \item 但是这里并不是一个生成模型
    \item 需要一种更加通用的求解方法
    \end{itemize}

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 梯度下降
\begin{frame}{梯度下降(Gradient Descent)}

\begin{itemize}
\item 如果把目标函数看做是参数$\textbf{w}$的函数,记为$J(\textbf{w})$。优化目标是:找到使$J(\textbf{w})$达到最小的$\textbf{w}$
\item 注意,$\textbf{w}$可能包含几亿个实数,不能使用SMT中MERT之类的调参方法。这里可以考虑一种更加适合大量实数参数的优化方法,其核心思想是\alert{梯度下降}
    \begin{itemize}
    \item<2-> 如果$J(\textbf{w})$对于$\textbf{w}$可微分,梯度$\frac{\partial J(\textbf{w})}{\partial \textbf{w}}$表示$J$在$\textbf{w}$处增长最快的方向
    \item<2-> 将$\textbf{w}$沿着梯度的\alert{反方向}更新,新的$\textbf{w}$可以使函数值减小,更接近极小值
    \end{itemize}
\end{itemize}

\pgfplotsset{%
  colormap={whitered}{color(-1cm)=(orange!75!red);color(1cm)=(white)}
}


\begin{center}
\begin{tikzpicture}[
  declare function = {mu1=1;},
  declare function = {mu2=2;},
  declare function = {sigma1=0.5;},
  declare function = {sigma2=1;},
  declare function = {normal(\m,\s)=1/(2*\s*sqrt(pi))*exp(-(x-\m)^2/(2*\s^2));},
  declare function = {bivar(\ma,\sa,\mb,\sb)=1/(2*pi*\sa*\sb) * exp(-((x-\ma)^2/\sa^2 + (y-\mb)^2/\sb^2))/2;}]
  \footnotesize{
  \visible<2->{
  \begin{scope}
  \begin{axis}[
    colormap name  = whitered,
    width          = 8cm,
    height         = 5cm,
    view           = {20}{45},
    enlargelimits  = false,
    grid           = major,
    domain         = -1:3,
    y domain       = 0:4,
    samples        = 30,
    xlabel         = $\textbf{w}^{[1]}$,
    ylabel         = $\textbf{w}^{[2]}$,
    xlabel style   = {xshift=0em,yshift=0.8em},
    ylabel style   = {xshift=0.2em,yshift=0.8em},
    zlabel         = {$J(\textbf{w})$},
    ztick          = {-0.1},
    colorbar,
    colorbar style = {
      at     = {(1.2,0.5)},
      anchor = north west,
      ytick  = {0,-0.1},
      height = 0.25*\pgfkeysvalueof{/pgfplots/parent axis height},
      title  = {}
    }
  ]

    \addplot3 [surf] {-bivar(mu1,sigma1,mu2,sigma2)};

    \node [circle,fill=red,minimum size=3pt,inner sep=1.5pt] () at (axis cs:0.5,2,-0.01) {};

    \draw [->,very thick,ublue] (axis cs:0.5,2,-0.01) -- (axis cs:0.8,1.6,-0.03) node [pos=1,right,inner sep=2pt] {\tiny{-$\frac{\partial J(\textbf{w})}{\partial \textbf{w}}$}};
    \draw [->,very thick,dotted] (axis cs:0.5,2,-0.01) -- (axis cs:0.2,1.5,-0.03);
    \draw [->,very thick,dotted] (axis cs:0.5,2,-0.01) -- (axis cs:0.2,3.5,-0.03);
    %\draw [black!50] (axis cs:0,-1,0) -- (axis cs:0,4,0);

  \end{axis}
  \end{scope}
  }
  }
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 梯度下降的变种
\begin{frame}{梯度下降的不同实现方式}

\begin{itemize}
\item \textbf{梯度下降}:沿着梯度的反方向将$\textbf{w}$更新一小步,得到更好的$\textbf{w}$,再重新计算梯度,不断重复上述过程

\begin{displaymath}
\textbf{w}_{t+1} = \textbf{w}_t - \alpha \cdot \frac{\partial J(\textbf{w}_t)}{\partial \textbf{w}_t}
\end{displaymath}

其中$t$表示更新的步数,$\alpha$是一个参数,表示更新步幅的大小,需要根据任务进行调整。$J(\textbf{w}_t)$的形式则决定了具体算法的实现(见下方的小例子)。

\item<2-> \textbf{批量梯度下降(Batch Gradient Descent)}

\begin{displaymath}
J(\textbf{w}_t) = \frac{1}{n} \sum_{i=1}^{n} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
\end{displaymath}

这种方法训练稳定,但是每次更新都需要遍历全部训练样本,当$n$很大时效率很低,大规模数据上很少使用

\end{itemize}
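
\begin{itemize}
\item 一个极简的例子(假设$J(w)=w^2$、$\alpha=0.1$、$w_0=2$,数值仅为示意):$w_1 = w_0 - \alpha \cdot \frac{\partial J(w_0)}{\partial w_0} = 2 - 0.1 \times 4 = 1.6$,$J$从$4$降到$2.56$
\end{itemize}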

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 梯度下降的变种
\begin{frame}{梯度下降的不同实现方式(续)}

\begin{itemize}
\item \textbf{随机梯度下降(Stochastic Gradient Descent)}

\begin{displaymath}
J(\textbf{w}_t) = L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
\end{displaymath}

大名鼎鼎的SGD,几乎所有机器学习课程里都有介绍。每次随机选取一个样本进行梯度计算和参数更新,单次更新的计算代价低,而且适用于利用少量样本进行在线学习(online learning),不过单样本的梯度估计噪声大,收敛过程较慢


\vspace{0.3em}

\item<2-> \textbf{小批量梯度下降(Mini-batch Gradient Descent)}

\begin{displaymath}
J(\textbf{w}_t) = \frac{1}{m} \sum_{i=j}^{j+m-1} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
\end{displaymath}

每次随机使用若干样本进行参数更新(数量不会特别大),是批量梯度下降和随机梯度下降的折中方案,也是当今最常用的方法之一

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 关于梯度下降的改进
\begin{frame}{一些改进}

\begin{itemize}
\item \textbf{变种和改进}:提高基于梯度的方法的收敛速度、训练稳定性等,可以google一下
    \begin{itemize}
    \item Momentum, Adagrad, Adadelta, RMSprop, Adam, AdaMax, Nadam, AMSGrad等等
    \item \footnotesize{\url{http://ruder.io/optimizing-gradient-descent}}
    \end{itemize}
\item<2-> \textbf{并行化}:大规模数据处理需要分布式计算,梯度更新的策略需要设计
    \begin{itemize}
    \item \textbf{同步更新}:所有计算节点完成计算后,统一汇总并更新参数。效果稳定,但是并行度低
    \item \textbf{异步更新}:每个节点可以随时更新。并行度高,但是由于节点间参数可能不同步,方法不十分稳定
    \end{itemize}
\item<3-> \textbf{其它}
    \begin{itemize}
    \item 针对深度网络中梯度消失和梯度爆炸的问题,可以使用梯度裁剪、残差连接等方法
    \item 引入正则化因子,可以对外部知识建模,比如引入噪声让训练更稳定
    \end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何计算梯度
\begin{frame}{如何计算梯度? - 数值微分}

\begin{itemize}
\item \textbf{还有一个核心问题}:如何计算梯度
\begin{displaymath}
\frac{\partial L(\textbf{w})}{\partial \textbf{w}} = ?
\end{displaymath}

\vspace{0.5em}

\item<2-> \textbf{数值微分} - 简单粗暴的方法
\begin{displaymath}
\frac{\partial L(\textbf{w})}{\partial \textbf{w}} = \lim_{\Delta \textbf{w} \to 0} \frac{L(\textbf{w} + \Delta \textbf{w}) - L(\textbf{w} - \Delta \textbf{w}) }{2\Delta \textbf{w}}
\end{displaymath}

这是最基本的微分公式(中心差分形式):将$\textbf{w}$变化一点儿(用$\Delta \textbf{w}$表示),之后观察$L(\cdot)$的变化(见下方的数值例子)

    \begin{itemize}
    \item<3-> \textbf{优点很明显}:方法真的非常简单,易于实现
    \item<3-> \textbf{缺点也很明显}:效率太低,对于复杂网络、参数量稍微大一些的模型基本上无法使用
    \end{itemize}

\end{itemize}
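
\begin{itemize}
\item<2-> 一个数值例子(假设$L(w)=w^2$、$w=1$、$\Delta w=0.01$,数值仅为示意):$\frac{L(1.01)-L(0.99)}{0.02} = \frac{1.0201-0.9801}{0.02} = 2$,与解析解$\frac{\partial L}{\partial w}\big|_{w=1}=2$一致
\end{itemize}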

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何计算梯度 - 符号微分
\begin{frame}{如何计算梯度? - 符号微分}

\begin{itemize}
\item \textbf{符号微分}:类似于手写出微分表达式,最后带入变量的值,得到微分结果。比如,对于如下表达式
\begin{displaymath}
L(\textbf{w}) = \textbf{x} \cdot \textbf{w} + 2 \textbf{w}^2
\end{displaymath}

\visible<2->{
\vspace{0.5em}
可以手动推导出微分表达式

\begin{displaymath}
\frac{\partial L(\textbf{w})}{\partial \textbf{w}} = \textbf{x} + 4 \textbf{w}
\end{displaymath}
}

\visible<3->{
\vspace{0.5em}
最后,带入$\textbf{x} = \begin{pmatrix} 2 \\ -3 \end{pmatrix}$$\textbf{w} = \begin{pmatrix} -1 \\ 1 \end{pmatrix}$,得到微分结果\\

\vspace{1em}

\begin{displaymath}
\frac{\partial L(\textbf{w})}{\partial \textbf{w}} =  \begin{pmatrix} 2 \\ -3 \end{pmatrix} + 4 \begin{pmatrix} -1 \\ 1 \end{pmatrix} = \begin{pmatrix} -2 \\ 1 \end{pmatrix}
\end{displaymath}
}

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 符号微分的膨胀问题
\begin{frame}{符号微分的膨胀问题}

\begin{itemize}
\item \textbf{Expression Swell}:深层函数的微分表达式会非常复杂
	\begin{itemize}
	\item 表达式冗长不易存储和管理
	\item 真正需要的是微分的\alert{结果值},而不是微分表达式
	\end{itemize}
\end{itemize}

\vspace{0.5em}

{\small
\begin{tabular} {l | l | l}
函数 & 微分表达式 & 化简的微分表达式 \\ \hline
$x$ & $1$ & $1$ \\ \hline
$x(x+1)$ & $(x+1)+x$ & $2x + 1$ \\ \hline
$x(x+1)$ & $(x+1)(x^2+x+1)$ & $4x^3+6x^2$ \\
xiaotong committed
3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172
$(x^2+x+1)$ & $+x(x^2+x+1)$ & $+4x+1$ \\
                     & $+x(x+1)(2x+1)$ & \\ \hline
$(x^2+x)$ & $(2x+1)(x^2+x+1)$ & $8x^7+28x^6$ \\
$(x^2+x+1)$ & $(x^4+2x^3+2x^2+x+1)$ & $+48x^5+50x^4$ \\
$(x^4+2x^3$ & $+(2x+1)(x^2+x)$ & $+36x^3+18x^2$ \\
$+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 & $+(x^2+x)(x^2+x+1)$ & \\
 & \ \ $(4x^3+6x^2+4x+1)$ & \\


\end{tabular}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 自动微分
\begin{frame}{如何计算梯度? - 自动微分}

\begin{itemize}
\item \textbf{自动微分}:把复杂的微分计算分解成一系列简单的步骤,这些步骤完全自动化,而且容易存储和计算。这可以用一种反向模式进行描述(也就是\alert{反向传播}思想),包括两步
	\begin{enumerate}
	\item \textbf{前向计算}:从神经网络的输入,逐层计算每层网络的输出值,这也是神经网络的标准使用方式
	\item \textbf{反向计算}:从神经网络的输出,逆向逐层计算每层网络输入(输出)所对应的微分
	\end{enumerate}
\end{itemize}

\visible<2->{
\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{layernode} = [draw,thick,fill=ugreen!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}];

\node [anchor=center,layernode,minimum height=4em,minimum width=1em] (layer01) at (0,0) {};
\node [anchor=north west,layernode,minimum height=3em,minimum width=1em] (layer02) at ([xshift=3em]layer01.north east) {};
\node [anchor=south west,layernode,minimum height=3em,minimum width=1em] (layer03) at ([xshift=7em]layer01.south east) {};
\node [anchor=south west,layernode,minimum height=4em,minimum width=1em] (layer04) at ([xshift=11em]layer01.south east) {};
\node [anchor=south west,layernode,minimum height=4em,minimum width=1em] (layer05) at ([xshift=3em]layer04.south east) {};

\node [anchor=east] (input) at ([xshift=-1em]layer01.west){\scriptsize{输入}};
\node [anchor=west] (output) at ([xshift=1em]layer05.east){\scriptsize{输出}};

\draw [->] ([xshift=-1em]layer01.west) -- ([xshift=-0.1em]layer01.west);
\draw [->] ([xshift=0.1em,yshift=-0.5em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer02.north west);
\draw [->] ([xshift=0.1em,yshift=0.5em]layer01.south east) -- ([xshift=-0.1em,yshift=0.5em]layer03.south west);
\draw [->] ([xshift=0.1em,yshift=-0.5em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer04.north west);
\draw [->] ([xshift=0.1em,yshift=0.5em]layer03.south east) -- ([xshift=-0.1em,yshift=0.5em]layer04.south west);
\draw [->] ([xshift=0.1em]layer04.east) -- ([xshift=-0.1em]layer05.west);
\draw [->] ([xshift=0.1em]layer05.east) -- ([xshift=1.0em]layer05.east);

\visible<3->{
\draw [->,very thick,ublue] ([xshift=-1em]layer01.west) -- ([xshift=-0.1em]layer01.west);
}
\visible<4->{
\draw [->,very thick,ublue] ([xshift=0.1em,yshift=-0.5em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer02.north west);
}
\visible<5->{
\draw [->,very thick,ublue] ([xshift=0.1em,yshift=0.5em]layer01.south east) -- ([xshift=-0.1em,yshift=0.5em]layer03.south west);
}
\visible<6->{
\draw [->,very thick,ublue] ([xshift=0.1em,yshift=-0.5em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer04.north west);
\draw [->,very thick,ublue] ([xshift=0.1em,yshift=0.5em]layer03.south east) -- ([xshift=-0.1em,yshift=0.5em]layer04.south west);
\draw [->,very thick,ublue] ([xshift=0.1em]layer04.east) -- ([xshift=-0.1em]layer05.west);
\draw [->,very thick,ublue] ([xshift=0.1em]layer05.east) -- ([xshift=1.0em]layer05.east);
}

\visible<8->{
\draw [<-,very thick,red] ([xshift=-1em,yshift=-0.3em]layer01.west) -- ([xshift=-0.1em,yshift=-0.3em]layer01.west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.8em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.8em]layer02.north west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=0.2em]layer01.south east) -- ([xshift=-0.1em,yshift=0.2em]layer03.south west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.8em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.8em]layer04.north west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=0.2em]layer03.south east) -- ([xshift=-0.1em,yshift=0.2em]layer04.south west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.3em]layer04.east) -- ([xshift=-0.1em,yshift=-0.3em]layer05.west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.3em]layer05.east) -- ([xshift=1.0em,yshift=-0.3em]layer05.east);
}

\visible<7->{
\draw [<-,thin] ([xshift=0.3em,yshift=0.3em]layer04.east) .. controls +(35:1) and +(215:1) .. ([xshift=-2em,yshift=0.3em]layer05.north west) node [pos=1,above] {\scriptsize{前向:层$i$ 的输出$h_{i}$}};
}
\visible<9->{
\draw [<-,thin] ([xshift=0.3em,yshift=-0.7em]layer04.east) .. controls +(-35:1) and +(145:1) .. ([xshift=-2em,yshift=-0.3em]layer05.south west) node [pos=1,below] {\scriptsize{反向:$h_{i}$ 处的梯度$\frac{\partial L}{\partial h_i}$}};
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\vspace{-1em}
\begin{itemize}
\item<10-> 自动微分可以用\alert{计算图}实现(TensorFlow、 NiuTensor 等),不过计算图超出了课程的范围,建议大家自行学习
\end{itemize}
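
\visible<10->{
\begin{itemize}
\item 其核心是链式法则的逐层应用:若前向计算中有$h_{i+1} = g(h_i)$(这里$g$泛指某一层的计算,属示意记法),则反向有$\frac{\partial L}{\partial h_i} = \frac{\partial L}{\partial h_{i+1}} \cdot \frac{\partial h_{i+1}}{\partial h_i}$
\end{itemize}
}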

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 反向传播 - 符号说明
\begin{frame}{符号说明}

\begin{itemize}
\item 以一个$K$层神经网络为例重新明确一下符号
    \begin{itemize}
    \item 这里假设每层神经网络中都不含偏置项(不含$\textbf{b}$
    \end{itemize}
\end{itemize}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

\def\neuronsep{1}
\tikzstyle{neuronnode} = [minimum size=1.2em,circle,draw,ublue,very thick,inner sep=1pt, fill=white,align=center,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}];

%%% layer 1
\foreach \n in {1,...,4}{
    \node [neuronnode] (neuron0\n) at (\n * \neuronsep,0) {};
    \draw [->] ([yshift=-0.8em]neuron0\n.south) -- ([yshift=-0.1em]neuron0\n.south) node [pos=0,below] {\tiny{...}};
}


\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=red!20] [fit = (neuron01) (neuron04)] (layer01) {};
\node [anchor=east] (layer01label) at (layer01.west) {\scriptsize{$k-1$}};
\end{pgfonlayer}

%%% layer 2
\foreach \n in {1,...,4}{
    \node [neuronnode] (neuron1\n) at (\n * \neuronsep,3em) {};
}

\foreach \n in {1,...,4}{
    \foreach \m in {1,...,4}{
        \draw [<-] (neuron1\n.south) -- (neuron0\m.north);
    }
}

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=ugreen!20] [fit = (neuron11) (neuron14)] (layer02) {};
\node [anchor=east] (layer02label) at (layer02.west) {\scriptsize{$k$}};
\end{pgfonlayer}

%%% layer 3
\foreach \n in {1,...,4}{
    \node [neuronnode] (neuron2\n) at (\n * \neuronsep,6em) {};
    \draw [<-] ([yshift=0.8em]neuron2\n.north) -- ([yshift=0.0em]neuron2\n.north) node [pos=0,above] {\tiny{...}};
}

\foreach \n in {1,...,4}{
    \foreach \m in {1,...,4}{
        \draw [<-] (neuron2\n.south) -- (neuron1\m.north);
    }
}

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=blue!20] [fit = (neuron21) (neuron24)] (layer03) {};
\node [anchor=east] (layer03label) at (layer03.west) {\scriptsize{$k+1$}};
\end{pgfonlayer}

%%% output layer
\foreach \n in {1,...,4}{
    \node [neuronnode] (neuron3\n) at (\n * \neuronsep,9.4em) {};
    \visible<1-3,5->{
    \draw [<-] ([yshift=0.6em]neuron3\n.north) -- ([yshift=0.0em]neuron3\n.north) node [pos=0,above] {\tiny{output}};
    }
    \visible<4>{
    \draw [<-,red,very thick] ([yshift=0.6em]neuron3\n.north) -- ([yshift=0.0em]neuron3\n.north) node [pos=0,above] {\tiny{output}};
    }
    \draw [->] ([yshift=-0.6em]neuron3\n.south) -- ([yshift=0.0em]neuron3\n.south);
}

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=ugreen!20] [fit = (neuron31) (neuron34)] (layer04) {};
\node [anchor=east] (layer04label) at (layer04.west) {\scriptsize{$K$(输出)}};
\end{pgfonlayer}

\visible<2->{
\node [neuronnode,draw=red,fill=red!20!white,inner sep=1pt] (neuron12new) at (2 * \neuronsep,3em) {};
\node [anchor=east] (neuronsamplelabel) at ([yshift=-1em]layer02label.south east) {\alert{\textbf{\tiny{$k$层, 第$i$个神经元}}}};
\draw [->,dashed,very thick,red] ([xshift=-0.2em,yshift=0.2em]neuronsamplelabel.east) .. controls +(30:1) and +(220:1) .. ([xshift=-0em,yshift=-0em]neuron12new.210);
}

\visible<3>{
\foreach \n in {1,...,4}{
\draw [<-,thick,red] (neuron2\n.south) -- (neuron12.north);
}
}

\visible<5->{
\draw [<-,thick,red] (neuron14.south) -- (neuron04.north);
\node [anchor=north] (wlabel) at (layer02.south east) {\alert{\scriptsize{$w_{4,4}^{k}$}}};
}

\visible<3->{
\node [anchor=west,align=left] (line01) at ([xshift=1em,yshift=1em]layer04.east) {\footnotesize{$h_{i}^{k}$:第$k$层, 第$i$个神经元的输出}};
\node [anchor=north west,align=left] (line02) at (line01.south west) {\footnotesize{$\textbf{h}^{k}$:第$k$层的输出}};
\node [anchor=north west,align=left] (line03) at (line02.south west) {\footnotesize{$\textbf{s}^{k}$:第$k$层的线性变换$\textbf{s}^k=\textbf{h}^{k-1}\textbf{w}^k$}};
\node [anchor=north west,align=left] (line04) at (line03.south west) {\footnotesize{$f^{k}$:第$k$层的激活函数$\textbf{h}^k=f^k(\textbf{s}^k)$}};
}
\visible<4->{
\node [anchor=north west,align=left] (line05) at (line04.south west) {\footnotesize{$\textbf{h}^{K}$:网络最后的输出}};
}
\visible<5->{
\node [anchor=north west,align=left] (line06) at (line05.south west) {\footnotesize{$w_{j,i}^{k}$:第$k-1$层神经元$j$}\\\footnotesize{$k$层神经元$i$的连接权重}};
\node [anchor=north west,align=left] (line07) at (line06.south west) {\footnotesize{$\textbf{w}^{k}$:第$k-1$层与第$k$层的}\\\footnotesize{连接权重}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-1.5em}

\visible<6->{
\begin{displaymath} \textrm{对于第}k\textrm{层}: h_i^k = f^k(s_i^k) = f^k(\sum_j h_{j}^{k-1}w_{j,i}^k),\ \textrm{即}\ \textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1} \textbf{w}^k) \end{displaymath}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播
\begin{frame}{反向传播 - 输出层}

\begin{itemize}
\item 输出层(两个阶段)
\end{itemize}

\vspace{-1.0em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (h.east) -- (s.west);
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};

\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
\node [anchor=south west,inner sep=2pt] (step101) at (step100.north west) {\tiny{线性变换}};

\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
\node [anchor=south west,inner sep=2pt] (step201) at (step200.north west) {\tiny{激活函数}};

\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};

\visible<2->{
\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] (h.south west) -- (s.south west) node [pos=0.5,below,yshift=-1em] {\scriptsize{\textbf{第一阶段:线性变换}}};
}
\visible<3->{
\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] ([xshift=0.2em]s.south west) -- (l.south east) node [pos=0.5,below,yshift=-1em] (step2) {\scriptsize{\textbf{第二阶段:激活函数+损失函数}}};
}

\visible<4->{
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^K} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
}

\end{scope}

\end{tikzpicture}
\end{center}

\begin{itemize}
\item<4-> 反向传播从输出向输入传播梯度,因此我们先考虑阶段二\visible<5->{。令$\pi^k = \frac{\partial L}{\partial \textbf{s}^k}$表示损失$L$在第$k$层激活函数输入处的梯度,利用链式法则有}

\vspace{-1.5em}
\visible<5->{
\begin{eqnarray}
\pi^K & = & \frac{\partial L}{\partial \textbf{s}^K} \nonumber \\
         & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{s}^K}  \nonumber \\
         & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}  \nonumber
%\frac{\partial L}{\partial \textbf{w}^K} & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{w}^K} \nonumber \\
%                                                         & \visible<4->{=} & \visible<4->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{h}^{K-1} \textbf{w}^K)}{\partial \textbf{w}^K} \ \ \ \ (\textrm{因为}\textbf{h}^K=f^K(\textbf{h}^{K-1} \textbf{w}^K))} \nonumber \\
%                                                         & \visible<5->{=} & \visible<5->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} \cdot \frac{\partial \textbf{s}^K}{\partial \textbf{w}^K}  \ \ \ (\textrm{因为}\textbf{s}^K=\textbf{h}^{K-1} \textbf{w}^K)} \nonumber
\end{eqnarray}
}

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播 - 各个因子的意义
\begin{frame}{反向传播 - 输出层($\textbf{s}^K$处的梯度)}
\begin{center}
\begin{tikzpicture}

\begin{scope}
\node [anchor=center] (factor00) at (0,0) {${\displaystyle \pi^K \ = }$};
\node [anchor=west] (factor01) at (factor00.east) {${\displaystyle \frac{\partial L}{\partial \textbf{h}^K}}$};
\node [anchor=west,inner sep=1pt] (factor02) at (factor01.east) {${\displaystyle \cdot}$};
\node [anchor=west] (factor03) at (factor02.east) {${\displaystyle \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}}$};

\begin{pgfonlayer}{background}
\visible<2-4>{
\node [rectangle,inner sep=0em,fill=red!20] [fit = (factor01)] (p1) {};
}
\visible<3-4>{
\node [rectangle,inner sep=0em,fill=blue!20] [fit = (factor03)] (p2) {};
}
\visible<5->{
\node [circle,inner sep=0em,fill=green!20] [fit = (factor02)] (p3) {};
}
\end{pgfonlayer}

\end{scope}
\end{tikzpicture}
\end{center}

\begin{itemize}
\item<2-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=red!20] (factor01) at (factor00.east) {$\frac{\partial L}{\partial \textbf{h}^K}$};}} 表示损失$L$相对网络输出的变化率,比如,对于$L = \frac{1}{2} ||\tilde{\textbf{y}} - \textbf{h}^K||^2$,有$\frac{\partial L}{\partial \textbf{h}^K} = \textbf{h}^K - \tilde{\textbf{y}}$
\item<3-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=blue!20] (factor01) at (factor00.east) {$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$};}} 表示激活函数相对于它自己的输入的变化率,比如,对于$f(\textbf{s}) = \frac{1}{1+\exp(-\textbf{s})}$,有$\frac{\partial f(\textbf{s})}{\partial \textbf{s}} = f(\textbf{s})(1-f(\textbf{s}))$
\item<4-> 这个结果符合直觉,在$\textbf{s}^K$处的梯度就是损失函数微分($\frac{\partial L}{\partial \textbf{h}^K}$)和激活函数微分($\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$)的乘积\visible<5->{,注意这里所有操作都是单元级的,比如张量的按单元乘法}

\end{itemize}
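
\visible<5->{
\begin{itemize}
\item 例如,把上面两个例子组合(平方损失 + Sigmoid激活),有$\pi^K = (\textbf{h}^K - \tilde{\textbf{y}}) \cdot f^K(\textbf{s}^K)(1 - f^K(\textbf{s}^K))$,其中乘法均为按单元操作
\end{itemize}
}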

\visible<4->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at (0,0) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west);

\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial L}{\partial \textbf{h}^K} = ?$}}};
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
\draw [-,very thick,red] ([yshift=0.5em]h2.north) -- ([yshift=1.5em]h2.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);

\end{scope}

\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播 - 求 dL/dw
\begin{frame}{反向传播 - 输出层($\textbf{h}^{K-1}$处的梯度)}

\begin{itemize}
\item 已经得到$\textbf{s}^K$处的梯度\visible<2->{,下面求解两个问题}
	\begin{enumerate}
	\item<2-> 计算损失$L$对于第$K$层参数矩阵$\textbf{w}^K$的梯度,$\frac{\partial L}{\partial \textbf{w}^K}$
	\item<2-> 计算损失$L$对于第$K$层输入$\textbf{h}^{K-1}$的梯度,$\frac{\partial L}{\partial \textbf{h}^{K-1}}$
	\end{enumerate}
\end{itemize}

\vspace{-0.8em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (h.east) -- (s.west);
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};

\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};

\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};

\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
\node [anchor=south west] (slabel) at ([yshift=1em,xshift=0.3em]s.north) {\scriptsize{\textbf{\alert{已经得到:$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$}}}};
\draw [->,red] ([yshift=0.3em]slabel.south) .. controls +(south:0.5) and +(north:0.5) .. ([xshift=0.5em]s.north);

\visible<2->{
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\tiny{\alert{$\frac{\partial L}{\partial \textbf{w}^K} = ?$, $\frac{\partial L}{\partial \textbf{h}^{K-1}} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]h.north) -- ([yshift=1.5em]h.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
}

\end{scope}

\end{tikzpicture}
\end{center}

\begin{itemize}
\item<3-> 由于$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$,而且$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$已经求解,可以得到(需要一些数学分析和线性代数的知识,推导一下!):

\vspace{-1.2em}

\begin{eqnarray}
\frac{\partial L}{\partial \textbf{w}^K}      & = & [\textbf{h}^{K-1}]^T \pi^K \nonumber \\
\frac{\partial L}{\partial \textbf{h}^{K-1}} & = & \pi^K  [\textbf{w}^K]^T\nonumber
\end{eqnarray}

这里,$[\textbf{A}]^T$表示$\textbf{A}$的转置,$\pi^K  [\textbf{w}^K]^T$表示张量$\pi^K$\alert{矩阵乘}$\textbf{w}^K$的转置
\end{itemize}
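
\visible<3->{
\begin{itemize}
\item 可以做一个维度检查(假设$\textbf{h}^{K-1}$是$1 \times d$、$\pi^K$是$1 \times c$的行向量):$[\textbf{h}^{K-1}]^T \pi^K$的形状是$d \times c$,与$\textbf{w}^K$一致;$\pi^K [\textbf{w}^K]^T$的形状是$1 \times d$,与$\textbf{h}^{K-1}$一致
\end{itemize}
}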

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 隐层的反向传播
\begin{frame}{反向传播 - 隐层}
\begin{itemize}
\item 对于任意隐层$k$:$\textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1}\textbf{w}^k)$。给定:隐层输出处的梯度$\frac{\partial L}{\partial \textbf{h}^{k}}$,需要
	\begin{enumerate}
	\item 计算损失$L$对于第$k$层参数矩阵$\textbf{w}^k$的梯度,$\frac{\partial L}{\partial \textbf{w}^k}$
	\item 计算损失$L$对于第$k$层输入$\textbf{h}^{k-1}$的梯度,$\frac{\partial L}{\partial \textbf{h}^{k-1}}$
	\end{enumerate}
\item<2-> 直接套用上一页的方法,可以将$\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}} \cdot \frac{\partial f^k(\textbf{s}^k)}{\partial \textbf{s}^{k}}$反向传播
\vspace{-0.0em}
\begin{eqnarray}
\frac{\partial L}{\partial \textbf{w}^k}      & = & [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
\frac{\partial L}{\partial \textbf{h}^{k-1}} & = & \pi^k  [\textbf{w}^k]^T\nonumber
\end{eqnarray}

\end{itemize}
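
\visible<2->{
\begin{itemize}
\item 于是得到逐层递推:$\pi^{k-1} = \frac{\partial L}{\partial \textbf{h}^{k-1}} \cdot \frac{\partial f^{k-1}(\textbf{s}^{k-1})}{\partial \textbf{s}^{k-1}} = \pi^{k}[\textbf{w}^{k}]^T \cdot \frac{\partial f^{k-1}(\textbf{s}^{k-1})}{\partial \textbf{s}^{k-1}}$,一直传到输入层
\end{itemize}
}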

\visible<3->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,draw,fill=red!20,minimum height=1.8em,minimum width=2.5em] (h) at (0,0) {$\textbf{h}^{k-1}$};
\node [anchor=west,draw,fill=blue!20,minimum height=1.8em,minimum width=2.5em] (s) at ([xshift=6em]h.east) {$\textbf{s}^{k}$};
\node [anchor=west,draw,fill=green!20,minimum height=1.8em,minimum width=2.5em] (h2) at ([xshift=6em]s.east) {$\textbf{h}^{k}$};
\node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
\node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
\draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f(\textbf{s}^{k})$}};
\draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);

\visible<4->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{反向传播}};
}

\visible<5->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{反向传播}};
}

\visible<6->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{反向传播}};
}

\visible<7->{
\draw [->,thick,red,dashed] ([yshift=-0.1em]h.south) -- ([yshift=-1em]h.south) -- ([yshift=-1em]h2.south) -- ([yshift=-0.1em]h2.south);
\node [anchor=north,red] (recur) at ([yshift=-1em]s.south) {\scriptsize{令$k=k-1$,重复上述过程}};
}

\visible<4->{
\node [anchor=south] (h2label) at (h2.north) {$\frac{\partial L}{\partial \textbf{h}^{k}}$};
}

\visible<5->{
\node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
}

\visible<6->{
\node [anchor=south] (hlabel) at (h.north) {$\frac{\partial L}{\partial \textbf{h}^{k-1}}$, $\frac{\partial L}{\partial \textbf{w}^{k}}$};
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 反向传播实例
\begin{frame}{反向传播的实现}
\begin{itemize}
\item 对于一个多层神经网络,可以很容易地实现反向传播
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.5cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x, y, gold, h[5], w[5], s[5];} \\
\texttt{XTensor dh[5], dw[5], ds[5];} \\
\texttt{...} // 前向过程 \\
\texttt{h[0] = x;} \\
\texttt{y = h[4];} \\

\visible<2->{
\texttt{} \\
\texttt{CrossEntropyBackward(dh[4], y, gold);} \\
\texttt{SoftmaxBackward(y, s[4], dh[4], ds[4]);}\\
\texttt{MMul(h[3], {\tiny X\_TRANS}, ds[4], {\tiny X\_NOTRANS}, dw[4]);}\\
\texttt{MMul(ds[4], {\tiny X\_NOTRANS}, w[4], {\tiny X\_TRANS}, dh[3]);}\\
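// 上面两行对应 $\frac{\partial L}{\partial \textbf{w}^K}=[\textbf{h}^{K-1}]^T \pi^K$ 和 $\frac{\partial L}{\partial \textbf{h}^{K-1}}=\pi^K [\textbf{w}^K]^T$ \\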
xiaotong committed
3653 3654 3655 3656 3657
}

\visible<3->{
\texttt{} \\
\texttt{dh[2] = dh[3];}\\
\texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);}\\
\texttt{MMul(h[1], {\tiny X\_TRANS}, ds[2], {\tiny X\_NOTRANS}, dw[2]);}\\
\texttt{MMul(ds[2], {\tiny X\_NOTRANS}, w[2], {\tiny X\_TRANS}, dh[1]);}\\
}

\visible<4->{
\texttt{} \\
\texttt{dh[1] = dh[1] + dh[3];}\\
}

\visible<5->{
\texttt{...} // 继续反向传播 \\
\texttt{} \\
\texttt{for(unsigned i = 0; i < 5; i++)\{} \\
\texttt{} \ \ \ \ ... // 通过\alert{\texttt{dw[i]}}访问参数的梯度\\
\texttt{\}}
}

\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}


\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\tiny{h1 = Relu(x * w1)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\tiny{h2 = Relu(h1 * w2)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\tiny{h3 = h2 + h1}};

\visible<1-4>{\draw [->,thick] (h1.north) -- (h2.south);}
\visible<1-2>{\draw [->,thick] (h2.north) -- (h3.south);}
\visible<1-2>{\draw [->,thick] (h3.north) -- (h4.south);}
\visible<1-3>{\draw [->,thick,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}

\visible<5>{\draw [<-,very thick,red] (h1.north) -- (h2.south);}
\visible<3->{\draw [<-,very thick,red] (h2.north) -- (h3.south);}
\visible<3->{\draw [<-,very thick,red] (h3.north) -- (h4.south);}
\visible<4->{\draw [<-,very thick,red,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}

\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.5em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
\node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};

\visible<1>{\draw [->,thick] (h4.north) -- (slayer.south);}
\visible<2->{\draw [<-,very thick,red] (h4.north) -- (slayer.south);}

\end{tikzpicture}
\end{center}
\end{tcolorbox}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 自动微分的实现
\begin{frame}{更简单的实现}
\begin{itemize}
\item 幸运的是,现在几乎所有的主流深度学习框架都实现了自动微分,一个函数可以搞定
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.5cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x, loss, gold, h[5], w[5], b[5];} \\
\texttt{...} \\

\texttt{} \\
\texttt{h[1] = Relu(MMul(x, w[1]) + b[1]);} \\
\texttt{h[2] = Relu(MMul(h[1], w[2]) + b[2]);} \\
\texttt{h[3] = HardTanH(h[2]);} \\
\texttt{h[4] = Softmax(MMul(h[3], w[4]));} \\
\texttt{loss = CrossEntropy(h[4], gold);} \\

\texttt{} \\
\texttt{XNet net;}\\
\alert{\texttt{net.Backward(loss);} //一行代码实现自动微分}\\

\texttt{} \\
\texttt{for(unsigned i = 0; i < 5; i++)\{} \\
\texttt{} \ \ \ \ ... // 通过\alert{\texttt{w[i].grad}}访问参数的梯度\\
\texttt{\}}

\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}


\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.0em]h1.north) {\tiny{h1 = Relu(x * w1 + b1)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.0em]h2.north) {\tiny{h2 = Relu(h1 * w2 + b2)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.0em]h3.north) {\tiny{h3 = HardTanh(h2)}};

\draw [->,thick] (h1.north) -- (h2.south);
\draw [->,thick] (h2.north) -- (h3.south);
\draw [->,thick] (h3.north) -- (h4.south);

\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.0em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
\node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};

\draw [->,thick] (h4.north) -- (slayer.south);

\end{tikzpicture}
\end{center}
\end{tcolorbox}
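
\begin{itemize}
\item 得到梯度后,按前面梯度下降的公式更新参数即可。下面是一个示意片段(这里假设\texttt{w[i].grad}可以像普通\texttt{XTensor}一样参与运算,更新接口的细节以具体工具包为准):
\end{itemize}

{\scriptsize
\begin{tabbing}
\texttt{float alpha = 0.001F;} // 学习率$\alpha$ \\
\texttt{for(unsigned i = 0; i < 5; i++)} \\
\texttt{\hspace{2em}w[i] = w[i] - w[i].grad * alpha;} // $\textbf{w}_{t+1} = \textbf{w}_t - \alpha \cdot \frac{\partial J(\textbf{w}_t)}{\partial \textbf{w}_t}$
\end{tabbing}
}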

\begin{itemize}
\item 其它优秀的自动微分实现也可以参考TensorFlow、 PyTorch等工具
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 前向计算过程及其它值得关注的问题
\begin{frame}{前向计算及其它问题}
\begin{itemize}
\item \alert{前向计算}实际上就是网络构建的过程,有两种常用方式
    \begin{itemize}
    \item \textbf{动态图}(如PyTorch、NiuTensor):写完函数表达式,前向计算即完成,易于调试
    \item \textbf{静态图}(如TensorFlow):函数表达式完成后,并不能得到前向计算结果,需要显性调用一个Forward函数,但是计算图可以进行深度优化,执行效率较高
    \end{itemize}
\item<2-> 深度学习系统实现中的其它一些问题也值得关注,不过它们都超出了本课程的范围
    \begin{itemize}
    \item \textbf{分布式训练}:对于复杂模型的海量数据训练,需要利用多个设备(多机、多卡)同时训练
    \item \textbf{低精度计算}:为了提高效率可以采用半精度或者定点数进行计算
    \item \textbf{模型压缩}:减少冗余,可以压缩模型,使得模型易于存储同时提高系统运行效率
    \item \textbf{训练方法和超参选择}:不同任务往往需要不同的训练策略,包括超参设置,坑很多,需要积累经验
    \end{itemize}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\section{神经语言模型}

%%%------------------------------------------------------------------------------------------------------------
%%% outline: neural language modeling
\begin{frame}{进入正题}

\vspace{6em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{如何将神经网络应用到NLP?}

\vspace{0.4em}
\textbf{- 语言模型的神经网络建模}
}
\end{tcolorbox}

\vspace{2em}
\begin{center}
\begin{tikzpicture}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{前馈、循环、自注意力神经网络}

%%%------------------------------------------------------------------------------------------------------------
%%% 在NLP中神经网络能干什么?
\begin{frame}{自然语言处理遇到神经网络}
\begin{itemize}
\item 神经网络方法给自然语言处理(NLP)带来了新的思路
\end{itemize}

\begin{tabular} {l | l}
\textbf{传统基于统计的方法} & \textbf{深度学习方法} \\ \hline
基于\alert{离散}空间的表示模型 & 基于\alert{连续}空间的表示模型 \\
NLP问题的\alert{隐含结构}假设 & 无隐含结构假设,\alert{端到端}学习 \\
\alert{特征工程}为主 & 无显性特征,但需要\alert{设计网络} \\
特征、规则的\alert{存储耗资源} & 模型存储相对小,但\alert{计算慢}
\end{tabular}

\vspace{0em}

\begin{itemize}
\item<2-> 语言模型任务也可以使用深度学习方法(效果非常好)
    \begin{itemize}
    \item 语言模型要回答的问题是如何评价一个词串的好坏
    \item 可以回忆一下第二章提到的$n$元语法模型
    \end{itemize}
    \vspace{0.5em}
    \begin{displaymath}
\textrm{P}(w_1 w_2 ... w_m) = ?
    \end{displaymath}

\end{itemize}

\visible<3->{
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black,drop fuzzy shadow]
{\Large
\textbf{如何对词串的生成概率进行建模?}
}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% n-gram语言模型
\begin{frame}{$n$-gram语言模型}
\begin{itemize}
\item \textbf{链式法则}
\begin{eqnarray}
\textrm{P}(w_1 w_2 ... w_m)  & = & \textrm{P}(w_1) \textrm{P}(w_2|w_1) \textrm{P}(w_3 | w_1 w_2) ... \nonumber \\
                                               &    & \textrm{P}(w_m|w_1...w_{m-1}) \nonumber
\end{eqnarray}
\item<2-> \textbf{传统$n$-gram语言模型}:当前词仅依赖于前面$n-1$个词
\begin{eqnarray}
\textrm{P}(w_1 w_2 ... w_m)  & = & \textrm{P}(w_1) \textrm{P}(w_2|w_1) \textrm{P}(w_3 | w_1 w_2) ... \nonumber \\
                                               &    & \textrm{P}(w_m|\underbrace{w_{m-n+1}...w_{m-1}}_{\text{前面$n-1$个词}}) \nonumber
\end{eqnarray}
\vspace{-1.0em}
\ \ \ \ \ \ 其中
\begin{displaymath}
\textrm{P}(w_m | w_{m-n+1} ... w_{m-1})  = \frac{\textrm{count}(w_{m-n+1}...w_{m})}{\textrm{count}(w_{m-n+1}...w_{m-1})}
\end{displaymath}
\ \ \ \ \ \ $\textrm{count}(\cdot)$表示在训练数据上统计的频次
\end{itemize}
\end{frame}
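
%%%------------------------------------------------------------------------------------------------------------
%%% a sketch of the count-based n-gram probability (added example)
\begin{frame}{$n$-gram probabilities from counts: a sketch}
\begin{itemize}
\item The relative-frequency estimate on the previous slide is easy to implement. Below is a minimal sketch in plain C++; the table \texttt{count} and the helper \texttt{NGramProb} are hypothetical names, and the counts are assumed to have been collected from the training data beforehand (no smoothing, cf. Chapter 2)
\end{itemize}

{\scriptsize
\begin{tabbing}
\texttt{\#include <string>} \\
\texttt{\#include <unordered\_map>} \\
\texttt{} \\
\texttt{// counts of n-grams and their (n-1)-gram prefixes} \\
\texttt{std::unordered\_map<std::string, double> count;} \\
\texttt{} \\
\texttt{// relative-frequency estimate of P(w | prefix)} \\
\texttt{double NGramProb(const std::string \& prefix, const std::string \& w)} \\
\texttt{\{} \\
\texttt{\ \ \ \ double c = count[prefix + " " + w];} \\
\texttt{\ \ \ \ double cPrefix = count[prefix];} \\
\texttt{\ \ \ \ return cPrefix > 0 ? c / cPrefix : 0.0; // unseen prefix -> 0} \\
\texttt{\}}
\end{tabbing}
}

\end{frame}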

%%%------------------------------------------------------------------------------------------------------------
%%% n-gram lm => neural lm
\begin{frame}{$n$-gram生成概率的神经网络建模}
\begin{itemize}
\item A traditional $n$-gram language model is essentially a lookup table: the key $w_{m-n+1} ... w_{m}$ retrieves the $n$-gram probability $\textrm{P}(w_m | w_{m-n+1} ... w_{m-1})$
    \begin{itemize}
    \item The table is in essence a \alert{discrete representation} of $w_{m-n+1} ... w_{m}$
    \item As $n$ grows, \alert{data sparseness} becomes severe, because the vast majority of $n$-grams are never seen in training
    \item Storage is costly, since an index over the $n$-grams must be maintained
    \end{itemize}
\item<2-> An alternative is to model $\textrm{P}(w_m | w_{m-n+1} ... w_{m-1})$ directly in a continuous space, i.e., to define a function $g$ such that for any $w_{m-n+1} ... w_{m}$
    \begin{displaymath}
    g(w_{m-n+1} ... w_{m}) \approx \textrm{P}(w_m | w_{m-n+1} ... w_{m-1})
    \end{displaymath}



\item<3-> The most representative method is the feed-forward neural network (FNN) language model
    \begin{itemize}
\item A classic among classics; it has had a profound influence on the design of modern neural language models
    \end{itemize}

    \textbf{A Neural Probabilistic Language Model}\\
    \textbf{Bengio et al., 2003, Journal of Machine Learning Research 3: 1137-1155}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% FNNLM architecture
\begin{frame}{前馈神经网络语言模型(Bengio et al., 2003)}
\begin{itemize}
\item 以4-gram语言模型为例
\end{itemize}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\footnotesize{$w_{i-3}$}};
\node [anchor=west] (w1) at ([xshift=2em]w0.east) {\footnotesize{$w_{i-2}$}};
\node [anchor=west] (w2) at ([xshift=2em]w1.east) {\footnotesize{$w_{i-1}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny(index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny(index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny(index)};
\node [anchor=south,draw,inner sep=3pt] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{i-3} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{i-2} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{i-1} \textbf{C}$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$\textbf{h}_0=\textrm{Tanh}([\textbf{e}_0,\textbf{e}_1,\textbf{e}_2] \textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$\textbf{y}=\textrm{Softmax}(\textbf{h}_0 \textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\footnotesize{$\textrm{P}(w_i|w_{i-3}w_{i-2}w_{i-1})$}};

\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=-2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]h0.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([yshift=-0.1em]ylabel.south);

\visible<6->{
\draw [->,dashed,red,thick] ([xshift=1em,yshift=0.1em]e1.north) -- ([xshift=1em,yshift=-0.1em]h1.south);
\draw [->,dashed,red,thick] ([xshift=-1em,yshift=0.1em]e0.north) .. controls +(north:2) and +(south:1) .. ([xshift=-3em,yshift=-0.1em]h1.south);
\draw [->,dashed,red,thick] ([xshift=1em,yshift=0.1em]e2.north) .. controls +(north:2) and +(south:1) .. ([xshift=3em,yshift=-0.1em]h1.south);
}

\begin{pgfonlayer}{background}
\visible<2->{
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w0) (index0)] (wordbox0) {};
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w1) (index1)] (wordbox1) {};
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w2) (index2)] (wordbox2) {};
}
\end{pgfonlayer}

\visible<3->{
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{i-3} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{i-2} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{i-1} \textbf{C}$}};
}
\visible<5->{
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h0) at ([yshift=1.5em]e1.north) {\tiny{$\textbf{h}_0=\textrm{Tanh}([\textbf{e}_0,\textbf{e}_1,\textbf{e}_2] \textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h1) at ([yshift=1.5em]h0.north) {\tiny{$\textbf{y}=\textrm{Softmax}(\textbf{h}_0 \textbf{U})$}};
}

\visible<2->{
\node [anchor=north west] (indexlabel0) at ([yshift=-0.5em,xshift=-1.2em]index0.south west) {\scriptsize{{\color{ugreen} \textbf{One-hot表示}}}};
\node [anchor=north west] (indexlabel1) at ([yshift=0.3em]indexlabel0.south west) {\scriptsize{每个词用一个词汇表大小的0-1向量表示,}};
\node [anchor=north west] (indexlabel2) at ([yshift=0.3em]indexlabel1.south west) {\scriptsize{仅一位为1,其余为0,比如:}};
\node [anchor=north west] (indexlabel3) at ([yshift=0.0em]indexlabel2.south west) {\scriptsize{$(0,0,{\red 1},0,0,0,0,0,0,0,0,0)$}};
\node [anchor=north west] (indexlabel4) at ([xshift=1em,yshift=0.0em]indexlabel3.south west) {\scriptsize{词表中第3个词}};
\draw [->] ([xshift=1.2em,yshift=-0.2em]indexlabel4.north west) -- ([xshift=1.2em,yshift=0.3em]indexlabel4.north west);
}

\visible<3->{
\node [anchor=west] (embedinglabel0) at ([xshift=1em,yshift=-1em]e2.east) {\scriptsize{{\blue \textbf{词的分布式表示}}}};
\node [anchor=north west] (embedinglabel1) at ([yshift=0.3em]embedinglabel0.south west) {\scriptsize{词的0-1表示乘一个矩阵$\textbf{C}$,这里可以}};
\node [anchor=north west] (embedinglabel2) at ([yshift=0.3em]embedinglabel1.south west) {\scriptsize{把$\textbf{C}$看做一个查询表}};
}

\visible<4->{
\node [anchor=north west] (wordvector) at ([yshift=-1em]embedinglabel2.south west) {\tiny{$(0,0,{\red 1},...)$}};
\node [anchor=west] (timeslabel) at ([xshift=-0.3em]wordvector.east) {\footnotesize{$\times$}};
\node [anchor=north west,inner sep=2pt] (embeddingmatrix) at ([xshift=1em]wordvector.north east) {\tiny{$\begin{pmatrix} 0 & 1 & 3 \\ .2 & -1 & .3 \\ 1 & 7 & .3 \\ ... \end{pmatrix}$}};
\node [anchor=south,inner sep=1pt] (wordvectorlabel) at (wordvector.north) {\scriptsize{$w_{i-1}$}};
\node [anchor=south,inner sep=1pt] (embeddingmatrixlabel) at (embeddingmatrix.north) {\scriptsize{$\textbf{C}$}};
\node [anchor=north west] (selectedlabel) at ([yshift=-2em]wordvector.south west) {\scriptsize{再把$\textbf{C}$中索引到的行输出(i.e., $e_{i-1}$)}};

\begin{pgfonlayer}{background}
\visible<4->{
\node [anchor=north west,fill=blue!20!white,minimum height=0.6em,minimum width=5.0em] (selected) at ([yshift=-1.3em]embeddingmatrix.north west) {};
}
\end{pgfonlayer}
\draw [->] ([xshift=0.15em,yshift=0.3em]wordvector.south) .. controls +(south:0.3) and +(west:0.5) .. (selected.west);
}

\visible<5->{
\node [anchor=south west] (hiddenlabel0) at ([yshift=5em]embedinglabel0.north west) {\scriptsize{{\color{orange} \textbf{多层神经网络}}}};
\node [anchor=north west] (hiddenlabel1) at ([yshift=0.3em]hiddenlabel0.south west) {\scriptsize{$[e_0,e_1,e_2]$表示把三个向量级联在一起,}};
\node [anchor=north west] (hiddenlabel2) at ([yshift=0.3em]hiddenlabel1.south west) {\scriptsize{之后经过两层网络,最后通过Softmax输出}};
\node [anchor=north west] (hiddenlabel3) at ([yshift=0.3em]hiddenlabel2.south west) {\scriptsize{注意,$h_0\textbf{U}$得到所有词的表示(向量),}};
\node [anchor=north west] (hiddenlabel4) at ([yshift=0.3em]hiddenlabel3.south west) {\scriptsize{Softmax确保输出词汇表上的一个分布}};
}

\visible<6->{
\node [anchor=south west] (directlabel0) at ([yshift=1em]hiddenlabel0.north west) {\scriptsize{\alert{\textbf{底层向上层的直接连接(可选)}}}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% FNNLM implementation
\begin{frame}{前馈神经网络语言模型(FNN LM)的实现}

\begin{itemize}
\item 实现非常简单,几行代码
    \begin{itemize}
    \item 细节1:做batching时可以把$w[i]$进行扩展,比如放入多个词
    \item 细节2:TanH一般会用HardTanH实现,因为TanH容易溢出
    \end{itemize}
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.8cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor w[3], e[3], h0, y;} \\
\texttt{XTensor C, H, d, U;} \\
\texttt{...}\\

\texttt{} \\
\texttt{for(unsigned i = 0; i < 3; i++)} \\
\texttt{\ \ \ \ e[i] = MMul(w[i], C);}\\
\texttt{e01 = Concatenate(e[0], e[1], -1);}\\
\texttt{e = Concatenate(e01, e[2], -1);}\\

\texttt{} \\
\texttt{h0 = TanH(MMul(e, H) + d);}\\
\texttt{y = Softmax(MMul(h0, U));}\\

\texttt{} \\
\texttt{for(unsigned k = 0; k < size; k++)\{} \\
\texttt{} \ \ \ \ ... // \alert{\texttt{y}}的第$k$元素表示 $\textrm{P}(w|...)$\\
\texttt{} \ \ \ \ ... // $w$为词汇表里第$k$个词\\
\texttt{\}}

\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\scriptsize{$w_{i-3}$}};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {\scriptsize{$w_{i-2}$}};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {\scriptsize{$w_{i-1}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny(index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny(index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny(index)};
\node [anchor=south,draw,inner sep=3pt,align=left] (e0) at ([yshift=1.0em]w0.north) {\tiny{$e_0:$}\\\tiny{$w_{i-3} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,align=left] (e1) at ([yshift=1.0em]w1.north) {\tiny{$e_1:$}\\\tiny{$w_{i-2} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,align=left] (e2) at ([yshift=1.0em]w2.north) {\tiny{$e_2:$}\\\tiny{$w_{i-1} \textbf{C}$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$h_0=\textrm{Tanh}([e_0,e_1,e_2] \textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$y=\textrm{Softmax}(h_0 \textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\scriptsize{$\textrm{P}(w_i|w_{i-3}w_{i-2}w_{i-1})$}};

\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=-2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]h0.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([yshift=-0.1em]ylabel.south);
\end{scope}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
\vspace{-0.5em}
\footnotesize{注: size表示词汇表大小}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经语言模型给我们带来了什么
\begin{frame}{神经语言建模的意义}

\begin{itemize}
\item Bengio et al. (2003)中有待讨论的问题
    \begin{enumerate}
    \item 神经网络每一层究竟学到了什么 \\
    词汇、句法?还是其它一些知识?如何解释?
    \item 网络的层数变多会怎样 - 10层、20层、100层的网络 \\
    \# of layers: 10 $\to$ 20 $\to$ 100 $\to$ 1000
    \item 超参(比如隐藏层大小)如何选择 - 不同任务的最优设置\\
    单词的分布式表示维度多大好?\\
    隐层多大好?\\
    激活函数如何选择?\\
    ...
    \end{enumerate}
\item<2-> Lessons from the FNN LM
    \begin{itemize}
    \item It redefines what a word is: not an entry in a dictionary, but a real-valued vector
    \item Multi-layer neural networks represent (short-range) dependencies between words well
    \item $n$-gram probabilities can be described by a continuous-space function, which alleviates data sparseness; the model need not memorize complete $n$-grams
    \end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络
\begin{frame}{循环神经网络(Recurrent Neural Networks)}

\begin{itemize}
\item FNN LMs are effective, but like traditional $n$-gram LMs they rely on the \alert{limited-context} assumption
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$...$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$w_{m-n+1}$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$...$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,ublue] (w5.south).. controls +(210:0.5) and +(-30:0.5) .. (w3.south);
\draw [->,thick,red] (w5.north).. controls +(150:1) and +(30:1) .. (w1.north);
\draw [->,very thick,ublue] ([xshift=-5em,yshift=1em]w0.west) -- ([xshift=-6.5em,yshift=1em]w0.west) node [pos=0,right] {\scriptsize{依赖}};
\draw [->,very thick,red] ([xshift=-5em,yshift=-0.5em]w0.west) -- ([xshift=-6.5em,yshift=-0.5em]w0.west) node [pos=0,right] {\scriptsize{不依赖}};

\end{scope}
\end{tikzpicture}
\end{center}
\item<2-> Can we model the original problem directly, i.e., define a function $g$ such that for any $w_{1} ... w_{m}$
    \vspace{-0.5em}
    \begin{displaymath}
    g(w_{1} ... w_{m}) \approx \textrm{P}(w_m | w_{1} ... w_{m-1})
    \end{displaymath}
\item<3-> \textbf{Recurrent neural networks (RNNs)} address this problem well and have been successfully applied to language modeling
	\begin{itemize}
	\item Each word is assumed to depend on all previously generated words
	\item The generation probability at every position is described by the same function
	\end{itemize}
	
        \textbf{Recurrent Neural Network Based Language Model}\\
        \textbf{Mikolov et al., 2010, In Proc. of Interspeech, 1045-1048}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的结构
\begin{frame}{循环单元}

\begin{itemize}
\item Given an input sequence $(\textbf{x}_0,\textbf{x}_1,...,\textbf{x}_t,...)$, where $\textbf{x}_t$ is the $t$-th element, also called the input at \alert{time step $t$}, the corresponding output sequence is $(\textbf{y}_0,\textbf{y}_1,...,\textbf{y}_t,...)$. In a recurrent neural network, the output at every time step is described by the same \alert{recurrent unit}. \visible<2->{For language modeling, a simple design:}

\visible<2->{
{\small
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4.3cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]

\begin{eqnarray}
\textbf{y}_t & = & \textrm{Softmax}(\textbf{h}_t \textbf{V}) \nonumber \\
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W}) \nonumber
\end{eqnarray}
\footnotesize{$\textbf{h}_t$: the hidden state at time step $t$\\
$\textbf{h}_{t-1}$: the hidden state at time step $t-1$\\
$\textbf{V}, \textbf{U}, \textbf{W}$: parameters
}
\tcblower
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,inner sep=3pt,minimum width=8em] (h) at (0,0) {\tiny{$\textbf{h}_t  =  \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W})$}};
\node [anchor=south west,inner sep=3pt] (r) at ([yshift=-0.2em]h.north west) {\tiny{循环单元:}};
\begin{pgfonlayer}{background}
\node [rectangle,draw,inner sep=0em,fill=green!20!white] [fit = (r) (h)] (rbox) {};
\end{pgfonlayer}
\node [anchor=south,draw,minimum width=8em,fill=green!20!white] (y) at ([yshift=1.5em]rbox.north) {\tiny{$\textbf{y}_t = \textrm{Softmax}(\textbf{h}_t \textbf{V})$}};
\node [anchor=south,inner sep=2pt] (output) at ([yshift=1em]y.north) {\scriptsize{$\textbf{y}_t$}};
\node [anchor=north,inner sep=2pt] (input) at ([yshift=-1em]h.south) {\scriptsize{$\textbf{x}_t$}};
\draw [->,thick] (input.north) -- ([yshift=-0.1em]rbox.south);
\draw [->,thick] ([yshift=0.1em]rbox.north) -- ([yshift=-0.1em]y.south) node [pos=0.5,left] {\tiny{$\textbf{h}_t$}};
\draw [->,thick] ([yshift=0.1em]y.north) -- (output.south);
\draw [->,thick] ([xshift=0.1em]rbox.east) -- ([xshift=1em]rbox.east) node [pos=1,above] {\tiny{$\textbf{h}_t$}};
\draw [->,thick] ([xshift=-1em]rbox.west) -- ([xshift=-0.1em]rbox.west) node [pos=0,above] {\tiny{$\textbf{h}_{t-1}$}};

\end{scope}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
}
}

\item<3-> \textbf{Where is the recurrence?} The state at time $t$ is a function of the state at time $t-1$, and this process can be repeated indefinitely
\end{itemize}

\end{frame}
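
%%%------------------------------------------------------------------------------------------------------------
%%% a sketch of the recurrent unit in code (added example)
\begin{frame}{The recurrent unit in code: a sketch}
\begin{itemize}
\item In the XTensor-style pseudo-code used earlier (\texttt{MMul}, \texttt{TanH}, \texttt{Softmax}), the recurrent unit is just a loop over time steps. A minimal sketch; the length \texttt{T} and the initial state \texttt{h0} are assumed to be set up elsewhere
\end{itemize}

{\scriptsize
\begin{tabbing}
\texttt{XTensor x[T], h[T], y[T];} \\
\texttt{XTensor U, W, V, h0; // h0: the initial state (assumed given)} \\
\texttt{...} \\
\texttt{} \\
\texttt{for(unsigned t = 0; t < T; t++)\{} \\
\texttt{\ \ \ \ // the recurrence: h[t] is a function of h[t-1]} \\
\texttt{\ \ \ \ XTensor prev = (t > 0) ? h[t-1] : h0;} \\
\texttt{\ \ \ \ h[t] = TanH(MMul(x[t], U) + MMul(prev, W));} \\
\texttt{\ \ \ \ // a distribution over the vocabulary at step t} \\
\texttt{\ \ \ \ y[t] = Softmax(MMul(h[t], V));} \\
\texttt{\}}
\end{tabbing}
}

\end{frame}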

%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的“记忆”
\begin{frame}{循环神经网络的``记忆''}
\begin{itemize}
\item In principle, a recurrent neural network can carry information from the entire history, which makes it well suited to variable-length sequences, e.g., natural language sentences
    \begin{itemize}
    \item Note: $\textbf{h}_{t-1}$ is passed on to subsequent states
    \end{itemize}
\end{itemize}

\vspace{-1em}
\begin{eqnarray}
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \nonumber \\
\visible<2->{
\textbf{h}_{t+1} & = & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textbf{h}_{t} \textbf{W}) \nonumber \\
                 & = & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \textbf{W}) \nonumber \\
                 }
\visible<3->{
\textbf{h}_{t+2} & = & \textrm{TanH}(\textbf{x}_{t+2} \textbf{U} + \textbf{h}_{t+1} \textbf{W}) \nonumber \\
                 & = & \textrm{TanH}(\textbf{x}_{t+2} \textbf{U} + \nonumber \\
                 &   & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \textbf{W}) \textbf{W}) \nonumber
                 }
\end{eqnarray}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\node [anchor=west,rnnnode] (node1) at (0,0) {\scriptsize{RNN Cell}};
\visible<2->{
\node [anchor=west,rnnnode] (node2) at ([xshift=4.5em]node1.east) {\scriptsize{RNN Cell}};
}
\visible<3->{
\node [anchor=west,rnnnode] (node3) at ([xshift=4.5em]node2.east) {\scriptsize{RNN Cell}};
}
\node [anchor=north] (x1) at ([yshift=-1em]node1.south) {\footnotesize{$\textbf{x}_{t}$}};
\visible<2->{
\node [anchor=north] (x2) at ([yshift=-1em]node2.south) {\footnotesize{$\textbf{x}_{t+1}$}};
}
\visible<3->{
\node [anchor=north] (x3) at ([yshift=-1em]node3.south) {\footnotesize{$\textbf{x}_{t+2}$}};
}
\node [anchor=south] (h1) at ([yshift=1em]node1.north) {\footnotesize{$\textbf{h}_{t}$}};
\visible<2->{
\node [anchor=south] (h2) at ([yshift=1em]node2.north) {\footnotesize{$\textbf{h}_{t+1}$}};
}
\visible<3->{
\node [anchor=south] (h3) at ([yshift=1em]node3.north) {\footnotesize{$\textbf{h}_{t+2}$}};
}
\draw [->,thick] ([xshift=-1.0em]node1.west)--([xshift=-0.1em]node1.west) node [pos=0,left] {\scriptsize{$\alert{\textbf{h}_{t-1}}$}};
\visible<3->{
\draw [->,thick] ([xshift=0.1em]node3.east)--([xshift=1.0em]node3.east) node [pos=1,right] {\scriptsize{$\textbf{h}_{t+2}$}};
}
\draw [->,thick] ([xshift=0.1em]node1.east)--([xshift=-0.1em]node2.west) node [pos=0.5,above] {\tiny{$\textbf{h}_{t}(\alert{\textbf{h}_{t-1}})$}};
\visible<2->{
\draw [->,thick] ([xshift=0.1em]node2.east)--([xshift=-0.1em]node3.west) node [pos=0.5,above] {\tiny{$\textbf{h}_{t+1}(\textbf{h}_{t}(\alert{\textbf{h}_{t-1}}))$}};
}
\draw [->,thick] (x1.north)--([yshift=-0.1em]node1.south);
\visible<2->{
\draw [->,thick] (x2.north)--([yshift=-0.1em]node2.south);
}
\visible<3->{
\draw [->,thick] (x3.north)--([yshift=-0.1em]node3.south);
}
\draw [->,thick] ([yshift=0.1em]node1.north)--(h1.south);
\visible<2->{
\draw [->,thick] ([yshift=0.1em]node2.north)--(h2.south);
}
\visible<3->{
\draw [->,thick] ([yshift=0.1em]node3.north)--(h3.south);
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 基于循环神经网络的语言模型
\begin{frame}{基于循环神经网络的语言模型(RNN LM)}
\begin{itemize}
\item 循环神经网络可以被直接用于语言模型
    \begin{itemize}
    \item<2-> 与FNN LM类似,首先把词从one-hot表示转换成分布式表示
    \item<3-> $t$时刻预测$\textrm{P}(x_{t+1}|x_1...x_{t})$
    \item<4-> 可以叠加更多的层
    \end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\visible<3->{
\node [anchor=west,rnnnode] (node11) at (0,0) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\scriptsize{RNN Cell}};
}
\node [anchor=north,rnnnode,fill=red!30!white] (e1) at ([yshift=-1.2em]node11.south) {\tiny{$e_1=w_1\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e2) at ([yshift=-1.2em]node12.south) {\tiny{$e_2=w_2\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e3) at ([yshift=-1.2em]node13.south) {\tiny{$e_3=w_3\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e4) at ([yshift=-1.2em]node14.south) {\tiny{$e_4=w_4\textbf{C}$}};
\node [anchor=north] (w1) at ([yshift=-1em]e1.south) {\footnotesize{$w_1$}};
\node [anchor=north] (w2) at ([yshift=-1em]e2.south) {\footnotesize{$w_2$}};
\node [anchor=north] (w3) at ([yshift=-1em]e3.south) {\footnotesize{$w_3$}};
\node [anchor=north] (w4) at ([yshift=-1em]e4.south) {\footnotesize{$w_4$}};

\draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
\draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
\draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
\draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);

\draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
\draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
\draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
\draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);

\visible<4->{
\node [anchor=south,rnnnode] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{RNN Cell}};

\node [anchor=south,rnnnode,fill=blue!30!white] (node31) at ([yshift=1.5em]node21.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node32) at ([yshift=1.5em]node22.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node33) at ([yshift=1.5em]node23.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node34) at ([yshift=1.5em]node24.north) {\scriptsize{Softmax($\cdot$)}};
}

\visible<3>{
\node [anchor=south,rnnnode,fill=blue!30!white] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{Softmax($\cdot$)}};

\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_5|w_2 w_3 w_4)$}};
}

\visible<4->{
\draw [->,thick] ([yshift=0.1em]node31.north)--([yshift=1em]node31.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
\draw [->,thick] ([yshift=0.1em]node32.north)--([yshift=1em]node32.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
\draw [->,thick] ([yshift=0.1em]node33.north)--([yshift=1em]node33.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
\draw [->,thick] ([yshift=0.1em]node34.north)--([yshift=1em]node34.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_5|w_2 w_3 w_4)$}};

\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south);
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south);
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south);
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south);

\draw [->,thick] ([xshift=-1em]node21.west)--([xshift=-0.1em]node21.west);
\draw [->,thick] ([xshift=0.1em]node21.east)--([xshift=-0.1em]node22.west);
\draw [->,thick] ([xshift=0.1em]node22.east)--([xshift=-0.1em]node23.west);
\draw [->,thick] ([xshift=0.1em]node23.east)--([xshift=-0.1em]node24.west);
\draw [->,thick] ([xshift=0.1em]node24.east)--([xshift=1em]node24.east);
}

\visible<3->{
\draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
\draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
\draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
\draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);

\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 循环单元的设计、梯度消失、训练等问题
\begin{frame}{进一步的问题}
\begin{itemize}
\item \textbf{Designing the recurrent unit}: a recurrent unit is simply a function that reads the current input and the previous state and produces the current state
    \begin{displaymath}
    \textbf{h}_t = g(\textbf{x}_t, \textbf{h}_{t-1}; \theta)
    \end{displaymath}
    There are many ways to design $g(\cdot)$, e.g., the well-known LSTM and GRU
\item<2-> \textbf{Vanishing/exploding gradients}: as sequences grow longer, back-propagation through the RNN multiplies more and more local gradients, which leads to the \alert{vanishing/exploding gradient problem}
    \begin{displaymath}
    \underbrace{0.2 \times 0.3 \times ... \times 0.2 \times 0.1}_{\text{100 terms}} \approx 0
    \end{displaymath}
    \vspace{-0.8em}
    \begin{itemize}
    \item Gradient clipping can bound the magnitude of the gradient (a sketch follows on the next slide)
    \item Short-cut connections, as in residual networks, also help
    \end{itemize}
\item<2-> \textbf{Training}: with automatic differentiation, this is no longer a big deal :)
\end{itemize}
\end{frame}
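
%%%------------------------------------------------------------------------------------------------------------
%%% a sketch of gradient clipping (added example)
\begin{frame}{Gradient clipping: a sketch}
\begin{itemize}
\item The clipping trick from the previous slide in plain C++: if the L2 norm of the gradient exceeds a threshold, rescale the gradient so that its norm equals the threshold. A minimal sketch over a flat vector of gradient values; real toolkits typically clip whole parameter groups
\end{itemize}

{\scriptsize
\begin{tabbing}
\texttt{\#include <cmath>} \\
\texttt{\#include <vector>} \\
\texttt{} \\
\texttt{// rescale grad so that its L2 norm is at most threshold} \\
\texttt{void ClipGradient(std::vector<double> \& grad, double threshold)} \\
\texttt{\{} \\
\texttt{\ \ \ \ double norm = 0;} \\
\texttt{\ \ \ \ for(double g : grad)} \\
\texttt{\ \ \ \ \ \ \ \ norm += g * g;} \\
\texttt{\ \ \ \ norm = std::sqrt(norm);} \\
\texttt{\ \ \ \ if(norm > threshold)} \\
\texttt{\ \ \ \ \ \ \ \ for(double \& g : grad)} \\
\texttt{\ \ \ \ \ \ \ \ \ \ \ \ g *= threshold / norm;} \\
\texttt{\}}
\end{tabbing}
}

\end{frame}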

%%%------------------------------------------------------------------------------------------------------------
%%% 自注意力机制
\begin{frame}{自注意力机制(Self-Attention)}

\begin{itemize}
\item RNN LMs work well, but when a sequence gets long, the path along which information passes between distant words gets long too, which easily causes vanishing or exploding gradients.
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w1.north).. controls +(130:0.5) and +(50:0.5) .. (w0.north);
\draw [->,thick,red] (w2.north).. controls +(130:0.5) and +(50:0.5) .. (w1.north);
\draw [->,thick,red] ([yshift=0.2em]w3.north).. controls +(130:0.5) and +(50:0.5) .. (w2.north);
\draw [->,thick,red] (w4.north).. controls +(130:0.5) and +(50:0.5) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};


\end{scope}

\end{tikzpicture}
\end{center}
\item<2-> Can the distance over which information travels between any two positions be reduced to 1?


\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,-2) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w5.north).. controls +(100:0.8) and +(50:0.8) .. (w0.north);
\draw [->,thick,red] (w5.north).. controls +(110:0.7) and +(50:0.7) .. (w1.north);
\draw [->,thick,red] (w5.north).. controls +(120:0.6) and +(50:0.6) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};


\end{scope}

\end{tikzpicture}
\end{center}
\item<3-> \textbf{Self-attention} handles long-distance dependencies well and has achieved strong results on long-range language modeling
	\begin{itemize}
	\item It represents the complex relations between different positions of a sequence more fully
	\item Training can be parallelized, improving efficiency
	\end{itemize}
	
        \textbf{Attention Is All You Need}\\
        \textbf{Vaswani et al., 2017, In Proc. of Neural Information Processing Systems, 6000-6010}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Transformer architecture
\begin{frame}{Transformer语言模型(Vaswani et al., 2017)}
\begin{itemize}
\item 一个简单的例子
\end{itemize}

\vspace{-2em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\footnotesize{$w_{0}$}};
\node [anchor=west] (w1) at ([xshift=5em]w0.east) {\footnotesize{$w_{1}$}};
\node [anchor=west] (w2) at ([xshift=5em]w1.east) {\footnotesize{$w_{2}$}};
\node [anchor=west] (w3) at ([xshift=5em]w2.east) {\footnotesize{$w_{3}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny(index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny(index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny(index)};
\node [anchor=north] (index3) at ([yshift=0.5em]w3.south) {\tiny(index)};
\node [anchor=south,draw,inner sep=3pt] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{0} \textbf{C} + \textrm{PE}(0)$}};
\node [anchor=south,draw,inner sep=3pt] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{1} \textbf{C} + \textrm{PE}(1)$}};
\node [anchor=south,draw,inner sep=3pt] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{2} \textbf{C} + \textrm{PE}(2)$}};
\node [anchor=south,draw,inner sep=3pt] (e3) at ([yshift=1em]w3.north) {\tiny{$\textbf{e}_3=w_{3} \textbf{C} + \textrm{PE}(3)$}};

\node [anchor=south,draw,inner sep=3pt] (h0) at ([xshift=-0.5em, yshift=1.5em]e0.north) {\tiny{$\textbf{h}_{0}=\textrm{SelfAtt}(\textbf{e}_0,\textbf{e}_3)$}};
\node [anchor=south,draw,inner sep=3pt] (h1) at ([xshift=0.5em, yshift=1.5em]e1.north) {\tiny{$\textbf{h}_{1}=\textrm{SelfAtt}(\textbf{e}_1,\textbf{e}_3)$}};
\node [anchor=south,draw,inner sep=3pt] (h2) at ([xshift=1.5em, yshift=1.5em]e2.north) {\tiny{$\textbf{h}_{2}=\textrm{SelfAtt}(\textbf{e}_2,\textbf{e}_3)$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (f1) at ([xshift=0.5em, yshift=1.5em]h2.north) {\tiny{$\textbf{f}_3=\textrm{FNN}([\textbf{h}_0,\textbf{h}_1,\textbf{h}_2,\textbf{e}_3])$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (o1) at ([yshift=1em]f1.north) {\tiny{$\textbf{y}=\textrm{Softmax}(f_3 \textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]o1.north) {\footnotesize{$\textrm{P}(w_4|w_{0}w_{1}w_{2}w_{3})$}};

\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]w3.north) -- ([yshift=-0.1em]e3.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=0em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([xshift=-0.5em,yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=-1em,yshift=-0.1em]h2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([xshift=1em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([xshift=1em,yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([xshift=1em,yshift=-0.1em]h2.south);

\draw [->] ([yshift=0.1em]h0.north) -- ([xshift=-2em,yshift=-0.1em]f1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([xshift=2em,yshift=-0.1em]f1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([xshift=-1em,yshift=-0.1em]f1.south);
\draw [->] ([yshift=0.1em]h2.north) -- ([xshift=0em,yshift=-0.1em]f1.south);
\draw [->] ([yshift=0.1em]f1.north) -- ([yshift=-0.1em]o1.south);
\draw [->] ([yshift=0.1em]o1.north) -- ([yshift=-0.1em]ylabel.south);

\visible<2->{
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{0} \textbf{C} + \textrm{PE}(0)$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{1} \textbf{C} + \textrm{PE}(1)$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{2} \textbf{C} + \textrm{PE}(2)$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e3) at ([yshift=1em]w3.north) {\tiny{$\textbf{e}_3=w_{3} \textbf{C} + \textrm{PE}(3)$}};
}

\visible<2->{
\node [anchor=west] (embedinglabel0) at ([xshift=-5em,yshift=-2em]w0.south) {\scriptsize{{\blue \textbf{词的分布式表示}}}};
\node [anchor=north west] (embedinglabel1) at ([yshift=0.3em]embedinglabel0.south west) {\scriptsize{前面已经介绍过!}};
\node [anchor=north west] (embedinglabel2) at ([yshift=0.3em]embedinglabel1.south west) {\scriptsize{基于One-hot表示获得}};
\node [anchor=north west] (embedinglabel3) at ([yshift=0.3em]embedinglabel2.south west) {\scriptsize{新加入位置向量PE}};
}

\visible<3->{
\node [anchor=south,draw,inner sep=3pt,fill=ugreen!20!white] (h0) at ([xshift=-0.5em, yshift=1.5em]e0.north) {\tiny{$\textbf{h}_{0}=\textrm{SelfAtt}(\textbf{e}_0,\textbf{e}_3)$}};
\node [anchor=south,draw,inner sep=3pt,fill=ugreen!20!white] (h1) at ([xshift=0.5em, yshift=1.5em]e1.north) {\tiny{$\textbf{h}_{1}=\textrm{SelfAtt}(\textbf{e}_1,\textbf{e}_3)$}};
\node [anchor=south,draw,inner sep=3pt,fill=ugreen!20!white] (h2) at ([xshift=1.5em, yshift=1.5em]e2.north) {\tiny{$\textbf{h}_{2}=\textrm{SelfAtt}(\textbf{e}_2,\textbf{e}_3)$}};
}

\visible<3->{
\node [anchor=west] (selfattlabel0) at ([xshift=3em]embedinglabel0.east) {\scriptsize{{\color{ugreen} \textbf{自注意力机制}}}};
\node [anchor=west] (selfattlabel1) at ([yshift=-0.3em]selfattlabel0.south west) {\scriptsize{计算词汇之间的相关度}};
\node [anchor=west] (selfattlabel2) at ([yshift=-0.3em]selfattlabel1.south west) {\scriptsize{多头自注意力机制}};
\node [anchor=west] (directlabel0) at ([yshift=-0.3em]selfattlabel2.south west) {\scriptsize{\alert{\textbf{后面将会介绍}}}};
}

\visible<4->{
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (f1) at ([xshift=0.5em, yshift=1.5em]h2.north) {\tiny{$\textbf{f}_3=\textrm{FNN}([\textbf{h}_0,\textbf{h}_1,\textbf{h}_2,\textbf{e}_3])$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (o1) at ([yshift=1em]f1.north) {\tiny{$\textbf{y}=\textrm{Softmax}(f_3 \textbf{U})$}};
}

\visible<4->{
\node [anchor=west] (ffnlabel0) at ([xshift=3em]selfattlabel0.east) {\scriptsize{{\color{orange} \textbf{前馈神经网络和输出层}}}};
\node [anchor=west] (ffnlabel1) at ([yshift=-0.3em]ffnlabel0.south west) {\scriptsize{双层全连接网络}};
\node [anchor=west] (ffnlabel2) at ([yshift=-0.3em]ffnlabel1.south west) {\scriptsize{激活函数为Relu}};
\node [anchor=west] (ffnlabel3) at ([yshift=-0.3em]ffnlabel2.south west) {\scriptsize{最后通过Softmax输出}};
}


\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Transformer architecture
\begin{frame}{Transformer语言模型(Vaswani et al., 2017)}
\begin{itemize}
\item 多头注意力机制
\end{itemize}

\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear0) at (0,0) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\tiny{Linear}};
\node [anchor=north] (Q) at ([xshift=0em,yshift=-1em]Linear02.south) {\footnotesize{$Q$}};

\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear1) at ([xshift=1.5em]Linear0.east) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\tiny{Linear}};
\node [anchor=north] (K) at ([xshift=0em,yshift=-1em]Linear12.south) {\footnotesize{$K$}};

\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear2) at ([xshift=1.5em]Linear1.east) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\tiny{Linear}};
\node [anchor=north] (V) at ([xshift=0em,yshift=-1em]Linear22.south) {\footnotesize{$V$}};

\node [anchor=south,draw=black!30,minimum width=9em,inner sep=4pt,fill=blue!20!white] (Scale) at ([yshift=1em]Linear1.north) {\tiny{Scaled Dot-Product Attention}};
\node [anchor=south west,draw=black!50,minimum width=9em,fill=blue!20!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\tiny{Scaled Dot-Product Attention}};
\node [anchor=south west,fill=blue!20!white,draw,minimum width=9em,inner sep=4pt] (Scale2) at ([shift={(-0.2em,-0.2em)}]Scale1.south west) {\tiny{Scaled Dot-Product Attention}};

\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=yellow!30] (Concat) at ([yshift=1em]Scale2.north) {\tiny{Concat}};

\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=ugreen!20!white] (Linear) at ([yshift=1em]Concat.north) {\tiny{Linear}};


\draw [->] ([yshift=0.1em]Q.north) -- ([yshift=-0.1em]Linear02.south);
\draw [-,draw=black!50] ([yshift=0.1em]Q.north) -- ([xshift=0.2em,yshift=-0.1em]Linear02.south);
\draw [-,draw=black!30] ([yshift=0.1em]Q.north) -- ([xshift=0.4em,yshift=-0.1em]Linear02.south);

\draw [->] ([yshift=0.1em]K.north) -- ([yshift=-0.1em]Linear12.south);
\draw [-,draw=black!50] ([yshift=0.1em]K.north) -- ([xshift=0.2em,yshift=-0.1em]Linear12.south);
\draw [-,draw=black!30] ([yshift=0.1em]K.north) -- ([xshift=0.4em,yshift=-0.1em]Linear12.south);

\draw [->] ([yshift=0.1em]V.north) -- ([yshift=-0.1em]Linear22.south);
\draw [-,draw=black!50] ([yshift=0.1em]V.north) -- ([xshift=0.2em,yshift=-0.1em]Linear22.south);
\draw [-,draw=black!30] ([yshift=0.1em]V.north) -- ([xshift=0.4em,yshift=-0.1em]Linear22.south);

\draw [->] ([yshift=0em]Linear02.north) -- ([yshift=1em]Linear02.north);
\draw [-,draw=black!50] ([yshift=0em]Linear01.north) -- ([yshift=0.8em]Linear01.north);
\draw [-,draw=black!30] ([yshift=0em]Linear0.north) -- ([yshift=0.6em]Linear0.north);

\draw [->] ([yshift=0em]Linear12.north) -- ([yshift=1em]Linear12.north);
\draw [-,draw=black!50] ([yshift=0em]Linear11.north) -- ([yshift=0.8em]Linear11.north);
\draw [-,draw=black!30] ([yshift=0em]Linear1.north) -- ([yshift=0.6em]Linear1.north);

\draw [->] ([yshift=0em]Linear22.north) -- ([yshift=1em]Linear22.north);
\draw [-,draw=black!50] ([yshift=0em]Linear21.north) -- ([yshift=0.8em]Linear21.north);
\draw [-,draw=black!30] ([yshift=0em]Linear2.north) -- ([yshift=0.6em]Linear2.north);

\draw [->] ([yshift=0em]Scale2.north) -- ([yshift=0em]Concat.south);
\draw [-,draw=black!50] ([yshift=0em]Scale1.north) -- ([yshift=0.8em]Scale1.north);
\draw [-,draw=black!30] ([yshift=0em]Scale.north) -- ([yshift=0.6em]Scale.north);

\draw [->] ([yshift=0em]Concat.north) -- ([yshift=0em]Linear.south);
\draw [->] ([yshift=0em]Linear.north) -- ([yshift=1em]Linear.north);

\node [anchor=west] (Multiheadlabel0) at ([xshift=-5em,yshift=-1.2em]Q.south) {\scriptsize{{\blue \textbf{多头注意力}}}};
\node [anchor=north west] (Multiheadlabel1) at ([yshift=0em]Multiheadlabel0.south west) {\scriptsize{$\textrm{MultiHead}(Q,K,V)=\textrm{Concat}(\textrm{head}_1,...,\textrm{head}_n)W^O$}};
\node [anchor=north west] (Multiheadlabel2) at ([yshift=0.2em]Multiheadlabel1.south west) {\scriptsize{把输入压缩成多个维度较小的输出,分别做自注意力}};
\node [anchor=north west] (Multiheadlabel3) at ([yshift=0.2em]Multiheadlabel2.south west) {\scriptsize{再把结果级联,经过线性变换得到最终输出}};


\visible<2->{
\node [anchor=south west,fill=white,draw,inner sep=4pt,minimum width=3.5em,fill=blue!20!white] (MatMul) at ([xshift=8em]Linear22.south west) {\tiny{MatMul}};
\node [anchor=north] (Q1) at ([xshift=-1em,yshift=-1em]MatMul.south) {\footnotesize{$Q$}};
\node [anchor=north] (K1) at ([xshift=1em,yshift=-1em]MatMul.south) {\footnotesize{$K$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30] (Scale3) at ([yshift=1em]MatMul.north) {\tiny{Scale}};
\node [anchor=south,draw,inner sep=4pt,fill=purple!20,minimum width=3.5em] (Mask) at ([yshift=0.8em]Scale3.north) {\tiny{Mask(opt.)}};
\node [anchor=south,draw,inner sep=4pt,fill=ugreen!20!white] (SoftMax) at ([yshift=1em]Mask.north) {\tiny{SoftMax}};
\node [anchor=south,draw,minimum width=3.5em,inner sep=4pt,fill=blue!20!white] (MatMul1) at ([xshift=1.5em,yshift=1em]SoftMax.north) {\tiny{MatMul}};
\node [anchor=north] (V1) at ([xshift=2em]K1.north) {\footnotesize{$V$}};
\node [anchor=north] (null) at ([yshift=0.8em]MatMul1.north) {};

\node [rectangle,draw, densely dashed,inner sep=0.4em] [fit = (MatMul) (MatMul1) (Q1) (K1) (V1) (null)] (inputshadow) {};

\draw [->] ([yshift=0.1em]Q1.north) -- ([xshift=-1em,yshift=-0.1em]MatMul.south);
\draw [->] ([yshift=0.1em]K1.north) -- ([xshift=1em,yshift=-0.1em]MatMul.south);
\draw [->] ([yshift=0.1em]MatMul.north) -- ([yshift=-0.1em]Scale3.south);
\draw [->] ([yshift=0.1em]Scale3.north) -- ([yshift=-0.1em]Mask.south);
\draw [->] ([yshift=0.1em]Mask.north) -- ([yshift=-0.1em]SoftMax.south);
\draw [->] ([yshift=0.1em]SoftMax.north) -- ([yshift=0.9em]SoftMax.north);
\draw [->] ([yshift=0.1em]V1.north) -- ([yshift=9.1em]V1.north);
\draw [->] ([yshift=0.1em]MatMul1.north) -- ([yshift=0.8em]MatMul1.north);

\draw [->,dashed,red,thick] ([xshift=0.1em]Scale.east) .. controls +(east:1) and +(west:1) .. ([xshift=-0.1em,yshift=1em]inputshadow.west);

\node [anchor=west] (Attentionlabel0) at ([xshift=-2em,yshift=-1.2em]Q1.south) {\scriptsize{{\color{ugreen} \textbf{基于点乘的自注意力}}}};
\node [anchor=north west] (Attentionlabel1) at ([yshift=0.3em]Attentionlabel0.south west) {\scriptsize{$\textrm{head}_i=\textrm{Softmax}(\frac{QK^{T}}{\sqrt{d_k}})V$}};
\node [anchor=north west] (Attentionlabel2) at ([yshift=0.6em]Attentionlabel1.south west) {\scriptsize{计算得到位置向量的加权和}};
\node [anchor=north west] (Attentionlabel3) at ([yshift=0.2em]Attentionlabel2.south west) {\scriptsize{Q,K,V都是相同的}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}
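
%%%------------------------------------------------------------------------------------------------------------
%%% a sketch of scaled dot-product attention (added example)
\begin{frame}{Scaled dot-product attention: a sketch}
\begin{itemize}
\item One attention head, $\textrm{head}_i=\textrm{Softmax}(\frac{QK^{T}}{\sqrt{d_k}})V$, written in the XTensor-style pseudo-code of the earlier slides. \texttt{Transpose} and the division of a tensor by a scalar are assumed operations here, not necessarily the real NiuTensor API
\end{itemize}

{\scriptsize
\begin{tabbing}
\texttt{XTensor Attention(XTensor \&Q, XTensor \&K, XTensor \&V, float dk)} \\
\texttt{\{} \\
\texttt{\ \ \ \ // similarity of every query to every key, scaled by sqrt(dk)} \\
\texttt{\ \ \ \ XTensor scores = MMul(Q, Transpose(K)) / sqrt(dk);} \\
\texttt{\ \ \ \ // attention weights: each row is a distribution over positions} \\
\texttt{\ \ \ \ XTensor alpha = Softmax(scores);} \\
\texttt{\ \ \ \ // the output is a weighted sum of the value vectors} \\
\texttt{\ \ \ \ return MMul(alpha, V);} \\
\texttt{\}}
\end{tabbing}
}

\end{frame}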

%%%------------------------------------------------------------------------------------------------------------
%%% evaluation
\begin{frame}{语言模型评价}
\begin{itemize}
\item The standard metric for language models: perplexity (PPL)
\begin{itemize}
\item It measures how well a language model predicts a language sample
\item The lower the perplexity, the better the model
\end{itemize}
\vspace{0.5em}
\begin{displaymath}
\textrm{PPL}(w_1 ... w_m)=\textrm{P}(w_1 ... w_m)^{-1/m}
\end{displaymath}
\vspace{-0.5em}
\item<2-> Results on the Penn Treebank (PTB)
\end{itemize}
\vspace{0.0em}
\visible<2->{
\begin{tabular}{l | l | l | r}
Model & Authors & Year & PPL \\ \hline
FNN LM & Bengio et al. & 2003 & 162.2 \\
RNN LM & Mikolov et al. & 2010 & 124.7 \\
RNN-LDA LM & Mikolov et al. & 2012 & 92.0 \\
RNN(LSTM) LM & Zaremba et al. & 2014 & 78.4 \\
RHN & Zilly et al. & 2016 & 65.4 \\
RNN(AWD-LSTM) LM & Merity et al. & 2018 & 58.8 \\
GPT-2 (Transformer) & Radford et al. & 2019 & 35.7
\end{tabular}
}

\end{frame}
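
%%%------------------------------------------------------------------------------------------------------------
%%% a sketch of perplexity computation (added example)
\begin{frame}{Computing perplexity: a sketch}
\begin{itemize}
\item Since $\textrm{PPL}(w_1 ... w_m)=\textrm{P}(w_1 ... w_m)^{-1/m}$ and the joint probability underflows for long texts, PPL is computed in log space: $\textrm{PPL}=\exp(-\frac{1}{m}\sum_{i}\log \textrm{P}(w_i|w_1 ... w_{i-1}))$. A minimal sketch in plain C++; \texttt{probs[i]} is assumed to hold the model probability of the $i$-th word given its history
\end{itemize}

{\scriptsize
\begin{tabbing}
\texttt{\#include <cmath>} \\
\texttt{\#include <vector>} \\
\texttt{} \\
\texttt{double Perplexity(const std::vector<double> \& probs)} \\
\texttt{\{} \\
\texttt{\ \ \ \ double logSum = 0;} \\
\texttt{\ \ \ \ for(double p : probs)} \\
\texttt{\ \ \ \ \ \ \ \ logSum += std::log(p); // log avoids underflow} \\
\texttt{\ \ \ \ return std::exp(-logSum / probs.size());} \\
\texttt{\}}
\end{tabbing}
}

\end{frame}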

%%%------------------------------------------------------------------------------------------------------------
\subsection{词嵌入}

%%%------------------------------------------------------------------------------------------------------------
%%% 词的one-hot和distributed表示
\begin{frame}{单词的表示}
\begin{itemize}
\item 如何表示一个单词?
    \begin{itemize}
    \item \textbf{One-hot}: 假如有一个词典$V$,里面包含10k个单词,并进行编号。每个单词都可以表示为10k维的one-hot向量,仅在编号那个维度为1,其它为0
    \item<2-> \textbf{Distributed}: 类似于神经语言模型,每个单词可以被表示为一个实数向量,每一维都对应一种``属性'' - \alert{词嵌入}
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=north west] (o1) at (0,0) {\footnotesize{$\begin{bmatrix} 0 \\ 1 \\ 0 \\ 0 \\ 0 \\ ... \\ 0 \end{bmatrix}$}};
\node [anchor=north west] (o2) at ([xshift=1em]o1.north east) {\footnotesize{$\begin{bmatrix} 0 \\ 0 \\ 0 \\ 1 \\ 0 \\ ... \\ 0 \end{bmatrix}$}};
\node [anchor=north east] (v) at ([xshift=-0em]o1.north west) {\footnotesize{$\begin{matrix} \textrm{\ \ \ \ \ }_1 \\ \textrm{\ \ 桌子}_2 \\ \textrm{\ \ \ \ \ }_3 \\ \textrm{\ \ 椅子}_4 \\ \textrm{\ \ 我们}_5 \\ ... \\ \textrm{你好}_{10k} \end{matrix}$}};
\node [anchor=south] (w1) at (o1.north) {\footnotesize{桌子}};
\node [anchor=south] (w2) at (o2.north) {\footnotesize{椅子}};
\node [anchor=north] (label) at (o1.south) {\footnotesize{单词的one-hot表示}};
\visible<3->{
\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$\textrm{cosine}(\textrm{`桌子'},\textrm{`椅子'})=0$}};
}
\end{scope}

\visible<2->{
\begin{scope}[xshift=2in]
\node [anchor=north west] (o1) at (0,0) {\footnotesize{$\begin{bmatrix} .1 \\ -1 \\ 2 \\ ... \\ 0 \end{bmatrix}$}};
\node [anchor=north west] (o2) at ([xshift=1em]o1.north east) {\footnotesize{$\begin{bmatrix} 1 \\ 2 \\ .2 \\ ... \\ -1 \end{bmatrix}$}};
\node [anchor=north east] (v) at ([xshift=-0em]o1.north west) {\footnotesize{$\begin{matrix} \textrm{\ \ \ 属性}_1 \\ \textrm{\ \ \ 属性}_2 \\ \textrm{\ \ \ 属性}_3 \\ ... \\ \textrm{属性}_{512} \end{matrix}$}};
\node [anchor=south] (w1) at (o1.north) {\footnotesize{桌子}};
\node [anchor=south] (w2) at (o2.north) {\footnotesize{椅子}};
\node [anchor=north] (label) at ([yshift=-2em]o1.south) {\footnotesize{单词的分布式表示(词嵌入)}};
\visible<3->{
\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$\textrm{cosine}(\textrm{`桌子'},\textrm{`椅子'})=0.5$}};
}
\end{scope}
}
\end{tikzpicture}
\end{center}

\end{frame}
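
%%%------------------------------------------------------------------------------------------------------------
%%% a sketch of cosine similarity (added example)
\begin{frame}{Cosine similarity: a sketch}
\begin{itemize}
\item The similarity scores on the previous slide use $\textrm{cosine}(\textbf{a},\textbf{b})=\frac{\textbf{a} \cdot \textbf{b}}{|\textbf{a}||\textbf{b}|}$. For two different one-hot vectors the dot product is always 0; for distributed representations it reflects how much the ``attributes'' agree. A minimal sketch in plain C++
\end{itemize}

{\scriptsize
\begin{tabbing}
\texttt{\#include <cmath>} \\
\texttt{\#include <vector>} \\
\texttt{} \\
\texttt{double Cosine(const std::vector<double> \& a,} \\
\texttt{\ \ \ \ \ \ \ \ \ \ \ \ \ \ const std::vector<double> \& b)} \\
\texttt{\{} \\
\texttt{\ \ \ \ double dot = 0, na = 0, nb = 0;} \\
\texttt{\ \ \ \ for(size\_t i = 0; i < a.size(); i++)\{} \\
\texttt{\ \ \ \ \ \ \ \ dot += a[i] * b[i]; // inner product} \\
\texttt{\ \ \ \ \ \ \ \ na \ += a[i] * a[i]; // squared norm of a} \\
\texttt{\ \ \ \ \ \ \ \ nb \ += b[i] * b[i]; // squared norm of b} \\
\texttt{\ \ \ \ \}} \\
\texttt{\ \ \ \ return dot / (std::sqrt(na) * std::sqrt(nb));} \\
\texttt{\}}
\end{tabbing}
}

\end{frame}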

%%%------------------------------------------------------------------------------------------------------------
%%% 分布式表示的优点
\begin{frame}{为什么需要分布式表示?}
\begin{itemize}
\item \textbf{A natural question}: what does each dimension of a distributed representation mean?
    \begin{itemize}
    \item Each dimension can be thought of as an attribute, e.g., gender or height
    \item In practice, however, the model treats a dimension as a statistical ``depiction'' of things, a kind of statistical ``meaning'', rather than a hand-crafted attribute
    \end{itemize}
\item<2-> What do we gain from this?
    \begin{itemize}
    \item \alert{Similarity} between words is much easier to capture
    \item A continuous-space representation can describe things more accurately than all-or-nothing judgments
    \end{itemize}
\item<2-> Predicting the next word
    \begin{itemize}
    \item With distributed representations it is easy to tell that ``桌子'' (table) and ``椅子'' (chair) are similar
    \item Even if ``椅子'' never appeared in this pattern, the system can still make the prediction through its similarity to ``桌子''
    \end{itemize}
    \begin{tabular}{l | l}
    屋里 要 摆放 一个 \_\_\_\_\_ & 预测下个词 \\ \hline
    屋里 要 摆放 一个 \alert{桌子} & 见过 \\
    屋里 要 摆放 一个 \blue{椅子} & 没见过,但是仍然是合理预测
    \end{tabular}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 用实例理解词的分布式表示
\begin{frame}{分布式表示的可视化}
\begin{itemize}
\item \textbf{一个著名的例子}:国王 $\to$ 王后\\
    \begin{displaymath}
    \vv{\textrm{国王}} - \vv{\textrm{男人}} + \vv{\textrm{女人}} = \vv{\textrm{王后}}
    \end{displaymath}
    这里,$\vv{\textrm{word}}$表示单词的分布式向量表示
\item 更多的词的可视化:相似的词聚在一起
\end{itemize}
\begin{center}
\includegraphics[scale=0.4]{./Figures/word-graph.png}
\end{center}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经语言模型中的词嵌入
\begin{frame}{神经语言模型中的词嵌入}
\begin{itemize}
\item 在神经语言模型中,需要把词表示成它的分布式表示
    \begin{itemize}
    \item<2-> 其中$\textbf{C}$是词嵌入矩阵,每一行对应一个词的分布式表示
    \item<3-> $\textbf{C}$可以用语言模型训练,也可以利用其它模型训练,固定词嵌入,让语言模型专注长片段的学习
    \end{itemize}
\end{itemize}

\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,inner sep=2pt] (e) at (0,0) {\small{$e=w$}};
\node [anchor=west,inner sep=2pt] (c) at (e.east) {\small{$\textbf{C}$}};

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.4em,draw,fill=blue!20!white] [fit = (e) (c)] (box) {};
\end{pgfonlayer}

\draw [->,thick] ([yshift=-1em]box.south)--([yshift=-0.1em]box.south) node [pos=0,below] (bottom1) {\small{单词$w$}};
\draw [->,thick] ([yshift=0.1em]box.north)--([yshift=1em]box.north) node [pos=1,above] (top1) {\scriptsize{$e$=(8,.2,-1,.9,...,2.3)}};
\node [anchor=north] (bottom2) at ([yshift=0.3em]bottom1.south) {\scriptsize{$w$=(0,0,1,0,...,0)}};
\node [anchor=south] (top2) at ([yshift=-0.3em]top1.north) {\small{$w$的分布式表示}};

\visible<2->{
\node [anchor=north west,fill=red!20!white] (cmatrix) at ([xshift=3em,yshift=1.0em]c.north east) {\scriptsize{$\begin{pmatrix} 1 & .2 & -.2 & 8 & ... & 0 \\ .6 & .8 & -2 & 1 & ... & -.2 \\ 8 & .2 & -1 & .9 & ... & 2.3 \\ 1 & 1.2 & -.9 & 3 & ... & .2 \\ ... & ... & ... & ... & ... & ... \\ 1 & .3 & 3 & .9 & ... & 5.1 \end{pmatrix}$}};
\node [anchor=west,inner sep=2pt,fill=red!30!white] (c) at (e.east) {\small{$\textbf{C}$}};
\draw [<-,thick] (c.east) -- ([xshift=3em]c.east);
}

\visible<3->{
\node [anchor=south,draw,fill=green!20!white] (e2) at ([yshift=1.5em]cmatrix.north) {\scriptsize{外部词嵌入系统得到的$\textbf{C}$}};
\draw [->,very thick,dashed] (e2.south) -- (cmatrix.north);
}

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-1.0em}

\begin{itemize}
\item<4-> 词嵌入如何学习得到?
    \begin{itemize}
    \item 可以和语言模型的其它部分一起训练,不过速度较慢
    \item 也可以考虑使用效率更高的外部模型,如word2vec、GloVe等,这样可以使用更大规模的数据(下一页给出$e=w\textbf{C}$查表的示意代码)
    \end{itemize}
\end{itemize}

\end{frame}
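
%%%------------------------------------------------------------------------------------------------------------
%%% 示例:词嵌入查表
\begin{frame}[fragile]{示例:$e=w\textbf{C}$就是查表}
\begin{itemize}
\item one-hot向量$w$与词嵌入矩阵$\textbf{C}$相乘,等价于直接取出$\textbf{C}$中对应的行。下面代码中的矩阵规模与数值均为假设
\end{itemize}
{\footnotesize
\begin{verbatim}
import numpy as np

vocab, dim = 5, 4               # toy vocabulary size / embedding dim

C = np.random.rand(vocab, dim)  # hypothetical embedding matrix C

w = np.zeros(vocab)
w[2] = 1.0                      # one-hot vector for word id 2

e = w @ C                       # e = wC: the distributed representation
assert np.allclose(e, C[2])     # identical to reading out row 2 of C
print(e)
\end{verbatim}
}
\begin{itemize}
\item 实际实现通常直接按词的编号索引$\textbf{C}$的行,而不做完整的矩阵乘法
\end{itemize}
\end{frame}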

%%%------------------------------------------------------------------------------------------------------------
\subsection{句子表示模型及预训练}

%%%------------------------------------------------------------------------------------------------------------
%%% 词嵌入的问题
\begin{frame}{不仅如``词''}
\begin{itemize}
\item 词嵌入已经成为诸多NLP系统的标配,当然也衍生出各种花式玩法,甚至有``embed everything''的口号,但是词嵌入也有问题
    \begin{itemize}
    \item 每个词都对应唯一的向量表示,但是对于一词多义现象,词义需要通过上下文进行区分。一个著名的例子:
    \end{itemize}
    \vspace{0.3em}
    \hspace{6em} Jobs was the CEO of \alert{\underline{apple}}.\\
    \hspace{6em} He finally ate the \alert{\underline{apple}}.
\item<2-> 引入上下文信息
    \begin{itemize}
    \item 上述问题引发了新的思考:不能简单地考虑词的表示,应同时考虑其上下文信息
    \item 对于句子中的一个词(或者位置),同时表示词和上下文
    \end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (node1) at (0,0) {\footnotesize{Jobs was the CEO of}};
\node [anchor=west] (node2) at ([xshift=-0.2em,yshift=-0.05em]node1.east) {\footnotesize{\alert{\underline{apple}}}};
\node [anchor=west] (node3) at ([xshift=-0.2em,yshift=-0.1em]node2.east) {\footnotesize{.}};
\node [anchor=south,inner sep=2pt,minimum width=2.4em,fill=red!20!white] (node4) at ([yshift=1.5em]node2.north) {\scriptsize{}};
\node [anchor=north] (label) at ([xshift=1em]node1.south) {\scriptsize{\textbf{词表示模型}}};
\draw [->,thick] (node2.north) -- (node4.south);
\end{scope}
\begin{scope}[xshift=2in]
\node [anchor=west] (node1) at (0,0) {\footnotesize{Jobs was the CEO of}};
\node [anchor=west] (node2) at ([xshift=-0.2em,yshift=-0.05em]node1.east) {\footnotesize{\alert{\underline{apple}}}};
\node [anchor=west] (node3) at ([xshift=-0.2em,yshift=-0.1em]node2.east) {\footnotesize{.}};
\node [anchor=south,inner sep=2pt,minimum width=2.4em,fill=red!20!white] (node4) at ([yshift=1.5em]node2.north) {\scriptsize{}};
\node [anchor=south,inner sep=2pt,minimum width=2.4em,fill=blue!20!white] (node5) at (node4.north) {\scriptsize{上下文}};
\node [anchor=north] (label) at ([xshift=1em]node1.south) {\scriptsize{\textbf{词+上下文表示模型}}};
\draw [->,thick] (node2.north) -- (node4.south);
\draw [->] ([xshift=1em]node1.north west) .. controls +(north:1) and +(west:2) .. ([yshift=0.2em]node5.west);
\draw [->] ([xshift=3em]node1.north west) .. controls +(north:0.8) and +(west:1.5) .. ([yshift=-0.2em]node5.west);
\node [anchor=east] (morelines) at ([xshift=-1.5em]node4.west) {...};
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 上下文表示模型
\begin{frame}{表示更长的片段 - 上下文表示模型}
\begin{itemize}
\item 在语言模型中已经包含了每个位置的上下文表示信息
    \begin{itemize}
    \item 以RNN LM为例,位置$i$的隐层输出就是一种$w_1...w_i$的表示(下一页给出极简的RNN前向计算示意)
    \end{itemize}
\end{itemize}

\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\node [anchor=west,rnnnode] (node11) at (0,0) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\scriptsize{RNN Cell}};

\node [anchor=north,rnnnode,fill=red!30!white] (e1) at ([yshift=-1.2em]node11.south) {\scriptsize{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e2) at ([yshift=-1.2em]node12.south) {\scriptsize{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e3) at ([yshift=-1.2em]node13.south) {\scriptsize{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e4) at ([yshift=-1.2em]node14.south) {\scriptsize{embedding}};
\node [anchor=north] (w1) at ([yshift=-1em]e1.south) {\footnotesize{乔布斯}};
\node [anchor=north] (w2) at ([yshift=-1em]e2.south) {\footnotesize{任职}};
\node [anchor=north] (w3) at ([yshift=-1em]e3.south) {\footnotesize{于}};
\node [anchor=north] (w4) at ([yshift=-1em]e4.south) {\footnotesize{苹果}};

\draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
\draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
\draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
\draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);

\draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
\draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
\draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
\draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);

\node [anchor=south,rnnnode] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{RNN Cell}};

\node [anchor=south] (node31) at ([yshift=1.0em]node21.north) {\scriptsize{的表示}};
\node [anchor=south west] (node31new) at ([yshift=-0.3em]node31.north west) {\scriptsize{``乔布斯''}};
\node [anchor=south] (node32) at ([yshift=1.0em]node22.north) {\scriptsize{的表示\ \ \ }};
\node [anchor=south west] (node32new) at ([yshift=-0.3em]node32.north west) {\scriptsize{``乔布斯 任职''}};
\node [anchor=south] (node33) at ([yshift=1.0em]node23.north) {\scriptsize{的表示\ \ \ \ \ \ \ \ }};
\node [anchor=south west] (node33new) at ([yshift=-0.3em]node33.north west) {\scriptsize{``乔布斯 任职 于''}};
\node [anchor=south] (node34) at ([yshift=1.0em]node24.north) {\scriptsize{的表示\ \ \ \ \ \ \ \ }};
\node [anchor=south west] (node34new) at ([yshift=-0.3em]node34.north west) {\scriptsize{``乔布斯 任职 于 苹果''}};

\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south);
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south);
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south);
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south);

\draw [->,thick] ([xshift=-1em]node21.west)--([xshift=-0.1em]node21.west);
\draw [->,thick] ([xshift=0.1em]node21.east)--([xshift=-0.1em]node22.west);
\draw [->,thick] ([xshift=0.1em]node22.east)--([xshift=-0.1em]node23.west);
\draw [->,thick] ([xshift=0.1em]node23.east)--([xshift=-0.1em]node24.west);
\draw [->,thick] ([xshift=0.1em]node24.east)--([xshift=1em]node24.east);

\draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
\draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
\draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
\draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);

\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);

\visible<2->{
\node [anchor=south] (toplabel1) at ([yshift=2em,xshift=-2em]node32new.north) {\footnotesize{``苹果''的表示:}};
\node [anchor=west,fill=blue!20!white,minimum width=3em] (toplabel2) at (toplabel1.east) {\footnotesize{上下文}};
}
\visible<3->{
\node [anchor=west,fill=red!20!white,minimum width=3em] (toplabel3) at (toplabel2.east) {\footnotesize{}};
}

\begin{pgfonlayer}{background}
\visible<3->{
\node [rectangle,inner sep=2pt,draw,thick,dashed,red] [fit = (e4)] (r2) {};
\draw [->,thick,red] (r2.west) .. controls +(west:0.8) and +(south:2) .. ([xshift=1.3em]toplabel3.south);
}
\visible<2->{
\node [rectangle,inner sep=2pt,draw,thick,dashed,ublue,fill=white] [fit = (node33) (node33new)] (r1) {};
\draw [->,thick,ublue] ([xshift=-2em]r1.north) .. controls +(north:0.7) and +(south:0.7) .. ([xshift=-0.5em]toplabel2.south);
}
\end{pgfonlayer}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}
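
%%%------------------------------------------------------------------------------------------------------------
%%% 示例:RNN的上下文表示
\begin{frame}[fragile]{示例:RNN逐位置累积上下文}
\begin{itemize}
\item 以最简单的循环单元$h_i=\tanh(\textbf{W}h_{i-1}+\textbf{U}e_i)$为例,位置$i$的状态$h_i$可以看作$w_1...w_i$的表示。参数随机初始化,规模均为演示用的假设值
\end{itemize}
{\footnotesize
\begin{verbatim}
import numpy as np

def rnn_forward(embs, dim):
    # h_i = tanh(W h_{i-1} + U e_i): h_i summarizes w_1 ... w_i
    rng = np.random.default_rng(0)
    W = 0.1 * rng.normal(size=(dim, dim))
    U = 0.1 * rng.normal(size=(dim, embs.shape[1]))
    h, states = np.zeros(dim), []
    for e in embs:
        h = np.tanh(W @ h + U @ e)
        states.append(h)          # context representation at i
    return states

embs = np.random.default_rng(1).normal(size=(4, 8))  # 4 words
states = rnn_forward(embs, dim=8)
print(len(states))   # 4 states; the last covers the whole prefix
\end{verbatim}
}
\end{frame}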

%%%------------------------------------------------------------------------------------------------------------
%%% 更强大的表示模型 - ELMo
\begin{frame}{更强的表示模型 - ELMo}
\begin{itemize}
\item \textbf{ELMo}(Embeddings from Language Models)可以说是掀起了基于语言模型的预训练的热潮
    \begin{itemize}
    \item 仍然使用RNN结构,不过循环单元换成了LSTM
    \item 同时考虑自左向右和自右向左两个方向的建模,表示一个词左右两端的上下文
    \item 融合所有层的输出送给下游应用,提供更丰富的信息(下一页给出层融合的示意代码)
    \end{itemize}
\end{itemize}
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[scale=1.2]

\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm0) at (0,0) {\scriptsize{LSTM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm1) at ([xshift=1em]Lstm0.east) {\scriptsize{LSTM}};
\node [anchor=west,inner sep=4pt] (sep) at ([xshift=0.5em]Lstm1.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm2) at ([xshift=0.5em]sep.east) {\scriptsize{LSTM}};

\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white] (Lstm3) at ([yshift=1em]Lstm0.north) {\scriptsize{LSTM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm4) at ([xshift=1em]Lstm3.east) {\scriptsize{LSTM}};
\node [anchor=west,inner sep=4pt] (sep1) at ([xshift=0.5em]Lstm4.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm5) at ([xshift=0.5em]sep1.east) {\scriptsize{LSTM}};

\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Lstm0) (Lstm2) (Lstm3) (Lstm5)] (inputshadow) {};

\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([xshift=-2em,yshift=-1em]Lstm2.south) {\scriptsize{$\textbf{e}_1$}};
\node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([xshift=1em]e1.east) {\scriptsize{$\textbf{e}_2$}};
\node [anchor=west,inner sep=4pt] (sep5) at ([xshift=1em]e2.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([xshift=1em]sep5.east) {\scriptsize{$\textbf{e}_m$}};

\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([xshift=-2em,yshift=1em]Lstm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([xshift=1em]t1.east) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=west,inner sep=4pt] (sep6) at ([xshift=1em]t2.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([xshift=1em]sep6.east) {\scriptsize{$\textbf{h}_m$}};

\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm6) at ([xshift=1.5em]Lstm2.east) {\scriptsize{LSTM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm7) at ([xshift=1em]Lstm6.east) {\scriptsize{LSTM}};
\node [anchor=west,inner sep=4pt] (sep3) at ([xshift=0.5em]Lstm7.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm8) at ([xshift=0.5em]sep3.east) {\scriptsize{LSTM}};

\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white] (Lstm9) at ([yshift=1em]Lstm6.north) {\scriptsize{LSTM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm10) at ([xshift=1em]Lstm9.east) {\scriptsize{LSTM}};
\node [anchor=west,inner sep=4pt] (sep4) at ([xshift=0.5em]Lstm10.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm11) at ([xshift=0.5em]sep4.east) {\scriptsize{LSTM}};

\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Lstm6) (Lstm8) (Lstm9) (Lstm11)] (inputshadow) {};

\draw [->] ([xshift=0.1em]Lstm0.east) -- ([xshift=-0.1em]Lstm1.west);
\draw [->] ([xshift=0.1em]Lstm1.east) -- ([xshift=0.1em]sep.west);
\draw [->] ([xshift=-0.1em]sep.east) -- ([xshift=-0.1em]Lstm2.west);

\draw [->] ([xshift=0.1em]Lstm3.east) -- ([xshift=-0.1em]Lstm4.west);
\draw [->] ([xshift=0.1em]Lstm4.east) -- ([xshift=0.1em]sep1.west);
\draw [->] ([xshift=-0.1em]sep1.east) -- ([xshift=-0.1em]Lstm5.west);

\draw [->] ([yshift=0.1em]Lstm0.north) -- ([yshift=-0.1em]Lstm3.south);
\draw [->] ([yshift=0.1em]Lstm1.north) -- ([yshift=-0.1em]Lstm4.south);
\draw [->] ([yshift=0.1em]Lstm2.north) -- ([yshift=-0.1em]Lstm5.south);

\draw [->] ([xshift=0.1em]Lstm6.east) -- ([xshift=-0.1em]Lstm7.west);
\draw [->] ([xshift=0.1em]Lstm7.east) -- ([xshift=0.1em]sep3.west);
\draw [->] ([xshift=-0.1em]sep3.east) -- ([xshift=-0.1em]Lstm8.west);

\draw [->] ([xshift=0.1em]Lstm9.east) -- ([xshift=-0.1em]Lstm10.west);
\draw [->] ([xshift=0.1em]Lstm10.east) -- ([xshift=0.1em]sep4.west);
\draw [->] ([xshift=-0.1em]sep4.east) -- ([xshift=-0.1em]Lstm11.west);

\draw [->] ([yshift=0.1em]Lstm6.north) -- ([yshift=-0.1em]Lstm9.south);
\draw [->] ([yshift=0.1em]Lstm7.north) -- ([yshift=-0.1em]Lstm10.south);
\draw [->] ([yshift=0.1em]Lstm8.north) -- ([yshift=-0.1em]Lstm11.south);

\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Lstm0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Lstm6.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Lstm1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Lstm7.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Lstm2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Lstm8.south);

\draw [->] ([yshift=0.1em]Lstm3.north) -- ([xshift=-0.05em,yshift=-0.1em]t1.south);
\draw [->] ([yshift=0.1em]Lstm9.north) -- ([yshift=-0.1em]t1.south);
\draw [->] ([yshift=0.1em]Lstm4.north) -- ([xshift=-0.05em,yshift=-0.1em]t2.south);
\draw [->] ([yshift=0.1em]Lstm10.north) -- ([yshift=-0.1em]t2.south);
\draw [->] ([yshift=0.1em]Lstm5.north) -- ([xshift=-0.05em,yshift=-0.1em]t3.south);
\draw [->] ([yshift=0.1em]Lstm11.north) -- ([yshift=-0.1em]t3.south);

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}
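
%%%------------------------------------------------------------------------------------------------------------
%%% 示例:ELMo风格的层融合
\begin{frame}[fragile]{示例:ELMo风格的层融合}
\begin{itemize}
\item 融合的示意:将各层前向、后向隐层状态拼接,再按权重加权求和。这里的层数、维度、隐层状态和权重均为假设数值,真实权重会随下游任务学习得到
\end{itemize}
{\footnotesize
\begin{verbatim}
import numpy as np

L, m, d = 3, 5, 4     # layers, sentence length, hidden size (toy)
rng = np.random.default_rng(0)
h_fwd = rng.normal(size=(L, m, d))   # hypothetical forward states
h_bwd = rng.normal(size=(L, m, d))   # hypothetical backward states

# concatenate the two directions at every layer: (L, m, 2d)
h = np.concatenate([h_fwd, h_bwd], axis=-1)

# weighted sum over layers (softmax weights) plus a global scale
s = np.exp([0.1, 0.5, 0.2]); s = s / s.sum()
gamma = 1.0
fused = gamma * np.tensordot(s, h, axes=1)   # shape (m, 2d)
print(fused.shape)    # one fused vector per position
\end{verbatim}
}
\end{frame}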

%%%------------------------------------------------------------------------------------------------------------
%%% 更强大的表示模型 - GPT
\begin{frame}{更强的表示模型 - GPT}
\begin{itemize}
\item \textbf{GPT}(Generative Pre-Training)也是一种基于语言模型的表示模型
    \begin{itemize}
    \item 架构换成了Transformer,特征抽取能力更强;作为语言模型,仍按自左向右的方式建模(下一页给出这种注意力掩码的示意)
    \item 基于Pre-training + Fine-tuning的框架,预训练得到的参数作为下游系统部件的初始值,因此可以更好地适应目标任务
    \end{itemize}
\end{itemize}
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[scale=1.2]

\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm0) at (0,0) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm1) at ([xshift=1em]Trm0.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm2) at ([xshift=1em]Trm1.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm3) at ([xshift=1em]Trm2.east) {\scriptsize{TRM}};
\node [anchor=west,inner sep=4pt] (sep) at ([xshift=1em]Trm3.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm4) at ([xshift=1em]sep.east) {\scriptsize{TRM}};

\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm5) at ([yshift=1em]Trm0.north) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm6) at ([xshift=1em]Trm5.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm7) at ([xshift=1em]Trm6.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm8) at ([xshift=1em]Trm7.east) {\scriptsize{TRM}};
\node [anchor=west,inner sep=4pt] (sep1) at ([xshift=1em]Trm8.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm9) at ([xshift=1em]sep1.east) {\scriptsize{TRM}};

\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Trm0) (Trm4) (Trm5) (Trm9)] (inputshadow) {};

\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([yshift=-1em]Trm0.south) {\scriptsize{$\textbf{e}_1$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([yshift=-1em]Trm1.south) {\scriptsize{$\textbf{e}_2$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([yshift=-1em]Trm2.south) {\scriptsize{$\textbf{e}_3$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
\node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};

\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([yshift=1em]Trm7.north) {\scriptsize{$\textbf{h}_3$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t4) at ([yshift=1em]Trm8.north) {\scriptsize{$\textbf{h}_4$}};
\node [anchor=south,inner sep=4pt] (sep6) at ([yshift=1em]sep1.north) {\scriptsize{...}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};

\node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.5em]t1.west) {\tiny{TRM}};
\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\tiny{: Transformer}};

\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm2.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm4.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm4.south);
\draw [->] ([yshift=0.1em]e4.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e5.north) -- ([yshift=-0.1em]Trm4.south);

\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm5.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm7.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm9.south);
\draw [->] ([yshift=0.1em]Trm1.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm7.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm9.south);
\draw [->] ([yshift=0.1em]Trm3.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm4.north) -- ([yshift=-0.1em]Trm9.south);

\draw [->] ([yshift=0.1em]Trm5.north) -- ([yshift=-0.1em]t1.south);
\draw [->] ([yshift=0.1em]Trm6.north) -- ([yshift=-0.1em]t2.south);
\draw [->] ([yshift=0.1em]Trm7.north) -- ([yshift=-0.1em]t3.south);
\draw [->] ([yshift=0.1em]Trm8.north) -- ([yshift=-0.1em]t4.south);
\draw [->] ([yshift=0.1em]Trm9.north) -- ([yshift=-0.1em]t5.south);

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}
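
%%%------------------------------------------------------------------------------------------------------------
%%% 示例:自左向右的注意力掩码
\begin{frame}[fragile]{示例:自左向右的注意力掩码}
\begin{itemize}
\item 自左向右建模可以用下三角掩码实现:位置$i$只能看到$j \le i$的词。下面的注意力分数为随机假设值,仅演示掩码和归一化的过程
\end{itemize}
{\footnotesize
\begin{verbatim}
import numpy as np

m = 5                                    # toy sentence length
rng = np.random.default_rng(0)
scores = rng.normal(size=(m, m))         # raw attention scores

# causal mask: position i may only attend to positions j <= i
mask = np.tril(np.ones((m, m), dtype=bool))
scores = np.where(mask, scores, -1e9)    # hide the future

# row-wise softmax over the visible positions
attn = np.exp(scores - scores.max(axis=-1, keepdims=True))
attn = attn / attn.sum(axis=-1, keepdims=True)
print(np.round(attn, 2))                 # upper triangle ~ 0
\end{verbatim}
}
\end{frame}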

%%%------------------------------------------------------------------------------------------------------------
%%% 更强大的表示模型 - BERT
\begin{frame}{更强的表示模型 - BERT}
\begin{itemize}
\item \textbf{BERT}(Bidirectional Encoder Representations from Transformers)是最近非常火爆的表示模型
    \begin{itemize}
    \item 仍然基于Transformer,但是考虑了左右两端的上下文(可以对比GPT)
    \item 使用了Mask方法来增强训练得到的模型的健壮性,这个方法几乎成为了预训练表示模型的新范式(下一页给出遮盖策略的示意代码)
    \end{itemize}
\end{itemize}
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[scale=1.2]

\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm0) at (0,0) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm1) at ([xshift=1em]Trm0.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm2) at ([xshift=1em]Trm1.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm3) at ([xshift=1em]Trm2.east) {\scriptsize{TRM}};
\node [anchor=west,inner sep=4pt] (sep) at ([xshift=1em]Trm3.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm4) at ([xshift=1em]sep.east) {\scriptsize{TRM}};

\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm5) at ([yshift=1em]Trm0.north) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm6) at ([xshift=1em]Trm5.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm7) at ([xshift=1em]Trm6.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm8) at ([xshift=1em]Trm7.east) {\scriptsize{TRM}};
\node [anchor=west,inner sep=4pt] (sep1) at ([xshift=1em]Trm8.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm9) at ([xshift=1em]sep1.east) {\scriptsize{TRM}};

\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Trm0) (Trm4) (Trm5) (Trm9)] (inputshadow) {};

\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([yshift=-1em]Trm0.south) {\scriptsize{$\textbf{e}_1$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([yshift=-1em]Trm1.south) {\scriptsize{$\textbf{e}_2$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([yshift=-1em]Trm2.south) {\scriptsize{$\textbf{e}_3$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
\node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};

\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([yshift=1em]Trm7.north) {\scriptsize{$\textbf{h}_3$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t4) at ([yshift=1em]Trm8.north) {\scriptsize{$\textbf{h}_4$}};
\node [anchor=south,inner sep=4pt] (sep6) at ([yshift=1em]sep1.north) {\scriptsize{...}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};

\node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.5em]t1.west) {\tiny{TRM}};
\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\tiny{: Transformer}};

\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm2.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm4.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm0.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm4.south);
\draw [->] ([yshift=0.1em]e4.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e5.north) -- ([yshift=-0.1em]Trm4.south);

\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm5.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm7.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm9.south);
\draw [->] ([yshift=0.1em]Trm1.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm5.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm7.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm9.south);
\draw [->] ([yshift=0.1em]Trm3.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm4.north) -- ([yshift=-0.1em]Trm9.south);

\draw [->] ([yshift=0.1em]Trm5.north) -- ([yshift=-0.1em]t1.south);
\draw [->] ([yshift=0.1em]Trm6.north) -- ([yshift=-0.1em]t2.south);
\draw [->] ([yshift=0.1em]Trm7.north) -- ([yshift=-0.1em]t3.south);
\draw [->] ([yshift=0.1em]Trm8.north) -- ([yshift=-0.1em]t4.south);
\draw [->] ([yshift=0.1em]Trm9.north) -- ([yshift=-0.1em]t5.south);

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}
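
%%%------------------------------------------------------------------------------------------------------------
%%% 示例:BERT的Mask策略
\begin{frame}[fragile]{示例:BERT的Mask策略}
\begin{itemize}
\item BERT随机遮盖部分输入词并要求模型还原它们。下面按常见的15\%比例和80/10/10策略做一个示意,其中的句子、词表和随机种子均为假设
\end{itemize}
{\footnotesize
\begin{verbatim}
import random
random.seed(0)

MASK = "[MASK]"
vocab = ["he", "likes", "eating", "apples", "."]  # toy vocabulary

def mask_tokens(tokens, p=0.15):
    # pick ~15% of positions as prediction targets;
    # 80% -> [MASK], 10% -> random word, 10% -> unchanged
    out, targets = list(tokens), {}
    for i, t in enumerate(tokens):
        if random.random() < p:
            targets[i] = t        # model must recover this token
            r = random.random()
            if r < 0.8:
                out[i] = MASK
            elif r < 0.9:
                out[i] = random.choice(vocab)
    return out, targets

print(mask_tokens(["he", "likes", "eating", "apples", "."]))
\end{verbatim}
}
\end{frame}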

%%%------------------------------------------------------------------------------------------------------------
%%% 预训练
\begin{frame}{预训练}
\begin{itemize}
\item 语言模型可以使用大量无标注数据进行训练,得到的模型可以被直接用于下游系统。以序列到序列任务为例:

\begin{center}
\begin{tikzpicture}
\node [anchor=south,minimum width=17em,fill=red!20!white] (encoder) at (0,0) {Encoder (语言模型预先训练)};
\node [anchor=south,minimum width=17em,fill=blue!20!white] (decoder) at (encoder.north) {Decoder (目标任务正常训练)};
\end{tikzpicture}
\end{center}

\item<2-> 衍生出了非常火爆的\alert{范式}:大规模语言模型pre-training + 目标任务fine-tuning
	\begin{itemize}
	\item 许多NLP任务都可以被描述为语言建模,将外部训练得到的语言模型作为模块放入目标系统中,用于参数初始化(下一页给出参数交接的示意代码)
	\end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}

\begin{scope}

\node [anchor=west,draw,thick,minimum width=4em,minimum height=1.7em,fill=blue!20] (encoder) at (0,0) {模块};
\node [anchor=south,minimum width=4em,minimum height=1.7em] (space) at ([yshift=0.3em]encoder.north) {\footnotesize{目标系统}};

\begin{pgfonlayer}{background}
\node [rectangle,draw,thick,fill=red!20] [fit = (encoder) (space)] (system) {};
\end{pgfonlayer}

\node [anchor=north] (data) at ([yshift=-1em]system.south) {\scriptsize{\textbf{目标任务有标注数据}}};
\draw [->,thick] (data.north) -- ([yshift=-0.1em]system.south);
\node [anchor=north] (label) at ([yshift=-0em]data.south) {\scriptsize{(a) standard method}};

\end{scope}

\begin{scope}[xshift=2.8in]

\node [anchor=west,draw,dashed,thick,minimum width=4em,minimum height=1.7em,fill=blue!20] (encoder) at (0,0) {模块};
\node [anchor=south,minimum width=4em,minimum height=1.7em] (space) at ([yshift=0.3em]encoder.north) {\footnotesize{目标系统}};
\node [anchor=center,draw,thick,minimum width=4em,minimum height=1.7em,fill=green!20] (encoderpre) at ([xshift=-7em]encoder.center) {\footnotesize{语言模型}};
\draw [->,thick] (encoderpre.east) -- (encoder.west);

\begin{pgfonlayer}{background}
\node [rectangle,draw,thick,fill=red!20] [fit = (encoder) (space)] (system) {};
\end{pgfonlayer}

\node [anchor=north] (data) at ([yshift=-1em]system.south) {\scriptsize{\textbf{目标任务有标注数据}}};
\draw [->,thick] (data.north) -- ([yshift=-0.1em]system.south);
\node [anchor=north] (data2) at ([yshift=-1em,xshift=-7em]system.south) {\scriptsize{\textbf{大规模无标注数据}}};
\draw [->,thick] (data2.north) -- ([yshift=-0.1em]encoderpre.south);
\node [anchor=north] (label) at ([yshift=-0em,xshift=-4em]data.south) {\scriptsize{(b) pre-training + fine-tuning}};

\end{scope}

\end{tikzpicture}
\end{center}
}

\end{frame}
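
%%%------------------------------------------------------------------------------------------------------------
%%% 示例:用预训练参数初始化目标系统
\begin{frame}[fragile]{示例:用预训练参数初始化目标系统}
\begin{itemize}
\item 参数交接的示意:把预训练语言模型中与目标系统同名的模块参数拷入作为初始值,任务相关的层仍随机初始化,之后在有标注数据上继续训练。这里的参数名与形状均为假设
\end{itemize}
{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(0)

# parameters learned by an LM on large unlabeled data (hypothetical)
pretrained = {"emb": rng.random((100, 8)), "rnn_W": rng.random((8, 8))}

# target system: same module names plus a task-specific output layer
target = {"emb": np.zeros((100, 8)), "rnn_W": np.zeros((8, 8)),
          "out_proj": rng.random((8, 2))}  # stays randomly initialized

for name, value in pretrained.items():
    if name in target:
        target[name] = value.copy()   # initialization from pre-training

# fine-tuning would now update all parameters on labeled task data
print(sorted(target))
\end{verbatim}
}
\end{frame}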

%%%------------------------------------------------------------------------------------------------------------
%%% 预训练带来的新思路
\begin{frame}{预训练带来的新思路}
\begin{itemize}
\item 预训练模型刷榜各种任务的同时,引发了一些思考:\\
      预训练究竟给我们带来了什么?
    \begin{itemize}
    \item 有标注数据量有限,预训练提供了使用超大规模数据的方法
    \item 从大规模无标注数据中学习通用知识,提升泛化能力
    \item 神经网络复杂且不容易训练,预训练可以使模型关注优质解的高密度区域
    \end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\draw[name path=ellipse,thick] (0,0) circle[x radius = 2, y radius = 1];
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p1) at (0.2,0.5) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p2) at (0.3,0.6) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p3) at (0.1,-0.1) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p4) at (0.4,0) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p5) at (0.5,0.3) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p6) at (0.6,0.1) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p7) at (0.7,-0.1) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p8) at (-1.2,0.4) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p9) at (-1.0,-0.3) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p10) at (-0.1,-0.8) {};

\begin{pgfonlayer}{background}
\visible<4->{
\node [rectangle,inner sep=0.4em,draw,blue] [fit = (p1) (p2) (p3) (p4) (p5) (p6)] (area) {};
}
\end{pgfonlayer}

\draw [->] (2.5,-0.7) -- (1.8,-0.5) node [pos=0,right] {\scriptsize{模型参数解空间}};

\visible<4->{
\draw [->] (2.0,0.7) -- (area.20) node [pos=0,right] {\scriptsize{优质解高密度区域(预训练)}};
}
\visible<3->{
\draw [->] (-2.0,0.7) -- (p8.west) node [pos=0,left] {\scriptsize{游离的解}};
}

\end{tikzpicture}
\end{center}
}

\begin{itemize}
\item<5-> 机器翻译中的预训练
    \begin{itemize}
    \item 机器翻译中预训练还没有屠榜,一方面由于很多机器翻译任务训练数据量并不小,另一方面也反映出翻译的双语建模对预训练提出了新的要求
    \end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 总结
\begin{frame}{总结 - 长出一口气}
\begin{itemize}
\item 讲了很多,累呀累,再整理一下主要观点
    \begin{itemize}
    \item 神经网络没有那么复杂,入门并不难
    \item 简单的网络结构可以组合成强大的模型
    \item 语言模型可以用神经网络实现,效果很好,最近出现的预训练等范式证明了神经语言模型的潜力
    \end{itemize}
\item<2-> 仍然有很多问题需要讨论
    \begin{itemize}
    \item 常见的神经网络结构(面向NLP)\\
          google一下LSTM、GRU、CNN
    \item 深层模型和训练方法。深度学习如何体现``深''?\\
          深层网络可以带来什么?\\
          如何有效地训练深层模型?
    \item 如何把神经网络用于包括机器翻译在内的其它NLP任务?\\
          比如encoder-decoder框架
    \item 深度学习的实践技巧\\
          ``炼金术''了解下,因为不同任务调参和模型设计都有技巧\\
          ...
    \end{itemize}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% last slide
\begin{frame}{又结束一章内容~}

\vspace{2em}

\begin{center}
\textbf{内容很多,只是开了个头}\\
\textbf{学习深度学习技术需要实践和经验的积累!}

\vspace{2em}

\begin{tikzpicture}

\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\node [anchor=west,rnnnode] (node11) at (0,0) {\tiny{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\tiny{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\tiny{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\tiny{RNN Cell}};

\node [anchor=north,rnnnode,fill=red!30!white] (e1) at ([yshift=-1.2em]node11.south) {\tiny{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e2) at ([yshift=-1.2em]node12.south) {\tiny{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e3) at ([yshift=-1.2em]node13.south) {\tiny{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e4) at ([yshift=-1.2em]node14.south) {\tiny{embedding}};
\node [anchor=north] (w1) at ([yshift=-1em]e1.south) {\footnotesize{$<$s$>$}};
\node [anchor=north] (w2) at ([yshift=-1em]e2.south) {\footnotesize{谢谢}};
\node [anchor=north] (w3) at ([yshift=-1em]e3.south) {\footnotesize{大家}};
\node [anchor=north] (w4) at ([yshift=-1em]e4.south) {\footnotesize{聆听}};

\draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
\draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
\draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
\draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);

\draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
\draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
\draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
\draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);

\node [anchor=south,rnnnode,fill=red!30!white] (node21) at ([yshift=1.0em]node11.north) {\tiny{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=red!30!white] (node22) at ([yshift=1.0em]node12.north) {\tiny{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=red!30!white] (node23) at ([yshift=1.0em]node13.north) {\tiny{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=red!30!white] (node24) at ([yshift=1.0em]node14.north) {\tiny{Softmax($\cdot$)}};

\node [anchor=south] (output1) at ([yshift=1em]node21.north) {\Large{\textbf{谢谢}}};
\node [anchor=south] (output2) at ([yshift=1em]node22.north) {\Large{\textbf{大家}}};
\node [anchor=south] (output3) at ([yshift=1em]node23.north) {\Large{\textbf{聆听}}};
\node [anchor=south] (output4) at ([yshift=1em]node24.north) {\Large{\textbf{$<$/s$>$}}};

\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]output1.south);
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]output2.south);
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]output3.south);
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]output4.south);

\draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
\draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
\draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
\draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);

\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);

\end{tikzpicture}

\end{center}

\end{frame}

\end{CJK}
\end{document}