section06.tex 310 KB
Newer Older
xiaotong committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode

\def\CTeXPreproc{Created by ctex v0.2.13, don't edit!}
\documentclass[cjk,t,compress,12pt]{beamer}
\usepackage{pstricks}
\usepackage{etex}
\usepackage{eso-pic,graphicx}
\usepackage{fancybox}
\usepackage{amsmath,amssymb}
\usepackage{setspace}
\usepackage{xcolor}
\usepackage{array,multirow}
\usepackage{CJK}
\usepackage{tikz}
\usepackage{tikz-qtree}
\usepackage{hyperref}
\usepackage{changepage}
\usepackage{pgfplots}
\usepackage{subfigure}
\usepackage{tikz-3dplot}
\usepackage{esvect}
xiaotong committed
23
\usepackage{CJKulem}
Lee committed
24
\usepackage{booktabs}
Lee committed
25
\usepackage{amsmath}
xiaotong committed
26 27 28 29 30 31

\usepackage{tcolorbox}
\tcbuselibrary{skins}

\usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix}
xiaotong committed
32
\usetikzlibrary{patterns}
xiaotong committed
33
\usetikzlibrary{arrows,decorations.pathreplacing}
xiaotong committed
34
\usetikzlibrary{shadows}
xiaotong committed
35
\usetikzlibrary{shadows.blur}
xiaotong committed
36 37
\usepgflibrary{arrows}
\usetikzlibrary{arrows}
xiaotong committed
38 39 40 41 42
\usetikzlibrary{decorations}
\usetikzlibrary{arrows,shapes}

\usetikzlibrary{positioning,fit,calc}

xiaotong committed
43
\usetikzlibrary{mindmap,backgrounds}
xiaotong committed
44 45 46 47 48

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\setbeamertemplate{items}[ball]
xiaotong committed
49
\usefonttheme[onlymath]{serif}
xiaotong committed
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72

\definecolor{ugreen}{rgb}{0,0.5,0}
\definecolor{lgreen}{rgb}{0.9,1,0.8}
\definecolor{xtgreen1}{rgb}{0.824,0.898,0.8}
\definecolor{xtgreen}{rgb}{0.914,0.945,0.902}
\definecolor{lightgray}{gray}{0.85}

\setbeamercolor{uppercol}{fg=white,bg=ugreen}
\setbeamercolor{lowercol}{fg=black,bg=xtgreen}

\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\setbeamercolor{uppercolblue}{fg=white,bg=ublue}
\setbeamercolor{lowercolblue}{fg=black,bg=blue!10}


%\usetheme{default}
%\usetheme{Darmstadt}
%\usetheme{Madrid}
%\usetheme{Frankfurt}
%\usetheme{Dresden}
%\usetheme{Boadilla}
%\usecolortheme{dolphin}

Lee committed
73
% not compatible with [scale=?]
Lee committed
74 75 76 77 78 79
% Dimension registers written by the \Extract... helpers below; \TMP is a
% scratch register for the component a helper does not keep.
\newdimen\XCoord
\newdimen\YCoord
\newdimen\TMP
% \ExtractCoordinate{<coord>}: issues a TikZ \path to <coord>, then stores the
% last path point's x and y in \XCoord and \YCoord via \pgfgetlastxy.
\newcommand*{\ExtractCoordinate}[1]{\path (#1); \pgfgetlastxy{\XCoord}{\YCoord};}%
% \ExtractX{<coord>}: as above, but only \XCoord is updated (y goes to \TMP).
\newcommand*{\ExtractX}[1]{\path (#1); \pgfgetlastxy{\XCoord}{\TMP};}%
% \ExtractY{<coord>}: as above, but only \YCoord is updated (x goes to \TMP).
\newcommand*{\ExtractY}[1]{\path (#1); \pgfgetlastxy{\TMP}{\YCoord};}%
Lee committed
80 81
% \specialcell[<pos>]{<colspec>}{<content>}: typesets <content> inside a nested
% tabular whose column specification is <colspec>, with the outer column
% padding removed by @{}. <pos> is passed as the tabular's optional position
% argument and defaults to c.
\newcommand{\specialcell}[3][c]{%
  \begin{tabular}[#1]{@{}#2@{}}#3\end{tabular}}
Lee committed
82

xiaotong committed
83 84 85 86
\newcounter{mycount1}
\newcounter{mycount2}
\newcounter{mycount3}
\newcounter{mycount4}
xiaotong committed
87
\newlength{\mystep}
Lee committed
88
\newlength{\base}
89 90 91 92
\newlength{\wseg}
\newlength{\hseg}
\newlength{\wnode}
\newlength{\hnode}
xiaotong committed
93

xiaotong committed
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136

\usefonttheme[onlylarge]{structurebold}

\IfFileExists{C:/WINDOWS/win.ini}
{\newcommand{\mycfont}{you}}
{\newcommand{\mycfont}{gbsn}}

\begin{CJK}{UTF8}{\mycfont}
\end{CJK}

\setbeamerfont*{frametitle}{size=\large,series=\bfseries}
\setbeamertemplate{navigation symbols}{\begin{CJK}{UTF8}{\mycfont} 第六章 神经机器翻译 \hspace*{2em} 肖桐\&朱靖波 \end{CJK} \hspace*{2em} \today \hspace*{2em} \insertframenumber{}/\inserttotalframenumber}

\setbeamertemplate{itemize items}[circle] % if you want a circle
\setbeamertemplate{itemize subitem}[triangle] % if you want a triangle
\setbeamertemplate{itemize subsubitem}[ball] % if you want a ball

\begin{document}

\begin{CJK}{UTF8}{\mycfont}

\title{\Large{神经机器翻译}}
\author{\large{\textbf{肖桐\ \ 朱靖波}}}
\institute{
\blue{\url{xiaotong@mail.neu.edu.cn}} \black{} \\
\blue{\url{zhujingbo@mail.neu.edu.cn}} \black{} \\
\vspace{1.0em}
东北大学 自然语言处理实验室 \\
\blue{\underline{\url{http://www.nlplab.com}}} \black{} \\
\vspace{0.2cm}
\hspace{0.1cm} \includegraphics[scale=0.1]{../Figures/logo.pdf}
}
\date{}

\maketitle

\setlength{\leftmargini}{1em}
\setlength{\leftmarginii}{1em}

%%%------------------------------------------------------------------------------------------------------------
\section{编码器-解码器框架}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
%%% landscape
\begin{frame}{Landscape}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}

\node [] (history) at (0,0) {\includegraphics[scale=0.29]{./Figures/mt-history.png}};

\draw[-,thick] ([xshift=-1.1in]history.south) coordinate (top01) -- ([yshift=-2em]top01);
\draw[-,thick] ([xshift=1in]history.south) coordinate (top02) -- ([yshift=-2em]top02);
\draw[-,thick] ([xshift=-0.3in]history.south east) coordinate (top03) -- ([yshift=-2em]top03);
\draw[<->,thick] ([yshift=-1em]top01) -- ([yshift=-1em]top02) node [pos=0.5,below,align=left,yshift=-0.5em] {\footnotesize{\textbf{Sections 3--4:}}\\\footnotesize{统计机器翻译}};
\draw[<->,thick] ([yshift=-1em]top02) -- ([yshift=-1em]top03) node [pos=0.5,below,align=left,yshift=-0.5em] {\footnotesize{\textbf{Sections 5--6:}}\\\footnotesize{神经机器翻译}};

\visible<2->{
\draw[<-,thick] ([xshift=0.9in,yshift=-0.2in]history.north) coordinate (label01) -- ([yshift=1.5em,xshift=-1.5em]label01) coordinate (label02);
\node[anchor=south west,align=left] (label) at ([xshift=-5em]label02) {\textbf{\alert{本章内容(Section 6):}}\\神经机器翻译建模及实现};
}

\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 译文对比
xiaotong committed
164
\begin{frame}{机器翻译今天的水平}
xiaotong committed
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201

\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=red!10!white]
\begin{flushleft}

\begin{spacing}{0.9}
\scriptsize{\textbf{原文(英语)}: During Soviet times, if a city's population topped one million, it would become eligible for its own metro. Planners wanted to brighten the lives of everyday Soviet citizens, and saw the metros, with their tens of thousands of daily passengers, as a singular opportunity to do so. In 1977, Tashkent, the capital of Uzbekistan, became the seventh Soviet city to have a metro built. Grand themes celebrating the history of Uzbekistan and the Soviet Union were brought to life, as art was commissioned and designers set to work. The stations reflected different themes, some with domed ceilings and painted tiles reminiscent of Uzbekistan's Silk Road mosques, while others ... }
\end{spacing}
\end{flushleft}
\end{tcolorbox}

\begin{minipage}[t]{0.47\textwidth}
\begin{beamerboxesrounded}[upper=uppercolblue,lower=lowercolblue,shadow=true]{\scriptsize{\textbf{译文\visible<2->{(统计机器翻译)}}}}
{\scriptsize
\begin{spacing}{0.9}
在苏联时代,如果一个城市的人口突破一百万,这将成为合资格为自己的地铁。规划者想去照亮每天的苏联公民的生命,看到地铁,与他们的数十每天数千乘客,作为一个独特的机会来这样做。1977年,塔什干,乌兹别克斯坦的首都,成了苏联第七城市建有地铁。宏大主题,庆祝乌兹别克斯坦和苏联的历史被带到生活,因为艺术是委托和设计师开始工作。车站反映了不同的主题,有的圆顶天花板和绘瓷砖让人想起乌兹别克斯坦是丝绸之路的清真寺,而另一些则装饰着...
\end{spacing}
}
\end{beamerboxesrounded}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
\begin{beamerboxesrounded}[upper=uppercolblue,lower=lowercolblue,shadow=true]{\scriptsize{\textbf{译文\visible<2->{(神经机器翻译 - 很流畅!)}}}}
{\scriptsize
\begin{spacing}{0.9}
在苏联时期,如果一个城市的人口超过一百万,它就有资格拥有自己的地铁。 规划者想要照亮日常苏联公民的生活,并把拥有数万名每日乘客的地铁看作是这样做的一个绝佳机会。 1977年,乌兹别克斯坦首都塔什干成为苏联第七个修建地铁的城市。 随着艺术的委托和设计师们的工作,乌兹别克斯坦和苏联历史的宏伟主题被赋予了生命力。 这些电台反映了不同的主题,有的有穹顶和彩砖,让人想起乌兹别克斯坦的丝绸之路清真寺,有的则用...
\end{spacing}
}
\end{beamerboxesrounded}
\end{minipage}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经机器翻译的性能增长
\begin{frame}{神经机器翻译的进展}
\begin{itemize}
\item 想当年,08年NIST举办的汉英机器翻译评测,BLEU能突破30已经是巨猛无比的结果,而现在的神经机器翻译轻松突破45!
xiaotong committed
202 203 204 205
\item 再比如,机器翻译的旗舰评测比赛WMT(Workshop on Machine Translation)已经被神经机器翻译刷榜
    \begin{itemize}
    \item 现在冠军系统几乎没有纯统计机器翻译系统
    \end{itemize}
xiaotong committed
206 207
\end{itemize}
%%% 秀smt nmt的blue随着年代的变化
208 209 210
\begin{center}
    \begin{tikzpicture}
        \begin{scope}[local bounding box=WMT]
xiaotong committed
211 212
            \draw[->,thick] (0.5,0) to (10,0);
            \draw[->,thick] (0.5,-0) to (0.5,3.5);
曹润柘 committed
213 214 215 216
            % Y-axis tick marks. Bars below are drawn at 0.16cm per system, so
            % a count of 10 sits at y=1.6 and a count of 20 at y=3.2.
            \draw[thick] (0.4,1.6) to (0.6,1.6);
            \draw[thick] (0.4,3.2) to (0.6,3.2);
            % Tick labels are placed at the same heights as their tick marks.
            \node[font=\scriptsize] at (0,1.6) {10};
            \node[font=\scriptsize] at (0,3.2) {20};
217

曹润柘 committed
218
%\normalsize
219
            % 2015
曹润柘 committed
220 221 222
             \node[minimum width=0.5cm,thick,minimum height=7*0.16cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2015) at (1.5*0.7,0.5pt) {};
            \node[minimum width=0.5cm,thick,minimum height=2*0.16cm,draw,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2015) at (smt2015.south east) {};
            \node[font=\normalsize,anchor=north] () at ([yshift=-0.2em]smt2015.south east) {2015};
223
            % 2016
曹润柘 committed
224 225 226
            \node[minimum width=0.5cm,thick,minimum height=3*0.16cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2016) at ($(nmt2015.south east)+(0.7,0)$) {};
            \node[minimum width=0.5cm,thick,minimum height=8*0.16cm,draw,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2016) at (smt2016.south east) {};
            \node[font=\normalsize,anchor=north] () at ([yshift=-0.2em]smt2016.south east) {2016};
227
            % 2017
曹润柘 committed
228 229 230
            \node[minimum width=0.5cm,thick,minimum height=3*0.16cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2017) at ($(nmt2016.south east)+(0.7,0)$) {};
            \node[minimum width=0.5cm,thick,minimum height=13*0.16cm,draw,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2017) at (smt2017.south east) {};
            \node[font=\normalsize,anchor=north] () at ([yshift=-0.2em]smt2017.south east) {2017};
231
            % 2018
曹润柘 committed
232 233 234 235 236 237 238 239
            \node[minimum width=0.5cm,thick,minimum height=0cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2018) at ($(nmt2017.south east)+(0.7,0)$) {};
            \node[minimum width=0.5cm,thick,minimum height=14*0.16cm,draw,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2018) at (smt2018.south east) {};
            \node[font=\normalsize,anchor=north] () at ([yshift=-0.2em]smt2018.south east) {2018};
             % 2019
            \node[minimum width=0.5cm,thick,minimum height=0cm,draw,fill=blue!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (smt2019) at ($(nmt2018.south east)+(0.7,0)$) {};
            \node[minimum width=0.5cm,thick,minimum height=21*0.16cm,draw,fill=red!30!white,inner sep=0pt,outer sep=0pt,anchor=south west] (nmt2019) at (smt2019.south east) {};
            \node[font=\normalsize,anchor=north] () at ([yshift=-0.2em]smt2019.south east) {2019};
            
240 241 242 243
        \end{scope}

        % legend
        \ExtractX{$(nmt2015.west)$}
Lee committed
244 245
        \ExtractY{$(WMT.north)$}
        \node[minimum width=0.5cm,rectangle,draw,fill=blue!30!white,anchor=north west,label={[label distance=1pt,font=\scriptsize]0:统计机器翻译}] () at (\XCoord,\YCoord) {};
246
        \ExtractX{$(nmt2017.west)$}
Lee committed
247
        \node[minimum width=0.5cm,rectangle,draw,fill=red!30!white,anchor=north west,label={[label distance=1pt,font=\scriptsize]0:神经机器翻译}] () at (\XCoord,\YCoord) {};
248

Lee committed
249
        \node[font=\normalsize,below=0pt of WMT] () {WMT冠军系统};
xiaotong committed
250
        \node[font=\normalsize,rotate=90] () at ([xshift=-1em]WMT.west) {数量};
251 252
    \end{tikzpicture}
\end{center}
xiaotong committed
253 254 255 256 257 258
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经机器翻译的性能增长
\begin{frame}{神经机器翻译的进展(续)}
\begin{itemize}
xiaotong committed
259
\item 神经机器翻译在很多场景下已经超越统计机器翻译
Lee committed
260 261
{
    \footnotesize
Lee committed
262
    \begin{center}
Lee committed
263 264 265
        \setlength{\tabcolsep}{3pt}
        \renewcommand\arraystretch{1}
        \begin{tabular}{l}
Lee committed
266
            \begin{tabular}{lcccl}
Lee committed
267
                \specialrule{1pt}{1pt}{1pt}
Lee committed
268 269
                \multirow{2}{*}{\#} & \multicolumn{3}{c}{自动评价} & \multirow{2}{*}{系统} \\
                \cline{2-4}
Lee committed
270 271
                & BLEU & HTER & mTER & \\
                \specialrule{0.6pt}{1pt}{1pt}
Lee committed
272 273 274
                统计机器翻译 & 25.3 & 28.0 & 21.8 & PBSY \\
                & 24.6 & 29.9 & 23.4 & HPB \\
                & 25.8 & 29.0 & 22.7 & SPB \\
Lee committed
275
                \specialrule{0.6pt}{1pt}{1pt}
Lee committed
276
                神经机器翻译 & \textbf{31.1} & \textbf{21.1} & \textbf{16.2} & NMT \\
Lee committed
277 278 279
                \specialrule{1pt}{1pt}{1pt}
            \end{tabular}\\
            \addlinespace[-0.3ex]
Lee committed
280 281 282 283
            \tiny *Neural versus Phrase-Based Machine Translation Quality: a Case Study\\
        \end{tabular}
    \end{center}
}
xiaotong committed
284
\item 微软的报道:在部分场景下机器翻译质量已经接近甚至超过人工翻译
Lee committed
285 286
{
    \footnotesize
Lee committed
287
    \begin{center}
Lee committed
288 289
        \renewcommand\arraystretch{1}
        \begin{tabular}{l}
Lee committed
290
            \begin{tabular}{lrl}
Lee committed
291
                \specialrule{1pt}{1pt}{1pt}
Lee committed
292
                \# & 人工评价 & 系统 \\
Lee committed
293
                \specialrule{0.6pt}{1pt}{1pt}
Lee committed
294 295 296
                机器翻译 & \textbf{69.9} & COMBO-6 \\
                & 69.8 & COMBO-4 \\
                & \textbf{69.9} & COMBO-5 \\
Lee committed
297
                \specialrule{0.6pt}{1pt}{1pt}
Lee committed
298 299
                人工翻译 & 68.6 & REFERENCE-HT \\
                & 67.6 & REFERENCE-PE \\
Lee committed
300 301 302
                \specialrule{1pt}{1pt}{1pt}
            \end{tabular}\\
            \addlinespace[-0.3ex]
Lee committed
303 304 305 306
            \tiny *Achieving Human Parity on Automatic Chinese to English News Translation\\
        \end{tabular}
    \end{center}
}
xiaotong committed
307 308 309 310 311 312 313 314 315 316 317 318
\end{itemize}
%%% 进一步找一些证据说明NMT的进展,很多google、microsoft的文章,可以看看提升了多少性能等等
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 传统方法 vs 神经网络方法
\begin{frame}{神经机器翻译的优势}
\begin{itemize}
\item 神经网络方法带来了新的思路,还是同样的表再看一遍
\end{itemize}

\begin{tabular} {l | l}
xiaotong committed
319
\textbf{传统基于统计的方法} & \textbf{深度学习方法} \\
xiaotong committed
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
\textbf{(统计机器翻译)} & \textbf{(神经机器翻译)} \\ \hline
基于离散空间的表示模型 & 基于连续空间的表示模型 \\
NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
特征工程为主 & 无显性特征,但需要设计网络 \\
特征、规则的存储耗资源 & 模型存储相对小,但计算慢
\end{tabular}

\begin{itemize}
\item<2-> \only<2>{在统计机器翻译时代,系统依赖很多模块,比如}\only<3->{\sout{在统计机器翻译时代,系统依赖很多模块,比如}}
    \begin{itemize}
    \item \only<2>{\textbf{词对齐}:双语句子之间词和词的对应关系}\only<3->{\sout{\textbf{词对齐}:双语句子之间词和词的对应关系}}
    \item \only<2>{\textbf{短语(规则)表}:原文和译文互译的片段}\only<3->{\sout{\textbf{短语(规则)表}:原文和译文互译的片段}}
    \item \only<2>{\textbf{特征}:人工设计的用于建模翻译的各种各样的子模型}\only<3->{\sout{\textbf{特征}:人工设计的用于建模翻译的各种各样的子模型}}
    \item \only<2>{\textbf{目标语语言模型}:大量单语数据训练的目标语建模模块}\only<3->{\sout{\textbf{目标语语言模型}:大量单语数据训练的目标语建模模块}}
    \item \only<2>{...}\only<3->{\sout{...}}
    \end{itemize}
xiaotong committed
336
\item<3-> 在\alert{神经机器翻译}时代,以上均不是必要的,而仅仅需要一个神经网络进行端到端建模
xiaotong committed
337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 编码器-解码器框架
\begin{frame}{编码器-解码器框架}

\begin{itemize}
\item 神经机器翻译遵循一种叫\alert{``编码器-解码器''}的结构
    \begin{itemize}
    \item<2-> \textbf{编码器}负责把源语言编码成一种句子表示形式(如:向量)
    \item<3-> \textbf{解码器}负责利用这种表示逐词把目标语句子生成出来
    \item<4-> 编码器和解码器之间的向量就是连接这两部分的句子表示
    \end{itemize}
\end{itemize}

\vspace{0.5em}
\begin{tikzpicture}

\begin{scope}
\small{
\node [anchor=south west,minimum width=15em] (source) at (0,0) {\textbf{source}: 我\ \ \ \ 对\ \ \ \ 你\ \ \ \ 感到\ \ \ \ 满意};
\visible<3->{
\node [anchor=south west,minimum width=15em] (target) at ([yshift=12em]source.north west) {\textbf{target}: I\ \ am\ \ \ satisfied\ \ \ with\ \ \ you};
}
\visible<2->{
\node [anchor=center,minimum width=9.6em,minimum height=1.8em,draw,rounded corners=0.3em] (hidden) at ([yshift=6em]source.north) {};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!20] (cell01) at ([xshift=0.2em]hidden.west) {\footnotesize{.2}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!10] (cell02) at (cell01.east) {\footnotesize{-1}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!70] (cell03) at (cell02.east) {\footnotesize{6}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!50] (cell04) at (cell03.east) {\footnotesize{5}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!30] (cell05) at (cell04.east) {\footnotesize{.7}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!10] (cell06) at (cell05.east) {\footnotesize{-2}};
}

\visible<2->{
\filldraw [fill=red!20,draw=white] (source.north west) -- (source.north east) -- ([xshift=-0.2em,yshift=-0.1em]hidden.south east) -- ([xshift=0.2em,yshift=-0.1em]hidden.south west);
}
\visible<3->{
\filldraw [fill=blue!20,draw=white] (target.south west) -- (target.south east) -- ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- ([xshift=0.2em,yshift=0.1em]hidden.north west);
}
\visible<2->{
\draw [->,thick] (source.north west) -- ([xshift=0.2em,yshift=-0.1em]hidden.south west);
\draw [->,thick] (source.north east) -- ([xshift=-0.2em,yshift=-0.1em]hidden.south east);
}
\visible<3->{
\draw [->,thick] ([xshift=0.2em,yshift=0.1em]hidden.north west) -- (target.south west);
\draw [->,thick] ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- (target.south east);
}
}

\visible<2->{
\node [anchor=south] (enclabel) at ([yshift=2em]source.north) {\large{\textbf{Encoder}}};
}
\visible<3->{
\node [anchor=north] (declabel) at ([yshift=-2em]target.south) {\large{\textbf{Decoder}}};
}

\visible<4->{
\node [anchor=west] (representation) at ([xshift=3.5em,yshift=3em]hidden.east) {\footnotesize{\textbf{句子的表示:}}};
\node [anchor=north west] (rline1) at (representation.south west) {\footnotesize{源语言句子被表示成一个实数}};
\node [anchor=north west] (rline2) at ([yshift=0.2em]rline1.south west) {\footnotesize{向量}\footnotesize{$(0.2,-1,6,5,0.7,-2)$}};
\node [anchor=north west] (rline3) at ([yshift=0.2em]rline2.south west) {\footnotesize{不要问0.2是什么意思,因为}};
\node [anchor=north west] (rline4) at ([yshift=0.2em]rline3.south west) {\footnotesize{只有系统自己才知道 :)}};

\begin{pgfonlayer}{background}
\visible<4->{
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=ugreen!10,drop shadow] [fit = (representation) (rline1) (rline2) (rline3) (rline4)] (rlabel) {};
}
\end{pgfonlayer}

\draw[->,very thick,dashed] (hidden.east) .. controls +(east:1) and +(west:1) .. ([yshift=-1em]rlabel.north west);
}

\end{scope}

\end{tikzpicture}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 表示模型
\begin{frame}{基于连续空间表示模型的方法}
\begin{itemize}
\item 编码器-解码器框架的革命在于:\alert{它把传统基于符号的离散型知识转化为基于表示的连续型知识}
    \begin{itemize}
    \item 比如,对于一个短语,它可以对应一个文法规则的使用过程,而规则都是由离散的符号表示
    \item 它也可以被表示为一种更加抽象的形式,通过向量记录短语的各个``属性''
    \end{itemize}
\item<2-> 这种表示形式上的创新,让我们不再依赖离散符号系统的各种分解、组合,而直接在连续空间下学习这种表示
    \begin{itemize}
    \item 所谓``端到端''只是这种表示模型下的一种自然的学习范式
    \end{itemize}
\end{itemize}

\visible<3->{
\begin{center}
\begin{tikzpicture}

\begin{scope}
\node [anchor=south west,draw,thick,red,minimum width=0.9in,minimum height=0.7in] (space1) at (0,0) {};
\node [anchor=south west,fill=blue,minimum width=0.1in,minimum height=0.1in] (unit1) at (0.2,0.8) {};
\node [anchor=south west,fill=ugreen,minimum width=0.1in,minimum height=0.1in] (unit2) at (0.7,0.3) {};
\node [anchor=south west,fill=blue,minimum width=0.1in,minimum height=0.1in] (unit3) at (1.3,1.3) {};
\node [anchor=south west,fill=ugreen,minimum width=0.1in,minimum height=0.1in] (unit4) at ([xshift=0.1em]unit3.south east) {};

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=2pt,rounded corners=1pt,draw,thick] [fit = (unit3) (unit4)] (unitbox) {};
\end{pgfonlayer}

\draw [->] ([yshift=1pt]unit1.north) .. controls +(north:0.23) and +(west:0.2) .. ([yshift=0.2em,xshift=-1pt]unitbox.west);
\draw [->] ([xshift=1pt]unit2.east) .. controls +(east:0.5) and +(south:0.2) .. ([xshift=0.2em,yshift=-1pt]unitbox.south);

xiaotong committed
451
\node [anchor=south] (spacelabel1) at (space1.north) {\scriptsize{离散表示空间}};
xiaotong committed
452 453 454 455 456 457 458 459 460 461 462 463 464 465 466
\node [anchor=north] (captain1) at ([yshift=-0.5em]space1.south) {\scriptsize{(a) \textbf{统计机器翻译}}};

\end{scope}

\begin{scope}[xshift=1.3in]
\node [anchor=south west,draw,thick,red,minimum width=0.9in,minimum height=0.7in] (space1) at (0,0) {};
\node [anchor=south west,fill=blue,minimum width=0.1in,minimum height=0.1in] (unit1) at (0.2,0.8) {};
\node [anchor=south west,fill=ugreen,minimum width=0.1in,minimum height=0.1in] (unit2) at (0.7,0.3) {};

\node [anchor=south west,draw,thick,red,minimum width=0.9in,minimum height=0.7in] (space2) at (1.1in,0) {};
\node [anchor=south west,circle,fill=orange,minimum width=0.1in,minimum height=0.1in] (unit3) at (1.5in,1.3) {};

\draw [->] ([yshift=1pt]unit1.north) .. controls +(north:0.4) and +(west:2) .. ([yshift=0.0em,xshift=-1pt]unit3.west);
\draw [->] ([xshift=1pt]unit2.east) .. controls +(east:1.5) and +(south:1) .. ([xshift=0.0em,yshift=-1pt]unit3.south);

xiaotong committed
467 468
\node [anchor=south] (spacelabel1) at (space1.north) {\scriptsize{离散表示空间}};
\node [anchor=south] (spacelabel2) at (space2.north) {\scriptsize{连续表示空间}};
xiaotong committed
469 470 471 472 473 474 475 476 477 478 479
\node [anchor=north] (captain1) at ([yshift=-0.5em,xshift=1em]space1.south east) {\scriptsize{(b) \textbf{神经机器翻译}}};

\end{scope}

\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
480 481 482 483 484
%%% 一个NMT运行的简单例子
\begin{frame}{简单的运行实例}
\begin{itemize}
\item 一个简单的例子:基于循环神经网络的翻译过程
        \begin{itemize}
xiaotong committed
485 486 487
        \item<1-> \textbf{编码器}顺序处理源语言单词
        \item<5-> 源语言句子信息被表示在最后一个循环单元的输出中
        \item<6-> \textbf{解码器}利用源语言句子信息逐词生成目标语译文
xiaotong committed
488 489 490
        \end{itemize}
\end{itemize}
%%% 运行实例的图
Lee committed
491 492 493 494 495
\begin{center}
    \begin{tikzpicture}
        \setlength{\base}{0.6cm}

        \tikzstyle{rnnnode} = [minimum size=\base,inner sep=0pt,rounded corners=1pt,draw]
xiaotong committed
496
        \tikzstyle{wordnode} = [font=\normalsize,align=center]
Lee committed
497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519

        \begin{scope}
            \visible<1->{
                \node[wordnode] (init) at (0,0) {$0$};
                \node[rnnnode,fill=blue!30!white,right=\base of init] (rnn1) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn1] (emb1) {};
                \node[wordnode,below=0pt of emb1] (word1) {};
                \draw[-latex'] (emb1.north) to (rnn1.south);
                \draw[-latex'] (init.east) to (rnn1.west);
            }
            \visible<2->{
                \node[rnnnode,fill=blue!30!white,right=\base of rnn1] (rnn2) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn2] (emb2) {};
                \node[wordnode,below=0pt of emb2] (word2) {};
                \draw[-latex'] (emb2.north) to (rnn2.south);
                \draw[-latex'] (rnn1.east) to (rnn2.west);
            }
            \visible<3->{
                \node[rnnnode,fill=blue!30!white,right=\base of rnn2] (rnn3) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn3] (emb3) {};
                \node[wordnode,below=0pt of emb3] (word3) {};
                \draw[-latex'] (emb3.north) to (rnn3.south);
                \draw[-latex'] (rnn2.east) to (rnn3.west);
xiaotong committed
520

Lee committed
521 522
                \node[rnnnode,fill=blue!30!white,right=\base of rnn3] (rnn4) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn4] (emb4) {};
xiaotong committed
523
                \node[wordnode,below=0pt of emb4] (word4) {EOS};
Lee committed
524 525 526
                \draw[-latex'] (emb4.north) to (rnn4.south);
                \draw[-latex'] (rnn3.east) to (rnn4.west);
            }
xiaotong committed
527 528
            \visible<4->{
                \draw[decoration={mirror,brace},decorate] (word1.south west) to node [auto,anchor=north,align=center] {编码器} ([yshift=-0.2em]word4.south east);
Lee committed
529 530
            }
            \visible<5->{
xiaotong committed
531 532 533
                \node[rnnnode,fill=purple] (repr) at (rnn4) {};
                \node[wordnode,above=\base of rnn2] (label) {源语言句子信息};
                \draw[->,dashed,thick] (label.east) .. controls +(east:\base) and +(north:\base) .. (rnn4.north);
Lee committed
534 535 536 537
            }
            \visible<6->{
                \node[rnnnode,fill=blue!30!white,right=\base of rnn4] (rnn5) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn5] (emb5) {};
xiaotong committed
538
                \node[wordnode,below=0pt of emb5] (word5) {SOS};
Lee committed
539 540
                \draw[-latex'] (emb5.north) to (rnn5.south);
                \draw[-latex'] (rnn4.east) to (rnn5.west);
xiaotong committed
541 542 543
                \node[rnnnode,fill=red!30!white,above=\base of rnn5] (softmax1) {};
                \node[wordnode,above=0pt of softmax1] (out1) {I};
                \draw[-latex'] (rnn5.north) to (softmax1.south);
Lee committed
544 545 546 547
            }
            \visible<7->{
                \node[rnnnode,fill=blue!30!white,right=\base of rnn5] (rnn6) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn6] (emb6) {};
xiaotong committed
548
                \node[rnnnode,fill=red!30!white,above=\base of rnn6] (softmax2) {};
Lee committed
549 550
                \ExtractX{$(emb6)$}
                \ExtractY{$(word4.base)$}
xiaotong committed
551
                \node[wordnode,anchor=base] (word6) at (\XCoord,\YCoord) {I};
Lee committed
552 553
                \ExtractX{$(emb6)$}
                \ExtractY{$(out1.base)$}
xiaotong committed
554
                \node[wordnode,anchor=base] (out2) at (\XCoord,\YCoord) {am};
Lee committed
555 556
                \draw[-latex'] (emb6.north) to (rnn6.south);
                \draw[-latex'] (rnn5.east) to (rnn6.west);
xiaotong committed
557 558 559
                \draw[-latex'] (rnn6.north) to (softmax2.south);
            }
            \visible<8->{
Lee committed
560 561
                \node[rnnnode,fill=blue!30!white,right=\base of rnn6] (rnn7) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn7] (emb7) {};
xiaotong committed
562
                \node[rnnnode,fill=red!30!white,above=\base of rnn7] (softmax3) {};
Lee committed
563 564
                \ExtractX{$(emb7)$}
                \ExtractY{$(word4.base)$}
xiaotong committed
565
                \node[wordnode,anchor=base] (word7) at (\XCoord,\YCoord) {am};
Lee committed
566 567
                \ExtractX{$(emb7)$}
                \ExtractY{$(out1.base)$}
xiaotong committed
568
                \node[wordnode,anchor=base] (out3) at (\XCoord,\YCoord) {fine};
Lee committed
569 570
                \draw[-latex'] (emb7.north) to (rnn7.south);
                \draw[-latex'] (rnn6.east) to (rnn7.west);
xiaotong committed
571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589
                \draw[-latex'] (rnn7.north) to (softmax3.south);

                \node[rnnnode,fill=blue!30!white,right=\base of rnn7] (rnn8) {};
                \node[rnnnode,fill=green!30!white,below=\base of rnn8] (emb8) {};
                \node[rnnnode,fill=red!30!white,above=\base of rnn8] (softmax4) {};
                \ExtractX{$(emb8)$}
                \ExtractY{$(word4.base)$}
                \node[wordnode,anchor=base] (word8) at (\XCoord,\YCoord) {fine};
                \ExtractX{$(emb8)$}
                \ExtractY{$(out1.base)$}
                \node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {EOS};
                \draw[-latex'] (emb8.north) to (rnn8.south);
                \draw[-latex'] (rnn7.east) to (rnn8.west);
                \draw[-latex'] (rnn8.north) to (softmax4.south);
            }
            \visible<9->{
                \ExtractX{$(word8.east)$}
                \ExtractY{$(word5.south)$}
                \draw[decoration={mirror,brace},decorate] ([yshift=-0.2em]word5.south west) to node [auto,anchor=north,align=center] {解码器} (\XCoord,\YCoord-0.2em);
Lee committed
590 591 592 593
            }
        \end{scope}
    \end{tikzpicture}
\end{center}
xiaotong committed
594 595 596 597 598
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% NMT是魔法?
\begin{frame}{神经机器翻译会``魔法''?}
Lee committed
599
\begin{minipage}[c][6.5cm][t]{0.46\textwidth}
xiaotong committed
600 601 602 603 604 605
\alert{统计机器翻译}仍然依赖人工定义特征和翻译单元
\begin{itemize}
\item 相对符合人类的理解
\item 具有一定可解释性
\end{itemize}
\vspace{1em}
Lee committed
606
\includegraphics[width=\textwidth,height=3.5cm]{./Figures/whiteboard-math.jpg}
xiaotong committed
607 608
\end{minipage}
\hfill
Lee committed
609
\begin{minipage}[c][6.5cm][t]{0.48\textwidth}
xiaotong committed
610 611 612 613 614 615
\alert{神经机器翻译}把所有工作都交给神经网络
\begin{itemize}
\item 不需要先验假设
\item 语言的``表示''是给机器看的
\end{itemize}
\vspace{1em}
Lee committed
616
\includegraphics[width=\textwidth,height=3.5cm]{./Figures/expelliarmus.jpg}
xiaotong committed
617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
\end{minipage}

\begin{itemize}
\item 神经机器翻译在使用魔法?\\
        - No! No! No! 变化的只有人类使用知识的形式
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% NMT所对应的范式变换
\begin{frame}{神经机器翻译所带来的范式更迭}
% Slide: a two-column table mapping each MT paradigm to the way humans
% contribute knowledge, followed (overlay 2 onwards) by a highlighted
% question box and three question/answer bullets. Questions appear on
% overlay 3, answers on overlay 4.
% Version-control residue that had been interleaved with the markup was
% removed so that the frame compiles.
\begin{itemize}
\item 不同机器翻译时代,人类知识的使用方式
\end{itemize}

\begin{center}
\begin{tabular}{l | l}
机器翻译方法 & 人类参与方式 \\ \hline
基于规则的方法 & 设计翻译规则 \\
传统统计方法 & 设计翻译特征 \\
神经网络方法 & 设计网络架构
\end{tabular}
\end{center}

\visible<2->{
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black,drop fuzzy shadow]
{\Large
\textbf{人类在机器翻译中将扮演什么角色?}
}
\end{tcolorbox}

\begin{itemize}
\item<3-> 机器翻译会逐渐替代人?\visible<4->{\alert{- 不会,爹是不能被儿子替代的}}
% The answer now matches the verb of the question ("还需要...吗?" -> "需要").
\item<3-> 还需要人的语言学知识吗?\visible<4->{\alert{- 需要,但是需要新的思路}}
\item<3-> 就连神经网络也可以通过结构搜索自动学习,人类是不是就什么也不用干了?\visible<4->{\alert{- 结构搜索又是谁设计的,人至少需要敲下键盘吧,哈哈哈哈}}
\end{itemize}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 本章的内容
\begin{frame}{Outline}
% Slide: chapter outline shown as three stacked boxes (RNN-based models with
% attention, Transformer, other applications).
% Version-control residue that had been interleaved with the markup was
% removed so that the frame compiles.
% \small is a declaration, not a one-argument command: the original
% "\small{...}" on every line switched the size for the rest of the group
% anyway. It is therefore written once after the box title, which renders
% identically.

\vspace{1.0em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\large
\textbf{入门:循环网络翻译模型及注意力机制} \\
\small
1. 起源 \\
2. 模型结构 \\
3. 注意力机制 \\
4. 训练和推断
}
\end{tcolorbox}

\vspace{0.2em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\large
\textbf{热门:Transformer} \\
\small
1. 自注意力模型 \\
2. 多头注意力和层正则化 \\
3. 更深、更宽的模型
}
\end{tcolorbox}

\vspace{0.2em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\large
\textbf{其它:一些有趣的神经机器翻译应用}
}
\end{tcolorbox}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\section{循环神经网络翻译模型及注意力机制}

%%%------------------------------------------------------------------------------------------------------------
%%% 第一部分
\begin{frame}{首先}
% Slide: title card for part one. A banner box sits above a small decorative
% diagram of a four-step recurrent network, drawn bottom-up as
% input word -> lower blue box -> "RNN Cell" -> upper blue box -> output.
% Version-control residue that had been interleaved with the markup was
% removed so that the frame compiles. The four-fold repetition of each
% node/arrow group is folded into \foreach loops; groups are emitted in the
% original order so the drawing order is unchanged.

\vspace{5.0em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{循环网络翻译模型及注意力机制}
}
\end{tcolorbox}

\begin{center}
\begin{tikzpicture}

\begin{scope}[scale=0.7]
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=3em,minimum height=0.8em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]

% Row of RNN cells: the first is placed at the origin, each following one to
% the right of its predecessor.
\node [anchor=west,rnnnode] (node11) at (0,0) {\tiny{RNN Cell}};
\foreach \y [count=\x from 2] in {1,2,3}
    \node [anchor=west,rnnnode] (node1\x) at ([xshift=1em]node1\y.east) {\tiny{RNN Cell}};

% Blue boxes below the cells.
\foreach \x in {1,2,3,4}
    \node [anchor=north,rnnnode,fill=blue!30!white] (e\x) at ([yshift=-1em]node1\x.south) {\tiny{}};

% Input words.
% NOTE(review): the second word was empty in the retrieved source, apparently
% a single CJK character lost in extraction; "让" is a reconstruction
% ("让 我们 开始") -- confirm against the authors' original.
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{让}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
\node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};

% Arrows: word -> lower box, then lower box -> cell.
\foreach \x in {1,2,3,4}
    \draw [->,thick] ([yshift=0.1em]w\x.north)--([yshift=-0.1em]e\x.south);
\foreach \x in {1,2,3,4}
    \draw [->,thick] ([yshift=0.1em]e\x.north)--([yshift=-0.1em]node1\x.south);

% Blue boxes above the cells.
\foreach \x in {1,2,3,4}
    \node [anchor=south,rnnnode,fill=blue!30!white] (node2\x) at ([yshift=1.0em]node1\x.north) {\tiny{}};

% Output labels.
% NOTE(review): all four labels are empty in the retrieved source; their text
% may have been lost in extraction and cannot be recovered from here.
\foreach \x in {1,2,3,4}
    \node [anchor=south] (output\x) at ([yshift=1em]node2\x.north) {\Large{\textbf{}}};

% Arrows: upper box -> output, then cell -> upper box.
\foreach \x in {1,2,3,4}
    \draw [->,thick] ([yshift=0.1em]node2\x.north)--([yshift=-0.1em]output\x.south);
\foreach \x in {1,2,3,4}
    \draw [->,thick] ([yshift=0.1em]node1\x.north)--([yshift=-0.1em]node2\x.south);

% Recurrent connections: incoming stub, cell-to-cell links, outgoing stub.
\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\foreach \y [count=\x from 2] in {1,2,3}
    \draw [->,thick] ([xshift=0.1em]node1\y.east)--([xshift=-0.1em]node1\x.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);
\end{scope}

\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{起源}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经机器翻译的历史
\begin{frame}{最初的神经机器翻译}
% Slide: neural networks were first used as components inside statistical MT;
% end-to-end NMT appeared in 2013--2015. Overlay 2 reveals a table of
% representative papers (year, authors, title; long titles wrap onto a second
% row).
% Version-control residue that had been interleaved with the markup was
% removed so that the frame compiles.

\begin{itemize}
\item 神经网络在机器翻译中并不新鲜,在很多模块中早有实现,比如,翻译候选打分、语言模型等
    \begin{itemize}
    \item 但是,整个框架仍然是统计机器翻译
    \end{itemize}
\item<2-> 基于神经元网络的端到端建模出现在2013--2015,被称为\alert{Neural Machine Translation (NMT)},一些代表性工作:
\end{itemize}

\visible<2->{
\begin{center}
{\footnotesize
\begin{tabular}{l | l | l}
\textbf{时间} & \textbf{作者} & \textbf{论文} \\ \hline
2013 & Kalchbrenner和 & Recurrent Continuous Translation Models \\
     & Blunsom        & \\
2014 & Sutskever等 & Sequence to Sequence Learning with \\
     &             & Neural Networks \\
2014 & Bahdanau等 & Neural Machine Translation by Jointly  \\
     &            & Learning to Align and Translate \\
2014 & Cho等 & On the Properties of Neural Machine \\
     &       & Translation \\
2015 & Jean等 & On Using Very Large Target Vocabulary \\
     &        & for Neural Machine Translation \\
2015 & Luong等 & Effective Approaches to Attention-based \\
     &         & Neural Machine Translation
\end{tabular}
}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 逐渐崛起的NMT
\begin{frame}{崛起}
% Slide: the rise of NMT. First bullet: SMT dominance before 2015.
% Overlay 2 adds the 2016 turning point (GNMT going online) together with a
% news screenshot.
% Version-control residue that had been interleaved with the markup was
% removed so that the frame compiles.
\begin{itemize}
\item 2015年前统计机器翻译(SMT)在NLP中是具有统治力的
    \begin{itemize}
    \item 当时的NMT系统还很初级,被SMT碾压
    \item 大多数人的认知还没有进化到NMT时代,甚至Kalchbrenner等人早期的报告也被人质疑
    \end{itemize}
\item<2-> 2016年情况大有改变,当时非常受关注的一项工作是Google上线了神经机器翻译系统GNMT
    \begin{itemize}
    \item 在GNMT前后,百度、微软、小牛翻译等也分别推出了自己的神经机器翻译系统,出现了百花齐放的局面
    \end{itemize}
\end{itemize}
\visible<2->{
\begin{center}
\includegraphics[scale=0.35]{./Figures/google-news.png}
\end{center}
}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{模型结构}

%%%------------------------------------------------------------------------------------------------------------
%%% 模型结构
\begin{frame}{基于循环神经网络的翻译模型}
% Slide: encoder-decoder overview. The figure has ten encoder columns
% (embedding box + recurrent unit) at the bottom and ten decoder columns
% (embedding box + recurrent unit + output box) above them; the last encoder
% unit is linked to the first decoder unit by a curved arrow and highlighted
% as the sentence representation. A legend sits to the right.
%
% Changes relative to the retrieved source:
%  * version-control residue interleaved with the markup was removed (it sat,
%    among other places, between \foreach headers and their bodies);
%  * the per-word \ExtractX/\ExtractY/\node triples are replaced by \foreach
%    loops using TikZ's perpendicular coordinate (a |- b), which takes x from
%    a and y from b;
%  * the legend caption nodes no longer reuse the name softmax2, which is
%    already taken by the second decoder output box.
\begin{itemize}
\item 一种简单的模型:用循环神经网络进行编码和解码
    \begin{itemize}
    \item 编码端是一个RNN,最后一个隐层状态被看做句子表示
    \item 解码端也是一个RNN,利用编码结果逐词解码出译文
    \end{itemize}
\end{itemize}

\vspace{-0.5em}
\begin{center}
    \begin{tikzpicture}
        \setlength{\base}{0.9cm}

        \tikzstyle{rnnnode} = [rounded corners=1pt,minimum size=0.5\base,draw,inner sep=0pt,outer sep=0pt]
        \tikzstyle{wordnode} = [font=\tiny]

        % RNN translation model
        \begin{scope}[local bounding box=RNNMT]
            % RNN Encoder: embedding boxes chained left to right from the
            % origin, with one recurrent unit above each.
            \coordinate (eemb0) at (0,0);
            \foreach \x [count=\y from 0] in {1,2,...,10}
                \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {};
            \foreach \x in {1,2,...,10}
                \node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.5\base]eemb\x.north) {};
            \node[wordnode,left=0.4\base of enc1] (init) {$0$};

            % Source words.
            % NOTE(review): positions 1, 3, 5, 6, 8 and 9 were empty in the
            % retrieved source (single CJK characters appear to have been
            % lost in extraction). They are reconstructed here to match the
            % English output shown in the decoder -- confirm against the
            % authors' original.
            \node[wordnode,below=0pt of eemb1] () {你};
            \node[wordnode,below=0pt of eemb2] () {知道};
            \node[wordnode,below=0pt of eemb3] () {去};
            \node[wordnode,below=0pt of eemb4] () {北京站};
            \node[wordnode,below=0pt of eemb5] () {的};
            \node[wordnode,below=0pt of eemb6] () {路};
            \node[wordnode,below=0pt of eemb7] () {怎么};
            \node[wordnode,below=0pt of eemb8] () {走};
            \node[wordnode,below=0pt of eemb9] () {吗};
            \node[wordnode,below=0pt of eemb10] () {$\langle$eos$\rangle$};

            % RNN Decoder: embedding box, recurrent unit and output box
            % stacked above every encoder column.
            \foreach \x in {1,2,...,10}
                \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=2\base]enc\x.north) {};
            \foreach \x in {1,2,...,10}
                \node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.5\base]demb\x.north) {};
            \foreach \x in {1,2,...,10}
                \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.5\base]dec\x.north) {};

            % Decoder input words: the first one fixes the shared baseline;
            % the others are centred under their column on that baseline.
            \node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
            \foreach \tok [count=\x from 2] in {Do,you,know,the,way,to,Beijing,Railway,Station}
                \node[wordnode,anchor=base] () at (demb\x.south |- decwordin.base) {\tok};

            % Decoder output words, placed the same way above the output boxes.
            \node[wordnode,above=0pt of softmax1] (decwordout) {Do};
            \foreach \tok [count=\x from 2] in {you,know,the,way,to,Beijing,Railway,Station,{$\langle$eos$\rangle$}}
                \node[wordnode,anchor=base] () at (softmax\x.north |- decwordout.base) {\tok};

            % Connections
            \draw[-latex'] (init.east) to (enc1.west);
            \foreach \x in {1,2,...,10}
                \draw[-latex'] (eemb\x) to (enc\x);
            \foreach \x in {1,2,...,10}
                \draw[-latex'] (demb\x) to (dec\x);
            \foreach \x in {1,2,...,10}
                \draw[-latex'] (dec\x.north) to ([yshift=0.5\base]dec\x.north);
            \foreach \x [count=\y from 2] in {1,2,...,9}
            {
                \draw[-latex'] (enc\x.east) to (enc\y.west);
                \draw[-latex'] (dec\x.east) to (dec\y.west);
            }
            % Curved link from the last encoder unit to the first decoder unit.
            \coordinate (bridge) at ([yshift=-1.2\base]demb2);
            \draw[-latex'] (enc10.north) .. controls +(north:\base) and +(east:1.5\base) .. (bridge) .. controls +(west:2.5\base) and +(west:0.6\base) .. (dec1.west);
        \end{scope}

        % legend
        \begin{scope}[shift={(10\base,2.5\base)}]
            \node[rnnnode,minimum height=0.5\base,fill=green!30!white,label={[label distance=3pt,font=\scriptsize]0:词嵌入层}] (emb) at (0,0) {};
            \node[rnnnode,fill=blue!30!white,anchor=north west,label={[label distance=3pt,font=\scriptsize]0:循环单元}] (rnn) at ([yshift=2\base]emb.south west) {};
            \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=north west,label={[label distance=3pt,font=\scriptsize]0:输出层}] (softmax) at ([yshift=2\base]rnn.south west) {};
            % Caption nodes use their own names so they do not overwrite the
            % decoder node softmax2.
            \node [anchor=north west] (softmaxnote) at ([xshift=0.6\base]softmax.south west) {\scriptsize{Softmax}};
            \node [anchor=north west] (rnnnote) at ([xshift=0.6\base]rnn.south west) {\scriptsize{LSTM}};

            % Callout and highlight for the last encoder unit.
            \node [anchor=west] (reprlabel) at ([xshift=1em]enc10.east) {\scriptsize{句子表示}};
            \draw [->,dashed] (reprlabel.west) -- ([xshift=0.1em]enc10.east);
            \node [rnnnode,fill=purple!30!white] at (enc10) {};
        \end{scope}
    \end{tikzpicture}
\end{center}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% NMT的数学描述
\begin{frame}{数学建模}
% Slide: mathematical formulation. The log-probability of the target sequence
% is written as a sum of per-word conditional log-probabilities; overlay 2
% lists the three steps the RNN model uses to compute each conditional.
% Version-control residue that had been interleaved with the markup was
% removed so that the frame compiles; literal "..." inside math is now \ldots.
\begin{itemize}
\item 对于源语言序列$\textbf{x} = \{x_1,x_2,\ldots,x_m\}$,生成目标语序列$\textbf{y} = \{y_1,y_2,\ldots,y_n\}$的概率可以被描述为

\begin{displaymath}
\log\textrm{P}(\textbf{y}|\textbf{x}) = \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
\end{displaymath}

根据源语言句子$\textbf{x}$和已生成的译文$\textbf{y}_{<j} = \{y_1,y_2,\ldots,y_{j-1}\}$生成第$j$个译文$y_j$

\item<2-> \textbf{核心}:如何求解$\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})$。在这个循环神经网络模型中,有三个步骤
    \begin{enumerate}
    \item 输入的单词用分布式表示,如$\textbf{x}$被表示为词向量序列$e_x(\textbf{x})$,同理$\textbf{y}_{<j}$被表示为$e_y(\textbf{y}_{<j})$
    \item 源语言句子被一个RNN编码为一个表示$\textbf{C}$,如前面的例子中是一个实数向量
    \item 目标端解码用另一个RNN,因此生成$y_j$时只考虑前一个状态$\textbf{s}_{j-1}$(这里,$\textbf{s}_{j-1}$表示RNN第$j-1$步骤的隐层状态)
    \end{enumerate}

\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 各部分的解释
\begin{frame}{数学建模(续)}
% Slide: annotated three-column excerpt of the encoder-decoder figure.
% Overlays 2-4 add callout boxes for the output layer, the embedding layers
% and the sentence representation (the boxes and their dotted outlines are
% drawn on the background layer); overlay 5 shows the resulting definition of
% the per-word conditional probability.
% Version-control residue that had been interleaved with the markup was
% removed so that the frame compiles; literal "..." inside math is now \ldots.

\vspace{-1.5em}
    \begin{center}
        % \hspace*{-1.5cm}
        \begin{tikzpicture}
            \setlength{\base}{0.9cm}

            \tikzstyle{rnnnode} = [rounded corners=1pt,minimum height=0.5\base,minimum width=1\base,draw,inner sep=0pt,outer sep=0pt]
            \tikzstyle{wordnode} = [font=\tiny]

            % RNN translation model
            \begin{scope}[local bounding box=RNNMT]
                % RNN Encoder
                \coordinate (eemb0) at (0,0);
                \foreach \x [count=\y from 0] in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
                % Hidden-state labels (outside the loop above); the last
                % unit is redrawn in purple.
                \node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
                \node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
                \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
                \node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
                \node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};

                % Last source words.
                % NOTE(review): the first two were empty in the retrieved
                % source (single CJK characters appear to have been lost in
                % extraction); "走" and "吗" are a reconstruction -- confirm
                % against the authors' original.
                \node[wordnode,below=0pt of eemb1] () {走};
                \node[wordnode,below=0pt of eemb2] () {吗};
                \node[wordnode,below=0pt of eemb3] () {$\langle$eos$\rangle$};

                % RNN Decoder
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
                \node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
                \node[wordnode,right=0.4\base of dec3] (end2) {$\cdots$};
                \node[wordnode,right=0.4\base of softmax3] (end3) {$\cdots$};

                % Decoder input words
                \node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
                \ExtractX{$(demb2.south)$}
                \ExtractY{$(decwordin.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
                \ExtractX{$(demb3.south)$}
                \ExtractY{$(decwordin.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};

                % Decoder output words
                \node[wordnode,above=0pt of softmax1] (decwordout) {Do};
                \ExtractX{$(softmax2.north)$}
                \ExtractY{$(decwordout.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
                \ExtractX{$(softmax3.north)$}
                \ExtractY{$(decwordout.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {know};

                % Connections
                \draw[-latex'] (init1.east) to (enc1.west);
                \draw[-latex'] (dec3.east) to (end2.west);
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (eemb\x) to (enc\x);
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (demb\x) to (dec\x);
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (dec\x.north) to (softmax\x.south);
                \foreach \x [count=\y from 2] in {1,2}
                {
                    \draw[-latex'] (enc\x.east) to (enc\y.west);
                    \draw[-latex'] (dec\x.east) to (dec\y.west);
                }

                % Curved link from the last encoder unit to the first decoder unit.
                \coordinate (bridge) at ([yshift=0.4\base]enc2.north west);
                \draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);

                % Callout text, overlay 2: output layer.
                \visible<2->{
                \node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_i$}};
                \node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
                \node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
                \node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|\ldots)$}};
                }

                % Callout text, overlay 3: embedding layers.
                % NOTE(review): the retrieved source had "$e_x()$$e_y()$"
                % with nothing in between; "和" is restored as the
                % apparently lost conjunction -- confirm.
                \visible<3->{
                \node [anchor=north west] (line11) at ([yshift=-1.8em]line4.west) {\scriptsize{每个词的one-hot}};
                \node [anchor=north west] (line12) at ([yshift=0.3em]line11.south west) {\scriptsize{离散化表示都被转化为}};
                \node [anchor=north west] (line13) at ([yshift=0.3em]line12.south west) {\scriptsize{实数向量,即词嵌入}};
                \node [anchor=north west] (line14) at ([yshift=0.3em]line13.south west) {\scriptsize{($e_x()$和$e_y()$函数)}};
                }

                % Callout text, overlay 4: sentence representation.
                \visible<4->{
                \node [anchor=west] (line21) at ([xshift=1.3em,yshift=1.5em]enc3.east)  {\scriptsize{源语编码器最后一个}};
                \node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{循环单元的输出被}};
                \node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{看作是句子的表示,}};
                \node [anchor=north west] (line24) at ([yshift=0.3em]line23.south west) {\scriptsize{记为$\textbf{C}$}};
                }

                % Filled callout boxes, dotted outlines around the referenced
                % nodes, and dotted arrows; all on the background layer.
                \begin{pgfonlayer}{background}
                \visible<2->{
                \node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line1) (line2) (line3) (line4)] (box1) {};
                \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=red] [fit = (softmax1) (softmax2) (softmax3)] (box4) {};
                \draw [->,dotted,very thick,red] ([yshift=1em,xshift=2.5em]box1.east) -- ([yshift=1em,xshift=0.1em]box1.east);
                }

                \visible<3->{
                \node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line11) (line12) (line13) (line14)] (box2) {};
                \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (eemb1) (eemb2) (eemb3)] (box5) {};
                \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (demb1) (demb2) (demb3)] (box6) {};
                \draw [->,dotted,very thick,ugreen] ([yshift=-1.3em,xshift=2.5em]box2.east) -- ([yshift=-1.3em,xshift=0.1em]box2.east);
                \draw [->,dotted,very thick,ugreen] ([xshift=0.1em]box6.west) .. controls +(west:1) and +(east:1) .. ([yshift=1.0em]box2.east) ;
                }

                \visible<4->{
                \node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!10,drop shadow,draw=purple] [fit = (line21) (line22) (line23) (line24)] (box3) {};
                \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=purple] [fit = (enc3)] (box7) {};
                \draw [->,dotted,very thick,purple] ([xshift=0.1em]box7.east) -- ([xshift=0.8em]box7.east) ;
                }

                \end{pgfonlayer}

            \end{scope}
        \end{tikzpicture}
    \end{center}

% Overlay 5: definition of the conditional in terms of the decoder state,
% the previous word and the sentence representation.
\visible<5->{
\vspace{-1.5em}
\begin{itemize}
\item 可以重新定义\\
\vspace{-0.8em}
\begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \textrm{P}(y_j|\textbf{s}_{j-1}, y_{j-1},\textbf{C})
\end{displaymath}

对于上图中的模型,进一步化简为:\\
\vspace{-0.3em}

\begin{displaymath}
\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x}) \triangleq \left\{
    \begin{matrix}
        \textrm{P}(y_j|\textbf{C},y_{j-1})\ \ \ \  & j = 1 \\
        \textrm{P}(y_j|\textbf{s}_{j-1},y_{j-1}) & j > 1
    \end{matrix} \right.
\end{displaymath}

\end{itemize}
}

\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 词嵌入
\begin{frame}{模块1:词嵌入层}
xiaotong committed
1149
    \begin{itemize}
1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162
    \item 词嵌入的作用是把离散化的单词表示转换为连续空间上的分布式表示
        \begin{itemize}
        \item<2-> 把输入的词转换成唯一对应的词表大小的0-1向量
        \item<3-> 根据0-1向量,从词嵌入矩阵中取出对应的词嵌入$e()$
        \item<4-> 取出的词嵌入$e()$作为循环神经网络的输入
    \end{itemize}
    \end{itemize}
    \vspace{-1em}
    %%% 图
    \begin{center}
        \hspace*{-0.6cm}
        \begin{tikzpicture}
            \setlength{\base}{0.9cm}
xiaotong committed
1163

1164 1165
            \tikzstyle{rnnnode} = [rounded corners=1pt,minimum height=0.5\base,minimum width=1\base,draw,inner sep=0pt,outer sep=0pt]
            \tikzstyle{wordnode} = [font=\tiny]
xiaotong committed
1166

1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179
            % RNN translation model
            \begin{scope}[local bounding box=RNNMT]
                % RNN Encoder
                \coordinate (eemb0) at (0,0);
                \foreach \x [count=\y from 0] in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
                    \node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
                    \node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
                    \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
                \node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
                \node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
xiaotong committed
1180

1181 1182 1183
                \node[wordnode,below=0pt of eemb1] () {走};
                \node[wordnode,below=0pt of eemb2] () {吗};
                \node[wordnode,below=0pt of eemb3] () {$\langle$eos$\rangle$};
xiaotong committed
1184

1185 1186 1187 1188 1189 1190 1191 1192 1193 1194
                % RNN Decoder
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
                \node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
                \node[wordnode,right=0.4\base of dec3] (end2) {$\cdots$};
                \node[wordnode,right=0.4\base of softmax3] (end3) {$\cdots$};
xiaotong committed
1195

1196 1197 1198 1199 1200 1201 1202 1203
                % Decoder input words
                \node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
                \ExtractX{$(demb2.south)$}
                \ExtractY{$(decwordin.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
                \ExtractX{$(demb3.south)$}
                \ExtractY{$(decwordin.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
xiaotong committed
1204

1205 1206 1207 1208 1209 1210 1211 1212
                % Decoder output words
                \node[wordnode,above=0pt of softmax1] (decwordout) {Do};
                \ExtractX{$(softmax2.north)$}
                \ExtractY{$(decwordout.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
                \ExtractX{$(softmax3.north)$}
                \ExtractY{$(decwordout.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {know};
xiaotong committed
1213

1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227
                % Connections
                \draw[-latex'] (init1.east) to (enc1.west);
                \draw[-latex'] (dec3.east) to (end2.west);
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (eemb\x) to (enc\x);
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (demb\x) to (dec\x);
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (dec\x.north) to (softmax\x.south);
                \foreach \x [count=\y from 2] in {1,2}
                {
                    \draw[-latex'] (enc\x.east) to (enc\y.west);
                    \draw[-latex'] (dec\x.east) to (dec\y.west);
                }
xiaotong committed
1228

1229 1230 1231
                \coordinate (bridge) at ([yshift=0.4\base]enc2.north west);
                \draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
            \end{scope}
xiaotong committed
1232

1233 1234 1235 1236
            \begin{scope}
                \coordinate (start) at (5.8\base,0.3\base);
                \visible<2->{
                    \node [anchor=south west] (one) at (start) {\scriptsize{$\begin{bmatrix} 0 \\ 0 \\ 0 \\ \vdots \\ 0 \\ {\color{ugreen} 1} \\ 0 \\ 0 \end{bmatrix}$}};
xiaotong committed
1237
                    \node [anchor=south west,inner sep=0pt] (T) at ([yshift=-0.5em,xshift=-0.5em]one.north east) {\tiny{T}};
1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255
                }
                \visible<3->{
                    \node [draw=ugreen,fill=green!20!white,rounded corners=0.3em,minimum width=3.8cm,minimum height=0.9em,anchor=south west] (emb) at ([shift={(1.25cm,0.8cm)}]start) {};
                }
                \node [anchor=north] (w) at ([yshift=3pt]one.south) {\scriptsize{\color{ugreen} you}};
                \node [anchor=north west] (words) at ([xshift=10pt]one.north east) {\scriptsize{$\begin{matrix} \langle\textrm{eos}\rangle \\ \langle\textrm{sos}\rangle \\ \textrm{Do} \\ \vdots \\ \textrm{know} \\ \textrm{you} \\ \textrm{?} \\ \textrm{have} \end{matrix}$}};
                \node [anchor=north west] (mat) at ([xshift=-6pt]words.north east) {\scriptsize{$
                    \begin{bmatrix}
                        .1 & -4 & \cdots & 2 \\
                        5 & 2 & \cdots & .2 \\
                        2 & .1 & \cdots & .3 \\
                        \vdots & \vdots & \ddots & \vdots \\
                        0 & .8 & \cdots & 4 \\
                        -1 & -2 & \cdots & -3 \\
                        .7 &  .5 & \cdots & 3 \\
                        -2 & .3 & \cdots & .1
                    \end{bmatrix}
                $}};
xiaotong committed
1256

1257
                \draw [decorate,decoration={brace,mirror}] ([shift={(6pt,2pt)}]mat.south west) to node [auto,swap,font=\scriptsize] {词嵌入矩阵} ([shift={(-6pt,2pt)}]mat.south east);
xiaotong committed
1258

1259 1260 1261 1262 1263 1264 1265 1266
                \visible<3->{
                    \draw [-latex'] ([xshift=-2pt,yshift=-0.65cm]one.east) to ([yshift=-0.65cm]words.west);
                }
                \visible<4->{
                    \draw [-latex'] (emb.east) -| ([yshift=0.4cm]mat.north east) node [pos=1,above] {\scriptsize{RNN输入}};
                }
                \draw [-latex'] ([yshift=-0.4cm]w.south) to ([yshift=2pt]w.south);
                \node [anchor=north] (wlabel) at ([yshift=-0.6em]w.south) {\scriptsize{输入的单词}};
xiaotong committed
1267

1268 1269
                \node [draw=ugreen,densely dashed,thick,rounded corners=3pt,fit=(one) (words) (mat) (w)] (input) {};
            \end{scope}
xiaotong committed
1270

1271 1272 1273
            \draw [->,thick,densely dashed,ugreen] ([yshift=-0.2em]demb3.east) to [out=0,in=180] ([yshift=-1cm]input.west);
        \end{tikzpicture}
    \end{center}
xiaotong committed
1274 1275 1276 1277 1278
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 输出
\begin{frame}{模块2:输出层}
xiaotong committed
1279
    \begin{itemize}
    \item 输出层需要得到每个目标语单词的生成概率,进而选取概率最高的词作为输出。但RNN中的隐藏层并不会输出单词概率,而是输出$s$,其每一行对应一个单词表示
        \begin{itemize}
        \item<2-> $s$经过权重矩阵$W$变成$\hat{s}$,其隐藏层维度变换成词表的大小
        \item<3-> $\hat{s}$经过Softmax变换得到不同词作为输出的概率,即单词$i$的概率$p_i = \textrm{Softmax}(i) = \frac{e^{\hat{s}_i}}{\sum_{j} e^{\hat{s}_{j}}} $
        \end{itemize}
    \end{itemize}
1286 1287 1288 1289 1290
    %%% 图
    \begin{center}
        \hspace*{-0.6cm}
        \begin{tikzpicture}
            % Base length for this picture; most offsets below are written as multiples of \base.
            \setlength{\base}{0.9cm}

            % rnnnode: bordered box used for the embedding, RNN-state and Softmax cells.
            \tikzstyle{rnnnode} = [rounded corners=1pt,minimum height=0.5\base,minimum width=1\base,draw,inner sep=0pt,outer sep=0pt]
            % wordnode: tiny text used for word labels and ellipses.
            \tikzstyle{wordnode} = [font=\tiny]
xiaotong committed
1294

1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307
            % RNN translation model
            \begin{scope}[local bounding box=RNNMT]
                % RNN Encoder: embedding boxes eemb1..eemb3 are chained left to right starting
                % from (eemb0); one encoder state box enc1..enc3 is stacked above each of them.
                \coordinate (eemb0) at (0,0);
                \foreach \x [count=\y from 0] in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
                % State labels sit on top of the empty encoder boxes (these three nodes are NOT
                % inside the loop above). The last state is redrawn as a purple box of its own.
                \node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
                \node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
                \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
                % Ellipses to the left of the encoder rows.
                \node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
                \node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};

                % Source words under the embeddings; only the last one carries text here.
                \node[wordnode,below=0pt of eemb1] () {};
                \node[wordnode,below=0pt of eemb2] () {};
                \node[wordnode,below=0pt of eemb3] () {$\langle$eos$\rangle$};

                % RNN Decoder: per step an embedding box (demb), a state box (dec) and a
                % Softmax box, stacked bottom-up above the matching encoder column.
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
                \foreach \x in {1,2,...,3}
                    \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
                % Ellipses to the right of the three decoder rows.
                \node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
                \node[wordnode,right=0.4\base of dec3] (end2) {$\cdots$};
                \node[wordnode,right=0.4\base of softmax3] (end3) {$\cdots$};

                % Decoder input words
                % \ExtractX/\ExtractY are defined outside this chunk; presumably they store the
                % x / y component of the given point in \XCoord / \YCoord (TODO confirm), so the
                % later words share the baseline of the first one.
                \node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
                \ExtractX{$(demb2.south)$}
                \ExtractY{$(decwordin.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
                \ExtractX{$(demb3.south)$}
                \ExtractY{$(decwordin.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};

                % Decoder output words (same baseline-alignment approach as the input words)
                \node[wordnode,above=0pt of softmax1] (decwordout) {Do};
                \ExtractX{$(softmax2.north)$}
                \ExtractY{$(decwordout.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
                \ExtractX{$(softmax3.north)$}
                \ExtractY{$(decwordout.base)$}
                \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {know};

                % Connections
                % Into the first encoder state and out of the last decoder state.
                \draw[-latex'] (init1.east) to (enc1.west);
                \draw[-latex'] (dec3.east) to (end2.west);
                % Vertical arrows: embedding -> state, and decoder state -> Softmax.
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (eemb\x) to (enc\x);
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (demb\x) to (dec\x);
                \foreach \x in {1,2,...,3}
                    \draw[-latex'] (dec\x.north) to (softmax\x.south);
                % Horizontal arrows between neighbouring states (\y = \x + 1).
                \foreach \x [count=\y from 2] in {1,2}
                {
                    \draw[-latex'] (enc\x.east) to (enc\y.west);
                    \draw[-latex'] (dec\x.east) to (dec\y.west);
                }

                % Curved arrow from the last encoder state through (bridge) to the first decoder state.
                \coordinate (bridge) at ([yshift=0.4\base]enc2.north west);
                \draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
            \end{scope}
xiaotong committed
1360

1361 1362
            \begin{scope}
                % Centre of the (hidden) frame; everything else in this scope is positioned from it.
                \coordinate (start) at (8.5\base,0.1\base);

                % Decoder state s: a rounded frame holding a few shaded sample cells.
                % (minimum size already sets both width and height, so no separate minimum width.)
                \node [anchor=center,minimum width=5.7em,minimum height=1.3em,draw,rounded corners=0.3em] (hidden) at (start) {};
                \node [anchor=west,minimum size=1em,fill=ugreen!20] (cell01) at ([xshift=0.2em]hidden.west) {\scriptsize{.2}};
                \node [anchor=west,minimum size=1em,fill=ugreen!10] (cell02) at (cell01.east) {\scriptsize{-1}};
                \node [anchor=west,minimum size=1em,fill=white] (cell03) at (cell02.east) {\scriptsize{$\cdots$}};
                \node [anchor=west,minimum size=1em,fill=ugreen!50] (cell04) at (cell03.east) {\scriptsize{5}};

                % From slide 2: the wider vector \hat{s} above the hidden state, one cell per word shown.
                \visible<2->{
                    \node [anchor=south,minimum width=10.9em,minimum height=1.3em,draw,rounded corners=0.3em] (target) at ([yshift=1.5em]hidden.north) {};
                    \node [anchor=west,minimum size=1em,fill=ugreen!10] (cell11) at ([xshift=0.2em]target.west) {\scriptsize{-2}};
                    \node [anchor=west,minimum size=1em,fill=ugreen!10] (cell12) at (cell11.east) {\scriptsize{-1}};
                    \node [anchor=west,minimum size=1em,fill=ugreen!30] (cell13) at (cell12.east) {\scriptsize{.7}};
                    \node [anchor=west,minimum size=1em,fill=white] (cell14) at (cell13.east) {\scriptsize{$\cdots$}};
                    \node [anchor=west,minimum size=1em,fill=ugreen!70] (cell15) at (cell14.east) {\scriptsize{6}};
                    \node [anchor=west,minimum size=1em,fill=ugreen!10] (cell16) at (cell15.east) {\scriptsize{-3}};
                    \node [anchor=west,minimum size=1em,fill=ugreen!10] (cell17) at (cell16.east) {\scriptsize{-1}};
                    \node [anchor=west,minimum size=1em,fill=ugreen!20] (cell18) at (cell17.east) {\scriptsize{.2}};
                }

                % From slide 3: one bar per cell (labelN, height set by minimum height) with the
                % word written vertically above it (wN).
                \visible<3->{
                    \node [anchor=south,minimum width=1em,minimum height=0.2em,fill=ublue!80,inner sep=0pt] (label1) at ([yshift=2.5em]cell11.north) {};
                    \node [anchor=west,rotate=90,font=\tiny] (w1) at (label1.north) {$\langle$eos$\rangle$};
                    \node [anchor=south,minimum width=1em,minimum height=0.3em,fill=ublue!80,inner sep=0pt] (label2) at ([yshift=2.5em]cell12.north) {};
                    \node [anchor=west,rotate=90,font=\tiny] (w2) at (label2.north) {$\langle$sos$\rangle$};
                    \node [anchor=south,minimum width=1em,minimum height=0.5em,fill=ublue!80,inner sep=0pt] (label3) at ([yshift=2.5em]cell13.north) {};
                    \node [anchor=west,rotate=90,font=\tiny] (w3) at (label3.north) {Do};
                    \node [anchor=south,font=\scriptsize] (w4) at ([yshift=2.5em]cell14.north) {$\cdots$};
                    \node [anchor=south,minimum width=1em,minimum height=1em,fill=ublue!80,inner sep=0pt] (label5) at ([yshift=2.5em]cell15.north) {};
                    % The tallest bar's word turns red from slide 4 on, when it is picked as output.
                    \alt<4->
                    {
                        \node [anchor=west,rotate=90,font=\tiny] (w5) at (label5.north) {{\color{red} know}};
                    }
                    {
                        \node [anchor=west,rotate=90,font=\tiny] (w5) at (label5.north) {know};
                    }
                    \node [anchor=south,minimum width=1em,minimum height=0.1em,fill=ublue!80,inner sep=0pt] (label6) at ([yshift=2.5em]cell16.north) {};
                    \node [anchor=west,rotate=90,font=\tiny] (w6) at (label6.north) {you};
                    \node [anchor=south,minimum width=1em,minimum height=0.3em,fill=ublue!80,inner sep=0pt] (label7) at ([yshift=2.5em]cell17.north) {};
                    \node [anchor=west,rotate=90,font=\tiny] (w7) at (label7.north) {?};
                    \node [anchor=south,minimum width=1em,minimum height=0.4em,fill=ublue!80,inner sep=0pt] (label8) at ([yshift=2.5em]cell18.north) {};
                    \node [anchor=west,rotate=90,font=\tiny] (w8) at (label8.north) {have};
                }

                % From slide 2: red trapezoid and two arrows widening from (hidden) to (target),
                % labelled with the linear map.
                \visible<2->{
                    \filldraw [fill=red!20,draw=white] (target.south west) -- (target.south east) -- ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- ([xshift=0.2em,yshift=0.1em]hidden.north west);
                    \draw [->,thick] ([xshift=0.2em,yshift=0.1em]hidden.north west) -- (target.south west);
                    \draw [->,thick] ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- (target.south east);

                    \node [anchor=south] () at ([yshift=0.3em]hidden.north) {\scriptsize{$\hat{s}=Ws$}};
                }

                % From slide 3: blue band between the target cells and the bars, with the Softmax formula.
                % The (softmax) node is intentionally created twice: the first copy provides the
                % .west/.east anchors the filled path needs, the second repaints the formula after
                % the fill has been drawn over it.
                \visible<3->{
                    \node [rounded corners=0.3em] (softmax) at ([yshift=1.25em]target.north) {\scriptsize{$p(\hat{s}_i)=\frac{e^{\hat{s}_i}}{\sum_j e^{\hat{s}_j}}$}};
                    \filldraw [fill=blue!20,draw=white] ([yshift=0.1em]cell11.north west) {[rounded corners=0.3em] -- (softmax.west)} -- (label1.south west) -- (label8.south east) {[rounded corners=0.3em] -- (softmax.east)} -- ([yshift=0.1em]cell18.north east) -- ([yshift=0.1em]cell11.north west);
                    \node [rounded corners=0.3em] (softmax) at ([yshift=1.25em]target.north) {\scriptsize{$p(\hat{s}_i)=\frac{e^{\hat{s}_i}}{\sum_j e^{\hat{s}_j}}$}};
                }
                % Arrow entering the hidden state from below (shown on every slide).
                \draw [-latex'] ([yshift=-0.3cm]hidden.south) to (hidden.south);
                % From slide 4: arrow leaving the selected word.
                \visible<4->{
                    \draw [-latex'] (w5.east) to ([yshift=0.3cm]w5.east);
                }

                % Dashed red frame around the whole detail view; (tmp) sits 3pt below the end of
                % the w5 label and is included in the fit list.
                \coordinate (tmp) at ([yshift=-3pt]w5.east);
                \node [draw=red,thick,densely dashed,rounded corners=3pt,inner sep=5pt,fit=(cell01) (cell11) (label1) (label8) (target) (hidden) (tmp)] (output) {};
            \end{scope}
xiaotong committed
1427

1428 1429 1430
            \draw [->,thick,densely dashed,red] ([yshift=-0.2em]softmax3.east) .. controls +(east:2\base) and +(west:\base) .. (output.west);
        \end{tikzpicture}
    \end{center}
xiaotong committed
1431 1432 1433
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
1434
%%% LSTM
Lee committed
1435
\begin{frame}{模块3:循环单元 - 长短时记忆模型(LSTM)}
xiaotong committed
1436 1437 1438
\begin{itemize}
\item LSTM是最常用的循环单元结构,它是一种典型的记忆网络,通过``门''单元来动态地选择遗忘多少以前的信息
\end{itemize}
xiaotong committed
1439
%%% 图
Lee committed
1440 1441
\begin{center}
    \begin{tikzpicture}
Lee committed
1442
        % Grid spacing of the LSTM picture; skeleton offsets below are multiples of \base.
        \setlength{\base}{0.6cm}

        % wordnode: small text for signal names.
        \tikzstyle{wordnode} = [font=\scriptsize]
        % auxnode: marker for skeleton points; opacity=0 keeps it invisible in the output.
        \tikzstyle{auxnode} = [inner sep=0pt,outer sep=0pt,opacity=0,draw=red,fill=red,circle,minimum size=3pt]
        % opnode: green operator symbol (shape is chosen per node: circle or rectangle).
        \tikzstyle{opnode} = [inner sep=0pt,outer sep=0pt,draw,fill=green!30!white,font=\scriptsize,minimum size=10pt]
        % standard / emph: wire styles; emph is the red variant used on the slide that
        % introduces a component.
        \tikzstyle{standard} = [rounded corners=5pt,thick]
        \tikzstyle{emph} = [rounded corners=5pt,thick,draw=red]
        % formulanode: red callout box that holds a gate's formula.
        \tikzstyle{formulanode} = [font=\scriptsize,align=left,draw=red,rectangle,fill=red!10!white,rounded corners=2pt,drop shadow]
Lee committed
1450 1451 1452 1453 1454 1455 1456 1457

        % Skeleton
        \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
            % Named coordinates (auxNN) that the wiring is routed through. Each one also gets
            % an auxnode marker with a numeric label; both the auxnode style and the label
            % style set opacity=0, so neither is visible in the output.
            % The two digits look like a row/column index on a \base-spaced grid.

            % Row 1: entry point at the origin.
            \coordinate (aux12) at (0,0);
            \node[auxnode,label={-45:12}] () at (aux12) {};

            % Row 2: horizontal line one \base above the origin; the outermost points
            % (aux21, aux29) are offset by 2\base instead of \base.
            \coordinate (aux22) at ([yshift=\base]aux12);
            \node[auxnode,label={-45:22}] () at (aux22) {};
            \coordinate (aux21) at ([xshift=-2\base]aux22);
            \node[auxnode,label={-45:21}] () at (aux21) {};
            \coordinate (aux23) at ([xshift=\base]aux22);
            \node[auxnode,label={-45:23}] () at (aux23) {};
            \coordinate (aux24) at ([xshift=\base]aux23);
            \node[auxnode,label={-45:24}] () at (aux24) {};
            \coordinate (aux25) at ([xshift=\base]aux24);
            \node[auxnode,label={-45:25}] () at (aux25) {};
            \coordinate (aux26) at ([xshift=\base]aux25);
            \node[auxnode,label={-45:26}] () at (aux26) {};
            \coordinate (aux27) at ([xshift=\base]aux26);
            \node[auxnode,label={-45:27}] () at (aux27) {};
            \coordinate (aux28) at ([xshift=\base]aux27);
            \node[auxnode,label={-45:28}] () at (aux28) {};
            \coordinate (aux29) at ([xshift=2\base]aux28);
            \node[auxnode,label={-45:29}] () at (aux29) {};

            % Row 3: points one \base above selected row-2 columns.
            \coordinate (aux33) at ([yshift=\base]aux23);
            \node[auxnode,label={-45:33}] () at (aux33) {};
            \coordinate (aux34) at ([yshift=\base]aux24);
            \node[auxnode,label={-45:34}] () at (aux34) {};
            \coordinate (aux35) at ([yshift=\base]aux25);
            \node[auxnode,label={-45:35}] () at (aux35) {};
            \coordinate (aux37) at ([yshift=\base]aux27);
            \node[auxnode,label={-45:37}] () at (aux37) {};

            % Row 4: a single point above aux35.
            \coordinate (aux45) at ([yshift=\base]aux35);
            \node[auxnode,label={-45:45}] () at (aux45) {};

            % Row 5: aux55 is stacked above aux45; the other row-5 points take their x from a
            % row-2 point and their y from aux55 via \ExtractX/\ExtractY (defined outside this
            % chunk; presumably they set \XCoord / \YCoord -- TODO confirm).
            \coordinate (aux55) at ([yshift=\base]aux45);
            \node[auxnode,label={-45:55}] () at (aux55) {};
            \ExtractX{$(aux21)$}
            \ExtractY{$(aux55)$}
            \coordinate (aux51) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:51}] () at (aux51) {};
            \ExtractX{$(aux23)$}
            \ExtractY{$(aux55)$}
            \coordinate (aux53) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:53}] () at (aux53) {};
            \ExtractX{$(aux28)$}
            \ExtractY{$(aux55)$}
            \coordinate (aux58) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:58}] () at (aux58) {};
            \ExtractX{$(aux29)$}
            \ExtractY{$(aux55)$}
            \coordinate (aux59) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:59}] () at (aux59) {};

            % Row 6: a single point above aux58.
            \coordinate (aux68) at ([yshift=\base]aux58);
            \node[auxnode,label={-45:68}] () at (aux68) {};
        \end{scope}
Lee committed
1509 1510

        \begin{scope}
            % Wiring and operators of the LSTM cell. Each component is drawn twice: once in the
            % red `emph' style on the single slide that introduces it, and once in the
            % `standard' style on all later slides.

            % Invisible stand-ins for the two operators on the top line, so that wires drawn on
            % earlier slides can already end at (f53) and (u55); the visible operators are
            % added in the cell-update step below.
            \node[opnode,circle,opacity=0] (f53) at (aux53) {};
            \node[opnode,circle,opacity=0] (u55) at (aux55) {};
            % forget gate
            \visible<1>{
                \draw[emph] (aux21) -- (aux23) -- (aux33);
                \draw[-latex,emph] (aux12) -- (aux22) -- (aux23) -- (f53);
                \node[opnode,circle,draw=red,thick] () at (aux33) {$\sigma$};
            }
            \visible<2->{
                \draw[standard] (aux21) -- (aux23) -- (aux33);
                \draw[-latex,standard] (aux12) -- (aux22) -- (aux23) -- (f53);
                \node[opnode,circle] () at (aux33) {$\sigma$};
            }
            % input gate
            % (i45) is first created empty so the arrow can end at its border, then created
            % again with the X symbol so the operator is painted over the wire ends.
            \visible<2>{
                \node[opnode,circle] (i45) at (aux45) {};
                \draw[-latex,emph] (aux21) -- (aux24) |- (i45);
                \draw[-latex,emph] (aux21) -- (aux25) -- (u55);
                \draw[emph] (aux12) -- (aux22) -- (aux23);
                \node[opnode,circle,draw=red,thick] () at (aux34) {$\sigma$};
                \node[opnode,rectangle,rounded corners=2pt,inner sep=2pt,draw=red,thick] () at (aux35) {$\mathrm{tanh}$};
                \node[opnode,circle,draw=red,thick] (i45) at (aux45) {X};
            }
            \visible<3->{
                \node[opnode,circle] (i45) at (aux45) {};
                \draw[-latex,standard] (aux21) -- (aux24) |- (i45);
                \draw[-latex,standard] (aux21) -- (aux25) -- (u55);
                \node[opnode,circle] () at (aux34) {$\sigma$};
                \node[opnode,rectangle,rounded corners=2pt,inner sep=2pt] () at (aux35) {$\mathrm{tanh}$};
                \node[opnode,circle] (i45) at (aux45) {X};
            }
            % cell update
            % Top line from aux51 to aux59 with the visible X (f53) and + (u55) operators.
            \visible<3>{
                \draw[-latex,emph] (aux51) -- (aux59);
                \node[opnode,circle,draw=red,thick] (f53) at (aux53) {X};
                \node[opnode,circle,draw=red,thick] (u55) at (aux55) {\textbf{+}};
            }
            \visible<4->{
                \draw[-latex,standard] (aux51) -- (aux59);
                \node[opnode,circle] (f53) at (aux53) {X};
                \node[opnode,circle] (u55) at (aux55) {\textbf{+}};
            }
            % output gate
            % Operator (o27) takes the wire coming down from (u55) and the wire along row 2,
            % and feeds both outgoing arrows (to aux29 and up to aux68).
            \visible<4>{
                \node[opnode,circle,draw=red,thick] (o27) at (aux27) {X};
                \draw[-latex,emph] (u55) -| (o27);
                \draw[-latex,emph] (aux21) -- (o27);
                \draw[emph] (aux12) -- (aux22) -- (aux23);
                \node[opnode,circle,draw=red,thick] () at (aux26) {$\sigma$};
                \node[opnode,rectangle,rounded corners=2pt,inner sep=2pt,draw=red,thick] () at (aux37) {$\mathrm{tanh}$};

                \draw[-latex,emph] (o27) -- (aux29);
                \draw[-latex,emph] (o27) -| (aux68);
            }
            \visible<5->{
                \node[opnode,circle] (o27) at (aux27) {X};
                \draw[-latex,standard] (u55) -| (o27);
                \draw[-latex,standard] (aux21) -- (o27);
                \node[opnode,circle] () at (aux26) {$\sigma$};
                \node[opnode,rectangle,rounded corners=2pt,inner sep=2pt] () at (aux37) {$\mathrm{tanh}$};

                \draw[-latex,standard] (o27) -- (aux29);
                \draw[-latex,standard] (o27) -| (aux68);
            }
        \end{scope}
Lee committed
1576

Lee committed
1577
        \begin{scope}
            % Names of the signals at the ends of the wires.
            % Inputs are labelled on every slide.
            \node[wordnode,anchor=south] () at ([xshift=0.5\base]aux21) {$\mathbf{h}_{t-1}$};
            \node[wordnode,anchor=west] () at (aux12) {$\mathbf{x}_t$};
            \node[wordnode,anchor=south] () at ([xshift=0.5\base]aux51) {$\mathbf{c}_{t-1}$};
            % The new memory label appears together with the cell-update step (slide 3).
            \visible<3->{
                \node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux59) {$\mathbf{c}_{t}$};
            }
            % The new hidden state is labelled at both of its exits from slide 4 on.
            \visible<4->{
                \node[wordnode,anchor=east] () at (aux68) {$\mathbf{h}_{t}$};
                \node[wordnode,anchor=south] () at ([xshift=-0.5\base]aux29) {$\mathbf{h}_{t}$};
            }
        \end{scope}

        % Body of the LSTM cell: a filled, rounded box fitted around the listed skeleton points
        % and operator nodes. It is placed on the `background' layer so it sits behind the
        % wiring and operators drawn earlier.
        \begin{pgfonlayer}{background}
            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=4pt,fit=(aux22) (aux58) (u55) (o27)] (LSTM) {};
        \end{pgfonlayer}

        \begin{scope}
            % Formula callouts, one per step, each positioned relative to a corner point of
            % the skeleton (aux51, aux21, aux59, aux29). A callout appears on the slide that
            % introduces its step and stays visible afterwards.
            \visible<1->{
                % forget gate formula
                \node[formulanode,anchor=south east,text width=3.4cm] () at ([shift={(4\base,1.5\base)}]aux51) {遗忘门\\$\mathbf{f}_t=\sigma(\mathbf{W}_f[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_f)$};
            }
            \visible<2->{
                % input gate formula
                \node[formulanode,anchor=north east] () at ([shift={(4\base,-1.5\base)}]aux21) {输入门\\$\mathbf{i}_t=\sigma(\mathbf{W}_i[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_i)$\\$\hat{\mathbf{c}}_t=\mathrm{tanh}(\mathbf{W}_c[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_c)$};
            }
            \visible<3->{
                % cell update formula
                \node[formulanode,anchor=south west,text width=3.02cm] () at ([shift={(-4\base,1.5\base)}]aux59) {记忆更新\\$\mathbf{c}_{t}=\mathbf{f}_t\cdot \mathbf{c}_{t-1}+\mathbf{i}_t\cdot \hat{\mathbf{c}}_t$};
            }
            \visible<4->{
                % output gate formula
                \node[formulanode,anchor=north west] () at ([shift={(-4\base,-1.5\base)}]aux29) {输出门\\$\mathbf{o}_t=\sigma(\mathbf{W}_o[\mathbf{h}_{t-1},\mathbf{x}_t]+\mathbf{b}_o)$\\$\mathbf{h}_{t}=\mathbf{o}_t\cdot \mathrm{tanh}(\mathbf{c}_{t})$};
            }
        \end{scope}
Lee committed
1612 1613
    \end{tikzpicture}
\end{center}
xiaotong committed
1614 1615

{\scriptsize\begin{tabular}{l}
    *$\mathbf{x}_t$: 上一层的输出,$\mathbf{h}_{t-1}$: 同一层上一时刻的隐藏状态\\
    *$\mathbf{c}_{t-1}$: 同一层上一时刻的记忆
\end{tabular}}
xiaotong committed
1619 1620 1621
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
1622
%%% GRU
xiaotong committed
1623
\begin{frame}{另一种循环单元 - 门循环单元(GRU)}
xiaotong committed
1624
\begin{itemize}
Lee committed
1625
\item GRU是LSTM的一个变种,它把隐藏状态$h$和记忆$c$合并成一个隐藏状态$h$,同时使用了更少的``门''单元,大大提升了计算效率
xiaotong committed
1626 1627 1628
    \begin{itemize}
    \item 在NMT中GRU会带来20-25\%的速度提升
    \end{itemize}
xiaotong committed
1629 1630
\end{itemize}
%%% 图
Lee committed
1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747
\begin{center}
    \begin{tikzpicture}
        \setlength{\base}{0.6cm}

        \tikzstyle{auxnode} = [inner sep=0pt,outer sep=0pt,opacity=0,draw=red,fill=red,circle,minimum size=3pt]
        \tikzstyle{wordnode} = [font=\scriptsize]
        \tikzstyle{opnode} = [inner sep=0pt,outer sep=0pt,draw,fill=green!30!white,font=\scriptsize,minimum size=10pt]
        \tikzstyle{standard} = [rounded corners=5pt,thick]
        \tikzstyle{emph} = [rounded corners=5pt,thick,draw=red]
        \tikzstyle{formulanode} = [font=\scriptsize,align=left,draw=red,rectangle,fill=red!10!white,rounded corners=2pt,drop shadow]

        % Skeleton
        \begin{scope}[every label/.append style={label distance=1pt,font=\tiny,inner sep=0pt,opacity=0}]
            \coordinate (aux12) at (0,0);
            \node[auxnode,label={-45:12}] () at (aux12) {};

            \coordinate (aux22) at ([yshift=\base]aux12);
            \node[auxnode,label={-45:22}] () at (aux22) {};
            \coordinate (aux23) at ([xshift=0.5\base]aux22);
            \node[auxnode,label={-45:23}] () at (aux23) {};

            \coordinate (aux32) at ([yshift=0.4\base]aux22);
            \node[auxnode,label={-45:32}] () at (aux32) {};

            \ExtractX{$([xshift=\base]aux23)$}
            \ExtractY{$([yshift=\base]aux32)$}
            \coordinate (aux44) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:44}] () at (aux44) {};
            \coordinate (aux45) at ([xshift=\base]aux44);
            \node[auxnode,label={-45:45}] () at (aux45) {};
            \coordinate (aux46) at ([xshift=1.3\base]aux45);
            \node[auxnode,label={-45:46}] () at (aux46) {};

            \ExtractX{$(aux23)$}
            \ExtractY{$([yshift=\base]aux44)$}
            \coordinate (aux53) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:53}] () at (aux53) {};
            \coordinate (aux56) at ([yshift=\base]aux46);
            \node[auxnode,label={-45:56}] () at (aux56) {};

            \ExtractX{$(aux45)$}
            \ExtractY{$([yshift=0.5\base]aux56)$}
            \coordinate (aux65) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:65}] () at (aux65) {};

            \ExtractX{$([xshift=-\base]aux12)$}
            \ExtractY{$([yshift=\base]aux65)$}
            \coordinate (aux71) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:71}] () at (aux71) {};
            \coordinate (aux75) at ([yshift=\base]aux65);
            \node[auxnode,label={-45:75}] () at (aux75) {};
            \ExtractX{$(aux56)$}
            \ExtractY{$(aux75)$}
            \coordinate (aux76) at (\XCoord,\YCoord);
            \node[auxnode,label={-45:76}] () at (aux76) {};
            \coordinate (aux78) at ([xshift=1.7\base]aux76);
            \node[auxnode,label={-45:78}] () at (aux78) {};

            \coordinate (aux87) at ([shift={(0.7\base,1.3\base)}]aux76);
            \node[auxnode,label={-45:87}] () at (aux87) {};
        \end{scope}

        % GRU cell wiring, revealed stage by stage across overlays: each stage is drawn with the
        % `emph` style (operator nodes outlined in red) on the slide that introduces it and is
        % redrawn with the `standard` style on every later slide.
        % NOTE(review): `opnode`, `emph`, `standard` and the aux.. coordinates are defined outside this scope.
        \begin{scope}
            % Fully transparent operator-shaped nodes; they only serve as arrow end points
            \node[opnode,circle,opacity=0] (r53) at (aux53) {};
            \node[opnode,circle,opacity=0] (z56) at (aux56) {};
            \node[opnode,circle,opacity=0] (z75) at (aux75) {};
            \node[opnode,circle,opacity=0] (z76) at (aux76) {};
            % reset gate
            % highlighted version, slide 1 only
            \visible<1>{
                \draw[-latex,emph] (aux12) -- (aux32) -| (aux44) |- (r53);
                \draw[emph] (aux71) -| (aux32) -| (aux44);
                \node[opnode,circle,draw=red,thick] () at (aux44) {$\sigma$};
            }
            % plain version, slide 2 onwards
            \visible<2->{
                \draw[-latex,standard] (aux12) -- (aux32) -| (aux44) |- (r53);
                \draw[standard] (aux71) -| (aux32) -| (aux44);
                \node[opnode,circle] () at (aux44) {$\sigma$};
            }
            % update gate
            % highlighted version, slide 2 only
            \visible<2>{
                \draw[-latex,emph] (aux12) -- (aux32) -| (aux45) -- (z75);
                \draw[-latex,emph] (aux71) -| (aux32) -| (aux45) |- (z56);
                \node[opnode,circle,draw=red,thick] () at (aux45) {$\sigma$};
                \node[opnode,rectangle,rounded corners=2pt,inner sep=2pt,font=\tiny,draw=red,thick] () at (aux65) {$1-$};
            }
            % plain version, slide 3 onwards
            \visible<3->{
                \draw[-latex,standard] (aux12) -- (aux32) -| (aux45) -- (z75);
                \draw[-latex,standard] (aux71) -| (aux32) -| (aux45) |- (z56);
                \node[opnode,circle] () at (aux45) {$\sigma$};
                \node[opnode,rectangle,rounded corners=2pt,inner sep=2pt,font=\tiny] () at (aux65) {$1-$};
            }
            % hidden update
            % highlighted version, slide 3 only
            \visible<3>{
                \draw[-latex,emph] (aux71) -- (aux78);
                \draw[-latex,emph] (aux71) -| (aux87);
                \draw[-latex,emph] (aux71) -| (aux53) -- (aux23) -| (aux46) -- (z76);
                \draw[emph] (aux12) |- (aux23) -| (aux46);
                % this node is named (tanh) so that code outside the scope can refer to it
                \node[opnode,rectangle,rounded corners=2pt,inner sep=2pt,draw=red,thick] (tanh) at (aux46) {$\mathrm{tanh}$};
                \node[opnode,circle,draw=red,thick] () at (aux53) {X};
                \node[opnode,circle,draw=red,thick] () at (aux56) {X};
                \node[opnode,circle,draw=red,thick] () at (aux75) {X};
                \node[opnode,circle,draw=red,thick] () at (aux76) {\textbf{+}};
            }
            % plain version, slide 4 onwards
            \visible<4->{
                \draw[-latex,standard] (aux71) -- (aux78);
                \draw[-latex,standard] (aux71) -| (aux87);
                \draw[-latex,standard] (aux71) -| (aux53) -- (aux23) -| (aux46) -- (z76);
                \draw[standard] (aux12) |- (aux23) -| (aux46);
                \node[opnode,rectangle,rounded corners=2pt,inner sep=2pt] (tanh) at (aux46) {$\mathrm{tanh}$};
                \node[opnode,circle] () at (aux53) {X};
                \node[opnode,circle] () at (aux56) {X};
                \node[opnode,circle] () at (aux75) {X};
                \node[opnode,circle] () at (aux76) {\textbf{+}};
            }
        \end{scope}

        % Input and output labels of the GRU cell
        \begin{scope}
            \node[wordnode,anchor=south] () at (aux71) {$\mathbf{h}_{t-1}$};
            \node[wordnode,anchor=west] () at (aux12) {$\mathbf{x}_t$};
            % The output labels are shown from slide 3 onwards
            \visible<3->{
                \node[wordnode,anchor=east] () at (aux87) {$\mathbf{h}_{t}$};
                \node[wordnode,anchor=south] () at (aux78) {$\mathbf{h}_{t}$};
            }
        \end{scope}

        % Rounded, filled box on the background layer, fitted around the listed cell coordinates/nodes
        \begin{pgfonlayer}{background}
            \node[draw,very thick,rectangle,fill=blue!30!white,rounded corners=5pt,inner sep=6pt,fit=(aux22) (aux76) (z76) (tanh)] (GRU) {};
        \end{pgfonlayer}

        % Formula boxes to the right of the cell, stacked top to bottom and revealed one per overlay step
        \begin{scope}
            \visible<1->{
                % reset gate formula
                \node[formulanode,anchor=west,text width=4cm] (reset) at ([shift={(\base,0.7\base)}]aux78) {重置门\\$\mathbf{r}_t=\sigma(\mathbf{W}_r[\mathbf{h}_{t-1},\mathbf{x}_t])$};
            }
            \visible<2->{
                % update gate formula
                \node[formulanode,anchor=north west,text width=4cm] (update) at ([yshift=-0.5\base]reset.south west) {更新门\\$\mathbf{u}_t=\sigma(\mathbf{W}_u[\mathbf{h}_{t-1},\mathbf{x}_t])$};
            }
            \visible<3->{
                % hidden update formula
                \node[formulanode,anchor=north west,text width=4cm] () at ([yshift=-0.5\base]update.south west) {隐藏状态更新\\$\hat{\mathbf{h}}_t=\tanh(\mathbf{W}[\mathbf{r}_t\cdot \mathbf{h}_{t-1},\mathbf{x}_t])$\\$\mathbf{h}_{t}=(1-\mathbf{u}_t)\cdot \mathbf{h}_{t-1}+\mathbf{u}_t\cdot \hat{\mathbf{h}}_t$};
            }
        \end{scope}
    \end{tikzpicture}
\end{center}
% Footnote below the figure explaining the two inputs of the cell
{\footnotesize\begin{tabular}{l}
    *$\mathbf{x}_t$: 上一层的输出\\
    *$\mathbf{h}_{t-1}$: 同一层上一时刻的隐藏状态
\end{tabular}}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Variant: bidirectional encoder
\begin{frame}{改进 - 双向模型}
\begin{itemize}
\item 自左向右的模型只考虑了左侧的上下文,因此可以用自右向左的模型对右侧上下文建模
	\begin{itemize}
	% Both encoder directions feed the decoder (see the two arrows into dec1 below)
	\item 最终将两个模型融合同时送给解码端
	\end{itemize}
\end{itemize}

\vspace{-0.5em}
\begin{center}
    \begin{tikzpicture}
        \setlength{\base}{0.9cm}

        \tikzset{
            rnnnode/.style={rounded corners=1pt,minimum size=0.5\base,draw,inner sep=0pt,outer sep=0pt},
            wordnode/.style={font=\tiny},
        }

        % RNN translation model
        \begin{scope}[local bounding box=RNNMT]
            % Encoder: embeddings (green), backward RNN (lower blue row), forward RNN (upper blue row)
            \coordinate (eemb0) at (0,0);
            \foreach \x [count=\y from 0] in {1,2,...,10}
            {
                \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {};
                \node[rnnnode,fill=blue!30!white,anchor=south] (backenc\x) at ([yshift=0.5\base]eemb\x.north) {};
                \node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.5\base]backenc\x.north) {};
            }
            \node[wordnode,left=0.4\base of enc1] (init) {$0$};
            \node[wordnode,right=0.4\base of backenc10] (backinit) {$0$};
            % Last state of each direction, repainted in purple
            \node [rnnnode,fill=purple!30!white] at (enc10) {};
            \node [rnnnode,fill=purple!30!white] at (backenc1) {};

            % Source words (one per embedding)
            \node[wordnode,below=0pt of eemb1] () {你};
            \node[wordnode,below=0pt of eemb2] () {知道};
            \node[wordnode,below=0pt of eemb3] () {去};
            \node[wordnode,below=0pt of eemb4] () {北京站};
            \node[wordnode,below=0pt of eemb5] () {的};
            \node[wordnode,below=0pt of eemb6] () {路};
            \node[wordnode,below=0pt of eemb7] () {怎么};
            \node[wordnode,below=0pt of eemb8] () {走};
            \node[wordnode,below=0pt of eemb9] () {吗};
            \node[wordnode,below=0pt of eemb10] () {EOS};

            % RNN Decoder: embeddings (green), RNN (blue), softmax (red)
            \foreach \x in {1,2,...,10}
            {
                \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=1.5\base]enc\x.north) {};
                \node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.5\base]demb\x.north) {};
                \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.5\base]dec\x.north) {};
            }

            % Decoder input words: the first word fixes the baseline, the others are
            % centred under their embedding and aligned to that baseline
            \node[wordnode,below=0pt of demb1] (decwordin) {EOS};
            \foreach \word [count=\pos from 2] in {Do,you,know,the,way,to,Beijing,Railway,Station}
                \node[wordnode,anchor=base] () at (demb\pos.south |- decwordin.base) {\word};

            % Decoder output words, aligned in the same way above the softmax row
            \node[wordnode,above=0pt of softmax1] (decwordout) {Do};
            \foreach \word [count=\pos from 2] in {you,know,the,way,to,Beijing,Railway,Station,EOS}
                \node[wordnode,anchor=base] () at (softmax\pos.north |- decwordout.base) {\word};

            % Connections
            \draw[-latex'] (init.east) to (enc1.west);
            \draw[-latex'] (backinit.west) to (backenc10.east);
            \foreach \x in {1,2,...,10}
            {
                \draw[-latex'] (eemb\x) to (backenc\x);
                % bent arrow from the embedding to the forward RNN, around the backward cell
                \draw[-latex'] (eemb\x.north) to [out=15,in=-15] (enc\x.south);
                \draw[-latex'] (demb\x) to (dec\x);
                \draw[-latex'] (dec\x.north) to ([yshift=0.5\base]dec\x.north);
            }
            \foreach \x [count=\y from 2] in {1,2,...,9}
            {
                \draw[-latex'] (enc\x.east) to (enc\y.west);
                \draw[-latex'] (dec\x.east) to (dec\y.west);
                % the backward RNN runs right to left
                \draw[-latex'] (backenc\y.west) to (backenc\x.east);
            }
            % Both final encoder states are passed to the first decoder cell
            \coordinate (bridge) at ([yshift=-1.2\base]demb2);
            \draw[-latex'] (enc10.north) .. controls +(north:0.7\base) and +(east:1.5\base) .. (bridge) .. controls +(west:2.5\base) and +(west:0.6\base) .. (dec1.west);
            \draw[-latex'] (backenc1) to [out=180,in=180] (dec1.west);

            % Backward RNN: dashed red frame plus label
            \begin{pgfonlayer}{background}
                \node[draw=red,thick,densely dashed,inner sep=5pt] [fit = (backinit) (backenc1) (backenc10)] (backrnn) {};
            \end{pgfonlayer}
            \node[font=\scriptsize,anchor=south] (backrnnlabel) at ([xshift=-0.5\base,yshift=\base]backrnn.north east) {反向RNN};
            \draw[->,dashed] (backrnnlabel.south) to ([xshift=-0.5\base]backrnn.north east);
        \end{scope}
    \end{tikzpicture}
\end{center}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Variant: stacked (multi-layer) RNN
\begin{frame}{改进 - 多层网络}

\begin{itemize}
\item 堆叠更多层的网络,可以提升模型的表示能力
    \begin{itemize}
    \item 常见的NMT系统有2-8层
    \end{itemize}
\end{itemize}

\vspace*{-0.6cm}
\begin{center}
    \hspace*{-2.5cm}
    \begin{tikzpicture}
        \setlength{\base}{0.9cm}

        \tikzset{
            rnnnode/.style={rounded corners=1pt,minimum size=0.5\base,draw,inner sep=0pt,outer sep=0pt},
            wordnode/.style={font=\tiny},
        }

        % RNN translation model
        \begin{scope}[local bounding box=RNNMT]
            % Encoder: embeddings (green) and two RNN layers (blue); node names are
            % enc<layer><position>, e.g. enc210 is layer 2, position 10
            \coordinate (eemb0) at (0,0);
            \foreach \x [count=\y from 0] in {1,2,...,10}
            {
                \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {};
                \node[rnnnode,fill=blue!30!white,anchor=south] (enc1\x) at ([yshift=0.3\base]eemb\x.north) {};
                \node[rnnnode,fill=blue!30!white,anchor=south] (enc2\x) at ([yshift=0.5\base]enc1\x.north) {};
            }
            \node[wordnode,left=0.4\base of enc11] (init1) {$0$};
            \node[wordnode,left=0.4\base of enc21] (init2) {$0$};

            % Source words (one per embedding)
            \node[wordnode,below=0pt of eemb1] () {你};
            \node[wordnode,below=0pt of eemb2] () {知道};
            \node[wordnode,below=0pt of eemb3] () {去};
            \node[wordnode,below=0pt of eemb4] () {北京站};
            \node[wordnode,below=0pt of eemb5] () {的};
            \node[wordnode,below=0pt of eemb6] () {路};
            \node[wordnode,below=0pt of eemb7] () {怎么};
            \node[wordnode,below=0pt of eemb8] () {走};
            \node[wordnode,below=0pt of eemb9] () {吗};
            \node[wordnode,below=0pt of eemb10] () {EOS};

            % RNN Decoder: embeddings (green), two RNN layers (blue), softmax (red)
            \foreach \x in {1,2,...,10}
            {
                \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=1.5\base]enc2\x.north) {};
                \node[rnnnode,fill=blue!30!white,anchor=south] (dec1\x) at ([yshift=0.3\base]demb\x.north) {};
                \node[rnnnode,fill=blue!30!white,anchor=south] (dec2\x) at ([yshift=0.5\base]dec1\x.north) {};
                \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.5\base]dec2\x.north) {};
            }

            % Decoder input words: the first word fixes the baseline, the others are
            % centred under their embedding and aligned to that baseline
            \node[wordnode,below=0pt of demb1] (decwordin) {EOS};
            \foreach \word [count=\pos from 2] in {Do,you,know,the,way,to,Beijing,Railway,Station}
                \node[wordnode,anchor=base] () at (demb\pos.south |- decwordin.base) {\word};

            % Decoder output words, aligned in the same way above the softmax row
            \node[wordnode,above=0pt of softmax1] (decwordout) {Do};
            \foreach \word [count=\pos from 2] in {you,know,the,way,to,Beijing,Railway,Station,EOS}
                \node[wordnode,anchor=base] () at (softmax\pos.north |- decwordout.base) {\word};

            % Connections
            \draw[-latex'] (init1.east) to (enc11.west);
            \draw[-latex'] (init2.east) to (enc21.west);
            % vertical arrows within each position
            \foreach \x in {1,2,...,10}
            {
                \draw[-latex'] (eemb\x) to (enc1\x);
                \draw[-latex'] (enc1\x) to (enc2\x);
                \draw[-latex'] (demb\x) to (dec1\x);
                \draw[-latex'] (dec1\x) to (dec2\x);
                \draw[-latex'] (dec2\x.north) to ([yshift=0.5\base]dec2\x.north);
            }
            % horizontal (recurrent) arrows within each layer
            \foreach \x [count=\y from 2] in {1,2,...,9}
            {
                \draw[-latex'] (enc1\x.east) to (enc1\y.west);
                \draw[-latex'] (enc2\x.east) to (enc2\y.west);
                \draw[-latex'] (dec1\x.east) to (dec1\y.west);
                \draw[-latex'] (dec2\x.east) to (dec2\y.west);
            }

            % Final state of encoder layer 2 -> first cell of decoder layer 2
            \coordinate (bridge) at ([yshift=1.4\base]enc16.north west);
            \draw[-latex'] (enc210.north) .. controls +(north:0.4\base) and +(east:1.5\base) .. (bridge) .. controls +(west:9.5\base) and +(west:0.6\base) .. (dec21.west);

            % Final state of encoder layer 1 -> first cell of decoder layer 1
            % ((bridge) is deliberately redefined a little higher for this curve)
            \coordinate (bridge) at ([yshift=1.6\base]enc16.north west);
            \draw[-latex'] (enc110.east) .. controls +(east:0.5\base) and +(east:8\base) .. (bridge) .. controls +(west:8\base) and +(west:0.3\base) .. (dec11.west);

            % stack RNN: dashed red frames around the second layers plus a shared label
            \begin{pgfonlayer}{background}
                \node[draw=red,thick,densely dashed,inner sep=5pt] [fit = (init2) (enc21) (enc210)] (enc2) {};
                \node[draw=red,thick,densely dashed,inner sep=5pt] [fit = (dec21) (dec210)] (dec2) {};
            \end{pgfonlayer}
            \node[font=\scriptsize,anchor=west] (label) at ([xshift=0.2\base]demb10.east) {堆叠RNN};
            \draw[->,dashed] (label.north) to (dec2.east);
            \draw[->,dashed] (label.south) to (enc2.east);
        \end{scope}
    \end{tikzpicture}
\end{center}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{注意力机制}

%%%------------------------------------------------------------------------------------------------------------
%%% Problems of the plain encoder-decoder framework
\begin{frame}{简单的编码器-解码器就足够了?}
\begin{itemize}
\item 将源语言句子编码为一个实数向量确实很神奇,但是也有明显问题
    \begin{itemize}
    \item 整个句子编码到一个向量里可能会有信息丢失
    \item 缺少源语单词与目标语单词之间的对应。某种意义上讲,一个目标语单词的生成无法区分不同源语单词的贡献
    \end{itemize}
\item<2-> 但是,翻译是具有很强的\alert{局部性}的,有些词之间会有更紧密的关系
    \begin{itemize}
    \item 源语词和目标语词的对应并不是均匀的,甚至非常稀疏
    \item 比如,一些短语的生成仅依赖于源文中的少数词
    \item<3-> 这些关系可以在表示模型中考虑
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\setlength{\mystep}{1.6em}

% Source sentence: empty anchor nodes on a regular grid, words placed on top of them
\foreach \x in {1,2,...,6}
    \node[] (s\x) at (\x * \mystep,0) {};

\node [] (ws1) at (s1) {\scriptsize{这}};
\node [] (ws2) at (s2) {\scriptsize{是}};
\node [] (ws3) at (s3) {\scriptsize{个}};
\node [] (ws4) at (s4) {\scriptsize{很长}};
\node [] (ws5) at (s5) {\scriptsize{的}};
\node [] (ws6) at (s6) {\scriptsize{句子}};

% Target sentence, built the same way 2.4in further right
\foreach \x in {1,2,...,6}
    \node[] (t\x) at (\x * \mystep + 2.4in,0) {};

\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
\node [] (wt3) at ([yshift=-1pt]t3) {\scriptsize{a}};
\node [] (wt4) at ([yshift=-0.1em]t4) {\scriptsize{very}};
\node [] (wt5) at (t5) {\scriptsize{long}};
\node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};

% Encoder -> representation -> Decoder
\node [anchor=south west,fill=red!30,minimum width=1.6in,minimum height=1.5em] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
\node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
\node [anchor=west,fill=green!30,minimum height=1.5em] (representation) at ([xshift=1em]encoder.east) {\footnotesize{表示}};
\draw [->,thick] ([xshift=1pt]encoder.east)--([xshift=-1pt]representation.west);
\draw [->,thick] ([xshift=1pt]representation.east)--([xshift=-1pt]decoder.west);

% Arrows from the source words up into the encoder ...
\foreach \x in {1,2,...,6}
    \draw[->] ([yshift=0.1em]s\x.north) -- ([yshift=1.2em]s\x.north);

% ... and from the decoder down to the target words
\foreach \x in {1,2,...,5}
    \draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);

% the last target word is shifted right by 1em, so its arrow is shifted as well
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);

% Slide 2 onwards: correspondences between the source word ws4 and the target words wt4 / wt5
\visible<2->{
\draw [<->,ublue,thick] ([xshift=0.3em]ws4.south) .. controls +(-60:1) and +(south:1) .. (wt4.south);
\draw [<->,ublue,thick] (ws4.south) .. controls +(south:1.0) and +(south:1.5) .. (wt5.south);
}

% Slide 3 onwards: label pointing at the representation box
\visible<3->{
\node [anchor=north,fill=green!30,draw=ublue] (attentionlabel) at ([yshift=-3.4em]representation.south) {\footnotesize{词语的关注度}};
\draw [->,dotted,very thick,ublue] ([yshift=0.1em]attentionlabel.north)--([yshift=-0.1em]representation.south);
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Attention mechanism: motivation
\begin{frame}{注意力机制}
\begin{itemize}
\item 关注的``局部性''在图像处理、语音识别等领域也有广泛讨论,比如,对于下图
    \begin{itemize}
    \item 关注的顺序:大狗的帽子 $\to$ 大狗 $\to$ 小狗的帽子 $\to$ 小狗
    \end{itemize}
\item 人往往不是``均匀地''看图像中的所有区域,翻译是一个道理,生成一个目标语单词时参考的源语单词不会太多
\end{itemize}

\begin{center}
\includegraphics[scale=0.20]{./Figures/dog-hat.jpg}
\end{center}

% Reference shown from slide 2 onwards
\begin{itemize}
\item<2-> \alert{注意力机制}在机器翻译中已经成功应用,经典的论文\\
\textbf{Neural Machine Translation by Jointly Learning to Align and Translate}\\
\textbf{Bahdanau et al., 2015, In Proc of ICLR}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% A simple illustration of the attention mechanism
\begin{frame}{神经机器翻译的注意力机制}
\begin{itemize}
\item 在注意力机制中,每个目标语单词的生成会使用一个动态的源语表示,而非一个统一的固定表示
    \begin{itemize}
    \item 这里$\textbf{C}_i$表示第$i$个目标语单词所使用的源语表示
    \end{itemize}
\end{itemize}

\vspace{0.4em}
\begin{center}
\begin{tikzpicture}

\setlength{\mystep}{1.6em}

% Skeleton shared by both sub-figures: source and target words on empty anchor
% nodes, the Encoder and Decoder boxes, and the short arrows between the words
% and the boxes. The macro is local to this tikzpicture.
\newcommand*{\AttnDemoSkeleton}{%
    \foreach \x in {1,2,...,6}
        \node[] (s\x) at (\x * \mystep,0) {};
    \node [] (ws1) at (s1) {\scriptsize{这}};
    \node [] (ws2) at (s2) {\scriptsize{是}};
    \node [] (ws3) at (s3) {\scriptsize{个}};
    \node [] (ws4) at (s4) {\scriptsize{很长}};
    \node [] (ws5) at (s5) {\scriptsize{的}};
    \node [] (ws6) at (s6) {\scriptsize{句子}};
    \foreach \x in {1,2,...,6}
        \node[] (t\x) at (\x * \mystep + 2.4in,0) {};
    \node [] (wt1) at (t1) {\scriptsize{This}};
    \node [] (wt2) at (t2) {\scriptsize{is}};
    \node [] (wt3) at ([yshift=-1pt]t3) {\scriptsize{a}};
    \node [] (wt4) at ([yshift=-0.1em]t4) {\scriptsize{very}};
    \node [] (wt5) at (t5) {\scriptsize{long}};
    \node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};
    \node [anchor=south west,fill=red!30,minimum width=1.6in,minimum height=1.5em] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
    \node [anchor=west,fill=blue!30,minimum width=1.9in,minimum height=1.5em] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
    \foreach \x in {1,2,...,6}
        \draw[->] ([yshift=0.1em]s\x.north) -- ([yshift=1.2em]s\x.north);
    \foreach \x in {1,2,...,5}
        \draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);
    % the last target word is shifted right by 1em, so its arrow is shifted as well
    \draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
}

%%% a simple encoder-decoder model
\begin{scope}
\AttnDemoSkeleton

% a single fixed representation sits between encoder and decoder
\node [anchor=west,fill=green!30,minimum height=1.5em] (representation) at ([xshift=1em]encoder.east) {\footnotesize{表示}};
\draw [->,thick] ([xshift=1pt]encoder.east)--([xshift=-1pt]representation.west);
\draw [->,thick] ([xshift=1pt]representation.east)--([xshift=-1pt]decoder.west);

\node [anchor=north] (cap) at ([xshift=2em,yshift=-2.5em]encoder.south east) {\scriptsize{(a) 简单的编码器-解码器框架}};

\end{scope}

%%% an encoder-decoder model with attention
\begin{scope}[yshift=-1.7in]
\AttnDemoSkeleton

% one representation per target position, each drawn as a labelled path from
% above the encoder to above the decoder
\draw [->] ([yshift=3em]s6.north) -- ([yshift=4em]s6.north) -- ([yshift=4em]t1.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c1) {\scriptsize{表示$\textbf{C}_1$}} -- ([yshift=3em]t1.north) ;
\draw [->] ([yshift=3em]s5.north) -- ([yshift=5.3em]s5.north) -- ([yshift=5.3em]t2.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c2) {\scriptsize{表示$\textbf{C}_2$}} -- ([yshift=3em]t2.north) ;
\draw [->] ([yshift=3.5em]s3.north) -- ([yshift=6.6em]s3.north) -- ([yshift=6.6em]t4.north) node [pos=0.5,fill=green!30,inner sep=2pt] (c3) {\scriptsize{表示$\textbf{C}_i$}} -- ([yshift=3.5em]t4.north) ;
% ellipses at both ends of the last path
\node [anchor=north] (smore) at ([yshift=3.5em]s3.north) {...};
\node [anchor=north] (tmore) at ([yshift=3.5em]t4.north) {...};

\node [anchor=north] (cap) at ([xshift=2em,yshift=-2.5em]encoder.south east) {\scriptsize{(b) 引入注意力机制的编码器-解码器框架}};

\end{scope}

\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
2288
%%% C_i的定义
xiaotong committed
2289
\begin{frame}{上下文向量$\textbf{C}_i$}
xiaotong committed
2290
\begin{itemize}
xiaotong committed
2291
\item 对于目标语位置$i$$\textbf{C}_i$是目标语$i$使用的上下文向量
xiaotong committed
2292
	\begin{itemize}
xiaotong committed
2293 2294
	\item $\textbf{h}_j$表示编码器第$j$个位置的隐层状态
	\item $\textbf{s}_i$表示解码器第$i$个位置的隐层状态
xiaotong committed
2295
	\item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小
xiaotong committed
2296 2297
	\item<2-> $a(\cdot)$表示注意力函数,计算$\textbf{s}_{i-1}$$\textbf{h}_j$之间的相关性
	\item<3-> $\textbf{C}_i$是所有源语编码表示$\{\textbf{h}_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$
xiaotong committed
2298 2299 2300 2301 2302 2303 2304 2305
	\end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}

\begin{scope}

xiaotong committed
2306 2307
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\textbf{h}_2$}};
xiaotong committed
2308
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
xiaotong committed
2309
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_n$}};
xiaotong committed
2310 2311 2312 2313 2314

\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east);

xiaotong committed
2315 2316
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{i-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{i}$}};
xiaotong committed
2317 2318 2319 2320 2321 2322 2323 2324

\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) ..  (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) ..  (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}};
\draw [->] (h4.north) .. controls +(north:0.8) and +(east:1) ..  (sum.-10) node [pos=0.1,left] (alphan) {\tiny{$\alpha_{i,n}$}};

\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
xiaotong committed
2325
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) ..  ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{i}$}};
xiaotong committed
2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336

\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}};
\draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
\node [anchor=north] (enc1) at (h1.south) {\tiny{编码器输出}};
\node [anchor=north] (enc12) at ([yshift=0.5em]enc1.south) {\tiny{(位置$1$)}};
\node [anchor=north] (enc2) at (h2.south) {\tiny{编码器输出}};
\node [anchor=north] (enc22) at ([yshift=0.5em]enc2.south) {\tiny{(位置$2$)}};
\node [anchor=north] (enc4) at (h4.south) {\tiny{编码器输出}};
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$n$)}};

\visible<2->{
xiaotong committed
2337
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_i = \sum_{j} \alpha_{i,j} \textbf{h}_j \ \ $};
xiaotong committed
2338 2339
}
\visible<3->{
2340
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
xiaotong committed
2341
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{i-1}, \textbf{h}_j)$};
xiaotong committed
2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367
}

\begin{pgfonlayer}{background}
\visible<2->{
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=blue!10,drop shadow] [fit = (math1)] (box1) {};
}
\visible<3->{
\node [rectangle,inner sep=0.4em,rounded corners=1pt,fill=orange!10,drop shadow] [fit = (math2) (math3)] (box2) {};
}
\end{pgfonlayer}

\visible<2->{
\draw [->,dotted,thick,blue] (box1.west) .. controls +(west:1.2) and +(east:2.0) .. ([xshift=-0.3em]ci.east);
}
\visible<3->{
\draw [->,dotted,thick,orange] ([yshift=1em]box2.west) .. controls +(west:1.2) and +(east:1.0) .. ([xshift=-0.35em]alphan.east);
}

\end{scope}

\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446
%%% 注意力权重的可视化
\begin{frame}{注意力权重$\alpha_{ij}$}

\begin{itemize}
\item 注意力权重$\alpha_{ij}$的可视化
\end{itemize}

\vspace{-1.5em}

\begin{center}
\begin{tikzpicture}

\setlength{\wseg}{1.5cm}
\setlength{\hseg}{1.0cm}
\setlength{\wnode}{3.75cm}
\setlength{\hnode}{1.0cm}

\tikzstyle{elementnode} = [rectangle,text=white,anchor=center]
\tikzstyle{srcnode} = [rotate=45,font=\small,anchor=south west]
\tikzstyle{tgtnode} = [left,font=\small,anchor=north east]
\tikzstyle{alignmentnode} = [rectangle,draw,minimum height=3.6\hnode,minimum width=0.36\hnode]
\tikzstyle{probnode} = [fill=blue!30,minimum width=0.4\hnode]
\tikzstyle{labelnode} = [above]

% alignment matrix
\begin{scope}[scale=0.9,yshift=0.12in]
\foreach \i / \j / \c in
    {0/7/0.2, 1/7/0.45, 2/7/0.15, 3/7/0.15, 4/7/0.15, 5/7/0.15,
    0/6/0.35, 1/6/0.45, 2/6/0.15, 3/6/0.15, 4/6/0.15, 5/6/0.15,
    0/5/0.25, 1/5/0.15, 2/5/0.15, 3/5/0.35, 4/5/0.15, 5/5/0.15,
    0/4/0.15, 1/4/0.25, 2/4/0.2, 3/4/0.30, 4/4/0.15, 5/4/0.15,
    0/3/0.15, 1/3/0.15, 2/3/0.8, 3/3/0.25, 4/3/0.15, 5/3/0.25,
    0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.25, 5/2/0.3,
    0/1/0.15, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.8, 5/1/0.15,
    0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.60}
    \node[elementnode,minimum size=0.6*\hnode*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*\hnode*\i-5.4*0.5*\hnode,0.5*\hnode*\j-1.05*\hnode) {};

% attention score labels: one white number per highlighted cell.
% Each label node is named l<col><row> after the matrix cell a<col><row> it is placed on,
% so every name is unique and can be referenced later without ambiguity.
\node[align=center] (l17) at (a17) {\scriptsize{{\color{white} .4}}};
\node[align=center] (l06) at (a06) {\scriptsize{{\color{white} .3}}};
\node[align=center] (l16) at (a16) {\scriptsize{{\color{white} .4}}};
\node[align=center] (l35) at (a35) {\scriptsize{{\color{white} .3}}};
\node[align=center] (l34) at (a34) {\tiny{{\color{white} .3}}};
\node[align=center] (l23) at (a23) {\small{{\color{white} .8}}};
\node[align=center] (l41) at (a41) {\small{{\color{white} .8}}};
\node[align=center] (l50) at (a50) {\small{{\color{white} .7}}};

% source
\node[srcnode] (src1) at (-5.4*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{Have}};
\node[srcnode] (src2) at ([xshift=0.5\hnode]src1.south west) {\scriptsize{you}};
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};

% target: one label per matrix row, top (row 7) to bottom (row 0).
% NOTE(review): the one-character labels were empty in this copy of the file. They are
% restored from the example sentence used in the formulas on this slide and from the
% alignment weights (row 3 aligns with "learned"). Please confirm against the original deck.
\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{你}};
\node[tgtnode] (tgt2) at ([yshift=-0.5\hnode]tgt1.north east) {\scriptsize{什么}};
\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{都}};
\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{没}};
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{学}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{到}};
\node[tgtnode] (tgt7) at ([yshift=-0.5\hnode]tgt6.north east) {\scriptsize{?}};
\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{EOS}};

\end{scope}

\visible<2->{
% alignment rectangle 1 (green): frame anchored at the top-left corner of cell a07, shown from overlay 2 on
\node[alignmentnode, ugreen, anchor=north west] (alignment1) at ([xshift=-0.3em,yshift=0.4em]a07.north west) {};
}

\visible<3->{
% alignment rectangle 2 (red): frame anchored at the top-left corner of cell a17, shown from overlay 3 on
\node[alignmentnode, red, anchor=north west] (alignment2) at ([xshift=-0.1em,yshift=0.2em]a17.north west) {};
}

\visible<3->{
% alignment bars 2
xiaotong committed
2447
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn21) at ([xshift=2.3\hnode,yshift=0.5\hnode]alignment2.east) {};
2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466
\node[probnode,anchor=south west,minimum height=0.4\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.4$}] (attn22) at ([xshift=1pt]attn21.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn23) at ([xshift=1pt]attn22.south east) {};
\node[probnode,anchor=south west,minimum height=0.1\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0.1$}] (attn24) at ([xshift=1pt]attn23.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$0$}] (attn25) at ([xshift=1pt]attn24.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=red!40,label=below:\scriptsize{$...$}] (attn26) at ([xshift=1pt]attn25.south east) {};
}

\visible<2->{
% alignment bars 1
\node[probnode,anchor=south,minimum height=0.2\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.2$}] (attn11) at ([xshift=2.5\hnode,yshift=-1em]alignment2.north east) {};
\node[probnode,anchor=south west,minimum height=0.3\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.3$}] (attn12) at ([xshift=1pt]attn11.south east) {};
\node[probnode,anchor=south west,minimum height=0.2\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0.2$}] (attn13) at ([xshift=1pt]attn12.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0$}] (attn14) at ([xshift=1pt]attn13.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$0$}] (attn15) at ([xshift=1pt]attn14.south east) {};
\node[probnode,anchor=south west,minimum height=0.05\hnode,inner sep=0.1pt,fill=ugreen!40,label=below:\scriptsize{$...$}] (attn16) at ([xshift=1pt]attn15.south east) {};
}

\visible<3->{
% coverage score formula node
xiaotong committed
2467 2468
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
姜雨帆 committed
2469
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + \cdots$}};
2470 2471 2472 2473
}

\visible<3->{
% matrix -> attn2
xiaotong committed
2474
\draw[->,red] ([xshift=0.1em,yshift=2.3em]alignment2.east).. controls +(east:1.9cm) and +(west:1.0cm) ..([xshift=-0.15\hnode,yshift=-1em]attn21.north west);
2475 2476 2477 2478 2479 2480 2481 2482
}

\visible<2->{
\draw[->,ugreen] ([xshift=0.1em,yshift=-1.2em]alignment1.north east)--([xshift=2.2\hnode,yshift=-1.2em]alignment2.north east);
}

\visible<3->{
% attn2 -> cov2
xiaotong committed
2483
\draw[->] ([xshift=0.2\hnode,yshift=0.0\hnode]attn26.east)--([xshift=0.7\hnode,yshift=0]attn26.east) node[pos=0.5,above] (sum2) {\small{$\sum$}}; % 0.3 - 0.5 height of the
2484 2485 2486 2487 2488 2489 2490 2491 2492
}

\visible<2->{
% attn1 -> cov1
\draw[->] ([xshift=0.2\hnode]attn16.east)--([xshift=0.7\hnode]attn16.east) node[pos=0.5,above] (sum1) {\small{$\sum$}};
}

% context vector formulas, placed to the right of the two bar charts (after attn16 / attn26).
% The sum runs over the 8 encoder positions j; the first index of alpha is the position of
% the context vector being computed, matching C_i = \sum_j \alpha_{i,j} h_j.
\visible<2->{
\node[anchor=west] (sc1) at ([xshift=0.9\hnode]attn16.east) {$\textbf{C}_1 = \sum_{j=1}^{8} \alpha_{1j} \textbf{h}_{j}$};
}

\visible<3->{
\node[anchor=west] (sc2) at ([xshift=0.9\hnode,yshift=0.0\hnode]attn26.east) {$\textbf{C}_2 = \sum_{j=1}^{8} \alpha_{2j} \textbf{h}_{j}$};
}

\end{tikzpicture}
\end{center}

\visible<4->{
\begin{itemize}
\item 对比
\end{itemize}

xiaotong committed
2508
\vspace{-0.5em}
2509
\begin{center}
xiaotong committed
2510
{\footnotesize
2511 2512
\begin{tabular}{l | l}
引入注意力机制以前 & 引入注意力机制以后 \\ \hline
xiaotong committed
2513 2514
$\textrm{``Have''} = \argmax_{y_1} \textrm{P}(y_1|\alert{\textbf{C}},y_0)$ & $\textrm{``Have''} = \argmax_{y_1} \textrm{P}(y_1|0,\alert{\textbf{C}_1}, y_0)$ \\
$\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, \alert{\textbf{C}_2},y_1)$
2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528
\end{tabular}
}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何定义注意力函数
\begin{frame}{计算注意力权重 - 注意力函数}
\begin{itemize}
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em}
\begin{displaymath}
xiaotong committed
2529
\alpha_{i,j} = \frac{\exp(a(\textbf{s}_{i-1}, \textbf{h}_j))}{\sum_{j'} \exp(a(\textbf{s}_{i-1}, \textbf{h}_{j'}))}
2530 2531
\end{displaymath}

xiaotong committed
2532
\item<2-> 注意力函数$a(\textbf{s},\textbf{h})$的目的是捕捉$\textbf{s}$和$\textbf{h}$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(\textbf{s},\textbf{h})$的方式:}
2533 2534 2535 2536
    \vspace{-1em}

    \visible<3->{
    % Four alternative definitions of the scoring function a(s,h), one per row:
    % left column is the formula, right column its short description.
    \begin{displaymath}
    a(\textbf{s},\textbf{h}) =  \left\{ \begin{array}{ll}
    \textbf{s} \textbf{h}^{\textrm{T}} & \textrm{向量乘} \\
    \cos(\textbf{s}, \textbf{h}) & \textrm{向量夹角} \\
    \textbf{s} \textbf{W} \textbf{h}^{\textrm{T}} & \textrm{线性模型} \\
    \textrm{TanH}(\textbf{W}[\textbf{s},\textbf{h}])\textbf{v}^{\textrm{T}} & \textrm{拼接}[\textbf{s},\textbf{h}]+\textrm{单层网络}
    \end{array}
    \right.
    \end{displaymath}
    \vspace{-0.3em}
    $\textbf{W}$和$\textbf{v}$是可学习参数
    }
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
2552 2553
%%% 注意力模型的效果 - 热图
\begin{frame}{真实的实例}
Lee committed
2554 2555 2556 2557 2558 2559 2560 2561 2562
    \begin{itemize}
    \item 注意力的权重符合双语对应的规律
    \begin{enumerate}
        \item 翻译出``New York Times''的时候``纽约时报''的权重很大
        \item 翻译出``I''的时候``我''的权重很大
        \item 翻译出``came''的时候``来到''的权重很大
        \item 翻译出``world''的时候``世界''的权重很大
    \end{enumerate}
    \item 互译的词通常都会产生较大的注意力权重
xiaotong committed
2563
    \item 注意力的权重一定程度上反映了词语间的对应关系
Lee committed
2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579
    \end{itemize}
    \begin{center}
        \hspace*{\fill}
        \begin{tikzpicture}
            \begin{axis}[
                width=0.4\linewidth,height=0.4\linewidth,
                view={0}{90},
                enlargelimits=false,
                ymin=-0.5,ymax=5.5,
                xmin=-0.5,xmax=2.5,
                ytick={0,1,...,5},
                yticklabels={The,New,York,Times,comments,EOS},
                yticklabel style={font=\scriptsize},
                xtick={0,1,2},
                xticklabels={纽约时报,发表,评论},
                xticklabel style={rotate=45,anchor=south west,font=\scriptsize,inner sep=0pt,outer sep=2pt},
xiaotong committed
2580
                xticklabel pos=upper,
Lee committed
2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600
                colormap={bluewhite}{color=(white) color=(blue)},
            ]
                \addplot3[matrix plot] file [meta=index 2] {Figures/example1.dat};
            \end{axis}
            \draw [red,very thick] (0,0.9) rectangle (0.9,2.3);
        \end{tikzpicture}
        \hfill
        \begin{tikzpicture}
            \begin{axis}[
                width=0.4\linewidth,height=0.4\linewidth,
                view={0}{90},
                enlargelimits=false,
                ymin=-0.5,ymax=5.5,
                xmin=-0.5,xmax=3.5,
                ytick={0,1,...,5},
                yticklabels={I,came,to,this,world,EOS},
                yticklabel style={font=\scriptsize},
                xtick={0,1,2,3},
                xticklabels={我,来到,这个,世界},
                xticklabel style={rotate=45,anchor=south west,font=\scriptsize,inner sep=0pt,outer sep=2pt},
xiaotong committed
2601
                xticklabel pos=upper,
Lee committed
2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623
                colormap={bluewhite}{color=(white) color=(blue)},
                colorbar,
                colorbar style={
                    font=\scriptsize,
                    title style={align=center},
                    title={注意力\\大小},
                    yticklabel style={
                        /pgf/number format/precision=1,
                        /pgf/number format/.cd,
                            fixed,
                            fixed zerofill
                    },
                    at={(1.2,1)},anchor=north west,
               },
            ]
                \addplot3[matrix plot] file [meta=index 2] {Figures/example2.dat};
            \end{axis}
            \draw [red,very thick] (0,2.3) rectangle (0.7,2.72);
            \draw [red,very thick] (0.7,1.8) rectangle (1.35,2.3);
            \draw [red,very thick] (2.05,0.45) rectangle (2.75,0.9);
        \end{tikzpicture}
    \end{center}
xiaotong committed

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何进一步理解注意力机制
\begin{frame}{重新解释注意力机制}
\begin{itemize}
\item 换一个问题,假设有若干key-value单元,其中key是这个单元的索引表示,value是这个单元的值。对于任意一个query,可以找到匹配的key,并输出其对应的value
\end{itemize}

\vspace{-0.8em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]

\node [rnode,anchor=south west,fill=blue!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=blue!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};

\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
% white-backed text labels centred on the patterned key boxes; one uniquely named node per key
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key2label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key3label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key4label) at (key4) {\scriptsize{key$_4$}};

\node [rnode,anchor=east,pattern=horizontal lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};

\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north) node [pos=0.5,below,yshift=0.2em] {\scriptsize{匹配}};

\node [anchor=north] (result) at (value3.south) {\scriptsize{\alert{返回结果}}};

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-0.7em}

\begin{itemize}
\item<2-> 注意力机制也可以被看做对key-value单元的查询,但是所有key和query之间都有一种匹配程度,返回结果是对所有value的加权
\end{itemize}

\visible<2->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

\tikzstyle{rnode} = [draw,minimum width=3em,minimum height=1.2em]

\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{value$_1$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{value$_2$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{value$_3$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{value$_4$}};

\node [rnode,anchor=south west,pattern=north east lines] (key1) at ([yshift=0.2em]value1.north west) {};
\node [rnode,anchor=south west,pattern=dots] (key2) at ([yshift=0.2em]value2.north west) {};
\node [rnode,anchor=south west,pattern=horizontal lines] (key3) at ([yshift=0.2em]value3.north west) {};
\node [rnode,anchor=south west,pattern=crosshatch dots] (key4) at ([yshift=0.2em]value4.north west) {};
% white-backed text labels centred on the patterned key boxes; one uniquely named node per key
\node [fill=white,inner sep=1pt] (key1label) at (key1) {\scriptsize{key$_1$}};
\node [fill=white,inner sep=1pt] (key2label) at (key2) {\scriptsize{key$_2$}};
\node [fill=white,inner sep=1pt] (key3label) at (key3) {\scriptsize{key$_3$}};
\node [fill=white,inner sep=1pt] (key4label) at (key4) {\scriptsize{key$_4$}};

\node [rnode,anchor=east,pattern=vertical lines] (query) at ([xshift=-3em]key1.west) {};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};

\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
\node [anchor=south east] (alpha1) at (key1.north east) {\scriptsize{$\alpha_1$}};
\node [anchor=south east] (alpha2) at (key2.north east) {\scriptsize{$\alpha_2$}};
\node [anchor=south east] (alpha3) at (key3.north east) {\scriptsize{$\alpha_3$}};
\node [anchor=south east] (alpha4) at (key4.north east) {\scriptsize{$\alpha_4$}};

\node [anchor=north] (result) at ([xshift=-1.5em]value2.south east) {\scriptsize{\alert{返回结果}=$\alpha_1 \cdot \textrm{value}_1 + \alpha_2 \cdot \textrm{value}_2 + \alpha_3 \cdot \textrm{value}_3 + \alpha_4 \cdot \textrm{value}_4$}};

\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何进一步理解注意力机制 - 回到机器翻译任务
\begin{frame}{重新解释注意力机制(续)}
\begin{itemize}
\item 回到机器翻译,如果把目标语状态$\textbf{s}_{i-1}$看做query,而把源语言所有位置的最上层RNN表示$\textbf{h}_{j}$看做{\color{ugreen} \textbf{key}}和{\color{red} \textbf{value}}
\end{itemize}

\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]

\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=1em]value1.south east) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=1em]value2.south east) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=1em]value3.south east) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};

\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.2em]value1.north west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.2em]value2.north west) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.2em]value3.north west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.2em]value4.north west) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};

\node [rnode,anchor=east] (query) at ([xshift=-2em]key1.west) {\scriptsize{$\textbf{s}(\textrm{``you''})$}};
\node [anchor=east] (querylabel) at ([xshift=-0.2em]query.west) {\scriptsize{query}};

\draw [->] ([yshift=1pt,xshift=6pt]query.north) .. controls +(90:1em) and +(90:1em) .. ([yshift=1pt]key1.north);
\draw [->] ([yshift=1pt,xshift=3pt]query.north) .. controls +(90:1.5em) and +(90:1.5em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt]query.north) .. controls +(90:2em) and +(90:2em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=-3pt]query.north) .. controls +(90:2.5em) and +(90:2.5em) .. ([yshift=1pt]key4.north);
\node [anchor=south east] (alpha1) at ([xshift=1em]key1.north east) {\scriptsize{$\alpha_1=.4$}};
\node [anchor=south east] (alpha2) at ([xshift=1em]key2.north east) {\scriptsize{$\alpha_2=.4$}};
\node [anchor=south east] (alpha3) at ([xshift=1em]key3.north east) {\scriptsize{$\alpha_3=0$}};
\node [anchor=south east] (alpha4) at ([xshift=1em]key4.north east) {\scriptsize{$\alpha_4=.1$}};

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-2.5em}
% Unnumbered two-line equation for C_3, using amsmath align* instead of the obsolete eqnarray.
% The second line is indented past the "=" with a phantom relation so it lines up with the first term.
\begin{align*}
\textbf{C}_3 &= 0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) + {} \\
             &\mathrel{\phantom{=}} 0 \times \textbf{h}(\textrm{``也''}) + 0.1 \times \textbf{h}(\textrm{``没''})
\end{align*}

\vspace{-0.5em}
\begin{itemize}
\item<2-> 注意力机制也可以被看做是一个重新生成value的过程:对于一组value值,注意力模型对它们加权求和,并得到一个新的value。而这个新的value实际上就是query所对应查询结果,在机器翻译中被看做是目标语所对应的源语言上下文表示。
\end{itemize}

\end{frame}

\subsection{训练及推断}

%%%------------------------------------------------------------------------------------------------------------
%%% 训练
Lee committed
2771
\begin{frame}{训练 - 整体流程}
Lee committed
2772
    \begin{itemize}
Lee committed
2773
        \item 有了一个NMT模型,我们应该怎么使用梯度下降算法来训练一个翻译模型呢? 或者说哪些因素会对RNN训练产生影响?
Lee committed
2774
    \end{itemize}
姜雨帆 committed
2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831
\begin{center}
\begin{tikzpicture}
\begin{scope}

\node [anchor=south west,draw,inner sep=0.7em,minimum width=3em,fill=blue!20!white] (c1) at (0,0) {参数初始化};
\node [anchor=north,draw,inner sep=0.7em,minimum width=3em,fill=yellow!20!white] (c2) at ([yshift=-1em]c1.south) {优化器选择};
\node [anchor=north,draw,inner sep=0.7em,minimum width=3em,fill=red!20!white] (c3) at ([yshift=-1em]c2.south) {学习率调度};
\node [anchor=north,draw,inner sep=0.7em,minimum width=3em,fill=ugreen!20!white] (c4) at ([yshift=-1em]c3.south) {多设备加速};



\node [anchor=east] (line1) at ([xshift=-1.5em,yshift=0em]c1.west) {给定模型结构,初};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {始化的好坏决定了};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {模型最后的性能};


\node [anchor=west] (line11) at ([xshift=1.5em,yshift=0em]c1.east) {选择不同的优化器};
\node [anchor=north west] (line12) at ([yshift=0.3em]line11.south west) {需要对使用的便利};
\node [anchor=north west] (line13) at ([yshift=0.3em]line12.south west) {性与效果进行权衡};


\node [anchor=west] (line21) at ([yshift=-7em]line1.west) {合适的学习率调度};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {方案可以让训练};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {过程又好又快};


\node [anchor=west] (line31) at ([yshift=-7em]line11.west) {当训练非常缓慢的};
\node [anchor=north west] (line32) at ([yshift=0.3em]line31.south west) {时候,可以使用多};
\node [anchor=north west] (line33) at ([yshift=0.3em]line32.south west) {设备并行计算加速};

\draw [->,very thick] ([yshift=-0.1em]c1.south) -- ([yshift=0.1em]c2.north);
\draw [->,very thick] ([yshift=-0.1em]c2.south) -- ([yshift=0.1em]c3.north);
\draw [->,very thick] ([yshift=-0.1em]c3.south) -- ([yshift=0.1em]c4.north);

\begin{pgfonlayer}{background}

\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!10,drop shadow,draw=blue] [fit = (line1) (line2) (line3)] (box1) {};
\draw [->,dotted,very thick,blue] ([yshift=1.5em,xshift=1em]box1.east) -- ([yshift=1.5em,xshift=0.1em]box1.east);


\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=yellow!20!white,drop shadow,draw=black] [fit = (line11) (line12) (line13)] (box2) {};
\draw [->,dotted,very thick,black] ([yshift=-1.5em,xshift=-1em]box2.west) -- ([yshift=-1.5em,xshift=-0.1em]box2.west);


\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!20,drop shadow,draw=red] [fit = (line21) (line22) (line23)] (box3) {};
\draw [->,dotted,very thick,red] ([xshift=1em,yshift=1.5em]box3.east) -- ([xshift=0.1em,yshift=1.5em]box3.east) ;


\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=ugreen!10,drop shadow,draw=ugreen] [fit = (line31) (line32) (line33)] (box4) {};
\draw [->,dotted,very thick,ugreen] ([yshift=-1.5em,xshift=-1em]box4.west) -- ([yshift=-1.5em,xshift=-0.1em]box4.west);
				
\end{pgfonlayer}


\end{scope}
\end{tikzpicture}
\end{center}
Lee committed
2832 2833 2834 2835 2836
\end{frame}

\begin{frame}{训练 - 初始化}
    \begin{itemize}
        \item 模型结构是确定了,但是我们初始化参数还有很多需要注意的地方,否则训练不了一个优秀的模型
Lee committed
2837
        \begin{itemize}
Lee committed
2838 2839
            \item LSTM遗忘门偏置初始为1,也就是始终选择遗忘记忆$c$,可以有效防止初始时$c$里包含的错误信号传播到后面所有时刻
            \item 网络的其他偏置一般都初始化成0,可以有效防止加入过大或过小的偏置后使得激活函数的输出跑到``饱和区'',也就是梯度接近0的区域,使得训练一开始就无法跳出局部极小
xiaotong committed
2840 2841
            \item<2-> 网络的权重矩阵$W$一般使用Xavier参数初始化方法,可以有效稳定训练过程,特别是对于比较``深''的网络$$W \sim \mathcal{U}(-\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}},\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}})$$
            $d_{\mathrm{in}}$和$d_{\mathrm{out}}$分别是$W$的输入和输出的维度大小,参考论文\\
Lee committed
2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860
            \textbf{Understanding the difficulty of training deep feedforward neural networks}\\
            \textbf{Glorot, X., \& Bengio, Y., 2010, In Proc of AISTATS}
        \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{训练 - 优化器}
    \begin{itemize}
        \item 训练RNN我们通常会使用Adam或者SGD两种优化器,它们各有优劣
        \begin{center}
            \footnotesize
            \begin{tabular}{c|c|c}
                & 使用 & 性能 \\
                \hline
                Adam & 一套配置包打天下 & 不算差,但没到极限 \\
                SGD & 换一个任务就得调 & 效果杠杠的 \\
            \end{tabular}
        \end{center}
        \item 因此需要快速得到模型看一下初步效果,选择Adam
xiaotong committed
2861
        \item<2-> 若是需要在一个任务上得到最优的结果,选择SGD
Lee committed
2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874
        % Gradient clipping notes nested under the preceding SGD item.
        % The L2 norm is typeset with \lVert/\rVert (amsmath delimiters) rather than the
        % relation symbol \parallel, and the display uses \[ \] instead of plain-TeX $$.
        \begin{itemize}
            \item 需要注意的是,训练RNN的时候,我们通常会遇到梯度爆炸的问题,也就是梯度突然变得很大,这种情况下需要使用``梯度裁剪''来防止梯度$\pi$超过阈值\[\pi'=\pi \cdot \frac{\mathrm{threshold}}{\max(\mathrm{threshold},\lVert \pi \rVert_2)}\]
            \item 其中$\mathrm{threshold}$是手工设定的梯度大小阈值,$\lVert \cdot \rVert_2$是L2范数
            \item 这个公式含义在于只要梯度大小超过阈值,就按照阈值与当前梯度大小的比例进行放缩
        \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{训练 - 学习率}
    \begin{itemize}
        \item 不同优化器需要的学习率不同,比如Adam一般使用$0.001$或$0.0001$,而SGD则在$0.1\sim 1$之间挑选
        \item 但是无论使用哪个优化器,为了保证训练又快又好,我们通常都需要根据当前的更新次数来调整学习率的大小
        \begin{itemize}
xiaotong committed
2875 2876
            \item<2-> 学习率预热:模型训练初期,梯度通常很大,直接使用很大的学习率很容易让模型跑偏,因此需要学习率有一个从小到大的过程
            \item<2-> 学习率衰减:模型训练接近收敛的时候,使用大学习率会很容易让模型错过局部极小,因此需要学习率逐渐变小来逼近局部最小
Lee committed
2877
        \end{itemize}
xiaotong committed
2878

xiaotong committed
2879
        \visible<2->{
Lee committed
2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893
        \begin{center}
            \begin{tikzpicture}
            \footnotesize{
                \begin{axis}[
                    width=.60\textwidth,
                    height=.40\textwidth,
                    legend style={at={(0.60,0.08)}, anchor=south west},
                    xlabel={\scriptsize{更新次数}},
                    ylabel={\scriptsize{学习率}},
                    xtick=\empty,
                    ytick=\empty,
                    ylabel style={yshift=-2.5em},xlabel style={yshift=1.5em},
                    legend style={yshift=-6pt, legend plot pos=right,font=\scriptsize,cells={anchor=west}}
                ]
Lee committed
2894
                \addplot[orange,line width=1.25pt] coordinates {(329,0.000045) (447,0.000078) (540,0.00012) (661,0.0002) (752,0.00032) (856,0.00051) (975,0.00089) (996,0.001) (6599,0.001) (6624,0.0005) (7200,0.0005) (7218,0.00025) (7784,0.00025) (7821,0.000125) (8398,0.000125)};
Lee committed
2895
                \end{axis}
Lee committed
2896
            }
Lee committed
2897 2898
            \end{tikzpicture}
        \end{center}
xiaotong committed
2899
        }
Lee committed
2900 2901 2902 2903 2904
    \end{itemize}
\end{frame}

\begin{frame}{训练 - 加速}
    \begin{itemize}
xiaotong committed
2905 2906 2907
        \item 万事俱备,只是为什么训练这么慢?\visible<2->{\alert{- RNN需要等前面所有时刻都完成计算以后才能开始计算当前时刻的输出}}
        \item 我有钱,是不是多买几台设备会更快?\visible<2->{\alert{- 可以,但是需要技巧,而且也不是无限增长的}}
        \item<3> 使用多个设备并行计算进行加速的两种方法
Lee committed
2908 2909 2910
        \begin{itemize}
            \item 数据并行:把``输入''分到不同设备上并行计算
            \item 模型并行:把``模型''分到不同设备上并行计算
Lee committed
2911
        \end{itemize}
Lee committed
2912 2913 2914 2915 2916 2917 2918 2919 2920
        \begin{center}
            \small
            \begin{tabular}{c|cc}
                & 优点 & 缺点 \\
                \hline
                数据并行 & \specialcell{l}{并行度高,理论上多大\\的batch就可以有多少\\个设备并行计算} & \specialcell{l}{模型不能大于单个设\\备的极限} \\
                模型并行 & \specialcell{l}{可以对很大的模型进行\\运算} & \specialcell{l}{只能有限并行,比如\\多少层就多少个设备} \\
            \end{tabular}
        \end{center}
xiaotong committed
2921 2922
        \vspace{0.5em}
        \item<3> 这两种方法可以一起使用!!!
Lee committed
2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943
    \end{itemize}
\end{frame}

\begin{frame}{训练 - 数据并行}
    % Data parallelism: a stack of three sentences (the batch) is fanned out to
    % three devices, each drawn as a dashed box around an unrolled RNN.
    % \base is a length set here but declared outside this frame.
    \begin{itemize}
        \item 如果一台设备能完整放下一个RNN模型,那么数据并行可以把一个大batch均匀切分成$n$个小batch,然后分发到$n$个设备上并行计算,最后把结果汇总,相当于把运算时间变为原来的$1/n$
        \vspace{-0.5em}
        \begin{center}
            \hspace*{-0.5cm}
            \begin{tikzpicture}
                \setlength{\base}{1.5em}
                \tikzstyle{samplenode} = [rounded corners=1pt,minimum size=1\base,draw,inner sep=3pt,outer sep=0pt,fill=green!30!white]
                \tikzstyle{rnnnode} = [rounded corners=1pt,minimum size=1\base,draw,inner sep=0pt,outer sep=0pt,fill=blue!30!white]
                \tikzstyle{wordnode} = [font=\footnotesize,align=center]

                \begin{scope}
                    \coordinate (batch0) at (0,0);

                    % Stacked batch: batch\i is offset from batch\j (= \i-1); \k = 4-\i labels
                    % the cards 3,2,1 so that sentence 1 ends up on top.
                    \foreach \i [count=\j from 0,evaluate=\i as \k using int(4-\i)] in {1,2,3}
                        \node [samplenode,anchor=south west] (batch\i) at ([shift={(-1em,-0.5em)}]batch\j.south west) {句子\k};
                    \draw [decorate,decoration={brace}] (batch1.south east) to node [auto,rotate=30,anchor=north,font=\scriptsize] {batch大小} (batch3.south east);

                    % The three sentences after the split, stacked vertically around sample2.
                    \node [samplenode,anchor=west] (sample2) at ([xshift=4em]batch2.east) {句子2};
                    \node [samplenode,anchor=south] (sample3) at ([yshift=3em]sample2.north) {句子3};
                    \node [samplenode,anchor=north] (sample1) at ([yshift=-3em]sample2.south) {句子1};

                    % Fan-out arrows from the batch to each sentence.
                    \foreach \i in {1,2,3}
                        \draw [->,thick] ([xshift=1.5em]batch2.east) -- ([xshift=-3pt]sample\i.west);

                    % One device per sentence: initial state 0, three RNN cells with
                    % input/output stubs, a trailing \cdots, and a dashed frame labelled 设备\i.
                    % Node names (start, rnn0..rnn4, in*, out*, rnn) are reused on every iteration.
                    \foreach \i in {1,2,3}
                    {
                        \coordinate (start) at ([xshift=2em]sample\i.east);
                        \node [wordnode,anchor=west] (rnn0) at (start) {$0$};
                        \foreach \j [count=\k from 0] in {1,2,3}
                        {
                            \node [rnnnode,anchor=west] (rnn\j) at ([xshift=1em]rnn\k.east) {};
                            \draw [-latex'] (rnn\k) to (rnn\j);
                            \coordinate (in\j) at ([yshift=-1em]rnn\j.south);
                            \draw [-latex'] (in\j) to (rnn\j.south);
                            \coordinate (out\j) at ([yshift=1em]rnn\j.north);
                            \draw [-latex'] (rnn\j.north) to (out\j);
                        }
                        \node [wordnode,anchor=west] (rnn4) at ([xshift=1em]rnn3.east) {$\cdots$};
                        \draw [-latex'] (rnn3) to (rnn4);
                        \node [draw,densely dashed,thick,rounded corners=0.3em,fit=(start) (in3) (out3) (rnn4),label={[font=\footnotesize,rotate=90,anchor=north]0:设备\i}] (rnn) {};
                        \draw [->,double] ([xshift=3pt]sample\i.east) -- ([xshift=-3pt]rnn.west);
                    }
                \end{scope}
            \end{tikzpicture}
        \end{center}
    \end{itemize}
\end{frame}

\begin{frame}{训练 - 模型并行}
    % Six-overlay animation of layer-wise model parallelism on a 3-layer, 4-step RNN.
    % On slide k the cells on the k-th anti-diagonal (layer + step = k + 1) are
    % highlighted as the "frontier"; cells computed on earlier slides stay visible.
    % Node naming: rnn<layer><step>; layer 0 holds the input-word positions and
    % step 0 the initial states. \base is a length declared outside this frame.
    % NOTE(review): the \hspace*{-0.5cm} below sits before \begin{center}, unlike the
    % data-parallel frame where it is inside the center environment -- confirm placement.
    % NOTE(review): several word nodes below have empty text ({}); the words may have
    % been lost upstream -- confirm against the original slides.
    \begin{itemize}
        \item 做完了数据并行,仍然太慢了,因为RNN模型太大了,算一个样本也很慢,那么可以把RNN模型按层均匀切分成$l$个小模型,然后分发到$l$个设备上并行计算,相当于把运算时间变为原来的$1/l$
        \hspace*{-0.5cm}
        \begin{center}
            \begin{tikzpicture}
                \setlength{\base}{1.5em}
                \tikzstyle{rnnnode} = [rounded corners=1pt,minimum size=1\base,draw,inner sep=0pt,outer sep=0pt,fill=blue!30!white]
                \tikzstyle{wordnode} = [font=\footnotesize,align=center]
                % activenode: a cell being computed on the current slide (drawn over the plain cell).
                \tikzstyle{activenode} = [rnnnode,fill=purple]
                % frontierbox: rotated red frame fitted around all active cells, with its caption.
                \tikzstyle{frontierbox} = [draw=red,thick,inner sep=7pt,rounded corners=0.3em,rotate fit=-45,label={[font=\footnotesize,align=center]90:正在运算的\\{\color{red} 循环单元}}]

                \begin{scope}
                    % rnn[layer][step]
                    \coordinate (rnn00) at (0,0);
                    % Initial states of layers 1-3 (column 0).
                    \foreach \i [count=\j from 0] in {1,2,3}
                        \node[wordnode] (rnn\i0) at ([yshift=2\base]rnn\j0) {$0$};
                    % Input-word positions of steps 1-4 (row 0).
                    \foreach \i [count=\j from 0] in {1,2,...,4}
                        \coordinate (rnn0\i) at ([xshift=2\base]rnn0\j);

                    % step 1
                    \visible<1->{
                        \node[rnnnode] (rnn11) at ([xshift=2\base]rnn10) {};
                        \draw[-latex'] ([yshift=0.5\base]rnn01) to (rnn11);
                        \draw[-latex'] ([xshift=0.5\base]rnn10) to (rnn11);
                    }
                    \visible<1>{
                        % frontier
                        \node[activenode] () at (rnn11) {};
                        \node[frontierbox,fit=(rnn11)] () {};
                    }

                    % step 2
                    \visible<2->{
                        \node[rnnnode] (rnn12) at ([xshift=2\base]rnn11) {};
                        \node[rnnnode] (rnn21) at ([yshift=2\base]rnn11) {};
                        \draw[-latex'] ([yshift=0.5\base]rnn02) to (rnn12);
                        \draw[-latex'] ([xshift=0.5\base]rnn20) to (rnn21);
                        \draw[-latex'] (rnn11) to (rnn12);
                        \draw[-latex'] (rnn11) to (rnn21);
                    }
                    \visible<2>{
                        % frontier
                        \node[activenode] () at (rnn12) {};
                        \node[activenode] () at (rnn21) {};
                        \node[frontierbox,fit=(rnn12) (rnn21)] () {};
                    }

                    % step 3
                    \visible<3->{
                        \node[rnnnode] (rnn13) at ([xshift=2\base]rnn12) {};
                        \node[rnnnode] (rnn31) at ([yshift=2\base]rnn21) {};
                        \node[rnnnode] (rnn22) at ([xshift=2\base]rnn21) {};
                        \node[wordnode,anchor=south] (o1) at ([yshift=\base]rnn31.north) {};
                        \draw[-latex'] ([yshift=0.5\base]rnn03) to (rnn13);
                        \draw[-latex'] ([xshift=0.5\base]rnn30) to (rnn31);
                        \draw[-latex'] (rnn12) to (rnn13);
                        \draw[-latex'] (rnn21) to (rnn31);
                        \draw[-latex'] (rnn12) to (rnn22);
                        \draw[-latex'] (rnn21) to (rnn22);
                        \draw[-latex'] (rnn31) to (o1);
                    }
                    \visible<3>{
                        % frontier
                        \node[activenode] () at (rnn13) {};
                        \node[activenode] () at (rnn31) {};
                        \node[activenode] () at (rnn22) {};
                        \node[frontierbox,fit=(rnn13) (rnn31) (rnn22)] () {};
                    }

                    % step 4
                    \visible<4->{
                        \node[rnnnode] (rnn14) at ([xshift=2\base]rnn13) {};
                        \node[rnnnode] (rnn23) at ([xshift=2\base]rnn22) {};
                        \node[rnnnode] (rnn32) at ([xshift=2\base]rnn31) {};
                        \node[wordnode,anchor=south] (o2) at ([yshift=\base]rnn32.north) {不错};
                        \draw[-latex'] ([yshift=0.5\base]rnn04) to (rnn14);
                        \draw[-latex'] (rnn13) to (rnn14);
                        \draw[-latex'] (rnn13) to (rnn23);
                        \draw[-latex'] (rnn22) to (rnn23);
                        \draw[-latex'] (rnn22) to (rnn32);
                        \draw[-latex'] (rnn31) to (rnn32);
                        \draw[-latex'] (rnn32) to (o2);
                    }
                    \visible<4>{
                        % frontier
                        \node[activenode] () at (rnn14) {};
                        \node[activenode] () at (rnn23) {};
                        \node[activenode] () at (rnn32) {};
                        \node[frontierbox,fit=(rnn14) (rnn23) (rnn32)] () {};
                    }

                    % step 5
                    \visible<5->{
                        \node[rnnnode] (rnn24) at ([xshift=2\base]rnn23) {};
                        \node[rnnnode] (rnn33) at ([xshift=2\base]rnn32) {};
                        \node[wordnode,anchor=south] (o3) at ([yshift=\base]rnn33.north) {};
                        \draw[-latex'] (rnn14) to (rnn24);
                        \draw[-latex'] (rnn23) to (rnn24);
                        \draw[-latex'] (rnn23) to (rnn33);
                        \draw[-latex'] (rnn32) to (rnn33);
                        \draw[-latex'] (rnn33) to (o3);
                    }
                    \visible<5>{
                        % frontier
                        \node[activenode] () at (rnn24) {};
                        \node[activenode] () at (rnn33) {};
                        \node[frontierbox,fit=(rnn24) (rnn33)] () {};
                    }

                    % step 6
                    \visible<6->{
                        \node[rnnnode] (rnn34) at ([xshift=2\base]rnn33) {};
                        \node[wordnode,anchor=south] (o4) at ([yshift=\base]rnn34.north) {EOS};
                        \draw[-latex'] (rnn33) to (rnn34);
                        \draw[-latex'] (rnn24) to (rnn34);
                        \draw[-latex'] (rnn34) to (o4);
                    }
                    \visible<6>{
                        % frontier
                        \node[activenode] () at (rnn34) {};
                        \node[frontierbox,fit=(rnn34)] () {};
                    }

                    % labels: device k (= layer k) is marked busy (red) on slides k..k+3, idle otherwise
                    \alt<1-4>{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn10.west) to node[wordnode,align=right,left,text=red] {正在使用的\\设备1} ([yshift=\base]rnn10.west);
                    }{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn10.west) to node[wordnode,align=right,left] {空闲的\\设备1} ([yshift=\base]rnn10.west);
                    }
                    \alt<2-5>{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn20.west) to node[wordnode,align=right,left,text=red] {正在使用的\\设备2} ([yshift=\base]rnn20.west);
                    }{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn20.west) to node[wordnode,align=right,left] {空闲的\\设备2} ([yshift=\base]rnn20.west);
                    }
                    \alt<3-6>{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn30.west) to node[wordnode,align=right,left,text=red] {正在使用的\\设备3} ([yshift=\base]rnn30.west);
                    }{
                        \draw[decorate,decoration={brace}] ([yshift=-\base]rnn30.west) to node[wordnode,align=right,left] {空闲的\\设备3} ([yshift=\base]rnn30.west);
                    }

                    % Small layer index next to each initial state.
                    \foreach \i in {1,2,3}
                        \node[wordnode,font=\scriptsize,anchor=south west] () at (rnn\i0.north west) {\i};

                    % Input words under steps 1-4.
                    \node[wordnode] () at (rnn01) {};
                    \node[wordnode] () at (rnn02) {};
                    \node[wordnode] () at (rnn03) {不错};
                    \node[wordnode] () at (rnn04) {};
                \end{scope}
            \end{tikzpicture}
        \end{center}
    \end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 解码
\begin{frame}{推断}
% Greedy left-to-right inference with an attention-based encoder-decoder.
% Overlays: 1 objective, 2 left-to-right remark, 3 encoder, 4 first decoder input
% (EOS) and "[step 1]" label, 5 first decoder column, 6 context vector C_2,
% 7 feedback of "Have" and "[step 2]" label, 8 second decoder column, 9 the rest.
% NOTE(review): node w1 below has empty text (\tiny{}); the source word may have been
% lost upstream -- confirm against the original slides.
\begin{itemize}
\item 使用NMT时,对于源语言句子$\textbf{x}$,需要得到最优译文$\hat{\textbf{y}}$

\vspace{-1.5em}
\begin{displaymath}
\hat{\textbf{y}} = \argmax_{\textbf{y}} \log\textrm{P}(\textbf{y}|\textbf{x}) = \argmax_{\textbf{y}} \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
\end{displaymath}

% The position index is j, matching the factorisation displayed above.
\item<2-> 由于$y_j$的生成需要依赖$y_{j-1}$,因此无法同时生成$\{y_1,\dots,y_n\}$。常用的方法是自左向右逐个单词生成

\end{itemize}

\vspace{-0.8em}
\visible<3->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];

% Encoder: hidden states h_1 ... h_m over source embeddings and source words.
\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\textbf{h}_1$}};
\node [anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{...}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\textbf{h}_m$}};
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [anchor=west] (e2) at ([xshift=1em]e1.east) {\tiny{...}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.8em]e2.south) {\tiny{...}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};

\draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
\draw [->] ([xshift=0.1em]h1.east) -- ([xshift=-0.1em]h2.west);
\draw [->] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]h3.west);
\draw [->] ([xshift=-0.8em]h1.west) -- ([xshift=-0.1em]h1.west) node [pos=0,left,inner sep=2pt] {\tiny{0}};
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};

% Decoder, target embeddings (t1..t4).
\visible<5->{
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]h3.east) {\tiny{$e_y()$}};
}
\visible<8->{
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.5em]t1.east) {\tiny{$e_y()$}};
}
\visible<9->{
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.5em]t2.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.5em]t3.east) {\tiny{$e_y()$}};
\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
}
% Decoder hidden states (s1..s4).
\visible<5->{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
}
\visible<8->{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$}};
}
\visible<9->{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\textbf{s}_4$}};
\node [anchor=west,inner sep=2pt] (s5) at ([xshift=0.3em]s4.east) {\tiny{...}};
}
% Output softmax layers (o1..o4).
\visible<5->{
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
\node [anchor=east] (decoder) at ([xshift=-0.5em]o1.north west) {\scriptsize{\textbf{解码器}}};
}
\visible<8->{
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax}};
}
\visible<9->{
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o4) at ([yshift=1em]s4.north) {\tiny{softmax}};
\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
}
% Decoder input words; each is revealed one overlay before its column.
\visible<4->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
}
\visible<7->{
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{Have}};
}
\visible<9->{
\node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
\node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.6em]t4.south) {\tiny{learned}};
}
% Predicted words and "[step k]" captions above each softmax.
\visible<5->{
\node [anchor=center,inner sep=2pt] (wo1) at ([yshift=1.2em]o1.north) {\tiny{Have}};
}
\visible<4->{
\node [anchor=south,inner sep=2pt] (wos1) at (wo1.north) {\tiny{\textbf{[step 1]}}};
}
\visible<8->{
\node [anchor=center,inner sep=2pt] (wo2) at ([yshift=1.2em]o2.north) {\tiny{you}};
}
\visible<7->{
\node [anchor=south,inner sep=2pt] (wos2) at (wo2.north) {\tiny{\textbf{[step 2]}}};
}
\visible<9->{
\node [anchor=center,inner sep=2pt] (wo3) at ([yshift=1.2em]o3.north) {\tiny{learned}};
\node [anchor=south,inner sep=2pt] (wos3) at (wo3.north) {\tiny{\textbf{[step 3]}}};
\node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{nothing}};
\node [anchor=south,inner sep=2pt] (wos4) at (wo4.north) {\tiny{\textbf{[step 4]}}};
}

% Vertical arrows inside each decoder column (word -> e_y -> s -> softmax -> top1).
\visible<5->{
\foreach \x in {1}{
    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
}
}

\visible<8->{
\foreach \x in {2}{
    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
}
}

\visible<9->{
\foreach \x in {3,4}{
    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
}
}

% Horizontal recurrence between decoder states.
\visible<5->{
\draw [->] ([xshift=-0.8em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left] {\tiny{0}};
}
\visible<8->{
\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
}
\visible<9->{
\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
\draw [->] ([xshift=0.1em]s3.east) -- ([xshift=-0.1em]s4.west);
}

% Dotted feedback: the word predicted at step k becomes the input of step k+1.
\visible<7->{
\draw [->,thick,dotted] (wo1.east) .. controls +(east:1.0) and +(west:1.0) ..(wt2.west);
}
\visible<9->{
\draw [->,thick,dotted] (wo2.east) .. controls +(east:1.3) and +(west:1.1) ..(wt3.west);
\draw [->,thick,dotted] (wo3.east) .. controls +(east:0.9) and +(west:0.9) ..(wt4.west);
}

% Attention context C_2: fed by h_1, h_m and s_1, consumed by s_2.
\visible<6->{
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c2) at ([yshift=2em]h2.north) {\tiny{$\textbf{C}_2$}};
\node [anchor=south] (c2label) at (c2.north) {\tiny{\textbf{注意力机制:上下文}}};
\node [anchor=south] (c2more) at ([yshift=-1.5em]c2.south) {\tiny{...}};
\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c2.250);
\draw [->] (h3.north) .. controls +(north:0.6) and +(290:0.9) .. (c2.290);
\draw [->] ([yshift=-0.3em]s1.west) .. controls +(west:2) and +(-50:0.3) .. (c2.-40);
}
\visible<8->{
\draw [->] (c2.0) -- ([xshift=1.358in]c2.0) -- ([yshift=0.3em,xshift=-1.2em]s2.west) -- ([yshift=0.3em,xshift=-0.1em]s2.west);
}

% Contexts C_3 and C_4 are drawn below the decoder, each with a short incoming stub.
\visible<9->{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\textbf{C}_3$}};
\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}

\visible<9->{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\textbf{C}_4$}};
\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s4.west);
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 解码 - beam search
\begin{frame}{推断 - Beam Search}
% Contrasts greedy search with beam search (beam width b = 3 in the picture).
% Overlays: 2 picture appears (first input word EOS), 3 first decoder column and the
% vocabulary distribution, 4 top-3 selection and caption, 5 three kept hypotheses,
% 6 feedback into step 2, 7 second column (x3) with C_2, 8 third column with C_3.
% Stacked "copy" nodes are drawn slightly offset to suggest three parallel hypotheses.
\begin{itemize}
\item \textbf{Greedy Search}: 目标语每一个位置,输出层的Softmax可以得到所有单词的概率,然后选择一个概率最大单词输出,下一个位置的预测就基于这一步输出的单词
\item \textbf{Beam Search}: 为了避免贪婪方法造成的错误累加,可以每次对$b$个单词进行扩展,而不是只使用一个单词,其中$b$称做束的宽度 - 这样可以搜索更多可能的译文
\end{itemize}

\vspace{-0.3em}
\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{wnode} = [minimum height=1.0em,minimum width=3.5em,inner sep=2pt,rounded corners=1pt,draw,fill=white];


% Target embeddings (t1..t3).
\visible<3->{
\node [rnnnode,anchor=west,fill=green!20] (t1) at (0,0) {\tiny{$e_y()$}};
}
\visible<7->{
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=2.2em]t1.east) {\tiny{$e_y()$ ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=2.2em]t2.east) {\tiny{$e_y()$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (t4) at ([xshift=0.3em]t3.east) {\tiny{...}};
}
% Decoder hidden states (s1..s3).
\visible<3->{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
}
\visible<7->{
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$ ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$ ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (s4) at ([xshift=0.3em]s3.east) {\tiny{...}};
}
% Softmax layers (o1..o3).
\visible<3->{
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]s1.north) {\tiny{softmax}};
}
\visible<7->{
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]s2.north) {\tiny{softmax ($\times 3$)}};
}
\visible<8->{
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]s3.north) {\tiny{softmax ($\times 3$)}};
\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
}

% First decoder input; has no \visible of its own, so it follows the enclosing <2->.
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};

% Input words of steps 2 and 3, drawn as a stack of three cards.
\visible<6->{
\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
\node [wnode,anchor=north] (wt2copy1) at ([xshift=-0.2em,yshift=-0.2em]wt2.north) {\tiny{Have}};
\node [wnode,anchor=north] (wt2copy2) at ([xshift=-0.4em,yshift=-0.4em]wt2.north) {\tiny{Have}};
}

\visible<8->{
\node [wnode,anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{you}};
\node [wnode,anchor=north] (wt3copy1) at ([xshift=-0.2em,yshift=-0.2em]wt3.north) {\tiny{you}};
\node [wnode,anchor=north] (wt3copy2) at ([xshift=-0.4em,yshift=-0.4em]wt3.north) {\tiny{you}};
}

% Output words of steps 1-3, again as stacks of three cards.
\visible<5->{
\node [wnode,anchor=center,inner sep=2pt] (wo1) at ([xshift=0.4em,yshift=1.8em]o1.north) {\tiny{Have}};
\node [wnode,anchor=north] (wo1copy1) at ([xshift=-0.2em,yshift=-0.2em]wo1.north) {\tiny{Have}};
\node [wnode,anchor=north] (wo1copy2) at ([xshift=-0.4em,yshift=-0.4em]wo1.north) {\tiny{Have}};
}

\visible<8->{
\node [wnode,anchor=center,inner sep=2pt] (wo2) at ([xshift=0.4em,yshift=1.8em]o2.north) {\tiny{you}};
\node [wnode,anchor=north] (wo2copy1) at ([xshift=-0.2em,yshift=-0.2em]wo2.north) {\tiny{you}};
\node [wnode,anchor=north] (wo2copy2) at ([xshift=-0.4em,yshift=-0.4em]wo2.north) {\tiny{you}};
}

\visible<8->{
\node [wnode,anchor=center,inner sep=2pt] (wo3) at ([xshift=0.4em,yshift=1.8em]o3.north) {\tiny{learned}};
\node [wnode,anchor=north] (wo3copy1) at ([xshift=-0.2em,yshift=-0.2em]wo3.north) {\tiny{learned}};
\node [wnode,anchor=north] (wo3copy2) at ([xshift=-0.4em,yshift=-0.4em]wo3.north) {\tiny{learned}};
}

% Vertical arrows of column 1; its top-3 arrow is revealed separately on slide 5.
\visible<3->{
\foreach \x in {1}{
    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
}
}

\visible<5->{
 \draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top-3}};
 }

\visible<7->{
\foreach \x in {2}{
    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
}
}

\visible<8->{
\foreach \x in {3}{
    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]o\x.south);
    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top-3}};
}
}

% Horizontal recurrence between decoder states.
\visible<3->{
\draw [->] ([xshift=-0.5em]s1.west) -- ([xshift=-0.1em]s1.west) node [pos=0,left,inner sep=1pt] {\tiny{0}};
}
\visible<7->{
\draw [->] ([xshift=0.1em]s1.east) -- ([xshift=-0.1em]s2.west);
}
\visible<8->{
\draw [->] ([xshift=0.1em]s2.east) -- ([xshift=-0.1em]s3.west);
}

% Dotted feedback from the outputs of step k to the inputs of step k+1.
\visible<6->{
\draw [->,very thick,dotted] (wo1.east) .. controls +(east:0.6) and +(west:0.8) ..(wt2copy2.west);
}
\visible<8->{
\draw [->,very thick,dotted] (wo2.east) .. controls +(east:0.6) and +(west:0.8) ..(wt3copy2.west);
}

% Context vectors C_2 / C_3, stacked three deep, feeding s_2 / s_3.
\visible<7->{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c2) at ([yshift=-2.5em]t1.south) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy1) at ([yshift=-0.1em,xshift=-0.1em]c2) {\tiny{$\textbf{C}_2$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c2copy2) at ([yshift=-0.2em,xshift=-0.2em]c2) {\tiny{$\textbf{C}_2$}};
\draw [->] ([xshift=-0.9em]c2.west) -- ([xshift=-0.3em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s2.west);
}

\visible<8->{
\node [circle,draw,anchor=north,inner sep=2pt,fill=orange!20] (c3) at ([yshift=-2.5em]t2.south) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy1) at ([yshift=-0.1em,xshift=-0.1em]c3) {\tiny{$\textbf{C}_3$}};
\node [circle,draw,inner sep=2pt,fill=orange!20] (c3copy2) at ([yshift=-0.2em,xshift=-0.2em]c3) {\tiny{$\textbf{C}_3$}};
\draw [->] ([xshift=-0.9em]c3.west) -- ([xshift=-0.3em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:1.5) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]s3.west);
}

% Word probability table produced by the first softmax.
\visible<3->{
\node [anchor=east] (vocab) at ([xshift=-5em]s1.west) {\tiny{$\begin{bmatrix} \textrm{Have} & 0.50 \\ \textrm{I} & 0.02 \\ \textrm{it} & 0.03 \\ \textrm{has} & 0.30 \\ \textrm{you} & 0.01 \\ \textrm{the} & 0.01 \\ \textrm{a} & 0.01 \\ \textrm{an} & 0.02 \\ \textrm{he} & 0.03 \\ \textrm{she} & 0.01 \\ \textrm{are} & 0.00 \\ \textrm{am} & 0.01 \\ ... & ... \end{bmatrix}$}};
\node [anchor=south] (vocablabel) at (vocab.north) {\tiny{单词的概率分布}};
\draw [->,red,very thick,dotted] (o1.west) .. controls +(west:1) and +(east:2) .. ([yshift=1em]vocab.south east);
}

% Top-3 candidates picked from the table, plus the picture caption
% (the caption was wrapped in a second, redundant \visible<4->).
\visible<4->{
\node [anchor=east,inner sep=1pt] (vocabtopn) at ([xshift=-0.5em,yshift=-0.5em]wo1.west) {\tiny{$\begin{bmatrix} \textrm{Have} \\ \textrm{has} \\ \textrm{it} \end{bmatrix}$}};
\draw [->] ([yshift=-1.6em,xshift=-0.4em]vocab.north east) .. controls +(east:1) and +(west:1) ..  ([xshift=0.1em,yshift=0.4em]vocabtopn.west) node [pos=0.3,below] (topnlabel) {\tiny{top-3}};

\node [anchor=north] (cap) at (vocab.south east) {\scriptsize{\textbf{束搜索($b=3$)}}};
}


\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%
%%% 解码 - 长度惩罚和覆盖度
\begin{frame}{推断 - 其它特征}
% Length penalty (lp) and coverage penalty (cp) used to rescore beam-search
% hypotheses. The score is defined on the log-probability, matching the
% argmax-of-log-probability objective used on the inference slide.
% Adjacent inline formulas are separated by text: "$a$$b$" would open display math.
\begin{itemize}
\item 直接用$\textrm{P}(\textbf{y}|\textbf{x})$进行解码,面临两方面问题
    \begin{itemize}
    \item $\textrm{P}(y_j|\textbf{y}_{<j},\textbf{x})$进行乘积会导致长句的概率很低
    \item 模型本身并没有考虑每个源语言单词被使用的程度,比如一个单词可能会被翻译了很多``次''
    \end{itemize}
\item<2-> 因此,解码时会使用其它特征与$\textrm{P}(\textbf{y}|\textbf{x})$一起组成模型得分$\textrm{score}(\textbf{y},\textbf{x})$,$\textrm{score}(\textbf{y},\textbf{x})$也作为beam search 的排序依据
    % a_{ij}: attention weight between source position i and target position j.
    \begin{align*}
    \textrm{score}(\textbf{y},\textbf{x}) & = \log\textrm{P}(\textbf{y}|\textbf{x})/\textrm{lp}(\textbf{y}) + \textrm{cp}(\textbf{y},\textbf{x}) \\
    \textrm{lp}(\textbf{y}) & = \frac{(5 + |\textbf{y}|)^\alpha}{(5 + 1)^\alpha} \\
    \textrm{cp}(\textbf{y},\textbf{x}) & = \beta \cdot \sum\nolimits_{i=1}^{|\textbf{x}|} \log (\min(\sum\nolimits_{j=1}^{|\textbf{y}|} a_{ij}, 1))
    \end{align*}

    \vspace{-0.5em}
    \begin{itemize}
    \item lp会惩罚译文过短的结果(长度惩罚);cp会惩罚把某些源语单词对应到很多目标语单词的情况(覆盖度),被覆盖的程度用$\sum\nolimits_{j=1}^{|\textbf{y}|} a_{ij}$度量;$\alpha$和$\beta$是超参,需要经验性设置
    \end{itemize}
\end{itemize}

\end{frame}

%

%%%------------------------------------------------------------------------------------------------------------
%%% GNMT
\begin{frame}{成功案例 - GNMT}
%% GNMT的图和几句话说它多牛
%
\begin{itemize}
    \item 使用残差连接来训练8层的编码解码器  % 提升性能
    \item 编码器只有最下面2层为双向LSTM  % 提升速度
    \item 解码器只使用最底层输出作为注意力机制的query  % 提升速度
\end{itemize}
%
% GNMT architecture figure. This part sets up the shared styles and lays out
% the encoder as a grid of nodes named enc<row><col> (row 1 = lowest layer).
\begin{center}
    \begin{tikzpicture}
        % \base is the gap used between neighbouring nodes in every "right=/above=" below.
        % NOTE(review): \base is not declared in this chunk; presumably a \newlength elsewhere in the file.
        \setlength{\base}{0.25cm}

        % rnnnode: small filled box for a network cell; wnode: borderless text node for words/labels.
        \tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=1.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20,font=\scriptsize];
        \tikzstyle{wnode} = [minimum height=1.2em,inner sep=3pt,rounded corners=1pt,font=\scriptsize];

        % Encoder
        \begin{scope}
            % Embedding row (green). Column 3 is a white "..." placeholder in every row.
            \node[rnnnode,fill=green!20] (encemb1) at (0,0) {};
            \node[rnnnode,fill=green!20,right=\base of encemb1] (encemb2) {};
            \node[rnnnode,draw=white,fill=white,right=\base of encemb2] (encemb3) {$\cdots$};
            \node[rnnnode,fill=green!20,right=\base of encemb3] (encemb4) {};

            % Layer 1, stacked directly above the embeddings.
            \node[rnnnode,above=\base of encemb1] (enc11) {};
            \node[rnnnode,above=\base of encemb2] (enc12) {};
            \node[rnnnode,draw=white,fill=white,above=\base of encemb3] (enc13) {$\cdots$};
            \node[rnnnode,above=\base of encemb4] (enc14) {};
            % Layers 2-4: each row \cur is placed above row \prev (= \cur - 1).
            \foreach \cur [count=\prev from 1] in {2,...,4}
            {
                \node[rnnnode,above=\base of enc\prev1] (enc\cur1) {};
                \node[rnnnode,above=\base of enc\prev2] (enc\cur2) {};
                \node[rnnnode,draw=white,fill=white,above=\base of enc\prev3] (enc\cur3) {$\cdots$};
                \node[rnnnode,above=\base of enc\prev4] (enc\cur4) {};
            }
            % Row 5 is a white "..." row standing in for the layers that are not drawn.
            \node[rnnnode,draw=white,fill=white,above=\base of enc41] (enc51) {$\cdots$};
            \node[rnnnode,draw=white,fill=white,above=\base of enc42] (enc52) {$\cdots$};
            \node[rnnnode,draw=white,fill=white,above=\base of enc43] (enc53) {};
            \node[rnnnode,draw=white,fill=white,above=\base of enc44] (enc54) {$\cdots$};

            % Row 6 is the top encoder layer drawn in the figure.
            \node[rnnnode,above=\base of enc51] (enc61) {};
            \node[rnnnode,above=\base of enc52] (enc62) {};
            \node[rnnnode,draw=white,fill=white,above=\base of enc53] (enc63) {$\cdots$};
            \node[rnnnode,above=\base of enc54] (enc64) {};

            % words
            % NOTE(review): encword1 has an empty label while encword2/encword4 carry words;
            % possibly the first source word is missing here. Confirm against the intended sentence.
            \node[wnode,below=0pt of encemb1] (encword1) {};
            \node[wnode,below=0pt of encemb2] (encword2) {什么};
            \node[wnode,below=0pt of encemb4] (encword4) {EOS};

            % connections
            % Layer 1 recurrence is drawn left to right.
            \draw[-latex'] (enc11) to (enc12);
            \draw[-latex'] (enc12) to (enc13);
            \draw[-latex'] (enc13) to (enc14);
xiaotong committed
3555

Lee committed
3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582
            % Layer 2 recurrence is drawn right to left (columns 4 -> 3 -> 2 -> 1).
            \foreach \src/\dst in {4/3,3/2,2/1}
            {
                \draw[-latex'] (enc2\src) to (enc2\dst);
            }

            % Layers 3, 4 and 6 recurrence is drawn left to right.
            \foreach \layer in {3,4,6}
            {
                \foreach \src/\dst in {1/2,2/3,3/4}
                {
                    \draw[-latex'] (enc\layer\src) to (enc\layer\dst);
                }
            }

            % Embeddings feed layer 1 straight up (column 3 is the "..." placeholder and is skipped).
            \foreach \col in {1,2,4}
            {
                \draw[-latex'] (encemb\col) to (enc1\col);
            }

            % Curved links on the right side: embeddings also feed layer 2 directly.
            \foreach \col in {1,2,4}
            {
                \draw[-latex'] ([xshift=2pt]encemb\col.north) to [out=30,in=-30] ([xshift=2pt]enc2\col.south);
            }

            % Curved links on the left side: layer 1 skips over layer 2 into layer 3.
            \foreach \col in {1,2,4}
            {
                \draw[-latex'] ([xshift=-2pt]enc1\col.north) to [out=150,in=-150] ([xshift=-2pt]enc3\col.south);
            }
xiaotong committed
3583

Lee committed
3584 3585 3586 3587 3588 3589 3590
            \draw[-latex'] (enc22) to (enc32);
            \draw[-latex'] (enc21) to (enc31);
            \draw[-latex'] (enc24) to (enc34);

            \draw[-latex'] ([xshift=-2pt]enc31.north) to [out=150,in=-150] ([xshift=-2pt]enc51.south);
            \draw[-latex'] ([xshift=-2pt]enc32.north) to [out=150,in=-150] ([xshift=-2pt]enc52.south);
            \draw[-latex'] ([xshift=-2pt]enc34.north) to [out=150,in=-150] ([xshift=-2pt]enc54.south);
xiaotong committed
3591

Lee committed
3592 3593 3594
            \draw[-latex'] (enc31) to (enc41);
            \draw[-latex'] (enc32) to (enc42);
            \draw[-latex'] (enc34) to (enc44);
xiaotong committed
3595

Lee committed
3596 3597 3598
            \draw[-latex'] (enc41) to (enc51);
            \draw[-latex'] (enc42) to (enc52);
            \draw[-latex'] (enc44) to (enc54);
xiaotong committed
3599

Lee committed
3600 3601 3602
            \draw[-latex'] (enc51) to (enc61);
            \draw[-latex'] (enc52) to (enc62);
            \draw[-latex'] (enc54) to (enc64);
xiaotong committed
3603

Lee committed
3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615
            \draw[-latex'] (enc61) to ([yshift=\base]enc61.north);
            \draw[-latex'] (enc62) to ([yshift=\base]enc62.north);
            \draw[-latex'] (enc64) to ([yshift=\base]enc64.north);
        \end{scope}

        \node[rnnnode,fill=orange!20,minimum width=3.5cm,anchor=south west] (attention) at ([yshift=\base]enc61.north west) {注意力机制};

        \begin{scope}
            \node[rnnnode,fill=green!20,right=2.5cm of encemb4] (decemb1) {};
            \node[rnnnode,fill=green!20,right=\base of decemb1] (decemb2) {};
            \node[rnnnode,draw=white,fill=white,right=\base of decemb2] (decemb3) {$\cdots$};
            \node[rnnnode,fill=green!20,right=\base of decemb3] (decemb4) {};
xiaotong committed
3616

Lee committed
3617 3618 3619 3620
            \node[rnnnode,above=\base of decemb1] (dec11) {};
            \node[rnnnode,above=\base of decemb2] (dec12) {};
            \node[rnnnode,draw=white,fill=white,above=\base of decemb3] (dec13) {$\cdots$};
            \node[rnnnode,above=\base of decemb4] (dec14) {};
xiaotong committed
3621

Lee committed
3622 3623 3624 3625
            \node[rnnnode,above=\base of dec11] (dec21) {};
            \node[rnnnode,above=\base of dec12] (dec22) {};
            \node[rnnnode,draw=white,fill=white,above=\base of dec13] (dec23) {$\cdots$};
            \node[rnnnode,above=\base of dec14] (dec24) {};
xiaotong committed
3626

Lee committed
3627 3628 3629 3630
            \node[rnnnode,above=\base of dec21] (dec31) {};
            \node[rnnnode,above=\base of dec22] (dec32) {};
            \node[rnnnode,draw=white,fill=white,above=\base of dec23] (dec33) {$\cdots$};
            \node[rnnnode,above=\base of dec24] (dec34) {};
xiaotong committed
3631

Lee committed
3632 3633 3634 3635
            \node[rnnnode,draw=white,fill=white,above=\base of dec31] (dec41) {$\cdots$};
            \node[rnnnode,draw=white,fill=white,above=\base of dec32] (dec42) {$\cdots$};
            \node[rnnnode,draw=white,fill=white,above=\base of dec33] (dec43) {};
            \node[rnnnode,draw=white,fill=white,above=\base of dec34] (dec44) {$\cdots$};
xiaotong committed
3636

Lee committed
3637 3638 3639 3640
            \node[rnnnode,above=\base of dec41] (dec51) {};
            \node[rnnnode,above=\base of dec42] (dec52) {};
            \node[rnnnode,draw=white,fill=white,above=\base of dec43] (dec53) {$\cdots$};
            \node[rnnnode,above=\base of dec44] (dec54) {};
xiaotong committed
3641

Lee committed
3642 3643 3644 3645 3646 3647 3648 3649 3650
            \node[rnnnode,fill=blue!20,above=\base of dec51] (softmax1) {};
            \node[rnnnode,fill=blue!20,above=\base of dec52] (softmax2) {};
            \node[rnnnode,draw=white,fill=white,above=\base of dec53] (softmax3) {$\cdots$};
            \node[rnnnode,fill=blue!20,above=\base of dec54] (softmax4) {};

            % words
            \node[wnode,below=0pt of decemb1] (decinword1) {SOS};
            \node[wnode,below=0pt of decemb2] (decinword2) {Have};
            \node[wnode,below=0pt of decemb4] (decinword4) {?};
xiaotong committed
3651

Lee committed
3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663
            \node[wnode,above=0pt of softmax1] (decoutword1) {Have};
            \ExtractX{$(softmax2.north)$}
            \ExtractY{$(decoutword1.base)$}
            \node[wnode,anchor=base] (decoutword2) at (\XCoord,\YCoord) {you};
            \ExtractX{$(softmax4.north)$}
            \ExtractY{$(decoutword1.base)$}
            \node[wnode,anchor=base] (decoutword4) at (\XCoord,\YCoord) {EOS};

            % connections
            \draw[-latex'] (dec11) to (dec12);
            \draw[-latex'] (dec12) to (dec13);
            \draw[-latex'] (dec13) to (dec14);
xiaotong committed
3664

Lee committed
3665 3666 3667
            \draw[-latex'] (dec21) to (dec22);
            \draw[-latex'] (dec22) to (dec23);
            \draw[-latex'] (dec23) to (dec24);
xiaotong committed
3668

Lee committed
3669 3670 3671
            \draw[-latex'] (dec31) to (dec32);
            \draw[-latex'] (dec32) to (dec33);
            \draw[-latex'] (dec33) to (dec34);
xiaotong committed
3672

Lee committed
3673 3674 3675 3676 3677 3678 3679
            \draw[-latex'] (dec51) to (dec52);
            \draw[-latex'] (dec52) to (dec53);
            \draw[-latex'] (dec53) to (dec54);

            \draw[-latex'] (decemb1) to (dec11);
            \draw[-latex'] (decemb2) to (dec12);
            \draw[-latex'] (decemb4) to (dec14);
xiaotong committed
3680

Lee committed
3681 3682 3683 3684 3685 3686 3687
            \foreach \cur [count=\prev from 1] in {2,...,5}
            {
                \draw[-latex'] (dec\prev1) to (dec\cur1);
                \draw[-latex'] (dec\prev2) to (dec\cur2);
                \draw[-latex'] (dec\prev4) to (dec\cur4);
            }

Lee committed
3688 3689 3690
            \draw[-latex'] ([xshift=-2pt]dec21.north) to [out=150,in=-150] ([xshift=-2pt]dec41.south);
            \draw[-latex'] ([xshift=-2pt]dec22.north) to [out=150,in=-150] ([xshift=-2pt]dec42.south);
            \draw[-latex'] ([xshift=-2pt]dec24.north) to [out=150,in=-150] ([xshift=-2pt]dec44.south);
Lee committed
3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742

            % Top decoder layer feeds the softmax boxes (column 3 is the "..." placeholder).
            \draw[-latex'] (dec51) to (softmax1);
            \draw[-latex'] (dec52) to (softmax2);
            \draw[-latex'] (dec54) to (softmax4);
        \end{scope}

        % attention connections
        % Query: the bottom decoder layer (dec11) is routed into the attention box.
        \draw[-latex',rounded corners=2pt] (dec11) -| ([xshift=-0.4cm]attention.south east);

        % The four groups below each route one line from the attention box's east side
        % into the west side of a second-column decoder cell (dec12, dec22, dec32, dec52).
        % Each group uses a slightly larger xshift (9/11/13/15pt) and a different exit
        % yshift (-3/-1/1/3pt) so the four lines run side by side.
        % NOTE(review): \ExtractX / \ExtractY are defined outside this chunk; presumably they
        % store the x / y component of the given point in \XCoord / \YCoord. Confirm.
        \ExtractX{$([xshift=9pt]attention.east)$}
        \ExtractY{$([yshift=2pt]dec11.north)$}
        \coordinate (tmp1) at (\XCoord,\YCoord);
        \ExtractX{$([xshift=-5pt]dec12.west)$}
        \coordinate (tmp2) at (\XCoord,\YCoord);
        \draw[-latex',rounded corners=2pt] ([yshift=-3pt]attention.east) -| (tmp1) -- (tmp2) |- ([yshift=3pt]dec12.west);

        \ExtractX{$([xshift=11pt]attention.east)$}
        \ExtractY{$([yshift=2pt]dec21.north)$}
        \coordinate (tmp1) at (\XCoord,\YCoord);
        \ExtractX{$([xshift=-5pt]dec22.west)$}
        \coordinate (tmp2) at (\XCoord,\YCoord);
        \draw[-latex',rounded corners=2pt] ([yshift=-1pt]attention.east) -| (tmp1) -- (tmp2) |- ([yshift=3pt]dec22.west);

        \ExtractX{$([xshift=13pt]attention.east)$}
        \ExtractY{$([yshift=2pt]dec31.north)$}
        \coordinate (tmp1) at (\XCoord,\YCoord);
        \ExtractX{$([xshift=-5pt]dec32.west)$}
        \coordinate (tmp2) at (\XCoord,\YCoord);
        \draw[-latex',rounded corners=2pt] ([yshift=1pt]attention.east) -| (tmp1) -- (tmp2) |- ([yshift=3pt]dec32.west);

        \ExtractX{$([xshift=15pt]attention.east)$}
        \ExtractY{$([yshift=2pt]dec51.north)$}
        \coordinate (tmp1) at (\XCoord,\YCoord);
        \ExtractX{$([xshift=-5pt]dec52.west)$}
        \coordinate (tmp2) at (\XCoord,\YCoord);
        \draw[-latex',rounded corners=2pt] ([yshift=3pt]attention.east) -| (tmp1) -- (tmp2) |- ([yshift=3pt]dec52.west);

        % label
        % Braces spanning the drawn layer stack on each side, both labelled "8层".
        \draw[decorate,decoration={brace}] ([xshift=-5pt]enc11.south west) to node [auto,font=\scriptsize,name=label1] {8层} ([xshift=-5pt]enc61.north west);
        \draw[decorate,decoration={brace,mirror}] ([xshift=5pt]dec14.south east) to node [auto,swap,font=\scriptsize,name=label2] {8层} ([xshift=5pt]dec54.north east);
        % Dashed frames around encoder and decoder, drawn on the background layer.
        % The extra (tmp) coordinates pad each fitted box beyond its brace label.
        \begin{pgfonlayer}{background}
            \coordinate (tmp) at ([xshift=-4pt]label1.west);
            \node[draw,densely dashed,rounded corners=2pt,inner sep=2pt,fit=(label1) (encword1) (attention) (tmp)] (encoder) {};
            \ExtractX{$([xshift=4pt]label2.east)$}
            \ExtractY{$([yshift=6pt]decoutword4.north)$}
            \coordinate (tmp) at (\XCoord,\YCoord);
            \node[draw,densely dashed,rounded corners=2pt,inner sep=2pt,fit=(label2) (decinword1) (decoutword4) (tmp)] (decoder) {};
        \end{pgfonlayer}
        % Captions placed in the top corners of the two dashed frames.
        \node[wnode,anchor=north west] () at (encoder.north west) {编码器};
        \node[wnode,anchor=north east] () at (decoder.north east) {解码器};
    \end{tikzpicture}
\end{center}
xiaotong committed
3743 3744 3745
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
姜雨帆 committed
3746 3747 3748 3749
%%% 实验结果
\begin{frame}{效果}
    %% 使用注意力机制带来的提升
    %% 各大评测比赛没有不使用注意力机制的系统,已经成为标配
姜雨帆 committed
3750

姜雨帆 committed
3751 3752 3753 3754 3755
    \begin{itemize}
    \item 在引入注意力机制之前,神经机器翻译(RNNSearch)的性能要弱于统计机器翻译(PBMT)
    \item 加入注意力机制和深层网络之后,神经机器翻译性能有了很大幅度的提升
    \item 虽然网络深度增加了,但是通过相应的结构设计和解码策略保证了解码速度
    \end{itemize}
姜雨帆 committed
3756

姜雨帆 committed
3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781
    {
        % Results table: BLEU on EN-DE / EN-FR plus CPU decoding time, GNMT against earlier systems.
        % The outer one-column tabular stacks the data table and a caption line beneath it.
        \footnotesize
        \begin{center}
            \setlength{\tabcolsep}{3pt}
            \renewcommand\arraystretch{1}
            \begin{tabular}{l}
                \begin{tabular}{lcccl}
                    \specialrule{1pt}{1pt}{1pt}
                    \multirow{2}{*}{\#} & \multicolumn{2}{c}{BLEU} & \multicolumn{2}{c}{ \multirow{2}{*}{CPU decoding time}}\\
                    \cline{2-3}
                    & EN-DE & EN-FR & \\
                    \specialrule{0.6pt}{1pt}{1pt}
                    PBMT & 20.7 & 37.0 & \multicolumn{2}{c}{-} \\
                    RNNSearch & 16.5 & - & \multicolumn{2}{c}{-} \\
                    LSTM(6 layers) & - & 31.5 & \multicolumn{2}{c}{-} \\
                    Deep-Att & 20.6 & 37.7 & \multicolumn{2}{c}{-} \\
                    \specialrule{0.6pt}{1pt}{1pt}
                    GNMT & 24.6 & 39.0 & \multicolumn{2}{c}{0.2s per sentence} \\
                    \specialrule{1pt}{1pt}{1pt}
                \end{tabular}\\
                \addlinespace[-0.3ex]
                % Plain caption line. The previous "\footnote *..." took "*" as the footnote
                % argument, giving a stray footnote mark and a footnote containing only "*".
                *GNMT versus previous state-of-the-art models\\
            \end{tabular}
        \end{center}
    }
姜雨帆 committed
3782

姜雨帆 committed
3783 3784 3785
    \end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
3786 3787 3788 3789
\section{Transformer}

%%%------------------------------------------------------------------------------------------------------------
%%% Transformer模型部分
姜雨帆 committed
3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864
\begin{frame}{Transformer}

% Section opener: a highlighted banner, then a schematic with one column per input token
% (token label -> blue box -> Self-Attention box -> blue box -> empty output slot).
\vspace{5.0em}
 \begin{tcolorbox}[enhanced,size=normal,left=13mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{Transformer以及自注意机制}
}
\end{tcolorbox}

\begin{center}
\begin{tikzpicture}

\begin{scope}[scale=0.7]
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=3em,minimum height=0.8em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]

% Middle row: four Self-Attention boxes, each anchored 1em to the right of the previous one.
\node [anchor=west,rnnnode] (node11) at (0,0) {\tiny{Self-Attention}};
\foreach \cur [count=\prev from 1] in {2,...,4}
{
    \node [anchor=west,rnnnode] (node1\cur) at ([xshift=1em]node1\prev.east) {\tiny{Self-Attention}};
}

% Lower row: an empty blue box under each Self-Attention box.
\foreach \col in {1,...,4}
{
    \node [anchor=north,rnnnode,fill=blue!30!white] (e\col) at ([yshift=-2em]node1\col.south) {\tiny{}};
}

% Token labels under the blue boxes (the second label is empty in the source).
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
\node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};

% Token label -> blue box.
\foreach \col in {1,...,4}
{
    \draw [->,thick] ([yshift=0.1em]w\col.north)--([yshift=-0.1em]e\col.south);
}

% Blue box -> the Self-Attention box directly above it.
\foreach \col in {1,...,4}
{
    \draw [->,thick] ([yshift=0.1em]e\col.north)--([yshift=-0.1em]node1\col.south);
}

% The second Self-Attention box additionally receives arrows from the other three blue boxes.
\foreach \col/\dx in {1/-0.5em,3/0.5em,4/1.5em}
{
    \draw [->,thick] ([yshift=0.1em]e\col.north)--([xshift=\dx,yshift=-0.2em]node12.south);
}

% Upper row: an empty blue box above each Self-Attention box.
\foreach \col in {1,...,4}
{
    \node [anchor=south,rnnnode,fill=blue!30!white] (node2\col) at ([yshift=2.0em]node1\col.north) {\tiny{}};
}

% Empty output placeholders above the upper row (used only as arrow targets).
\foreach \col in {1,...,4}
{
    \node [anchor=south] (output\col) at ([yshift=1em]node2\col.north) {\Large{\textbf{}}};
}

% Upper blue box -> output placeholder.
\foreach \col in {1,...,4}
{
    \draw [->,thick] ([yshift=0.1em]node2\col.north)--([yshift=-0.1em]output\col.south);
}

% Self-Attention box -> upper blue box.
\foreach \col in {1,...,4}
{
    \draw [->,thick] ([yshift=0.1em]node1\col.north)--([yshift=-0.1em]node2\col.south);
}

% Disabled: horizontal links between neighbouring Self-Attention boxes.
%\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
%\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
%\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
%\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
%\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);
\end{scope}

\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
3865 3866 3867
\subsection{自注意力机制}

%%%------------------------------------------------------------------------------------------------------------
姜雨帆 committed
3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920
%%% 自注意力机制
\begin{frame}{自注意力机制}
% Motivation slide with three items:
%   1. a word sequence where every arrow links a word to its left neighbour,
%   2. (from overlay 2) the same sequence with arrows going directly from w_m to earlier positions,
%   3. (from overlay 3) the self-attention statement and the paper reference.
\begin{itemize}
\item 使用循环神经网络对源语、目标语建模进行信息提取效果很好,但是当序列过长时,词汇之间信息传递距离过长,导致模型的信息提取能力变差。

\vspace{0.3em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Sequence w_1 ... w_m laid out left to right; w_m is highlighted in green.
\node [anchor=west] (w0) at (0,0) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
% \ldots instead of three literal periods: correct dot spacing, same (low) node height,
% so the yshift=0.2em corrections on the w3 anchors below still apply.
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$\ldots$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
% One arc per adjacent pair, each pointing from a word to its left neighbour.
\draw [->,thick,red] (w1.north).. controls +(130:0.5) and +(50:0.5) .. (w0.north);
\draw [->,thick,red] (w2.north).. controls +(130:0.5) and +(50:0.5) .. (w1.north);
\draw [->,thick,red] ([yshift=0.2em]w3.north).. controls +(130:0.5) and +(50:0.5) .. (w2.north);
\draw [->,thick,red] (w4.north).. controls +(130:0.5) and +(50:0.5) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
% Legend: a short left-pointing arrow labelled at its starting point.
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};
\end{scope}
\end{tikzpicture}
\end{center}

\item<2-> 能否将不同位置之间的词汇间信息传递的距离拉近为1?

\vspace{0.3em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Same sequence as in the first picture.
\node [anchor=west] (w0) at (0,-2) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$\ldots$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
% All arcs now start at w_m and go directly to w_1, w_2, the ellipsis and w_{m-1};
% arcs to farther targets use larger control distances so they nest.
\draw [->,thick,red] (w5.north).. controls +(100:0.85) and +(50:0.85) .. (w0.north);
\draw [->,thick,red] (w5.north).. controls +(110:0.75) and +(50:0.75) .. (w1.north);
\draw [->,thick,red] (w5.north).. controls +(120:0.6) and +(50:0.6) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};

\end{scope}
\end{tikzpicture}
\end{center}

\item<3-> \textbf{自注意力机制(Self-Attention)}可以很好地解决长距离依赖问题,增强信息抽取能力,在长距离语言建模任务上取得了很好的效果。
        \textbf{Attention Is All You Need}\\
        \textbf{Vaswani et al., 2017, In Proc. of Neural Information Processing Systems, 6000--6010}

\end{itemize}
\end{frame}
姜雨帆 committed
3921

姜雨帆 committed
3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932
%%%------------------------------------------------------------------------------------------------------------
%%% 自注意力机制(续)
\begin{frame}{自注意力机制(续)}
\begin{itemize}
\item 基于前面对注意力机制的介绍,自注意力机制则是将源语言每个位置的表示$\textbf{h}_{i}$看做query,同时将所有位置的表示看做{\color{ugreen} \textbf{key}}和{\color{red} \textbf{value}}

\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

姜雨帆 committed
3933
\tikzstyle{rnode} = [draw,minimum width=2.8em,minimum height=1.2em]
姜雨帆 committed
3934 3935

\node [rnode,anchor=south west,fill=red!20!white] (value1) at (0,0) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
姜雨帆 committed
3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960
\node [rnode,anchor=south west,fill=red!20!white] (value2) at ([xshift=0.8em]value1.south east) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value3) at ([xshift=0.8em]value2.south east) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value4) at ([xshift=0.8em]value3.south east) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (value5) at ([xshift=0.8em]value4.south east) {\scriptsize{$\textbf{h}(\textrm{``学''})$}};

\node [rnode,anchor=south west,fill=green!20!white] (key1) at ([yshift=0.3em]value1.north west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([yshift=0.3em]value2.north west) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([yshift=0.3em]value3.north west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([yshift=0.3em]value4.north west) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key5) at ([yshift=0.3em]value5.north west) {\scriptsize{$\textbf{h}(\textrm{``学''})$}};

\node [rnode,anchor=east] (query5) at ([xshift=-1em]key1.west) {\scriptsize{$\textbf{h}(\textrm{``学''})$}};

\node [anchor=south] (sep) at ([yshift=1em]query5.north) {\scriptsize{$\textbf{...}$}};

% Arrows from the query5 node to key1..key5. Each successive arrow starts 4pt further
% left on the query and uses a larger control distance so the arcs nest.
\foreach \keyno/\dx/\rise in {1/4pt/0.6em,2/0pt/1.0em,3/-4pt/1.4em,4/-8pt/1.8em,5/-12pt/2.2em}
{
    \draw [->] ([yshift=1pt,xshift=\dx]query5.north) .. controls +(90:\rise) and +(90:\rise) .. ([yshift=1pt]key\keyno.north);
}
% Weight labels alpha_1..alpha_5 placed just right of each key's top centre.
\foreach \keyno in {1,...,5}
{
    \node [anchor=south west] (alpha\keyno) at ([xshift=0.3em]key\keyno.north) {\scriptsize{$\alpha_\keyno$}};
}
姜雨帆 committed
3961 3962 3963

\node [rnode,anchor=south west,fill=green!20!white] (key6) at ([yshift=2em]key1.north west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key7) at ([yshift=2em]key2.north west) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
姜雨帆 committed
3964
\node [rnode,anchor=south west,fill=green!20!white] (key8) at ([yshift=2em]key3.north west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
姜雨帆 committed
3965 3966 3967
\node [rnode,anchor=south west,fill=green!20!white] (key9) at ([yshift=2em]key4.north west) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key10) at ([yshift=2em]key5.north west) {\scriptsize{$\textbf{h}(\textrm{``学''})$}};

姜雨帆 committed
3968
\node [rnode,anchor=east] (query3) at ([xshift=-1em]key6.west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
姜雨帆 committed
3969

姜雨帆 committed
3970
\node [anchor=south] (sep1) at ([yshift=1em]query3.north) {\scriptsize{$\textbf{...}$}};
姜雨帆 committed
3971

姜雨帆 committed
3972 3973 3974 3975 3976 3977 3978 3979 3980 3981
% Arrows from the query3 node to key6..key10, with the same nesting scheme:
% start point moves left in 4pt steps and the control distance grows per arrow.
\foreach \keyno/\dx/\rise in {6/4pt/0.6em,7/0pt/1.0em,8/-4pt/1.4em,9/-8pt/1.8em,10/-12pt/2.2em}
{
    \draw [->] ([yshift=1pt,xshift=\dx]query3.north) .. controls +(90:\rise) and +(90:\rise) .. ([yshift=1pt]key\keyno.north);
}
% Weight labels for this row: nodes alpha6..alpha10 display alpha_1..alpha_5.
\foreach \keyno/\wt in {6/1,7/2,8/3,9/4,10/5}
{
    \node [anchor=south west] (alpha\keyno) at ([xshift=0.3em]key\keyno.north) {\scriptsize{$\alpha_\wt$}};
}
姜雨帆 committed
3982

姜雨帆 committed
3983
\node [rnode,anchor=south west,fill=green!20!white] (key11) at ([yshift=2em]key6.north west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};
姜雨帆 committed
3984 3985 3986 3987 3988
\node [rnode,anchor=south west,fill=green!20!white] (key12) at ([yshift=2em]key7.north west) {\scriptsize{$\textbf{h}(\textrm{``什么''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key13) at ([yshift=2em]key8.north west) {\scriptsize{$\textbf{h}(\textrm{``也''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key14) at ([yshift=2em]key9.north west) {\scriptsize{$\textbf{h}(\textrm{``没''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key15) at ([yshift=2em]key10.north west) {\scriptsize{$\textbf{h}(\textrm{``学''})$}};

姜雨帆 committed
3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000
% Query node for this row, placed to the left of key11.
\node [rnode,anchor=east] (query1) at ([xshift=-1em]key11.west) {\scriptsize{$\textbf{h}(\textrm{``你''})$}};

% Arrows from query1 to key11..key15: start point moves left in 4pt steps and the
% control distance grows per arrow so the arcs nest.
\foreach \keyno/\dx/\rise in {11/4pt/0.6em,12/0pt/1.0em,13/-4pt/1.4em,14/-8pt/1.8em,15/-12pt/2.2em}
{
    \draw [->] ([yshift=1pt,xshift=\dx]query1.north) .. controls +(90:\rise) and +(90:\rise) .. ([yshift=1pt]key\keyno.north);
}
% Weight labels for this row: nodes alpha11..alpha15 display alpha_1..alpha_5.
\foreach \keyno/\wt in {11/1,12/2,13/3,14/4,15/5}
{
    \node [anchor=south west] (alpha\keyno) at ([xshift=0.3em]key\keyno.north) {\scriptsize{$\alpha_\wt$}};
}
姜雨帆 committed
4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011

\end{scope}
\end{tikzpicture}
\end{center}

\item<2->自注意力模型通过计算源语各位置的匹配程度对value进行加权求和,完成对源语信息的提取

\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
4012
\subsection{模型架构}
xiaotong committed
4013 4014

%%%------------------------------------------------------------------------------------------------------------
姜雨帆 committed
4015 4016 4017 4018 4019 4020 4021
\begin{frame}{Transformer 介绍}
\begin{itemize}

\item Transformer是Google在2017年提出的一个新型网络结构,完全基于注意力机制,取得了很好的成绩!

\item 通过自注意机制能够直接获取全局信息,不像RNN需要逐步进行信息提取,也不像CNN只能获取局部信息,可以并行化操作,提高训练效率

4022
\item<2-> Transformer不仅仅被用于神经机器翻译任务,还广泛用于其他NLP任务、甚至图像处理任务。目前最火的预训练模型BERT也基于Transformer
姜雨帆 committed
4023 4024 4025 4026 4027

\end{itemize}

\vspace{0em}

4028
\visible<2->{
姜雨帆 committed
4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052
{
    % Results table: BLEU on EN-DE / EN-FR and training cost in FLOPs, Transformer (big)
    % against three earlier systems. The outer one-column tabular stacks the data table
    % and a caption line beneath it.
    \footnotesize
    \begin{center}
        \setlength{\tabcolsep}{3pt}
        \renewcommand\arraystretch{1}
        \begin{tabular}{l}
            \begin{tabular}{lcccl}
                \specialrule{1pt}{1pt}{1pt}
                \multirow{2}{*}{\#} & \multicolumn{2}{c}{BLEU} & \multicolumn{2}{c}{ \multirow{2}{*}{Training Cost(FLOPs)}}\\
                \cline{2-3}
                & EN-DE & EN-FR & \\
                \specialrule{0.6pt}{1pt}{1pt}
                GNMT + RL & 24.6 & 39.92 & \multicolumn{2}{c}{ 1.4$\times10^{20}$} \\
                ConvS2S & 25.16 & 40.46 & \multicolumn{2}{c}{ 1.5$\times10^{20}$} \\
                MoE & 26.03 & 40.56 & \multicolumn{2}{c}{ 1.2$\times10^{20}$} \\
                \specialrule{0.6pt}{1pt}{1pt}
                Transformer (big) & \textbf{28.4} & \textbf{41.8} & \multicolumn{2}{c}{ 2.3$\times10^{19}$}\\
                \specialrule{1pt}{1pt}{1pt}
            \end{tabular}\\
            \addlinespace[-0.3ex]
            % Plain caption line. The previous "\footnote *..." took "*" as the footnote
            % argument, giving a stray footnote mark and a footnote containing only "*".
            *Transformer versus previous state-of-the-art models\\
        \end{tabular}
    \end{center}
}
4053 4054
}

姜雨帆 committed
4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{Transformer}
% Slide: overall Transformer structure, encoder column on the left and decoder column on the right.
%\begin{tcolorbox}
%[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
% colback=blue!10!white,colbacklower=black!5!white]

\begin{itemize}
\item Transformer 总体结构
\end{itemize}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Node styles, one per kind of box; on this slide every kind has its own fill colour.
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!10];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!30];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{standard} = [rounded corners=3pt]

% Encoder column: nodes are stacked bottom-up, each anchored to the one below.
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\tiny{$\textbf{编码器输入: 我  很  好}$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};

% Straight upward data-flow arrows inside the encoder column.
\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);


% Decoder column: placed 3em to the right of the encoder's self-attention node.
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I  am  fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I  am  fine $<$EOS$>$ }$}};

% Straight upward data-flow arrows inside the decoder column.
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);


% Bypass arrows: leave the main path below a sub-layer, run around it on the
% outer side and re-enter at the following "Add & LayerNorm" node.
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);

\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);

% Encoder output routed over to the decoder column
% (presumably entering the Encoder-Decoder Attention node -- confirm visually).
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);

% Dotted frames around the encoder stack (green) and the decoder stack (red).
\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};

\end{scope}
\end{tikzpicture}
\end{center}

%\end{tcolorbox}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{Transformer}
% Slide: same architecture diagram, with the input embedding / position boxes highlighted.
%\begin{tcolorbox}
%[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
% colback=blue!10!white,colbacklower=black!5!white]

\begin{itemize}
\item Transformer 输入和位置编码
\end{itemize}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Node styles: only the embedding and position boxes are filled on this slide.
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{standard} = [rounded corners=3pt]

% Encoder column: nodes are stacked bottom-up, each anchored to the one below.
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\tiny{$\textbf{编码器输入: 我  很  好}$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};

% Straight upward data-flow arrows inside the encoder column.
\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);


% Decoder column: placed 3em to the right of the encoder's self-attention node.
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I  am  fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I  am  fine $<$EOS$>$ }$}};

% Straight upward data-flow arrows inside the decoder column.
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);


% Bypass arrows: leave the main path below a sub-layer, run around it on the
% outer side and re-enter at the following "Add & LayerNorm" node.
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);

\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);

% Encoder output routed over to the decoder column
% (presumably entering the Encoder-Decoder Attention node -- confirm visually).
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);

%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};

% Highlight: filled boxes on the background layer behind each Embedding/Position pair.
\begin{pgfonlayer}{background}
    \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (input1) (pos1)] (box1) {};
    \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (input2) (pos2)] (box2) {};
\end{pgfonlayer}

\end{scope}
\end{tikzpicture}
\end{center}

%\end{tcolorbox}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{位置编码}
% Slide: motivation for position encoding. Two sentences with the words in a
% different order are drawn with the same set of attention weights, and the
% weighted sum below the picture is written out once for both.
\begin{itemize}
\item 自注意力机制与前面的循环神经网络相比,忽略了词之间的顺序关系,例如下面两个语义不同的句子,通过自注意力得到的表示却是相同的

%\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]

% Bottom row: first word order. key5 (unfilled) is the position attending to key1..key4.
\node [rnode,anchor=south west,fill=green!20!white] (key1) at (0,0) {\scriptsize{$\textbf{h}(\textrm{``沈阳''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key2) at ([xshift=1em]key1.south east) {\scriptsize{$\textbf{h}(\textrm{``到''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key3) at ([xshift=1em]key2.south east) {\scriptsize{$\textbf{h}(\textrm{``广州''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key4) at ([xshift=2em]key3.south east) {\scriptsize{$\textbf{h}(\textrm{``机票''})$}};
\node [rnode,anchor=south west] (key5) at ([xshift=1em]key4.south east) {\scriptsize{$\textbf{h}(\textrm{``机票''})$}};

\node [anchor=west] (sep1) at ([xshift=0.3em]key3.east) {\scriptsize{$\textbf{...}$}};

\draw [->] ([yshift=1pt,xshift=-3pt]key5.north) .. controls +(90:1em) and +(90:0.7em) .. ([yshift=1pt]key4.north);
\draw [->] ([yshift=1pt,xshift=0pt]key5.north) .. controls +(90:1.4em) and +(90:1.4em) .. ([yshift=1pt]key3.north);
\draw [->] ([yshift=1pt,xshift=3pt]key5.north) .. controls +(90:1.8em) and +(90:1.8em) .. ([yshift=1pt]key2.north);
\draw [->] ([yshift=1pt,xshift=6pt]key5.north) .. controls +(90:2.2em) and +(90:2.2em) .. ([yshift=1pt]key1.north);

\node [anchor=south west] (alpha1) at ([xshift=-1em]key1.north west) {\scriptsize{$\alpha_1=.2$}};
\node [anchor=south west] (alpha2) at ([xshift=-1em]key2.north west) {\scriptsize{$\alpha_2=.3$}};
\node [anchor=south west] (alpha3) at ([xshift=-1em]key3.north west) {\scriptsize{$\alpha_3=.1$}};
\node [anchor=south west] (alpha4) at ([xshift=-1em]key4.north west) {\scriptsize{$\alpha_4=.3$}};

% Top row: the two city names swapped; each node sits 2em above its bottom-row counterpart.
\node [rnode,anchor=south west,fill=green!20!white] (key6) at ([yshift=2em]key1.north west) {\scriptsize{$\textbf{h}(\textrm{``广州''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key7) at ([yshift=2em]key2.north west) {\scriptsize{$\textbf{h}(\textrm{``到''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key8) at ([yshift=2em]key3.north west) {\scriptsize{$\textbf{h}(\textrm{``沈阳''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (key9) at ([yshift=2em]key4.north west) {\scriptsize{$\textbf{h}(\textrm{``机票''})$}};
\node [rnode,anchor=south west] (key10) at ([yshift=2em]key5.north west) {\scriptsize{$\textbf{h}(\textrm{``机票''})$}};

% Distinct name (sep2) so the bottom-row ellipsis node sep1 is not redefined.
\node [anchor=west] (sep2) at ([xshift=0.3em]key8.east) {\scriptsize{$\textbf{...}$}};

\draw [->] ([yshift=1pt,xshift=-3pt]key10.north) .. controls +(90:1em) and +(90:0.7em) .. ([yshift=1pt]key9.north);
\draw [->] ([yshift=1pt,xshift=0pt]key10.north) .. controls +(90:1.4em) and +(90:1.4em) .. ([yshift=1pt]key8.north);
\draw [->] ([yshift=1pt,xshift=3pt]key10.north) .. controls +(90:1.8em) and +(90:1.8em) .. ([yshift=1pt]key7.north);
\draw [->] ([yshift=1pt,xshift=6pt]key10.north) .. controls +(90:2.2em) and +(90:2.2em) .. ([yshift=1pt]key6.north);

\node [anchor=south west] (alpha5) at ([xshift=-1em]key6.north west) {\scriptsize{$\alpha_1=.1$}};
\node [anchor=south west] (alpha6) at ([xshift=-1em]key7.north west) {\scriptsize{$\alpha_2=.3$}};
\node [anchor=south west] (alpha7) at ([xshift=-1em]key8.north west) {\scriptsize{$\alpha_3=.2$}};
\node [anchor=south west] (alpha8) at ([xshift=-1em]key9.north west) {\scriptsize{$\alpha_4=.3$}};

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-1.5em}
% Unnumbered two-line display; the empty groups {} keep binary-operator spacing
% around "=" and the trailing "+" at the alignment point / line break.
\begin{align*}
\textbf{C}(\textrm{``机票''}) ={} & 0.2 \times \textbf{h}(\textrm{``沈阳''}) + 0.3 \times \textbf{h}(\textrm{``到''}) + {} \\
 & 0.1 \times \textbf{h}(\textrm{``广州''}) + \dots + 0.3 \times \textbf{h}(\textrm{``机票''})
\end{align*}

\item<2->为了解决这个问题,引入了{\color{red} \textbf{位置编码}}

\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{位置编码(续)}
% Slide: sinusoidal position encoding formulas, then (from overlay 2) a picture
% of the encoding PE(k) being added to each word embedding e(.).
\begin{itemize}
\item 位置编码的计算方式有很多种,这里使用正余弦函数来编码。式中\textit{pos}代表第几个词,\textit{i}代表词嵌入中的第几维
\vspace{-0.6em}

% \sin / \cos are operators; PE, pos and model are multi-letter names, so they
% are wrapped in \mathrm / \mathit instead of being set as products of variables.
\begin{displaymath}
\mathrm{PE}_{(\mathit{pos},2i)} = \sin(\mathit{pos}/10000^{2i/d_{\mathrm{model}}})
\end{displaymath}
\vspace{-1em}
\begin{displaymath}
\mathrm{PE}_{(\mathit{pos},2i+1)} = \cos(\mathit{pos}/10000^{2i/d_{\mathrm{model}}})
\end{displaymath}

\visible<2->{
\item 将得到的位置编码加到原有的词向量中
%\vspace{-0.5em}
\vspace{1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnode} = [draw,minimum width=3.5em,minimum height=1.2em]

% Word embeddings e(.) for the five input tokens.
\node [rnode,anchor=south west,fill=red!20!white] (e1) at (0,0) {\scriptsize{$\textbf{e}(\textrm{``沈阳''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e2) at ([xshift=1em]e1.south east) {\scriptsize{$\textbf{e}(\textrm{``到''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e3) at ([xshift=1em]e2.south east) {\scriptsize{$\textbf{e}(\textrm{``广州''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e4) at ([xshift=1em]e3.south east) {\scriptsize{$\textbf{e}(\textrm{``的''})$}};
\node [rnode,anchor=south west,fill=red!20!white] (e5) at ([xshift=1em]e4.south east) {\scriptsize{$\textbf{e}(\textrm{``机票''})$}};

% Representations h(.), one above each embedding.
\node [rnode,anchor=south west,fill=green!20!white] (h1) at ([yshift=1.5em]e1.north west) {\scriptsize{$\textbf{h}(\textrm{``沈阳''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h2) at ([yshift=1.5em]e2.north west) {\scriptsize{$\textbf{h}(\textrm{``到''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h3) at ([yshift=1.5em]e3.north west) {\scriptsize{$\textbf{h}(\textrm{``广州''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h4) at ([yshift=1.5em]e4.north west) {\scriptsize{$\textbf{h}(\textrm{``的''})$}};
\node [rnode,anchor=south west,fill=green!20!white] (h5) at ([yshift=1.5em]e5.north west) {\scriptsize{$\textbf{h}(\textrm{``机票''})$}};

% "Plus" sign between each embedding and its position encoding.
\foreach \x in {1,...,5}{
	\node [anchor=north] (plus\x) at ([yshift=-0em]e\x.south) {\scriptsize{$\mathbf{\oplus}$}};
}

% Position encodings PE(k), one below each embedding.
\node [rnode,anchor=north,fill=yellow!20!white] (pos1) at ([yshift=-1.1em]e1.south) {\scriptsize{$\textbf{PE}(1)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos2) at ([yshift=-1.1em]e2.south) {\scriptsize{$\textbf{PE}(2)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos3) at ([yshift=-1.1em]e3.south) {\scriptsize{$\textbf{PE}(3)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos4) at ([yshift=-1.1em]e4.south) {\scriptsize{$\textbf{PE}(4)$}};
\node [rnode,anchor=north,fill=yellow!20!white] (pos5) at ([yshift=-1.1em]e5.south) {\scriptsize{$\textbf{PE}(5)$}};


% Dotted frame grouping each embedding with its position encoding.
\foreach \x in {1,...,5}{
	\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (e\x) (pos\x)] (box\x) {};
}

% Input words under each column; the labels match the words shown in e1..e5.
\node [anchor=north] (inputs1) at ([yshift=-1em]pos1.south) {\scriptsize{沈阳}};
\node [anchor=north] (inputs2) at ([yshift=-1em]pos2.south) {\scriptsize{到}};
\node [anchor=north] (inputs3) at ([yshift=-1em]pos3.south) {\scriptsize{广州}};
\node [anchor=north] (inputs4) at ([yshift=-1em]pos4.south) {\scriptsize{的}};
\node [anchor=north] (inputs5) at ([yshift=-1em]pos5.south) {\scriptsize{机票}};

% Every embedding feeds h3 (drawn in full); the other h nodes only show the
% straight arrow from their own column plus an ellipsis.
\draw [->] ([yshift=0.1em]e1.north) .. controls +(north:0.5) and +(south:0.5) .. ([xshift=-1em,yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e2.north) .. controls +(north:0.3) and +(south:0.6) .. ([xshift=-0.5em,yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e4.north) .. controls +(north:0.3) and +(south:0.6) .. ([xshift=0.5em,yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e5.north) .. controls +(north:0.5) and +(south:0.5) .. ([xshift=1em,yshift=-0.1em]h3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]h2.south);
\draw [->] ([yshift=0.1em]e4.north) -- ([yshift=-0.1em]h4.south);
\draw [->] ([yshift=0.1em]e5.north) -- ([yshift=-0.1em]h5.south);

\foreach \x in {1,...,5}{
	\draw [->] ([yshift=-0.1em]inputs\x.north) -- ([yshift=-0.2em]pos\x.south);
}


\node [anchor=north] (dot1) at ([xshift=0.4em,yshift=-0.2em]h1.south) {\tiny{...}};
\node [anchor=north] (dot2) at ([xshift=0.4em,yshift=-0.2em]h2.south) {\tiny{...}};
\node [anchor=north] (dot4) at ([xshift=-0.4em,yshift=-0.2em]h4.south) {\tiny{...}};
\node [anchor=north] (dot5) at ([xshift=-0.4em,yshift=-0.2em]h5.south) {\tiny{...}};

\end{scope}
\end{tikzpicture}
\end{center}
}
\end{itemize}
\end{frame}


%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{Transformer}
% Slide: same architecture diagram, with the three attention boxes highlighted.
%\begin{tcolorbox}
%[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
% colback=blue!10!white,colbacklower=black!5!white]

\begin{itemize}
\item Transformer 多头自注意力机制
\end{itemize}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Node styles: attention, embedding and position boxes are filled on this slide.
\tikzstyle{Sanode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20];
\tikzstyle{Resnode} = [minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{ffnnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{outputnode} = [minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw];
\tikzstyle{inputnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10];
\tikzstyle{posnode} = [minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white];
\tikzstyle{standard} = [rounded corners=3pt]

% Encoder column: nodes are stacked bottom-up, each anchored to the one below.
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\tiny{$\textbf{编码器输入: 我  很  好}$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};

% Straight upward data-flow arrows inside the encoder column.
\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);


% Decoder column: placed 3em to the right of the encoder's self-attention node.
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I  am  fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I  am  fine $<$EOS$>$ }$}};

% Straight upward data-flow arrows inside the decoder column.
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);


% Bypass arrows: leave the main path below a sub-layer, run around it on the
% outer side and re-enter at the following "Add & LayerNorm" node.
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);

\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);

% Encoder output routed over to the decoder column
% (presumably entering the Encoder-Decoder Attention node -- confirm visually).
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);

%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};

% Highlight: filled boxes on the background layer behind the three attention nodes.
\begin{pgfonlayer}{background}
    \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (sa1)] (box1) {};
    \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (sa2)] (box2) {};
    \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (ed1)] (box3) {};
\end{pgfonlayer}


\end{scope}
\end{tikzpicture}
\end{center}

%\end{tcolorbox}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{基于点乘的注意力机制}
% Slide: scaled dot-product attention. The formula is followed by a flow chart
% (MatMul -> Scale -> Mask -> SoftMax -> MatMul); explanatory call-outs are
% revealed one at a time on overlays 2 to 6.
\begin{itemize}
\item Transformer使用点乘的自注意力方法来捕获句子内部各个位置之间的相似性:
\vspace{-0.5em}
% "Mask" is a multi-letter name, so it is set as upright text like "softmax";
% \left( \right) sizes the parentheses to the enclosed fraction.
\begin{displaymath}
\textrm{Attention}(Q,K,V) = \textrm{softmax}\left(\frac{QK^{\mathrm{T}}}{\sqrt{d_k}}+\textrm{Mask}\right)V
\end{displaymath}

\vspace{-1em}

\begin{center}
\begin{tikzpicture}
\begin{scope}

% Flow chart nodes, stacked bottom-up starting from the first MatMul.
\node [anchor=south west,draw,inner sep=4pt,minimum width=4em,fill=blue!20!white] (MatMul) at (0,0) {\tiny{MatMul}};
\node [anchor=north] (Q1) at ([xshift=-1.4em,yshift=-1em]MatMul.south) {\footnotesize{$Q$}};
\node [anchor=north] (K1) at ([xshift=1.4em,yshift=-1em]MatMul.south) {\footnotesize{$K$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2.5em] (Scale3) at ([yshift=1em]MatMul.north) {\tiny{Scale}};
\node [anchor=south,draw,inner sep=4pt,fill=purple!20,minimum width=3.5em] (Mask) at ([yshift=0.8em]Scale3.north) {\tiny{Mask(opt.)}};
\node [anchor=south,draw,inner sep=4pt,fill=ugreen!20!white] (SoftMax) at ([yshift=1em]Mask.north) {\tiny{SoftMax}};
\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=blue!20!white] (MatMul1) at ([xshift=1.7em,yshift=1em]SoftMax.north) {\tiny{MatMul}};
\node [anchor=north] (V1) at ([xshift=2em]K1.north) {\footnotesize{$V$}};
\node [anchor=north] (null) at ([yshift=0.8em]MatMul1.north) {};

% Data-flow arrows. The V arrow uses a fixed 9.1em length to reach the upper MatMul.
\draw [->] ([yshift=0.1em]Q1.north) -- ([xshift=-1.4em,yshift=-0.1em]MatMul.south);
\draw [->] ([yshift=0.1em]K1.north) -- ([xshift=1.4em,yshift=-0.1em]MatMul.south);
\draw [->] ([yshift=0.1em]MatMul.north) -- ([yshift=-0.1em]Scale3.south);
\draw [->] ([yshift=0.1em]Scale3.north) -- ([yshift=-0.1em]Mask.south);
\draw [->] ([yshift=0.1em]Mask.north) -- ([yshift=-0.1em]SoftMax.south);
\draw [->] ([yshift=0.1em]SoftMax.north) -- ([yshift=0.9em]SoftMax.north);
\draw [->] ([yshift=0.1em]V1.north) -- ([yshift=9.1em]V1.north);
\draw [->] ([yshift=0.1em]MatMul1.north) -- ([yshift=0.8em]MatMul1.north);

% Call-out text, one group of lines per overlay; the boxes behind the text are
% drawn further down on the background layer.
\visible<2->{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=1em]MatMul.west) {\scriptsize{自注意力机制的Query}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{Key和Value均来自同一句子}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{编码-解码注意力机制}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{与前面讲的一样}};
}
\visible<3->{
\node [anchor=west] (line11) at ([xshift=3em,yshift=0em]MatMul.east) {\scriptsize{Query和Key的转置}};
\node [anchor=north west] (line12) at ([yshift=0.3em]line11.south west) {\scriptsize{进行点积,得到句子内部}};
\node [anchor=north west] (line13) at ([yshift=0.3em]line12.south west) {\scriptsize{各个位置的相关性}};
}

\visible<4->{
\node [anchor=west] (line21) at ([yshift=5em]line11.west) {\scriptsize{相关性矩阵在训练中}};
\node [anchor=north west] (line22) at ([yshift=0.3em]line21.south west) {\scriptsize{方差变大,不利于训练}};
\node [anchor=north west] (line23) at ([yshift=0.3em]line22.south west) {\scriptsize{所以对其进行缩放}};
}

\visible<5->{
\node [anchor=west] (line31) at ([yshift=6em]line1.west) {\scriptsize{在编码端,对句子补齐}};
\node [anchor=north west] (line32) at ([yshift=0.3em]line31.south west) {\scriptsize{填充的部分进行屏蔽}};
\node [anchor=north west] (line33) at ([yshift=0.3em]line32.south west) {\scriptsize{解码时看不到未来的信息}};
\node [anchor=north west] (line34) at ([yshift=0.3em]line33.south west) {\scriptsize{需要对未来的信息进行屏蔽}};
}

\visible<6->{
\node [anchor=west] (line41) at ([yshift=4em]line21.west) {\scriptsize{用归一化的相关性打分}};
\node [anchor=north west] (line42) at ([yshift=0.3em]line41.south west) {\scriptsize{对Value进行加权求和}};
}

% Background layer: a shaded box fitted around each call-out plus a dotted
% pointer arrow, shown on the same overlays as the corresponding text.
\begin{pgfonlayer}{background}
\visible<2->{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (Q1) (K1) (V1)] (box0) {};
\draw [->,dotted,very thick,ugreen] ([yshift=-1.5em,xshift=0.8em]box1.east) -- ([yshift=-1.5em,xshift=0.1em]box1.east);
}
\visible<3->{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!20!white,drop shadow,draw=blue] [fit = (line11) (line12) (line13)] (box2) {};
\draw [->,dotted,very thick,blue] ([yshift=1em,xshift=-2.8em]box2.west) -- ([yshift=1em,xshift=-0.1em]box2.west);
}

\visible<4->{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=yellow!20,drop shadow,draw=black] [fit = (line21) (line22) (line23)] (box3) {};
\draw [->,dotted,very thick,black] ([xshift=0.1em]Scale3.east) .. controls +(east:1) and +(west:1) .. ([yshift=1.0em]box3.west) ;
}

\visible<5->{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line31) (line32) (line33) (line34)] (box4) {};
\draw [->,dotted,very thick,red] ([yshift=-1.5em,xshift=1.5em]box4.east) -- ([yshift=-1.5em,xshift=0.1em]box4.east);
}

\visible<6->{
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!20!white,drop shadow,draw=blue] [fit = (line41) (line42)] (box5) {};
\draw [->,dotted,very thick,blue] ([yshift=-0.3em,xshift=-1em]box5.west) -- ([yshift=-0.3em,xshift=-0.1em]box5.west);
}
\end{pgfonlayer}


\end{scope}
\end{tikzpicture}
\end{center}

\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Worked example of masking
\begin{frame}{Mask}
% Slide outline:
%   - why a mask is needed
%   - the mask used in self-attention
%   - the encoder-decoder mask (could also get its own slide)
\begin{itemize}
\item 对于源语和目标语的输入,由于需要进行batch处理,有些部分是填充的(Padding),需要用Mask进行屏蔽
\item 对于解码器来说,由于在预测的时候是自左向右进行的,为了保持{\color{red} \textbf{训练解码一致}},需要对未来信息进行屏蔽
\end{itemize}
%%% Figure: the running example
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Styles local to this picture.
\tikzset{
  attnode/.style={minimum size=1.5em,inner sep=0pt,rounded corners=1pt,draw},
  srcnode/.style={rotate=45,font=\small,anchor=south west},
  tgtnode/.style={left,font=\small,anchor=north east},
  masknode/.style={minimum size=5.8em,inner sep=0pt,rounded corners=1pt,draw},
  elementnode/.style={rectangle,text=white,anchor=center}
}
% Grid unit: neighbouring cells of the weight matrices are 0.5\hnode apart.
\setlength{\hnode}{1.0cm}
%\node [anchor=west,attnode] (node1) at (0,0) {\tiny{}};
%\node [anchor=west,attnode] (node2) at ([xshift=1em]node1.east) {\tiny{}};

% Slide 2 onwards: the full 6x6 weight matrix (left) and the formula above it.
\visible<2->{
  % Each triple is column/row/weight; the weight scales the side of the blue square.
  \foreach \i / \j / \c in
      {0/5/0.25, 1/5/0.15, 2/5/0.15, 3/5/0.35, 4/5/0.25, 5/5/0.15,
      0/4/0.15, 1/4/0.25, 2/4/0.2, 3/4/0.30, 4/4/0.15, 5/4/0.15,
      0/3/0.15, 1/3/0.15, 2/3/0.5, 3/3/0.25, 4/3/0.15, 5/3/0.25,
      0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15, 4/2/0.25, 5/2/0.3,
      0/1/0.25, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.5, 5/1/0.15,
      0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.40}
      \node[elementnode,minimum size=0.6*\hnode*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*\hnode*\i-5.4*0.5*\hnode,0.5*\hnode*\j-1.05*\hnode) {};

  % Column labels (rotated 45 degrees), one per matrix column.
  \node[srcnode] (src1) at (-5.4*0.5*\hnode,-1.05*\hnode+5.5*0.5*\hnode) {\scriptsize{Have}};
  \node[srcnode] (src2) at ([xshift=0.5\hnode]src1.south west) {\scriptsize{you}};
  \node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
  \node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
  \node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
  \node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};

  % Row labels, one per matrix row, top to bottom.
  \node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+5.5*0.5*\hnode) {\scriptsize{Have}};
  \node[tgtnode] (tgt2) at ([yshift=-0.5\hnode]tgt1.north east) {\scriptsize{you}};
  \node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{learned}};
  \node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{nothing}};
  \node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{?}};
  \node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{EOS}};

  % Formula "Softmax( QK^T/sqrt(d_k) + Mask )", assembled from separate nodes
  % so that single parts can be re-drawn with a highlight on later slides.
  \node [rounded corners=0.3em,fill=yellow!30] (qk) at ([xshift=2.5em,yshift=5em]a55.north) {\large{$\frac{QK^{T}}{\sqrt{d_k}}$}};
  \node [rounded corners=0.3em,anchor=west] (add) at ([xshift=0.1em]qk.east) {\large{+}};
  \node [rounded corners=0.3em,anchor=west] (mask) at ([xshift=0.1em]add.east) {\large{$\mathrm{Mask}$}};
  \node [rounded corners=0.3em,anchor=east] (left) at ([xshift=-0em]qk.west) {\large{$($}};
  \node [rounded corners=0.3em,anchor=west] (right) at ([xshift=0em]mask.east) {\large{$)$}};
  \node [rounded corners=0.3em,anchor=west] (softmax) at ([xshift=-6em]left.east) {\large{Softmax}};
}

% Slide 3 onwards: cover the upper-right triangle of the matrix and highlight "Mask".
\visible<3->{
  \filldraw [fill=blue!20,draw,thick,fill opacity=0.85] ([xshift=-0.9em,yshift=0.5em]a15.north west) -- ([xshift=0.5em,yshift=-0.9em]a51.south east) -- ([xshift=0.5em,yshift=0.5em]a55.north east) -- cycle;
  \node[anchor=west] (labelmask) at ([xshift=0.3em,yshift=0.5em]a23.north east) {Masked};
  % Re-draw the Mask term at the same position, now with a background fill.
  \node [rounded corners=0.3em,anchor=west,fill=blue!20] (mask) at ([xshift=0.1em]add.east) {\large{$\mathrm{Mask}$}};
}

% Slide 4 onwards: second matrix (right) holding only the entries outside the
% covered triangle, and the Softmax term highlighted.
\visible<4->{
  \foreach \i / \j / \c in
      {0/5/0.25,
      0/4/0.15, 1/4/0.25,
      0/3/0.15, 1/3/0.15, 2/3/0.5,
      0/2/0.15, 1/2/0.15, 2/2/0.15, 3/2/0.15,
      0/1/0.25, 1/1/0.15, 2/1/0.15, 3/1/0.15, 4/1/0.5,
      0/0/0.15, 1/0/0.15, 2/0/0.15, 3/0/0.15, 4/0/0.25, 5/0/0.40}
      \node[elementnode,minimum size=0.6*\hnode*\c,inner sep=0.1pt,fill=blue] (a\i\j) at (0.5*\hnode*\i+6*0.5*\hnode,0.5*\hnode*\j-1.05*\hnode) {};

  % Column labels of the second matrix.
  \node[srcnode] (src1) at (6*0.5*\hnode,-1.05*\hnode+5.5*0.5*\hnode) {\scriptsize{Have}};
  \node[srcnode] (src2) at ([xshift=0.5\hnode]src1.south west) {\scriptsize{you}};
  \node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
  \node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
  \node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
  \node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};

  % Row labels of the second matrix.
  \node[tgtnode] (tgt1) at (5.4*0.5*\hnode,-1.05*\hnode+5.5*0.5*\hnode) {\scriptsize{Have}};
  \node[tgtnode] (tgt2) at ([yshift=-0.5\hnode]tgt1.north east) {\scriptsize{you}};
  \node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{learned}};
  \node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{nothing}};
  \node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{?}};
  \node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{EOS}};

  % Re-draw the Softmax term at the same position, now with a background fill.
  \node [rounded corners=0.3em,anchor=west,fill=green!30] (softmax) at ([xshift=-6em]left.east) {\large{Softmax}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Multi-head attention: the two defining formulas and an annotated block diagram
\begin{frame}{多头自注意力模型}
\begin{itemize}
\item Transformer首次提出了多头注意力机制,将输入的Query、Key、Value沿着隐层维度切分为$h$个子集,分别进行注意力操作,取得了很好的效果
\vspace{-0.5em}
% Function names are upright; the indices stay ordinary math italics so that
% the h in the formula matches the $h$ used in the running text.
\begin{displaymath}
\mathrm{MultiHead}(Q,K,V) = \mathrm{Concat}(\mathrm{head}_1,\ldots,\mathrm{head}_h)W^O
\end{displaymath}
\begin{displaymath}
\mathrm{where}\ \mathrm{head}_i = \mathrm{Attention}(Q{W_i}^Q,K{W_i}^K,V{W_i}^V)
\end{displaymath}

\vspace{0.3em}

\begin{center}
\begin{tikzpicture}
\begin{scope}

% Every "stack" below is three copies of the same box, each shifted by
% (-0.2em,-0.2em) and drawn back (lightest border) to front (default border).

% Linear stack fed by Q
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear0) at (0,0) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\tiny{Linear}};
\node [anchor=north] (Q) at ([xshift=0em,yshift=-1em]Linear02.south) {\footnotesize{$Q$}};

% Linear stack fed by K
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear1) at ([xshift=1.5em]Linear0.east) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\tiny{Linear}};
\node [anchor=north] (K) at ([xshift=0em,yshift=-1em]Linear12.south) {\footnotesize{$K$}};

% Linear stack fed by V
\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear2) at ([xshift=1.5em]Linear1.east) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\tiny{Linear}};
\node [anchor=north] (V) at ([xshift=0em,yshift=-1em]Linear22.south) {\footnotesize{$V$}};

% Attention stack, centred above the K column
\node [anchor=south,draw=black!30,minimum width=9em,inner sep=4pt,fill=blue!20!white] (Scale) at ([yshift=1em]Linear1.north) {\tiny{Scaled Dot-Product Attention}};
\node [anchor=south west,draw=black!50,minimum width=9em,fill=blue!20!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\tiny{Scaled Dot-Product Attention}};
\node [anchor=south west,fill=blue!20!white,draw,minimum width=9em,inner sep=4pt] (Scale2) at ([shift={(-0.2em,-0.2em)}]Scale1.south west) {\tiny{Scaled Dot-Product Attention}};

\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=yellow!30] (Concat) at ([yshift=1em]Scale2.north) {\tiny{Concat}};

\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=ugreen!20!white] (Linear) at ([yshift=1em]Concat.north) {\tiny{Linear}};


% Inputs fan out into the three layers of each Linear stack: one arrow to the
% front box plus two lighter, arrowless lines towards the boxes behind it.
\draw [->] ([yshift=0.1em]Q.north) -- ([yshift=-0.1em]Linear02.south);
\draw [-,draw=black!50] ([yshift=0.1em]Q.north) -- ([xshift=0.2em,yshift=-0.1em]Linear02.south);
\draw [-,draw=black!30] ([yshift=0.1em]Q.north) -- ([xshift=0.4em,yshift=-0.1em]Linear02.south);

\draw [->] ([yshift=0.1em]K.north) -- ([yshift=-0.1em]Linear12.south);
\draw [-,draw=black!50] ([yshift=0.1em]K.north) -- ([xshift=0.2em,yshift=-0.1em]Linear12.south);
\draw [-,draw=black!30] ([yshift=0.1em]K.north) -- ([xshift=0.4em,yshift=-0.1em]Linear12.south);

\draw [->] ([yshift=0.1em]V.north) -- ([yshift=-0.1em]Linear22.south);
\draw [-,draw=black!50] ([yshift=0.1em]V.north) -- ([xshift=0.2em,yshift=-0.1em]Linear22.south);
\draw [-,draw=black!30] ([yshift=0.1em]V.north) -- ([xshift=0.4em,yshift=-0.1em]Linear22.south);

% Linear stacks -> attention stack (front layer gets the arrow head)
\draw [->] ([yshift=0em]Linear02.north) -- ([yshift=1em]Linear02.north);
\draw [-,draw=black!50] ([yshift=0em]Linear01.north) -- ([yshift=0.8em]Linear01.north);
\draw [-,draw=black!30] ([yshift=0em]Linear0.north) -- ([yshift=0.6em]Linear0.north);

\draw [->] ([yshift=0em]Linear12.north) -- ([yshift=1em]Linear12.north);
\draw [-,draw=black!50] ([yshift=0em]Linear11.north) -- ([yshift=0.8em]Linear11.north);
\draw [-,draw=black!30] ([yshift=0em]Linear1.north) -- ([yshift=0.6em]Linear1.north);

\draw [->] ([yshift=0em]Linear22.north) -- ([yshift=1em]Linear22.north);
\draw [-,draw=black!50] ([yshift=0em]Linear21.north) -- ([yshift=0.8em]Linear21.north);
\draw [-,draw=black!30] ([yshift=0em]Linear2.north) -- ([yshift=0.6em]Linear2.north);

% Attention stack -> Concat -> output Linear -> out
\draw [->] ([yshift=0em]Scale2.north) -- ([yshift=0em]Concat.south);
\draw [-,draw=black!50] ([yshift=0em]Scale1.north) -- ([yshift=0.8em]Scale1.north);
\draw [-,draw=black!30] ([yshift=0em]Scale.north) -- ([yshift=0.6em]Scale.north);

\draw [->] ([yshift=0em]Concat.north) -- ([yshift=0em]Linear.south);
\draw [->] ([yshift=0em]Linear.north) -- ([yshift=1em]Linear.north);

% Slide 2 onwards: step-by-step description, placed left of the diagram.
\visible<2->{
  \node [anchor=east] (line1) at ([xshift=-3.5em,yshift=6.5em]Linear0.west) {\scriptsize{将输入沿着隐层维度分为$h$个子集}};
  \node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{首先对不同的子集分别进行线性变换}};
  \node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{然后执行$h$次基于点乘的注意力操作}};
  \node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{最后将注意力操作的输出连接到一起}};
  \node [anchor=north west] (line5) at ([yshift=0.3em]line4.south west) {\scriptsize{$h$通常设置为8}};
}

% Slide 3 onwards: the motivation, in a second box below the first.
\visible<3->{
  \node [anchor=north west] (line6) at ([yshift=-1.5em]line5.south west) {\scriptsize{这样做是希望模型在不同的子空间中}};
  \node [anchor=north west] (line7) at ([yshift=0.3em]line6.south west) {\scriptsize{学习到更丰富的信息}};
}

% Coloured boxes behind the two text groups.
\begin{pgfonlayer}{background}
  \visible<2->{
    \node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line1) (line2) (line3) (line4) (line5)] (box1) {};
  }
  \visible<3->{
    \node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line6) (line7)] (box2) {};
  }
\end{pgfonlayer}

\end{scope}
\end{tikzpicture}
\end{center}

\end{itemize}
\end{frame}


%%%------------------------------------------------------------------------------------------------------------
%%% Architecture overview with the "Add & LayerNorm" blocks highlighted
\begin{frame}{Transformer}
%\begin{tcolorbox}
%[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
% colback=blue!10!white,colbacklower=black!5!white]

\begin{itemize}
\item Transformer 残差和层正则化
\end{itemize}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Box styles for the different kinds of blocks in the diagram.
\tikzset{
  Sanode/.style={minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20},
  Resnode/.style={minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20},
  ffnnode/.style={minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw},
  outputnode/.style={minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw},
  inputnode/.style={minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10},
  posnode/.style={minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white},
  standard/.style={rounded corners=3pt}
}

% ---- Encoder column (left), built bottom-up from sa1 ----
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\tiny{$\textbf{编码器输入: 我  很  好}$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};

\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);


% ---- Decoder column (right), built bottom-up from sa2 ----
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I  am  fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I  am  fine $<$EOS$>$ }$}};

\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);


% Skip connections: branch off below a sub-layer, run up beside it and end
% at the side of the Add & LayerNorm block above (left side for the encoder).
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);

% Same for the decoder, routed along its right side.
\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);

% From the top of the encoder column over to the decoder column.
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);

%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};

% Highlight: filled boxes behind res1..res4 (this slide's topic).
\begin{pgfonlayer}{background}
  \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (res1)] (box1) {};
  \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (res2)] (box2) {};
  \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (res3)] (box3) {};
  \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (res4)] (box4) {};
\end{pgfonlayer}

\end{scope}
\end{tikzpicture}
\end{center}

%\end{tcolorbox}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Residual connections: a plain chain of sub-layers vs. the same chain with skip connections
\begin{frame}{残差\&层正则化}
\begin{itemize}
\item 在Transformer中,编码器、解码器分别由6层网络组成,每层网络又包含多个子层(自注意力网络、前馈神经网络)。Transformer实际上是一个很深的网络结构,在训练过程中容易出现梯度消失的情况

\vspace{1.5em}
% Picture 1: chain without skip connections. Black arrows run left to right;
% the red arrows run right to left and get paler from sub-layer n down to 1.
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzset{lnode/.style={minimum height=1.5em,minimum width=3em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20}}


\node [lnode,anchor=west] (l1) at (0,0) {\scriptsize{子层1}};
\node [lnode,anchor=west] (l2) at ([xshift=3em]l1.east) {\scriptsize{子层2}};
\node [lnode,anchor=west] (l3) at ([xshift=3em]l2.east) {\scriptsize{子层3}};
\node [anchor=west,inner sep=2pt] (dot1) at ([xshift=1em]l3.east) {\scriptsize{$\cdots$}};
\node [lnode,anchor=west] (l4) at ([xshift=1em]dot1.east) {\scriptsize{子层$n$}};

\draw [->,thick] ([xshift=-1.5em]l1.west) -- ([xshift=-0.1em]l1.west);
\draw [->,thick] ([xshift=0.1em]l1.east) -- ([xshift=-0.1em]l2.west);
\draw [->,thick] ([xshift=0.1em]l2.east) -- ([xshift=-0.1em]l3.west);
\draw [->,thick] ([xshift=0.1em]l3.east) -- ([xshift=-0.1em]dot1.west);
\draw [->,thick] ([xshift=0.1em]dot1.east) -- ([xshift=-0.1em]l4.west);
\draw [->,thick] ([xshift=0.1em]l4.east) -- ([xshift=1.5em]l4.east);

\draw [->,very thick,red] ([xshift=1.5em,yshift=-0.3em]l4.east) -- ([xshift=0.1em,yshift=-0.3em]l4.east);
\draw [->,very thick,red!80] ([xshift=-0.1em,yshift=-0.3em]l4.west) -- ([xshift=0.1em,yshift=-0.3em]dot1.east);
\draw [->,very thick,red!60] ([xshift=-0.1em,yshift=-0.3em]dot1.west) -- ([xshift=0.1em,yshift=-0.3em]l3.east);
\draw [->,very thick,red!40] ([xshift=-0.1em,yshift=-0.3em]l3.west) -- ([xshift=0.1em,yshift=-0.3em]l2.east);
\draw [->,very thick,red!15] ([xshift=-0.1em,yshift=-0.3em]l2.west) -- ([xshift=0.1em,yshift=-0.3em]l1.east);

\end{scope}
\end{tikzpicture}
\end{center}
\vspace{0.5em}
\item<2-> 在这里引入了在图像领域用来训练深层网络的技术,\\{\color{red} \textbf{残差网络}}来避免上述问题
%\vspace{0.5em}

\begin{displaymath}
x_{l+1} = x_l+\mathcal{F}(x_l)
\end{displaymath}
\vspace{-0.8em}
% Picture 2: same chain with an \oplus after sub-layers 1 and 2 and a skip
% connection around each of them; the red arrows keep full colour throughout.
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzset{
  lnode/.style={minimum height=1.5em,minimum width=3em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20},
  standard/.style={rounded corners=3pt}
}

\node [lnode,anchor=west] (l1) at (0,0) {\scriptsize{子层1}};
\node [lnode,anchor=west] (l2) at ([xshift=3em]l1.east) {\scriptsize{子层2}};
\node [lnode,anchor=west] (l3) at ([xshift=3em]l2.east) {\scriptsize{子层3}};
\node [anchor=west,inner sep=2pt] (dot1) at ([xshift=1em]l3.east) {\scriptsize{$\cdots$}};
\node [lnode,anchor=west] (l4) at ([xshift=1em]dot1.east) {\scriptsize{子层$n$}};

\node [anchor=west] (plus1) at ([xshift=0.9em]l1.east) {\scriptsize{$\mathbf{\oplus}$}};
\node [anchor=west] (plus2) at ([xshift=0.9em]l2.east) {\scriptsize{$\mathbf{\oplus}$}};

% Main path, passing through the two \oplus nodes
\draw [->,thick] ([xshift=-1.5em]l1.west) -- ([xshift=-0.1em]l1.west);
\draw [->,thick] ([xshift=0.1em]l1.east) -- ([xshift=0.2em]plus1.west);
\draw [->,thick] ([xshift=-0.2em]plus1.east) -- ([xshift=-0.1em]l2.west);
\draw [->,thick] ([xshift=0.1em]l2.east) -- ([xshift=0.2em]plus2.west);
\draw [->,thick] ([xshift=-0.2em]plus2.east) -- ([xshift=-0.1em]l3.west);
\draw [->,thick] ([xshift=0.1em]l3.east) -- ([xshift=-0.1em]dot1.west);
\draw [->,thick] ([xshift=0.1em]dot1.east) -- ([xshift=-0.1em]l4.west);
\draw [->,thick] ([xshift=0.1em]l4.east) -- ([xshift=1.5em]l4.east);

% Skip connections: from in front of a sub-layer, over its top, down into the \oplus
\draw[->,standard,thick] ([xshift=-0.8em]l1.west) -- ([xshift=-0.8em,yshift=2em]l1.west) -- ([yshift=2em]plus1.center) -- ([yshift=-0.2em]plus1.north);
\draw[->,standard,thick] ([xshift=-0.8em]l2.west) -- ([xshift=-0.8em,yshift=2em]l2.west) -- ([yshift=2em]plus2.center) -- ([yshift=-0.2em]plus2.north);

% Red arrows: straight through the right part of the chain, then back along the skip connections
\draw [->,very thick,red] ([xshift=1.5em,yshift=-0.3em]l4.east) -- ([xshift=0.1em,yshift=-0.3em]l4.east);
\draw [->,very thick,red] ([xshift=-0.1em,yshift=-0.3em]l4.west) -- ([xshift=0.1em,yshift=-0.3em]dot1.east);
\draw [->,very thick,red] ([xshift=-0.1em,yshift=-0.3em]dot1.west) -- ([xshift=0.1em,yshift=-0.3em]l3.east);
\draw[->,standard,very thick,red] ([xshift=-0.3em,yshift=-0.2em]plus2.north) -- ([xshift=-0.3em,yshift=1.8em]plus2.center) -- ([xshift=-0.5em,yshift=1.8em]l2.west) -- ([xshift=-0.5em,yshift=0.2em]l2.west);
\draw[->,standard,very thick,red] ([xshift=-0.3em,yshift=-0.2em]plus1.north) -- ([xshift=-0.3em,yshift=1.8em]plus1.center) -- ([xshift=-0.5em,yshift=1.8em]l1.west) -- ([xshift=-0.5em,yshift=0.2em]l1.west);

% Legend: black arrow = forward computation, red arrow = back-propagation
\node [anchor=west] (label1) at ([xshift=1em,yshift=1.5em]l3.north) {\tiny{前向计算}};
\draw [->,thick] ([xshift=-1.5em]label1.west) -- ([xshift=-0.1em]label1.west);

\node [anchor=west] (label2) at ([xshift=2.5em]label1.east) {\tiny{反向传播}};
\draw [->,thick,red] ([xshift=-1.5em]label2.west) -- ([xshift=-0.1em]label2.west);

\end{scope}
\end{tikzpicture}
\end{center}

\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Layer normalisation: the LN formula, then the two placements side by side
\begin{frame}{残差\&层正则化(续)}
\begin{itemize}
\item 在Transformer的训练过程中,由于引入了残差操作,将前面所有层的输出加到一起。这样会导致高层的参数分布不断变大,造成训练过程不稳定、训练时间较长。
为了避免这种情况,在每层中加入了层正则化操作

    \begin{itemize}
    \item 使用均值和方差对样本进行平移缩放,将数据规范化为均值为0,方差为1的标准分布
    \end{itemize}
\vspace{-0.3em}

% Operator name upright, delimiters and argument in ordinary math mode.
\begin{displaymath}
\mathrm{LN}(x) = g\cdot\frac{x-\mu}{\sigma}+b
\end{displaymath}

\item<2->在Transformer中经常使用的层正则化操作有两种,分别是后正则化和前正则化
%\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzset{
  lnode/.style={minimum height=1.5em,minimum width=3em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20},
  standard/.style={rounded corners=3pt}
}

% Left pair (l1, l2): sub-layer -> \oplus -> layer norm.
% Right pair (l3, l4): layer norm -> sub-layer -> \oplus.
\node [lnode,anchor=west] (l1) at (0,0) {\scriptsize{子层$n$}};
\node [lnode,anchor=west] (l2) at ([xshift=3em]l1.east) {\scriptsize{层正则化}};
\node [lnode,anchor=west] (l3) at ([xshift=4em]l2.east) {\scriptsize{层正则化}};
\node [lnode,anchor=west] (l4) at ([xshift=1.5em]l3.east) {\scriptsize{子层$n$}};

\node [anchor=west] (plus1) at ([xshift=0.9em]l1.east) {\scriptsize{$\mathbf{\oplus}$}};
\node [anchor=west] (plus2) at ([xshift=0.9em]l4.east) {\scriptsize{$\mathbf{\oplus}$}};

% Captions under the two variants
\node [anchor=north] (label1) at ([xshift=3em,yshift=-0.5em]l1.south) {\scriptsize{后正则化}};
\node [anchor=north] (label2) at ([xshift=3em,yshift=-0.5em]l3.south) {\scriptsize{前正则化}};

% Main path of the left variant
\draw [->,thick] ([xshift=-1.5em]l1.west) -- ([xshift=-0.1em]l1.west);
\draw [->,thick] ([xshift=0.1em]l1.east) -- ([xshift=0.2em]plus1.west);
\draw [->,thick] ([xshift=-0.2em]plus1.east) -- ([xshift=-0.1em]l2.west);
\draw [->,thick] ([xshift=0.1em]l2.east) -- ([xshift=1em]l2.east);
% Main path of the right variant
\draw [->,thick] ([xshift=-1.5em]l3.west) -- ([xshift=-0.1em]l3.west);
\draw [->,thick] ([xshift=0.1em]l3.east) -- ([xshift=-0.1em]l4.west);
\draw [->,thick] ([xshift=0.1em]l4.east) -- ([xshift=0.2em]plus2.west);
\draw [->,thick] ([xshift=-0.2em]plus2.east) -- ([xshift=1em]plus2.east);

% Skip connections: from the input of each variant, over the top, into its \oplus
\draw[->,standard,thick] ([xshift=-0.8em]l1.west) -- ([xshift=-0.8em,yshift=2em]l1.west) -- ([yshift=2em]plus1.center) -- ([yshift=-0.2em]plus1.north);
\draw[->,standard,thick] ([xshift=-0.8em]l3.west) -- ([xshift=-0.8em,yshift=2em]l3.west) -- ([yshift=2em]plus2.center) -- ([yshift=-0.2em]plus2.north);

\end{scope}
\end{tikzpicture}
\end{center}

\end{itemize}
\end{frame}


%%%------------------------------------------------------------------------------------------------------------
%%% Architecture overview with the feed-forward blocks highlighted
\begin{frame}{Transformer}
%\begin{tcolorbox}
%[bicolor,sidebyside,righthand width=4.5cm,size=title,frame engine=empty,
% colback=blue!10!white,colbacklower=black!5!white]

\begin{itemize}
\item Transformer 前馈全连接网络
\end{itemize}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
% Box styles for the different kinds of blocks; ffnnode carries a blue fill here.
\tikzset{
  Sanode/.style={minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=orange!20},
  Resnode/.style={minimum height=1.1em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=yellow!20},
  ffnnode/.style={minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw,fill=blue!20},
  outputnode/.style={minimum height=1.4em,minimum width=7em,inner sep=3pt,rounded corners=1.5pt,draw},
  inputnode/.style={minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=red!10},
  posnode/.style={minimum height=1.4em,minimum width=3.5em,inner sep=3pt,rounded corners=1.5pt,draw,fill=black!5!white},
  standard/.style={rounded corners=3pt}
}

% ---- Encoder column (left), built bottom-up from sa1 ----
\node [Sanode,anchor=west] (sa1) at (0,0) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res1) at ([yshift=0.3em]sa1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn1) at ([yshift=1em]res1.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res2) at ([yshift=0.3em]ffn1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [inputnode,anchor=north west] (input1) at ([yshift=-1em]sa1.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos1) at ([yshift=-1em]sa1.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (inputs) at ([yshift=-3em]sa1.south) {\tiny{$\textbf{编码器输入: 我  很  好}$}};
\node [anchor=south] (encoder) at ([xshift=0.2em,yshift=0.6em]res2.north west) {\scriptsize{\textbf{编码器}}};

\draw [->] (sa1.north) -- (res1.south);
\draw [->] (res1.north) -- (ffn1.south);
\draw [->] (ffn1.north) -- (res2.south);
\draw [->] ([yshift=-1em]sa1.south) -- (sa1.south);
\draw [->] ([yshift=-0.3em]inputs.north) -- ([yshift=0.6em]inputs.north);


% ---- Decoder column (right), built bottom-up from sa2 ----
\node [Sanode,anchor=west] (sa2) at ([xshift=3em]sa1.east) {\tiny{$\textbf{Self-Attention}$}};
\node [Resnode,anchor=south] (res3) at ([yshift=0.3em]sa2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [Sanode,anchor=south] (ed1) at ([yshift=1em]res3.north) {\tiny{$\textbf{Encoder-Decoder Attention}$}};
\node [Resnode,anchor=south] (res4) at ([yshift=0.3em]ed1.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [ffnnode,anchor=south] (ffn2) at ([yshift=1em]res4.north) {\tiny{$\textbf{Feed Forward Network}$}};
\node [Resnode,anchor=south] (res5) at ([yshift=0.3em]ffn2.north) {\tiny{$\textbf{Add \& LayerNorm}$}};
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I  am  fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I  am  fine $<$EOS$>$ }$}};

\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
\draw [->] (ed1.north) -- (res4.south);
\draw [->] (res4.north) -- (ffn2.south);
\draw [->] (ffn2.north) -- (res5.south);
\draw [->] (res5.north) -- (o1.south);
\draw [->] (o1.north) -- ([yshift=0.5em]o1.north);
\draw [->] ([yshift=-1em]sa2.south) -- (sa2.south);
\draw [->] ([yshift=-0.3em]outputs.north) -- ([yshift=0.6em]outputs.north);


% Skip connections: branch off below a sub-layer, run up beside it and end
% at the side of the Add & LayerNorm block above (left side for the encoder).
\draw[->,standard] ([yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=-0.5em]sa1.south) -- ([xshift=-4em,yshift=2.3em]sa1.south) -- ([xshift=-3.5em,yshift=2.3em]sa1.south);
\draw[->,standard] ([yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=0.5em]res1.north) -- ([xshift=-4em,yshift=3.3em]res1.north) -- ([xshift=-3.5em,yshift=3.3em]res1.north);

% Same for the decoder, routed along its right side.
\draw[->,standard] ([yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=-0.5em]sa2.south) -- ([xshift=4em,yshift=2.3em]sa2.south) -- ([xshift=3.5em,yshift=2.3em]sa2.south);
\draw[->,standard] ([yshift=0.5em]res3.north) -- ([xshift=4em,yshift=0.5em]res3.north) -- ([xshift=4em,yshift=3.3em]res3.north) -- ([xshift=3.5em,yshift=3.3em]res3.north);
\draw[->,standard] ([yshift=0.5em]res4.north) -- ([xshift=4em,yshift=0.5em]res4.north) -- ([xshift=4em,yshift=3.3em]res4.north) -- ([xshift=3.5em,yshift=3.3em]res4.north);

% From the top of the encoder column over to the decoder column.
\draw[->,standard] (res2.north) -- ([yshift=0.5em]res2.north) -- ([xshift=5em,yshift=0.5em]res2.north) -- ([xshift=5em,yshift=-2.2em]res2.north) -- ([xshift=6.5em,yshift=-2.2em]res2.north);

%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
%\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};

% Highlight: filled boxes behind ffn1 and ffn2 (this slide's topic).
\begin{pgfonlayer}{background}
  \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (ffn1)] (box1) {};
  \node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,fill=red!40] [fit = (ffn2)] (box2) {};
\end{pgfonlayer}


\end{scope}
\end{tikzpicture}
\end{center}

%\end{tcolorbox}
\end{frame}

%%------------------------------------------------------------------------------------------------------------
\begin{frame}{前馈全连接网络}
\begin{itemize}
\item 在每层中,除了注意力操作,还包含了一个全连接的前馈神经网络,网络中包含两次线性变换和一次非线性变换(ReLU激活函数),每层的前馈神经网络参数不共享
\vspace{0.5em}
% Feed-forward sub-layer: two affine maps with max(0, .) (ReLU) in between.
% \max is the math operator (upright, operator spacing); \mathrm keeps the name FFN in the math font.
\begin{displaymath}
\mathrm{FFN}(x) = \max(0,xW_1+b_1)W_2+b_2
\end{displaymath}

\vspace{0em}

\item 通常情况下,注意力部分的隐层维度为512,FFN部分的隐层维度设置为2048

\vspace{-0.5 em}

\begin{center}
\begin{tikzpicture}
\begin{scope}

% Circle style shared by every neuron of the sketch.
\tikzstyle{neuronnode} = [minimum size=1.2em,circle,draw,ublue,very thick,inner sep=1pt, fill=white,align=center,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}];

% Bottom row: neuron01, neuron02.
\node [neuronnode,anchor=west] (neuron01) at (0,0) {};
\node [neuronnode,anchor=west] (neuron02) at ([xshift=1.5em]neuron01.east) {};
% Middle row, left to right: neuron14, neuron11, neuron12, neuron13.
\node [neuronnode,anchor=south] (neuron11) at ([yshift=1.5em]neuron01.north) {};
\node [neuronnode,anchor=south] (neuron12) at ([yshift=1.5em]neuron02.north) {};
\node [neuronnode,anchor=west] (neuron13) at ([xshift=1.5em]neuron12.east) {};
\node [neuronnode,anchor=east] (neuron14) at ([xshift=-1.5em]neuron11.west) {};
% Top row: neuron21, neuron22.
\node [neuronnode,anchor=south] (neuron21) at ([yshift=1.5em]neuron11.north) {};
\node [neuronnode,anchor=south] (neuron22) at ([yshift=1.5em]neuron12.north) {};
% Ellipsis placed right of neuron11 (presumably hinting at more units in the middle row).
\node [anchor=west] (dot1) at ([xshift=0.2em]neuron11.east) {\scriptsize{...}};

%% layer 1: short arrows entering the bottom row (the empty label node is kept as in the original)
\foreach \n in {1,2}{
    \draw [->] ([yshift=-0.8em]neuron0\n.south) -- ([yshift=-0.1em]neuron0\n.south) node [pos=0,below] {};
}

%% layer 2: every bottom neuron connects to every middle neuron
\foreach \n in {1,2}{
    \foreach \m in {1,...,4}{
        \draw [->] (neuron0\n.north) -- (neuron1\m.south);
    }
}

%% layer 3: short arrows leaving the top row ([<-] because the path is written top-down)
\foreach \n in {1,2}{
    \draw [<-] ([yshift=0.8em]neuron2\n.north) -- ([yshift=0.0em]neuron2\n.north) node [pos=0,above] {};
}

% Every middle neuron connects to every top neuron.
\foreach \n in {1,...,4}{
    \foreach \m in {1,2}{
        \draw [->] (neuron1\n.north) -- (neuron2\m.south);
    }
}

% Six-line caption to the left of the network; each line hangs off the south-west corner of the previous one.
\node [anchor=east] (line1) at ([xshift=-3.5em,yshift=2.5em]neuron14.west) {\scriptsize{全连接网络的作用主要体现在}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{将经过注意力操作之后的表示}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{映射到更大的网络空间中}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{提升了网络模型的表示能力}};
\node [anchor=north west] (line5) at ([yshift=0.3em]line4.south west) {\scriptsize{实验证明,去掉全连接网络}};
\node [anchor=north west] (line6) at ([yshift=0.3em]line5.south west) {\scriptsize{会对模型的性能造成影响}};

\begin{pgfonlayer}{background}
% Blue backdrop behind the network.
\node [rectangle,inner sep=0.2em,fill=blue!20] [fit = (neuron01) (neuron14) (neuron13) (neuron22)] (ffn) {};
% Caption box: fit every caption line so the box stays correct if any line is edited.
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line1) (line2) (line3) (line4) (line5) (line6)] (box1) {};
\end{pgfonlayer}

\end{scope}
\end{tikzpicture}
\end{center}

\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{训练}
\begin{frame}{训练}
\begin{itemize}
\item Transformer一个主要的改进就是可以进行并行化训练。由于之前的RNN是基于时序进行训练,只有在前一时刻训练结束才能进行当前时刻的训练,而Transformer将任意时刻的输入信息之间的距离拉近为1,因此可以并行化训练,提高训练效率

\vspace{0.5em}

\end{itemize}

\vspace{-0.8em}
\visible<2->{
%\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
\tikzstyle{lossnode} = [minimum height=1.1em,minimum width=6em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];

\node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\textbf{h}_1$}};
\node [rnnnode,anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{$\textbf{h}_2$}};
\node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\textbf{h}_3$}};
\node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e2) at ([xshift=1em]e1.east) {\tiny{$e_x()$}};
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
\node [anchor=south] (dot1) at ([xshift=0.4em,yshift=-0.7em]h1.south) {\tiny{...}};
\node [anchor=south] (dot2) at ([xshift=-0.4em,yshift=-0.7em]h3.south) {\tiny{...}};

\draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] (w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]h2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
\draw [->] ([xshift=0.2em,yshift=0.1em]e1.north) .. controls +(north:0.3) and +(south:0.4) .. ([xshift=-0.3em,yshift=-0.1em]h2.south);
\draw [->] ([xshift=-0.2em,yshift=0.1em]e3.north) .. controls +(north:0.3) and +(south:0.4) .. ([xshift=0.3em,yshift=-0.1em]h2.south);
\node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};

\visible<3->{
\node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]e3.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.5em]t1.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.5em]t2.east) {\tiny{$e_y()$}};
\node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.5em]t3.east) {\tiny{$e_y()$}};
}
\visible<3->{
\node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
\node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$}};
\node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$}};
\node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\textbf{s}_4$}};
\node [anchor=south] (dot3) at ([xshift=-0.4em,yshift=-0.7em]s3.south) {\tiny{...}};
\node [anchor=south] (dot4) at ([xshift=-0.4em,yshift=-0.7em]s4.south) {\tiny{...}};
}

\visible<4->{
\node [rnnnode,anchor=south] (f1) at ([yshift=1em]s1.north) {\tiny{$\textbf{f}_1$}};
\node [rnnnode,anchor=south] (f2) at ([yshift=1em]s2.north) {\tiny{$\textbf{f}_2$}};
\node [rnnnode,anchor=south] (f3) at ([yshift=1em]s3.north) {\tiny{$\textbf{f}_3$}};
\node [rnnnode,anchor=south] (f4) at ([yshift=1em]s4.north) {\tiny{$\textbf{f}_4$}};
\node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]f1.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]f2.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]f3.north) {\tiny{softmax}};
\node [rnnnode,anchor=south,fill=blue!20] (o4) at ([yshift=1em]f4.north) {\tiny{softmax}};
\node [anchor=east] (decoder) at ([xshift=-0.3em,yshift=0.5em]o1.north west) {\scriptsize{\textbf{解码器}}};
\node [anchor=south,fill=black!5!white,minimum height=1.1em,minimum width=13em,inner sep=2pt,rounded corners=1pt,draw] (loss) at ([xshift=1.8em,yshift=1em]o2.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
}
\visible<3->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{How}};
\node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{are}};
\node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.8em]t4.south) {\tiny{you}};
}

\visible<3->{
% Per decoder column: arrow into the embedding box t\x, then from t\x up to state s\x.
\foreach \x in {1,2,3,4}{
    \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
    \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
}
% Curved arrow from t1 into s2. It does not depend on \x, so it is drawn once here
% instead of being redrawn on every loop iteration.
\draw [->] ([xshift=0.2em,yshift=0.1em]t1.north) .. controls +(north:0.3) and +(south:0.3) .. ([xshift=-0.3em,yshift=-0.1em]s2.south);
}

\visible<4->{
\foreach \x in {1,2,3,4}{
    \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]f\x.south);
    \draw [->] ([yshift=0.1em]f\x.north) -- ([yshift=-0.1em]o\x.south);
    \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north);
}
}

\visible<4->{
\node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c1) at ([yshift=2em]h2.north) {\tiny{$\textbf{C}_1$}};
\node [anchor=south] (c1label) at (c1.north) {\tiny{\textbf{编码-解码注意力机制:上下文}}};
\draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c1.250);
\draw [->] (h2.north) .. controls +(north:0.6) and +(270:0.9) .. (c1.270);
\draw [->] (h3.north) .. controls +(north:0.6) and +(290:0.9) .. (c1.290);
\draw [->] ([yshift=0.3em]s1.west) .. controls +(west:1) and +(east:1) .. (c1.-30);
\draw [->] (c1.0) .. controls +(east:1) and +(west:1) .. ([yshift=0em]f1.west);
}

\visible<4->{
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c2) at ([yshift=-2em]t1.south) {\tiny{$\textbf{C}_2$}};
\draw [->] ([xshift=-0.7em]c2.west) -- ([xshift=-0.1em]c2.west);
\draw [->] ([xshift=0.1em]c2.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f2.west);
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\textbf{C}_3$}};
\draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
\draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f3.west);
\node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\textbf{C}_4$}};
\draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
\draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f4.west);
}

\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
5266 5267 5268 5269 5270
%%% Transformer所使用的优化器
\begin{frame}{优化器}
% Adam
% 学习率调整
\begin{itemize}
姜雨帆 committed
5271 5272
\item \textbf{优化器}:使用Adam优化器,$\beta_1=0.9$,$\beta_2=0.98$,$\epsilon=10^{-9}$
\item \textbf{学习率}:关于学习率的设置,引入了warmup策略,在训练初期,学习率从一个较小的初始值逐渐增大,当到达一定的步数,学习率再逐渐减小
xiaotong committed
5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303
    % Learning-rate schedule: min() selects the linearly growing term while step < warmup_steps
    % and the step^{-0.5} decay afterwards. Multi-letter names are wrapped in \mathit so they
    % are set as single identifiers rather than products of letters.
    \begin{displaymath}
    \mathit{lrate}=d_{\mathrm{model}}^{-0.5}\cdot \min(\mathit{step}^{-0.5},\mathit{step}\cdot \mathrm{warmup\_steps}^{-1.5})
    \end{displaymath}
    这样做可以减缓在训练初期的不稳定现象,保持分布平稳,warmup\_steps通常设置为4000

\vspace{0.5em}

% Learning-rate curve shown on the first overlay only.
\only<1>{
\begin{figure}
  \centering
  \begin{tikzpicture}
    \footnotesize{
      \begin{axis}[
      width=.60\textwidth,
      height=.40\textwidth,
      legend style={at={(0.60,0.08)}, anchor=south west},
      % x is in thousands of updates: the plotted peak (4, 0.7) is step 4000 with
      % lrate 0.7e-3, which is what the schedule gives for d_model=512, warmup_steps=4000.
      xlabel={\footnotesize{num update (1k)}},
      % \scriptsize is a switch, so it is confined to its own group around the exponent.
      ylabel={\footnotesize{Learning rate  ({\scriptsize $10^{-3}$})}},
      ylabel style={yshift=-1em},xlabel style={yshift=0.0em},
      yticklabel style={/pgf/number format/precision=2,/pgf/number format/fixed zerofill},
      ymin=0,ymax=0.9, ytick={0.2, 0.4, 0.6, 0.8},
      xmin=0,xmax=12,xtick={2,4,6,8,10},
      legend style={yshift=-6pt, legend plot pos=right,font=\scriptsize,cells={anchor=west}}
      ]
      % Linear warmup up to x=4, then hand-sampled points of the decay branch.
      \addplot[orange,line width=1.25pt] coordinates {(0,0) (4,0.7) (5,0.63) (6,0.57) (7,0.525) (8,0.49) (9,0.465) (10,0.44) (11,0.42) (12,0.4)};
      \end{axis}
     }
  \end{tikzpicture}
  \caption{}\label{}
\end{figure}
}
姜雨帆 committed
5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{训练配置}
\begin{itemize}
\item \textbf{Mini Batch训练}:选择批量的数据作为训练样本,来计算损失函数,提高训练效率
    \begin{itemize}
    \item Mini Batch大小通常设置为2048/4096(token数)
    \item 通常对句子长度进行排序,选取长度相近的句子组成一个batch,可以减少padding数量,提高训练效率
    \end{itemize}
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
xiaotong committed
5318

姜雨帆 committed
5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342
% snode: green-filled bar (presumably one sentence); pnode: unfilled bar (presumably its padding).
\tikzstyle{snode} = [draw,inner sep=1pt,minimum width=3em,minimum height=0.5em,rounded corners=1pt,fill=green!30!white]
\tikzstyle{pnode} = [draw,inner sep=1pt,minimum width=1em,minimum height=0.5em,rounded corners=1pt]

% Left group ("Shuffle"): three bars of widths 3em, 6.3em and 2em stacked top to bottom.
\node [anchor=west,snode] (s1) at (0,0) {\tiny{}};
\node [anchor=north west,snode,minimum width=6.3em] (s2) at ([yshift=-0.3em]s1.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=2em] (s3) at ([yshift=-0.3em]s2.south west) {\tiny{}};

\node [anchor=east] (label1) at ([xshift=-0.8em,yshift=0.6em]s1.west) {\scriptsize{Shuffle:}};

% Unfilled bars appended to the two shorter rows (3em after s1, 4em after s3).
\node [anchor=west,pnode,minimum width=3em] (p1) at ([xshift=0.3em]s1.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=4em] (p3) at ([xshift=0.3em]s3.east) {\tiny{}};

% Right group ("Sorted"): bars of widths 5em, 5em and 6.3em, needing only 1em unfilled bars.
\node [anchor=west,snode,minimum width=5em] (s4) at ([xshift=6em]p1.east) {\tiny{}};
\node [anchor=north west,snode,minimum width=5em] (s5) at ([yshift=-0.3em]s4.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=6.3em] (s6) at ([yshift=-0.3em]s5.south west) {\tiny{}};

\node [anchor=east] (label2) at ([xshift=-0.8em,yshift=0.6em]s4.west) {\scriptsize{Sorted:}};

\node [anchor=west,pnode,minimum width=1em] (p4) at ([xshift=0.3em]s4.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=1em] (p5) at ([xshift=0.3em]s5.east) {\tiny{}};

% Dotted frames around each group; distinct names so the second no longer redefines the first.
\node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (s1) (s3) (p1) (p3)] (box0) {};
\node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (s4) (s6) (p4) (p5)] (box1) {};
\end{tikzpicture}
\end{center}
xiaotong committed
5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389

\item<2-> \textbf{Dropout }:为了防止网络训练过拟合,加入了Dropout操作。在四个地方用到了Dropout,词嵌入和位置编码、残差连接、注意力操作和前馈神经网络。Drop率通常设置为0.1

\item<3-> \textbf{标签平滑}:学习一个较平滑的目标,可以提升泛化能力,防止过拟合 :)
\end{itemize}
\vspace{-0.8em}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{训练配置(续)}
\begin{itemize}
\item \textbf{Transformer Base}:标准的Transformer结构,解码器编码器均包含6层,隐层维度为512,前馈神经网络维度为2048,多头注意力机制为8头,Dropout设为0.1

\item \textbf{Transformer Big}:为了提升网络的表示能力,在Base的基础上增大隐层维度至1024,前馈神经网络的维度变为4096,多头注意力机制为16头,Dropout设为0.3

\item \textbf{Transformer Deep}:加深编码器网络层数可以进一步提升网络的性能,但简单堆叠网络层数会出现梯度消失问题,导致训练无法收敛。需要使用DLCL、正则化前作等方法来训练更深的网络。
\end{itemize}
\vspace{-0.8em}


% Results table: BLEU on EN-DE / EN-FR and parameter counts for the three configurations.
% The outer braces keep \footnotesize and the tabular spacing changes local to this table.
{
    \footnotesize
    \begin{center}
        \setlength{\tabcolsep}{3pt}
        \renewcommand\arraystretch{1}
        % Outer one-column tabular wraps the real table; the commented-out \footnote row
        % below suggests it was meant to carry a note line (NOTE(review): confirm, else flatten).
        \begin{tabular}{l}
            % Five columns: system name, two BLEU columns, and two columns merged for "params".
            \begin{tabular}{lcccl}
                \specialrule{1pt}{1pt}{1pt}
                % Two-row header: "#" and "params" span both rows, "BLEU" spans columns 2-3.
                \multirow{2}{*}{\#} & \multicolumn{2}{c}{BLEU} & \multicolumn{2}{c}{ \multirow{2}{*}{params}}\\
                \cline{2-3}
                & EN-DE & EN-FR & \\
                \specialrule{0.6pt}{1pt}{1pt}
                Transformer Base & 27.3 & 38.1 & \multicolumn{2}{c}{ 65$\times10^{6}$} \\
                Transformer Big & 28.4 & 41.8 & \multicolumn{2}{c}{ 213$\times10^{6}$} \\
                Transformer Deep(48层) & 30.2 & 43.1 & \multicolumn{2}{c}{ 194$\times10^{6}$} \\
                \specialrule{1pt}{1pt}{1pt}
            \end{tabular}\\
            % Negative space added after the inner table's row of the outer tabular.
            \addlinespace[-0.3ex]
            %\footnote \\
        \end{tabular}
    \end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
姜雨帆 committed
5390 5391 5392 5393 5394 5395
\subsection{推断}
\begin{frame}{推断}
\begin{itemize}

\item 由于自回归性,Transformer在推断阶段无法进行并行化操作,导致推断速度非常慢!

xiaotong committed
5396
\item<2-> 加速手段:低精度、Cache(缓存需要重复计算的变量) 、Average Attention Network、Shared Attention Network
姜雨帆 committed
5397 5398 5399 5400 5401 5402 5403

\end{itemize}

\vspace{-1.5em}
\visible<3->{
%\vspace{-0.5em}
\begin{center}
姜雨帆 committed
5404 5405 5406
    \begin{tikzpicture}
    \begin{scope}
    \tikzstyle{rnnnode} = [minimum height=1.1em,minimum width=2.1em,inner sep=2pt,rounded corners=1pt,draw,fill=red!20];
姜雨帆 committed
5407

姜雨帆 committed
5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418
    \node [rnnnode,anchor=west] (h1) at (0,0) {\tiny{$\textbf{h}_1$}};
    \node [rnnnode,anchor=west] (h2) at ([xshift=1em]h1.east) {\tiny{$\textbf{h}_2$}};
    \node [rnnnode,anchor=west] (h3) at ([xshift=1em]h2.east) {\tiny{$\textbf{h}_3$}};
    \node [rnnnode,anchor=north,fill=green!20] (e1) at ([yshift=-1em]h1.south) {\tiny{$e_x()$}};
    \node [rnnnode,anchor=west,fill=green!20] (e2) at ([xshift=1em]e1.east) {\tiny{$e_x()$}};
    \node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
    \node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
    \node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
    \node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
    %\node [anchor=south] (dot1) at ([xshift=0.4em,yshift=-0.7em]h1.south) {\tiny{...}};
    %\node [anchor=south] (dot2) at ([xshift=-0.4em,yshift=-0.7em]h3.south) {\tiny{...}};
姜雨帆 committed
5419

姜雨帆 committed
5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431
    \draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
    \draw [->] (w2.north) -- ([yshift=-0.1em]e2.south);
    \draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
    \draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h1.south);
    \draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]h2.south);
    \draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]h3.south);
    \draw [->] ([xshift=0.2em,yshift=0.1em]e1.north) .. controls +(north:0.3) and +(south:0.4) .. ([xshift=-0.3em,yshift=-0.1em]h2.south);
    \draw [->] ([xshift=-0.2em,yshift=0.1em]e3.north) .. controls +(north:0.3) and +(south:0.4) .. ([xshift=0.3em,yshift=-0.1em]h2.south);
    \draw [->] ([xshift=0.4em,yshift=-0.4em]h1.south) -- ([xshift=0.3em,yshift=-0.1em]h1.south);
    \draw [->] ([xshift=0.8em,yshift=-0.4em]h1.south) -- ([xshift=0.6em,yshift=-0.1em]h1.south);
    \draw [->] ([xshift=-0.4em,yshift=-0.4em]h3.south) -- ([xshift=-0.3em,yshift=-0.1em]h3.south);
    \draw [->] ([xshift=-0.8em,yshift=-0.4em]h3.south) -- ([xshift=-0.6em,yshift=-0.1em]h3.south);
姜雨帆 committed
5432

姜雨帆 committed
5433
    \node [anchor=south] (encoder) at ([xshift=-0.2em]h1.north west) {\scriptsize{\textbf{编码器}}};
姜雨帆 committed
5434

姜雨帆 committed
5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502
    \visible<5->{
    \node [rnnnode,anchor=west,fill=green!20] (t1) at ([xshift=3em]e3.east) {\tiny{$e_y()$}};
    }
    \visible<7->{
    \node [rnnnode,anchor=west,fill=green!20] (t2) at ([xshift=1.5em]t1.east) {\tiny{$e_y()$}};
    }
    \visible<8->{
    \node [rnnnode,anchor=west,fill=green!20] (t3) at ([xshift=1.5em]t2.east) {\tiny{$e_y()$}};
    \node [rnnnode,anchor=west,fill=green!20] (t4) at ([xshift=1.5em]t3.east) {\tiny{$e_y()$}};
    %\node [anchor=west,inner sep=2pt] (t5) at ([xshift=0.3em]t4.east) {\tiny{...}};
    }
    \visible<5->{
    \node [rnnnode,anchor=south] (s1) at ([yshift=1em]t1.north) {\tiny{$\textbf{s}_1$}};
    \node [rnnnode,anchor=south] (f1) at ([yshift=1em]s1.north) {\tiny{$\textbf{f}_1$}};
    }
    \visible<7->{
    \node [rnnnode,anchor=south] (s2) at ([yshift=1em]t2.north) {\tiny{$\textbf{s}_2$}};
    \node [rnnnode,anchor=south] (f2) at ([yshift=1em]s2.north) {\tiny{$\textbf{f}_2$}};
    }
    \visible<8->{
    \node [rnnnode,anchor=south] (s3) at ([yshift=1em]t3.north) {\tiny{$\textbf{s}_3$}};
    \node [rnnnode,anchor=south] (f3) at ([yshift=1em]s3.north) {\tiny{$\textbf{f}_3$}};
    \node [rnnnode,anchor=south] (s4) at ([yshift=1em]t4.north) {\tiny{$\textbf{s}_4$}};
    \node [rnnnode,anchor=south] (f4) at ([yshift=1em]s4.north) {\tiny{$\textbf{f}_4$}};
    %\node [anchor=west,inner sep=2pt] (s5) at ([xshift=0.3em]s4.east) {\tiny{...}};
    %\node [anchor=south] (dot3) at ([xshift=-0.4em,yshift=-0.7em]s3.south) {\tiny{...}};
    \node [anchor=south] (dot4) at ([xshift=-0.4em,yshift=-0.7em]s4.south) {\tiny{...}};
    }
    \visible<5->{
    \node [rnnnode,anchor=south,fill=blue!20] (o1) at ([yshift=1em]f1.north) {\tiny{softmax}};
    \node [anchor=east] (decoder) at ([xshift=-0.3em,yshift=0.5em]o1.north west) {\scriptsize{\textbf{解码器}}};
    }
    \visible<7->{
    \node [rnnnode,anchor=south,fill=blue!20] (o2) at ([yshift=1em]f2.north) {\tiny{softmax}};
    }
    \visible<8->{
    \node [rnnnode,anchor=south,fill=blue!20] (o3) at ([yshift=1em]f3.north) {\tiny{softmax}};
    \node [rnnnode,anchor=south,fill=blue!20] (o4) at ([yshift=1em]f4.north) {\tiny{softmax}};
    %\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
    }
    \visible<4->{
    \node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
    }
    \visible<6->{
    \node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{How}};
    }
    \visible<8->{
    \node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{are}};
    \node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.8em]t4.south) {\tiny{you}};
    }
    \visible<5->{
    \node [anchor=center,inner sep=2pt] (wo1) at ([yshift=1.2em]o1.north) {\tiny{How}};
    }
    \visible<4->{
    \node [anchor=south,inner sep=2pt] (wos1) at (wo1.north) {\tiny{\textbf{[step 1]}}};
    }
    \visible<7->{
    \node [anchor=center,inner sep=2pt] (wo2) at ([yshift=1.2em]o2.north) {\tiny{are}};
    }
    \visible<6->{
    \node [anchor=south,inner sep=2pt] (wos2) at (wo2.north) {\tiny{\textbf{[step 2]}}};
    }
    \visible<8->{
    \node [anchor=center,inner sep=2pt] (wo3) at ([yshift=1.2em]o3.north) {\tiny{you}};
    \node [anchor=south,inner sep=2pt] (wos3) at (wo3.north) {\tiny{\textbf{[step 3]}}};
    \node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{EOS}};
    \node [anchor=south,inner sep=2pt] (wos4) at (wo4.north) {\tiny{\textbf{[step 4]}}};
    }
姜雨帆 committed
5503

姜雨帆 committed
5504 5505 5506 5507 5508 5509 5510 5511 5512
    \visible<5->{
    % Arrows of the first decoder column, written out explicitly:
    % into t1, then t1 -> s1 -> f1 -> o1, and the output arrow labelled "top1".
    \draw [->] ([yshift=-0.7em]t1.south) -- ([yshift=-0.1em]t1.south);
    \draw [->] ([yshift=0.1em]t1.north) -- ([yshift=-0.1em]s1.south);
    \draw [->] ([yshift=0.1em]s1.north) -- ([yshift=-0.1em]f1.south);
    \draw [->] ([yshift=0.1em]f1.north) -- ([yshift=-0.1em]o1.south);
    \draw [->] ([yshift=0.1em]o1.north) -- ([yshift=0.8em]o1.north) node [pos=0.5,right] {\tiny{top1}};
    }
姜雨帆 committed
5513

姜雨帆 committed
5514 5515 5516 5517 5518 5519 5520 5521 5522 5523
    \visible<7->{
    % Arrows of the second decoder column, written out explicitly:
    % into t2, then t2 -> s2 -> f2 -> o2, and the output arrow labelled "top1".
    \draw [->] ([yshift=-0.7em]t2.south) -- ([yshift=-0.1em]t2.south);
    \draw [->] ([yshift=0.1em]t2.north) -- ([yshift=-0.1em]s2.south);
    \draw [->] ([yshift=0.1em]s2.north) -- ([yshift=-0.1em]f2.south);
    \draw [->] ([yshift=0.1em]f2.north) -- ([yshift=-0.1em]o2.south);
    \draw [->] ([yshift=0.1em]o2.north) -- ([yshift=0.8em]o2.north) node [pos=0.5,right] {\tiny{top1}};
    % Curved arrow from the first embedding box t1 into s2.
    \draw [->] ([xshift=0.2em,yshift=0.1em]t1.north) .. controls +(north:0.3) and +(south:0.3) .. ([xshift=-0.3em,yshift=-0.1em]s2.south);
    }
姜雨帆 committed
5524

姜雨帆 committed
5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537
    \visible<8->{
    % Per-column arrows for decoder columns 3 and 4:
    % into t\x, then t\x -> s\x -> f\x -> o\x, and the output arrow labelled "top1".
    \foreach \x in {3,4}{
        \draw [->] ([yshift=-0.7em]t\x.south) -- ([yshift=-0.1em]t\x.south);
        \draw [->] ([yshift=0.1em]t\x.north) -- ([yshift=-0.1em]s\x.south);
        \draw [->] ([yshift=0.1em]s\x.north) -- ([yshift=-0.1em]f\x.south);
        \draw [->] ([yshift=0.1em]f\x.north) -- ([yshift=-0.1em]o\x.south);
        \draw [->] ([yshift=0.1em]o\x.north) -- ([yshift=0.8em]o\x.north) node [pos=0.5,right] {\tiny{top1}};
    }
    %\draw [->] ([xshift=0.4em,yshift=0.1em]t1.north) .. controls +(north:0.25) and +(south:0.3) .. ([xshift=-0.6em,yshift=-0.1em]s3.south);
    %\draw [->] ([xshift=0.2em,yshift=0.1em]t2.north) .. controls +(north:0.2) and +(south:0.4) .. ([xshift=-0.3em,yshift=-0.1em]s3.south);
    % Two short stubs entering s3 from the lower left. They do not depend on \x,
    % so they are drawn once here instead of once per loop iteration.
    \draw [->] ([xshift=-0.6em,yshift=-0.5em]s3.south) .. controls +(north:0) and +(south:0.2) .. ([xshift=-0.3em,yshift=-0.1em]s3.south);
    \draw [->] ([xshift=-1.5em,yshift=-0.5em]s3.south) .. controls +(north:0) and +(south:0.15) .. ([xshift=-0.6em,yshift=-0.1em]s3.south);
    }
姜雨帆 committed
5538

姜雨帆 committed
5539 5540 5541 5542 5543 5544 5545
    \visible<6->{
    \draw [->,thick,dotted] (wo1.east) .. controls +(east:1.0) and +(west:1.0) ..(wt2.west);
    }
    \visible<8->{
    \draw [->,thick,dotted] (wo2.east) .. controls +(east:1.3) and +(west:1.1) ..(wt3.west);
    \draw [->,thick,dotted] (wo3.east) .. controls +(east:1.1) and +(west:0.9) ..(wt4.west);
    }
姜雨帆 committed
5546

姜雨帆 committed
5547 5548 5549 5550 5551 5552 5553 5554 5555
    \visible<5->{
    \node [circle,draw,anchor=south,inner sep=3pt,fill=orange!20] (c1) at ([yshift=2em]h2.north) {\tiny{$\textbf{C}_1$}};
    \node [anchor=south] (c1label) at (c1.north) {\tiny{\textbf{编码-解码注意力机制:上下文}}};
    \draw [->] (h1.north) .. controls +(north:0.6) and +(250:0.9) .. (c1.250);
    \draw [->] (h2.north) .. controls +(north:0.6) and +(270:0.9) .. (c1.270);
    \draw [->] (h3.north) .. controls +(north:0.6) and +(290:0.9) .. (c1.290);
    \draw [->] ([yshift=0.3em]s1.west) .. controls +(west:1) and +(east:1) .. (c1.-30);
    \draw [->] (c1.0) .. controls +(east:1) and +(west:1) .. ([yshift=0em]f1.west);
    }
姜雨帆 committed
5556

姜雨帆 committed
5557 5558 5559 5560 5561
    \visible<7->{
    \node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c2) at ([yshift=-2em]t1.south) {\tiny{$\textbf{C}_2$}};
    \draw [->] ([xshift=-0.7em]c2.west) -- ([xshift=-0.1em]c2.west);
    \draw [->] ([xshift=0.1em]c2.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f2.west);
    }
姜雨帆 committed
5562

姜雨帆 committed
5563 5564 5565 5566 5567
    \visible<8->{
    \node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c3) at ([yshift=-2em]t2.south) {\tiny{$\textbf{C}_3$}};
    \draw [->] ([xshift=-0.7em]c3.west) -- ([xshift=-0.1em]c3.west);
    \draw [->] ([xshift=0.1em]c3.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f3.west);
    }
姜雨帆 committed
5568

姜雨帆 committed
5569 5570 5571 5572 5573
    \visible<8->{
    \node [circle,draw,anchor=north,inner sep=3pt,fill=orange!20] (c4) at ([yshift=-2em]t3.south) {\tiny{$\textbf{C}_4$}};
    \draw [->] ([xshift=-0.7em]c4.west) -- ([xshift=-0.1em]c4.west);
    \draw [->] ([xshift=0.1em]c4.east) .. controls +(east:0.6) and +(west:0.8) ..([yshift=-0.3em,xshift=-0.1em]f4.west);
    }
姜雨帆 committed
5574

姜雨帆 committed
5575 5576 5577
    \end{scope}
    \end{tikzpicture}
    \end{center}
姜雨帆 committed
5578 5579 5580 5581
}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
xiaotong committed
5582
\section{应用}
姜雨帆 committed
5583 5584

%%%------------------------------------------------------------------------------------------------------------
姜雨帆 committed
5585 5586 5587 5588 5589 5590 5591 5592 5593
%%% 应用
\begin{frame}{有趣的应用}

    \vspace{5.0em}
     \begin{tcolorbox}[enhanced,size=normal,left=13mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
    {\Large
    \textbf{一些有趣的神经机器翻译应用}
    }
    \end{tcolorbox}
xiaotong committed
5594

姜雨帆 committed
5595 5596
    \begin{center}
    \begin{tikzpicture}
xiaotong committed
5597

姜雨帆 committed
5598 5599 5600 5601 5602 5603
    \begin{scope}[scale=0.7]
    \tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=3em,minimum height=0.8em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
    \node [anchor=west,rnnnode] (node11) at (0,0) {\tiny{Self-Attention}};
    \node [anchor=west,rnnnode] (node12) at ([xshift=1em]node11.east) {\tiny{Self-Attention}};
    \node [anchor=west,rnnnode] (node13) at ([xshift=1em]node12.east) {\tiny{Self-Attention}};
    \node [anchor=west,rnnnode] (node14) at ([xshift=1em]node13.east) {\tiny{Self-Attention}};
xiaotong committed
5604

姜雨帆 committed
5605 5606 5607 5608 5609 5610 5611 5612
    \node [anchor=north,rnnnode,fill=blue!30!white] (e1) at ([yshift=-2em]node11.south) {\tiny{}};
    \node [anchor=north,rnnnode,fill=blue!30!white] (e2) at ([yshift=-2em]node12.south) {\tiny{}};
    \node [anchor=north,rnnnode,fill=blue!30!white] (e3) at ([yshift=-2em]node13.south) {\tiny{}};
    \node [anchor=north,rnnnode,fill=blue!30!white] (e4) at ([yshift=-2em]node14.south) {\tiny{}};
    \node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
    \node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{}};
    \node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
    \node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};
xiaotong committed
5613

姜雨帆 committed
5614 5615 5616 5617
    \draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
    \draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
    \draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
    \draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);
xiaotong committed
5618

姜雨帆 committed
5619 5620 5621 5622
    \draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
    \draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
    \draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
    \draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);
xiaotong committed
5623

姜雨帆 committed
5624 5625 5626
    \draw [->,thick] ([yshift=0.1em]e1.north)--([xshift=-0.5em,yshift=-0.2em]node12.south);
    \draw [->,thick] ([yshift=0.1em]e3.north)--([xshift=0.5em,yshift=-0.2em]node12.south);
    \draw [->,thick] ([yshift=0.1em]e4.north)--([xshift=1.5em,yshift=-0.2em]node12.south);
xiaotong committed
5627

姜雨帆 committed
5628 5629 5630 5631
    \node [anchor=south,rnnnode,fill=blue!30!white] (node21) at ([yshift=2.0em]node11.north) {\tiny{}};
    \node [anchor=south,rnnnode,fill=blue!30!white] (node22) at ([yshift=2.0em]node12.north) {\tiny{}};
    \node [anchor=south,rnnnode,fill=blue!30!white] (node23) at ([yshift=2.0em]node13.north) {\tiny{}};
    \node [anchor=south,rnnnode,fill=blue!30!white] (node24) at ([yshift=2.0em]node14.north) {\tiny{}};

    \node [anchor=south] (output1) at ([yshift=1em]node21.north) {\Large{\textbf{}}};
    \node [anchor=south] (output2) at ([yshift=1em]node22.north) {\Large{\textbf{}}};
    \node [anchor=south] (output3) at ([yshift=1em]node23.north) {\Large{\textbf{}}};
    \node [anchor=south] (output4) at ([yshift=1em]node24.north) {\Large{\textbf{}}};

    \draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]output1.south);
    \draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]output2.south);
    \draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]output3.south);
    \draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]output4.south);

    \draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
    \draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
    \draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
    \draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);

    \end{scope}

    \end{tikzpicture}
    \end{center}

    \end{frame}


%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{NMT应用}
\begin{itemize}
\item 神经机器翻译系统除了满足日常翻译需求,还有很多其他有意思的应用!
    \begin{itemize}
    \item 例如文言文翻译,将翻译系统中的源语、目标语,换成文言文和相应的现代文译文进行训练,就可以获得一个古文翻译系统
    \end{itemize}
\vspace{-0.5em}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{lnode} = [minimum height=2em,minimum width=8em,inner sep=3pt,rounded corners=2pt,draw,fill=red!20];
\tikzstyle{standard} = [rounded corners=3pt]

\node [lnode,anchor=west] (l1) at (0,0) {编码器};
\node [lnode,anchor=west,fill=blue!20] (l2) at ([xshift=3em]l1.east) {解码器};
\node [anchor=north] (inputs) at ([xshift=-1.5em,yshift=-1em]l1.south) {Inputs: 学  而  时 习 之};
\node [anchor=south] (outputs) at ([xshift=-2em,yshift=1em]l2.north) {Outputs: 学习  并且  按时 复习};
\draw [->,very thick] ([yshift=-1em]l1.south) -- ([yshift=-0.1em]l1.south);
\draw [->,very thick] ([yshift=0.1em]l2.north) -- ([yshift=1em]l2.north);
\draw [->,very thick] ([xshift=0.1em]l1.east) -- ([xshift=-0.1em]l2.west);

\node [rectangle,inner sep=0.5em,rounded corners=1pt,very thick,dotted,draw=ugreen!80] [fit = (l1) (l2)] (box0) {};

\end{scope}
\end{tikzpicture}
\end{center}

\item 需要考虑的问题:
    \begin{itemize}
    \item 古文短,现代文长,过翻译或者欠翻译对性能影响很大,如何对长度进行更精确的建模
    \item 不同时代语言差异性大,如何进行自动适应和风格迁移
    \end{itemize}

\end{itemize}
\vspace{-0.8em}


\end{frame}


%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{NMT应用}
\begin{itemize}
\item 古文翻译实例
\vspace{0.0em}

 \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black]
{
\small{古文:侍卫步军都指挥使、彰信节度使李继勋营于寿州城南,唐刘仁赡伺继勋无备,出兵击之,杀士卒数百人,焚其攻具。}
}
\end{tcolorbox}
\vspace{-0.4em}
 \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black]
{
\small{现代文:侍卫步军都指挥使、彰信节度使李继勋在寿州城南扎营,唐刘仁赡窥伺李继勋没有防备,出兵攻打他,杀死士兵几百人,烧毁李继勋的攻城器具。}
}
\end{tcolorbox}

\vspace{0.2em}

\begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black]
{
\small{古文:其后人稍稍识之,多延至其家,使为弟子论学。}
}
\end{tcolorbox}
\vspace{-0.4em}
 \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black]
{
\small{现代文:后来的人渐渐认识他,多把他请到家里,让他为弟子讲授学问。}
}
\end{tcolorbox}


\end{itemize}
\vspace{-0.8em}


\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{NMT应用}
\begin{itemize}
\item 神经机器翻译系统除了满足日常翻译需求,还有很多其他有意思的应用!
    \begin{itemize}
    \item 除了古文翻译,对联也可以用机器翻译系统生成,只需将输入输出变为对联的上联和下联
    \end{itemize}
\vspace{-0.8em}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{lnode} = [minimum height=2em,minimum width=8em,inner sep=3pt,rounded corners=2pt,draw,fill=red!20];
\tikzstyle{standard} = [rounded corners=3pt]

\node [lnode,anchor=west] (l1) at (0,0) {编码器};
\node [lnode,anchor=west,fill=blue!20] (l2) at ([xshift=3em]l1.east) {解码器};
\node [anchor=north] (inputs) at ([xshift=-1.5em,yshift=-1em]l1.south) {Inputs: 雪里梅花霜里菊};
\node [anchor=south] (outputs) at ([xshift=-0.5em,yshift=1em]l2.north) {Outputs: 水中明月镜中天};
\draw [->,very thick] ([yshift=-1em]l1.south) -- ([yshift=-0.1em]l1.south);
\draw [->,very thick] ([yshift=0.1em]l2.north) -- ([yshift=1em]l2.north);
\draw [->,very thick] ([xshift=0.1em]l1.east) -- ([xshift=-0.1em]l2.west);

\node [rectangle,inner sep=0.5em,rounded corners=1pt,very thick,dotted,draw=ugreen!80] [fit = (l1) (l2)] (box0) {};

\end{scope}
\end{tikzpicture}
\end{center}

\item 需要考虑的问题:
    \begin{itemize}
    \item 对联的上下联有较严格的对应要求,包括长度、押韵、词义的对应等
    \item 横批生成难度比较大,横批是对内容的高度概括,但是数据非常缺乏
    \end{itemize}

\end{itemize}
\vspace{-0.8em}


\end{frame}


%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{NMT应用}
\begin{itemize}
\item 对联实例
\vspace{-0.3em}
%
% \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black]
%{
%\small{上联:翠竹千支歌盛世\	    下联:红梅万点报新春}
%}
%\end{tcolorbox}
%\vspace{-0.4em}
% \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black]
%{
%\small{下联:红梅万点报新春}
%}
%\end{tcolorbox}


\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{lnode} = [minimum height=2.5em,minimum width=12em,inner sep=3pt,very thick,rounded corners=2pt,draw=red!75!black,fill=red!5];
\tikzstyle{rnode} = [minimum height=2.5em,minimum width=12em,inner sep=3pt,very thick,rounded corners=2pt,draw=blue!75!black,fill=blue!5];
\tikzstyle{standard} = [rounded corners=3pt]

\node [lnode,anchor=west] (l1) at (0,0) {上联:翠竹千支歌盛世};
\node [rnode,anchor=west] (l2) at ([xshift=1em]l1.east) {下联:红梅万点报新春};

\node [lnode,anchor=north] (l3) at ([yshift=-0.8em]l1.south) {上联:一帆风顺年年好};
\node [rnode,anchor=west] (l4) at ([xshift=1em]l3.east) {下联:万事如意步步高};

\node [lnode,anchor=north] (l5) at ([yshift=-0.8em]l3.south) {上联:佳节迎春春生笑脸};
\node [rnode,anchor=west] (l6) at ([xshift=1em]l5.east) {下联:新年纳福富华满堂};

\node [lnode,anchor=north] (l7) at ([yshift=-0.8em]l5.south) {上联:腊梅吐芳迎红日};
\node [rnode,anchor=west] (l8) at ([xshift=1em]l7.east) {下联:绿柳展枝舞春风};

\node [lnode,anchor=north] (l9) at ([yshift=-0.8em]l7.south) {上联:雪兆丰年丛岭翠};
\node [rnode,anchor=west] (l10) at ([xshift=1em]l9.east) {下联:春回大地满园红};

\end{scope}
\end{tikzpicture}
\end{center}


\end{itemize}
\vspace{-0.8em}


\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{NMT应用}
\begin{itemize}
\item 神经机器翻译系统除了满足日常翻译需求,还有很多其他有意思的应用!
    \begin{itemize}
    \item 还可以用机器翻译系统来写诗。如藏头诗,给定诗句的第一个字,生成一首完整的诗。还可以根据意境生成诗句
    \end{itemize}
\vspace{0.0em}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{lnode} = [minimum height=2em,minimum width=8em,inner sep=3pt,rounded corners=2pt,draw,fill=red!20];
\tikzstyle{standard} = [rounded corners=3pt]

\node [lnode,anchor=west] (l1) at (0,0) {编码器};
\node [lnode,anchor=west,fill=blue!20] (l2) at ([xshift=3em]l1.east) {解码器};
\node [anchor=north] (inputs) at ([xshift=-1.5em,yshift=-1em]l1.south) {Inputs: 五 星 红 旗};
\node [anchor=south] (outputs) at ([xshift=-3.5em,yshift=2em]l2.north) {Outputs: {\color{red}五}云深处小蓬莱 {\color{red}星}斗阑干次第开};
\node [anchor=south] (outputs1) at ([xshift=-1.5em,yshift=1em]l2.north) {{\color{red}红}旆壁幢春色里 {\color{red}旗}亭鼓吹乐声来};
\draw [->,very thick] ([yshift=-1em]l1.south) -- ([yshift=-0.1em]l1.south);
\draw [->,very thick] ([yshift=0.1em]l2.north) -- ([yshift=1em]l2.north);
\draw [->,very thick] ([xshift=0.1em]l1.east) -- ([xshift=-0.1em]l2.west);

\node [rectangle,inner sep=0.5em,rounded corners=1pt,very thick,dotted,draw=ugreen!80] [fit = (l1) (l2)] (box0) {};

\end{scope}
\end{tikzpicture}
\end{center}

\item 需要考虑的问题:
    \begin{itemize}
    \item 古诗的书写有对仗要求
    \item 意境和字面背后的意思如何体现
    \item 藏头诗需要有约束条件的生成
    \end{itemize}

\end{itemize}
\vspace{-0.8em}


\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 做个小结
\begin{frame}{我们赶上了好时代 ...}
\begin{itemize}
\item 神经机器翻译的火爆这几年有目共睹,好事情!!!
    \begin{itemize}
    \item \url{https://arxiv.org}上搜索neural machine translation
    \item ACL、EMNLP等顶会神经机器翻译论文数量近些年几乎呈线性增长
    \item 神经机器翻译系统在各大比赛中霸榜,开源机器翻译满天飞,大厂秀肌肉,小作坊刷存在感
    \end{itemize}
    \vspace{0.3em}
\item<2-> 这里只介绍了最基本的概念,NMT的内容远不止这些
    \begin{itemize}
    \item 各种专题:解码、压缩、先验知识、低资源翻译、无指导方法、篇章级翻译等等等等
    \item 推荐一个survey,有些基础的可以参考一下,很全面 \\
    ``Neural Machine Translation: A Review'' by Felix Stahlberg\\
    \url{https://arxiv.org/abs/1912.02047}
    \item 如何搭建一个优秀的NMT系统?- 有许多技巧 \\
             下一章介绍
    \item 回忆一下第一章介绍的NMT开源系统,可以试试
    \end{itemize}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 小结
\section{小结}

%%%------------------------------------------------------------------------------------------------------------
%%% open source NMT
\begin{frame}{一些开源NMT系统}
    \begin{itemize}
    \item Tensor2Tensor
        \begin{itemize}
        \item Google Brain开发,基于静态图实现
        \item 先定义、后运行、速度快、可优化,但是代码中的错误难以发现
        \item \url{https://github.com/tensorflow/tensor2tensor}
        \end{itemize}
        \vspace{0.2em}
    \item Fairseq
        \begin{itemize}
        \item Facebook开发,基于动态图实现
        \item 灵活,debug方便,更适合自然语言处理
        \item \url{https://github.com/pytorch/fairseq}
        \end{itemize}
    \vspace{0.2em}
    \item NiuTrans.NMT
        \begin{itemize}
        \item 小牛翻译开发,基于动态图实现
        \item 简单小巧,易于修改、C语言编写,代码高度优化
        \item \url{https://github.com/NiuTrans/NiuTensor}
        \end{itemize}
    \vspace{0.2em}
    \item 其他优秀的开源NMT系统:OpenNMT、THUMT、\\Sockeye、Marian、Nematus、SGNMT、Neural Monkey...
    \end{itemize}

    \end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% last slide
\begin{frame}{结束~}

\vspace{2em}

\begin{center}

\begin{tikzpicture}

\begin{scope}
\small{
\node [anchor=south west,minimum width=15em] (source) at (0,0) {\Large{\textbf{source}: 谢谢 大家 !}};
\node [anchor=south west,minimum width=15em] (target) at ([yshift=12em]source.north west) {\Large{\textbf{target}: Thank You !}};
\node [anchor=center,minimum width=9.6em,minimum height=1.8em,draw,rounded corners=0.3em] (hidden) at ([yshift=6em]source.north) {};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!20] (cell01) at ([xshift=0.2em]hidden.west) {\footnotesize{.2}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!10] (cell02) at (cell01.east) {\footnotesize{-1}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!70] (cell03) at (cell02.east) {\footnotesize{6}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!50] (cell04) at (cell03.east) {\footnotesize{5}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!30] (cell05) at (cell04.east) {\footnotesize{.7}};
\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!10] (cell06) at (cell05.east) {\footnotesize{-2}};

\filldraw [fill=red!20,draw=white] (source.north west) -- (source.north east) -- ([xshift=-0.2em,yshift=-0.1em]hidden.south east) -- ([xshift=0.2em,yshift=-0.1em]hidden.south west);
\filldraw [fill=blue!20,draw=white] (target.south west) -- (target.south east) -- ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- ([xshift=0.2em,yshift=0.1em]hidden.north west);

\draw [->,thick] (source.north west) -- ([xshift=0.2em,yshift=-0.1em]hidden.south west);
\draw [->,thick] (source.north east) -- ([xshift=-0.2em,yshift=-0.1em]hidden.south east);
\draw [->,thick] ([xshift=0.2em,yshift=0.1em]hidden.north west) -- (target.south west);
\draw [->,thick] ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- (target.south east);
}

\node [anchor=south] (enclabel) at ([yshift=2em]source.north) {\large{Encoder}};
\node [anchor=north] (declabel) at ([yshift=-2em]target.south) {\large{Decoder}};
\end{scope}

\end{tikzpicture}

\vspace{2em}

\end{center}

\end{frame}


\end{CJK}
\end{document}