Commit a587d26a by zengxin

slide 7

parent 1055a717
......@@ -3,6 +3,7 @@
\def\CTeXPreproc{Created by ctex v0.2.13, don't edit!}
\documentclass[CJKutf8,t,compress,12pt]{beamer}
%\usepackage[UTF8, heading = false, scheme = plain]{ctex}
\usepackage{pstricks}
\usepackage{etex}
\usepackage{eso-pic,graphicx}
......@@ -23,6 +24,9 @@
\usepackage{CJKulem}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{ulem}
\usepackage{soul}
\usepackage{color}
\usepackage{tcolorbox}
\tcbuselibrary{skins}
......@@ -49,6 +53,7 @@
\usefonttheme[onlymath]{serif}
\definecolor{ugreen}{rgb}{0,0.5,0}
\definecolor{lightred}{RGB}{255,121,116}
\definecolor{lgreen}{rgb}{0.9,1,0.8}
\definecolor{xtgreen1}{rgb}{0.824,0.898,0.8}
\definecolor{xtgreen}{rgb}{0.914,0.945,0.902}
......@@ -341,52 +346,1848 @@
%%%------------------------------------------------------------------------------------------------------------
%%% 数据处理(也可以提一下不同blue脚本对tokenization的要求)
\begin{frame}{数据处理}
\begin{itemize}
\item 双语数据的质量和数量对于训练一个神经机器翻译系统至关重要,有时即使拥有海量的数据,但是质量较差仍然会损失机器翻译的性能
\end{itemize}
\footnotesize
\begin{center}
\setlength{\tabcolsep}{3pt}
\renewcommand\arraystretch{1}
\begin{tabular}{l}
\begin{tabular}{lccr}
\specialrule{1pt}{1pt}{1pt}
& & \multicolumn{2}{c}{BLEU} \\ \cline{3-4}
\rule{0pt}{12pt}数据设置 & 数据量 & 英-德 & 德-英 \\
\specialrule{0.6pt}{1pt}{1pt}
初始数据 & 3M & 32.5 & 31 \\
$+$未处理额外数据 & 38M & 26.6 & 32.7 \\
$+$处理后数据 & 7M & 37.9 & 37.5 \\
\specialrule{1pt}{1pt}{1pt}
\end{tabular}\\
\addlinespace[-0.3ex]
\end{tabular}
\end{center}
\begin{itemize}
\item 数据处理便是对已有的数据集进行筛选处理的过程,一般的流程为:
\end{itemize}
\begin{center}
\begin{tikzpicture}
%\tikzstyle{op} =[rounded corners=1pt,thick,minimum width=3.0em,minimum height=3.0em,draw=red!75!black,fill=red!5!white,font=\scriptsize,align=center]
\tikzstyle{op} =[rounded corners=1pt,thick,minimum width=3.0em,minimum height=3.0em,draw,fill=red!5!white,font=\scriptsize,align=center]
%\tikzstyle{data} = [cylinder,draw=blue!90!black,thick,minimum height=2.5em,minimum width=3em,shape border rotate=0,cylinder uses custom fill, cylinder body fill=blue!10,cylinder end fill=blue!5,anchor = east,font=\scriptsize,align=center]
\tikzstyle{data} = [cylinder,draw,thick,minimum height=2.5em,minimum width=3em,shape border rotate=0,cylinder uses custom fill, cylinder body fill=blue!10,cylinder end fill=blue!5,anchor = east,font=\scriptsize,align=center]
\node[op] (node1) at (0,0) {分词};
\node[op,anchor = west] (node2) at ([xshift = 2.0em]node1.east) {符号 \\ 标准化};
\node[op,anchor = west] (node3) at ([xshift = 2.0em]node2.east) {\begin{tabular}{c} 数据 \\ 过滤\end{tabular}};
\node[op,anchor = west] (node4) at ([xshift = 2.0em]node3.east) {子词 \\ 切分};
\node [data,anchor = east] (data1) at ([xshift = -2.0em]node1.west){原始 \\ 数据};
\node [data,anchor = west] (data2) at ([xshift = 2.0em]node4.east){训练 \\ 数据};
\draw[-stealth,line width=.05cm] ([xshift=0.25em]data1.east) -- ([xshift=-0.25em]node1.west);
\draw[-stealth,line width=.05cm] ([xshift=0.25em]node1.east) -- ([xshift=-0.25em]node2.west);
\draw[-stealth,line width=.05cm] ([xshift=0.25em]node2.east) -- ([xshift=-0.25em]node3.west);
\draw[-stealth,line width=.05cm] ([xshift=0.25em]node3.east) -- ([xshift=-0.25em]node4.west);
\draw[-stealth,line width=.05cm] ([xshift=0.25em]node4.east) -- ([xshift=-0.25em]data2.west);
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{数据处理}
\begin{itemize}
\item 神经机器翻译以词为单位处理句子序列,因此需要对以句子为单位的数据切分。
\begin{itemize}
\item 以词表示的语言如英语,主要处理标点符号的粘连和短语的划分
\item 中文,日语等无明显词界限的语种,则需要特殊的处理。
\end{itemize}
\end{itemize}
\begin{center}
\begin{tikzpicture}
\setlength{\base}{4cm}
\tikzstyle{node} = [minimum width=\base,minimum height=3.5cm,inner sep=0pt,rounded corners=6pt,draw,thick,fill=blue!10!white,drop shadow,font=\footnotesize]
\begin{scope}
\node[node] (word1) at (0,0) {
\begin{tabular}{l}
\parbox{10em}{中文:神经机器翻译改变了我们的生活。}
\\
\\
\parbox{10em}{英文:Neural machine translation has changed our lives.}
\end{tabular}
};
%\node[] (arrow) at ([xshift=0.23\base]word1.east){\Huge{$\Longrightarrow$}};
%\node[] (arrow) at ([xshift=0.23\base]word1.east){\scriptsize{$\Longrightarrow$};
\node[node] (word2) at ([xshift=\base]word1.east){
\begin{tabular}{l}
\parbox{10em}{中文:神经\ \ 机器\ \ 翻译\ \ 改变\ \ \ \ 我们\ \ \ \ 生活\ \ }
\\
\\
\parbox{10em}{英文:Neural machine translation has changed our lives .}
\end{tabular}
};
\draw[->,line width=.2cm,blue!40 ] ([xshift=0.1\base]word1.east) -- ([xshift=-0.1\base]word2.west);
\end{scope}
\end{tikzpicture}
\end{center}
\vspace{-0.5em}
\begin{itemize}
\item 常用的分词工具包括:Moses分词、NiuTrans分词等
\end{itemize}
\end{frame}
%%%%%%%%----------------------------
%%%%%%
\begin{frame}{数据处理}
\begin{itemize}
\item 符号标准化:统一源语目标语中的标点符号。
\item 数据过滤是指过滤低质量数据,过滤的手段主要包括:
\begin{itemize}
\item 重复数据过滤、乱码过滤、长度比过滤、HTML标签过滤、流畅度过滤。。。
\end{itemize}
\end{itemize}
\begin{tikzpicture}
\setlength{\base}{0.9em}
\node[minimum width=8em,minimum height=13.5em,inner sep=0.1pt,rounded corners=10pt,draw,thick,draw=ugreen,font=\tiny,fill=white!50,drop shadow] (word1) at (0,0){
\begin{tabular}{l l}
\rule{0pt}{10pt} \tiny{源语} &\tiny{\hspace{0.0em} 目标语} \\
\specialrule{1pt}{1pt}{1pt}
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\tiny,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node] (a1) at (0,0) {天气 \ \ \ \ };
\node [node] (a2) at (0,-\base*1) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node] (a3) at (0,-\base*2) {\textless{}p\textgreater \ \ 显示 \ \ 所选 \ \ \ \ \ \ \ \ \textless{}\textbackslash{}p\textgreater{} };
\node [node] (a41) at (0,-\base*3) {桃树 \ \ \ \ 杏树 \ \ \ \ 梨树 \ \ , \ \ \ \ 不让 \ \ \ \ , \ \ };
\node [node] (a42) at (0,-\base*4) {\ \ 不让 \ \ \ \ , \ \ \ \ 开满 \ \ \ \ \ \ };
\node [node] (a5) at (0,-\base*5.5) {机器 \ \ 翻译 \ \ \ \ 人们 \ \ \ \ 生活 \ \ 带来了 \ \ 便利 \ \ };
\node [node] (a6) at (0,-\base*7) {这件 \ \ 事情 \ \ \ \ 成功率 \ \ \ \ 50 o/o \ \ };
\node [node] (a7) at (0,-\base*8) {翻译 \ \ \ \ \ \ 特别 \ \ 感兴趣 \ \ };
\node [node] (a8) at (0,-\base*9) {他说 \ \ : \ \ \ \ 这个 \ \ 深深 \ \ 有趣 \ \ \ \ 想法 \ \ \ \ };
\node [node] (a81) at (0,-\base*10) {\ \ 心里 \ \ \ \ };
\node [node] (a9) at (0,-\base*11) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node] (a10) at (0,-\base*12) {花下 \ \ 成千成百 \ \ \ \ 蜜蜂 \ \ 嗡嗡 \ \ \ \ 闹着 \ \ };
\end{tikzpicture}
&
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\tiny,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node] (a1) at (0,0) {The weather today is good , but . . . };
\node [node] (a2) at (0,-\base*1) {I like rainy days 。 };
\node [node] (a3) at (0,-\base*2) {\textless{}p\textgreater to show the selected side . \textless{}\textbackslash{}p\textgreater{}};
\node [node] (a41) at (0,-\base*3.5) {Flowers bloom .};
\node [node] (a42) at (0,-\base*5) {Machine translation brings convenience to people's };
\node [node] (a5) at (0,-\base*6) {lives. };
\node [node] (a6) at (0,-\base*7) {The success rate for this matter is 50 o/o . };
\node [node] (a7) at (0,-\base*8) {I'm interested in translation . };
\node [node] (a8) at (0,-\base*9) {He said: 、、This interesting idea is deeply};
\node [node] (a81) at (0,-\base*10) { imprinted in my heart. 、、};
\node [node] (a9) at (0,-\base*11) {I like rainy days 。};
\node [node] (a10) at (0,-\base*12) {Hundreds of bees hummed under the flowers . };
\end{tikzpicture}
\end{tabular}
};
\end{tikzpicture}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%----------------------标点color
\begin{frame}{数据处理}
\begin{itemize}
\item 符号标准化:统一源语目标语中的标点符号。
\item 数据过滤是指过滤低质量数据,过滤的手段主要包括:
\begin{itemize}
\item 重复数据过滤、乱码过滤、长度比过滤、HTML标签过滤、流畅度过滤。。。
\end{itemize}
\end{itemize}
\begin{tikzpicture}
\setlength{\base}{0.9em}
\node[minimum width=8em,minimum height=13.5em,inner sep=0.1pt,rounded corners=10pt,draw,thick,draw=ugreen,font=\tiny,fill=white!50,drop shadow] (word1) at (0,0){
\begin{tabular}{l l}
\rule{0pt}{10pt} \tiny{源语} &\tiny{\hspace{0.0em} 目标语} \\
\specialrule{1pt}{1pt}{1pt}
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\tiny,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node] (a1) at (0,0) {天气 \ \ \ \ };
\node [node] (a2) at (0,-\base*1) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node] (a3) at (0,-\base*2) {\textless{}p\textgreater \ \ 显示 \ \ 所选 \ \ \ \ \ \ \ \ \textless{}\textbackslash{}p\textgreater{} };
\node [node] (a41) at (0,-\base*3) {桃树 \ \ \ \ 杏树 \ \ \ \ 梨树 \ \ , \ \ \ \ 不让 \ \ \ \ , \ \ };
\node [node] (a42) at (0,-\base*4) {\ \ 不让 \ \ \ \ , \ \ \ \ 开满 \ \ \ \ \ \ };
\node [node] (a5) at (0,-\base*5.5) {机器 \ \ 翻译 \ \ \ \ 人们 \ \ \ \ 生活 \ \ 带来了 \ \ 便利 \ \ };
\node [node] (a6) at (0,-\base*7) {这件 \ \ 事情 \ \ \ \ 成功率 \ \ \ \ 50};
\node [node,minimum width=1.5em,fill=red!20] (a62) at ([xshift=0.5em]a6.east) {$\%$};
\node [node] (a63) at ([xshift=0.5em]a62.east) {};
\node [node] (a7) at (0,-\base*8) {翻译 \ \ \ \ \ \ 特别 \ \ 感兴趣 \ \ };
\node [node] (a8) at (0,-\base*9) {他说 \ \ : };
\node [node,minimum width=1.5em,fill=red!20] (a82) at ([xshift=0.5em]a8.east) {``};
\node [node] (a83) at ([xshift=0.5em]a82.east) {这个 \ \ 深深 \ \ 有趣 \ \ \ \ 想法 \ \ \ \ };
\node [node] (a81) at (0,-\base*10) {\ \ 心里 \ \ };
\node [node,minimum width=1.5em,fill=red!20] (a84) at ([xshift=0.5em]a81.east) {''};
\node [node] (a9) at (0,-\base*11) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node] (a10) at (0,-\base*12) {花下 \ \ 成千成百 \ \ \ \ 蜜蜂 \ \ 嗡嗡 \ \ \ \ 闹着 \ \ };
\end{tikzpicture}
&
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\tiny,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node] (a1) at (0,0) {The weather today is good , but };
\node [node,minimum width=1.5em,fill=red!20] (a12) at ([xshift=0.5em]a1.east) {...};
\node [node] (a2) at (0,-\base*1) {I like rainy days };
\node [node,minimum width=1.5em,fill=red!20] (a22) at ([xshift=0.5em]a2.east) {.};
\node [node] (a3) at (0,-\base*2) {\textless{}p\textgreater to show the selected side . \textless{}\textbackslash{}p\textgreater{}};
\node [node] (a41) at (0,-\base*3.5) {Flowers bloom .};
\node [node] (a42) at (0,-\base*5) {Machine translation brings convenience to people's };
\node [node] (a5) at (0,-\base*6) {lives. };
\node [node] (a6) at (0,-\base*7) {The success rate for this matter is };
\node [node,minimum width=1.5em,fill=red!20] (a62) at ([xshift=0.5em]a6.east) {$\%$};
\node [node] (a63) at ([xshift=0.5em]a62.east) {.};
\node [node] (a7) at (0,-\base*8) {I'm interested in translation . };
\node [node] (a8) at (0,-\base*9) {He said: };
\node [node,minimum width=1.5em,fill=red!20] (a82) at ([xshift=0.5em]a8.east) {``};
\node [node] (a83) at ([xshift=0.5em]a82.east) {This interesting idea is deeply };
\node [node] (a81) at (0,-\base*10) {imprinted in my heart. };
\node [node,minimum width=1.5em,fill=red!20] (a84) at ([xshift=0.5em]a81.east) {''};
\node [node] (a9) at (0,-\base*11) {I like rainy days };
\node [node,minimum width=1.5em,fill=red!20] (a92) at ([xshift=0.5em]a9.east) {.};
\node [node] (a10) at (0,-\base*12) {Hundreds of bees hummed under the flowers . };
\end{tikzpicture}
\end{tabular}
};
\end{tikzpicture}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%----------------------去除句子color
\begin{frame}{数据处理}
\begin{itemize}
\item 符号标准化:统一源语目标语中的标点符号。
\item 数据过滤是指过滤低质量数据,过滤的手段主要包括:
\begin{itemize}
\item 重复数据过滤、乱码过滤、长度比过滤、HTML标签过滤、流畅度过滤。。。
\end{itemize}
\end{itemize}
\begin{tikzpicture}
\setlength{\base}{0.9em}
\node[minimum width=8em,minimum height=13.5em,inner sep=0.1pt,rounded corners=10pt,draw,thick,draw=ugreen,font=\tiny,fill=white!50,drop shadow] (word1) at (0,0){
\begin{tabular}{l l}
\rule{0pt}{10pt} \tiny{源语} &\tiny{\hspace{0.0em} 目标语} \\
\specialrule{1pt}{1pt}{1pt}
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\tiny,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node,fill=green!15] (a1) at (0,0) {天气 \ \ \ \ };
\node [node] (a2) at (0,-\base*1) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node,fill=blue!15] (a3) at (0,-\base*2) {\textless{}p\textgreater \ \ 显示 \ \ 所选 \ \ \ \ \ \ \ \ \textless{}\textbackslash{}p\textgreater{} };
\node [node,fill=green!15] (a41) at (0,-\base*3) {桃树 \ \ \ \ 杏树 \ \ \ \ 梨树 \ \ , \ \ \ \ 不让 \ \ \ \ , \ \ };
\node [node,fill=green!15] (a42) at (0,-\base*4) {\ \ 不让 \ \ \ \ , \ \ \ \ 开满 \ \ \ \ \ \ };
\node [node] (a5) at (0,-\base*5.5) {机器 \ \ 翻译 \ \ \ \ 人们 \ \ \ \ 生活 \ \ 带来了 \ \ 便利 \ \ };
\node [node] (a6) at (0,-\base*7) {这件 \ \ 事情 \ \ \ \ 成功率 \ \ \ \ 50};
\node [node,minimum width=1.5em,fill=red!20] (a62) at ([xshift=0.5em]a6.east) {$\%$};
\node [node] (a63) at ([xshift=0.5em]a62.east) {};
\node [node,fill=orange!15] (a7) at (0,-\base*8) {翻译 \ \ \ \ \ \ 特别 \ \ 感兴趣 \ \ };
\node [node] (a8) at (0,-\base*9) {他说 \ \ : };
\node [node,minimum width=1.5em,fill=red!20] (a82) at ([xshift=0.5em]a8.east) {``};
\node [node] (a83) at ([xshift=0.5em]a82.east) {这个 \ \ 深深 \ \ 有趣 \ \ \ \ 想法 \ \ \ \ };
\node [node] (a81) at (0,-\base*10) {\ \ 心里 \ \ };
\node [node,minimum width=1.5em,fill=red!20] (a84) at ([xshift=0.5em]a81.east) {''};
\node [node,fill=yellow!15] (a9) at (0,-\base*11) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node] (a10) at (0,-\base*12) {花下 \ \ 成千成百 \ \ \ \ 蜜蜂 \ \ 嗡嗡 \ \ \ \ 闹着 \ \ };
\end{tikzpicture}
&
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\tiny,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node,fill=green!15] (a1) at (0,0) {The weather today is good , but };
\node [node,minimum width=1.5em,fill=red!20] (a12) at ([xshift=0.5em]a1.east) {...};
\node [node] (a2) at (0,-\base*1) {I like rainy days };
\node [node,minimum width=1.5em,fill=red!20] (a22) at ([xshift=0.5em]a2.east) {.};
\node [node,fill=blue!15] (a3) at (0,-\base*2) {\textless{}p\textgreater to show the selected side . \textless{}\textbackslash{}p\textgreater{}};
\node [node,fill=green!15] (a41) at (0,-\base*3.5) {Flowers bloom .};
\node [node] (a42) at (0,-\base*5) {Machine translation brings convenience to people's };
\node [node] (a5) at (0,-\base*6) {lives. };
\node [node] (a6) at (0,-\base*7) {The success rate for this matter is };
\node [node,minimum width=1.5em,fill=red!20] (a62) at ([xshift=0.5em]a6.east) {$\%$};
\node [node] (a63) at ([xshift=0.5em]a62.east) {.};
\node [node,fill=orange!15] (a7) at (0,-\base*8) {I'm interested in translation . };
\node [node] (a8) at (0,-\base*9) {He said: };
\node [node,minimum width=1.5em,fill=red!20] (a82) at ([xshift=0.5em]a8.east) {``};
\node [node] (a83) at ([xshift=0.5em]a82.east) {This interesting idea is deeply};
\node [node] (a81) at (0,-\base*10) {imprinted in my heart. };
\node [node,minimum width=1.5em,fill=red!20] (a84) at ([xshift=0.5em]a81.east) {''};
\node [node,fill=yellow!15] (a9) at (0,-\base*11) {I like rainy days };
\node [node,minimum width=1.5em,fill=red!20] (a92) at ([xshift=0.5em]a9.east) {.};
\node [node] (a10) at (0,-\base*12) {Hundreds of bees hummed under the flowers . };
\end{tikzpicture}
\end{tabular}
};
\end{tikzpicture}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 翻译单元切分
\begin{frame}{翻译单元切分}
\begin{itemize}
\item 对于某些形态学丰富的语言,比如英语、德语等,常用一个单词的不同形态表示不同的意思,这就导致一个单词因为不同形态会产生各种不同词,就造成了大词汇量问题。
\end{itemize}
\vspace{-1.0em}
\begin{center}
\centerline{以英语为例:}
\vspace{-0.5em}
\begin{tikzpicture}
\node[rounded corners=3pt,minimum width=11.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\footnotesize,drop shadow](left) at (0,0) {
\begin{tabular}{c}
\rule{0pt}{12pt}名词\\
\rule{0pt}{12pt}cat,cats 、watch,watches\\
\rule{0pt}{15pt}baby,babies、wife,wives\\
\end{tabular}
};
\node[rounded corners=3pt,minimum width=11.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\footnotesize,drop shadow](right) at ([xshift=8em]left.east) {
\begin{tabular}{c}
\rule{0pt}{12pt}动词\\
\rule{0pt}{12pt}do,did ,does,doing,done\\
\rule{0pt}{15pt}have,had,has,having\\
\end{tabular}
};
\end{tikzpicture}
\end{center}
\vspace{-1.0em}
\begin{itemize}
\item 大词汇量导致翻译系统需要更大的词表,构造更大的词向量矩阵,带来了两个问题:
\begin{itemize}
\item 词汇表稀疏,低频词得不到充分的训练。
\item 计算量更大,更大词向量矩阵,造成了计算资源的浪费。
\end{itemize}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 翻译单元切分
\begin{frame}{翻译单元切分}
\begin{itemize}
\item 为了解决这个问题,提出了子词的概念。
\end{itemize}
\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\node[] (do) at (0,0) {{\red do}};
\node[anchor = west] (does) at ([xshift = 1em]do.east) {{\red do}es};
\node[anchor = west] (doing) at ([xshift = 0.7em]does.east) {{\red do}ing};
\node[anchor = north] (do_root) at ([yshift = -1em]does.south) {do};
\node[anchor = west] (new) at ([xshift = 2em]doing.east) {{\red new}};
\node[anchor = west] (newer) at ([xshift = 1em]new.east) {{\red new}er};
\node[anchor = west] (newest) at ([xshift = 0.7em]newer.east) {{\red new}est};
\node[anchor = north] (new_root) at ([yshift = -1em]newer.south) {new};
\draw [->] (do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(do.south);
\draw [->] (do_root.north) -- (does.south);
\draw [->] (do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(doing.south);
\draw [->] (new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(new.south);
\draw [->] (new_root.north) -- (newer.south);
\draw [->] (new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(newest.south);
\end{tikzpicture}
\end{center}
\vspace{-1em}
\begin{itemize}
\item 将词分解为子词的方式有很多种,主流的方法包括:
\begin{itemize}
\item 字节对编码( Byte Pair Encoding,BPE)-- 统计频次
\item Wordpiece -- 语言模型
\item Unigram Language Model -- 语言模型
\end{itemize}
\end{itemize}
\begin{itemize}
\item BPE算法最早用于压缩,后来在NLP领域得到推广,是目前最流行,最简单有效的方法。
\item 子词切分,缩减词表大小,节省计算资源,提高性能。
\item 一般设定BPE切分后的词表大小为32000。
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 正则化(dropout、label smoothing)
%%% 翻译单元切分
\begin{frame}{翻译单元切分}
\begin{itemize}
\item 使用BPE切分的具体的流程如下:
\end{itemize}
\begin{center}
\begin{tikzpicture}
\tikzstyle{op} =[rounded corners=1pt,minimum width=6.0em,minimum height=3.0em,draw,thick,font=\scriptsize]
\node[op,fill=blue!10!white] (node1) at (0,0) {对原始数据分词};
\node[op,anchor = west,fill=red!5!white] (node2) at ([xshift = 2.0em]node1.east) {统计构建子词词表};
\node[op,anchor = west,fill=green!10!white] (node3) at ([xshift = 2.0em]node2.east) {根据子词词表分词};
\draw[-stealth,line width=.05cm,black!60] ([xshift=0.25em]node1.east) -- ([xshift=-0.25em]node2.west);
\draw[-stealth,line width=.05cm,black!60] ([xshift=0.25em]node2.east) -- ([xshift=-0.25em]node3.west);
\end{tikzpicture}
\end{center}
\vspace{-1em}
\begin{itemize}
\item 在构建子词词表,应用BPE时存在两种方式:
\begin{itemize}
\item 分别对源语和目标语构建子词词表,应用BPE。 \\
\item 联合源语和目标语词表,称为Joint-BPE,可以增强源语和目标语的一致性。 \\
\end{itemize}
\item BPE能够有效减少词表大小,以及UNK的数量:
\end{itemize}
\begin{center}
\scriptsize{
\begin{tabular}{l r r r}
\specialrule{1pt}{1pt}{1pt}
\rule{0pt}{8pt} segmentation & \# tokens & \# types & \# UNK \\ \hline
\rule{0pt}{8pt} compound splitting & 102 m & 1 100 000 & 643 \\
\rule{0pt}{8pt} morfessor & 109 m & 544 000 & 237 \\
\rule{0pt}{8pt} hyphenation & 186 m & 404 000 & 230 \\ \hline
\rule{0pt}{8pt} BPE & 112 m & 63 000 & 0 \\
\rule{0pt}{8pt} BPE (joint) & 111 m & 82 000 & 32 \\
\specialrule{1pt}{1pt}{1pt}
\end{tabular}}
\end{center}
\end{frame}
\begin{frame}{翻译单元切分-子词构造}
\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\tikzstyle{node} =[font=\tiny]
\tikzstyle{sentence} =[font=\tiny,fill=blue!5!white]
\node[sentence] (node1) at (0,0) {[`low', `lower', `newest', `widest']};
\node[sentence,anchor = north] (node2) at ([yshift = -1em]node1.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w e s t $<$e$>$':6, `w i d e s t $<$e$>$':3]};
\node[sentence,anchor = north] (node3) at ([yshift = -1.5em]node2.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red es} t $<$e$>$':6, `w i d {\red es} t $<$e$>$':3]};
\node[sentence,anchor = north] (node4) at ([yshift = -1em]node3.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red est} $<$e$>$':6, `w i d {\red est} $<$e$>$':3]};
\node[sentence,anchor = north] (node5) at ([yshift = -1em]node4.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red est$<$e$>$}':6, `w i d {\red est$<$e$>$}':3]};
\node[sentence,anchor = north] (node6) at ([yshift = -1em]node5.south) {$\cdots$};
\node[node,anchor = north] (node7) at ([yshift = -1.6em]node6.south) {直到达到预设的子词词表大小或下一个最高频的字节对出现频率为1。};
\draw[->,line width=.03cm] ([yshift=0em]node1.south) -- ([yshift=0em]node2.north);
\draw[->,line width=.03cm] ([yshift=0em]node3.south) -- ([yshift=0em]node4.north);
\draw[->,line width=.03cm] ([yshift=0em]node4.south) -- ([yshift=0em]node5.north);
\draw[->,line width=.03cm] ([yshift=0em]node5.south) -- ([yshift=0em]node6.north);
\node[node,anchor = west] (node8) at ([xshift = 2em,yshift = 3em]node7.east) {对于词表外的词lowest};
\node[node,anchor = north west] (node9) at ([yshift = 0.3em]node8.south west) {可以被分割为low est};
\node[node,font=\tiny,anchor = north,fill=ugreen!5,drop shadow,draw] (dict) at ([xshift = 8em,yshift = -4em]node6.south){\begin{tabular}{llllll}
\multirow{3}{*}{子词词表:} & `es' & `est' & `est$<$e$>$' & `lo' & `low' \\
& `ne' & `new'&`newest$<$e$>$' & `low$<$e$>$'& `wi'\\
& `wid' & `widest$<$e$>$' & `lowe' & `lower'& `lower$<$e$>$'
\end{tabular}};
\node[node,anchor=west] (line1) at ([xshift = 7.2em]node1.south east) {按字符拆分,并添加};
\node[node,anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {终结符$<$e$>$,统计词频。};
\node[node,anchor=north west] (line3) at ([yshift=-3em]line2.south west) {统计每一个连续字节对};
\node[node,anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {的出现频率,选择最高};
\node[node,anchor=north west] (line5) at ([yshift=0.3em]line4.south west) {频者合并成新的子词};
\begin{pgfonlayer}{background}
%\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=purple] [fit = (node1) (node2)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=teal] [fit = (node3) (node4) (node5) (node6)] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!5,drop shadow,draw] [fit = (line1) (line2)] (box3) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=ugreen!5,drop shadow,draw] [fit = (line3) (line4) (line5)] (box4) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!5,drop shadow,draw] [fit = (node7)] (box5) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!5,drop shadow,draw] [fit = (node8) (node9)] (box6) {};
\end{pgfonlayer}
\draw[->,line width=.03cm] ([yshift=0em]box2.south) -- ([yshift=0.2em]node7.north);
\draw[->,line width=.03cm] ([yshift=0em]box1.south) -- ([yshift=0em]box2.north);
\draw [->,dotted,very thick,purple] (box3.west) -- ([xshift=-1.5em]box3.west);
\draw [->,dotted,very thick,teal] (box4.west) -- ([xshift=-1.7em]box4.west);
\draw [->,dotted,very thick] ([xshift=6em]dict.north) .. controls +(north:1) and +(south:1) .. (box6.south);
\end{tikzpicture}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%
\begin{frame}{翻译单元切分}
\begin{itemize}
\item 用构造的BPE词表,重新对句子中的词进行切分:
\begin{itemize}
\item 对子词词表按长度由大到小排序
\visible<2->{\item 遍历子词词表,寻找是否有当前单词的子串,如有则进行替换切分}
\visible<3->{\item 对子词词表中不存在的序列,替换为$<$unk$>$}
\end{itemize}
\end{itemize}
\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\node[rounded corners=3pt,minimum width=1.0em,minimum height=2.0em,font=\scriptsize,fill=green!5,drop shadow,thick,draw](top) at (0,0) {
\begin{tabular}{ll}
\multicolumn{2}{c}{BPE词表:} \\
errrr$<$e$>$ & tain$<$e$>$ \\
moun & est$<$e$>$ \\
high & the$<$e$>$ \\
a$<$e$>$ &
\end{tabular}
};
\visible<2->{
\node[font=\scriptsize,anchor=west] (node1) at ([xshift=0.5em,yshift=1em]top.east) {原始序列:};
\node[font=\scriptsize,anchor=west] (this) at ([xshift=-0.5em]node1.east) {"this$<$e$>$" ,};
\node[font=\scriptsize,anchor=west] (highest) at ([xshift=-0.5em]this.east) {"highest$<$e$>$",};
\node[font=\scriptsize,anchor=west] (mountain) at ([xshift=-0.5em]highest.east) { "mountain$<$e$>$"};
}
\visible<2->{
\node[font=\scriptsize,anchor=west] (node2) at ([yshift=-1.5em]node1.south west) {BPE切分:};
\node[font=\scriptsize,anchor=west] (unk) at ([xshift=-0.5em]node2.east) {"$<$unk$>$",};
\node[font=\scriptsize,anchor=west] (high) at ([xshift=-0.5em]unk.east) {"high",};
\node[font=\scriptsize,anchor=west] (est) at ([xshift=-0.5em]high.east) {"est$<$e$>$",};
\node[font=\scriptsize,anchor=west] (moun) at ([xshift=-0.5em]est.east) {"moun",};
\node[font=\scriptsize,anchor=west] (tain) at ([xshift=-0.5em]moun.east) {"tain$<$e$>$"};
}
\visible<2->{
\draw[->,thick](highest.south) -- (high);
\draw[->,thick](highest.south) -- (est);
\draw[->,thick](mountain.south) -- (moun);
\draw[->,thick](mountain.south) -- (tain);
}
%\draw[->,thick](node1.south) -- ([xshift=1.0em]node2.north);
\visible<3->{\draw[->,thick]([xshift=-0.2em]this.south) -- (unk);}
\end{tikzpicture}
\end{center}
\vspace{-1em}
\begin{itemize}
\visible<4->{\item 机器翻译解码的句子由子词组成,需要进行BPE还原,将每个子词向后合并,直至遇到终结符}
\end{itemize}
\begin{center}
\begin{tikzpicture}
\visible<4->{
\node[font=\scriptsize] (node1) at (0,0) {翻译结果:"moun","tain$<$e$>$" };
\node[font=\scriptsize] (node2) at (14em,0) {BPE还原:"mountain$<$e$>$" };
\draw[-stealth,line width=.05cm]([xshift=1em]node1.east) -- ([xshift=-1em]node2.west);
}
\end{tikzpicture}
\end{center}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 正则化(dropout、label smoothing)
\begin{frame}{正则化}
\begin{itemize}
\item 神经机器翻译模型十分复杂,由于观测数据的不充分,及数据中存在的噪声,导致其求解十分不稳定,产生过拟合的现象。
%\item 正则化是机器学习中的经典技术,通常用于缓解过拟合问题,即模型参数过度拟合噪声数据,因此正则化也常被称作降噪。
\end{itemize}
\begin{tabular}{l l l}
\begin{tikzpicture}
\draw[->, thick] (0,0) to (3,0);
\draw[->, thick] (0,-0) to (0,2);
\node (a) at (1*0.3,6*0.2) {};
\node (b) at (2*0.3,4*0.2) {};
\node (c) at (3*0.3,3*0.2) {};
\node (d) at (4*0.3,3*0.2) {};
\node (e) at (6*0.3,4*0.2) {};
\node (f) at (7*0.3,6*0.2) {};
\node (g) at (8*0.3,8.4*0.2) {};
\node (h) at (9*0.3,9.7*0.2) {};
\fill [black] (a) circle(1pt);
\fill [black] (b) circle(1pt);
\fill [black] (c) circle(1pt);
\fill [black] (d) circle(1pt);
\fill [black] (e) circle(1pt);
\fill [black] (f) circle(1pt);
\fill [black] (g) circle(1pt);
\fill [black] (h) circle(1pt);
% y=0.73x + 2.54
\draw [thick,red] (-1*0.3,1.81*0.2) to (10*0.3,9.84*0.2);
\node [font=\footnotesize] at (1.5,-0.5) {欠拟合};
\end{tikzpicture}
&\begin{tikzpicture}
\draw[->, thick] (0,0) to (3,0);
\draw[->, thick] (0,-0) to (0,2);
\node (a) at (1*0.3,6*0.2) {};
\node (b) at (2*0.3,4*0.2) {};
\node (c) at (3*0.3,3*0.2) {};
\node (d) at (4*0.3,3*0.2) {};
\node (e) at (6*0.3,4*0.2) {};
\node (f) at (7*0.3,6*0.2) {};
\node (g) at (8*0.3,8.4*0.2) {};
\node (h) at (9*0.3,9.7*0.2) {};
\fill [black] (a) circle(1pt);
\fill [black] (b) circle(1pt);
\fill [black] (c) circle(1pt);
\fill [black] (d) circle(1pt);
\fill [black] (e) circle(1pt);
\fill [black] (f) circle(1pt);
\fill [black] (g) circle(1pt);
\fill [black] (h) circle(1pt);
\draw [thick,red] (0.5*0.3,6.15*0.2) to [bend right] (5*0.3,3*0.2) ;
\draw [thick,red] (5*0.3,3*0.2) to [bend right] (8.5*0.3,10*0.2) ;
\node [font=\footnotesize] at (1.5,-0.5) {拟合合适};
\end{tikzpicture}
&\begin{tikzpicture}
\draw[->, thick] (0,0) to (3,0);
\draw[->, thick] (0,-0) to (0,2);
\node (a) at (1*0.3,6*0.2) {};
\node (b) at (2*0.3,4*0.2) {};
\node (c) at (3*0.3,3*0.2) {};
\node (d) at (4*0.3,3*0.2) {};
\node (e) at (6*0.3,4*0.2) {};
\node (f) at (7*0.3,6*0.2) {};
\node (g) at (8.4*0.3,8.4*0.2) {};
\node (h) at (9.4*0.3,9.7*0.2) {};
\fill [black] (a) circle(1pt);
\fill [black] (b) circle(1pt);
\fill [black] (c) circle(1pt);
\fill [black] (d) circle(1pt);
\fill [black] (e) circle(1pt);
\fill [black] (f) circle(1pt);
\fill [black] (g) circle(1pt);
\fill [black] (h) circle(1pt);
%0-a
\draw [thick,red] (0.2*0.3,4*0.2) to [bend left] (1*0.3,6*0.2) ;
% a-b
\draw [thick,red] (1*0.3,6*0.2) to [bend left] (2*0.3,3*0.2) ;
% b-c
\draw [thick,red] (2*0.3,3*0.2) to [bend right] (3*0.3,2.5*0.2) ;
% c-d
\draw [thick,red] (3*0.3,2.5*0.2) to [bend left] (3.5*0.3,4*0.2) ;
\draw [thick,red] (3.5*0.3,4*0.2) to [bend left] (4.3*0.3,2*0.2) ;
\draw [thick,red] (4.3*0.3,2*0.2) to [bend right] (5*0.3,1.5*0.2) ;
% d-e
\draw [thick,red] (5*0.3,1.5*0.2) to [bend right] (6.2*0.3,7*0.2) ;
\draw [thick,red] (6.2*0.3,7*0.2) to [bend right] (6.5*0.3,7*0.2) ;
% e-f
\draw [thick,red] (6.5*0.3,7*0.2) to [bend left] (7*0.3,5*0.2) ;
\draw [thick,red] (7*0.3,5*0.2) to [bend right] (7.5*0.3,4*0.2) ;
\draw [thick,red] (7.5*0.3,4*0.2) to [bend right] (8*0.3,4*0.2) ;
%
% f-g
\draw [thick,red] (8*0.3,4*0.2) to [bend right] (8*0.3,10*0.2) ;
\draw [thick,red] (8*0.3,10*0.2) to [bend left] (8.7*0.3,10*0.2) ;
% g-h
\draw [thick,red] (8.7*0.3,10*0.2) to [bend left] (9.7*0.3,9.4*0.2) ;
\node [font=\footnotesize] at (1.5,-0.5) {过拟合};
\end{tikzpicture} \\
\end{tabular}
%\vspace{-0.5em}
%\begin{itemize}
%\item 正则化可以帮助降低模型复杂度,简单来说,正则化是一种为了减小测试误差的行为,主要的手段包括:
% \begin{itemize}
% \item 在损失函数上增加正则化项:$\overline{Loss} = Loss + \lambda \times \Omega (\omega)$,改变模型的拟合状态
% \item 数据增强,Early stop技术
% \item dropout 和 label smoothing
% \end{itemize}
%\end{itemize}
\begin{itemize}
%\item 神经机器翻译模型十分复杂,由于观测数据的不充分,及数据中存在的噪声,导致其求解十分不稳定,产生过拟合的现象。
\item 正则化是机器学习中的经典技术,通常用于缓解过拟合问题,即模型参数过度拟合噪声数据,因此正则化也常被称作降噪。
\item 常用的正则化方法包括:调整训练目标、标签平滑、dropout。。。
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{正则化-优化训练目标}
\begin{itemize}
\item 正则化的一种实现是在训练目标中引入一个正则项。在神经机器翻译中,引入正则项的训练目标为:
\begin{center}
$\widehat{\mathbf{w}}=\underset{\mathbf{w}}{\operatorname{argmin}} L(\mathbf{w})+\lambda R(\mathbf{w})$
\end{center}
\item $L(\mathbf{w})$是损失函数,通过$\lambda$控制正则化项$R(\mathbf{w})$的强度
\item 常用的正则化项包括:
\begin{itemize}
\item L1正则化:$R(\mathbf{w})=|| \mathbf{w}||_{1}=\sum_{w_{i}}\left|\mathrm{w}_{i}\right|$
\item L2正则化:$R(\mathbf{w})=\left(|| \mathbf{w}||_{2}\right)^{2}=\sum_{w_{i}} \mathrm{w}_{i}^{2}$
\end{itemize}
\end{itemize}
\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\node[minimum width=2em,minimum height=2em,draw,rotate=45,thick] (node1) at (0,0) {};
\node[minimum size=3em,circle,draw,thick] (node2) at (12em,0) {};
\node[font=\scriptsize] (w1) at (5.5em,-0.5em) {${w_{1}}$};
\node[font=\scriptsize] (w2) at (-0.5em,3.5em) {${w_{2}}$};
\node[font=\scriptsize] (w3) at (17em,-0.5em) {${w_{1}}$};
\node[font=\scriptsize] (w4) at (11.5em,3.5em) {${w_{2}}$};
\draw [thick,purple,rotate=-20] ([xshift=2em,yshift=1.8em]node1.north) ellipse [x radius=3em, y radius=1em];
\draw [thick,purple,rotate=-20] ([xshift=2em,yshift=1.0em]node2.north) ellipse [x radius=3em, y radius=1em];
\draw[->,thick] (-2em,0) -- (6em,0);
\draw[->,thick] (0,-2em) -- (0,4em);
\draw[->,thick] (9.5em,0) -- (17.5em,0);
\draw[->,thick] (12em,-2em) -- (12em,4em);
\node[font=\tiny] (line1) at ([xshift=-2em,yshift=4em]w3.east) {优化目标的等值线};
\node[font=\tiny] (line2) at ([xshift=-14em,yshift=4em]w3.east) {优化目标的等值线};
\node[font=\tiny] (line3) at ([xshift=-3em,yshift=-1em]w3.east) {L2正则化图像};
\node[font=\tiny] (line4) at ([xshift=-15em,yshift=-1em]w3.east) {L1正则化图像};
\node[font=\tiny] (line5) at (1em,-3em) {最优解处$w_1=0$,L1正则化可以使参数矩阵更稀疏};
\node[font=\tiny,anchor=west] (line6) at ([xshift=1em]line5.east) {切线处为最优解,L2正则化帮助缓解过拟合};
%\node[font=\tiny,anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {交点为最优解点};
%\node[font=\tiny,anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {L1正则化可以使参数矩阵稀疏};
%\node[font=\tiny,anchor=north west] (line5) at ([yshift=0.3em]line4.south west) {,一定程度上缓解过拟合};
%\node[font=\tiny,anchor=north west] (line6) at ([yshift=0.3em]line5.south west) {L2正则化缓解过拟合效果优于L1};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{正则化-优化训练目标}
\begin{itemize}
\item 引入正则化项可以看作是使用了一种先验知识,使模型在求解时参数不会偏离0点太多,降低了模型的复杂度。
\end{itemize}
\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\node [] (mat1) at (0,0) {\scriptsize{$
\begin{bmatrix}
.1 & -4 & \cdots & 2 \\
5 & 2 & \cdots & .2 \\
2 & .1 & \cdots & .3 \\
\vdots & \vdots & \ddots & \vdots \\
0 & .8 & \cdots & 4 \\
-2 & .3 & \cdots & .1
\end{bmatrix}
$}};
\node [anchor=west] (mat2) at ([xshift=4em]mat1.east) {\scriptsize{$
\begin{bmatrix}
0.11 & -0.4 & \cdots & 0.2 \\
0.04 & 0.12 & \cdots & 0 \\
0.23 & -0.1 & \cdots & 0.03 \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0.28 & \cdots & -0.31 \\
-0.13 & 0.27 & \cdots & 0.01
\end{bmatrix}
$}};
\draw [decorate,decoration={brace,mirror}] ([shift={(6pt,2pt)}]mat1.south west) to node [auto,swap,font=\scriptsize] {标准模型参数} ([shift={(-6pt,2pt)}]mat1.south east);
\draw [decorate,decoration={brace,mirror}] ([shift={(6pt,2pt)}]mat2.south west) to node [auto,swap,font=\scriptsize] {引入L2正则化} ([shift={(-6pt,2pt)}]mat2.south east);
\end{tikzpicture}
\end{center}
\vspace{-1em}
\begin{itemize}
\item 定义不同的正则化项,使模型偏向于我们希望的方向。
\begin{itemize}
\item 通过约束层输出差异,增强模型的表现力
\item 修正注意力的分布,使其更关注全局或局部的信息
\item 增加层之间的辅助损失,加快模型收敛等
\end{itemize}
\item 除了将正则化项与损失线性结合的方式,我们还可以通过调整标准答案的方式来引入先验知识,如标签平滑
%\item 神经机器翻译系统对目标语的每个位置预测一个概率分布,表示词表中每个单词在该位置出现的可能性。
\end{itemize}
\end{frame}
\begin{frame}{正则化-标签平滑}
\begin{itemize}
\item 神经机器翻译系统对目标语的每个位置预测一个概率分布,表示词表中每个单词在该位置出现的可能性。
\item 通过计算当前位置的分布与标准答案的差异作为损失
\item 使用one-hot向量作为标准答案,不考虑类别间的相关性
\visible<2->{\item label smoothing为所有位置分配了概率}
\end{itemize}
\begin{center}
\begin{tikzpicture}
%\node[rounded corners=3pt,minimum width=4.0em,minimum height=4.0em,font=\scriptsize,fill=green!10,drop shadow,draw=ugreen] (model) at (0,0) {Model};
\node[font=\tiny] (model) at (0,0) {Model out:};
%\node[font=\tiny] (onehot) at (0,0) {Model out:};
\visible<2->{\node[anchor=north west,font=\tiny] (label_smooth) at ([yshift=-1.8em]model.south west) {label smoothing:};}
\node[anchor=south west,font=\tiny] (one-hot) at ([yshift=2em]model.north west) {one hot:};
%model out
%\node [anchor=west,font=\tiny] (model_out) at ([xshift=1em]model.east) {model out:};
\node [anchor=west,minimum width=1em,minimum height=0.2em,fill=ublue!80,inner sep=0pt] (model_label1) at ([xshift=1.5em]model.east) {};
\node [anchor=south,font=\tiny] (model_w1) at (model_label1.north) {$p_{1}$};
\node [anchor=south west,minimum width=1em,minimum height=0.1em,fill=ublue!80,inner sep=0pt] (model_label2) at (model_label1.south east) {};
\node [anchor=south,font=\tiny] (model_w2) at (model_label2.north) {$p_{2}$};
\node [anchor=south west,minimum width=1em,minimum height=0.7em,fill=ublue!80,inner sep=0pt] (model_label3) at (model_label2.south east) {};
\node [anchor=south,font=\tiny] (model_w3) at (model_label3.north) {{\color{red} $p_{3}$}};
\node [anchor=south west,minimum width=1em,minimum height=0.4em,fill=ublue!80,inner sep=0pt] (model_label4) at (model_label3.south east) {};
\node [anchor=south,font=\tiny] (model_w5) at (model_label4.north) {$p_{4}$};
\node [anchor=south west,minimum width=1em,minimum height=0.1em,fill=ublue!80,inner sep=0pt] (model_label5) at (model_label4.south east) {};
\node [anchor=south,font=\tiny] (model_w6) at (model_label5.north) {$p_{5}$};
\node [anchor=south west,minimum width=1em,minimum height=0.3em,fill=ublue!80,inner sep=0pt] (model_label6) at (model_label5.south east) {};
\node [anchor=south,font=\tiny] (model_w7) at (model_label6.north) {$p_{6}$};
\node [anchor=south west,minimum width=1em,minimum height=0.2em,fill=ublue!80,inner sep=0pt] (model_label7) at (model_label6.south east) {};
\node [anchor=south,font=\tiny] (model_w8) at (model_label7.north) {$p_{7}$};
%no label smooth
\node [anchor=west,minimum width=1em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label1) at ([xshift=1.5em,yshift=3em]model.east) {};
\node [anchor=south,font=\tiny] (one_hot_w1) at (one_hot_label1.north) {$0$};
\node [anchor=south west,minimum width=1em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label2) at (one_hot_label1.south east) {};
\node [anchor=south,font=\tiny] (one_hot_w2) at (one_hot_label2.north) {$0$};
\node [anchor=south west,minimum width=1em,minimum height=1.5em,fill=orange!50,inner sep=0pt] (one_hot_label3) at (one_hot_label2.south east) {};
\node [anchor=south,font=\tiny] (one_hot_w3) at (one_hot_label3.north) {{\color{red} $1$}};
\node [anchor=south west,minimum width=1em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label4) at (one_hot_label3.south east) {};
\node [anchor=south,font=\tiny] (one_hot_w4) at (one_hot_label4.north) {$0$};
\node [anchor=south west,minimum width=1em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label5) at (one_hot_label4.south east) {};
\node [anchor=south,font=\tiny] (one_hot_w5) at (one_hot_label5.north) {$0$};
\node [anchor=south west,minimum width=1em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label6) at (one_hot_label5.south east) {};
\node [anchor=south,font=\tiny] (one_hot_w6) at (one_hot_label6.north) {$0$};
\node [anchor=south west,minimum width=1em,minimum height=0.05em,fill=orange!50,inner sep=0pt,font=\tiny] (one_hot_label7) at (one_hot_label6.south east) {};
\node [anchor=south,font=\tiny] (one_hot_w7) at (one_hot_label7.north) {$0$};
%label smoothing
\visible<2->{
\node [anchor=west,minimum width=1em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label1) at ([xshift=1.5em,yshift=-3em]model.east) {};
\node [anchor=south,font=\tiny] (w1) at (label1.north) {$0.1$};
\node [anchor=south west,minimum width=1em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label2) at (label1.south east) {};
\node [anchor=south,font=\tiny] (w2) at (label2.north) {$0.1$};
\node [anchor=south west,minimum width=1em,minimum height=0.8em,fill=red!50,inner sep=0pt] (label3) at (label2.south east) {};
\node [anchor=south,font=\tiny] (w3) at (label3.north) {{\color{red} $0.4$}};
\node [anchor=south west,minimum width=1em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label4) at (label3.south east) {};
\node [anchor=south,font=\tiny] (w5) at (label4.north) {$0.1$};
\node [anchor=south west,minimum width=1em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label5) at (label4.south east) {};
\node [anchor=south,font=\tiny] (w6) at (label5.north) {$0.1$};
\node [anchor=south west,minimum width=1em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label6) at (label5.south east) {};
\node [anchor=south,font=\tiny] (w7) at (label6.north) {$0.1$};
\node [anchor=south west,minimum width=1em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label7) at (label6.south east) {};
\node [anchor=south,font=\tiny] (w8) at (label7.north) {$0.1$};
}
\visible<2->{\node[font=\scriptsize] (line1) at ([xshift=9em,yshift=-1.5em]model_label7.east) {$loss =-0.3 \log p_{3}-\sum_{i=1}^{7} 0.1 \log p_{i}$};}
\node[font=\scriptsize] (line2) at ([xshift=5.9em,yshift=3.5em]model_label7.east) {$loss =-\log p_{3}$};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=red] [fit = (one_hot_label1) (one_hot_w3) (one_hot_label7) (model_label1) (model_label7)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!10,drop shadow,draw=purple] [fit = (line2)] (box3) {};
\draw [->,dotted,very thick,red] ([yshift=-1em]box1.east) .. controls +(east:1) and +(west:1) .. (box3.west);
\visible<2->{
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (label1) (label7) (model_label1) (model_label7) (model_w3)] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line1)] (box4) {};
\draw [->,dotted,very thick,ugreen] ([yshift=1em]box2.east) .. controls +(east:1) and +(west:1) .. (box4.west);
}
\end{pgfonlayer}
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{正则化-标签平滑}
\begin{itemize}
\item 可以看到,label smoothing不会独享所有的概率。
\item 在考虑正确答案的同时,也考虑了类别之间的相关性,提高了模型的泛化能力。
\item 标签平滑的实现特别简单,只需要将真实答案独享的概率拿出一部分分享给所有的标签。\\
\begin{center}
$y_{j}^{\mathrm{ls}}=(1-\alpha) \cdot \tilde{y}_{j}+\alpha \cdot q$
\end{center}
\item 其中$\tilde{y}_{j}$为对$j$位置预测时的真实分布,即one-hot向量,$q$则是在词表大小$V$上的均匀分布
\end{itemize}
\begin{center}
\begin{tikzpicture}
\node[font=\scriptsize,rounded corners=1pt,minimum width=1em,fill=purple!10,drop shadow,draw,thick] (node1) at (0,0) {\begin{tabular}{c}0.0 \\0.0 \\1.0 \\0.0 \\0.0 \\0.0 \\0.0\end{tabular}};
%\node[font=\scriptsize,anchor=north] (line2) at ([yshift=-0.5em]line1.south) {[ 1/7 1/7 1/7 1/7 1/7 1/7 1/7 ]};
\node[font=\scriptsize,anchor=east] (1-alpha) at (node1.west) {$(1-0.7)*$};
\node[font=\scriptsize,anchor=west] (alpha) at (node1.east) {$+0.7*$};
\node[font=\scriptsize,rounded corners=1pt,minimum width=1em,anchor=west,fill=green!10,drop shadow,draw,thick] (node2) at (alpha.east) {\begin{tabular}{c}1/7 \\1/7 \\1/7 \\1/7 \\1/7 \\1/7 \\1/7\end{tabular}};
\node[font=\scriptsize,anchor=west] (=) at (node2.east) {$=$};
\node[font=\scriptsize,rounded corners=1pt,minimum width=1em,anchor=west,fill=blue!10,drop shadow,draw,thick] (node3) at (=.east) {\begin{tabular}{c}0.1 \\0.1 \\0.4 \\0.1 \\0.1 \\0.1 \\0.1\end{tabular}};
\node[font=\scriptsize,anchor=west] (line1) at ([xshift=1em,yshift=2em]node3.east) {$\alpha$表示分出去的概率大小};
\node[font=\scriptsize,anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {$\alpha$越大,得到的分布越软};
\node[font=\scriptsize,anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {更软的分布更适合作为目};
\node[font=\scriptsize,anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {标分布,可以缓解数据中};
\node[font=\scriptsize,anchor=north west] (line5) at ([yshift=0.3em]line4.south west) {噪声的影响};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{正则化-dropout}
\begin{itemize}
\item Dropout通过在每一次的迭代中,只激活一部分节点,屏蔽其他神经元。相当于每次迭代训练的都是不一样的网络,从而降低节点之间的关联性以及模型的复杂度,达到正则化的效果。
\end{itemize}
\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\def\neuronsep{1}
\tikzstyle{neuronnode} = [minimum size=1.0em,circle,draw,thick,ublue,inner sep=1pt, fill=white,align=center]
%standard
\foreach \n in {1,...,4}{
\node [neuronnode] (neuron0\n) at (0,\n * \neuronsep) {};
}
\foreach \n in {1,...,4}{
\node [neuronnode] (neuron1\n) at (1.2\neuronsep ,\n * \neuronsep) {};
}
\foreach \n in {1,...,4}{
\node [neuronnode] (neuron2\n) at (2.4*\neuronsep ,\n * \neuronsep) {};
}
\node [neuronnode] (neuron3) at (3.6*\neuronsep ,2.5 * \neuronsep) {};
\foreach \n in {1,...,4}{
\foreach \m in {1,...,4}{
\draw [->] (neuron0\n.east) -- (neuron1\m.west);
}
}
\foreach \n in {1,...,4}{
\foreach \m in {1,...,4}{
\draw [->] (neuron1\n.east) -- (neuron2\m.west);
}
}
\foreach \n in {1,...,4}{
\draw [->] (neuron2\n.east) -- (neuron3.west);
}
%drop
%layer1
\foreach \n in {1,3,4}{
\node [neuronnode] (neuron4\n) at (5*\neuronsep,\n * \neuronsep) {};
}
\node [neuronnode,dashed] (neuron42) at (5*\neuronsep,2 * \neuronsep) {};
%layer1
\foreach \n in {1,2,4}{
\node [neuronnode] (neuron5\n) at (6.2*\neuronsep ,\n * \neuronsep) {};
}
\node [neuronnode,dashed] (neuron53) at (6.2*\neuronsep,3 * \neuronsep) {};
%layer3
\foreach \n in {1,4}{
\node [neuronnode] (neuron6\n) at (7.4*\neuronsep ,\n * \neuronsep) {};
}
\node [neuronnode,dashed] (neuron62) at (7.4*\neuronsep ,2 * \neuronsep) {};
\node [neuronnode,dashed] (neuron63) at (7.4*\neuronsep ,3 * \neuronsep) {};
%layer4
\node [neuronnode] (neuron7) at (8.6*\neuronsep ,2.5 * \neuronsep) {};
\foreach \n in {1,3,4}{
\foreach \m in {1,2,4}{
\draw [->] (neuron4\n.east) -- (neuron5\m.west);
}
}
\foreach \n in {1,2,4}{
\foreach \m in {1,4}{
\draw [->] (neuron5\n.east) -- (neuron6\m.west);
}
}
\foreach \n in {1,4}{
\draw [->] (neuron6\n.east) -- (neuron7.west);
}
\end{tikzpicture}
\end{center}
\vspace{-1em}
\begin{itemize}
\item 简单的说就是,在前向传播过程中,以一定的概率$p$激活神经元,使其不过于依赖某些特征,增强模型的泛化能力。
\end{itemize}
\end{frame}
\begin{frame}{正则化-dropout}
\begin{itemize}
\item Dropout的工作流程如下:
\begin{itemize}
\item 以一定的概率$p$随机的激活隐藏层神经元,掩盖其他位置神经元
\item 得到输入后进行训练,更新被激活的神经元参数
\item 恢复被掩盖的神经元,此时,被激活的神经元已经更新
\item 不断重复此过程,直到训练结束
\end{itemize}
\end{itemize}
\begin{center}
\vspace{-1em}
\begin{tikzpicture}
\def\neuronsep{1.5}
\def\nodespace{1}
\def\picturespace{0.8}
\tikzstyle{neuronnode} = [minimum size=1.8em,circle,draw,very thick,ublue,inner sep=0pt, fill=white,align=center]
%standard
\node [neuronnode] (neuron_b) at (0,0) {\scriptsize{$b_{i}^{l}$}};
\node [neuronnode] (neuron_y3) at (0,-1*\neuronsep) {\scriptsize{$x_{3}^{l}$}};
\node [neuronnode] (neuron_y2) at (0,-2*\neuronsep) {\scriptsize{$x_{2}^{l}$}};
\node [neuronnode] (neuron_y1) at (0,-3*\neuronsep) {\scriptsize{$x_{1}^{l}$}};
\node [neuronnode] (neuron_z) at (1.2 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$z_{i}^{l+1}$}};
\node [neuronnode] (neuron_y') at (2.4 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$x_{i}^{l+1}$}};
\node [anchor=north,ublue] (standard) at ([yshift=-4em]neuron_z.south) {\scriptsize{standard}};
\node [ublue] (standard) at ([xshift=-1em]neuron_z.west) {\scriptsize{$\mathbf{w}_{i}^{l}$}};
\node [ublue] (standard) at ([xshift=0.6em,yshift=0.3em]neuron_z.east) {\scriptsize{$f$}};
\draw [->,line width=0.3mm] (neuron_b.east) -- (neuron_z.west);
\draw [->,line width=0.3mm] (neuron_y3.east) -- (neuron_z.west);
\draw [->,line width=0.3mm] (neuron_y2.east) -- (neuron_z.west);
\draw [->,line width=0.3mm] (neuron_y1.east) -- (neuron_z.west);
\draw [->,line width=0.3mm] (neuron_z.east) -- (neuron_y'.west);
%dropout
\node [neuronnode] (drop_neuron_b) at (4.7*\nodespace,0) {\scriptsize{$b_{i}^{l}$}};
\node [neuronnode] (drop_neuron_y3') at (4.7*\nodespace,-1*\neuronsep) {\scriptsize{$\tilde{x}_{3}^{l}$}};
\node [neuronnode] (drop_neuron_y2') at (4.7*\nodespace,-2*\neuronsep) {\scriptsize{$\tilde{x}_{2}^{l}$}};
\node [neuronnode] (drop_neuron_y1') at (4.7*\nodespace,-3*\neuronsep) {\scriptsize{$\tilde{x}_{1}^{l}$}};
\node [neuronnode] (drop_neuron_z) at (5.9 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$z_{i}^{l+1}$}};
\node [neuronnode] (drop_neuron_y') at (7.1 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$x_{i}^{l+1}$}};
\node [neuronnode] (drop_neuron_y3) at (3.5*\nodespace,-1*\neuronsep) {\scriptsize{$x_{3}^{l}$}};
\node [neuronnode] (drop_neuron_y2) at (3.5*\nodespace,-2*\neuronsep) {\scriptsize{$x_{2}^{l}$}};
\node [neuronnode] (drop_neuron_y1) at (3.5*\nodespace,-3*\neuronsep) {\scriptsize{$x_{1}^{l}$}};
\node [neuronnode] (drop_neuron_r3) at (4.1*\nodespace,-0.5*\neuronsep) {\scriptsize{$r_{3}^{l}$}};
\node [neuronnode] (drop_neuron_r2) at (4.1*\nodespace,-1.5*\neuronsep) {\scriptsize{$r_{2}^{l}$}};
\node [neuronnode] (drop_neuron_r1) at (4.1*\nodespace,-2.5*\neuronsep) {\scriptsize{$r_{1}^{l}$}};
\node [anchor=north,ublue] (standard) at ([yshift=-4em]drop_neuron_z.south) {\scriptsize{dropout}};
\node [ublue] (standard) at ([xshift=-1em]drop_neuron_z.west) {\scriptsize{$\mathbf{w}_{i}^{l}$}};
\node [ublue] (standard) at ([xshift=0.6em,yshift=0.3em]drop_neuron_z.east) {\scriptsize{$f$}};
%structure
\draw [->,line width=0.3mm] (drop_neuron_b.east) -- (drop_neuron_z.west);
\draw [->,line width=0.3mm] (drop_neuron_y3'.east) -- (drop_neuron_z.west);
\draw [->,line width=0.3mm] (drop_neuron_y2'.east) -- (drop_neuron_z.west);
\draw [->,line width=0.3mm] (drop_neuron_y1'.east) -- (drop_neuron_z.west);
\draw [->,line width=0.3mm] (drop_neuron_z.east) -- (drop_neuron_y'.west);
%r
\draw [->,line width=0.3mm] (drop_neuron_y3.east) -- (drop_neuron_y3'.west);
\draw [->,line width=0.3mm] (drop_neuron_y2.east) -- (drop_neuron_y2'.west);
\draw [->,line width=0.3mm] (drop_neuron_y1.east) -- (drop_neuron_y1'.west);
\draw [-,line width=0.3mm] (drop_neuron_r3.south) -- ([yshift=-0.9em]drop_neuron_r3.south);
\draw [-,line width=0.3mm] (drop_neuron_r2.south) -- ([yshift=-0.9em]drop_neuron_r2.south);
\draw [-,line width=0.3mm] (drop_neuron_r1.south) -- ([yshift=-0.9em]drop_neuron_r1.south);
%equ
\node [anchor=west,inner sep = 2pt] (line1) at (7.5*\nodespace,0) {未应用dropout:};
\node [anchor=north west,inner sep = 2pt] (line2) at (line1.south west) {$z_{i}^{l+1}=\mathbf{w}_{i}^{l} \mathbf{x}+b_{i}^{l}$};
\node [anchor=north west,inner sep = 2pt] (line3) at (line2.south west) {$x_{i}^{l+1}=f\left(z_{i}^{l+1}\right)$};
\node [anchor=north west,inner sep = 2pt] (line4) at (line3.south west) {应用dropout:};
\node [anchor=north west,inner sep = 2pt] (line5) at (line4.south west) {$r_{j}^{l} \sim$ Bernoulli $(1-p)$};
\node [anchor=north west,inner sep = 2pt] (line6) at (line5.south west) {$\tilde{\mathbf{x}}=\mathbf{r} * \mathbf{x}$};
\node [anchor=north west,inner sep = 2pt] (line7) at (line6.south west) {$z_{i}^{l+1}=\mathbf{w}_{i}^{l} \widetilde{\mathbf{x}}+b_{i}^{l}$};
\node [anchor=north west,inner sep = 2pt] (line8) at (line7.south west) {$x_{i}^{l+1}=f\left(z_{i}^{l+1}\right)$};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{正则化--dropout}
\begin{itemize}
%\item 在伯努利分布中采样掩盖向量r,未被掩盖的位置以$1/(1-p)$进行放缩
\item 在实现中,我们常常用$p$来表示神经元被掩盖的几率
\item 训练阶段,通过dropout的方式类似于训练了不同的网络
\item 推断时为了结果的稳定性,无法随机丢弃神经元,需要对此进行补偿:
\begin{itemize}
\item 推断时对每个神经元的输出乘以$1-p$
\item 训练时,对激活的神经元以$1/(1-p)$进行放缩
\end{itemize}
\item 推断时,激活全部的神经元类似于对很不同结构的网络进行集成,使互为“反向”的状态抵消缓解过拟合现象。
\item 根据数据量和模型结构合理的设置$p$能够有效缓解过拟合问题,提升神经机器翻译的性能。
\end{itemize}
\begin{center}
\begin{tikzpicture}
\tikzstyle{node} = [minimum size=1.0em,circle,draw,thick,inner sep=1pt,align=center]
\node [node,blue!50,fill=blue!30] (state) at (0,0) {};
\draw [->,very thick,red] (state) -- ([xshift = 2em,yshift = 2em]state.west);
\draw [->,very thick,red] (state) -- ([xshift = 2em,yshift = -2em]state.west);
\draw [->,very thick,red] (state) -- ([xshift = -2.8em]state.west);
\node[font=\scriptsize](line) at ([yshift=-2.5em]state.south) {反向状态抵消};
\node[font=\scriptsize] (table) at ([xshift=12em]state.east) {\begin{tabular}{l l l}
\specialrule{1pt}{1pt}{1pt}
Model & Dropout & BLEU \\ \hline
& 0 & 24.6 \\
transformer-base & 0.1 & 25.8 \\
& 0.2 & 25.5 \\
transformer-big & 0.3 & 26.4 \\
\specialrule{1pt}{1pt}{1pt}
\end{tabular}};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{正则化-dropout}
\begin{itemize}
\item dropout的思想同样可以应用在更高的维度,其中最常见的便是对模型结构的drop
\item Transformer的结构由多层堆叠的编码解码层组成,残差连接组合各层的输出,不同层之间会相互影响
\item 在深层Transformer结构中,更容易导致过拟合的现象
\item 我们可以借鉴dropout的思想,对子层结构进行drop
\end{itemize}
\begin{center}
\begin{tikzpicture}
\tikzstyle{sublayernode} = [rectangle,draw,thick,inner sep=3pt,rounded corners=2pt,align=center,minimum height=1.5em,minimum width=1.5em,font=\scriptsize]
\tikzstyle{inputnode} = [rectangle,inner sep=3pt,align=center,font=\tiny]
\tikzstyle{circlenode} = [circle,draw,thick,minimum size=0.3\base,font=\small,inner sep=0pt]
\tikzstyle{mnode} = [circle,thick,minimum size=0.7em,font=\small,inner sep=0pt,draw]
\node[anchor=south west,inputnode] (input) at (0,0) {$x_{i}$};
\node[anchor=west,sublayernode,fill=red!15] (ln) at ([xshift=1.2em]input.east) {LN};
\node[anchor=west,sublayernode,fill=green!15] (fn) at ([xshift=1.2em]ln.east) {F};
\node[anchor=west,mnode] (m) at ([xshift=2em]fn.east) {};
%\node[circlenode] (res) at ([xshift=2.4em]fn.east) {+};
\draw[-] (m.west) -- (m.east);
\draw[-] (m.north) -- (m.south);
\node[anchor=west,sublayernode,fill=red!15] (ln1) at ([xshift=2em]m.east) {LN};
\node[anchor=west,sublayernode,fill=green!15] (fn1) at ([xshift=1.2em]ln1.east) {F};
\node[anchor=west,mnode] (m1) at ([xshift=2em]fn1.east) {};
%\node[circlenode] (res1) at ([xshift=2.4em]fn1.east) {+};
\draw[-] (m1.west) -- (m1.east);
\draw[-] (m1.north) -- (m1.south);
\node[anchor=west,inputnode] (output) at ([xshift=1.2em]m1.east) {$x_{i}^{\prime}$};
\node[anchor=west,inputnode] (legend1) at (8em,-1em) {Standard Pre-Norm};
%\coordinate (mend) at ([xshift=1em]m.west);
\draw[-latex',thick] (input)--(ln);
\draw[-latex',thick] (ln)--(fn);
\draw[-latex',thick] (fn)--(m);
%\draw[-,thick] (mend)--(res);
\coordinate (h) at ([xshift=-0.7em]ln.west);
\draw[-latex',thick,rounded corners] (h) -- ([yshift=1.35em]h.north) -- ([yshift=1em]m.north) -- (m.north);
%\coordinate (mend1) at ([xshift=1.0\hseg]m1.west);
\draw[-latex',thick] (m)--(ln1);
\draw[-latex',thick] (ln1)--(fn1);
\draw[-latex',thick] (fn1)--(m1);
%\draw[-,thick] (mend1)--(res1);
\draw[-latex',thick] (m1)--(output);
\coordinate (h1) at ([xshift=-0.7em]ln1.west);
\draw[-latex',thick,rounded corners] (h1) -- ([yshift=1.35em]h1.north) -- ([yshift=1em]m1.north) -- (m1.north);
%--------------------------------------------------------
\node[anchor=south west,inputnode] (input_2) at (0,-4em) {$x_{i}$};
\node[anchor=west,sublayernode,fill=red!15] (ln_2) at ([xshift=1.2em]input_2.east) {LN};
\node[anchor=west,sublayernode,fill=green!15] (fn_2) at ([xshift=1.2em]ln_2.east) {F};
\node[anchor=west,mnode] (m_2) at ([xshift=2em]fn_2.east) {};
%\node[circlenode] (res_2) at ([xshift=2.4em]fn_2.east) {+};
\draw[-] (m_2.west) -- (m_2.east);
\draw[-] (m_2.north) -- (m_2.south);
\node[anchor=west,sublayernode,fill=red!15] (ln1_2) at ([xshift=2em]m_2.east) {LN};
\node[anchor=west,sublayernode,fill=green!15] (fn1_2) at ([xshift=1.2em]ln1_2.east) {F};
\node[anchor=west,mnode] (m1_2) at ([xshift=2em]fn1_2.east) {};
% \node[circlenode] (res1_2) at ([xshift=2.4em]fn1_2.east) {+};
\draw[-] (m1_2.west) -- (m1_2.east);
\draw[-] (m1_2.north) -- (m1_2.south);
\node[anchor=west,inputnode] (output_2) at ([xshift=1.2em]m1_2.east) {$x_{i}^{\prime}$};
\node[anchor=west,inputnode] (legend2) at (6.5em,-5em) {Pre-Norm with Skip sub-layer};
\node[anchor=south west,inputnode,red,font=\tiny] (mlable) at ([xshift=-2.2em,yshift=-0.6em]m_2.south) {M=1};
\node[anchor=south west,inputnode,red,font=\tiny] (mlable1) at ([xshift=-2.2em,yshift=-0.6em]m1_2.south) {M=0};
\coordinate (start_1) at ([xshift=-1.3em]m_2.west);
\coordinate (end_1) at ([xshift=-0.5em]m_2.west);
%\node[red,font=\scriptsize] (dot1) at (start_1) {$\cdot$};
\draw[->,thick] (input_2)--(ln_2);
\draw[->,thick] (ln_2)--(fn_2);
\draw[->,thick] (fn_2)--(start_1);
\draw[-,thick,red] (start_1)--(end_1);
\draw[-,thick] (end_1)--(m_2);
%\draw[-,thick] (mend)--(res);
\coordinate (h_2) at ([xshift=-0.7em]ln_2.west);
\draw[->,thick,rounded corners] (h_2) -- ([yshift=1.35em]h_2.north) -- ([yshift=1em]m_2.north) -- (m_2.north);
%\coordinate (mend1) at ([xshift=1.0\hseg]m1.west);
\coordinate (start_2) at ([xshift=-1.3em]m1_2.west);
\coordinate (end_2) at ([xshift=-0.5em]m1_2.west);
\draw[->,thick] (m_2)--(ln1_2);
\draw[->,thick] (ln1_2)--(fn1_2);
\draw[->,thick] (fn1_2)--(start_2);
\draw[-,thick,red] (start_2)--([yshift=0.3em]end_2);
\draw[-,thick] (end_2)--(m1_2);
%\draw[-,thick] (mend1)--(res1);
\draw[->,thick] (m1_2)--(output_2);
\coordinate (h1_2) at ([xshift=-0.7em]ln1_2.west);
\draw[->,thick,rounded corners] (h1_2) -- ([yshift=1.35em]h1_2.north) -- ([yshift=1em]m1_2.north) -- (m1_2.north);
\end{tikzpicture}
\end{center}
\vspace{-1.5em}
\begin{itemize}
\item 对层结构的dropout,能够有效缓解深层网络中的过拟合现象,提升模型性能
\end{itemize}
\end{frame}
%%%-----------------------------------------------------------------------------------------------------------
%%% 多模型集成(如何生成多样的模型、如何集成)
% 多模型集成:动机与候选模型的构造方法
% 注:此前此处有两个连续的 \begin{frame}(旧标题 "ensemble" 未删除),
% beamer 的 frame 环境不可嵌套,会导致编译错误,故只保留一个。
\begin{frame}{多模型集成}
\begin{itemize}
\item 不同模型在训练的过程中学习到的侧重面不一样。
\begin{itemize}
\item 模型A:短语翻译的好
\item 模型B:单词翻译的好
\item 模型C:句子流畅度更好
\end{itemize}
\item 模型集成便是通过对多个翻译模型的结果进行融合,各取所长,提升性能。
\end{itemize}
\vspace{-1.0em}
\begin{center}
% 示意图:n 个独立模型的输出汇聚为一个集成模型
\begin{tikzpicture}
\node[rounded corners=3pt,minimum width=3.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\scriptsize ](model1) at (0,0) {Model-1};
\node[rounded corners=3pt,minimum width=3.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\scriptsize ](model2) at ([xshift=3em]model1.east) {Model-2};
\node[minimum width=2.0em,minimum height=2.0em,font=\scriptsize ] (dot) at ([xshift=1em]model2.east) {...};
\node[rounded corners=3pt,minimum width=3.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\scriptsize ] (modeln) at ([xshift=2em]dot.east) {Model-$n$};
\node[rounded corners=3pt,minimum width=3.0em,minimum height=2.0em,draw,thick,fill=purple!10!white,font=\scriptsize ] (Ensemble) at ([xshift=5em]modeln.east) {Ensemble Model
};
\draw [<-,thick] (Ensemble.north) .. controls +(90:2em) and +(90:2em) .. (model1.north);
\draw [<-,thick] (Ensemble.north) .. controls +(90:1.5em) and +(90:1.5em) .. (model2.north);
\draw [<-,thick] (Ensemble.north) .. controls +(90:1em) and +(90:1em) .. (modeln.north);
\end{tikzpicture}
\end{center}
\vspace{-1.0em}
\begin{itemize}
\item 如何构造多个候选模型:
\begin{itemize}
\item 通过不同的随机种子进行初始化,增加模型的多样性
\item 构造不同的模型结构,比如,相对位置建模,动态层聚合等
\item 使用不同的数据集
\end{itemize}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 自左向右解码 vs. 自右向左解码
\begin{frame}{多模型集成}
\begin{itemize}
\item 在机器翻译中如何实现多模型集成?
\begin{itemize}
\item 检查点平均:对单个模型训练不同时刻保存的参数状态进行平均
\item 预测分布平均:对多个不同结构预测的概率分布进行平均
\begin{center}
\begin{tikzpicture}
\node (one) at (0,0) {\scriptsize{$\begin{bmatrix}
Have, & 0.5 \\
has, & 0.1 \\
it,& 0.03 \\
fun,& 0.1 \\
the, & 0.01\\
\cdots & \cdots \\
\end{bmatrix}$
}};
\node (two) at ([xshift=3em]one.east) {\scriptsize{$\begin{bmatrix}
Have, & 0.5 \\
has, & 0.3 \\
it,& 0.04 \\
fun,& 0.15 \\
the, & 0.03 \\
\cdots & \cdots \\
\end{bmatrix}$
}};
\node(three) at ([xshift=3em]two.east) {\scriptsize{$\begin{bmatrix}
Have, & 0.1 \\
has, & 0.2 \\
it,& 0.05 \\
fun,& 0.4 \\
the, & 0.07 \\
\cdots & \cdots \\
\end{bmatrix}$
}};
\node (four) at ([xshift=3em]three.east) {\scriptsize{$\begin{bmatrix}
Have, & 0.03 \\
has, & 0.02 \\
it,& 0.15 \\
fun,& 0.6 \\
the, & 0.1 \\
\cdots & \cdots \\
\end{bmatrix}$
}};
\node (a) at ([xshift=1em]four.east) {$\cdots$};
\node[minimum width=2.0em,minimum height=2.0em,font=\scriptsize] (have) at ([xshift=3em,yshift=2em]one.north) {Have};
\node[minimum width=2.0em,minimum height=2.0em,font=\scriptsize] (fun) at ([xshift=3em,yshift=2em]three.north) {fun};
\draw [->,thick] (one.north) -- ([xshift=-0.2em]have.south) ;
\draw [->,thick] (two.north) -- ([xshift=0.2em]have.south) ;
\draw [->,thick] (three.north) -- ([xshift=-0.2em]fun.south) ;
\draw [->,thick] (four.north) -- ([xshift=0.2em]fun.south) ;
\node[rounded corners=3pt,minimum width=4.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\scriptsize](model1) at ([yshift=-2em]one.south) {Model-1};
\node[rounded corners=3pt,minimum width=4.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\scriptsize](model2) at ([yshift=-2em]two.south) {Model-2};
\node[rounded corners=3pt,minimum width=4.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\scriptsize](model3) at ([yshift=-2em]three.south) {Model-1};
\node[rounded corners=3pt,minimum width=4.0em,minimum height=2.0em,draw,thick,fill=blue!10!white,font=\scriptsize](model4) at ([yshift=-2em]four.south) {Model-2};
\node[minimum width=2.0em,minimum height=2.0em,font=\scriptsize] (eos1) at ([xshift=0em,yshift=-4em]model1.north) {EOS};
\node[minimum width=2.0em,minimum height=2.0em,font=\scriptsize] (eos2) at ([xshift=0em,yshift=-4em]model2.north) {EOS};
\node[minimum width=2.0em,minimum height=2.0em,font=\scriptsize] (have1) at ([xshift=0em,yshift=-4em]model3.north) {Have};
\node[minimum width=2.0em,minimum height=2.0em,font=\scriptsize] (have2) at ([xshift=0em,yshift=-4em]model4.north) {Have};
\draw [->,thick] (model1.north) -- (one.south) ;
\draw [->,thick] (model2.north) -- (two.south) ;
\draw [->,thick] (model3.north) -- (three.south) ;
\draw [->,thick] (model4.north) -- (four.south) ;
\draw [->,thick] (eos1.north) -- (model1.south) ;
\draw [->,thick] (eos2.north) -- (model2.south) ;
\draw [->,thick] (have1.north) -- (model3.south) ;
\draw [->,thick] (have2.north) -- (model4.south) ;
%\draw [->,thick] (have.east) .. controls ([xshift=9.5em,yshift=-0.0em]have.east) and +(west:5em) .. (have1.west) ;
\draw [->,thick] (have.east) .. controls +(east:9.5em) and +(west:5em) .. (have1.west) ;
\end{tikzpicture}
\end{center}
\end{itemize}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 自左向右解码 vs. 自右向左解码
% 多模型集成:基于贪婪搜索的集成组合选择
% 注:此前此处有两个连续的 \begin{frame}(旧标题 "R2L" 未删除),
% beamer 的 frame 环境不可嵌套,会导致编译错误,故只保留一个。
\begin{frame}{多模型集成}
\begin{itemize}
%\item 在比赛中,先对候选模型进行检查点平均,再整合其输出的概率分布进行解码。
%\item 在通过概率分布进行多模型的集成时,我们往往训练大量的候选模型,不断的尝试各种组合,从中挑选出性能较强的组合,作为最终的系统进行预测。
\item 多模型的集成时,需要在众多的候选模型中选出最佳的集成组合。
\begin{itemize}
\item 由于自回归特性,同时计算多个模型的概率分布,集成效率较慢,盲目尝试时间成本大。
\item 这里给出一种基于贪婪搜索的一种简单算法以便快速搜索出优秀的模型组合进行集成:
\end{itemize}
%\begin{itemize}
% \item 按性能和多样性挑选出n个候选
% \item 遍历所有的m(m$<$n)个模型的组合,挑选出最佳的m个模型组合
% \item 在m个模型的基础上不断增加模型进行集成,直到集成性能不再提升
%\end{itemize}
\begin{center}
\begin{tikzpicture}
\setlength{\base}{1.5em}
\tikzstyle{samplenode} = [rounded corners=1pt,minimum size=1\base,draw,inner sep=3pt,outer sep=0pt,fill=blue!10!white,font=\tiny]
% 左侧:候选模型堆叠
\node [samplenode] (modeln) at (0,0) {Model-$n$};
\node [rotate=20] (model6) at ([xshift = 0.2em,yshift=0.1em]modeln.south west) {...};
%\node [samplenode] (model5) at ([xshift = 0.8em,yshift=0.5em]model6.south west) {model-5};
\node [samplenode] (model3) at ([xshift = -1.2em,yshift=-0.8em]modeln.south west) {Model-3};
\node [samplenode] (model2) at ([xshift = 0.6em,yshift=0.5em]model3.south west) {Model-2};
\node [samplenode] (model1) at ([xshift = 0.6em,yshift=0.5em]model2.south west) {Model-1};
%\node [samplenode] (model1) at ([xshift = 0.8em,yshift=0.5em]model2.south west) {model-1};
\draw [decorate,decoration={brace}] (modeln.south east) to node [auto,rotate=20,anchor=north,font=\tiny] {候选模型} (model1.south east);
% 中间:贪婪搜索的两个步骤说明
\node [font=\tiny,anchor=west] (line1) at ([xshift=4.8em,yshift=2.75em]model3.east) {1.遍历所有的$m$($m<n$)个模型的组合,};
\node[font=\tiny,anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {挑选出最佳的$m$个模型组合};
\node [font=\tiny,anchor=west] (line3) at ([xshift=4.8em,yshift=-0.75em]model3.east) {2.在$m$个模型的基础上不断增加模型进};
\node[font=\tiny,anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {行集成,直到集成性能不再提升};
% 右侧:最终参与集成解码的模型组合
\node [samplenode] (e7) at ([xshift = 17em,yshift= -0.8em]modeln.east) {Model-7};
\node [samplenode] (e4) at ([xshift = 0.6em,yshift=0.5em]e7.south west) {Model-4};
\node [samplenode] (e3) at ([xshift = 0.6em,yshift=0.5em]e4.south west) {Model-3};
\node [samplenode] (e1) at ([xshift = 0.6em,yshift=0.5em]e3.south west) {Model-1};
\draw [decorate,decoration={brace}] (e7.south east) to node [auto,rotate=20,anchor=north,font=\tiny] {集成解码} (e1.south east);
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=ugreen!10,drop shadow,draw=ugreen] [fit = (line1) (line2)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=purple] [fit = (line3) (line4)] (box2) {};
\draw [-stealth,line width=.05cm,ugreen!40] ([xshift=2.8em,yshift= 0.5em]model3.east) -- ([xshift=3.8em,yshift= 0.5em]model3.east);
\draw [-stealth,line width=.05cm,ugreen!40] ([xshift=15.2em,yshift= 0.5em]model3.east) -- ([xshift=16.2em,yshift= 0.5em]model3.east);
\node [rectangle,inner sep=0.4em,rounded corners=1pt,very thick,dotted,draw] [fit = (box1) (box2)] (box3) {};
\end{pgfonlayer}
\end{tikzpicture}
\end{center}
\item 不是模型越多,集成性能越好,搜索范围可根据时间和计算成本确定
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 自左向右解码 vs. 自右向左解码
%\begin{frame}{L2R}
%\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 自左向右解码 vs. 自右向左解码
%\begin{frame}{R2L}
%\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 翻译长度控制
\begin{frame}{翻译长度控制}
\end{frame}
%\begin{frame}{翻译长度控制}
%\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 大模型和大批量训练
\begin{frame}{增大模型容量}
\begin{itemize}
%\item 在比赛中,往往数据量巨大,普通的模型不足以充分学习到数据中的表示,因此往往采用更大容量的模型,增强模型的表示能力。
\item 模型容量与性能息息相关,在大规模的数据上训练时需要复杂的模型结构。
\item 增大模型容量的方式主要包括:
\begin{itemize}
\item 通过增大网络的隐层大小,即网络宽度
\item 增加网络的层数,即网络深度
\item 增大输入和输出层,即更大的词表和词向量维度
\end{itemize}
\end{itemize}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\def\hneuron{0.8}
\def\vneuron{0.8}
\def\hmodel{4}
\tikzstyle{neuronnode} = [minimum size=1.0em,circle,draw,thick,inner sep=1pt,align=center,draw,fill=blue!15]
%base
\foreach \n in {0,...,2}{
\node [neuronnode] (base_neuron0_\n) at (\n * \hneuron,0) {};
}
\foreach \n in {0,1}{
\node [neuronnode] (base_neuron1_\n) at (\n * \hneuron + 0.5 * \hneuron,\vneuron) {};
}
\foreach \n in {0,...,2}{
\node [neuronnode] (base_neuron2_\n) at (\n * \hneuron,2*\vneuron) {};
}
\foreach \n in {0,...,2}{
\node [anchor=north] (base_s_\n) at ([yshift=-0.1em]base_neuron0_\n.south) {$s_{\n}$};
\node [anchor=south] (base_t_\n) at ([yshift=0.1em]base_neuron2_\n.north) {$t_{\n}$};
}
\foreach \n in {0,...,2}{
\foreach \m in {0,1}{
\draw [-] (base_neuron0_\n.north) -- (base_neuron1_\m.south);
}
}
\foreach \n in {0,1}{
\foreach \m in {0,...,2}{
\draw [-] (base_neuron1_\n.north) -- (base_neuron2_\m.south);
}
}
\node [anchor=north] (base) at ([yshift=-1.2em]base_neuron0_1.south) {\footnotesize{base}};
%big
\foreach \n in {0,...,2}{
\node [neuronnode] (big_neuron0_\n) at (\hmodel+\n * \hneuron,0) {};
}
\foreach \n in {0,...,3}{
\node [neuronnode] (big_neuron1_\n) at (\hmodel + \n * \hneuron - 0.5 * \hneuron,\vneuron) {};
}
\foreach \n in {0,...,2}{
\node [neuronnode] (big_neuron2_\n) at (\hmodel+\n * \hneuron,2*\vneuron) {};
}
\foreach \n in {0,...,2}{
\node [anchor=north] (big_s_\n) at ([yshift=-0.2em]big_neuron0_\n.south) {$s_{\n}$};
\node [anchor=south] (big_t_\n) at ([yshift=0.2em]big_neuron2_\n.north) {$t_{\n}$};
}
\foreach \n in {0,...,2}{
\foreach \m in {0,...,3}{
\draw [-] (big_neuron0_\n.north) -- (big_neuron1_\m.south);
}
}
\foreach \n in {0,...,3}{
\foreach \m in {0,...,2}{
\draw [-] (big_neuron1_\n.north) -- (big_neuron2_\m.south);
}
}
\node [anchor=north] (big) at ([yshift=-1.2em]big_neuron0_1.south) {\footnotesize{big}};
%deep
\foreach \n in {0,...,2}{
\node [neuronnode] (deep_neuron0_\n) at (2*\hmodel+\n * \hneuron,0) {};
}
\foreach \n in {0,...,1}{
\node [neuronnode] (deep_neuron1_\n) at (2*\hmodel + \n * \hneuron + 0.5 * \hneuron,\vneuron) {};
}
\foreach \n in {0,...,1}{
\node [neuronnode] (deep_neuron2_\n) at (2*\hmodel + \n * \hneuron + 0.5 * \hneuron,2*\vneuron) {};
}
\foreach \n in {0,...,2}{
\node [neuronnode] (deep_neuron3_\n) at (2*\hmodel+\n * \hneuron,3*\vneuron) {};
}
\foreach \n in {0,...,2}{
\node [anchor=north] (deep_s_\n) at ([yshift=-0.2em]deep_neuron0_\n.south) {$s_{\n}$};
\node [anchor=south] (deep_t_\n) at ([yshift=0.2em]deep_neuron3_\n.north) {$t_{\n}$};
}
\foreach \n in {0,...,2}{
\foreach \m in {0,1}{
\draw [-] (deep_neuron0_\n.north) -- (deep_neuron1_\m.south);
}
}
\foreach \n in {0,1}{
\foreach \m in {0,1}{
\draw [-] (deep_neuron1_\n.north) -- (deep_neuron2_\m.south);
}
}
\foreach \n in {0,1}{
\foreach \m in {0,...,2}{
\draw [-] (deep_neuron2_\n.north) -- (deep_neuron3_\m.south);
}
}
\node [anchor=north] (deeq) at ([yshift=-1.2em]deep_neuron0_1.south) {\footnotesize{deep}};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{大模型-big model}
\begin{itemize}
\item 基于Transformer架构,我们增大其宽度的手段是使用Transformer-big模型。
\item 通过增大模型的隐藏层大小,及输入输出层来提升模型容量,提升翻译品质。
\item 除使用不同的参数设置外,transformer-base和big模型采用完全相同的网络结构。
\end{itemize}
\vspace{-0.5em}
\begin{center}
\begin{tabular}{lcc}
\specialrule{1pt}{1pt}{1pt}
& transformer-base & transformer-big \\ \hline
词向量维度 & 512 & 1024 \\
注意力头数 & 8 & 16 \\
隐藏层维度 & 512 & 1024 \\
FFN子层映射维度 & 2048 & 4096 \\
\specialrule{1pt}{1pt}{1pt}
\end{tabular}
\end{center}
\vspace{-1em}
\begin{itemize}
\item 采用transformer-big模型,同时针对不同的数据合理的调整学习率,dropout等参数,能够有效提升模型的性能。
\end{itemize}
\end{frame}
% Slide: deep models — increasing encoder depth as a way to grow model capacity.
% Figure: an N-layer encoder (green) feeding a standard 6-layer decoder (blue).
\begin{frame}{大模型-deep model}
\begin{itemize}
\item 增大模型深度也是常用的提升模型容量的手段之一
\item 对于transformer结构来说,增大模型深度指的是,增加编码端的编码层个数
\end{itemize}
\begin{center}
\begin{tikzpicture}
\setlength{\base}{1.2em}
% node  = encoder layer (green), node2 = decoder layer (blue)
\tikzstyle{node} = [rounded corners=1pt,minimum width=1.2em,minimum height=1.2em,draw,fill=green!30!white]
\tikzstyle{node2} = [rounded corners=1pt,minimum width=1.2em,minimum height=1.2em,draw,fill=blue!30!white]
% encoder chain: enc1..enc6, ellipsis, enc8..enc10 (depth N)
\node[node] (enc1) at (0,0) {};
\node[node] (enc2) at ([xshift = \base]enc1.east) {};
\node[node] (enc3) at ([xshift = \base]enc2.east) {};
\node[node] (enc4) at ([xshift = \base]enc3.east) {};
\node[node] (enc5) at ([xshift = \base]enc4.east) {};
\node[node] (enc6) at ([xshift = \base]enc5.east) {};
\node[] (enc7) at ([xshift = \base]enc6.east) {...};
\node[node] (enc8) at ([xshift = \base]enc7.east) {};
\node[node] (enc9) at ([xshift = \base]enc8.east) {};
\node[node] (enc10) at ([xshift = \base]enc9.east) {};
\node[font=\scriptsize,rotate=270] (src) at ([xshift = -\base]enc1.west) {src};
\draw [->] ([xshift=-0.75em]enc1.west) -- (enc1.west);
\draw [decorate,decoration={brace}] ([yshift=0.3em]enc1.north west) to node [auto,anchor=south,font=\tiny] {Nx} ([yshift=0.3em]enc10.north east);
\draw [->] (enc1.east) -- (enc2.west);
\draw [->] (enc2.east) -- (enc3.west);
\draw [->] (enc3.east) -- (enc4.west);
\draw [->] (enc4.east) -- (enc5.west);
\draw [->] (enc5.east) -- (enc6.west);
\draw [->] (enc8.east) -- (enc9.west);
\draw [->] (enc9.east) -- (enc10.west);
% decoder chain: fixed 6 layers below the encoder
\node[node2,anchor=north] (dec1) at ([yshift=-2em]enc1.south) {};
\node[node2,anchor=north] (dec2) at ([yshift=-2em]enc2.south) {};
\node[node2,anchor=north] (dec3) at ([yshift=-2em]enc3.south) {};
\node[node2,anchor=north] (dec4) at ([yshift=-2em]enc4.south) {};
\node[node2,anchor=north] (dec5) at ([yshift=-2em]enc5.south) {};
\node[node2,anchor=north] (dec6) at ([yshift=-2em]enc6.south) {};
\node[font=\scriptsize,rotate=270] (tgt) at ([xshift = -\base]dec1.west) {tgt};
% fix: this node was also named (tgt), shadowing the one above — renamed to (out)
\node[font=\scriptsize,rotate=270] (out) at ([xshift = \base]dec6.east) {out};
\draw [->] ([xshift=-0.75em]dec1.west) -- (dec1.west);
\draw [->] (dec6.east) -- ([xshift=0.75em]dec6.east);
\draw [decorate,decoration={brace,mirror}] ([yshift=-0.3em]dec1.south west) to node [auto,anchor=north,font=\tiny] {6x} ([yshift=-0.3em]dec6.south east);
\draw [->] (dec1.east) -- (dec2.west);
\draw [->] (dec2.east) -- (dec3.west);
\draw [->] (dec3.east) -- (dec4.west);
\draw [->] (dec4.east) -- (dec5.west);
\draw [->] (dec5.east) -- (dec6.west);
% legend for the two layer colors
\node[node] (enc_legend) at ([xshift = 2\base]enc10.east) {};
\node[node2,anchor=north] (dec_legend) at ([yshift = -\base]enc_legend.south) {};
\node[font=\tiny,anchor=west] (line1) at (enc_legend.east) {:编码层};
% fix: this node was also named (line1), shadowing the one above — renamed to (line2)
\node[font=\tiny,anchor=west] (line2) at (dec_legend.east) {:解码层};
%\node[node] (dec1) at ([xshift=4em]enc1.east) {Decoder};
%\node[node2] (enc2) at ([xshift=4em]dec1.east) {Encoder};
%\node[node] (dec2) at ([xshift=4em]enc2.east) {Decoder};
% route the encoder output down and fan it out to every decoder layer
\coordinate (c1) at ([xshift=1em]enc10.east);
\coordinate (c2) at ([yshift=-1.6em]c1.south);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec1.north) -- (dec1.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec2.north) -- (dec2.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec3.north) -- (dec3.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec4.north) -- (dec4.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec5.north) -- (dec5.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec6.north) -- (dec6.north);
\end{tikzpicture}
\end{center}
\vspace{-1em}
\begin{itemize}
\item 增大transformer模型的编码层深度,通过更多的线性及非线性变换提升编码端的特征抽取能力。
\item 单纯堆叠编码层,无法成功训练,通过合理的手段可以训练50层,甚至100层的模型。
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\begin{frame}{大批量训练及解码}
\begin{itemize}
\item 在人脑对文本进行处理时,通常以句子为单位,机器翻译中则通过批量的方式处理文本
\item 批量的方法即是通过一定的方式将多个句子组合成一个批次送入模型
\item 批量的方法可以用于模型的训练以及解码
\begin{itemize}
\item 在训练时,同时输入多组源语和目标语,计算平均损失,可以节省内存,加快收敛。
\item 解码时可以同时输入多句源语,得到一组译文,可以提高模型解码的效率
\end{itemize}
\end{itemize}
\begin{center}
\begin{tikzpicture}
\tikzstyle{node} = [rounded corners=3pt,minimum height=2.0em,minimum width=3.0em,draw,font=\footnotesize,draw,thick]
\node [node,minimum height=5.0em,minimum width=11.0em,rounded corners=5pt,fill=blue!10!white,draw,thick] (data) at (0,0) {data};
\node [node,minimum height=5.0em,minimum width=11.0em,rounded corners=5pt] (mini-batch) at ([xshift = 8em]data.east) {};
\draw[-stealth,line width=.05cm,black!60] ([xshift=0.25em]data.east) -- ([xshift=-0.25em]mini-batch.west);
\node [anchor=east,node,fill=blue!10!white] (batch1) at ([xshift = 6em,yshift = 1.2em]data.east) {batch};
\node [anchor=east,node,fill=blue!10!white] (batch2) at ([xshift = 9.5em,yshift = 1.2em]data.east) {batch};
\node [anchor=east,node,fill=blue!10!white] (batch3) at ([xshift = 13em,yshift = 1.2em]data.east) {batch};
\node [anchor=east,node,fill=blue!10!white] (batch4) at ([xshift = 6em,yshift = -1.2em]data.east) {batch};
\node [anchor=east,node,fill=blue!10!white] (batch5) at ([xshift = 9.5em,yshift = -1.2em]data.east) {batch};
\node [anchor=east,node,fill=blue!10!white] (batch6) at ([xshift = 13em,yshift = -1.2em]data.east) {batch};
\end{tikzpicture}
\end{center}
\end{frame}
%--------------------
\begin{frame}{大批量-训练}
\begin{itemize}
\item 逐句训练缓慢,在提出批量方法之前一次性把所有样本送入神经网络
\begin{itemize}
\item 在整个语料库计算梯度,梯度方向更为准确
\item 大语料库导致内存爆炸,梯度间差异大难以使用全局学习率
\end{itemize}
\item 如何合理构建batch十分重要,由于不同句子之间的长度有明显的差异,使用padding对空白位置填充
\item 由于padding机制,随机的生成batch会导致padding过多,收敛缓慢,对长度进行排序可有效缓解
\end{itemize}
\begin{center}
\begin{tikzpicture}
\tikzstyle{node} = [minimum height=0.8em,draw,fill=green!20]
\node[node,minimum width=2.0em] (sent1) at (0,0) {};
\node[node,minimum width=5.0em,anchor=north west] (sent2) at (sent1.south west) {};
\node[node,minimum width=1.0em,anchor=north west] (sent3) at (sent2.south west) {};
\node[node,minimum width=3.0em,anchor=north west] (sent4) at (sent3.south west) {};
\node[node,minimum width=4.0em] (sent5) at (12em,0) {};
\node[node,minimum width=4.5em,anchor=north west] (sent6) at (sent5.south west) {};
\node[node,minimum width=4.5em,anchor=north west] (sent7) at (sent6.south west) {};
\node[node,minimum width=5em,anchor=north west] (sent8) at (sent7.south west) {};
%\node[node,minimum width=2.0em] (sent3) at (0,0) {};
%\node[node,minimum width=2.0em] (sent4) at (0,0) {};
\node[font=\scriptsize,anchor=east] (line1) at (sent1.west) {sent1};
\node[font=\scriptsize,anchor=east] (line2) at (sent2.west) {sent2};
\node[font=\scriptsize,anchor=east] (line3) at (sent3.west) {sent3};
\node[font=\scriptsize,anchor=east] (line4) at (sent4.west) {sent4};
\node[font=\scriptsize,anchor=east] (line5) at (sent5.west) {sent1};
\node[font=\scriptsize,anchor=east] (line6) at (sent6.west) {sent2};
\node[font=\scriptsize,anchor=east] (line7) at (sent7.west) {sent3};
\node[font=\scriptsize,anchor=east] (line8) at (sent8.west) {sent4};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent1) (sent2) (sent3) (sent4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent5) (sent6) (sent7) (sent8)] (box2) {};
\end{pgfonlayer}
\node[font=\scriptsize,anchor=west] (node1) at ([yshift=-3.2em]sent1.south) {随机生成};
\node[font=\scriptsize,anchor=west] (node2) at ([xshift=7.5em]node1.east) {排序生成};
\end{tikzpicture}
\end{center}
\vspace{-1.3em}
\begin{itemize}
\item 除排序还有很多的策略用于batch生成,包括课程学习等,多种策略之间可以共同作用
\end{itemize}
\end{frame}
%------------
% Slide: batch size measured in tokens rather than sentences; a larger batch
% combined with a larger learning rate speeds convergence and improves BLEU.
\begin{frame}{大批量-训练}
\begin{itemize}
\item 批量训练中一个重要的概念便是batch size,即每次送入模型的样本数量
\item 除以句子作为度量单位外,常使用单词个数作为单位,batch size即为每次送入模型的单词个数
\item 相比于句子的方式,进一步减少了padding数量,提高计算效率
\item 实际中发现,训练时采用更大批量,配合更大的学习率,在加快模型收敛的同时有效提升了模型的性能。
\end{itemize}
\begin{center}
\begin{tikzpicture}
\tikzstyle{node} = [rounded corners=3pt,minimum height=2em,minimum width=3.0em,fill=blue!10!white,draw,thick,font=\footnotesize]
% left: three batches inside one data container
\node [minimum height=5.0em,minimum width=11.0em,rounded corners=5pt,draw,thick] (data) at (0,0) {};
\node [node] (batch1) at ([xshift = -3.5em]data) {batch};
\node [node] (batch2) at (data) {batch};
\node [node] (batch3) at ([xshift = 3.5em]data) {batch};
% right: results table (batch size / learning rate / BLEU) embedded as a node
\node[] (table) at ([xshift=8em]data.east) {\begin{tabular}{lcl}
\hline
batch & lr & BLEU \\
\hline
4096 & 0.01 & 29.15 \\
8192 & 0.01 & 29.06 \\
8192 & 0.02 & 29.49 \\
\hline
\end{tabular}};
\end{tikzpicture}
\end{center}
\end{frame}
%--------------------
\begin{frame}{大批量-训练}
\begin{itemize}
\item 那么如何在有限的计算资源的情况下,增大batch size?
\begin{itemize}
\item 多GPU以数据并行的方式分布式训练,同步更新,假设单块GPU容纳$m$个词,若拥有$n$块GPU,batch size便等于$m \times n$
\item GPU的数量和内存受限时,可以采用累计梯度的方式
\end{itemize}
\item 累计梯度是迭代多个批次,累积最终梯度进行更新
\begin{itemize}
\item 累积n次,等价于batch size翻n倍
\item 减少设备间的通信,平衡多设备间运算差异,增大计算效率
\end{itemize}
\end{itemize}
\begin{center}
\begin{tikzpicture}
\tikzstyle{node} = [minimum height=0.8em,draw,fill=green!20]
\tikzstyle{legend} = [minimum height=0.8em,minimum width=0.8em,draw]
\tikzstyle{node2} = [minimum width=0.8em,minimum height=3.3em,draw,fill=blue!20]
\node[node,minimum width=2.8em] (node1) at (0,0) {};
\node[node,minimum width=4.0em,anchor=north west] (node2) at (node1.south west) {};
\node[node,minimum width=3.2em,anchor=north west] (node3) at (node2.south west) {};
\node[node,minimum width=3.0em,anchor=north west] (node4) at (node3.south west) {};
\node[node2,anchor = north west] (grad1) at ([xshift=1.2em]node1.north east) {};
\node[node,minimum width=3.7em,anchor=north west] (node5) at (grad1.north east) {};
\node[node,minimum width=2.8em,anchor=north west] (node6) at (node5.south west) {};
\node[node,minimum width=3.2em,anchor=north west] (node7) at (node6.south west) {};
\node[node,minimum width=4.0em,anchor=north west] (node8) at (node7.south west) {};
\node[font=\scriptsize,anchor=east] (line1) at (node1.west) {gpu1};
\node[font=\scriptsize,anchor=east] (line2) at (node2.west) {gpu2};
\node[font=\scriptsize,anchor=east] (line3) at (node3.west) {gpu3};
\node[font=\scriptsize,anchor=east] (line4) at (node4.west) {gpu4};
\node[node2,anchor = north west] (grad2) at ([xshift=0.3em]node5.north east) {};
\draw[->] (-1.4em,-2.92em) -- (9em,-2.92em);
\node[node,minimum width=2.8em] (node9) at (13em,0) {};
\node[node,minimum width=4.0em,anchor=north west] (node10) at (node9.south west) {};
\node[node,minimum width=3.2em,anchor=north west] (node11) at (node10.south west) {};
\node[node,minimum width=3.0em,anchor=north west] (node12) at (node11.south west) {};
\node[node,minimum width=3.7em,anchor=north west] (node13) at (node9.north east) {};
\node[node,minimum width=2.8em,anchor=north west] (node14) at (node10.north east) {};
\node[node,minimum width=3.2em,anchor=north west] (node15) at (node11.north east) {};
\node[node,minimum width=4.0em,anchor=north west] (node16) at (node12.north east) {};
\node[node2,anchor = north west] (grad3) at ([xshift=0.5em]node13.north east) {};
\node[font=\scriptsize,anchor=east] (line1) at (node9.west) {gpu1};
\node[font=\scriptsize,anchor=east] (line2) at (node10.west) {gpu2};
\node[font=\scriptsize,anchor=east] (line3) at (node11.west) {gpu3};
\node[font=\scriptsize,anchor=east] (line4) at (node12.west) {gpu4};
\draw[->] (11.6em,-2.92em) -- (20.2em,-2.92em);
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (node1) (node2) (node3) (node4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (node5) (node6) (node7) (node8)] (box2) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (node9) (node13) (node12) (node16)] (box2) {};
\end{pgfonlayer}
\node[font=\tiny,anchor=north] (legend1) at ([xshift=3em]node4.south) {一步一更新};
\node[font=\tiny,anchor=north] (legend2) at ([xshift=2.5em]node12.south) {累积两步更新};
\node[font=\tiny,anchor=north] (time1) at (grad2.south) {time};
\node[font=\tiny,anchor=north] (time1) at (grad3.south) {time};
\node[legend] (legend3) at (2em,2em) {};
\node[font=\tiny,anchor=west] (idle) at (legend3.east) {:空闲};
\node[legend,anchor=west,draw,fill=green!20] (legend4) at ([xshift = 2em]idle.east) {};
\node[font=\tiny,anchor=west] (FB) at (legend4.east) {:前向/反向};
\node[legend,anchor=west,draw,fill=blue!20] (legend5) at ([xshift = 2em]FB.east) {};
\node[font=\tiny,anchor=west] (grad_sync) at (legend5.east) {:梯度更新};
\end{tikzpicture}
\end{center}
\end{frame}
%------
% Slide: batched decoding — how batches are formed at inference time (offline
% vs. real-time) and the throughput/latency trade-off of the batch size.
\begin{frame}{大批量-解码}
\begin{itemize}
\item 批量的方法同样可以应用于解码阶段,提升解码的速度和设备利用率
\begin{itemize}
\item 批次生成的策略:常以句子数作为batch的度量单位
\begin{itemize}
\item 当源语文本已知时,与训练类似,排序后划分batch
\item 实时翻译时,等待一个时间段,对期间得到的句子划分batch
\end{itemize}
\item 批次大小的设置:根据任务合理选择
\begin{itemize}
\item 批量大,GPU利用率高,吞吐大,短句需要等待长句
\item 实时性要求高时不适合过大批量
\end{itemize}
\end{itemize}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 实验分析
% TODO(review): placeholder slide — experimental analysis content not yet written.
\begin{frame}{实验分析}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
......@@ -395,6 +2196,7 @@
%%%------------------------------------------------------------------------------------------------------------
%%% 深层模型
% TODO(review): placeholder slide — deep-model section content not yet written.
\begin{frame}{深层模型}
\end{frame>
%%%------------------------------------------------------------------------------------------------------------
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论