Commit 37f6ca40 by 曹润柘

Merge branch 'caorunzhe' into 'master'

Caorunzhe

See merge request !864
parents 1c6625fe cb935bd2
......@@ -73,7 +73,7 @@
\draw [->,thick] (model.east) .. controls +(east:0.5) and +(west:0.5) .. ([xshift=-1.8em]box1.west);
\draw [->,thick] (model.east) .. controls +(east:0.5) and +(west:0.5) .. ([xshift=-1.8em]box2.west);
\draw [-,very thick] ([xshift=0.3em]box2.east) -- ([xshift=-0.3em]box3.west);
\draw [-,very thick] ([xshift=0.782em,yshift=0.5em]box2.east) -- ([xshift=0.782em,yshift=-0.5em]box2.east);
\draw [-,very thick] ([xshift=0.782em,yshift=0.48em]box2.east) -- ([xshift=0.782em,yshift=-0.48em]box2.east);
%%%%%
\node [] (t10) at ([yshift=1.5em]box1.north) {$t_1$};
......
......@@ -28,7 +28,7 @@
\node[font=\footnotesize,anchor=east] (w1) at ([xshift=-0.5em]w2.west){\scriptsize\textbf{1}};
\node[font=\footnotesize,anchor=west] (w4) at ([xshift=0.5em]w3.east){\scriptsize\textbf{0}};
\node[font=\footnotesize,anchor=west] (w5) at ([xshift=0.5em]w4.east){\scriptsize\textbf{1}};
\node[font=\footnotesize,anchor=south] (output) at ([yshift=1em]tgt_sf.north){\scriptsize\sffamily\bfseries{我们\quad 完全\quad 接受\quad 它\quad 。}};
\node[font=\footnotesize,anchor=south] (output) at ([yshift=1em]tgt_sf.north){\scriptsize\sffamily\bfseries{我们/完全/接受/它/。}};
\node[font=\footnotesize,anchor=north] (src) at ([yshift=-1em]src_emb.south){\scriptsize\textbf{We totally accept it .}};
\node[font=\footnotesize,anchor=north] (tgt) at ([yshift=-1em]tgt_emb.south){\scriptsize\textbf{We totally accept accept .}};
......
......@@ -31,7 +31,7 @@
\node[anchor=north,font=\scriptsize,align=center] (w2) at ([yshift=-2em]decoder.south){\scriptsize\bfnew{There exist different} \\ \scriptsize\bfnew{opinions on this question .}};
\node[anchor=north,font=\scriptsize,text=gray] (w3) at ([yshift=0.6em]w2.south){\scriptsize\bfnew{(复制源语言句子)}};
\node[anchor=south,font=\scriptsize,align=center] (w4) at ([yshift=1.6em]box2.north){\scriptsize\bfnew{on this question} \\ \scriptsize\bfnew{There exist different opinions .}};
\node[anchor=south,font=\scriptsize,align=center] (w5) at ([yshift=1.6em]box3.north){\tiny\bfnew{对 \ 这个 \ 问题 \ 存在 \ 不同的 \ 看法 \ 。}};
\node[anchor=south,font=\scriptsize,align=center] (w5) at ([yshift=1.6em]box3.north){\tiny\bfnew{对/这个/问题/存在/不同/的/看法/。}};
\node[font=\tiny] at ([xshift=-0.8em,yshift=-0.6em]encoder.east) {$N\times$};
\node[font=\tiny] at ([xshift=-0.8em,yshift=-0.6em]decoder.east) {$1\times$};
\node[font=\tiny] at ([xshift=-1.2em,yshift=-0.6em]decoder2.east) {$N-1\times$};
......
......@@ -5,7 +5,7 @@
\node (encoder)[encoder] at (0,0) {编码器};
%\node (des)[anchor=north] at ([yshift=2cm]encoder.north) {<Mask>:<Mask>};
\node (text_left)[anchor=south] at ([yshift=-3em]encoder.south) {\footnotesize{\ \ 熟睡\ }};
\node (text_left)[anchor=south] at ([yshift=-3em]encoder.south) {\footnotesize{/在/熟睡/}};
\node (autodecoder)[autodecoder,right of=encoder,xshift=6em ] {自回归解码器};
\node (text_mid1)[anchor=north] at ([yshift=3em]autodecoder.north) {\scriptsize{NP1\ VP3\ PU1\ <eos>}};
\node (text_mid2)[anchor=south] at ([yshift=-3em]autodecoder.south) {\scriptsize{<sos>\ NP1\ VP3\ PU1}};
......
......@@ -97,14 +97,14 @@
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n3.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d$} ([xshift=0em]n3.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n6.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d_a$} ([xshift=0em]n6.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n6.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d_{\rm a}$} ([xshift=0em]n6.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n7.north west) to node [midway,font=\small,align=center,xshift=-0.7em,yshift=-0em] {$d$} ([xshift=0em]n7.south west);
\draw [decorate,decoration={brace}] ([xshift=0em]n7.north west) to node [midway,font=\small,align=center,xshift=0em,yshift=0.7em] {$d$} ([xshift=0em]n7.north east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n8.north west) to node [midway,font=\small,align=center,xshift=-0.8em,yshift=-0em] {$d_a$} ([xshift=0em]n8.south west);
\draw [decorate,decoration={brace}] ([xshift=0em]n8.north west) to node [midway,font=\small,align=center,xshift=0em,yshift=0.8em] {$n_{hop}$} ([xshift=0em]n8.north east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]nc31.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$n_{hop}$} ([xshift=0em]nc35.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]n8.north west) to node [midway,font=\small,align=center,xshift=-0.8em,yshift=-0em] {$d_{\rm a}$} ([xshift=0em]n8.south west);
\draw [decorate,decoration={brace}] ([xshift=0em]n8.north west) to node [midway,font=\small,align=center,xshift=0em,yshift=0.8em] {$n_{\rm hop}$} ([xshift=0em]n8.north east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]nc31.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$n_{\rm hop}$} ([xshift=0em]nc35.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]ln5.south west) to node [midway,font=\small,align=center,xshift=0em,yshift=-0.8em] {$d$} ([xshift=0em]ln5.south east);
\draw [decorate] ([xshift=0em]ln5.south east) to node [midway,font=\footnotesize,align=center,xshift=1em,yshift=-0.5em] {$n_{hop}$} ([xshift=0em]ln1.south east);
\draw [decorate] ([xshift=0em]ln5.south east) to node [midway,font=\footnotesize,align=center,xshift=1em,yshift=-0.5em] {$n_{\rm hop}$} ([xshift=0em]ln1.south east);
\draw [decorate,decoration={brace,mirror}] ([xshift=0em]fn.south east) to node [midway,font=\small,align=center,xshift=0.7em,yshift=-0em] {$d$} ([xshift=0em]fn.north east);
......
......@@ -4,7 +4,7 @@
\begin{axis}[
width=.50\textwidth,
height=.40\textwidth,
legend style={at={(0.60,0.08)}, anchor=south west},
legend style={at={(0.45,0.08)}, anchor=south west},
xlabel={\scriptsize{更新次数(10k)}},
ylabel={\scriptsize{学习率 ($10^{-3}$)}},
ylabel style={yshift=-1em},xlabel style={yshift=0.0em},
......@@ -15,7 +15,7 @@
]
\addplot[red,line width=1.25pt] coordinates {(0,0) (1.6,2) (1.8,1.888) (2,1.787) (2.5,1.606) (3,1.462) (3.5,1.3549) (4,1.266) (4.5,1.193) (5,1.131)};
\addlegendentry{\scriptsize Base48}
\addlegendentry{\scriptsize 原始学习率}
%\addplot[red,line width=1.25pt] coordinates {(0,0) (8000,0.002) (10000,0.00179) (12000,0.00163) (12950,0.001572)};
\addplot[blue,line width=1.25pt] coordinates {(0,0) (0.8,2) (0.9906,1.7983)};
%\addplot[red,line width=1.25pt] coordinates {(0,0) (8000,0.002) (9906,0.0017983)};
......@@ -31,7 +31,7 @@
\addplot[blue,line width=1.25pt] coordinates {(2.9706,2) (3.1706,1.79) (3.3706,1.63) (3.4656,1.572) (3.6706,1.4602) (3.7136,1.44)};
\addplot[blue,dashed,line width=1.25pt] coordinates {(3.7136,1.44) (3.7136,2)};
\addplot[blue,line width=1.25pt] coordinates {(3.7136,2) (3.9136,1.79) (4.1136,1.63) (4.2086,1.572) (4.4136,1.4602) (4.4566,1.44) (4.7000,1.3574) (5.0000,1.2531)};
\addlegendentry{\scriptsize SDT48}
\addlegendentry{\scriptsize 调整后的学习率}
\end{axis}
}
......
......@@ -9,22 +9,22 @@
\begin{scope}[]
\node [anchor=west,ffnnode] (f1) at (0, 0){FFN};
\node [anchor=west,inner sep=0mm,minimum height=1.8em] (f1) at (0, 0){input};
\node [anchor=south,ebnode] (e1) at ([xshift=0em,yshift=1em]f1.north){Embedding};
\node [anchor=south west,manode] (a1) at ([xshift=0em,yshift=1em]e1.north west){Attention};
\node [anchor=south east,manode] (c1) at ([xshift=0em,yshift=1em]e1.north east){Conv};
\node [anchor=south west,ebnode] (e2) at ([xshift=0em,yshift=1em]a1.north west){Embedding};
\node [anchor=south,draw,circle,inner sep=4pt] (add1) at ([xshift=0em,yshift=0.5em]e2.north){};
\node [anchor=south,ffnnode] (f2) at ([xshift=0em,yshift=0.5em]add1.north){FFN};
\node [anchor=south,inner sep=0mm,minimum height=1.8em] (f2) at ([xshift=0em,yshift=0.5em]add1.north){output};
\draw[->,thick] ([xshift=0em,yshift=0em]f1.north)--([xshift=0em,yshift=0em]e1.south);
\draw[->,thick] ([xshift=0em,yshift=-0.3em]f1.north)--([xshift=0em,yshift=0em]e1.south);
\draw[->,thick] ([xshift=0em,yshift=-1em]a1.south)--([xshift=0em,yshift=0em]a1.south);
\draw[->,thick] ([xshift=0em,yshift=-1em]c1.south)--([xshift=0em,yshift=0em]c1.south);
\draw[->,thick] ([xshift=0em,yshift=0em]a1.north)--([xshift=0em,yshift=1em]a1.north);
\draw[->,thick] ([xshift=0em,yshift=0em]c1.north)--([xshift=0em,yshift=1em]c1.north);
\draw[-,thick] ([xshift=0em,yshift=0em]e2.north)--([xshift=0em,yshift=0em]add1.south);
\draw[->,thick] ([xshift=0em,yshift=0em]add1.north)--([xshift=0em,yshift=0em]f2.south);
\draw[->,thick] ([xshift=0em,yshift=0em]add1.north)--([xshift=0em,yshift=0.3em]f2.south);
\draw[-] ([xshift=0em,yshift=0em]add1.west)--([xshift=-0em,yshift=0em]add1.east);
\draw[-] ([xshift=0em,yshift=0em]add1.south)--([xshift=-0em,yshift=-0em]add1.north);
......@@ -33,11 +33,6 @@
\draw[->,thick,rectangle,rounded corners=5pt] ([xshift=0em,yshift=0.5em]f1.north)--([xshift=-6em,yshift=0.5em]f1.north)--([xshift=-5.45em,yshift=0em]add1.west)--([xshift=0em,yshift=0em]add1.west);
\node [anchor=north,inner sep=0mm,minimum height=1.5em] (ip) at ([xshift=0em,yshift=-1em]f1.south){input};
\node [anchor=south,inner sep=0mm,minimum height=1.5em] (op) at ([xshift=0em,yshift=1em]f2.north){output};
\draw[->,thick] ([xshift=0em,yshift=0em]ip.north)--([xshift=0em,yshift=0em]f1.south);
\draw[->,thick] ([xshift=0em,yshift=0em]f2.north)--([xshift=0em,yshift=0em]op.south);
\end{scope}
\end{tikzpicture}
\end{center}
\ No newline at end of file
......@@ -11,8 +11,8 @@
\node [anchor=west,enode] (e1) at ([xshift=1.5em,yshift=0em]w1.east) {编码器};
\node [anchor=west,dnode] (d1) at ([xshift=3em,yshift=6em]e1.east) {翻译任务};
\node [anchor=north,dnode] (d2) at ([xshift=0em,yshift=-2em]d1.south) {文本处理任务};
\node [anchor=north,dnode] (d3) at ([xshift=0em,yshift=-2em]d2.south) {语言理解};
\node [anchor=north,dnode] (d2) at ([xshift=0em,yshift=-2em]d1.south) {句法分析任务};
\node [anchor=north,dnode] (d3) at ([xshift=0em,yshift=-2em]d2.south) {语言理解任务};
\node [anchor=north,dnode] (d4) at ([xshift=0em,yshift=-2em]d3.south) {其他任务};
\node [anchor=west] (w2) at ([xshift=2em,yshift=0em]d1.east) {英语(目标语言)};
......
......@@ -12,7 +12,7 @@
\draw[->,very thick]([xshift=-1.4em]trans.west) to (trans.west);
\draw[->,very thick](trans.east) to ([xshift=1.4em]trans.east);
\node[anchor=east] (de1) at ([xshift=4.8cm,yshift=-0.1em]trans.east) {\begin{tabular}{l}{\normalsize{译文:}}{\normalsize{一个\ \;女孩\ \;从\ \;{\red{河床}}}}\end{tabular}};
\node[anchor=south] (de2) at ([xshift=-0em,yshift=-1.5em]de1.south) {\begin{tabular}{l}{\normalsize{\ \;跳下来\ \;}} \end{tabular}};
\node[anchor=east] (de1) at ([xshift=4.5cm,yshift=-0.1em]trans.east) {\begin{tabular}{l}{\normalsize{译文:}}{\normalsize{一个/女孩/从/{\red{河床}}/}}\end{tabular}};
\node[anchor=south] (de2) at ([xshift=-0em,yshift=-1.5em]de1.south) {\begin{tabular}{l}{\normalsize{/跳下来/}} \end{tabular}};
\end {scope}
\end{tikzpicture}
\ No newline at end of file
......@@ -333,7 +333,7 @@
\parinterval 说到噪音问题就不得不提到注意力机制的引入,前面章节中提到过这样的一个例子:
\vspace{0.8em}
\centerline{中午\ 没\ 吃饭\ ,\ 又\ 刚\ 打\ 了\ 一\ 下午\ 篮球\ ,\ 我\ 现在\ 很\ 饿\ ,\ 我\ 想\underline{\quad \quad \quad}}
\centerline{中午/没/吃饭/,/又/刚/打/了/一/下午/篮球/,/我/现在/很/饿/,/我/想\underline{\quad \quad}}
\vspace{0.8em}
\parinterval 想在横线处填写“吃饭”,“吃东西”的原因是我们在读句子的过程中,关注到了“没/吃饭”,“很/饿”等关键信息。这是在语言生成中注意力机制所解决的问题,即对于要生成的目标语言单词,相关性更高的语言片段应该更加“重要”,而不是将所有单词一视同仁。同样的,注意力机制也应用在多模态机器翻译中,即在生成目标单词时,更应该关注与目标单词相关的图像部分,而弱化对其他部分的关注。另外,注意力机制的引入,也使图像信息更加直接地参与目标语言的生成,解决了在不使用注意力机制的方法中图像信息传递损失的问题。
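\parinterval 这里补充一个注意力权重计算的最简形式作为示意(其中 $\mathbf{s}_i$、$\mathbf{h}_j$ 与打分函数 $\textrm{score}(\cdot)$ 均为本示意所假设的记号,并非上文的正式定义):对第 $i$ 个目标语言位置,先计算它与每个源语言单词(或图像区域)表示的相关性得分,再经 Softmax 归一化得到权重,最后加权求和得到该位置的上下文表示,相关性越高的部分权重越大。
\begin{eqnarray}
\alpha_{i,j} &=& \frac{\exp(\textrm{score}(\mathbf{s}_i,\mathbf{h}_j))}{\sum_{k}\exp(\textrm{score}(\mathbf{s}_i,\mathbf{h}_k))} \nonumber \\
\mathbf{c}_i &=& \sum_{j}\alpha_{i,j}\mathbf{h}_j \nonumber
\end{eqnarray}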
......@@ -464,9 +464,9 @@
\parinterval “篇章”在这里是指一系列连续的段落或句子所构成的整体,其中各个句子间从形式和内容上都具有一定的连贯性和一致性\upcite{jurafsky2000speech}。这些联系主要体现在{\small\sffamily\bfseries{衔接}}\index{衔接}(Cohesion \index{Cohesion})以及连贯两个方面。其中衔接体现在显性的语言成分和结构上,包括篇章中句子间的语法和词汇的联系,而连贯体现在各个句子之间的逻辑和语义的联系上。因此,篇章级翻译就是要将这些上下文之间的联系考虑在内,从而生成比句子级翻译更连贯和准确的翻译结果。实例\ref{eg:17-1}就展示了一个使用篇章信息进行机器翻译的实例。
\begin{example}
上下文句子:我\ 上周\ 针对\ 这个\ 问题\ 做出\ 解释\ \ 咨询\ \ 他的\ 意见\
上下文句子:我/上周/针对/这个/问题/做出/解释/并/咨询/了/他的/意见/
\hspace{2em} 待翻译句子:他\ 也\ 同意\ 我的\ 看法\ 。
\hspace{2em} 待翻译句子:他/也/同意/我的/看法/
\hspace{2em} 句子级翻译结果:He also agrees with me .
......@@ -523,14 +523,14 @@
\begin{example}
传统模型训练输入:
\hspace{10em}源语言:你\ 看到\ 了\ 吗\ ?
\hspace{10em}源语言:你/看到/了/吗/
\hspace{10em}目标语言:Do you see them ?
\vspace{0.5em}
\qquad\ 改进后模型训练输入:
\hspace{10em}源语言:{\red{他们\ 在\ 哪\ ?\ <sep>\ }}\ 你\ 看到\ 了\ 吗\ ?
\hspace{10em}源语言:{\red{他们/在/哪/?\ <sep>\ }}\ 你/看到/了/吗/
\hspace{10em}目标语言:Do you see them ?
......
......@@ -40,13 +40,13 @@
\caption{汉译英译文质量评价实例}
{
\begin{tabular}{c|l|c}
源文 &那只敏捷的棕色狐狸跳过了那只懒惰的狗。& 评价得分 \\
源文 &那/只/敏捷/的/棕色/狐狸/跳过/了/那/只/懒惰/的/狗/。& 评价得分 \\
\hline
\rule{0pt}{10pt} 机器译文1 & The quick brown fox jumped over the lazy dog. & 5 \\
\rule{0pt}{10pt} 机器译文2 & The fast brown fox jumped over a sleepy dog. & 4 \\
\rule{0pt}{10pt} 机器译文3 & The fast brown fox jumps over the dog. & 3 \\
\rule{0pt}{10pt} 机器译文4 & The quick brown fox jumps over dog. & 2 \\
\rule{0pt}{10pt} 机器译文5 & A fast fox jump dog. & 1 \\
\rule{0pt}{10pt} 机器译文1 & The quick brown fox jumped over the lazy dog . & 5 \\
\rule{0pt}{10pt} 机器译文2 & The fast brown fox jumped over a sleepy dog . & 4 \\
\rule{0pt}{10pt} 机器译文3 & The fast brown fox jumps over the dog . & 3 \\
\rule{0pt}{10pt} 机器译文4 & The quick brown fox jumps over dog . & 2 \\
\rule{0pt}{10pt} 机器译文5 & A fast fox jump dog . & 1 \\
\end{tabular}
\label{tab:4-1}
}
......@@ -205,9 +205,9 @@
\noindent 其中,$\textrm{edit}(o,g)$表示系统生成的译文$o$和参考答案$g$之间的距离,$l$是归一化因子,通常为参考答案的长度。在距离计算中所有的操作的代价都为1。在计算距离时,优先考虑移位操作,再计算编辑距离(即增加、删除和替换操作的次数)。直到增加、移位操作无法减少编辑距离时,将编辑距离和移位操作的次数累加得到TER计算的距离。
\begin{example}
机器译文:A cat is standing in the ground.
机器译文:A cat is standing in the ground .
\qquad\ 参考答案:The cat is standing on the ground.
\qquad\ 参考答案:The cat is standing on the ground .
\label{eg:4-1}
\end{example}
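\parinterval 为了更直观地理解上述计算过程,这里给出一个简化的示意实现(仅为草图:只计算由增加、删除和替换构成的编辑距离,省略了移位操作,并按参考答案长度归一化,即 $\textrm{edit}(o,g)/l$;其中的函数名和变量名均为本示意所设):

{\small
\begin{verbatim}
def edit_distance(hyp, ref):
    # 词级别编辑距离,增加、删除、替换的代价均为 1
    m, n = len(hyp), len(ref)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        d[i][0] = i
    for j in range(n + 1):
        d[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if hyp[i - 1] == ref[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,        # 删除
                          d[i][j - 1] + 1,        # 增加
                          d[i - 1][j - 1] + cost) # 替换
    return d[m][n]

def simple_ter(hyp, ref):
    # 简化版:编辑距离除以参考答案长度(未包含移位操作)
    return edit_distance(hyp.split(), ref.split()) / len(ref.split())

# 对于上面的例子:2 处替换(A->The,in->on),参考答案长度为 8,
# 简化后的结果为 2/8 = 0.25
print(simple_ter("A cat is standing in the ground .",
                 "The cat is standing on the ground ."))
\end{verbatim}
}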
......@@ -234,7 +234,7 @@
\begin{example}
机器译文:the the the the
\qquad \ 参考答案:The cat is standing on the ground.
\qquad \ 参考答案:The cat is standing on the ground .
\label{eg:4-bleu-example}
\end{example}
......@@ -385,31 +385,31 @@
\parinterval 基于检测点的评价根据事先定义好的语言学检测点对译文的相应部分进行打分。如下是几个英中翻译中的检测点实例:
\begin{example}
They got up at six this morning.
They got up at six this morning .
\qquad\ \ 他们今天早晨六点钟起床
\qquad\ \ 他们/今天/早晨/六点钟/起床/
\qquad\ \ 检测点:时间词的顺序
\label{eg:4-3}
\end{example}
\begin{example}
There are nine cows on the farm.
There are nine cows on the farm .
\qquad\ \ 农场里有九头牛
\qquad\ \ 农场/里/有/九/头/牛/
\qquad\ \ 检测点:量词“头”
\label{eg:4-4}
\end{example}
\begin{example}
His house is on the south bank of the river.
His house is on the south bank of the river .
\qquad\ \ 他的房子在河的南岸。
\qquad\ \ 他/的/房子/在/河/的/南岸/。
\qquad\ \ We keep our money in a bank.
\qquad\ \ We keep our money in a bank .
\qquad\ \ 我们在一家银行存钱
\qquad\ \ 我们/在/一家/银行/存钱/
\qquad\ \ 检测点:bank 的多义翻译
\label{eg:4-5}
......@@ -704,11 +704,11 @@ d&=&t \frac{s}{\sqrt{n}}
\begin{example}
单词级质量评估任务
源句(Source):Draw or select a line.(英语)
源句(Source):Draw or select a line .(英语)
机器译文(MT):Zeichnen oder wählen Sie eine Linie aus.(德语)
机器译文(MT):Zeichnen oder wählen Sie eine Linie aus .(德语)
后编辑结果(PE):Zeichnen oder Sie eine Linie, oder wählen Sie eine aus.(德语)
后编辑结果(PE):Zeichnen oder Sie eine Linie, oder wählen Sie eine aus .(德语)
\label{eg:4-7}
\end{example}
......@@ -751,7 +751,7 @@ d&=&t \frac{s}{\sqrt{n}}
源句(Source):Nach Zubereitung || im Kühlschrank aufbewahren || und innerha-
\hspace{7.3em}lb von 24 || Stunden aufbrauchen.(德语)
\hspace{7.3em}lb von 24 || Stunden aufbrauchen .(德语)
机器译文(MT):After reconstitution || in the refrigerator || and used within 24 ||
......@@ -817,11 +817,11 @@ d&=&t \frac{s}{\sqrt{n}}
上文信息:A {\red housewife} won the first prize in the supermarket's anniversary
\hspace{5em}celebration.
\hspace{5em}celebration .
机器译文:A few days ago, {\red he} contacted the News Channel and said that the
\hspace{5em}supermarket owner refused to give {\red him} the prize.
\hspace{5em}supermarket owner refused to give {\red him} the prize .
\label{eg:4-9}
\end{example}
......
......@@ -2074,13 +2074,13 @@ z_t&=&\gamma z_{t-1}+(1-\gamma) \frac{\partial J}{\partial {\theta}_t} \cdot \f
\parinterval 那么,分布式表示中每个维度的含义是什么?可以把每一维度都理解为一种属性,比如一个人的身高、体重等。但是,神经网络模型更多的是把每个维度看作是单词的一种抽象“刻画”,是一种统计意义上的“语义”,而非简单的人工归纳的事物的一个个属性。使用这种连续空间的表示的好处在于,表示的内容(实数向量)可以进行计算和学习,因此可以通过模型训练得到更适用于自然语言处理的单词表示结果。
\parinterval 为了方便理解,看一个简单的例子。假如现在有个“预测下一个单词”的任务:有这样一个句子“屋里 要 摆放 一个 \rule[-3pt]{1cm}{0.05em}”,其中下划线的部分表示需要预测的下一个单词。如果模型在训练数据中看到过类似于“摆放 一个 桌子”这样的片段,那么就可以很自信的预测出“桌子”。另一方面,很容易知道,实际上与“桌子”相近的单词,如“椅子”,也是可以预测的单词的。但是,“椅子”恰巧没有出现在训练数据中,这时如果用One-hot编码来表示单词,显然无法把“椅子”填到下划线处;而如果使用单词的分布式表示,很容易就知道 “桌子”与“椅子”是相似的,因此预测“ 椅子”在一定程度上也是合理的。
\parinterval 为了方便理解,看一个简单的例子。假如现在有个“预测下一个单词”的任务:有这样一个句子“屋里/要/摆放/一个/\rule[-3pt]{1cm}{0.05em}”,其中下划线的部分表示需要预测的下一个单词。如果模型在训练数据中看到过类似于“摆放 一个 桌子”这样的片段,那么就可以很自信的预测出“桌子”。另一方面,很容易知道,实际上与“桌子”相近的单词,如“椅子”,也是可以预测的单词的。但是,“椅子”恰巧没有出现在训练数据中,这时如果用One-hot编码来表示单词,显然无法把“椅子”填到下划线处;而如果使用单词的分布式表示,很容易就知道 “桌子”与“椅子”是相似的,因此预测“ 椅子”在一定程度上也是合理的。
\begin{example}
屋里\ 要\ 摆放\ 一个 \_\_\_\_\_ \hspace{0.5em} \quad \quad 预测下个词
屋里/要/摆放/一个/\_\_\_\_ \hspace{0.5em} \quad \quad 预测下个词
\hspace{2em} 屋里\ 要\ 摆放\ 一个\ { \red{桌子}} \hspace{3.2em}见过
\hspace{2em} 屋里/要/摆放/一个/{\red{桌子}} \hspace{3.2em}见过
\hspace{2em} 屋里\ 要\ 摆放\ 一个\ { \blue{椅子}} \hspace{3.2em}没见过,但是仍然是合理预测
\hspace{2em} 屋里/要/摆放/一个/{\blue{椅子}} \hspace{3.2em}没见过,但是仍然是合理预测
\end{example}
\parinterval 关于单词的分布式表示还有一个经典的例子:通过词嵌入可以得到如下关系:$\textrm{“国王”}=\textrm{“女王”}-\textrm{“女人”} +\textrm{“男人”}$。从这个例子可以看出,词嵌入也具有一些代数性质,比如,词的分布式表示可以通过加、减等代数运算相互转换。图\ref{fig:9-66}展示了词嵌入在一个二维平面上的投影,不难发现,含义相近的单词分布比较临近。
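\parinterval 下面用一段示意代码直观展示这种代数性质(其中的二维词向量为虚构的玩具数据,仅用于演示“向量加减后寻找最近邻”的过程,并非真实训练得到的词嵌入):

{\small
\begin{verbatim}
import math

# 虚构的二维玩具词向量,仅作演示
emb = {
    "国王": [0.9, 0.8],
    "女王": [0.9, 0.2],
    "男人": [0.5, 0.8],
    "女人": [0.5, 0.2],
}

def cos_sim(u, v):
    # 余弦相似度
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v)

# 计算 “女王” - “女人” + “男人”
q = [a - b + c for a, b, c in zip(emb["女王"], emb["女人"], emb["男人"])]

# 在词表中寻找与结果向量最相近的词,这里会得到 “国王”
print(max(emb, key=lambda w: cos_sim(emb[w], q)))
\end{verbatim}
}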
......
......@@ -9273,6 +9273,64 @@ author = {Zhuang Liu and
publisher = {International Conference on Machine Learning},
year = {2019}
}
@inproceedings{DBLP:journals/corr/abs-2012-13866,
author = {Bei Li and
Ziyang Wang and
Hui Liu and
Quan Du and
Tong Xiao and
Chunliang Zhang and
Jingbo Zhu},
title = {Learning Light-Weight Translation Models from Deep Transformer},
publisher = {CoRR},
volume = {abs/2012.13866},
year = {2020}
}
@inproceedings{DBLP:conf/aclnmt/HuLLLLWXZ20,
author = {Chi Hu and
Bei Li and
Yinqiao Li and
Ye Lin and
Yanyang Li and
Chenglong Wang and
Tong Xiao and
Jingbo Zhu},
title = {The NiuTrans System for {WNGT} 2020 Efficiency Task},
pages = {204--210},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2020}
}
@inproceedings{DBLP:conf/acl/WeiYHZWL20,
author = {Xiangpeng Wei and
Heng Yu and
Yue Hu and
Yue Zhang and
Rongxiang Weng and
Weihua Luo},
title = {Multiscale Collaborative Deep Models for Neural Machine Translation},
pages = {414--426},
publisher = {Annual Meeting of the Association for Computational Linguistics},
year = {2020}
}
@inproceedings{DBLP:conf/emnlp/DeNeefeKWM07,
author = {Steve DeNeefe and
Kevin Knight and
Wei Wang and
Daniel Marcu},
title = {What Can Syntax-Based {MT} Learn from Phrase-Based MT?},
pages = {755--763},
publisher = {Conference on Empirical Methods in Natural Language Processing},
year = {2007}
}
@inproceedings{DBLP:conf/emnlp/ShiPK16,
author = {Xing Shi and
Inkit Padhi and
Kevin Knight},
title = {Does String-Based Neural {MT} Learn Source Syntax?},
pages = {1526--1534},
publisher = {Conference on Empirical Methods in Natural Language Processing},
year = {2016}
}
%%%%% chapter 15------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......