Commit cecbceb9 by xiaotong

add dropout to transformer

parent 2e20824a
......@@ -125,17 +125,8 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
//if(llnum == 1)
// dot.Dump(tf, "dot:");
scalar = Softmax(dot, -1);
//if(llnum == 1)
// scalar.Dump(tf, "scalar:");
//if(ignored > 0)
// _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
att = BMMul(scalar, vheads);
/* concatenate the heads */
......
......@@ -73,6 +73,9 @@ public:
special design for the attention model. */
int ignored;
/* indicates whether the model is used for training */
bool isTraining;
public:
/* constructor */
T2TAttention();
......
......@@ -63,6 +63,7 @@ void AttEncoder::InitModel(int argc, const char ** argv,
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
......@@ -89,9 +90,10 @@ make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
>> skipInputRes - indicates whether we skip the residual connection of the first layer
>> isTraining - indicates whether the model is for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
{
XTensor x;
......@@ -111,7 +113,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
/* self attention */
att = attentions[i].Make(x, x, x, mask);
/* TODO: dropout */
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att);
/* layer normalization */
x = attLayerNorms[i].Make(att);
......@@ -121,10 +125,12 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
/* self attention */
att = attentions[i].Make(x, x, x, mask);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att);
/* residual connection */
res = Sum(att, x);
/* TODO: dropout */
/* layer normalization */
x = attLayerNorms[i].Make(res);
......@@ -133,13 +139,18 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
/* fnn */
fnn = fnns[i].Make(x);
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn);
/* residual connection */
res = Sum(fnn, x);
/* TODO: dropout */
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
if(isTraining && dropoutP > 0)
x = Dropout(x);
}
return x;
......
......@@ -40,7 +40,7 @@ class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes) = 0;
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining) = 0;
};
/*
......@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder
{
public:
    /* make the encoding network
       (isTraining indicates whether the model is in the training mode;
       the superseded 3-argument overload is removed — the base class
       interface now takes the isTraining flag) */
    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
};
......@@ -77,6 +77,9 @@ public:
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
......@@ -115,7 +118,7 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
};
......
......@@ -77,11 +77,12 @@ make the encoding network
>> input - input tensor
>> mask - the mask for positions that are/not involved in computation
>> skipInputRes - indicates whether we skip the residual connection of the first layer
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
{
    /* thin wrapper: delegate to the encoder. isTraining is forwarded so the
       encoder can enable dropout during training and skip it at test time */
    return encoder.Make(input, mask, skipInputRes, isTraining);
}
/*
......@@ -89,8 +90,9 @@ make the entire network (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
{
XTensor encoding;
......@@ -134,7 +136,7 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
//_Sum(&mask, padding3, &mask);
encoding = MakeEncoding(input, mask, true);
encoding = MakeEncoding(input, mask, true, isTraining);
outputLayer.Make(encoding, output);
delete[] dims;
......
......@@ -69,10 +69,10 @@ public:
void InitModel(int argc, const char ** argv);
/* make the encoding network */
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
/* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output, XTensor &padding);
void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* get parameter matrics */
void GetParams(XList &list);
......
......@@ -149,7 +149,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
XTensor output;
/* make the network */
model->Make(batch, output, padding);
model->Make(batch, output, padding, true);
/* make paddings for the output */
if(output.GetDim(0) > 1)
......@@ -271,7 +271,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
XTensor output;
/* make the network */
model->Make(batch, output, padding);
model->Make(batch, output, padding, false);
int bSize = batch.GetDim(0);
int length = batch.GetDim(1);
......
......@@ -30,14 +30,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor
/*
generate a random bernoulli number
*/
DTYPE RandomBernoulli(DTYPE prob)
{
    /* draw a uniform sample in [0, 1] from rand(); the unit survives
       (value 1) with probability 1 - prob, otherwise it is zeroed */
    DTYPE u = (DTYPE)rand() / (DTYPE)RAND_MAX;
    if (u > prob)
        return (DTYPE)1.0;
    return (DTYPE)0.0;
}
/*
dropout function
It randomly zeroes some of the elements of the input tensor
with probability p via a Bernoulli distribution.
......@@ -64,7 +56,7 @@ void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
int unitNum = x->unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
maskArray[i] = RandomBernoulli(prob, 1.0F);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
......@@ -112,7 +104,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
srand(seed);
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
maskArray[i] = RandomBernoulli(prob, 1.0F);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
......@@ -142,48 +134,39 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
}
/*
dropout function (we make tensor connections here)
It randomly zeroes some of the elements of the input tensor
with probability p via a Bernoulli distribution.

See "Improving neural networks by preventing co-adaptation of feature detectors"
for more details.

Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data.

>> x - input tensor
>> prob - probability to set an element to zero
<< return - the result tensor of dropout
*/
XTensor Dropout(const XTensor &x, DTYPE prob)
{
    /* inverted dropout: surviving units are pre-scaled by 1/(1-p) so that
       no rescaling is needed at inference time.
       NOTE(review): prob is assumed to lie in [0, 1); prob == 1 would divide
       by zero here — callers guard with dropoutP > 0, confirm the upper bound */
    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);

    /* generate a mask with keep-probability 1 - prob; each kept entry
       already carries the scale factor (see RandomBernoulli) */
    int unitNum = x.unitNum;
    DTYPE * maskArray = new DTYPE[unitNum];

    /* NOTE(review): reseeding with time(NULL) on every call yields identical
       masks within the same second — consider seeding once at startup */
    srand((unsigned int)time(NULL));
    for (int i = 0; i < unitNum; i++)
        maskArray[i] = RandomBernoulli(prob, scaleFactor);

    XTensor mask(&x);
    mask.SetData(maskArray, unitNum);

    delete[] maskArray;

    /* elementwise product applies the (pre-scaled) mask to the input */
    return Multiply(x, mask);
}
} // namespace nts(NiuTrans.Tensor)
......@@ -27,6 +27,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a random bernoulli number:
   returns "value" with probability (1 - prob) and 0 with probability prob
   (the uniform draw rand()/RAND_MAX exceeds prob in the keep case).
   Passing the dropout scale factor as "value" lets callers build a
   pre-scaled mask in a single pass.
   NOTE(review): relies on the caller having seeded rand() via srand() */
inline DTYPE RandomBernoulli(DTYPE prob, DTYPE value)
{
return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)value : (DTYPE)0.0;
}
/* dropout function */
void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
......
......@@ -26,6 +26,7 @@
#include "../XTensor.h"
#include "Dropout.h"
#include "HardTanH.h"
#include "Identity.h"
#include "LogSoftmax.h"
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论