Commit 52a27964 by xiaotong

better implementation of dropout

parent a8304bed
......@@ -87,6 +87,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradSum(node, isEfficient);
else if(operID == MATH_SUMDIM)
GradSumDim(node, isEfficient);
else if(operID == MATH_SUMBROADCAST)
GradSumBroadcast(node, isEfficient);
else if(operID == REDUCE_REDUCEMEAN)
GradReduceMean(node, isEfficient);
else if(operID == REDUCE_REDUCESUM)
......@@ -817,6 +819,37 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
}
/*
gradient for multiplication by broadcasting:
c = a * b
where some dimensions of b are of size 1
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0...n)
where a.reduce(0...n) is the reduction along the dimension
whose size is 1 in b. Note that there might be several reductions.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
if(b->isVar || b->income.tailNum > 0){
ShowNTErrors("TODO");
}
}
/*
gradient for negate
for
c = -a
......@@ -1253,6 +1286,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
node->visitMark = NODE_FINISHED;
}
/*
gradient for sum by broadcasting:
c = a + b * \beta
where some dimensions of b are of size 1
dE/da = dE/dc
dE/db = dE/dc * a.reduce(0..n) * \beta
where a.reduce(0..n) is the reduction along the dimension
whose size is 1 in b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
if(b->isVar || b->income.tailNum > 0){
ShowNTErrors("TODO");
}
}
/*
gradient for reduceMean
for
......
......@@ -109,6 +109,11 @@ private:
static
void GradMultiplyDim(XTensor * node, bool isEfficient);
/* gradient for multiply one dimension: c = a * b
where some dimensions of b are of size 1 */
static
void GradMultiplyBroadcast(XTensor * node, bool isEfficient);
/* gradient for negate */
static
void GradNegate(XTensor * node, bool isEfficient);
......@@ -143,6 +148,11 @@ private:
static
void GradSumDim(XTensor * node, bool isEfficient);
/* gradient for sum by broadcasting: c = a + b * \beta
where some dimensions of b are of size 1 */
static
void GradSumBroadcast(XTensor * node, bool isEfficient);
/* gradient for reduceMean */
static
void GradReduceMean(XTensor * node, bool isEfficient);
......
......@@ -82,7 +82,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
x = Dropout(x, dropoutP, 2);
for(int i = 0; i < nlayer; i++){
XTensor att;
......@@ -97,7 +97,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
att = Dropout(att, dropoutP, 2);
/* residual connection */
res = Sum(att, x);
......@@ -111,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
ende = Dropout(ende, dropoutP, 2);
/* residual connection */
res = Sum(ende, x);
......@@ -125,7 +125,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
fnn = Dropout(fnn, dropoutP, 2);
/* residual connection */
res = Sum(fnn, x);
......
......@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
x = Dropout(x, dropoutP, 2);
for(int i = 0; i < nlayer; i++){
XTensor att;
......@@ -117,10 +117,10 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
att = Dropout(att, dropoutP, 2);
/* residual connection */
res = Sum(att, x);
......@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
fnn = Dropout(fnn, dropoutP, 2);
/* residual connection */
res = Sum(fnn, x);
......
......@@ -274,7 +274,9 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output);
delete[] dims;
......
......@@ -231,7 +231,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
DTYPE lossLocal = -prob / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
if (doUpdate) {
......
......@@ -60,6 +60,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer;
trainer.Init(argc, args);
......
......@@ -39,7 +39,7 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data.
the same inference procedure as that on the test data withno nb use of dropout.
>> x - input tensor
>> y - output tensor
......@@ -122,8 +122,8 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
else
ShowNTErrors("TODO!");
}
/*
/*
dropout function (we make tensor connections here)
It randomly zeroes some of the elements of the input tensor
with probability p via a Bernoulli distribution.
......@@ -134,36 +134,79 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data.
>> x - input tensor
>> dropProb - probability to set an element to zero
>> leadingDim - the dimension which we generate the random numbers and perform broadcasting
>> leadingDim2 - another dimension which we generate the random numbers and perform broadcasting
<< return - tensor after dropout
*/
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim2)
{
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
int n = leadingDim < 0 ? x.order - 1 : leadingDim;
XTensor mask;
DTYPE * maskArray = NULL;
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
if(leadingDim < 0 && leadingDim2 < 0){
ShowNTErrors("TODO");
}
else if(leadingDim2 < 0){
int n = leadingDim;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n];
DTYPE * maskArray = new DTYPE[unitNum];
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n];
maskArray = new DTYPE[unitNum];
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor mask;
InitTensor1D(&mask, unitNum, x.dataType, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
XTensor mask;
InitTensor1D(&mask, unitNum, x.dataType, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
return MultiplyDim(x, mask, n);
}
else{
int n = leadingDim;
int m = leadingDim2;
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n] * x.dimSize[m];
maskArray = new DTYPE[unitNum];
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < x.order; i++)
dims[i] = 1;
dims[n] = x.GetDim(n);
dims[m] = x.GetDim(m);
InitTensor(&mask, x.order, dims, x.dataType, x.denseRatio,x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
return MultiplyBroadcast(x, mask);
}
delete[] maskArray;
return MultiplyDim(x, mask, n, 0);
}
/*
......@@ -182,7 +225,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
......
......@@ -39,9 +39,9 @@ void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb,
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE dropProb, int leadingDim = -1);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1, int leadingDim2 = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论