Commit 52a27964 by xiaotong

better implementation of dropout

parent a8304bed
...@@ -87,6 +87,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient) ...@@ -87,6 +87,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradSum(node, isEfficient); GradSum(node, isEfficient);
else if(operID == MATH_SUMDIM) else if(operID == MATH_SUMDIM)
GradSumDim(node, isEfficient); GradSumDim(node, isEfficient);
else if(operID == MATH_SUMBROADCAST)
GradSumBroadcast(node, isEfficient);
else if(operID == REDUCE_REDUCEMEAN) else if(operID == REDUCE_REDUCEMEAN)
GradReduceMean(node, isEfficient); GradReduceMean(node, isEfficient);
else if(operID == REDUCE_REDUCESUM) else if(operID == REDUCE_REDUCESUM)
...@@ -817,6 +819,37 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient) ...@@ -817,6 +819,37 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
} }
/* /*
gradient for multiplication by broadcasting:
c = a * b
where some dimensions of b are of size 1
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0...n)
where a.reduce(0...n) is the reduction along the dimension
whose size is 1 in b. Note that there might be several reductions.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
{
    /* backward step for c = a * b, where b is broadcast over a.
       Only the gradient with respect to a is handled here; the
       gradient with respect to b is still a TODO. */
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");

    /* the two operands of the multiplication: a (lhs) and b (rhs) */
    XTensor * lhs = income.tails[0];
    XTensor * rhs = income.tails[1];

    /* first parameter stored on the link; read here but not used below */
    DTYPE beta = income.GetParam(0);

    /* prepare a->grad, then update it from dE/dc and b
       (presumably a->grad += dE/dc * b given the trailing 1.0F --
       confirm against _MultiplyBroadcast) */
    XNoder::MakeGrad(lhs);
    _MultiplyBroadcast(node->grad, rhs, lhs->grad, 1.0F);

    /* done if b is neither flagged as a variable nor has incoming tails */
    const bool rhsNeedsGrad = rhs->isVar || rhs->income.tailNum > 0;
    if(!rhsNeedsGrad)
        return;

    /* dE/db is not implemented yet */
    ShowNTErrors("TODO");
}
/*
gradient for negate gradient for negate
for for
c = -a c = -a
...@@ -1254,6 +1287,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient) ...@@ -1254,6 +1287,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
} }
/* /*
gradient for sum by broadcasting:
c = a + b * \beta
where some dimensions of b are of size 1
dE/da = dE/dc
dE/db = dE/dc.reduce(0..n) * \beta
where dE/dc.reduce(0..n) is the reduction of dE/dc along the dimensions
whose size is 1 in b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
{
    /* backward step for c = a + b * \beta, where b is broadcast over a.
       Only the gradient with respect to a is handled here; the
       gradient with respect to b is still a TODO. */
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");

    /* the two operands of the sum: a (augend) and b (addend) */
    XTensor * augend = income.tails[0];
    XTensor * addend = income.tails[1];

    /* scaling factor stored on the link; read here but not used
       until the gradient with respect to b is implemented */
    DTYPE beta = income.GetParam(0);

    /* dE/da = dE/dc: add the gradient of c into the gradient of a */
    XNoder::MakeGrad(augend);
    _Sum(augend->grad, node->grad, augend->grad);

    /* done if b is neither flagged as a variable nor has incoming tails */
    if(!addend->isVar && addend->income.tailNum <= 0)
        return;

    /* dE/db is not implemented yet */
    ShowNTErrors("TODO");
}
/*
gradient for reduceMean gradient for reduceMean
for for
c = reduceMean(a, dim) c = reduceMean(a, dim)
......
...@@ -109,6 +109,11 @@ private: ...@@ -109,6 +109,11 @@ private:
static static
void GradMultiplyDim(XTensor * node, bool isEfficient); void GradMultiplyDim(XTensor * node, bool isEfficient);
/* gradient for multiplication by broadcasting: c = a * b
where some dimensions of b are of size 1 */
static
void GradMultiplyBroadcast(XTensor * node, bool isEfficient);
/* gradient for negate */ /* gradient for negate */
static static
void GradNegate(XTensor * node, bool isEfficient); void GradNegate(XTensor * node, bool isEfficient);
...@@ -143,6 +148,11 @@ private: ...@@ -143,6 +148,11 @@ private:
static static
void GradSumDim(XTensor * node, bool isEfficient); void GradSumDim(XTensor * node, bool isEfficient);
/* gradient for sum by broadcasting: c = a + b * \beta
where some dimensions of b are of size 1 */
static
void GradSumBroadcast(XTensor * node, bool isEfficient);
/* gradient for reduceMean */ /* gradient for reduceMean */
static static
void GradReduceMean(XTensor * node, bool isEfficient); void GradReduceMean(XTensor * node, bool isEfficient);
......
...@@ -82,7 +82,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X ...@@ -82,7 +82,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP); x = Dropout(x, dropoutP, 2);
for(int i = 0; i < nlayer; i++){ for(int i = 0; i < nlayer; i++){
XTensor att; XTensor att;
...@@ -97,7 +97,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X ...@@ -97,7 +97,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP); att = Dropout(att, dropoutP, 2);
/* residual connection */ /* residual connection */
res = Sum(att, x); res = Sum(att, x);
...@@ -111,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X ...@@ -111,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP); ende = Dropout(ende, dropoutP, 2);
/* residual connection */ /* residual connection */
res = Sum(ende, x); res = Sum(ende, x);
...@@ -125,7 +125,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X ...@@ -125,7 +125,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP); fnn = Dropout(fnn, dropoutP, 2);
/* residual connection */ /* residual connection */
res = Sum(fnn, x); res = Sum(fnn, x);
......
...@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo ...@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP); x = Dropout(x, dropoutP, 2);
for(int i = 0; i < nlayer; i++){ for(int i = 0; i < nlayer; i++){
XTensor att; XTensor att;
...@@ -120,7 +120,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo ...@@ -120,7 +120,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP); att = Dropout(att, dropoutP, 2);
/* residual connection */ /* residual connection */
res = Sum(att, x); res = Sum(att, x);
...@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo ...@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP); fnn = Dropout(fnn, dropoutP, 2);
/* residual connection */ /* residual connection */
res = Sum(fnn, x); res = Sum(fnn, x);
......
...@@ -274,7 +274,9 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe ...@@ -274,7 +274,9 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum(&maskEnc, padding3, &maskEnc); _Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining); encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining); decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output); outputLayer->Make(decoding, output);
delete[] dims; delete[] dims;
......
...@@ -60,6 +60,7 @@ int TransformerMain(int argc, const char ** argv) ...@@ -60,6 +60,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "output", outputFN, ""); LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL)); srand((unsigned int)time(NULL));
T2TTrainer trainer; T2TTrainer trainer;
trainer.Init(argc, args); trainer.Init(argc, args);
......
...@@ -39,7 +39,7 @@ for more details. ...@@ -39,7 +39,7 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data. the same inference procedure as that on the test data with no use of dropout.
>> x - input tensor >> x - input tensor
>> y - output tensor >> y - output tensor
...@@ -138,12 +138,21 @@ the same inference procedure as that with no use of dropout on the test data. ...@@ -138,12 +138,21 @@ the same inference procedure as that with no use of dropout on the test data.
>> x - input tensor >> x - input tensor
>> dropProb - probability to set an element to zero >> dropProb - probability to set an element to zero
>> leadingDim - the dimension which we generate the random numbers and perform broadcasting >> leadingDim - the dimension which we generate the random numbers and perform broadcasting
>> leadingDim2 - another dimension which we generate the random numbers and perform broadcasting
<< return - tensor after dropout
*/ */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim2)
{ {
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!"); CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
int n = leadingDim < 0 ? x.order - 1 : leadingDim; XTensor mask;
DTYPE * maskArray = NULL;
if(leadingDim < 0 && leadingDim2 < 0){
ShowNTErrors("TODO");
}
else if(leadingDim2 < 0){
int n = leadingDim;
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!"); CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
...@@ -151,7 +160,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) ...@@ -151,7 +160,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
/* generate a mask tensor with probability p */ /* generate a mask tensor with probability p */
int unitNum = x.dimSize[n]; int unitNum = x.dimSize[n];
DTYPE * maskArray = new DTYPE[unitNum]; maskArray = new DTYPE[unitNum];
//srand((unsigned int)time(NULL)); //srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++) for (int i = 0; i < unitNum; i++)
...@@ -163,7 +172,41 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) ...@@ -163,7 +172,41 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
delete[] maskArray; delete[] maskArray;
return MultiplyDim(x, mask, n, 0); return MultiplyDim(x, mask, n);
}
else{
int n = leadingDim;
int m = leadingDim2;
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n] * x.dimSize[m];
maskArray = new DTYPE[unitNum];
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < x.order; i++)
dims[i] = 1;
dims[n] = x.GetDim(n);
dims[m] = x.GetDim(m);
InitTensor(&mask, x.order, dims, x.dataType, x.denseRatio,x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
return MultiplyBroadcast(x, mask);
}
} }
/* /*
...@@ -182,7 +225,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb) ...@@ -182,7 +225,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
int unitNum = x.unitNum; int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum]; DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++) for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor); maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
......
...@@ -41,7 +41,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x, ...@@ -41,7 +41,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
unsigned int seed, DTYPE dropProb, int leadingDim = -1); unsigned int seed, DTYPE dropProb, int leadingDim = -1);
/* dropout function */ /* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1); XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1, int leadingDim2 = -1);
/* dropout function without broadcast */ /* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb); XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论