Commit 90dc67f2 by xiaotong

fix bugs in back propagation and transformer

parent 50c3670f
@@ -468,21 +468,19 @@ void XMathGrad::GradPower(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XTensor * c = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE p = income.GetParam(0);
XNoder::MakeGrad(a);
- _Power(a, b, (p-1)/p);
- _ScaleAndShift(b, c, p);
- _Multiply(node->grad, c, a->grad, 1.0F);
+ _Power(a, b, p - 1.0F);
+ _ScaleAndShiftMe(b, p);
+ _Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensor(b);
- delete b;
- delete c;
+ node->visitMark = NODE_FINISHED;
}
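Note on the hunk above: for y = x^p the chain rule gives dL/dx = dL/dy * p * x^(p-1); the old code raised a to the power (p-1)/p, which is not that derivative, and the rewrite also reuses a single buffer tensor via the in-place _ScaleAndShiftMe. A minimal standalone sketch of the corrected rule (hypothetical helper in plain C++, not the NiuTensor API):

    #include <cmath>

    // backward of y = x^p: accumulate dL/dx += dL/dy * p * x^(p-1),
    // mirroring _Power(a, b, p - 1.0F); _ScaleAndShiftMe(b, p); _Multiply(node->grad, b, a->grad, 1.0F);
    void PowerBackward(const float * x, const float * dy, float * dx, int n, float p)
    {
        for (int i = 0; i < n; i++)
            dx[i] += dy[i] * p * std::pow(x[i], p - 1.0f);
    }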
/*
@@ -499,16 +497,16 @@ void XMathGrad::GradNegate(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, -1.0F);
_Sum(a->grad, b, a->grad);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -525,18 +523,14 @@ void XMathGrad::GradScaleAndShift(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
- _ScaleAndShift(node->grad, b, scale);
- _Sum(a->grad, b, a->grad);
+ _Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
- delete b;
}
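Because y = scale * x + shift has dy/dx = scale (the shift contributes nothing), the gradient can be folded into a single accumulation; the four-argument _Sum above appears to compute a->grad = a->grad + node->grad * scale, removing the temporary tensor. The equivalent rule in isolation (hypothetical helper):

    // backward of y = scale * x + shift: dL/dx += dL/dy * scale
    void ScaleShiftBackward(const float * dy, float * dx, int n, float scale)
    {
        for (int i = 0; i < n; i++)
            dx[i] += dy[i] * scale;
    }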
/*
@@ -582,9 +576,7 @@ void XMathGrad::GradDiv(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
- XTensor * c = NewTensor(b);
- XTensor * d = NewTensor(b);
- XTensor * e = NewTensor(b);
+ XTensor * ab2 = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
@@ -592,16 +584,15 @@ void XMathGrad::GradDiv(XTensor * node)
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Div(node->grad, b, a->grad, 1.0F);
- _Power(b, c, -2.0F);
- _Multiply(a, c, d);
- _ScaleAndShift(d, e, -1.0F);
- _Multiply(node->grad, e, b->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ _Power(b, ab2, -2.0F);
+ _Multiply(a, ab2, ab2);
+ _ScaleAndShiftMe(ab2, -1.0F);
+ _Multiply(node->grad, ab2, b->grad, 1.0F);
- delete c;
- delete d;
- delete e;
+ DelTensorBuf(ab2);
+ node->visitMark = NODE_FINISHED;
}
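For c = a / b (element-wise) the derivatives are dc/da = 1/b and dc/db = -a / b^2, which is exactly what the rewritten block computes with one buffer: ab2 holds b^-2, then a * b^-2, then its negation. The same rule as a plain C++ sketch (hypothetical helper, equal-shaped inputs assumed):

    // backward of c = a / b (element-wise):
    //   dL/da += dL/dc / b
    //   dL/db += dL/dc * (-a / b^2)
    void DivBackward(const float * a, const float * b, const float * dc,
                     float * da, float * db, int n)
    {
        for (int i = 0; i < n; i++) {
            da[i] += dc[i] / b[i];
            db[i] += dc[i] * (-a[i] / (b[i] * b[i]));
        }
    }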
/*
@@ -618,16 +609,16 @@ void XMathGrad::GradExp(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Exp(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -644,16 +635,16 @@ void XMathGrad::GradSin(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -670,19 +661,17 @@ void XMathGrad::GradCos(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XTensor * c = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sin(a, b);
- _ScaleAndShift(b, c, -1.0F);
- _Multiply(node->grad, c, a->grad, 1.0F);
+ _ScaleAndShiftMe(b, -1.0F);
+ _Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
- delete c;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -699,19 +688,17 @@ void XMathGrad::GradTan(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XTensor * c = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
- _Power(b, c, -2.0F);
- _Multiply(node->grad, c, a->grad, 1.0F);
+ _PowerMe(b, -2.0F);
+ _Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
- delete c;
+ node->visitMark = NODE_FINISHED;
}
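The three trigonometric rules above follow from d(sin x)/dx = cos x, d(cos x)/dx = -sin x, and d(tan x)/dx = 1/cos^2 x; the in-place _ScaleAndShiftMe/_PowerMe calls let each rule get by with one buffer. Standalone sketch (hypothetical helpers):

    #include <cmath>

    // y = sin(x):  dL/dx += dL/dy * cos(x)
    void SinBackward(const float * x, const float * dy, float * dx, int n)
    {
        for (int i = 0; i < n; i++)
            dx[i] += dy[i] * std::cos(x[i]);
    }

    // y = cos(x):  dL/dx += dL/dy * (-sin(x))
    void CosBackward(const float * x, const float * dy, float * dx, int n)
    {
        for (int i = 0; i < n; i++)
            dx[i] += dy[i] * (-std::sin(x[i]));
    }

    // y = tan(x):  dL/dx += dL/dy / cos(x)^2
    void TanBackward(const float * x, const float * dy, float * dx, int n)
    {
        for (int i = 0; i < n; i++) {
            float c = std::cos(x[i]);
            dx[i] += dy[i] / (c * c);
        }
    }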
/*
@@ -817,16 +804,16 @@ void XMathGrad::GradAbsolute(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sign(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -842,17 +829,9 @@ void XMathGrad::GradSign(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
- XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XNoder::MakeGrad(a);
- b->SetZeroAll();
- _Sum(a->grad, b, a->grad);
+ // we do nothing here
node->visitMark = NODE_FINISHED;
- delete b;
}
/*
@@ -868,17 +847,9 @@ void XMathGrad::GradRound(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
- XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XNoder::MakeGrad(a);
- b->SetZeroAll();
- _Sum(a->grad, b, a->grad);
+ // we do nothing here
node->visitMark = NODE_FINISHED;
- delete b;
}
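SIGN and ROUND are piecewise constant, so their derivative is zero almost everywhere; the old code built a zero tensor and added it to a->grad, a pure no-op, and the rewrite drops even that and simply marks the node finished without touching the input gradient.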
/*
@@ -894,7 +865,7 @@ void XMathGrad::GradClip(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1);
@@ -904,9 +875,9 @@ void XMathGrad::GradClip(XTensor * node)
_ClipBackward(node, a, node->grad, a->grad, lower, upper);
_Sum(a->grad, b, a->grad);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
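Clip only passes gradient where the input lies inside [lower, upper]; outside that range the output is constant, so the derivative is zero. That is the standard clip backward rule (the sketch below is a hypothetical standalone helper and does not mirror the exact _ClipBackward signature):

    // backward of y = clip(x, lower, upper):
    //   dL/dx += dL/dy where lower <= x <= upper, otherwise += 0
    void ClipBackward(const float * x, const float * dy, float * dx, int n,
                      float lower, float upper)
    {
        for (int i = 0; i < n; i++)
            if (x[i] >= lower && x[i] <= upper)
                dx[i] += dy[i];
    }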
/*
@@ -923,21 +894,20 @@ void XMathGrad::GradReduceMean(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XTensor * c = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
- _ScaleAndShift(b, c, 1.0F/n);
- _Sum(a->grad, c, a->grad);
+ _ScaleAndShiftMe(b, 1.0F/n);
+ _Sum(a->grad, b, a->grad);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
- delete c;
+ node->visitMark = NODE_FINISHED;
}
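ReduceMean over a dimension of size n averages n entries, so its backward broadcasts the output gradient back along that dimension (the _Unsqueeze call) and scales it by 1/n; scaling in place removes the second temporary. Rough sketch for a 2-D tensor reduced over its last dimension (hypothetical helper):

    // backward of y[i] = mean_j x[i][j] over rows of length n:
    //   dL/dx[i][j] += dL/dy[i] / n
    void ReduceMeanBackward(const float * dy, float * dx, int rows, int n)
    {
        for (int i = 0; i < rows; i++)
            for (int j = 0; j < n; j++)
                dx[i * n + j] += dy[i] / n;
    }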
/*
@@ -954,18 +924,19 @@ void XMathGrad::GradReduceSum(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
- node->visitMark = NODE_FINISHED;
+ DelTensor(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -984,9 +955,9 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
- XTensor * c = NewTensor(a);
- XTensor * d = NewTensor(b);
- XTensor * e = NewTensor(c);
+ XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+ XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+ XTensor * e = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
@@ -999,11 +970,11 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
_Multiply(e, c, a->grad, 1.0F);
_Multiply(node->grad, d, b->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(c);
+ DelTensorBuf(d);
+ DelTensorBuf(e);
- delete c;
- delete d;
- delete e;
+ node->visitMark = NODE_FINISHED;
}
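For reference, ReduceSumSquared computes y = sum over the reduced dimension of (a - b)^2 with b the shifting tensor, so the chain rule gives dL/da = dL/dy (broadcast) * 2(a - b) and dL/db = dL/dy * (-2) * sum(a - b); the elided middle of the function presumably builds those factors in c and d, with e holding the broadcast of node->grad that the first visible _Multiply applies to a->grad. GradReduceVariance below is analogous up to a 1/n factor.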
/*
@@ -1022,9 +993,9 @@ void XMathGrad::GradReduceVariance(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
- XTensor * c = NewTensor(a);
- XTensor * d = NewTensor(b);
- XTensor * e = NewTensor(a);
+ XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+ XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+ XTensor * e = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
@@ -1037,11 +1008,11 @@ void XMathGrad::GradReduceVariance(XTensor * node)
_Multiply(e, c, a->grad, 1.0F);
_Multiply(node->grad, d, b->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(c);
+ DelTensorBuf(d);
+ DelTensorBuf(e);
- delete c;
- delete d;
- delete e;
+ node->visitMark = NODE_FINISHED;
}
}
@@ -66,8 +66,9 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
- float finfoutk = sqrt(6/(d + dk));
- float finfoutv = sqrt(6/(d + dv));
+ float scale = 1.0F;
+ float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
+ float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
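The changed lines use the Xavier (Glorot) uniform range: weights drawn from U(-limit, limit) with limit = sqrt(6 * scale / (fan_in + fan_out)). With d and dk declared as int, the old expression 6/(d + dk) was integer division and truncated to 0 for any reasonably sized layer, so the 6.0F literal is the substantive fix; the same change is applied to T2TFNN and T2TOutput below. A small sketch of the rule (assumed helper, not the T2T code):

    #include <cmath>
    #include <cstdlib>

    // Xavier/Glorot uniform init: U(-limit, limit), limit = sqrt(6 * gain / (fanIn + fanOut)).
    // Note the float literal 6.0f, which avoids integer division when fanIn/fanOut are ints.
    void XavierUniform(float * w, int fanIn, int fanOut, float gain)
    {
        float limit = std::sqrt(6.0f * gain / (fanIn + fanOut));
        for (int i = 0; i < fanIn * fanOut; i++)
            w[i] = -limit + 2.0f * limit * (std::rand() / (float)RAND_MAX);
    }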
@@ -107,7 +108,7 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
- scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/sqrt((float)dk)), -1);
+ scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/(float)sqrt((float)dk)), -1);
att = BMMul(scalar, vheads);
/* concatenate the heads */
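The expression above is the scaled dot-product attention of the Transformer: Attention(Q, K, V) = softmax(Q K^T / sqrt(dk)) V; the edit only makes the 1/sqrt(dk) factor a float computation. For orientation, a single-head sketch with raw arrays (hypothetical function, not the XTensor API):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Q: [n, dk], K: [m, dk], V: [m, dv], out: [n, dv]
    // out = softmax(Q * K^T / sqrt(dk)) * V, softmax taken over the m dimension
    void ScaledDotAttention(const float * Q, const float * K, const float * V,
                            float * out, int n, int m, int dk, int dv)
    {
        std::vector<float> s(m);
        for (int i = 0; i < n; i++) {
            float maxv = -1e30f, sum = 0.0f;
            for (int j = 0; j < m; j++) {                       // scores for row i
                float dot = 0.0f;
                for (int k = 0; k < dk; k++)
                    dot += Q[i * dk + k] * K[j * dk + k];
                s[j] = dot / std::sqrt((float)dk);
                maxv = std::max(maxv, s[j]);
            }
            for (int j = 0; j < m; j++) {                       // numerically stable softmax
                s[j] = std::exp(s[j] - maxv);
                sum += s[j];
            }
            for (int v = 0; v < dv; v++) {                      // weighted sum of V rows
                float acc = 0.0f;
                for (int j = 0; j < m; j++)
                    acc += (s[j] / sum) * V[j * dv + v];
                out[i * dv + v] = acc;
            }
        }
    }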
@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
- w.SetDataRandn(0, 1/sqrt((float)eSize));
+ w.SetDataRandn(0, 1/(float)sqrt((float)eSize));
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
@@ -84,11 +84,11 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
for(int k = 0; k < eSize; k++){
if(k % 2 == 0){
int i = k/2;
- dp[k] = sin(pos/pow(10000.0F, 2.0F*i/d));
+ dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
}
else{
int i = (k - 1)/2;
- dp[k] = cos(pos/pow(10000.0F, 2.0F*i/d));
+ dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
}
}
}
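These are the sinusoidal positional embeddings PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)); the added casts only silence double-to-float conversion warnings, since sin/cos/pow return double. One position's vector, mirroring the loop above (sketch, assumed row layout):

    #include <cmath>

    // fill the embedding vector dp (length eSize) for a single position pos;
    // even indices take the sine term, odd indices the cosine term
    void FillPosEmbedding(float * dp, int eSize, int d, int pos)
    {
        for (int k = 0; k < eSize; k++) {
            int i = (k % 2 == 0) ? k / 2 : (k - 1) / 2;
            float freq = (float)std::pow(10000.0F, 2.0F * i / d);
            dp[k] = (k % 2 == 0) ? (float)std::sin(pos / freq) : (float)std::cos(pos / freq);
        }
    }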
@@ -67,8 +67,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
- float finfout1 = sqrt(6/(inSize + hSize));
- float finfout2 = sqrt(6/(hSize + outSize));
+ float scale = 1.0F;
+ float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
+ float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
w1.SetDataRand(-finfout1, finfout1);
b1.SetZeroAll();
@@ -63,7 +63,8 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
- float finfout = sqrt(6/(hSize + vSize));
+ float scale = 1.0F;
+ float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
}
@@ -112,8 +112,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
net.Backward(output, batch, CROSSENTROPY);
/* learning rate */
- lr = (1/sqrt((float)d)) * MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
- lr = 0.000005F;
+ lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
+ lr = 0.000002F;
/* update the parameters */
Update(model, lr);
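The first assignment is the Transformer warmup ("Noam") schedule lr = d^-0.5 * min((step+1)^-0.5, (step+1) * nwarmup^-1.5); the constant assignment on the next line then overrides it, so training effectively runs at a fixed rate, lowered here from 5e-6 to 2e-6. The schedule on its own (hedged sketch, not the trainer code):

    #include <algorithm>
    #include <cmath>

    // lr(step) = d^-0.5 * min((step+1)^-0.5, (step+1) * nwarmup^-1.5)
    float NoamLR(int d, int step, int nwarmup)
    {
        double s = step + 1;
        return (float)((1.0 / std::sqrt((double)d)) *
                       std::min(std::pow(s, -0.5), s * std::pow((double)nwarmup, -1.5)));
    }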
@@ -132,7 +132,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
- XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
+ XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
}
}
@@ -142,7 +142,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
double elapsed = GetClockSec() - startT;
- XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
+ XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
elapsed, step, epoch);
@@ -71,7 +71,7 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
}
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
- DTYPE a = sqrt(3.0) * std;
+ DTYPE a = (DTYPE)sqrt(3.0) * std;
_SetDataRand(tensor, -a, a);
}
@@ -103,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{
for(int i = 0; i < n; i++){
DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j];
- if(IsNAN(r))
- r = DTYPE_MIN;
- if(IsINF(r))
- r = DTYPE_MIN;
+ if (r > (DTYPE)1.0F)
+ r = (DTYPE)1.0F;
+ else if (r < 0)
+ r = 0;
op[i * m + j] = r;
}
}
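Here (and in the two CUDA kernels that follow) the NaN/INF checks are replaced by clamping each probability to [0, 1]; exp(x - max)/sum is already in that range mathematically, so the clamp only absorbs floating-point artifacts of the division. The same idea in isolation:

    // clamp a softmax output to the valid probability range [0, 1]
    inline float ClampProb(float r)
    {
        if (r > 1.0f)
            return 1.0f;
        if (r < 0.0f)
            return 0.0f;
        return r;
    }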
@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
- y[offset] = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+ DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+ if (r >(DTYPE)1.0F)
+ r = (DTYPE)1.0F;
+ else if (r < 0)
+ r = 0;
+ y[offset] = r;
}
}
@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE *
maxData = broadcast(maxData);
if (i < strideNum){
int offset = int(j / stride) * blockSize + i * stride + i2;
- output[offset] = exp(input[offset] - maxData) / sumData;
+ DTYPE r = exp(input[offset] - maxData) / sumData;
+ if (r > (DTYPE)1.0F)
+ r = (DTYPE)1.0F;
+ else if (r < 0)
+ r = 0;
+ output[offset] = r;
}
}
}