Commit 90dc67f2 by xiaotong

fix bugs in back propagation and transformer

parent 50c3670f
@@ -468,21 +468,19 @@ void XMathGrad::GradPower(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XTensor * c = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     DTYPE p = income.GetParam(0);
     XNoder::MakeGrad(a);
-    _Power(a, b, (p-1)/p);
-    _ScaleAndShift(b, c, p);
-    _Multiply(node->grad, c, a->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete b;
-    delete c;
+    _Power(a, b, p - 1.0F);
+    _ScaleAndShiftMe(b, p);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+    DelTensor(b);
+    node->visitMark = NODE_FINISHED;
 }
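Note: the old code raised a to the power (p-1)/p, which is not the derivative of a^p; by the chain rule, d(a^p)/da = p * a^(p-1), which is exactly what the new _Power / _ScaleAndShiftMe pair builds before multiplying by the incoming gradient. (One inconsistency worth flagging: b is allocated with NewTensorBuf but released with DelTensor here, while the sibling hunks below pair NewTensorBuf with DelTensorBuf.) A minimal standalone check of the corrected formula against a central finite difference, in plain C++ with no NiuTensor dependencies:

    #include <cmath>
    #include <cstdio>

    /* compare the analytic gradient p * a^(p-1) of f(a) = a^p
       against a central finite difference */
    int main()
    {
        const float p = 3.0F;
        const float a = 1.7F;
        const float eps = 1e-3F;

        float analytic = p * std::pow(a, p - 1.0F);
        float numeric = (std::pow(a + eps, p) - std::pow(a - eps, p)) / (2.0F * eps);

        std::printf("analytic=%f numeric=%f\n", analytic, numeric);
        return 0;
    }

The two values should agree to roughly the square of eps.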
 /*
@@ -499,16 +497,16 @@ void XMathGrad::GradNegate(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     XNoder::MakeGrad(a);
     _ScaleAndShift(node->grad, b, -1.0F);
     _Sum(a->grad, b, a->grad);
-    node->visitMark = NODE_FINISHED;
-    delete b;
+    DelTensorBuf(b);
+    node->visitMark = NODE_FINISHED;
 }
 /*
@@ -525,18 +523,14 @@ void XMathGrad::GradScaleAndShift(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
     DTYPE scale = income.GetParam(0);
     XNoder::MakeGrad(a);
-    _ScaleAndShift(node->grad, b, scale);
-    _Sum(a->grad, b, a->grad);
+    _Sum(a->grad, node->grad, a->grad, scale);
    node->visitMark = NODE_FINISHED;
-    delete b;
 }
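Note: since d(scale * a + shift)/da = scale, the backward pass only needs a->grad += scale * node->grad. The rewrite folds this into a single _Sum call with a scale factor and drops the temporary tensor entirely. The same update in scalar form (names are illustrative, not the NiuTensor API):

    #include <cstdio>
    #include <vector>

    /* the fused update the new code performs with one _Sum call:
       aGrad += scale * nodeGrad */
    void AccumulateScaled(std::vector<float> & aGrad,
                          const std::vector<float> & nodeGrad, float scale)
    {
        for(size_t i = 0; i < aGrad.size(); i++)
            aGrad[i] += scale * nodeGrad[i];
    }

    int main()
    {
        std::vector<float> aGrad = {0.1F, 0.2F};
        std::vector<float> nodeGrad = {1.0F, -1.0F};

        AccumulateScaled(aGrad, nodeGrad, 2.0F);
        std::printf("%f %f\n", aGrad[0], aGrad[1]);   /* 2.1 -1.8 */
        return 0;
    }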
 /*
@@ -582,9 +576,7 @@ void XMathGrad::GradDiv(XTensor * node)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    XTensor * c = NewTensor(b);
-    XTensor * d = NewTensor(b);
-    XTensor * e = NewTensor(b);
+    XTensor * ab2 = NewTensorBuf(a, a->devID, a->mem);
     XNoder::MakeGrad(a);
     XNoder::MakeGrad(b);
@@ -592,16 +584,15 @@ void XMathGrad::GradDiv(XTensor * node)
     CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
     _Div(node->grad, b, a->grad, 1.0F);
-    _Power(b, c, -2.0F);
-    _Multiply(a, c, d);
-    _ScaleAndShift(d, e, -1.0F);
-    _Multiply(node->grad, e, b->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete c;
-    delete d;
-    delete e;
+    _Power(b, ab2, -2.0F);
+    _Multiply(a, ab2, ab2);
+    _ScaleAndShiftMe(ab2, -1.0F);
+    _Multiply(node->grad, ab2, b->grad, 1.0F);
+    DelTensorBuf(ab2);
+    node->visitMark = NODE_FINISHED;
 }
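Note: for c = a / b the partials are dc/da = 1/b and dc/db = -a/b^2; the rewrite computes the second one in a single reused buffer (ab2) instead of three temporaries. A finite-difference check of dc/db in plain C++:

    #include <cmath>
    #include <cstdio>

    /* for c = a / b: dc/da = 1/b and dc/db = -a/b^2;
       check dc/db against a central finite difference */
    int main()
    {
        const float a = 2.0F, b = 0.8F, eps = 1e-3F;

        float analytic = -a / (b * b);
        float numeric = (a / (b + eps) - a / (b - eps)) / (2.0F * eps);

        std::printf("d(a/b)/db: analytic=%f numeric=%f\n", analytic, numeric);
        return 0;
    }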
 /*
@@ -618,16 +609,16 @@ void XMathGrad::GradExp(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     XNoder::MakeGrad(a);
     _Exp(a, b);
     _Multiply(node->grad, b, a->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete b;
+    DelTensorBuf(b);
+    node->visitMark = NODE_FINISHED;
 }
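Note: d(e^a)/da = e^a, so the forward value is simply recomputed into the buffer and multiplied by the incoming gradient. A quick check:

    #include <cmath>
    #include <cstdio>

    /* d(exp a)/da = exp(a) */
    int main()
    {
        const float a = 0.9F, eps = 1e-4F;

        float analytic = std::exp(a);
        float numeric = (std::exp(a + eps) - std::exp(a - eps)) / (2.0F * eps);

        std::printf("analytic=%f numeric=%f\n", analytic, numeric);
        return 0;
    }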
 /*
@@ -644,16 +635,16 @@ void XMathGrad::GradSin(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     XNoder::MakeGrad(a);
     _Cos(a, b);
     _Multiply(node->grad, b, a->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete b;
+    DelTensorBuf(b);
+    node->visitMark = NODE_FINISHED;
 }
 /*
@@ -670,19 +661,17 @@ void XMathGrad::GradCos(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XTensor * c = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     XNoder::MakeGrad(a);
     _Sin(a, b);
-    _ScaleAndShift(b, c, -1.0F);
-    _Multiply(node->grad, c, a->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete b;
-    delete c;
+    _ScaleAndShiftMe(b, -1.0F);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+    DelTensorBuf(b);
+    node->visitMark = NODE_FINISHED;
 }
 /*
@@ -699,19 +688,17 @@ void XMathGrad::GradTan(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XTensor * c = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     XNoder::MakeGrad(a);
     _Cos(a, b);
-    _Power(b, c, -2.0F);
-    _Multiply(node->grad, c, a->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete b;
-    delete c;
+    _PowerMe(b, -2.0F);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+    DelTensorBuf(b);
+    node->visitMark = NODE_FINISHED;
 }
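Note: the trig gradients all follow the same pattern: d(sin a)/da = cos(a), d(cos a)/da = -sin(a), and d(tan a)/da = 1/cos(a)^2; the in-place _ScaleAndShiftMe/_PowerMe variants remove one temporary in each case. Checking the tangent case numerically:

    #include <cmath>
    #include <cstdio>

    /* d(tan a)/da = 1/cos(a)^2, i.e. _Cos followed by _PowerMe(b, -2.0F) */
    int main()
    {
        const float a = 0.6F, eps = 1e-4F;

        float analytic = 1.0F / (std::cos(a) * std::cos(a));
        float numeric = (std::tan(a + eps) - std::tan(a - eps)) / (2.0F * eps);

        std::printf("analytic=%f numeric=%f\n", analytic, numeric);
        return 0;
    }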
 /*
@@ -817,16 +804,16 @@ void XMathGrad::GradAbsolute(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     XNoder::MakeGrad(a);
     _Sign(a, b);
     _Multiply(node->grad, b, a->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete b;
+    DelTensorBuf(b);
+    node->visitMark = NODE_FINISHED;
 }
 /*
@@ -842,17 +829,9 @@ void XMathGrad::GradSign(XTensor * node)
     XLink &income = node->income;
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
-    XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XNoder::MakeGrad(a);
-    b->SetZeroAll();
-    _Sum(a->grad, b, a->grad);
+    // we do nothing here
     node->visitMark = NODE_FINISHED;
-    delete b;
 }
 /*
@@ -868,17 +847,9 @@ void XMathGrad::GradRound(XTensor * node)
     XLink &income = node->income;
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
-    XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XNoder::MakeGrad(a);
-    b->SetZeroAll();
-    _Sum(a->grad, b, a->grad);
+    // we do nothing here
    node->visitMark = NODE_FINISHED;
-    delete b;
 }
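Note: sign() and round() are piecewise constant, so their derivative is zero almost everywhere; the deleted code allocated a zero tensor and added it to a->grad, a pure no-op. A finite difference shows the flat gradient (away from the jumps):

    #include <cmath>
    #include <cstdio>

    /* round() is flat almost everywhere, so the finite difference of its
       output is 0 except exactly at the jumps; the gradient contribution
       the old code added (an all-zero tensor) changed nothing */
    int main()
    {
        const float a = 0.3F, eps = 1e-3F;

        float numeric = (std::round(a + eps) - std::round(a - eps)) / (2.0F * eps);
        std::printf("d(round)/da at %.1f: %f\n", a, numeric);   /* 0 */
        return 0;
    }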
 /*
@@ -894,7 +865,7 @@ void XMathGrad::GradClip(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     DTYPE lower = income.GetParam(0);
     DTYPE upper = income.GetParam(1);
@@ -904,9 +875,9 @@ void XMathGrad::GradClip(XTensor * node)
     _ClipBackward(node, a, node->grad, a->grad, lower, upper);
     _Sum(a->grad, b, a->grad);
-    node->visitMark = NODE_FINISHED;
-    delete b;
+    DelTensorBuf(b);
+    node->visitMark = NODE_FINISHED;
 }
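Note: clip passes the gradient through where lower <= a <= upper and blocks it elsewhere, which is what _ClipBackward computes before accumulation. A scalar sketch (the exact boundary convention of _ClipBackward may differ):

    #include <cstdio>

    /* gradient of clip(a, lower, upper): 1 inside the interval, 0 outside */
    float ClipGrad(float a, float lower, float upper)
    {
        return (a >= lower && a <= upper) ? 1.0F : 0.0F;
    }

    int main()
    {
        std::printf("%f %f %f\n",
                    ClipGrad(-2.0F, -1.0F, 1.0F),   /* 0 */
                    ClipGrad(0.5F, -1.0F, 1.0F),    /* 1 */
                    ClipGrad(2.0F, -1.0F, 1.0F));   /* 0 */
        return 0;
    }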
 /*
@@ -923,21 +894,20 @@ void XMathGrad::GradReduceMean(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XTensor * c = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
     XNoder::MakeGrad(a);
     _Unsqueeze(node->grad, b, dim, n);
-    _ScaleAndShift(b, c, 1.0F/n);
-    _Sum(a->grad, c, a->grad);
-    node->visitMark = NODE_FINISHED;
-    delete b;
-    delete c;
+    _ScaleAndShiftMe(b, 1.0F/n);
+    _Sum(a->grad, b, a->grad);
+    DelTensorBuf(b);
+    node->visitMark = NODE_FINISHED;
 }
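Note: for a mean over an axis of size n, each input element receives node->grad / n; _Unsqueeze replicates the gradient along the reduced axis and the in-place scale by 1/n replaces the old extra temporary. The same logic in scalar form:

    #include <cstdio>
    #include <vector>

    /* backward of y = mean(x) over an axis of size n: every input element
       receives gradY / n (replicate, scale in place, accumulate) */
    int main()
    {
        const int n = 4;
        const float gradY = 1.0F;
        std::vector<float> gradX(n, 0.0F);

        for(int i = 0; i < n; i++)
            gradX[i] += gradY / n;

        for(int i = 0; i < n; i++)
            std::printf("%f ", gradX[i]);   /* 0.25 each */
        std::printf("\n");
        return 0;
    }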
 /*
@@ -954,18 +924,19 @@ void XMathGrad::GradReduceSum(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
     XNoder::MakeGrad(a);
     _Unsqueeze(node->grad, b, dim, n);
     _Sum(a->grad, b, a->grad);
-    node->visitMark = NODE_FINISHED;
-    delete b;
+    DelTensor(b);
+    node->visitMark = NODE_FINISHED;
 }
 /*
@@ -984,9 +955,9 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    XTensor * c = NewTensor(a);
-    XTensor * d = NewTensor(b);
-    XTensor * e = NewTensor(c);
+    XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+    XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+    XTensor * e = NewTensorBuf(a, a->devID, a->mem);
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
@@ -999,11 +970,11 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
     _Multiply(e, c, a->grad, 1.0F);
     _Multiply(node->grad, d, b->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete c;
-    delete d;
-    delete e;
+    DelTensorBuf(c);
+    DelTensorBuf(d);
+    DelTensorBuf(e);
+    node->visitMark = NODE_FINISHED;
 }
 /*
@@ -1022,9 +993,9 @@ void XMathGrad::GradReduceVariance(XTensor * node)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    XTensor * c = NewTensor(a);
-    XTensor * d = NewTensor(b);
-    XTensor * e = NewTensor(a);
+    XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+    XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+    XTensor * e = NewTensorBuf(a, a->devID, a->mem);
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
@@ -1037,11 +1008,11 @@ void XMathGrad::GradReduceVariance(XTensor * node)
     _Multiply(e, c, a->grad, 1.0F);
     _Multiply(node->grad, d, b->grad, 1.0F);
-    node->visitMark = NODE_FINISHED;
-    delete c;
-    delete d;
-    delete e;
+    DelTensorBuf(c);
+    DelTensorBuf(d);
+    DelTensorBuf(e);
+    node->visitMark = NODE_FINISHED;
 }
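Note: the unchanged middles of these two reduce functions are elided by the diff; only the temporaries move onto the memory buffer. For reference, assuming b holds the per-axis mean (the second input, as the two-tail structure suggests), the variance v = (1/n) * sum_i (a_i - b)^2 has dv/da_i = 2(a_i - b)/n, which a finite difference confirms:

    #include <cstdio>

    /* v = (1/n) * sum_i (a_i - b)^2  =>  dv/da_i = 2 * (a_i - b) / n */
    int main()
    {
        const int n = 4;
        const float b = 3.5F, eps = 1e-3F;
        float a[] = {1.0F, 2.0F, 4.0F, 7.0F};

        /* variance with a[0] replaced by t */
        auto var = [&](float t) {
            float s = (t - b) * (t - b);
            for(int i = 1; i < n; i++)
                s += (a[i] - b) * (a[i] - b);
            return s / n;
        };

        float analytic = 2.0F * (a[0] - b) / n;
        float numeric = (var(a[0] + eps) - var(a[0] - eps)) / (2.0F * eps);

        std::printf("analytic=%f numeric=%f\n", analytic, numeric);
        return 0;
    }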
 }
@@ -66,8 +66,9 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
     InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
-    float finfoutk = sqrt(6/(d + dk));
-    float finfoutv = sqrt(6/(d + dv));
+    float scale = 1.0F;
+    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
+    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
     wk.SetDataRand(-finfoutk, finfoutk);
     wq.SetDataRand(-finfoutk, finfoutk);
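Note: this is one of the real bugs the commit message refers to: 6/(d + dk) is integer division, so for any d + dk > 6 the Xavier bound evaluated to sqrt(0) = 0 and the attention weights were initialized to all zeros. Demonstration (d = 512 and dk = 64 are illustrative values, not taken from this commit):

    #include <cmath>
    #include <cstdio>

    /* the old bound collapses to zero through integer division */
    int main()
    {
        int d = 512, dk = 64;

        float oldBound = (float)std::sqrt((double)(6 / (d + dk)));   /* sqrt(0) = 0 */
        float newBound = (float)std::sqrt(6.0F / (d + dk));          /* ~0.102 */

        std::printf("old=%f new=%f\n", oldBound, newBound);
        return 0;
    }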
@@ -107,7 +108,7 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
     XTensor scalar;
     /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
-    scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/sqrt((float)dk)), -1);
+    scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/(float)sqrt((float)dk)), -1);
     att = BMMul(scalar, vheads);
     /* concatenate the heads */
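Note: this change only makes the float cast on sqrt's result explicit; the 1/sqrt(dk) factor itself is the standard scaled dot-product normalization. A dot product of dk roughly unit-variance terms has variance about dk, so dividing by sqrt(dk) keeps the softmax logits in a reasonable range. An empirical check:

    #include <cmath>
    #include <cstdio>
    #include <cstdlib>

    /* a dot product of dk unit-variance terms has variance about dk,
       hence the 1/sqrt(dk) factor before the softmax */
    int main()
    {
        const int dk = 64, trials = 10000;
        double sum = 0.0, sumSq = 0.0;

        std::srand(42);
        for(int t = 0; t < trials; t++){
            double dot = 0.0;
            for(int i = 0; i < dk; i++){
                /* uniform on [-sqrt(3), sqrt(3)] has unit variance */
                double q = (2.0 * std::rand() / RAND_MAX - 1.0) * std::sqrt(3.0);
                double k = (2.0 * std::rand() / RAND_MAX - 1.0) * std::sqrt(3.0);
                dot += q * k;
            }
            sum += dot;
            sumSq += dot * dot;
        }

        double mean = sum / trials;
        std::printf("var(q.k) = %.1f, dk = %d\n", sumSq / trials - mean * mean, dk);
        return 0;
    }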
......
@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
-    w.SetDataRandn(0, 1/sqrt((float)eSize));
+    w.SetDataRandn(0, 1/(float)sqrt((float)eSize));
     /* create the positional embedding matrix */
     MakePosEmbedding(eSize, d, maxLength);
@@ -84,11 +84,11 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
         for(int k = 0; k < eSize; k++){
             if(k % 2 == 0){
                 int i = k/2;
-                dp[k] = sin(pos/pow(10000.0F, 2.0F*i/d));
+                dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
             }
             else{
                 int i = (k - 1)/2;
-                dp[k] = cos(pos/pow(10000.0F, 2.0F*i/d));
+                dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
             }
         }
     }
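Note: the added casts just make the double-to-float conversion explicit; the loop itself is the standard sinusoidal positional encoding, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)). A standalone version that prints the first positions for a toy dimension:

    #include <cmath>
    #include <cstdio>

    /* PE(pos, 2i) = sin(pos / 10000^(2i/d)), PE(pos, 2i+1) = cos(...) */
    int main()
    {
        const int d = 8;

        for(int pos = 0; pos < 2; pos++){
            for(int k = 0; k < d; k++){
                int i = (k % 2 == 0) ? k/2 : (k - 1)/2;
                float arg = pos / (float)std::pow(10000.0F, 2.0F * i / d);
                float v = (k % 2 == 0) ? (float)std::sin(arg) : (float)std::cos(arg);
                std::printf("%6.3f ", v);
            }
            std::printf("\n");
        }
        return 0;
    }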
......
@@ -67,8 +67,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
     InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
-    float finfout1 = sqrt(6/(inSize + hSize));
-    float finfout2 = sqrt(6/(hSize + outSize));
+    float scale = 1.0F;
+    float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
+    float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
     w1.SetDataRand(-finfout1, finfout1);
     b1.SetZeroAll();
......
@@ -63,7 +63,8 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
-    float finfout = sqrt(6/(hSize + vSize));
+    float scale = 1.0F;
+    float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
     w.SetDataRand(-finfout, finfout);
 }
......
@@ -112,8 +112,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
             net.Backward(output, batch, CROSSENTROPY);
             /* learning rate */
-            lr = (1/sqrt((float)d)) * MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
-            lr = 0.000005F;
+            lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
+            lr = 0.000002F;
             /* update the parameters */
             Update(model, lr);
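Note: the first line is the warmup schedule from "Attention Is All You Need", lr = d^-0.5 * min(step^-0.5, step * nwarmup^-1.5); the following line still overrides it with a constant, merely lowered in this commit from 5e-6 to 2e-6, so the schedule is effectively disabled. The schedule on its own (d = 512 and nwarmup = 4000 are illustrative values):

    #include <cmath>
    #include <cstdio>

    /* lr(step) = d^-0.5 * min((step+1)^-0.5, (step+1) * nwarmup^-1.5) */
    int main()
    {
        const int d = 512, nwarmup = 4000;

        for(int step = 0; step <= 8000; step += 2000){
            double lr = (1.0 / std::sqrt((double)d)) *
                        std::fmin(std::pow(step + 1.0, -0.5),
                                  (step + 1.0) * std::pow((double)nwarmup, -1.5));
            std::printf("step=%5d lr=%.3e\n", step, lr);
        }
        return 0;
    }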
@@ -132,7 +132,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
             if (step % 1 == 0) {
                 double elapsed = GetClockSec() - startT;
-                XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
+                XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
                         lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
             }
         }
@@ -142,7 +142,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
     double elapsed = GetClockSec() - startT;
-    XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
+    XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
             lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
     XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
             elapsed, step, epoch);
......
@@ -71,7 +71,7 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
     }
     DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
-    DTYPE a = sqrt(3.0) * std;
+    DTYPE a = (DTYPE)sqrt(3.0) * std;
     _SetDataRand(tensor, -a, a);
 }
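Note: only a cast is added here, but the surrounding math is worth spelling out: a uniform draw on [-a, a] has standard deviation a/sqrt(3), so choosing a = sqrt(3) * std makes the sampled weights hit the Xavier target std = gain * sqrt(2/(fanIn + fanOut)). Sketched with illustrative fan sizes:

    #include <cmath>
    #include <cstdio>

    /* uniform on [-a, a] has std a/sqrt(3), so a = sqrt(3) * std
       realizes the Xavier target standard deviation */
    int main()
    {
        double fanIn = 512.0, fanOut = 512.0, gain = 1.0;

        double targetStd = gain * std::sqrt(2.0 / (fanIn + fanOut));
        double a = std::sqrt(3.0) * targetStd;

        std::printf("target std=%f bound=%f realized std=%f\n",
                    targetStd, a, a / std::sqrt(3.0));
        return 0;
    }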
......
@@ -103,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
         else{
             for(int i = 0; i < n; i++){
                 DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j];
-                if(IsNAN(r))
-                    r = DTYPE_MIN;
-                if(IsINF(r))
-                    r = DTYPE_MIN;
+                if (r > (DTYPE)1.0F)
+                    r = (DTYPE)1.0F;
+                else if (r < 0)
+                    r = 0;
                 op[i * m + j] = r;
             }
         }
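Note: instead of patching NaN/INF after the fact, the result is now clamped to [0, 1], the only range a softmax output can mathematically take; the two CUDA kernels below receive the identical guard. A max-shifted softmax with the same clamp, sketched in plain C++ (not the NiuTensor API):

    #include <cmath>
    #include <cstdio>

    /* max-shifted softmax with the [0, 1] clamp the fix applies; the
       clamp guards against rounding pushing a value just outside the
       mathematically valid range */
    void Softmax(const float * x, float * y, int n)
    {
        float mx = x[0];
        for(int i = 1; i < n; i++)
            if(x[i] > mx)
                mx = x[i];

        float sum = 0.0F;
        for(int i = 0; i < n; i++){
            y[i] = std::exp(x[i] - mx);
            sum += y[i];
        }

        for(int i = 0; i < n; i++){
            float r = y[i] / sum;
            if(r > 1.0F)
                r = 1.0F;
            else if(r < 0.0F)
                r = 0.0F;
            y[i] = r;
        }
    }

    int main()
    {
        float x[] = {1.0F, 2.0F, 3.0F, 4.0F};
        float y[4];

        Softmax(x, y, 4);
        for(int i = 0; i < 4; i++)
            std::printf("%f ", y[i]);
        std::printf("\n");
        return 0;
    }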
......
@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y, ...
     if(i < strideSizeTotal && j < strideNum){
         int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
-        y[offset] = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+        DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+        if (r > (DTYPE)1.0F)
+            r = (DTYPE)1.0F;
+        else if (r < 0)
+            r = 0;
+        y[offset] = r;
     }
 }
@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * ...
     maxData = broadcast(maxData);
     if (i < strideNum){
         int offset = int(j / stride) * blockSize + i * stride + i2;
-        output[offset] = exp(input[offset] - maxData) / sumData;
+        DTYPE r = exp(input[offset] - maxData) / sumData;
+        if (r > (DTYPE)1.0F)
+            r = (DTYPE)1.0F;
+        else if (r < 0)
+            r = 0;
+        output[offset] = r;
         }
     }
 }
......