Commit 8d1ae93b by xiaotong

renaming and bug fixes

parent 90dc67f2
@@ -459,7 +459,7 @@ gradient for power
 for
 c = pow(a,p)
 we have
-dE/da = (dE/dc) * p*a^(p-1)
+dE/da = (dE/dc) * p * a^(p-1)
 >> node - the node (c) for backward computation
 */
 void XMathGrad::GradPower(XTensor * node)
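The only change here is cosmetic spacing in the comment; the rule itself, dE/da = (dE/dc) * p * a^(p-1), is unchanged. As a quick sanity check, here is a minimal self-contained C++ sketch of that elementwise rule on plain arrays (PowerBackward is a hypothetical helper, not part of the repository):

#include <cmath>

/* backward rule for c = pow(a, p): gradA[i] += gradC[i] * p * a[i]^(p-1) */
void PowerBackward(const float * a, const float * gradC, float * gradA, float p, int n)
{
    for (int i = 0; i < n; i++)
        gradA[i] += gradC[i] * p * std::pow(a[i], p - 1.0F);
}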
@@ -942,10 +942,10 @@ void XMathGrad::GradReduceSum(XTensor * node)
 /*
 gradient for reduceSumSquared
 for
-c = reduceSumSquared(a, dim, b)
+c = \sum_i (a_i - b)^2
 we have
 dE/da = Unsqueeze(dE/dc) * 2a
-dE/db = Unsqueeze(dE/dc) * (-2b)
+dE/db = dE/dc * -2 * n * b
 >> node - the node (c) for backward computation
 */
 void XMathGrad::GradReduceSumSquared(XTensor * node)
@@ -964,10 +964,13 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
 XNoder::MakeGrad(a);
 XNoder::MakeGrad(b);
+/* dE/da = Unsqueeze(dE/dc) * 2a */
 _ScaleAndShift(a, c, 2.0F);
-_ScaleAndShift(b, d, -2.0F);
 _Unsqueeze(node->grad, e, dim, n);
 _Multiply(e, c, a->grad, 1.0F);
+/* dE/db = dE/dc * -2 * n * b */
+_ScaleAndShift(b, d, -2.0F * n);
 _Multiply(node->grad, d, b->grad, 1.0F);
 DelTensorBuf(c);
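Read together, the two hunks above change both the comment and the code so that they agree: dE/da = Unsqueeze(dE/dc) * 2a and dE/db = dE/dc * -2 * n * b. A scalar sketch of what the _ScaleAndShift/_Unsqueeze/_Multiply sequence computes, on plain arrays rather than XTensor (illustrative helper only):

/* c = \sum_i (a_i - b)^2 over n elements; gradC is dE/dc for that single sum */
void ReduceSumSquaredBackward(const float * a, float b, float gradC,
                              float * gradA, float * gradB, int n)
{
    for (int i = 0; i < n; i++)
        gradA[i] += gradC * 2.0F * a[i];   /* dE/da = Unsqueeze(dE/dc) * 2a */
    *gradB += gradC * -2.0F * n * b;       /* dE/db = dE/dc * -2 * n * b    */
}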
@@ -980,10 +983,11 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
 /*
 gradient for reduceVariance
 for
-c = reduceVariance(a, dim, b)
+c = (sum_i (a_i - b)^2) * 1/n
+where b is the mean, and n is the size of a
 we have
-dE/da = Unsqueeze(dE/dc) * 2a/dimSizeA[dim]
-dE/db = Unsqueeze(dE/dc) * (-2a/dimSizeA[dim])
+dE/da = Unsqueeze(dE/dc) * 2a/n
+dE/db = dE/dc * -2 * b
 >> node - the node (c) for backward computation
 */
 void XMathGrad::GradReduceVariance(XTensor * node)
@@ -1002,10 +1006,13 @@ void XMathGrad::GradReduceVariance(XTensor * node)
 XNoder::MakeGrad(a);
 XNoder::MakeGrad(b);
+/* dE/da = Unsqueeze(dE/dc) * 2a/n */
 _ScaleAndShift(a, c, 2.0F / n);
-_ScaleAndShift(b, d, -2.0F / n);
 _Unsqueeze(node->grad, e, dim, n);
 _Multiply(e, c, a->grad, 1.0F);
+/* dE/db = dE/dc * -2 * b */
+_ScaleAndShift(b, d, -2.0F);
 _Multiply(node->grad, d, b->grad, 1.0F);
 DelTensorBuf(c);
...
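Same pattern for reduceVariance, which the updated comment now spells out as c = (sum_i (a_i - b)^2) * 1/n with b the mean along dim: the a-gradient is scaled by 2/n before being multiplied with the unsqueezed dE/dc, and the b-gradient becomes dE/dc times -2*b. The corresponding scalar sketch (plain arrays, illustrative only):

/* c = (1/n) * \sum_i (a_i - b)^2, b being the mean; gradC is dE/dc */
void ReduceVarianceBackward(const float * a, float b, float gradC,
                            float * gradA, float * gradB, int n)
{
    for (int i = 0; i < n; i++)
        gradA[i] += gradC * 2.0F * a[i] / n;   /* dE/da = Unsqueeze(dE/dc) * 2a/n */
    *gradB += gradC * -2.0F * b;               /* dE/db = dE/dc * -2 * b          */
}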
@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
 InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
-w.SetDataRandn(0, 1/(float)sqrt((float)eSize));
+w.SetDataRandn(0, 1.0F/(float)sqrt((float)eSize));
 /* create the positional embedding matrix */
 MakePosEmbedding(eSize, d, maxLength);
...
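The literal 1 becomes 1.0F, but the initialization is the same: the embedding matrix is filled from a Gaussian with mean 0 and, assuming SetDataRandn(mean, stddev) takes a standard deviation as its second argument, stddev 1/sqrt(eSize). A plain C++ sketch of that initialization (standalone, not repo code):

#include <cmath>
#include <random>
#include <vector>

/* fill a vSize x eSize embedding table with N(0, 1/sqrt(eSize)) samples */
std::vector<float> InitEmbedding(int vSize, int eSize, unsigned seed)
{
    std::mt19937 rng(seed);
    std::normal_distribution<float> dist(0.0F, 1.0F / std::sqrt((float)eSize));
    std::vector<float> w((size_t)vSize * eSize);
    for (float & x : w)
        x = dist(rng);
    return w;
}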
@@ -53,13 +53,13 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
 devID = myDevID;
 mem = myMem;
-LoadParamInt(argc, argv, "nstack", &nlayer, 6);
+LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
 LoadParamInt(argc, argv, "hsize", &hSize, 512);
 LoadParamInt(argc, argv, "esize", &eSize, 512);
 LoadParamInt(argc, argv, "vsize", &vSize, -1);
-CheckNTErrors(nlayer > 1, "We have one encoding layer at least!");
+CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
 CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
 /* embedding model */
...
@@ -88,10 +88,10 @@ XTensor T2TFNN::Make(XTensor &input)
 XTensor t1;
 /* t1 = max(0, x * w1 + b1) */
-t1 = Rectify(MMul(input, X_NOTRANS, w1, X_NOTRANS) + b1);
+t1 = Rectify(MMul(input, w1) + b1);
 /* result = t1 * w2 + b2 */
-return MMul(t1, X_NOTRANS, w2, X_NOTRANS) + b2;
+return MMul(t1, w2) + b2;
 }
...
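The MMul calls lose their explicit X_NOTRANS arguments (presumably the two-argument form defaults to no transposition), but the computation stated in the comments is unchanged: t1 = max(0, x * w1 + b1) followed by output = t1 * w2 + b2. A single-row sketch of that feed-forward block with plain loops in place of MMul/Rectify (names and row-major layout are assumptions):

#include <algorithm>
#include <vector>

/* y = max(0, x*W1 + b1) * W2 + b2 for one input row x of width inDim */
std::vector<float> FNNForward(const std::vector<float> & x,
                              const std::vector<float> & W1, const std::vector<float> & b1,
                              const std::vector<float> & W2, const std::vector<float> & b2,
                              int inDim, int hDim, int outDim)
{
    std::vector<float> t1(hDim), y(outDim);
    for (int j = 0; j < hDim; j++) {
        float s = b1[j];
        for (int i = 0; i < inDim; i++)
            s += x[i] * W1[i * hDim + j];
        t1[j] = std::max(0.0F, s);              /* Rectify */
    }
    for (int k = 0; k < outDim; k++) {
        float s = b2[k];
        for (int j = 0; j < hDim; j++)
            s += t1[j] * W2[j * outDim + k];
        y[k] = s;
    }
    return y;
}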
@@ -76,7 +76,7 @@ XTensor T2TLN::Make(XTensor &input)
 standard = Power(variance, 0.5F);
 /* unsqueeze mean and standard deviation to fit them into
-the same size of x */
+the same shape of x */
 meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
 standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
...
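For context, T2TLN::Make (only partly visible here) computes the mean and variance of x along its last dimension, takes standard = Power(variance, 0.5F), and unsqueezes both back to x's shape so x can be normalized with them. A one-row sketch of that normalization, leaving out any learned gain/bias (illustrative only):

#include <cmath>
#include <vector>

/* normalize one row x of width d: y = (x - mean) / std */
std::vector<float> LayerNormRow(const std::vector<float> & x)
{
    int d = (int)x.size();
    float mean = 0.0F, var = 0.0F;
    for (float v : x) mean += v;
    mean /= d;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= d;
    float standard = std::sqrt(var);            /* Power(variance, 0.5F) */
    std::vector<float> y(d);
    for (int i = 0; i < d; i++)
        y[i] = (x[i] - mean) / standard;
    return y;
}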
@@ -342,6 +342,9 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
 ws.Add(&model->encoder.fnns[i].b1);
 ws.Add(&model->encoder.fnns[i].w2);
 ws.Add(&model->encoder.fnns[i].b2);
+ws.Add(&model->encoder.attentions[i].wk);
+ws.Add(&model->encoder.attentions[i].wq);
+ws.Add(&model->encoder.attentions[i].wv);
 }
 ws.Add(&model->encoder.embedder.w);
@@ -353,16 +356,6 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
 CheckNTErrors(para != NULL, "NULL parameter tensor!");
 CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
-/*DTYPE * d = (DTYPE*)paraGrad->data;
-for(int i = 0; i < paraGrad->unitNum; i++){
-    if(IsINF(d[i])){
-        fprintf(stderr, "isinf %d\n", i);
-    }
-    if(IsNAN(d[i])){
-        fprintf(stderr, "isnan %d\n", i);
-    }
-}*/
 /* the delta rule */
 _Sum(para, paraGrad, para, -lr);
 }
...
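The update rule itself is the plain delta rule noted in the comment: each parameter moves against its gradient, para = para - lr * grad, which appears to be what _Sum(para, paraGrad, para, -lr) writes back into para (a sum with the gradient scaled by -lr). The commit also adds the attention projections wk/wq/wv to the list of tensors that receive this update. An array-level sketch of one such step:

/* one SGD step: para[i] <- para[i] - lr * grad[i] */
void DeltaRuleUpdate(float * para, const float * grad, float lr, int n)
{
    for (int i = 0; i < n; i++)
        para[i] -= lr * grad[i];
}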
@@ -60,9 +60,13 @@ void _Power(const XTensor * a, XTensor * b, DTYPE p)
 bData[i] = aData[i] * aData[i];
 }
 else {
-for (int i = 0; i < a->unitNum; i++)
+for (int i = 0; i < a->unitNum; i++) {
+    if (p < 0 && aData[i] == 0)
+        bData[i] = 1e20F;
+    else
     bData[i] = (DTYPE)pow(aData[i], p);
 }
+}
 }
 /*
...
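The new branch guards the case pow(0, p) with p < 0, which would otherwise divide by zero and produce inf; the CPU path here (and the CUDA kernels below) now clamp it to a large finite constant instead. A standalone sketch of the same guard (SafePow is a hypothetical name):

#include <cmath>

/* pow with the zero-base, negative-exponent case clamped to a large constant */
inline float SafePow(float x, float p)
{
    if (p < 0.0F && x == 0.0F)
        return 1e20F;   /* avoid inf from pow(0, p) with p < 0 */
    return std::pow(x, p);
}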
@@ -77,8 +77,13 @@ void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
 {
 int i = blockDim.x * blockIdx.x + threadIdx.x;
-if (i < size)
+if (i < size) {
+    DTYPE v = a[i];
+    if (p < 0 && v == 0)
+        b[i] = 1e20;
+    else
     b[i] = pow(a[i], p);
+}
 }
 /*
@@ -94,8 +99,13 @@ void KernelPower(__half * a, __half * b, __half p, int size)
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
 #else
 int i = blockDim.x * blockIdx.x + threadIdx.x;
-if (i < size)
+if (i < size) {
+    float v = __half2float(a[i]);
+    if (__half2float(p) < 0 && v == 0)
+        b[i] = __float2half(1e20);
+    else
     b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
+}
 #endif
 }
...