Commit 90dc67f2 by xiaotong

fix bugs in back propagation and transformer

parent 50c3670f
@@ -468,21 +468,19 @@ void XMathGrad::GradPower(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XTensor * c = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE p = income.GetParam(0);
XNoder::MakeGrad(a);
- _Power(a, b, (p-1)/p);
- _ScaleAndShift(b, c, p);
- _Multiply(node->grad, c, a->grad, 1.0F);
+ _Power(a, b, p - 1.0F);
+ _ScaleAndShiftMe(b, p);
+ _Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensor(b);
- delete b;
- delete c;
+ node->visitMark = NODE_FINISHED;
}
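Note on the hunk above: for y = x^p the chain rule gives dL/dx = dL/dy * p * x^(p-1); the old code raised a to the power (p-1)/p, which is not that derivative, and the rewrite also reuses a single buffer tensor via the in-place _ScaleAndShiftMe. A minimal standalone sketch of the corrected rule (hypothetical helper in plain C++, not the NiuTensor API):

    #include <cmath>

    // backward of y = x^p: accumulate dL/dx += dL/dy * p * x^(p-1),
    // mirroring _Power(a, b, p - 1.0F); _ScaleAndShiftMe(b, p); _Multiply(node->grad, b, a->grad, 1.0F);
    void PowerBackward(const float * x, const float * dy, float * dx, int n, float p)
    {
        for (int i = 0; i < n; i++)
            dx[i] += dy[i] * p * std::pow(x[i], p - 1.0f);
    }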
/*
@@ -499,16 +497,16 @@ void XMathGrad::GradNegate(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, -1.0F);
_Sum(a->grad, b, a->grad);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -525,18 +523,14 @@ void XMathGrad::GradScaleAndShift(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
- _ScaleAndShift(node->grad, b, scale);
- _Sum(a->grad, b, a->grad);
+ _Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
- delete b;
}
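Because y = scale * x + shift has dy/dx = scale (the shift contributes nothing), the gradient can be folded into a single accumulation; the four-argument _Sum above appears to compute a->grad = a->grad + node->grad * scale, removing the temporary tensor. The equivalent rule in isolation (hypothetical helper):

    // backward of y = scale * x + shift: dL/dx += dL/dy * scale
    void ScaleShiftBackward(const float * dy, float * dx, int n, float scale)
    {
        for (int i = 0; i < n; i++)
            dx[i] += dy[i] * scale;
    }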
/*
@@ -582,9 +576,7 @@ void XMathGrad::GradDiv(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
- XTensor * c = NewTensor(b);
- XTensor * d = NewTensor(b);
- XTensor * e = NewTensor(b);
+ XTensor * ab2 = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
@@ -592,16 +584,15 @@ void XMathGrad::GradDiv(XTensor * node)
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Div(node->grad, b, a->grad, 1.0F);
- _Power(b, c, -2.0F);
- _Multiply(a, c, d);
- _ScaleAndShift(d, e, -1.0F);
- _Multiply(node->grad, e, b->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ _Power(b, ab2, -2.0F);
+ _Multiply(a, ab2, ab2);
+ _ScaleAndShiftMe(ab2, -1.0F);
+ _Multiply(node->grad, ab2, b->grad, 1.0F);
- delete c;
- delete d;
- delete e;
+ DelTensorBuf(ab2);
+ node->visitMark = NODE_FINISHED;
}
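For c = a / b (element-wise) the derivatives are dc/da = 1/b and dc/db = -a / b^2, which is exactly what the rewritten block computes with one buffer: ab2 holds b^-2, then a * b^-2, then its negation. The same rule as a plain C++ sketch (hypothetical helper, equal-shaped inputs assumed):

    // backward of c = a / b (element-wise):
    //   dL/da += dL/dc / b
    //   dL/db += dL/dc * (-a / b^2)
    void DivBackward(const float * a, const float * b, const float * dc,
                     float * da, float * db, int n)
    {
        for (int i = 0; i < n; i++) {
            da[i] += dc[i] / b[i];
            db[i] += dc[i] * (-a[i] / (b[i] * b[i]));
        }
    }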
/*
@@ -618,16 +609,16 @@ void XMathGrad::GradExp(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Exp(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -644,16 +635,16 @@ void XMathGrad::GradSin(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -670,19 +661,17 @@ void XMathGrad::GradCos(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XTensor * c = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sin(a, b);
- _ScaleAndShift(b, c, -1.0F);
- _Multiply(node->grad, c, a->grad, 1.0F);
+ _ScaleAndShiftMe(b, -1.0F);
+ _Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
- delete c;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -699,19 +688,17 @@ void XMathGrad::GradTan(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XTensor * c = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
- _Power(b, c, -2.0F);
- _Multiply(node->grad, c, a->grad, 1.0F);
+ _PowerMe(b, -2.0F);
+ _Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
- delete c;
+ node->visitMark = NODE_FINISHED;
}
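The three trigonometric rules above follow from d(sin x)/dx = cos x, d(cos x)/dx = -sin x, and d(tan x)/dx = 1/cos^2 x; the in-place _ScaleAndShiftMe/_PowerMe calls let each rule get by with one buffer. Standalone sketch (hypothetical helpers):

    #include <cmath>

    // y = sin(x):  dL/dx += dL/dy * cos(x)
    void SinBackward(const float * x, const float * dy, float * dx, int n)
    {
        for (int i = 0; i < n; i++)
            dx[i] += dy[i] * std::cos(x[i]);
    }

    // y = cos(x):  dL/dx += dL/dy * (-sin(x))
    void CosBackward(const float * x, const float * dy, float * dx, int n)
    {
        for (int i = 0; i < n; i++)
            dx[i] += dy[i] * (-std::sin(x[i]));
    }

    // y = tan(x):  dL/dx += dL/dy / cos(x)^2
    void TanBackward(const float * x, const float * dy, float * dx, int n)
    {
        for (int i = 0; i < n; i++) {
            float c = std::cos(x[i]);
            dx[i] += dy[i] / (c * c);
        }
    }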
/*
@@ -817,16 +804,16 @@ void XMathGrad::GradAbsolute(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sign(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -842,17 +829,9 @@ void XMathGrad::GradSign(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
- XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XNoder::MakeGrad(a);
- b->SetZeroAll();
- _Sum(a->grad, b, a->grad);
+ // we do nothing here
node->visitMark = NODE_FINISHED;
- delete b;
}
/*
@@ -868,17 +847,9 @@ void XMathGrad::GradRound(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
- XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XNoder::MakeGrad(a);
- b->SetZeroAll();
- _Sum(a->grad, b, a->grad);
+ // we do nothing here
node->visitMark = NODE_FINISHED;
- delete b;
}
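SIGN and ROUND are piecewise constant, so their derivative is zero almost everywhere; the old code built a zero tensor and added it to a->grad, a pure no-op, and the rewrite drops even that and simply marks the node finished without touching the input gradient.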
/*
@@ -894,7 +865,7 @@ void XMathGrad::GradClip(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1);
@@ -904,9 +875,9 @@ void XMathGrad::GradClip(XTensor * node)
_ClipBackward(node, a, node->grad, a->grad, lower, upper);
_Sum(a->grad, b, a->grad);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
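Clip only passes gradient where the input lies inside [lower, upper]; outside that range the output is constant, so the derivative is zero. That is the standard clip backward rule (the sketch below is a hypothetical standalone helper and does not mirror the exact _ClipBackward signature):

    // backward of y = clip(x, lower, upper):
    //   dL/dx += dL/dy where lower <= x <= upper, otherwise += 0
    void ClipBackward(const float * x, const float * dy, float * dx, int n,
                      float lower, float upper)
    {
        for (int i = 0; i < n; i++)
            if (x[i] >= lower && x[i] <= upper)
                dx[i] += dy[i];
    }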
/*
@@ -923,21 +894,20 @@ void XMathGrad::GradReduceMean(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
- XTensor * c = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
- _ScaleAndShift(b, c, 1.0F/n);
- _Sum(a->grad, c, a->grad);
+ _ScaleAndShiftMe(b, 1.0F/n);
+ _Sum(a->grad, b, a->grad);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(b);
- delete b;
- delete c;
+ node->visitMark = NODE_FINISHED;
}
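ReduceMean over a dimension of size n averages n entries, so its backward broadcasts the output gradient back along that dimension (the _Unsqueeze call) and scales it by 1/n; scaling in place removes the second temporary. Rough sketch for a 2-D tensor reduced over its last dimension (hypothetical helper):

    // backward of y[i] = mean_j x[i][j] over rows of length n:
    //   dL/dx[i][j] += dL/dy[i] / n
    void ReduceMeanBackward(const float * dy, float * dx, int rows, int n)
    {
        for (int i = 0; i < rows; i++)
            for (int j = 0; j < n; j++)
                dx[i * n + j] += dy[i] / n;
    }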
/*
@@ -954,18 +924,19 @@ void XMathGrad::GradReduceSum(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
- XTensor * b = NewTensor(a);
+ XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
- node->visitMark = NODE_FINISHED;
+ DelTensor(b);
- delete b;
+ node->visitMark = NODE_FINISHED;
}
/*
@@ -984,9 +955,9 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
- XTensor * c = NewTensor(a);
- XTensor * d = NewTensor(b);
- XTensor * e = NewTensor(c);
+ XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+ XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+ XTensor * e = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
@@ -999,11 +970,11 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
_Multiply(e, c, a->grad, 1.0F);
_Multiply(node->grad, d, b->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(c);
+ DelTensorBuf(d);
+ DelTensorBuf(e);
- delete c;
- delete d;
- delete e;
+ node->visitMark = NODE_FINISHED;
}
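For reference, ReduceSumSquared computes y = sum over the reduced dimension of (a - b)^2 with b the shifting tensor, so the chain rule gives dL/da = dL/dy (broadcast) * 2(a - b) and dL/db = dL/dy * (-2) * sum(a - b); the elided middle of the function presumably builds those factors in c and d, with e holding the broadcast of node->grad that the first visible _Multiply applies to a->grad. GradReduceVariance below is analogous up to a 1/n factor.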
/*
@@ -1022,9 +993,9 @@ void XMathGrad::GradReduceVariance(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
- XTensor * c = NewTensor(a);
- XTensor * d = NewTensor(b);
- XTensor * e = NewTensor(a);
+ XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+ XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+ XTensor * e = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
@@ -1037,11 +1008,11 @@ void XMathGrad::GradReduceVariance(XTensor * node)
_Multiply(e, c, a->grad, 1.0F);
_Multiply(node->grad, d, b->grad, 1.0F);
- node->visitMark = NODE_FINISHED;
+ DelTensorBuf(c);
+ DelTensorBuf(d);
+ DelTensorBuf(e);
- delete c;
- delete d;
- delete e;
+ node->visitMark = NODE_FINISHED;
}
}
@@ -66,8 +66,9 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
- float finfoutk = sqrt(6/(d + dk));
- float finfoutv = sqrt(6/(d + dv));
+ float scale = 1.0F;
+ float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
+ float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
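The changed lines use the Xavier (Glorot) uniform range: weights drawn from U(-limit, limit) with limit = sqrt(6 * scale / (fan_in + fan_out)). With d and dk declared as int, the old expression 6/(d + dk) was integer division and truncated to 0 for any reasonably sized layer, so the 6.0F literal is the substantive fix; the same change is applied to T2TFNN and T2TOutput below. A small sketch of the rule (assumed helper, not the T2T code):

    #include <cmath>
    #include <cstdlib>

    // Xavier/Glorot uniform init: U(-limit, limit), limit = sqrt(6 * gain / (fanIn + fanOut)).
    // Note the float literal 6.0f, which avoids integer division when fanIn/fanOut are ints.
    void XavierUniform(float * w, int fanIn, int fanOut, float gain)
    {
        float limit = std::sqrt(6.0f * gain / (fanIn + fanOut));
        for (int i = 0; i < fanIn * fanOut; i++)
            w[i] = -limit + 2.0f * limit * (std::rand() / (float)RAND_MAX);
    }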
@@ -107,7 +108,7 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
- scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/sqrt((float)dk)), -1);
+ scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/(float)sqrt((float)dk)), -1);
att = BMMul(scalar, vheads);
/* concatenate the heads */
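The expression above is the scaled dot-product attention of the Transformer: Attention(Q, K, V) = softmax(Q K^T / sqrt(dk)) V; the edit only makes the 1/sqrt(dk) factor a float computation. For orientation, a single-head sketch with raw arrays (hypothetical function, not the XTensor API):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Q: [n, dk], K: [m, dk], V: [m, dv], out: [n, dv]
    // out = softmax(Q * K^T / sqrt(dk)) * V, softmax taken over the m dimension
    void ScaledDotAttention(const float * Q, const float * K, const float * V,
                            float * out, int n, int m, int dk, int dv)
    {
        std::vector<float> s(m);
        for (int i = 0; i < n; i++) {
            float maxv = -1e30f, sum = 0.0f;
            for (int j = 0; j < m; j++) {                       // scores for row i
                float dot = 0.0f;
                for (int k = 0; k < dk; k++)
                    dot += Q[i * dk + k] * K[j * dk + k];
                s[j] = dot / std::sqrt((float)dk);
                maxv = std::max(maxv, s[j]);
            }
            for (int j = 0; j < m; j++) {                       // numerically stable softmax
                s[j] = std::exp(s[j] - maxv);
                sum += s[j];
            }
            for (int v = 0; v < dv; v++) {                      // weighted sum of V rows
                float acc = 0.0f;
                for (int j = 0; j < m; j++)
                    acc += (s[j] / sum) * V[j * dv + v];
                out[i * dv + v] = acc;
            }
        }
    }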
@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
- w.SetDataRandn(0, 1/sqrt((float)eSize));
+ w.SetDataRandn(0, 1/(float)sqrt((float)eSize));
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
@@ -84,11 +84,11 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
for(int k = 0; k < eSize; k++){
if(k % 2 == 0){
int i = k/2;
- dp[k] = sin(pos/pow(10000.0F, 2.0F*i/d));
+ dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
}
else{
int i = (k - 1)/2;
- dp[k] = cos(pos/pow(10000.0F, 2.0F*i/d));
+ dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
}
}
}
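These are the sinusoidal positional embeddings PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)); the added casts only silence double-to-float conversion warnings, since sin/cos/pow return double. One position's vector, mirroring the loop above (sketch, assumed row layout):

    #include <cmath>

    // fill the embedding vector dp (length eSize) for a single position pos;
    // even indices take the sine term, odd indices the cosine term
    void FillPosEmbedding(float * dp, int eSize, int d, int pos)
    {
        for (int k = 0; k < eSize; k++) {
            int i = (k % 2 == 0) ? k / 2 : (k - 1) / 2;
            float freq = (float)std::pow(10000.0F, 2.0F * i / d);
            dp[k] = (k % 2 == 0) ? (float)std::sin(pos / freq) : (float)std::cos(pos / freq);
        }
    }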
@@ -67,8 +67,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
- float finfout1 = sqrt(6/(inSize + hSize));
- float finfout2 = sqrt(6/(hSize + outSize));
+ float scale = 1.0F;
+ float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
+ float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
w1.SetDataRand(-finfout1, finfout1);
b1.SetZeroAll();
@@ -63,7 +63,8 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
- float finfout = sqrt(6/(hSize + vSize));
+ float scale = 1.0F;
+ float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
}
@@ -112,8 +112,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
net.Backward(output, batch, CROSSENTROPY);
/* learning rate */
- lr = (1/sqrt((float)d)) * MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
- lr = 0.000005F;
+ lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
+ lr = 0.000002F;
/* update the parameters */
Update(model, lr);
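The first assignment is the Transformer warmup ("Noam") schedule lr = d^-0.5 * min((step+1)^-0.5, (step+1) * nwarmup^-1.5); the constant assignment on the next line then overrides it, so training effectively runs at a fixed rate, lowered here from 5e-6 to 2e-6. The schedule on its own (hedged sketch, not the trainer code):

    #include <algorithm>
    #include <cmath>

    // lr(step) = d^-0.5 * min((step+1)^-0.5, (step+1) * nwarmup^-1.5)
    float NoamLR(int d, int step, int nwarmup)
    {
        double s = step + 1;
        return (float)((1.0 / std::sqrt((double)d)) *
                       std::min(std::pow(s, -0.5), s * std::pow((double)nwarmup, -1.5)));
    }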
@@ -132,7 +132,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
- XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
+ XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
}
}
@@ -142,7 +142,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
double elapsed = GetClockSec() - startT;
- XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
+ XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
elapsed, step, epoch);
@@ -71,7 +71,7 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
}
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
- DTYPE a = sqrt(3.0) * std;
+ DTYPE a = (DTYPE)sqrt(3.0) * std;
_SetDataRand(tensor, -a, a);
}
@@ -103,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{
for(int i = 0; i < n; i++){
DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j];
- if(IsNAN(r))
- r = DTYPE_MIN;
- if(IsINF(r))
- r = DTYPE_MIN;
+ if (r > (DTYPE)1.0F)
+ r = (DTYPE)1.0F;
+ else if (r < 0)
+ r = 0;
op[i * m + j] = r;
}
}
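Here (and in the two CUDA kernels that follow) the NaN/INF checks are replaced by clamping each probability to [0, 1]; exp(x - max)/sum is already in that range mathematically, so the clamp only absorbs floating-point artifacts of the division. The same idea in isolation:

    // clamp a softmax output to the valid probability range [0, 1]
    inline float ClampProb(float r)
    {
        if (r > 1.0f)
            return 1.0f;
        if (r < 0.0f)
            return 0.0f;
        return r;
    }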
@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
- y[offset] = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+ DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+ if (r >(DTYPE)1.0F)
+ r = (DTYPE)1.0F;
+ else if (r < 0)
+ r = 0;
+ y[offset] = r;
}
}
@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE *
maxData = broadcast(maxData);
if (i < strideNum){
int offset = int(j / stride) * blockSize + i * stride + i2;
- output[offset] = exp(input[offset] - maxData) / sumData;
+ DTYPE r = exp(input[offset] - maxData) / sumData;
+ if (r > (DTYPE)1.0F)
+ r = (DTYPE)1.0F;
+ else if (r < 0)
+ r = 0;
+ output[offset] = r;
}
}
}