Commit 0e1074ff by xiaotong

improve coding

parent efe32603
@@ -419,7 +419,7 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
     XTensor * interGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
     _Negate(a, aTMP1);
-    _Power(b, bTMP, -2);
+    _Power(b, bTMP, -2.0F);
     _MultiplyDim(aTMP1, bTMP, aTMP2, n);
     _Multiply(node->grad, aTMP2, interGradTMP);
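Note: interGradTMP above holds the elementwise term of the gradient with respect to b. Assuming, as the GradSumDim comment further down suggests, that the forward operation is c = a / b with b matching dimension n of a, the quantity being built is (a sketch of the math, not a comment taken from the source):

    \frac{\partial E}{\partial b} = \sum_{\text{dims} \neq n} \frac{\partial E}{\partial c} \cdot \left(-\frac{a}{b^{2}}\right)

which corresponds to the _Negate, _Power(..., -2.0F), _MultiplyDim, _Multiply sequence, followed by the _ReduceSum calls in the next two hunks.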
@@ -433,17 +433,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
        size of b. Then we can reduce the matrix into a row vector. */
     interGradTMP->Reshape(2, reshapedSize);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGradTMP, bGradTMP, 0);
         _Sum(b->grad, bGradTMP, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(interGradTMP, b->grad, 0);
-    }
+    }*/
 }
 else{
     int reshapedSize[MAX_TENSOR_DIM_NUM];
@@ -465,17 +465,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
     XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
     _ReduceSum(interGradTMP, interGrad, 2);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGrad, bGradTMP2, 0);
         _Sum(b->grad, bGradTMP2, b->grad);
         DelTensorBuf(bGradTMP2);
-    }
+    /*}
     else{
         _ReduceSum(interGrad, b->grad, 0);
-    }
+    }*/
     DelTensorBuf(interGrad);
 }
@@ -765,17 +765,17 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
        size of b. Then we can reduce the matrix into a row vector. */
     bGradTMP->Reshape(2, reshapedSize);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(bGradTMP, bGradTMP2, 0);
         _Sum(b->grad, bGradTMP2, b->grad);
         DelTensorBuf(bGradTMP2);
-    }
+    /*}
     else{
         _ReduceSum(bGradTMP, b->grad, 0);
-    }
+    }*/
 }
 else{
     int reshapedSize[MAX_TENSOR_DIM_NUM];
@@ -797,17 +797,17 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
     XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
     _ReduceSum(bGradTMP, interGrad, 2);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGrad, bGradTMP2, 0);
         _Sum(b->grad, bGradTMP2, b->grad);
         DelTensorBuf(bGradTMP2);
-    }
+    /*}
     else{
         _ReduceSum(interGrad, b->grad, 0);
-    }
+    }*/
     DelTensorBuf(interGrad);
 }
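Note: the same accumulation pattern is applied in GradMultiplyDim. Assuming the forward operation is c = a * b with b broadcast along dimension n (the construction of bGradTMP is not shown in this hunk), the tensor reduced here would correspond to

    \frac{\partial E}{\partial b} = \sum_{\text{dims} \neq n} \frac{\partial E}{\partial c} \cdot a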
@@ -1059,20 +1059,20 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
        size of b. Then we can reduce the matrix into a row vector. */
     node->grad->Reshape(2, reshapedSize);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(node->grad, bGradTMP, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(bGradTMP, beta);
         _Sub(b->grad, bGradTMP, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(node->grad, b->grad, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(b->grad, beta);
         _ScaleAndShiftMe(b->grad, -1.0F);
-    }
+    }*/
     node->grad->Reshape(order, dimSize);
 }
@@ -1097,20 +1097,20 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
     _ReduceSum(node->grad, interGrad, 2);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGrad, bGradTMP, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(bGradTMP, beta);
         _Sub(b->grad, bGradTMP, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(interGrad, b->grad, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(b->grad, beta);
         _ScaleAndShiftMe(b->grad, -1.0F);
-    }
+    }*/
     node->grad->Reshape(order, dimSize);
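Note: GradSubDim presumably mirrors GradSumDim below with c = a - b * \beta and b matching dimension n of a, so the reduced gradient applied to b->grad is

    \frac{\partial E}{\partial b} = -\beta \sum_{\text{dims} \neq n} \frac{\partial E}{\partial c}

which is why the temporary is scaled by beta and then applied with _Sub rather than _Sum.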
@@ -1160,7 +1160,7 @@ gradient for sum with one dimension
 c = a + b * \beta
 where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
 dE/da = dE/dc
-dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
+dE/db = dE/dc * a.reduce(0,...,n-1,n+1,...) * \beta
 >> node - the node (c) for backward computation
 >> isEfficient - indicates whether the computation is in
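Note: written element-wise, the corrected comment line says that dE/dc is summed over every axis except n and scaled by beta:

    \left(\frac{\partial E}{\partial b}\right)_{j} = \beta \sum_{i\,:\,i_{n}=j} \left(\frac{\partial E}{\partial c}\right)_{i}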
@@ -1193,19 +1193,19 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
        size of b. Then we can reduce the matrix into a row vector. */
     node->grad->Reshape(2, reshapedSize);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(node->grad, bGradTMP, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(bGradTMP, beta);
         _Sum(bGradTMP, b->grad, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(node->grad, b->grad, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(b->grad, beta);
-    }
+    }*/
     node->grad->Reshape(order, dimSize);
 }
@@ -1230,19 +1230,19 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
     _ReduceSum(node->grad, interGrad, 2);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGrad, bGradTMP, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(bGradTMP, beta);
         _Sum(bGradTMP, b->grad, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(interGrad, b->grad, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(b->grad, beta);
-    }
+    }*/
     node->grad->Reshape(order, dimSize);
...
@@ -111,6 +111,7 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamFloat(argc, argv, "labelsmoothing", &labelSmoothingP, 0);
     LoadParamInt(argc, argv, "nstepcheckpoint", &nStepCheckpoint, -1);
     LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
+    LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
     buf = new int[bufSize];
     buf2 = new int[bufSize];
@@ -144,6 +145,8 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     int nStepCheck = 0;
     int nCheckpoint = 0;
     int nSkipped = 0;
+    int gradStep = 0;
+    int validStep = 0;
     char * trainFN = new char[(int)strlen(fn) + 10];
     strcpy(trainFN, fn);
@@ -211,17 +214,26 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
     if (doUpdate) {
+        /* back-propagation */
         net.Backward(output, g, CROSSENTROPY);
-        /* learning rate */
-        lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)step + 1, -0.5F - lrbias), ((float)step + 1) * pow((float)nwarmup, -1.5F - lrbias));
-        /* update the parameters */
-        Update(model, lr);
+        gradStep += 1;
         loss += -prob;
         wordCount += wc;
         wordCountTotal += wc;
+        /* update the parameters */
+        if(gradStep == updateStep){
+            /* learning rate */
+            lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)validStep + 1, -0.5F - lrbias), ((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
+            /* model update */
+            Update(model, lr);
+            gradStep = 0;
+            validStep++;
+        }
     }
     else
         nSkipped++;
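Note: the learning-rate expression is the usual Transformer warm-up schedule, now indexed by the number of parameter updates (validStep) rather than the number of processed batches:

    lr = lrate \cdot d^{-1/2} \cdot \min\left((t+1)^{-0.5-\mathrm{lrbias}},\ (t+1)\cdot \mathrm{nwarmup}^{-1.5-\mathrm{lrbias}}\right), \quad t = \mathrm{validStep}

With updatestep set to k, Backward runs on every batch but Update is called only once per k batches, so gradients are presumably accumulated across those k batches and only the update count drives the schedule.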
@@ -827,13 +839,14 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
     XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
     _CopyValues(padding, padding2);
+    _MultiplyDim(output, padding2, output, 0);
     _ScaleAndShiftMe(padding2, 1e9F, -1e9F);
     _SumDim(output, padding2, output, 0);
     output->Reshape(on, dimso);
     if(gold != NULL){
-        gold->Reshape(gold->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
+        gold->Reshape(gold->unitNum/dimso[gold->order - 1], dimso[gold->order - 1]);
         _CopyValues(padding, padding2);
         _MultiplyDim(gold, padding2, gold, 0);
         gold->Reshape(on, dimso);
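Note: assuming padding holds 0/1 entries p, the three calls on output implement

    \mathrm{output} \leftarrow \mathrm{output}\cdot p + (10^{9}p - 10^{9})

so positions with p = 1 keep their value (the added term is zero) while padded positions are pushed to -1e9; the newly added _MultiplyDim also clears the original scores at padded positions before the shift.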
@@ -847,11 +860,10 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
 perform label smoothing
 >> gold - gold standard
 >> smoothed - result of label smoothing
->> lsP - smoothing factor
+>> p - smoothing factor
 */
-void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE lsP)
+void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
 {
-    DTYPE p = lsP;
     CheckNTErrors(p >= 0 && p <= 1.0F, "Smoothing factor must be in range [0,1]");
     int n = gold->GetDim(-1);
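Note: the rest of the function is not shown here; under the standard label-smoothing formulation that the parameters p and n suggest, the smoothed target would be

    \mathrm{smoothed} = (1 - p)\cdot \mathrm{gold} + \frac{p}{n}

with n the size of the last dimension of gold.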
...
@@ -124,6 +124,9 @@ public:
     /* indicates whether we make a checkpoint after each traing epoch */
     bool useEpochCheckpoint;
+    /* number of batches on which we do model update */
+    int updateStep;
 public:
     /* constructor */
@@ -174,7 +177,7 @@ public:
     void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
     /* perform label smoothing */
-    void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE lsP);
+    void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
 };
...
@@ -34,7 +34,7 @@ int TransformerMain(int argc, const char ** argv)
     if(argc == 0)
         return 1;
-    fprintf(stderr, "%e\n", exp(DTYPE_MIN));
+    fprintf(stderr, "%e\n", exp(-1e9F));
     char ** args = new char*[argc];
     for(int i = 0; i < argc; i++){
...