Optimize the XBackward implementation to support efficient propagation and gradient accumulation

18a08a65 · xuchen · 0e585782 · 18a08a65 · 18a08a65 · 18a08a65
Commit 18a08a65 authored Feb 18, 2020 by xuchen
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
@@ -40,28 +40,37 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    XTensor * input = income.tails[0];
    XTensor * output = node;

-    XNoder::MakeGrad(input);
+    if (!isEfficient || input->isGrad) {
+        XNoder::MakeGrad(input);

-    if(operID == FUNC_HARDTANH)
-        _HardTanHBackward(output, input, output->grad, input->grad);
-    else if(operID == FUNC_IDENTITY)
-        _IdentityBackward(output, input, output->grad, input->grad);
-    else if(operID == FUNC_LOGSOFTMAX){
-        int leadDim = income.GetParamInt(0);
-        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
-    }
-    else if(operID == FUNC_RECTIFY)
-        _RectifyBackward(output, input, output->grad, input->grad);
-    else if(operID == FUNC_SIGMOID)
-        _SigmoidBackward(output, input, output->grad, input->grad);
-    else if(operID == FUNC_SOFTMAX){
-        int leadDim = income.GetParamInt(0);
-        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
-    }
-    else{
-        ShowNTErrors("Wrong activation function type!");
+        XTensor * dedx = input->grad;
+        XTensor * dedy = output->grad;
+        XTensor * tmp = NewTensorBufV2(output, output->devID, output->mem);
+
+        if (operID == FUNC_HARDTANH)
+            _HardTanHBackward(output, input, dedy, tmp);
+        else if (operID == FUNC_IDENTITY)
+            _IdentityBackward(output, input, dedy, tmp);
+        else if (operID == FUNC_LOGSOFTMAX) {
+            int leadDim = income.GetParamInt(0);
+            CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
+            _LogSoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
+        }
+        else if (operID == FUNC_RECTIFY)
+            _RectifyBackward(output, input, dedy, tmp);
+        else if (operID == FUNC_SIGMOID)
+            _SigmoidBackward(output, input, dedy, tmp);
+        else if (operID == FUNC_SOFTMAX) {
+            int leadDim = income.GetParamInt(0);
+            CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
+            _SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
+        }
+        else {
+            ShowNTErrors("Wrong activation function type!");
+        }
+
+        _SumMe(dedx, tmp);
+        DelTensorBuf(tmp);
    }

    node->visitMark = NODE_FINISHED;

--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -48,33 +48,38 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
    XTensor * padding = NULL;
    int leadingDim;

-    XNoder::MakeGrad(output);
-    XTensor * dedy = output->grad;
-
-    if (income.tailNum == 1) {
-        if(dedy->dataType == X_FLOAT)
-            _SetDataFixedFloat(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixedDouble(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixedInt(dedy, 1);
-        else
-            ShowNTErrors("TODO");
-
-        return;
-    }
-
-    gold = income.tails[1];
-
-    if(operID == LOSS_CROSSENTROPY) {
-        if (income.tailNum == 3) 
-            padding = income.tails[2];
-        leadingDim = income.GetParamInt(0);
-        CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
-        _CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
-    }
-    else{
-        ShowNTErrors("Wrong activation function type!");
+    if (!isEfficient || output->isGrad) {
+        XNoder::MakeGrad(output);
+        XTensor * dedy = output->grad;
+
+        if (income.tailNum == 1) {
+            if (dedy->dataType == X_FLOAT)
+                _SetDataFixedFloat(dedy, 1.0F);
+            else if (dedy->dataType == X_DOUBLE)
+                _SetDataFixedDouble(dedy, 1.0);
+            else if (dedy->dataType == X_INT)
+                _SetDataFixedInt(dedy, 1);
+            else
+                ShowNTErrors("TODO");
+
+            return;
+        }
+
+        gold = income.tails[1];
+
+        XTensor* tmp = NewTensorBufV2(output, output->devID, output->mem);
+        if (operID == LOSS_CROSSENTROPY) {
+            if (income.tailNum == 3)
+                padding = income.tails[2];
+            leadingDim = income.GetParamInt(0);
+            CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
+            _CrossEntropyBackward(tmp, output, gold, weight, padding, leadingDim);
+            _SumMe(dedy, tmp);
+        }
+        else {
+            ShowNTErrors("Wrong activation function type!");
+        }
+        DelTensorBuf(tmp);
    }

    node->visitMark = NODE_FINISHED;
@@ -87,79 +92,4 @@ bool XLossGrad::IsLossOP(XTensor * node)
    return (income.typeID & LOSS_BASE) != 0;
 }

-/* 
-compute dE/dx for a given function y = f(x) 
->> gold - gold standard to measure error (or loss)
->> y - output of the function
->> x - input of the function
->> dedy - dE/dy
->> dedx - dE/dx
->> funcID - id of the function f
->> params - parameters of the function
->> lossName - name of the loss, e.g., cross entropy
-*/
-//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
-//                        XTensor * dedy, XTensor * dedx, XTensor * padding,
-//                        int funcID, void * params,
-//                        LOSS_FUNCTION_NAME lossName)
-//{
-//    CheckNTErrors(gold && y && x, "Empty input tensors!");
-//    CheckNTErrors(dedx, "Empty gradient tensors!");
-//    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
-//
-//    if(funcID == FUNC_HARDTANH){
-//        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
-//    }
-//    else if(funcID == FUNC_IDENTITY){
-//        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
-//    }
-//    else if(funcID == FUNC_LOGSOFTMAX){
-//        int leadDim = *(int*)params;
-//        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-//    }
-//    else if(funcID == FUNC_RECTIFY){
-//        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
-//    }
-//    else if(funcID == FUNC_SIGMOID){
-//        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
-//    }else if(funcID == FUNC_SOFTMAX){
-//        int leadDim = *(int*)params;
-//        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-//    }
-//    else{
-//        ShowNTErrors("wrong function found when call the backward process!");
-//    }
-//
-//}
-
-/* 
-compute dE/dy for variable y and error(loss) function E
->> gold - gold standard to measure error (or loss)
->> y - output of the function
->> dedy - dE/dy
->> lossName - name of the loss, e.g., cross entropy
-*/
-//void XLossGrad::Compute(XTensor * gold, XTensor * y, 
-//                        XTensor * dedy, XTensor * padding,
-//                        LOSS_FUNCTION_NAME lossName)
-//{
-//    if(gold == NULL){
-//        if(dedy->dataType == X_FLOAT)
-//            _SetDataFixedFloat(dedy, 1.0F);
-//        else if(dedy->dataType == X_DOUBLE)
-//            _SetDataFixedDouble(dedy, 1.0);
-//        else if(dedy->dataType == X_INT)
-//            _SetDataFixedInt(dedy, 1);
-//        else{
-//            ShowNTErrors("TODO");
-//        }
-//        return;
-//    }
-//
-//    //_LossBackward(dedy, gold, y, lossName);
-//    if(lossName == CROSSENTROPY)
-//        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
-//
-//}
-
 }
\ No newline at end of file
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -316,7 +316,6 @@ void XNet::ClearGrad(XTensor * node)
    }

    if(finished){
-        //fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
        delete node->grad;
        node->grad = NULL;
    }

--- a/source/tensor/core/arithmetic/MatrixMul.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul.cpp
@@ -62,7 +62,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    
    /* we transform a higher order tensor to a matrix to kill the number
       of calls of matrix multiplication */
-    if(transposedA == X_NOTRANS && a->order > 2 && b->order == 2){
+    if (transposedA == X_NOTRANS && a->order > 2 && b->order == 2) {
        int ncolA = a->dimSize[a->order - 1];
        int ncolC = c->dimSize[c->order - 1];
        XTensor * a2 = NewTensor2DV2(a->unitNum/ncolA, -ncolA, a->dataType, a->devID, a->mem);

--- a/source/tensor/core/math/Compare.cpp
+++ b/source/tensor/core/math/Compare.cpp
@@ -199,8 +199,8 @@ void funcName(const XTensor &a, const XTensor &b, XTensor c)                    
 }

 #ifdef USE_CUDA
-_SIMPLE_MAX_MIN_FUNCTION(_Max, _CudaMax, max)
-_SIMPLE_MAX_MIN_FUNCTION(_Min, _CudaMin, min)
+_SIMPLE_MAX_MIN_FUNCTION(_Max, _CudaMax, MAX)
+_SIMPLE_MAX_MIN_FUNCTION(_Min, _CudaMin, MIN)
 #else
 _SIMPLE_MAX_MIN_FUNCTION(_Max, max)
 _SIMPLE_MAX_MIN_FUNCTION(_Min, min)

--- a/source/tensor/core/shape/Split.h
+++ b/source/tensor/core/shape/Split.h
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */

 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+ */

 #ifndef __SPLIT_H__
 #define __SPLIT_H__

--- a/source/tensor/core/shape/Stack.cpp
+++ b/source/tensor/core/shape/Stack.cpp
@@ -85,7 +85,7 @@ XTensor Stack(const TensorList &smalls, int dim)
 {
    int count = smalls.count;
    CheckNTErrors(count > 0, "Empty list!");
-    CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
+    CheckNTErrors(dim >= 0, "Illegal dimension to Stack!");

    XTensor * tensor = smalls.GetItem(0);
    int order = tensor->order + 1;
@@ -95,7 +95,7 @@ XTensor Stack(const TensorList &smalls, int dim)
        if (i < dim)
            dimSize[i] = tensor->GetDim(i);
        else if (i > dim)
-            dimSize[i] = tensor->GetDim(i);
+            dimSize[i] = tensor->GetDim(i-1);
        else if (i == dim)
            dimSize[i] = count;
    }
@@ -149,7 +149,7 @@ void Stack(const TensorList &smalls, XTensor &t, int dim)
 {
    int count = smalls.count;
    CheckNTErrors(count > 0, "Empty list!");
-    CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
+    CheckNTErrors(dim >= 0, "Illegal dimension to Stack!");

    if (!t.isInit || !CheckStackShape(smalls, t, dim)) {
        XTensor * tensor = smalls.GetItem(0);