Commit b409f07f by xuchen

fix a bug in the softmax function. It executed the backward computation incorrectly (in the case of CPU computation and NOLOSS).
I found this bug by chance; I spent one night confirming that something must be wrong, and another night locating the bug.
Finally, I fixed it in five minutes.
parent ceb5b101
@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
 
 /* gather selected sub-tensors (return a XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);
 
 } // namespace nts(NiuTrans.Tensor)
...
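For context, `_Gather` copies the sub-tensors of `s` selected by `srcIndex` along dimension `dim` into `t`. A minimal standalone sketch of that semantics on plain arrays (gathering rows of a 2-D matrix; an illustration, not the library's implementation):

#include <cstdio>

/* illustrative gather along dim 0: t[i][*] = s[srcIndex[i]][*] */
void gatherRows(const float * s, float * t, int cols,
                const int * srcIndex, int indexSize)
{
    for(int i = 0; i < indexSize; i++)
        for(int j = 0; j < cols; j++)
            t[i * cols + j] = s[srcIndex[i] * cols + j];
}

int main()
{
    float s[8] = {0, 1, 10, 11, 20, 21, 30, 31};       /* 4 x 2 source */
    int srcIndex[2] = {2, 0};
    float t[4];                                        /* 2 x 2 target */
    gatherRows(s, t, 2, srcIndex, 2);
    printf("%g %g\n%g %g\n", t[0], t[1], t[2], t[3]);  /* 20 21 / 0 1 */
    return 0;
}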
@@ -62,7 +62,6 @@ void _Spread(XTensor * source, XTensor * collection, int dim,
                  int * srcIndex, int indexSize, int * collIndex)
 {
     int order = source->order;
-    int size = source->GetDim(dim);
 
     CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
     CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
@@ -150,7 +149,6 @@ void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
                  int * srcIndex, int indexSize, int * collIndex)
 {
     int order = source->order;
-    int size = source->GetDim(dim);
 
     CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
     CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
...
@@ -61,42 +61,39 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
     CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
     CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
 
-    XTensor * logInter = NewTensorBuf(output, output->devID, output->mem);
-    XTensor * mulInter = NewTensorBuf(output, output->devID, output->mem);
-    XTensor * negInter = NewTensorBuf(output, output->devID, output->mem);
     XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
     XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
-    XTensor * negBuf = NewTensorBuf(output, output->devID, output->mem);
 
     /* l = log(output) */
     _Log(output, logBuf);
 
     if(weight != NULL){
         XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
 
-        /* multiply gold and weight by broadcast wg = mulDim(g * w) */
+        /* multiply gold with weight by broadcast wg = mulDim(g * w) */
         _MultiplyDim(gold, weight, weightBuf, n, 0);
 
-        /* multiply weighted gold and log(output) wgl = mul(wg, l) */
+        /* multiply weighted gold with log(output) wgl = mul(wg, l) */
         _Multiply(weightBuf, logBuf, mulBuf, 0);
 
         DelTensorBuf(weightBuf);
     }
     else{
-        /* multiply gold and log(output) gl = mul(g, l) */
+        /* multiply gold with log(output) gl = mul(g, l) */
         _Multiply(gold, logBuf, mulBuf, 0);
     }
 
-    /* negate multiply result n = negate(mul) */
+    /* negate result n = negate(mul) */
     _NegateMe(mulBuf);
     _ReduceSum(mulBuf, loss, n);
 
-    DelTensorBuf(negInter);
-    DelTensorBuf(mulInter);
-    DelTensorBuf(logInter);
+    DelTensorBuf(mulBuf);
+    DelTensorBuf(logBuf);
 }
 /*
-compute the cross entropy loss (implementation manually)
+compute the cross entropy loss (faster implementation with optimized code)
 loss = sum_{i} (-gold_i * log(output_i))
 where gold and output are distributions
@@ -108,13 +105,13 @@ where gold and output are distributions
 >> padding - specify a target value that is ignored and does not contribute to the loss computation
 >> leadingDim - the leading dimension for the output
 */
-void _CrossEntropyManual(const XTensor * output, const XTensor * gold,
+void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                         XTensor * loss, const XTensor * weight,
                         const XTensor * padding, int leadingDim)
 {
 #ifdef USE_CUDA
     if(output->devID >= 0) {
-        _CudaCrossEntropyManual(output, gold, loss, weight, padding, leadingDim);
+        _CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
         return;
     }
 #endif
@@ -263,21 +260,22 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
     XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
     XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
-    XTensor * negBuf = NewTensorBuf(output, output->devID, output->mem);
 
     /* l = log(output) */
     _Log(output, logBuf);
 
     if(weight != NULL){
         XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
 
-        /* multiply gold and weight by broadcast wg = mulDim(g * w) */
+        /* multiply gold with weight by broadcast wg = mulDim(g * w) */
         _MultiplyDim(gold, weight, weightBuf, n, 0);
 
-        /* multiply weighted gold and log(output) wgl = mul(wg, l) */
+        /* multiply weighted gold with log(output) wgl = mul(wg, l) */
         _Multiply(weightBuf, logBuf, mulBuf, 0);
 
         DelTensorBuf(weightBuf);
     }
     else{
-        /* multiply gold and log(output) gl = mul(g, l) */
+        /* multiply gold with log(output) gl = mul(g, l) */
         _Multiply(gold, logBuf, mulBuf, 0);
     }
@@ -291,7 +289,6 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
 
     /* reduce sum all classes */
     _ReduceSum(mulBuf, lossInter, n);
 
-    DelTensorBuf(negBuf);
     DelTensorBuf(mulBuf);
     DelTensorBuf(logBuf);
@@ -334,7 +331,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
 }
 /*
-compute the cross entropy loss (implementation manually)
+compute the cross entropy loss (faster implementation with optimized code)
 loss = sum_{i} (-gold_i * log(output_i))
 where gold and output are distributions
@@ -347,13 +344,13 @@ where gold and output are distributions
 >> leadingDim - the leading dimension for the output
 << return - the cross entropy loss that is a scalar
 */
-DTYPE _CrossEntropyManual(const XTensor * output, const XTensor * gold,
+DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                          LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
                          const XTensor * padding, int leadingDim)
 {
 #ifdef USE_CUDA
     if(output->devID >= 0) {
-        return _CudaCrossEntropyManual(output, gold, reduceWay, weight, padding, leadingDim);
+        return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
     }
 #endif
@@ -459,7 +456,7 @@ DTYPE _CrossEntropyManual(const XTensor * output, const XTensor * gold,
 }
 /*
-backward compuation for cross entropy function (tensor version)
+backward computation for cross entropy function
 loss = sum_{i} (-t_i * log(y_i))
 dE/dy_i = -t_i / y_i
@@ -566,7 +563,7 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
     if(padding != NULL) {
         XTensor * tmp(padding);
         _IsZero(padding, tmp);
-        int nonZeroNum = _ReduceSumAll(tmp);
+        int nonZeroNum = (int)_ReduceSumAll(tmp);
         _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
         delete tmp;
     }
...
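The doc comments above define the loss as loss = sum_{i} (-gold_i * log(output_i)), optionally weighting each class before the reduction. A self-contained sketch of the same log -> multiply -> negate -> reduce pipeline on plain float arrays (shapes and names here are assumptions for illustration; the library operates on XTensor buffers as shown in the diff):

#include <cmath>
#include <cstdio>

/* illustrative cross entropy over [rows x classes], reduced over classes;
   weight (may be NULL) is broadcast over rows, as _MultiplyDim does */
void crossEntropy(const float * output, const float * gold,
                  const float * weight, float * loss, int rows, int classes)
{
    for(int i = 0; i < rows; i++){
        float sum = 0;
        for(int c = 0; c < classes; c++){
            float w = (weight != NULL) ? weight[c] : 1.0F;
            sum += -w * gold[i * classes + c] * logf(output[i * classes + c]);
        }
        loss[i] = sum;
    }
}

int main()
{
    float output[4] = {0.7F, 0.3F, 0.2F, 0.8F};   /* two 2-class rows */
    float gold[4]   = {1, 0, 0, 1};
    float loss[2];
    crossEntropy(output, gold, NULL, loss, 2, 2);
    printf("%f %f\n", loss[0], loss[1]);          /* -log(0.7), -log(0.8) */
    return 0;
}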
@@ -111,7 +111,7 @@ where gold and output are distributions
 >> padding - specify a target value that is ignored and does not contribute to the loss computation
 >> leadingDim - the leading dimension for the output
 */
-void _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
+void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
                            XTensor * loss, const XTensor * weight,
                            const XTensor * padding, int leadingDim)
 {
@@ -201,9 +201,9 @@ where gold and output are distributions
 >> leadingDim - the leading dimension for the output
 << return - the cross entropy loss that is a scalar
 */
-DTYPE _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
+DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
                             LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
                             const XTensor * padding, int leadingDim)
 {
     DTYPE loss = 0;
@@ -232,7 +232,7 @@ DTYPE _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
     XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
 
-    _CudaCrossEntropyManual(output, gold, lossInter, weight, padding, leadingDim);
+    _CudaCrossEntropyFast(output, gold, lossInter, weight, padding, leadingDim);
 
     loss = _ReduceSumAll(lossInter);
@@ -400,7 +400,7 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
     if(padding != NULL) {
         XTensor * tmp(padding);
         _IsZero(padding, tmp);
-        int nonZeroNum = _ReduceSumAll(tmp);
+        int nonZeroNum = (int)_ReduceSumAll(tmp);
         _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
         delete tmp;
     }
...
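One detail in both backward hunks: `_ReduceSumAll` returns a `DTYPE` (floating point), so assigning it straight to `int nonZeroNum` relied on an implicit narrowing conversion; the added `(int)` cast makes it explicit. The count is then used to average `dedy` over the positions that actually contribute to the loss. A minimal sketch of that normalization step, assuming a 0/1 mask has already been built (the exact mask semantics of `_IsZero` are not visible in this diff):

#include <cstdio>

/* illustrative: scale gradients by 1/count, count taken from a float mask */
void scaleByMaskCount(float * dedy, int n, const float * mask, int maskLen)
{
    float sum = 0;
    for(int i = 0; i < maskLen; i++)
        sum += mask[i];
    int count = (int)sum;                 /* explicit cast, as in the fix */
    for(int i = 0; i < n; i++)
        dedy[i] *= 1.0F / (float)count;
}

int main()
{
    float dedy[4] = {3, 6, 9, 12};
    float mask[4] = {1, 1, 0, 1};         /* three contributing positions */
    scaleByMaskCount(dedy, 4, mask, 4);
    printf("%g %g %g %g\n", dedy[0], dedy[1], dedy[2], dedy[3]); /* 1 2 3 4 */
    return 0;
}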
@@ -27,13 +27,13 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 
-/* compute the cross entropy loss (tensor version) */
-void _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
+/* compute the cross entropy loss */
+void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
                            XTensor * loss, const XTensor * weight = NULL,
                            const XTensor * padding = NULL, int leadingDim = -1);
 
-/* compute the cross entropy loss (scalar version) */
-DTYPE _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
+/* compute the cross entropy loss */
+DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
                             LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
                             const XTensor * padding = NULL, int leadingDim = -1);
...
@@ -31,25 +31,25 @@ REDUCE_SUM,
 REDUCE_MEAN
 };
 
-/* compute the cross entropy loss (tensor version) */
+/* compute the cross entropy loss */
 void _CrossEntropy(const XTensor * output, const XTensor * gold,
                    XTensor * loss, const XTensor * weight = NULL,
                    const XTensor * padding = NULL, int leadingDim = -1);
 
-/* compute the cross entropy loss (tensor version) */
-void _CrossEntropyManual(const XTensor * output, const XTensor * gold,
+/* compute the cross entropy loss */
+void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        XTensor * loss, const XTensor * weight = NULL,
                        const XTensor * padding = NULL, int leadingDim = -1);
 
-/* compute the cross entropy loss (scalar version) */
+/* compute the cross entropy loss (return the loss) */
 DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
                     LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
                     const XTensor * padding = NULL, int leadingDim = -1);
 
-/* compute the cross entropy loss (scalar version) */
-DTYPE _CrossEntropyManual(const XTensor * output, const XTensor * gold,
+/* compute the cross entropy loss (return the loss) */
+DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                          LOSS_COMPUTE_WAY reduceWay = REDUCE_MEAN, const XTensor * weight = NULL,
                          const XTensor * padding = NULL, int leadingDim = -1);
 
 /* backward computation of cross entropy function */
 void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
...
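Taken together, the header now exposes a tensor overload that writes a per-sample loss (one order lower than `output`) and a scalar overload that reduces with REDUCE_SUM or REDUCE_MEAN. A hypothetical call site against these declarations (the tensors are assumed to be already initialized with matching shapes; only the calls come from the header above):

/* hypothetical usage sketch, assuming the NiuTrans.Tensor headers */
XTensor output;   /* e.g. an order-2 distribution over classes, assumed filled */
XTensor gold;     /* same shape as output, assumed filled */
XTensor loss;     /* order output.order - 1 */

_CrossEntropyFast(&output, &gold, &loss);                   /* per-sample loss */
DTYPE mean = _CrossEntropyFast(&output, &gold);             /* defaults to REDUCE_MEAN */
DTYPE sum  = _CrossEntropyFast(&output, &gold, REDUCE_SUM); /* summed loss */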
@@ -297,9 +297,10 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
 \beta = \sum_i (dE/dy_i * y_i)
 */
-            for(int k = 0; k < blockNum; k++){
-                op = (DTYPE*)y->data + k * blockSize;
-                sp = (DTYPE*)dedx->data + k * blockSize;
+            for(int m = 0; m < blockNum; m++){
+                yp = (DTYPE*)dedy->data + m * blockSize;
+                op = (DTYPE*)y->data + m * blockSize;
+                sp = (DTYPE*)dedx->data + m * blockSize;
 
                 int nCols = stride;
                 for(int k = 0; k < stride; k++){
...
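This hunk is the actual bug fix named in the commit message. In the old CPU/NOLOSS branch, only `op` and `sp` were advanced per block: `yp`, the pointer into `dedy`, was never moved, so every block read dE/dy from block 0. Renaming the outer index from `k` to `m` also stops the inner `for(int k = 0; k < stride; k++)` loop from shadowing it. A standalone reproduction of the pointer mistake (the multiply stands in for the real softmax gradient rule; names mirror the diff):

#include <cstdio>

int main()
{
    const int blockNum = 2, blockSize = 3;
    float dedy[6] = {1, 2, 3, 4, 5, 6};
    float y[6]    = {1, 1, 1, 1, 1, 1};
    float dedx[6] = {0};

    float * yp = dedy;                    /* the buggy code never moved this */
    for(int m = 0; m < blockNum; m++){
        yp = dedy + m * blockSize;        /* the added line: advance per block */
        float * op = y + m * blockSize;
        float * sp = dedx + m * blockSize;
        for(int k = 0; k < blockSize; k++)
            sp[k] = yp[k] * op[k];        /* stand-in for the softmax rule */
    }

    /* with the fix the second block uses dedy[3..5], not dedy[0..2] again */
    printf("%g %g %g\n", dedx[3], dedx[4], dedx[5]);   /* 4 5 6 */
    return 0;
}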
@@ -61,7 +61,7 @@ bool TestCrossEntropy1()
     gold->SetData(goldData, unitNum);
 
     /* call CrossEntropy function */
-    _CrossEntropyManual(output, gold, loss);
+    _CrossEntropyFast(output, gold, loss);
     error2 = _CrossEntropy(output, gold, REDUCE_SUM);
     error1 = loss->Get1D(0);
@@ -83,7 +83,7 @@ bool TestCrossEntropy1()
     goldGPU->SetData(goldData, unitNum);
 
     /* call CrossEntropy function */
-    _CrossEntropyManual(outputGPU, goldGPU, lossGPU);
+    _CrossEntropyFast(outputGPU, goldGPU, lossGPU);
     error1 = lossGPU->Get1D(0);
     error2 = _CrossEntropy(outputGPU, goldGPU, REDUCE_SUM);
@@ -163,8 +163,8 @@ bool TestCrossEntropy2()
     /* call CrossEntropy function */
     error1 = _CrossEntropy(output, gold, REDUCE_SUM);
     error2 = _CrossEntropy(output, gold, REDUCE_MEAN);
-    error3 = _CrossEntropyManual(output, gold, REDUCE_SUM);
-    error4 = _CrossEntropyManual(output, gold, REDUCE_MEAN);
+    error3 = _CrossEntropyFast(output, gold, REDUCE_SUM);
+    error4 = _CrossEntropyFast(output, gold, REDUCE_MEAN);
 
     /* check results */
     cpuTest = (fabs(error1 - answer1) < 1e-4F &&
@@ -191,8 +191,8 @@ bool TestCrossEntropy2()
     /* call CrossEntropy function */
     error1 = _CrossEntropy(outputGPU, goldGPU, REDUCE_SUM);
     error2 = _CrossEntropy(outputGPU, goldGPU, REDUCE_MEAN);
-    error3 = _CrossEntropyManual(outputGPU, goldGPU, REDUCE_SUM);
-    error4 = _CrossEntropyManual(outputGPU, goldGPU, REDUCE_MEAN);
+    error3 = _CrossEntropyFast(outputGPU, goldGPU, REDUCE_SUM);
+    error4 = _CrossEntropyFast(outputGPU, goldGPU, REDUCE_MEAN);
 
     /* check results */
     gpuTest = (fabs(error1 - answer1) < 1e-4F &&
@@ -272,7 +272,7 @@ bool TestCrossEntropy3()
     gold->Set2D(1.0F, 3, 3);
 
     /* call CrossEntropy function */
-    _CrossEntropyManual(output, gold, loss, weight);
+    _CrossEntropyFast(output, gold, loss, weight);
 
     /* check results */
     cpuTest = loss->CheckData(answer, 4, 1e-4F);
@@ -297,7 +297,7 @@ bool TestCrossEntropy3()
     goldGPU->Set2D(1.0F, 3, 3);
 
     /* call CrossEntropy function */
-    _CrossEntropyManual(outputGPU, goldGPU, lossGPU, weightGPU);
+    _CrossEntropyFast(outputGPU, goldGPU, lossGPU, weightGPU);
 
     /* check results */
     gpuTest = lossGPU->CheckData(answer, 4, 1e-4F);
@@ -361,7 +361,7 @@ bool TestCrossEntropy4()
     _ScaleAndShiftMe(gold, 1, 2);
 
     /* call CrossEntropy function */
-    error = _CrossEntropyManual(output, gold);
+    error = _CrossEntropyFast(output, gold);
 
     /* check results */
     cpuTest = (fabs(error - answer) < 1e-4);
@@ -381,7 +381,7 @@ bool TestCrossEntropy4()
     _ScaleAndShiftMe(goldGPU, 1, 2);
 
     /* call CrossEntropy function */
-    error = _CrossEntropyManual(outputGPU, goldGPU);
+    error = _CrossEntropyFast(outputGPU, goldGPU);
 
     /* check results */
     gpuTest = (fabs(error - answer) < 1e-4);
...