Commit 52c0e35a by xuchen

fix bugs and improve the implementation in the CrossEntropy function

parent b409f07f
......@@ -35,6 +35,8 @@ const char * GetOPName(int type)
return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_ISNONZERO)
return "M_ISNONZERO";
else if (type == MATH_ISZERO)
return "M_ISZERO";
else if (type == MATH_LOG)
......
......@@ -35,7 +35,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CEIL MATH_ABSOLUTE + 1
#define MATH_EXP MATH_CEIL + 1
#define MATH_FLOOR MATH_EXP + 1
#define MATH_ISZERO MATH_FLOOR + 1
#define MATH_ISNONZERO MATH_FLOOR + 1
#define MATH_ISZERO MATH_ISNONZERO + 1
#define MATH_LOG MATH_ISZERO + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_SQUARE MATH_SQRT + 1
......
......@@ -2135,7 +2135,7 @@ generate a copy of XTensor
>> isFilledData - indicates whether we allocate the data for
the newly-generated tensor
*/
XTensor * NewTensor(XTensor * a, bool isFilledData)
XTensor * NewTensor(const XTensor * a, bool isFilledData)
{
int dims[MAX_TENSOR_DIM_NUM];
......
......@@ -450,7 +450,7 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
const int myDevID = -1, XMem * myMem = NULL);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(XTensor * a, bool isFilledData = true);
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
/* free the data space of a given tensor */
void DelTensor(XTensor * tensor);
......
......@@ -37,6 +37,11 @@ DTYPE round(DTYPE r)
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
/* indicator function: returns 1.0 when the input is non-zero, 0.0 otherwise */
DTYPE isnonzero(DTYPE x)
{
    if (x != 0.0)
        return (DTYPE)1.0;
    return (DTYPE)0.0;
}
DTYPE iszero(DTYPE r)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
......@@ -93,6 +98,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
......@@ -173,6 +182,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
......
......@@ -41,11 +41,18 @@ DTYPE cudaround(DTYPE r)
}
/* device-side indicator function: returns 1.0 when the input is non-zero, 0.0 otherwise */
__device__
DTYPE cudaisnonzero(DTYPE x)
{
    if (x != 0.0)
        return (DTYPE)1.0;
    return (DTYPE)0.0;
}
/* device-side indicator function: returns 1.0 when the input is zero, 0.0 otherwise */
__device__
DTYPE cudaiszero(DTYPE x)
{
    if (x == 0.0)
        return (DTYPE)1.0;
    return (DTYPE)0.0;
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
......@@ -96,6 +103,7 @@ SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, cudaisnonzero)
SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)
......
......@@ -66,6 +66,15 @@ void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsNonZero(DTYPE * a, DTYPE * b, int size);
/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type*/
__global__
void KernelIsNonZero(__half * a, __half * b, int size);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void _CudaIsNonZero(const XTensor * a, XTensor * b);
/* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsZero(DTYPE * a, DTYPE * b, int size);
......
......@@ -63,6 +63,15 @@ void _FloorMe(XTensor * a);
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void _IsNonZero(const XTensor *a, XTensor *b);
/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
keep the result in the input tensor a and return nothing */
void _IsNonZeroMe(XTensor *a);
/* if source entry is non-zero, set target entry to be one, otherwise zero (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor IsNonZero(const XTensor &a);
/* if source entry is zero, set target entry to be one, otherwise zero */
void _IsZero(const XTensor *a, XTensor *b);
/* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
......
......@@ -296,7 +296,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
/* compute the total loss */
if(padding != NULL) {
XTensor * temp(lossInter);
XTensor * temp = NewTensor(lossInter);
_Multiply(lossInter, padding, temp);
loss = _ReduceSumAll(temp);
delete temp;
......@@ -305,17 +305,18 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
loss = _ReduceSumAll(lossInter);
if(reduceWay == REDUCE_MEAN) {
if(padding != NULL) {
XTensor * zeroIndicator = NewTensorBuf(padding, padding->devID, padding->mem);
_IsZero(padding, zeroIndicator);
int reduceSize = (int)_ReduceSumAll(zeroIndicator);
loss = loss / (DTYPE)(padding->unitNum - reduceSize);
DelTensorBuf(zeroIndicator);
int nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossInter->unitNum;
}
else {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp;
}
else
loss = loss / (DTYPE)lossInter->unitNum;
loss = loss / (DTYPE)nonZeroNum;
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
......@@ -471,7 +472,7 @@ with respect to gold standard, and y this the model output
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight, XTensor * padding,
const XTensor * weight, const XTensor * padding,
int leadingDim)
{
#ifdef USE_CUDA
......@@ -561,8 +562,8 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
}
if(padding != NULL) {
XTensor * tmp(padding);
_IsZero(padding, tmp);
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
......
......@@ -237,18 +237,18 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
loss = _ReduceSumAll(lossInter);
if(reduceWay == REDUCE_MEAN) {
int totalNum;
int nonZeroNum;
if(padding == NULL) {
totalNum = lossInter->unitNum;
nonZeroNum = lossInter->unitNum;
}
else {
XTensor * zeroIndicator = NewTensorBuf(output, output->devID, output->mem);
_IsZero(padding, zeroIndicator);
totalNum = lossInter->unitNum - (int)_ReduceSumAll(zeroIndicator);
DelTensorBuf(zeroIndicator);
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp;
}
loss = loss / (DTYPE)totalNum;
loss = loss / (DTYPE)nonZeroNum;
}
return loss;
......@@ -328,9 +328,9 @@ with respect to gold standard, and y this the model output
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight, XTensor * padding,
int leadingDim)
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
......@@ -398,8 +398,8 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTe
}
if(padding != NULL) {
XTensor * tmp(padding);
_IsZero(padding, tmp);
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
......
......@@ -29,18 +29,18 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compute the cross entropy loss */
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss */
DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight = NULL, XTensor * padding = NULL,
int leadingDim = -1);
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -53,7 +53,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
/* backward computation of cross entropy function */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight = NULL, XTensor * padding = NULL,
const XTensor * weight = NULL, const XTensor * padding = NULL,
int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论