Commit 38bff350 by xuchen

merge with liyinqiao branch

parent 509c0233
...@@ -32,10 +32,6 @@ ...@@ -32,10 +32,6 @@
//#include <stdlib.h> //#include <stdlib.h>
//#include <crtdbg.h> //#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
using namespace nts; using namespace nts;
using namespace fnnlm; using namespace fnnlm;
using namespace transformer; using namespace transformer;
......
...@@ -200,7 +200,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -200,7 +200,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP); labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec); lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAll(lossTensor); float lossBatch = ReduceSumAllValue(lossTensor);
DTYPE lossLocal = lossBatch / wc; DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F); bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
...@@ -345,7 +345,7 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model) ...@@ -345,7 +345,7 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
XTensor lossTensor; XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, 0); labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec); lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAll(lossTensor); float lossBatch = ReduceSumAllValue(lossTensor);
/* dump the test result */ /* dump the test result */
for(int s = 0; s < bSize; s++){ for(int s = 0; s < bSize; s++){
......
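Note: a minimal sketch of the renamed call above, assuming ReduceSumAllValue keeps the old behaviour of summing every element and returning the result as a plain float (include paths are assumptions and may differ per build layout):

    #include "XTensor.h"                         // assumed include paths
    #include "core/reduce/ReduceSumAll.h"
    using namespace nts;

    void LossDemo()
    {
        XTensor loss;
        InitTensor2D(&loss, 8, 8, X_FLOAT);      // stand-in for the per-token loss tensor
        loss.SetZeroAll();
        float total = ReduceSumAllValue(loss);   // scalar sum of all elements (0.0F here)
        (void)total;
    }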
...@@ -130,6 +130,39 @@ void InitTensor(XTensor * tensor, ...@@ -130,6 +130,39 @@ void InitTensor(XTensor * tensor,
} }
/* /*
initialize a scalar V2
>> tensor - the tensor we intend to initialize
>> myDataType - the data type of the tensor (e.g., int, float, or double)
>> myDevID - when myMem is NULL, myDevID specifies the device
on which we allocate the data
>> myMem - the memory pool used for allocating the data array;
myMem = NULL means that the tensor is allocated on
the device dynamically, rather than from the memory pool
*/
void InitTensor0DV2(XTensor * tensor, const TENSOR_DATA_TYPE myDataType, const int myDevID, XMem * myMem)
{
int dims[MAX_TENSOR_DIM_NUM];
InitTensorV2(tensor, 0, dims, myDataType, 1.0F, myDevID, myMem);
}
/*
initialize a scalar
>> tensor - the tensor we intend to initialize
>> myDataType - the data type of the tensor (e.g., int, float, or double)
>> myDevID - the device on which we allocate the data
>> isEnableGrad - whether gradient computation is enabled for this tensor
*/
void InitTensor0D(XTensor * tensor, const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[MAX_TENSOR_DIM_NUM];
InitTensor(tensor, 0, dims, myDataType, myDevID, isEnableGrad);
}
/*
initialize a dense tensor V2 initialize a dense tensor V2
>> tensor - the tensor we intend to initialize >> tensor - the tensor we intend to initialize
>> num - number of elements >> num - number of elements
...@@ -551,6 +584,37 @@ XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnable ...@@ -551,6 +584,37 @@ XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnable
} }
/* /*
generate a scalar V2
>> myDataType - the data type of the tensor (e.g., int, float, or double)
>> myDevID - when myMem is NULL, myDevID specifies the device
on which we allocate the data
>> myMem - the memory pool used for allocating the data array;
myMem = NULL means that the tensor is allocated on
the device dynamically, rather than from the memory pool.
*/
XTensor * NewTensor0DV2(const TENSOR_DATA_TYPE myDataType, const int myDevID, XMem * myMem)
{
int dims[MAX_TENSOR_DIM_NUM];
return NewTensorV2(0, dims, myDataType, 1.0F, myDevID, myMem);
}
/*
generate a scalar
>> myDataType - the data type of the tensor (e.g., int, float, or double)
>> myDevID - the device on which we allocate the data
>> isEnableGrad - whether gradient computation is enabled for this tensor
*/
XTensor * NewTensor0D(const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[MAX_TENSOR_DIM_NUM];
return NewTensor(0, dims, myDataType, myDevID, isEnableGrad);
}
/*
generate a dense vector V2 generate a dense vector V2
>> num - number of entries >> num - number of entries
>> myDataType - unit size (e.g., int, float, and double) >> myDataType - unit size (e.g., int, float, and double)
...@@ -799,7 +863,7 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData) ...@@ -799,7 +863,7 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM); memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
if(a->order > 0) if(a->order >= 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order); memcpy(dims, a->dimSize, sizeof(int) * a->order);
if(!isFilledData) if(!isFilledData)
...@@ -810,7 +874,6 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData) ...@@ -810,7 +874,6 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
a->devID, a->mem); a->devID, a->mem);
return newTensor; return newTensor;
} }
/* /*
......
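Note: a small usage sketch of the scalar (0d) constructors added above, using the Set0D/Get0D accessors declared later in this commit; the include path and the NewTensor/DelTensor ownership pairing are assumptions based on the rest of the library:

    #include "XTensor.h"                 // include path is an assumption
    using namespace nts;

    void ScalarDemo()
    {
        /* heap-allocated scalar on the CPU (devID = -1) */
        XTensor * s = NewTensor0D(X_FLOAT, -1);
        s->Set0D(2.0F);                  // write the single cell
        DTYPE v = s->Get0D();            // read it back: 2.0F
        (void)v;
        DelTensor(s);                    // assumed to pair with NewTensor* as elsewhere in the library

        /* in-place initialization of a stack object, integer variant */
        XTensor t;
        InitTensor0D(&t, X_INT);
        t.Set0DInt(3);
        int iv = t.Get0DInt();           // 3
        (void)iv;
    }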
...@@ -26,6 +26,9 @@ ...@@ -26,6 +26,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* default settings */
#define X_ENABLE_GRAD true
/* /*
* we define the "new and delete" functions below * we define the "new and delete" functions below
*/ */
...@@ -38,7 +41,13 @@ void InitTensorV2(XTensor * tensor, ...@@ -38,7 +41,13 @@ void InitTensorV2(XTensor * tensor,
/* initialize a dense XTensor */ /* initialize a dense XTensor */
void InitTensor(XTensor * tensor, void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a scalar V2 */
void InitTensor0DV2(XTensor * tensor, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a scalar */
void InitTensor0D(XTensor * tensor, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense vector V2 */ /* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num, void InitTensor1DV2(XTensor * tensor, const int num,
...@@ -46,7 +55,7 @@ void InitTensor1DV2(XTensor * tensor, const int num, ...@@ -46,7 +55,7 @@ void InitTensor1DV2(XTensor * tensor, const int num,
/* initialize a dense vector */ /* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num, void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense matrix V2 */ /* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum, void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
...@@ -54,7 +63,7 @@ void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum, ...@@ -54,7 +63,7 @@ void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
/* initialize a dense matrix */ /* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum, void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 3d tensor V2 */ /* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2, void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
...@@ -62,7 +71,7 @@ void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2, ...@@ -62,7 +71,7 @@ void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 3d tensor */ /* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2, void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 4d tensor V2 */ /* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
...@@ -70,7 +79,7 @@ void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, ...@@ -70,7 +79,7 @@ void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 4d tensor */ /* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 5d tensor V2 */ /* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4, void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
...@@ -78,7 +87,7 @@ void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, ...@@ -78,7 +87,7 @@ void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 5d tensor */ /* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4, void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a tensor with a reference tensor V2 */ /* initialize a tensor with a reference tensor V2 */
void InitTensorV2(XTensor * tensor, const XTensor * reference); void InitTensorV2(XTensor * tensor, const XTensor * reference);
...@@ -98,7 +107,7 @@ XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DAT ...@@ -98,7 +107,7 @@ XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DAT
/* generate a dense XTensor */ /* generate a dense XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a XTensor which allocates data on the buffer V2 */ /* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize, XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
...@@ -107,20 +116,26 @@ XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize, ...@@ -107,20 +116,26 @@ XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
/* generate a dense XTensor which allocates data on the buffer */ /* generate a dense XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize, XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a XTensor which allocates data on the buffer V2 */ /* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID, XMem * myMem); XTensor * NewTensorBufV2(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer */ /* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnableGrad = true); XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a scalar V2 */
XTensor * NewTensor0DV2(const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* generate a scalar */
XTensor * NewTensor0D(const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense vector V2 */ /* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL); XMem * myMem = NULL);
/* generate a dense vector */ /* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense matrix V2 */ /* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum, XTensor * NewTensor2DV2(const int rowNum, const int colNum,
...@@ -130,7 +145,7 @@ XTensor * NewTensor2DV2(const int rowNum, const int colNum, ...@@ -130,7 +145,7 @@ XTensor * NewTensor2DV2(const int rowNum, const int colNum,
/* generate a dense matrix */ /* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum, XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 3d tensor V2 */ /* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2, XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
...@@ -140,7 +155,7 @@ XTensor * NewTensor3DV2(const int d0, const int d1, const int d2, ...@@ -140,7 +155,7 @@ XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
/* generate a dense 3d tensor */ /* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2, XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 4d tensor V2 */ /* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3, XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
...@@ -150,7 +165,7 @@ XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3, ...@@ -150,7 +165,7 @@ XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
/* generate a dense 4d tensor */ /* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3, XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 5d tensor V2 */ /* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4, XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
...@@ -160,10 +175,10 @@ XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, ...@@ -160,10 +175,10 @@ XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3,
/* generate a dense 5d tensor */ /* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4, XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense vector by range */ /* generate a dense vector by range */
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = true); XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a copy of XTensor (with a reference to a given tensor) */ /* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true); XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
......
...@@ -78,7 +78,7 @@ namespace nts { ...@@ -78,7 +78,7 @@ namespace nts {
if(!(x)) \ if(!(x)) \
{ \ { \
fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \ fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
exit(1); \ throw; \
} \ } \
} \ } \
...@@ -87,7 +87,7 @@ namespace nts { ...@@ -87,7 +87,7 @@ namespace nts {
if(!(x)) \ if(!(x)) \
{ \ { \
fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \ fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
exit(1); \ throw; \
} \ } \
} \ } \
...@@ -95,7 +95,7 @@ namespace nts { ...@@ -95,7 +95,7 @@ namespace nts {
{ \ { \
{ \ { \
fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \ fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
exit(1); \ throw; \
} \ } \
} \ } \
......
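Note: for reference, a typical call site of the macros touched above; the only behavioural change in this hunk is that the failure branch now executes a bare throw instead of exit(1):

    #include "XTensor.h"   // pulls in the header that defines CheckNTErrors
    using namespace nts;

    /* sketch only: CheckNTErrors(condition, message) as used throughout this diff */
    void CheckShapes(const XTensor &a, const XTensor &b)
    {
        CheckNTErrors(a.unitNum == b.unitNum, "Unmatched tensors!");
        CheckNTErrors(a.dataType == b.dataType, "Unmatched data types!");
        /* with no exception currently in flight, a bare `throw;` terminates
           the process via std::terminate, much as exit(1) did before */
    }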
...@@ -167,7 +167,7 @@ void XLink::SetType(int id) ...@@ -167,7 +167,7 @@ void XLink::SetType(int id)
type[0] = 0; type[0] = 0;
strcpy(type, GetOPName(id)); strcpy(type, GetOPName(id));
typeID = id; typeID = id;
if(id != 0){ if (id != 0) {
CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!"); CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
} }
} }
......
...@@ -249,26 +249,6 @@ inline int TensorListBase<T>::FindFirst(const T& item) ...@@ -249,26 +249,6 @@ inline int TensorListBase<T>::FindFirst(const T& item)
return -1; return -1;
} }
template <>
inline int TensorListBase<Example>::FindFirst(const Example& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
template <>
inline int TensorListBase<Result>::FindFirst(const Result& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
/* clear the data array */ /* clear the data array */
template <typename T> template <typename T>
void TensorListBase<T>::Clear() void TensorListBase<T>::Clear()
...@@ -383,8 +363,7 @@ template struct TensorListBase<long>; ...@@ -383,8 +363,7 @@ template struct TensorListBase<long>;
template struct TensorListBase<float>; template struct TensorListBase<float>;
template struct TensorListBase<short>; template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>; template struct TensorListBase<XTensor*>;
template struct TensorListBase<Result>; template struct TensorListBase<uint64_t>;
template struct TensorListBase<Example>;
template struct TensorListBase<void*>; template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
#include "XMem.h" #include "XMem.h"
#include "XGlobal.h" #include "XGlobal.h"
#include <cstdint>
#ifndef __TensorList_H__ #ifndef __TensorList_H__
#define __TensorList_H__ #define __TensorList_H__
...@@ -118,7 +120,14 @@ public: ...@@ -118,7 +120,14 @@ public:
void Shuffle(int nround = 10, int beg = -1, int len = 0); void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */ /* short */
T& operator[] (int i) { return GetItem(i); }; T& operator[] (int i) {
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return items[count + i];
else
return items[i];
};
T& Get(int i) { return GetItem(i); }; T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); }; void Set(int i, T item) { SetItem(i, item); };
}; };
...@@ -132,19 +141,7 @@ typedef TensorListBase<char*> StrList; ...@@ -132,19 +141,7 @@ typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList; typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList; typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList; typedef TensorListBase<short> ShortList;
typedef TensorListBase<uint64_t> UInt64List;
struct Example {
int id;
IntList data;
};
struct Result {
int id;
IntList data;
};
typedef TensorListBase<Result> ResultList;
typedef TensorListBase<Example> ExampleList;
typedef TensorListBase<XTensor*> TensorList; typedef TensorListBase<XTensor*> TensorList;
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
......
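Note: a short sketch of the bounds-checked operator[] added above, including the new negative-index form; IntList is the typedef kept in this hunk, Add() is assumed to have its usual append semantics, and the header path is an assumption:

    #include "XList.h"        // header guarded by __TensorList_H__ above; path is an assumption
    using namespace nts;

    void ListDemo()
    {
        IntList list;
        list.Add(10);
        list.Add(20);
        list.Add(30);
        int first = list[0];      // 10
        int last  = list[-1];     // 30: a negative i now maps to items[count + i]
        (void)first; (void)last;
    }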
...@@ -53,6 +53,8 @@ const char * GetOPName(int type) ...@@ -53,6 +53,8 @@ const char * GetOPName(int type)
return "M_TAN"; return "M_TAN";
else if (type == MATH_ROUND) else if (type == MATH_ROUND)
return "M_ROUND"; return "M_ROUND";
else if (type == MATH_RECIPROCAL)
return "M_RECIPROCAL";
else if (type == MATH_CLIP) else if (type == MATH_CLIP)
return "M_CLIP"; return "M_CLIP";
else if (type == MATH_DIV) else if (type == MATH_DIV)
...@@ -105,6 +107,8 @@ const char * GetOPName(int type) ...@@ -105,6 +107,8 @@ const char * GetOPName(int type)
return "R_REDUCEMEAN"; return "R_REDUCEMEAN";
else if (type == REDUCE_REDUCESUM) else if (type == REDUCE_REDUCESUM)
return "R_REDUCESUM"; return "R_REDUCESUM";
else if (type == REDUCE_REDUCESUMALL)
return "R_REDUCESUMALL";
else if (type == REDUCE_REDUCESUMSQUARED) else if (type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED"; return "R_REDUCESUMSQUARED";
else if (type == REDUCE_REDUCEVARIANCE) else if (type == REDUCE_REDUCEVARIANCE)
...@@ -113,6 +117,8 @@ const char * GetOPName(int type) ...@@ -113,6 +117,8 @@ const char * GetOPName(int type)
else if ((type & DATA_BASE) != 0){ else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT) if (type == GETANDSET_SELECT)
return "G_SELECT"; return "G_SELECT";
else if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE";
else if (type == MOVEMENT_COPYINDEXED) else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED"; return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES) else if (type == MOVEMENT_COPYVALUES)
......
...@@ -44,8 +44,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -44,8 +44,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_COS MATH_SIN + 1 #define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1 #define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1 #define MATH_ROUND MATH_TAN + 1
#define MATH_RECIPROCAL MATH_ROUND + 1
#define MATH_CLIP MATH_ROUND + 1 #define MATH_CLIP MATH_RECIPROCAL + 1
#define MATH_DIV MATH_CLIP + 1 #define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1 #define MATH_DIVDIM MATH_DIV + 1
#define MATH_MASK MATH_DIVDIM + 1 #define MATH_MASK MATH_DIVDIM + 1
...@@ -76,7 +77,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -76,7 +77,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define REDUCE_REDUCEMAX REDUCE + 1 #define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1 #define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1 #define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1 #define REDUCE_REDUCESUMALL REDUCE_REDUCESUM + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUMALL + 1
#define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1 #define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
/* data and shape related operations */ /* data and shape related operations */
......
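Note: a quick check that the two op ids added above resolve through GetOPName as expected:

    #include <cstdio>
    #include "XName.h"        // defines the MATH_* / REDUCE_* ids and GetOPName
    using namespace nts;

    int main()
    {
        printf("%s\n", GetOPName(MATH_RECIPROCAL));      // M_RECIPROCAL
        printf("%s\n", GetOPName(REDUCE_REDUCESUMALL));  // R_REDUCESUMALL
        return 0;
    }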
...@@ -147,7 +147,11 @@ void XStream::StreamSynchronize() ...@@ -147,7 +147,11 @@ void XStream::StreamSynchronize()
void XStream::ThreadSynchronize() void XStream::ThreadSynchronize()
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize(); cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif #endif
} }
......
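Note: not part of this commit, but one plausible way the TODO branch above could later be filled, assuming cudaDeviceSynchronize remains the documented replacement for the legacy call:

    #include <cuda_runtime.h>

    /* sketch of a ThreadSynchronize body without the TODO */
    void ThreadSynchronizeSketch()
    {
    #if CUDART_VERSION < 10000
        cudaThreadSynchronize();
    #else
        cudaDeviceSynchronize();   // assumption: drop-in replacement for the removed call
    #endif
    }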
...@@ -29,7 +29,6 @@ ...@@ -29,7 +29,6 @@
#define __XTENSOR_H__ #define __XTENSOR_H__
#include "XGlobal.h" #include "XGlobal.h"
#include "XMem.h"
#include "XPRunner.h" #include "XPRunner.h"
#include "XStream.h" #include "XStream.h"
#include "XHeap.h" #include "XHeap.h"
...@@ -276,6 +275,18 @@ public: ...@@ -276,6 +275,18 @@ public:
/* return a tensor that datatype is same as the special tensor */ /* return a tensor that datatype is same as the special tensor */
XTensor TypeAs(const XTensor input); XTensor TypeAs(const XTensor input);
/* return a tensor that datatype is integer */
XTensor Int();
/* return a tensor that datatype is float */
XTensor Float();
/* return a tensor that datatype is float16 */
XTensor Float16();
/* return a tensor that datatype is double */
XTensor Double();
/* get the number of items in the data array */ /* get the number of items in the data array */
int GetSize() const; int GetSize() const;
...@@ -331,6 +342,9 @@ public: ...@@ -331,6 +342,9 @@ public:
/* get the pointer to a cell */ /* get the pointer to a cell */
void * GetCell(int index[], int size = -1) const; void * GetCell(int index[], int size = -1) const;
/* get the default type value of a cell in a 0d tensor */
DTYPE Get0D() const;
/* get the default type value of a cell in a 1d tensor */ /* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i) const; DTYPE Get1D(int i) const;
...@@ -343,6 +357,9 @@ public: ...@@ -343,6 +357,9 @@ public:
/* get the int value of a cell by its offset */ /* get the int value of a cell by its offset */
int GetInt(int offset) const; int GetInt(int offset) const;
/* get the int value of a cell in a 0d tensor */
int Get0DInt() const;
/* get the int value of a cell in a 1d tensor */ /* get the int value of a cell in a 1d tensor */
int Get1DInt(int i) const; int Get1DInt(int i) const;
...@@ -364,6 +381,9 @@ public: ...@@ -364,6 +381,9 @@ public:
/* set the value of a cell with its offset in the array */ /* set the value of a cell with its offset in the array */
bool Set(DTYPE value, int offset); bool Set(DTYPE value, int offset);
/* set the value of a cell in a 0d tensor */
bool Set0D(DTYPE value);
/* set the value of a cell in a 1d tensor */ /* set the value of a cell in a 1d tensor */
bool Set1D(DTYPE value, int i); bool Set1D(DTYPE value, int i);
...@@ -379,6 +399,9 @@ public: ...@@ -379,6 +399,9 @@ public:
/* set the integer value of a cell */ /* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1); bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 0d tensor */
bool Set0DInt(int value);
/* set the integer value of a cell in a 1d tensor */ /* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i); bool Set1DInt(int value, int i);
......
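Note: a brief sketch exercising the new dtype helpers and 0d accessors declared above; per their comments Int()/Float() return converted copies, though the exact conversion path is an assumption:

    #include "XTensor.h"
    using namespace nts;

    void TypeDemo()
    {
        XTensor a;
        InitTensor2D(&a, 2, 3, X_FLOAT);
        a.SetZeroAll();

        XTensor ai = a.Int();        // X_INT copy of a
        XTensor af = ai.Float();     // back to X_FLOAT
        (void)af;

        XTensor s;
        InitTensor0D(&s, X_FLOAT);
        s.Set0D(1.5F);
        DTYPE v = s.Get0D();         // 1.5F
        (void)v;
    }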
...@@ -37,7 +37,6 @@ ...@@ -37,7 +37,6 @@
#include "arithmetic/Multiply.h" #include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h" #include "arithmetic/MultiplyDim.h"
#include "arithmetic/Sub.h" #include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h" #include "arithmetic/Sum.h"
#include "arithmetic/SumDim.h" #include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h" #include "arithmetic/XTensorBLAS.h"
......
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
#include "../../XName.h" #include "../../XName.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../shape/IsSameShaped.h" #include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "../math/ScaleAndShift.h"
#include "Div.h" #include "Div.h"
#include "Div.cuh" #include "Div.cuh"
#include "DivDim.h" #include "DivDim.h"
...@@ -127,7 +129,7 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le ...@@ -127,7 +129,7 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
element-wise division of two tensors (do it on site) element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i) a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item where i is the index of the item
>> a - tensor a (where keep the result) >> a - tensor a (where keep the result)
...@@ -144,7 +146,7 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim) ...@@ -144,7 +146,7 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
element-wise division of two tensors (do it on site) element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i) a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item where i is the index of the item
>> a - tensor a (where keep the result) >> a - tensor a (where keep the result)
...@@ -152,45 +154,35 @@ where i is the index of the item ...@@ -152,45 +154,35 @@ where i is the index of the item
>> alpha - the coefficient >> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
*/ */
void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim) void DivMe(XTensor & a, const XTensor & b, DTYPE alpha, int leadingDim)
{ {
_Div(&a, &b, &a, alpha, leadingDim); if (b.order == 0){
} DTYPE scale = 1.0F / b.Get0D() + alpha;
/* _ScaleAndShift(&a, &a, scale, 0.0F);
return a dimension if the division is performed as DivDim (in more details in DivDim.h)
>> a - a tensor
>> b - another tensor for division
*/
int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
} }
else {
int n = GetBroadcastDimIndex(a, b);
if(hitCount == 1) if (n == -1) {
return hitDim; CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &a, alpha, leadingDim);
}
else if (n >= 0 && n < a.order)
/* call _DivDim function */
_DivDim(&a, &b, &a, n, alpha);
else else
return -1; ShowNTErrors("Something is wrong!");
}
} }
/* /*
element-wise division of two tensors (return an XTensor structure) element-wise division of two tensors (return an XTensor structure)
make a new tensor c to keep the result and return it make a new tensor c to keep the result and return it
c(i) = a(i)*b(i) c(i) = a(i)/b(i)
where i is the index of the item where i is the index of the item
>> a - tensor a >> a - tensor a
...@@ -199,12 +191,18 @@ where i is the index of the item ...@@ -199,12 +191,18 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors << return - the product of the tensors
*/ */
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) XTensor Div(const XTensor & a, const XTensor & b, int leadingDim)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetDivDimIndex(a, b); if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
ScaleAndShift(a, c, scale, 0.0F);
}
else {
DTYPE alpha = 0.0F;
int n = GetBroadcastDimIndex(a, b);
if(n == -1){ if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
...@@ -215,8 +213,6 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) ...@@ -215,8 +213,6 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
/* tensor connections */ /* tensor connections */
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIV); XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
} }
} }
else if(n >= 0 && n < a.order){ else if(n >= 0 && n < a.order){
...@@ -227,12 +223,12 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) ...@@ -227,12 +223,12 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM); XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
} }
} }
else{ else{
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
return c; return c;
} }
...@@ -249,25 +245,36 @@ where i is the index of the item ...@@ -249,25 +245,36 @@ where i is the index of the item
>> alpha - the coefficient >> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
*/ */
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim) void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int leadingDim)
{ {
if (!c.isInit || !IsSameShaped(a, c)) { if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a); InitTensorV2(&c, &a);
} }
int n = GetDivDimIndex(a, b); if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) { if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */ /* call _Div function */
_Div(&a, &b, &c, 0, leadingDim); _Div(&a, &b, &c, alpha, leadingDim);
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV); XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
} }
} }
else if (n >= 0 && n < a.order) { else if (n >= 0 && n < a.order) {
...@@ -278,13 +285,12 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin ...@@ -278,13 +285,12 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM); XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
} }
} }
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it ...@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)/b(i) c(i) = a(i)/b(i)
where i is the index of the element where i is the index of the element
*/ */
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Div(const XTensor &a, const XTensor &b, int leadingDim = 0);
/* /*
element-wise division of two tensors: element-wise division of two tensors:
......
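Note: a minimal sketch of the new scalar branch of Div shown above: when b is a 0-order tensor, the call reduces to a ScaleAndShift by 1/b (include paths are assumptions):

    #include "XTensor.h"
    #include "core/arithmetic/Div.h"     // path is an assumption
    using namespace nts;

    void DivByScalarDemo()
    {
        XTensor a, s;
        InitTensor1D(&a, 4, X_FLOAT);
        InitTensor0D(&s, X_FLOAT);

        for (int i = 0; i < 4; i++)
            a.Set1D(2.0F * (i + 1), i);  // a = {2, 4, 6, 8}
        s.Set0D(2.0F);

        XTensor c = Div(a, s);           // c = {1, 2, 3, 4}
        (void)c;
    }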
...@@ -22,8 +22,8 @@ ...@@ -22,8 +22,8 @@
*/ */
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XTensor.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -39,7 +39,7 @@ c = a - b * \beta ...@@ -39,7 +39,7 @@ c = a - b * \beta
>> alpha - value >> alpha - value
*/ */
__global__ __global__
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha) void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
......
...@@ -159,6 +159,10 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -159,6 +159,10 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"The code must be run on the same GPU!"); "The code must be run on the same GPU!");
int devIDBackup; int devIDBackup;
if (beta == 0)
c->SetZeroAll();
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID); cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
......
...@@ -156,6 +156,9 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -156,6 +156,9 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL) if (stream != NULL)
cublasSetStream(*handle, stream->stream); cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) { if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
......
...@@ -54,6 +54,9 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -54,6 +54,9 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int aColNum = am; int aColNum = am;
int bColNum = bm; int bColNum = bm;
if (beta == 0)
c->SetZeroAll();
/* a * b */ /* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) { if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm, RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm,
......
...@@ -118,6 +118,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -118,6 +118,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSize[i]; blockNum *= a->dimSize[i];
} }
if (beta == 0)
c->SetZeroAll();
int devIDBackup = 0; int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
......
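Note: presumably the motivation for the beta == 0 guards added in the matrix-multiplication hunks above: with IEEE floats, 0 * NaN is still NaN, so accumulating c = a*b + beta*c into an uninitialized c can poison the result unless c is cleared first. A tiny standalone illustration:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        float c    = std::nanf("");    // stands in for an uninitialized cell of c
        float ab   = 6.0f;             // the a*b contribution for that cell
        float beta = 0.0f;
        printf("%f\n", ab + beta * c); // prints nan, hence SetZeroAll() when beta == 0
        return 0;
    }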
...@@ -27,36 +27,6 @@ ...@@ -27,36 +27,6 @@
#include "Sum.h" #include "Sum.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
*/
int GetSumIndex(const XTensor &a, const XTensor &b)
{
if (a.order < b.order)
return -1;
if (IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
for (int i = 0; i < b.order; i++) {
if (b.dimSize[b.order - 1 - i] == 1)
continue;
else if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]) {
hitCount++;
hitDim = a.order - b.order + i;
}
}
if (hitCount == 1)
return hitDim;
else
return -1;
}
/* /*
operation c = x * w + b MulAndShift operation c = x * w + b MulAndShift
>> x - tensor x >> x - tensor x
...@@ -99,7 +69,10 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b, ...@@ -99,7 +69,10 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
XTensor c(tmp); XTensor c(tmp);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetSumIndex(tmp, b); if (b.order == 0)
ScaleAndShift(*tmp, c, 1.0F, b.Get0D());
else {
int n = GetBroadcastDimIndex(tmp, b);
if (n == -1) { if (n == -1) {
/* call _Sum function */ /* call _Sum function */
...@@ -107,23 +80,22 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b, ...@@ -107,23 +80,22 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
else if (n >= 0 && n < tmp->order) { else if (n >= 0 && n < tmp->order) {
/* call _SumDim function */ /* call _SumDim function */
_SumDim(tmp, &b, &c, n); _SumDim(tmp, &b, &c, n);
} }
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
/* tensor connections */ /* tensor connections */
if (w.enableGrad && b.enableGrad) { if (w.enableGrad && b.enableGrad) {
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT); XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, X_NOTRANS); XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS); XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
}
} }
/* destroy variables */ /* destroy variables */
...@@ -174,7 +146,7 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA, ...@@ -174,7 +146,7 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
XTensor c(tmp); XTensor c(tmp);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetSumIndex(tmp, b); int n = GetBroadcastDimIndex(tmp, b);
if (n == -1) { if (n == -1) {
/* call _Sum function */ /* call _Sum function */
......
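Note: a sketch of the new 0-order bias path in MulAndShift, where the bias is folded in via ScaleAndShift(*tmp, c, 1.0F, b.Get0D()); the trailing parameters of MulAndShift are assumed to keep their defaults, and the include path is an assumption:

    #include "XTensor.h"
    #include "core/arithmetic/MulAndShift.h"   // path is an assumption
    using namespace nts;

    void MulAndShiftScalarBias()
    {
        XTensor x, w, b;
        InitTensor2D(&x, 2, 3, X_FLOAT);
        InitTensor2D(&w, 3, 4, X_FLOAT);
        InitTensor0D(&b, X_FLOAT);

        x.SetZeroAll();
        w.SetZeroAll();
        b.Set0D(1.0F);

        XTensor y = MulAndShift(x, w, b);      // x * w shifted by the scalar bias -> all ones
        (void)y;
    }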
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
#include "../../XName.h" #include "../../XName.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../shape/IsSameShaped.h" #include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "../math/ScaleAndShift.h"
#include "Multiply.h" #include "Multiply.h"
#include "Multiply.cuh" #include "Multiply.cuh"
#include "MultiplyDim.h" #include "MultiplyDim.h"
...@@ -155,36 +157,28 @@ where i is the index of the item ...@@ -155,36 +157,28 @@ where i is the index of the item
*/ */
void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim) void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{ {
_Multiply(&a, &b, &a, alpha, leadingDim); if (b.order == 0){
} DTYPE scale = b.Get0D() + alpha;
/* _ScaleAndShift(&a, &a, scale, 0.0F);
return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h) }
>> a - a tensor else {
>> b - another tensor for multiplication int n = GetBroadcastDimIndex(a, b);
*/
int GetMultiplyDimIndex(const XTensor &a, const XTensor &b) if (n == -1) {
{ CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
if(a.order < b.order)
return -1; /* call _Multiply function */
if(IsSameShaped(a, b)) _Multiply(&a, &b, &a, alpha, leadingDim);
return -1; }
else if (n >= 0 && n < a.order) {
int hitCount = 0; /* call _MultiplyDim function */
int hitDim = -1; _MultiplyDim(&a, &b, &a, n, alpha);
for(int i = 0; i < b.order; i++){ }
if(b.dimSize[b.order - 1 - i] == 1) else {
continue; ShowNTErrors("Something is wrong!");
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){ }
hitCount++; }
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
} }
/* /*
...@@ -199,25 +193,28 @@ where i is the index of the item ...@@ -199,25 +193,28 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors << return - the product of the tensors
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetMultiplyDimIndex(a, b); if (b.order == 0){
DTYPE scale = b.Get0D();
ScaleAndShift(a, c, scale, 0.0F);
}
else {
DTYPE alpha = 0.0F;
int n = GetBroadcastDimIndex(a, b);
if(n == -1){ if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */ /* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim); _Multiply(&a, &b, &c, alpha, leadingDim);
/* tensor connections */ /* tensor connections */
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
} }
} }
else if(n >= 0 && n < a.order){ else if(n >= 0 && n < a.order){
...@@ -228,12 +225,12 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim ...@@ -228,12 +225,12 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
} }
} }
else{ else{
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
return c; return c;
} }
...@@ -256,19 +253,30 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l ...@@ -256,19 +253,30 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
InitTensorV2(&c, &a); InitTensorV2(&c, &a);
} }
int n = GetMultiplyDimIndex(a, b); if (b.order == 0){
DTYPE scale = b.Get0D();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) { if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */ /* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim); _Multiply(&a, &b, &c, alpha, leadingDim);
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
} }
} }
else if (n >= 0 && n < a.order) { else if (n >= 0 && n < a.order) {
...@@ -279,13 +287,12 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l ...@@ -279,13 +287,12 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
} }
} }
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it ...@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)*b(i) c(i) = a(i)*b(i)
where i is the index of the element where i is the index of the element
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
/* /*
element-wise product of two tensors: element-wise product of two tensors:
......
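Note: the matching sketch for Multiply with a 0-order right-hand side, which now routes through ScaleAndShift(a, c, b.Get0D()) as shown above (include path is an assumption):

    #include "XTensor.h"
    #include "core/arithmetic/Multiply.h"   // path is an assumption
    using namespace nts;

    void MultiplyByScalarDemo()
    {
        XTensor a, s;
        InitTensor1D(&a, 3, X_FLOAT);
        InitTensor0D(&s, X_FLOAT);

        for (int i = 0; i < 3; i++)
            a.Set1D((DTYPE)(i + 1), i);   // a = {1, 2, 3}
        s.Set0D(10.0F);

        XTensor c = Multiply(a, s);       // c = {10, 20, 30}
        (void)c;
    }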
...@@ -233,7 +233,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE ...@@ -233,7 +233,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
{ {
CheckNTErrors(a->order == b->order, "Wrong tensor orders!"); CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!"); CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!"); CheckNTErrors(a->order >= 0, "TODO!");
int order = a->order; int order = a->order;
int count = 0; int count = 0;
......
...@@ -16,16 +16,16 @@ ...@@ -16,16 +16,16 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01 * $Created by: Li Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-02-11
* Paper review rebuttal of ACL2020 will start at this Thursday. So nervous :(
*/ */
#include "../../XTensor.h"
#include "../../XName.h" #include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h" #include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "SumDim.h"
#include "../math/ScaleAndShift.h"
#include "Sub.h" #include "Sub.h"
#include "Sub.cuh"
#include "SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -39,80 +39,7 @@ tensor subtraction c = a - b * \beta ...@@ -39,80 +39,7 @@ tensor subtraction c = a - b * \beta
*/ */
void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
CheckNTErrors(a && b && c, "Empty tensor input!"); _Sum(a, b, c, -beta);
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
CheckDev(a->devID, b->devID);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, b->devID);
#endif
if ((a->devID < 0 && b->devID >= 0) ||
(a->devID >= 0 && b->devID < 0) ||
(a->devID >= 0 && b->devID >= 0 && a->devID != b->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaSub(a, b, c, beta);
}
else
_CudaSub(a, b, c, beta);
#endif
}
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
DTYPE * ap = (DTYPE*)a->data;
DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] - bp[i] * beta;
cp[i + 1] = ap[i + 1] - bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] - bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] - bp[i + 3] * beta;
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] - bp[i] * beta;
cp[i + 1] = ap[i + 1] - bp[i + 1] * beta;
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] - bp[i] * beta;
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
} }
/* /*
...@@ -136,38 +63,24 @@ keep the result in the tensor a and return nothing ...@@ -136,38 +63,24 @@ keep the result in the tensor a and return nothing
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void SubMe(XTensor& a, const XTensor& b, DTYPE beta) void SubMe(XTensor & a, const XTensor & b, DTYPE beta)
{
_Sub(&a, &b, &a, beta);
}
/*
return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
>> a - a tensor
>> b - another tensor for subtraction
*/
int GetSubDimIndex(const XTensor &a, const XTensor &b)
{ {
if(a.order < b.order) if (b.order == 0){
return -1; DTYPE shift = -(b.Get0D() * beta);
if(IsSameShaped(a, b)) _ScaleAndShift(&a, &a, 1.0F, shift);
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
} }
else {
int n = GetBroadcastDimIndex(a, b);
if(hitCount == 1) if (n == -1)
return hitDim; /* call _Sub function */
_Sub(&a, &b, &a, beta);
else if (n >= 0 && n < a.order)
/* call _SumDim function to do the SubDim operation */
_SumDim(&a, &b, &a, n, -beta);
else else
return -1; ShowNTErrors("Something is wrong!");
}
} }
/* /*
...@@ -179,12 +92,17 @@ make a new tensor c to keep the result and return it ...@@ -179,12 +92,17 @@ make a new tensor c to keep the result and return it
>> beta - the scaling factor >> beta - the scaling factor
<< return - the result of tensor subtraction << return - the result of tensor subtraction
*/ */
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) XTensor Sub(const XTensor & a, const XTensor & b, DTYPE beta)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetSubDimIndex(a, b); if (b.order == 0){
DTYPE shift = -(b.Get0D() * beta);
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if(n == -1){ if(n == -1){
/* call _Sub function */ /* call _Sub function */
...@@ -197,8 +115,8 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -197,8 +115,8 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
} }
} }
else if(n >= 0 && n < a.order){ else if(n >= 0 && n < a.order){
/* call _SubDim function */ /* call _SumDim function to do the SubDim operation */
_SubDim(&a, &b, &c, n, beta); _SumDim(&a, &b, &c, n, -beta);
/* tensor connections */ /* tensor connections */
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
...@@ -210,7 +128,7 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -210,7 +128,7 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
else{ else{
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
return c; return c;
} }
...@@ -222,13 +140,18 @@ tensor subtraction c = a - b * \beta ...@@ -222,13 +140,18 @@ tensor subtraction c = a - b * \beta
>> c - where we put a-b*\beta. we save it in a if c is NULL >> c - where we put a-b*\beta. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) void Sub(const XTensor & a, const XTensor & b, XTensor & c, DTYPE beta)
{ {
if (!c.isInit || !IsSameShaped(a, c)) { if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a); InitTensorV2(&c, &a);
} }
int n = GetSubDimIndex(a, b); if (b.order == 0){
DTYPE shift = -(b.Get0D() * beta);
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) { if (n == -1) {
/* call _Sub function */ /* call _Sub function */
...@@ -241,8 +164,8 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) ...@@ -241,8 +164,8 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
} }
} }
else if (n >= 0 && n < a.order) { else if (n >= 0 && n < a.order) {
/* call _SubDim function */ /* call _SumDim function to do the SubDim operation */
_SubDim(&a, &b, &c, n, beta); _SumDim(&a, &b, &c, n, -beta);
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
/* tensor connections */ /* tensor connections */
...@@ -254,6 +177,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) ...@@ -254,6 +177,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
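To make the new dispatch in Sub concrete, here is a minimal usage sketch. It is illustrative only and not part of this commit; the shapes, the values, and the Set0D call are assumptions, and the initialization helpers are used with their default device arguments.
#include "Sub.h"
using namespace nts;
void SubUsageSketch()
{
    XTensor a, b, s;
    InitTensor2D(&a, 2, 3, X_FLOAT);   /* a: 2 x 3 matrix                          */
    InitTensor1D(&b, 3, X_FLOAT);      /* b: length-3 vector, broadcast over dim 1 */
    InitTensor0D(&s, X_FLOAT);         /* s: order-0 scalar                        */
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);
    s.Set0D(0.5F);                     /* assumed setter, counterpart of Get0D     */
    XTensor c1 = Sub(a, b);            /* broadcast path: routes to _SumDim(&a, &b, &c1, 1, -1.0F) */
    XTensor c2 = Sub(a, s, 2.0F);      /* scalar path: every entry shifted by -(0.5 * 2)           */
    XTensor c3 = Sub(a, a);            /* same shape: plain _Sub, c3 is all zeros                  */
}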
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
subtraction of data arrays (CUDA Kernel)
c = a - b * \beta
>> a - A matrix
>> b - another matrix
>> c - where we put a-b
>> size - the size of a/b/c
>> beta - the coefficient
*/
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] - b[i] * beta;
}
/*
tensor subtraction c = a - b * \beta (cuda version)
>> a - a tensor
>> b - another tensor
>> c - where we put a-b*\beta.
>> beta - the scaling factor
*/
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in subtraction!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Unmatched tensors in subtraction!");
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The tensors must be on the same device!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in subtraction!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
/* subtraction over arrays
tensor subtraction c = a - b * \beta (cuda version) with an input handle
>> devID - device ID (MUST >= 0)
>> handle - cuda handle
>> a - an array
>> b - another array
>> c - where we put a-b
>> size - size of the array
>> beta - the coefficient
*/
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
if (size == 0)
return;
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty arrays in subtraction!");
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if (c == a) {
/* axpy computes a += alpha * b, so the subtraction needs a negated coefficient */
DTYPE alpha = -beta;
#ifdef DOUBELPRICSION
cublasDaxpy(*handle, size, &alpha, b, 1, a, 1);
#else
cublasSaxpy(*handle, size, &alpha, b, 1, a, 1);
#endif
}
else {
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(devID, size, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB<<<blocks, threads>>>((DTYPE*)a, (DTYPE*)b, (DTYPE*)c, size, beta);
}
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#ifndef __SUB_CUH__
#define __SUB_CUH__
#include "Sub.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* subtraction of data arrays (CUDA Kernel) */
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) */
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) with an input handle */
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SUB_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include <math.h>
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if (beta == 0) {
_CopyValues(a, c);
return;
}
if (_IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSubDim(a, b, c, n, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE) {
int num = a->unitNum;
if (stride > 1) {
for (int i = 0, j = 0; i < num; i += stride, j++) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
DTYPE * cp = (DTYPE*)c->data + i;
for (int k = 0; k < stride; k++)
cp[k] = ap[k] - bv;
}
}
else if (stride == 1) {
DTYPE * bp = (DTYPE*)b->data;
for (int i = 0; i < num; i += blockSize) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if (beta == 1.0F) {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j];
}
else {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j] * beta;
}
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
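A worked example of the stride/blockSize/blockNum decomposition used in the CPU branch above, with an assumed shape that is not taken from the commit:
/* Assume a->dimSize = {2, 3, 4} and n = 1, so b holds 3 values.
   stride    = dimSize[2] = 4   (elements to the right of dimension n)
   blockSize = dimSize[1] = 3   (the broadcast dimension itself)
   blockNum  = dimSize[0] = 2   (elements to the left of dimension n)
   The stride > 1 branch then walks num = 24 elements in steps of 4:
   at step j, the 4 contiguous values a[j / 3][j % 3][0..3] all get
   b[j % 3] * beta subtracted, which is exactly c = a - b * beta
   broadcast along dimension 1. */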
/*
tensor subtraction (do it on site)
keep the result in the input tensor and return nothing
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
{
_SubDim(a, b, a, n, beta);
}
/*
tensor subtraction (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
<< return - the result tensor by tensor subtraction
*/
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
return c;
}
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "SubDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor subtraction of a tensor and a row vector
c = a - b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
/*
tensor subtraction (cuda version)
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_CUH__
#define __SUBDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting (cuda version) */
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_H__
#define __SUBDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
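A short usage sketch for the SubDim interface declared above, subtracting a per-column bias from a matrix. It is illustrative only; the shapes and values are assumptions.
#include "SubDim.h"
using namespace nts;
void SubDimUsageSketch()
{
    XTensor x, bias;
    InitTensor2D(&x, 4, 5, X_FLOAT);   /* 4 x 5 matrix         */
    InitTensor1D(&bias, 5, X_FLOAT);   /* one value per column */
    x.SetDataRand(0.0F, 1.0F);
    bias.SetDataRand(0.0F, 1.0F);
    /* y[i][j] = x[i][j] - bias[j]; n = 1 selects the column dimension */
    XTensor y = SubDim(x, bias, 1);
    /* in-place variant: x[i][j] -= 0.5 * bias[j] */
    _SubDim(&x, &bias, 1, 0.5F);
}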
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "../../XBLAS.h" #include "../../XBLAS.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h" #include "../shape/IsSameShaped.h"
#include "../math/ScaleAndShift.h"
#include "Sum.h" #include "Sum.h"
#include "Sum.cuh" #include "Sum.cuh"
#include "SumDim.h" #include "SumDim.h"
...@@ -93,7 +94,38 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -93,7 +94,38 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
AXPY(a->unitNum, beta, bp, 1, cp, 1); AXPY(a->unitNum, beta, bp, 1, cp, 1);
return; return;
} }
#else
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] + bp[i] * beta;
}
}
#endif #endif
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int * ap = (int*)a->data;
int * bp = (int*)b->data;
int * cp = (int*)c->data;
/* unrolling */ /* unrolling */
int num = a->unitNum; int num = a->unitNum;
if (num % 4 == 0) { if (num % 4 == 0) {
...@@ -149,38 +181,58 @@ keep the result in the tensor a and return nothing ...@@ -149,38 +181,58 @@ keep the result in the tensor a and return nothing
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void SumMe(XTensor& a, const XTensor& b, DTYPE beta) void SumMe(XTensor & a, const XTensor & b, DTYPE beta)
{ {
if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
_ScaleAndShift(&a, &a, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1)
/* call _Sum function */
_Sum(&a, &b, &a, beta); _Sum(&a, &b, &a, beta);
else if (n >= 0 && n < a.order)
/* call _SumDim function */
_SumDim(&a, &b, &a, n, beta);
else
ShowNTErrors("Something is wrong!");
}
} }
/* /*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h) return a dimension if the operation is performed as broadcast(e.g. SumDim function)
>> a - a tensor >> a - a tensor
>> b - another tensor for sum >> b - another tensor for operation
*/ */
int GetSumDimIndex(const XTensor &a, const XTensor &b) int GetBroadcastDimIndex(const XTensor & a, const XTensor & b)
{ {
if(a.order < b.order) if(a.order < b.order)
return -1; return -1;
if(IsSameShaped(a, b)) if(IsSameShaped(a, b))
return -1; return -1;
int hitCount = 0;
int hitDim = -1; int hitDim = -1;
bool isHit = false;
for(int i = 0; i < b.order; i++){ for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1) if(b.dimSize[b.order - 1 - i] == 1)
continue; continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){ else {
hitCount++; if (isHit == true)
hitDim = a.order - b.order + i; return -1;
else
isHit = true;
for (int j = 0; j < a.order; j++){
if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - j]){
hitDim = a.order - 1 - j;
break;
}
}
} }
} }
if(hitCount == 1)
return hitDim; return hitDim;
else
return -1;
} }
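A few assumed shape pairs and the index the new GetBroadcastDimIndex reports (illustration only, not from the commit):
/* a-shape        b-shape        returned n
   (8, 16, 32)    (1, 16, 1)      1   exactly one non-1 dimension in b, matched to a's dim 1
   (8, 16, 32)    (32)            2   b is a vector matching a's last dimension
   (8, 16, 32)    (8, 16, 32)    -1   same shape, handled by plain _Sum / _Sub
   (8, 16, 32)    (16, 32)       -1   two non-1 dimensions, not a single-dim broadcast */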
/* /*
...@@ -192,12 +244,17 @@ make a new tensor c to keep the result and return it ...@@ -192,12 +244,17 @@ make a new tensor c to keep the result and return it
>> beta - the scaling factor >> beta - the scaling factor
<< return - the result of tensor summation << return - the result of tensor summation
*/ */
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) XTensor Sum(const XTensor & a, const XTensor & b, DTYPE beta)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetSumDimIndex(a, b); if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if(n == -1){ if(n == -1){
/* call _Sum function */ /* call _Sum function */
...@@ -223,7 +280,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -223,7 +280,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
else{ else{
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
return c; return c;
} }
...@@ -234,13 +291,18 @@ tensor summation c = a + b * \beta ...@@ -234,13 +291,18 @@ tensor summation c = a + b * \beta
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) void Sum(const XTensor & a, const XTensor & b, XTensor & c, DTYPE beta)
{ {
if (!c.isInit || !IsSameShaped(a, c)) { if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a); InitTensorV2(&c, &a);
} }
int n = GetSumDimIndex(a, b); if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) { if (n == -1) {
/* call _Sum function */ /* call _Sum function */
...@@ -266,6 +328,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) ...@@ -266,6 +328,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
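The summation side mirrors the rework of Sub: a 0-d operand is folded into a shift, while a broadcastable operand routes through _SumDim. A minimal equivalence sketch follows; the caller-supplied tensors are assumptions.
#include "Sum.h"
#include "../math/ScaleAndShift.h"
using namespace nts;
/* s is assumed to be an order-0 (scalar) tensor */
void SumScalarSketch(const XTensor & a, const XTensor & s, DTYPE beta)
{
    XTensor c1 = Sum(a, s, beta);                            /* new scalar path          */
    XTensor c2 = ScaleAndShift(a, 1.0F, s.Get0D() * beta);   /* computes the same values */
}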
...@@ -45,6 +45,15 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta) ...@@ -45,6 +45,15 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
c[i] = a[i] + b[i] * beta; c[i] = a[i] + b[i] * beta;
} }
__global__
void KernelADD(int * a, int * b, int * c, int size, int beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * beta;
}
/* /*
tensor summation c = a + b * \beta (cuda version) tensor summation c = a + b * \beta (cuda version)
>> a - a tensor >> a - a tensor
...@@ -100,6 +109,17 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -100,6 +109,17 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta); KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
} }
} }
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelADD << <blocks, threads >> >((int*)a->data, (int*)b->data, (int*)c->data, a->unitNum, (int)beta);
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -26,6 +26,9 @@ ...@@ -26,6 +26,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* return a dimension if the operation is performed as broadcast(e.g. SumDim function) */
int GetBroadcastDimIndex(const XTensor & a, const XTensor & b);
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......
...@@ -220,7 +220,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta ...@@ -220,7 +220,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
{ {
CheckNTErrors(a->order == b->order, "Wrong tensor orders!"); CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!"); CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!"); CheckNTErrors(a->order >= 0, "TODO!");
int order = a->order; int order = a->order;
int count = 0; int count = 0;
......
...@@ -45,10 +45,10 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper) ...@@ -45,10 +45,10 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
#endif #endif
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!"); CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data; if (a->dataType == DEFAULT_DTYPE) {
DTYPE * db = (DTYPE*)b->data; DTYPE* d = (DTYPE*)a->data;
DTYPE* db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) { for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper) if (d[i] > upper)
db[i] = upper; db[i] = upper;
...@@ -57,6 +57,21 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper) ...@@ -57,6 +57,21 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
else else
db[i] = d[i]; db[i] = d[i];
} }
}
else if (a->dataType == X_INT) {
int* d = (int*)a->data;
int* db = (int*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
}
}
else
ShowNTErrors("TODO!");
} }
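An assumed example of the new integer path in _Clip (the values are not from the commit):
/* a = {-3, 0, 7, 12} stored as X_INT, lower = 0, upper = 10:
   _Clip(&a, &b, 0, 10) now fills b with {0, 0, 7, 10};
   before this change the call was rejected by the DEFAULT_DTYPE check. */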
/* /*
......
...@@ -36,8 +36,9 @@ set each entry to its clip value (CUDA Kernel) ...@@ -36,8 +36,9 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border >> upper - the upper border
>> size - size of the data array >> size - size of the data array
*/ */
template <class T>
__global__ __global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size) void KernelClip(T * a, T * b, T lower, T upper, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -90,10 +91,16 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper) ...@@ -90,10 +91,16 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum); KernelClip<DTYPE> << <blocks, threads >> >((DTYPE *)a->data, (DTYPE *)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip<int> << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum); ShowNTErrors("TODO!");
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */ /* set each entry to its clip value (CUDA Kernel) */
__global__ template <class T> __global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size); void KernelClip(T * a, T * b, T lower, T upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/ /* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__ __global__
......
...@@ -47,11 +47,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -47,11 +47,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
return; return;
} }
#endif #endif
if (a->dataType == DEFAULT_DTYPE) {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
/* sparse tensor */ /* sparse tensor */
if(a->isSparse){ if(a->isSparse) {
int num = a->unitNumNonZero; int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int); char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int); char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
...@@ -66,7 +64,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -66,7 +64,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
} }
} }
/* dense tensor */ /* dense tensor */
else{ else {
DTYPE * va = (DTYPE*)a->data; DTYPE * va = (DTYPE*)a->data;
DTYPE * vb = (DTYPE*)b->data; DTYPE * vb = (DTYPE*)b->data;
for(int i = 0; i < b->unitNum; i++){ for(int i = 0; i < b->unitNum; i++){
...@@ -75,6 +73,36 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -75,6 +73,36 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
vb++; vb++;
} }
} }
}
else if (a->dataType == X_INT) {
/* sparse tensor */
if(a->isSparse) {
int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
char * db = (char*)b->data + sizeof(int);
char * fb = db + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
for(int i = 0; i < num; i++){
int * v = (int*)f;
int * vb = (int*)fb;
*vb = *v * scale + shift;
f += sizeof(int) + sizeof(int);
fb += sizeof(int) + sizeof(int);
}
}
/* dense tensor */
else {
int * va = (int*)a->data;
int * vb = (int*)b->data;
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
va++;
vb++;
}
}
}
else
ShowNTErrors("TODO!");
} }
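An assumed example of the new integer path in _ScaleAndShift (the values are not from the commit):
/* a = {1, 2, 3} stored as X_INT, scale = 2.0F, shift = 5.0F:
   _ScaleAndShift(&a, &b, 2.0F, 5.0F) fills b with {7, 9, 11}.
   The product *va * scale is computed in floating point and truncated
   back to int on assignment, so fractional scales round toward zero. */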
/* /*
......
...@@ -34,9 +34,9 @@ scale and shift all tensor entires b = a * scale + shift (CUDA Kernel) ...@@ -34,9 +34,9 @@ scale and shift all tensor entires b = a * scale + shift (CUDA Kernel)
>> scale - how much we want to scale it >> scale - how much we want to scale it
>> shift - how much we want to shift it >> shift - how much we want to shift it
*/ */
template<bool isUnitScale, bool isZeroShift> template<class T, bool isUnitScale, bool isZeroShift>
__global__ __global__
void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift) void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -108,13 +108,26 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift ...@@ -108,13 +108,26 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
if(a->dataType == DEFAULT_DTYPE){ if(a->dataType == DEFAULT_DTYPE){
if(scale == 1.0F && shift == 0) if(scale == 1.0F && shift == 0)
KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift); KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if (scale == 1.0F && shift != 0) else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift); KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if(scale != 1.0F && shift == 0) else if(scale != 1.0F && shift == 0)
KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift); KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else else
KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift); KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
}
else if (a->dataType == X_INT) {
int scale2 = int(scale);
int shift2 = int(shift);
if (scale == 1.0F && shift == 0)
KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale != 1.0F && shift == 0)
KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else
KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
} }
else if(a->dataType == X_FLOAT16){ else if(a->dataType == X_FLOAT16){
unsigned short scale2 = FloatToFloat16(scale); unsigned short scale2 = FloatToFloat16(scale);
......
...@@ -146,7 +146,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, ...@@ -146,7 +146,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions"); CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions");
} }
else { else {
CheckNTErrors(t->GetDim(i) == indexSize * copyNum, "Unmatched dimensions"); CheckNTErrors(t->GetDim(i) >= indexSize * copyNum, "Unmatched dimensions");
} }
} }
......
...@@ -43,12 +43,43 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -43,12 +43,43 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!"); CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!"); CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->dataType == X_INT), "The index tensor should be INT type!");
CheckNTErrors((srcIndex->order == s->order), "index's order should be the same with source's");
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) { if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim); _CudaGather(s, t, srcIndex, dim);
return; return;
} }
#endif #endif
int stride = 1;
int blockNum = 1;
for (int i = dim + 1; i < s->order; ++i)
{
stride *= s->GetDim(i);
}
for (int i = 0; i < dim; ++i)
{
blockNum *= s->GetDim(i);
}
int indexStrideNum = srcIndex->GetDim(dim);
int srcStrideNum = stride * s->GetDim(dim);
int tgtBlockSize = stride * indexStrideNum;
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndexData = (int*)srcIndex->data;
for (int blockIndex = 0; blockIndex < blockNum; ++blockIndex)
{
for (int i = 0; i < indexStrideNum; i++) {
for (int j = 0; j < stride; j++)
{
int sIndex = sIndexData[i * stride + blockIndex * indexStrideNum + j] * stride + blockIndex * srcStrideNum + j;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
int tIndex = i * stride + blockIndex * tgtBlockSize + j;
tData[tIndex] = sData[sIndex];
}
}
}
} }
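A worked example of the dimension-wise gather added above (the shapes and index values are assumptions):
/* s is a 2 x 4 matrix, dim = 1, srcIndex is a 2 x 2 X_INT tensor:
       s        = [[10, 11, 12, 13],      srcIndex = [[3, 0],
                   [20, 21, 22, 23]]                  [1, 2]]
   Then stride = 1, blockNum = 2, indexStrideNum = 2, srcStrideNum = 4, and
       t        = [[13, 10],
                   [21, 22]]
   i.e. row r of t picks, from row r of s, the columns listed in row r of srcIndex. */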
/* /*
...@@ -64,13 +95,14 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -64,13 +95,14 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!"); CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
if (s->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex); _CudaGather(s, t, srcIndex);
return; #else
} ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif #endif
}
else {
int stride = 1; int stride = 1;
int indexSize = 1; int indexSize = 1;
...@@ -83,9 +115,11 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -83,9 +115,11 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) { for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride; int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
for (int j = 0; j < stride; j++) for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j]; tData[i * stride + j] = sData[sIndex + j];
} }
}
} }
/* /*
......
...@@ -77,7 +77,7 @@ gather indexed sub-tensors(cuda version) ...@@ -77,7 +77,7 @@ gather indexed sub-tensors(cuda version)
>> blockNum - block size of data >> blockNum - block size of data
*/ */
__global__ __global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum) void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum, int srcStrideNum)
{ {
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y; int idy = blockDim.y * blockIdx.y + threadIdx.y;
...@@ -90,7 +90,7 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int st ...@@ -90,7 +90,7 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int st
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock; for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size; i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) { i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]]; tData[i] = sData[sIndex[i] * stride + stride * srcStrideNum * blockIndex + offsetInBlock];
} }
} }
...@@ -126,14 +126,30 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -126,14 +126,30 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
int * sIndex = NULL; int * sIndex = NULL;
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
int * sIndexData = (int*)srcIndex->data;
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ? sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize); (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize); XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
} }
else else {
int * sIndexData = new int[sizeof(int) * indexSize];
XMemCopy(sIndexData, -1, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = (int *)srcIndex->data; sIndex = (int *)srcIndex->data;
delete[] sIndexData;
}
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride); KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
...@@ -163,6 +179,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -163,6 +179,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int blockNum = 1; int blockNum = 1;
int indexSize = srcIndex->unitNum; int indexSize = srcIndex->unitNum;
int strideNum = srcIndex->dimSize[dim]; int strideNum = srcIndex->dimSize[dim];
int srcStrideNum = s->dimSize[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
blockNum *= srcIndex->dimSize[i]; blockNum *= srcIndex->dimSize[i];
for (int i = dim + 1; i < srcIndex->order; i++) for (int i = dim + 1; i < srcIndex->order; i++)
...@@ -170,19 +187,33 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -170,19 +187,33 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int * sIndex = NULL; int * sIndex = NULL;
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
int * sIndexData = (int*)srcIndex->data;
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ? sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize); (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize); XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
} }
else else {
int * sIndexData = new int[sizeof(int) * indexSize];
XMemCopy(sIndexData, -1, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = (int *)srcIndex->data; sIndex = (int *)srcIndex->data;
delete[] sIndexData;
}
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum, srcStrideNum);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim) ...@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength); \ vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength); \
} \ } \
for (int j = 1; j < strideNum / 32; j++) { \ for (int j = 1; j < strideNum / 32; j++) { \
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); \ const DTYPE* ptr = (DTYPE*)(ip + j * 4 * vecBufLength); \
vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \ vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \ vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \ vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
...@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim) ...@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
else { \ else { \
/* data is separated */ \ /* data is separated */ \
for(int i = 0; i < blockNum; i++){ \ for(int i = 0; i < blockNum; i++){ \
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ \ for(int j = 0; j < stride / 32; j++){ \
DTYPE * ip = (DTYPE*)input->data + blockSize * i; \ DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
DTYPE * op = (DTYPE*)output->data + stride * i; \ DTYPE * op = (DTYPE*)output->data + stride * i; \
VectorBuffer vecBuf[4]; \ VectorBuffer vecBuf[4]; \
......
...@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim) ...@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
int num = input->dimSize[dim]; int num = input->dimSize[dim];
_ReduceSum(input, output, dim); _ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0); _ScaleAndShiftMe(output, 1.0F/(DTYPE)(num), 0);
} }
/* /*
......
...@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias); vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias);
} }
for(int j = 1; j < strideNum / 32; j++){ for(int j = 1; j < strideNum / 32; j++){
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); const DTYPE* ptr = (DTYPE*)(ip + (j * 4) * vecBufLength);
vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias); vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias); vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias);
vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias); vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias);
...@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{ } else{
//data is separated //data is separated
for(int i = 0; i < blockNum; i++){ for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ for(int j = 0; j < stride / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i; DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i; DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL; DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
...@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} }
VectorBuffer vecBuf[4]; VectorBuffer vecBuf[4];
for(int k = 0; k < 4; k++){ for(int k = 0; k < 4; k++){
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + j * 32 / sizeof(DTYPE)); vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + k * 32 / sizeof(DTYPE));
} }
for(int k = 1; k < strideNum; k++){ for(int k = 1; k < strideNum; k++){
DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength; DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
......
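The offset fixes above in the vectorized reduction loops share the same arithmetic; a small sketch with the sizes the code already uses:
/* Each VectorBuffer holds 32 / sizeof(DTYPE) elements (8 when DTYPE is float),
   and one iteration of the unrolled loop consumes 4 buffers, i.e. 32 floats.
   Iteration j must therefore start at
       ip + j * 4 * vecBufLength          (= ip + 32 * j floats)
   rather than ip + j * vecBufLength, which made successive iterations read
   overlapping blocks: some elements were reduced more than once while the
   tail of the row was never touched. */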
...@@ -16,11 +16,12 @@ ...@@ -16,11 +16,12 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-01-09
*/ */
#include "ReduceSumAll.h" #include "ReduceSumAll.h"
#include "ReduceSum.h" #include "ReduceSum.h"
#include "../../XName.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -42,55 +43,70 @@ int * getDimSize(const XTensor * tensor, int n) ...@@ -42,55 +43,70 @@ int * getDimSize(const XTensor * tensor, int n)
/* /*
sum all the items of the tensor (It should be optimized!) sum all the items of the tensor (It should be optimized!)
>> source - the input tensor >> source - the input tensor
<< return - the total summation << target - the total summation
*/ */
DTYPE _ReduceSumAll(const XTensor * source) void _ReduceSumAll(const XTensor * source, XTensor * target)
{ {
int dims[2] = {1, source->unitNum}; CheckNTErrors((source->devID == target->devID || (source->devID < 0 && target->devID < 0)),
int one = 1; "This code must be run on the same device!");
CheckNTErrors((source && target), "Empty input or output tensors!");
CheckNTErrors((target->order == 0), "Incorrect target tensor sizes!");
CheckNTErrors((target->unitNum == 1), "Illegal dimension to reduce!");
CheckNTErrors((source->dataType == target->dataType), "Unmatched data types!");
XTensor * all = NewTensorBufV2(2, dims, source->dataType, source->denseRatio, source->devID, source->mem); int dims[1] = {source->unitNum};
XTensor * result = NewTensorBufV2(1, &one, source->dataType, 1.0F, source->devID, source->mem);
_CopyValues(source, all); XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_ReduceSum(all, result, 1);
DTYPE r = result->Get1D(0); _CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(result);
DelTensorBuf(all); DelTensorBuf(all);
}
return r; /*
sum all the items of the tensor (It should be optimized!)
/*int order = source->order; >> source - the input tensor
DTYPE summation; << value - the total summation
*/
XTensor * big = NewTensor(source); void _ReduceSumAll(const XTensor * source, DTYPE * value)
_CopyValues(source, big); {
for(int i = order - 1; i >= 0; i--) { int * dimSize = new int[MAX_TENSOR_DIM_NUM];
if(i == 0) float dr = (!source->isSparse) ? 1.0F : source->denseRatio;
big->Reshape(1, big->unitNum); XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
int leadingDim = big->order - 1; /* call _ReduceSum function */
int * dimSize; _ReduceSumAll(source, target);
dimSize = getDimSize(big, leadingDim); *value = target->Get0D();
XTensor * little = NewTensorV2(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
_ReduceSum(big, little, leadingDim); DelTensorBuf(target);
}
delete big; /*
delete dimSize; sum all the items of the tensor
>> source - the input tensor
<< return - the total summation
*/
XTensor ReduceSumAll(const XTensor & source)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source.isSparse) ? 1.0F : source.denseRatio;
XTensor target(0, dimSize, source.dataType, dr, source.devID, source.mem);
target.SetTMPFlag();
big = NewTensor(little); /* call _ReduceSum function */
_CopyValues(little, big); _ReduceSumAll(&source, &target);
delete little; /* tensor connection */
if (source.enableGrad) {
XLink::MakeLink(&source, NULL, &target, REDUCE_REDUCESUMALL);
} }
summation = big->Get1D(0);
delete big;
return summation;*/ /* destroy variables */
delete[] dimSize;
return target;
} }
/* /*
...@@ -98,9 +114,11 @@ sum all the items of the tensor ...@@ -98,9 +114,11 @@ sum all the items of the tensor
>> source - the input tensor >> source - the input tensor
<< return - the total summation << return - the total summation
*/ */
DTYPE ReduceSumAll(const XTensor & source) DTYPE ReduceSumAllValue(const XTensor & source)
{ {
return _ReduceSumAll(&source); XTensor target;
target = ReduceSumAll(source);
return target.Get0D();
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-01-09
*/ */
...@@ -28,10 +28,16 @@ ...@@ -28,10 +28,16 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */ /* sum all the items of the tensor */
DTYPE _ReduceSumAll(const XTensor * source); void _ReduceSumAll(const XTensor * source, XTensor * target);
/* sum all the items of the tensor */ /* sum all the items of the tensor */
DTYPE ReduceSumAll(const XTensor & source); void _ReduceSumAll(const XTensor * source, DTYPE * target);
/* sum all the items of the tensor */
XTensor ReduceSumAll(const XTensor & source);
/* sum all the items of the tensor */
DTYPE ReduceSumAllValue(const XTensor & source);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
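A short sketch of the reworked ReduceSumAll interface declared above; the loss tensor passed in is an assumption.
#include "ReduceSumAll.h"
using namespace nts;
void ReduceSumAllSketch(const XTensor & loss)
{
    /* tensor-valued form: an order-0 result that can stay in the
       computation graph when loss.enableGrad is set */
    XTensor total = ReduceSumAll(loss);
    /* scalar convenience form for call sites that only need the number */
    DTYPE value = ReduceSumAllValue(loss);
}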
...@@ -95,7 +95,7 @@ XTensor Stack(const TensorList &smalls, int dim) ...@@ -95,7 +95,7 @@ XTensor Stack(const TensorList &smalls, int dim)
if (i < dim) if (i < dim)
dimSize[i] = tensor->GetDim(i); dimSize[i] = tensor->GetDim(i);
else if (i > dim) else if (i > dim)
dimSize[i] = tensor->GetDim(i-1); dimSize[i] = tensor->GetDim(i - 1);
else if (i == dim) else if (i == dim)
dimSize[i] = count; dimSize[i] = count;
} }
...@@ -160,7 +160,7 @@ void Stack(const TensorList &smalls, XTensor &t, int dim) ...@@ -160,7 +160,7 @@ void Stack(const TensorList &smalls, XTensor &t, int dim)
if (i < dim) if (i < dim)
dimSize[i] = tensor->GetDim(i); dimSize[i] = tensor->GetDim(i);
else if (i > dim) else if (i > dim)
dimSize[i] = tensor->GetDim(i-1); dimSize[i] = tensor->GetDim(i - 1);
else if (i == dim) else if (i == dim)
dimSize[i] = count; dimSize[i] = count;
} }
......
...@@ -31,7 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -31,7 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _Stack(const TensorList * smalls, XTensor * t, int dim); void _Stack(const TensorList * smalls, XTensor * t, int dim);
/* stack small tensors into a big tensor along with a dimension (return an XTensor structure) */ /* stack small tensors into a big tensor along with a dimension (return an XTensor structure) */
XTensor Stack(const TensorList &list, int leadingDim); XTensor Stack(const TensorList &list, int dim);
/* stack small tensors into a big tensor along with a dimension */ /* stack small tensors into a big tensor along with a dimension */
void Stack(const TensorList &smalls, XTensor &t, int dim); void Stack(const TensorList &smalls, XTensor &t, int dim);
......
...@@ -35,8 +35,9 @@ get the top-k items along a given dimension ...@@ -35,8 +35,9 @@ get the top-k items along a given dimension
>> index - index of the top-k items >> index - index of the top-k items
>> dim - the dimension along which the sorting is performed >> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting >> k - how many items returned after sorting
>> isSorted - indicates whether the k items are sorted
*/ */
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted)
{ {
dim = MODX(dim, a->order); dim = MODX(dim, a->order);
...@@ -58,7 +59,7 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -58,7 +59,7 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
if (a->devID >= 0 || b->devID >= 0) { if (a->devID >= 0 || b->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
_CudaTopK(a, b, index, dim, k); _CudaTopK(a, b, index, dim, k, isSorted);
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
...@@ -116,15 +117,16 @@ get the top-k items along a given dimension ...@@ -116,15 +117,16 @@ get the top-k items along a given dimension
>> index - index of the top-k items >> index - index of the top-k items
>> dim - the dimension along which the sorting is performed >> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting >> k - how many items returned after sorting
>> isSorted - indicates whether the k items are returned in sorted order
*/ */
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k) void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k, bool isSorted)
{ {
dim = MODX(dim, a.order); dim = MODX(dim, a.order);
if(a.dimSize[dim] <= k) if(a.dimSize[dim] <= k)
_Sort(&a, &b, &index, dim); _Sort(&a, &b, &index, dim);
else else
_TopK(&a, &b, &index, dim, k); _TopK(&a, &b, &index, dim, k, isSorted);
/* tensor connection */ /* tensor connection */
//TensorList list(2); //TensorList list(2);
......
...@@ -374,9 +374,10 @@ get the top-k items ...@@ -374,9 +374,10 @@ get the top-k items
>> minValue - min value of an item >> minValue - min value of an item
>> output - the output data array >> output - the output data array
>> index - the output index array >> index - the output index array
>> isSorted - indicates whether the k items are returned in sorted order
*/ */
template<class T> __global__ template<class T> __global__
void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int * index) void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int * index, bool isSorted)
{ {
__shared__ CudaHeapNode<T> heapData[(SHARED_MEMORY_SIZE - 512 * sizeof(T)) / sizeof(CudaHeapNode<T>)]; __shared__ CudaHeapNode<T> heapData[(SHARED_MEMORY_SIZE - 512 * sizeof(T)) / sizeof(CudaHeapNode<T>)];
__shared__ T eachHeapMaxValue[512]; __shared__ T eachHeapMaxValue[512];
...@@ -479,11 +480,24 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi ...@@ -479,11 +480,24 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
int offset = stride * k * blockIndex + offsetInBlock; int offset = stride * k * blockIndex + offsetInBlock;
T * dOutput = output + offset; T * dOutput = output + offset;
int * indexOutput = index + offset; int * indexOutput = index + offset;
for (int q = 0; q < k; ++q){ if (isSorted)
{
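/* note added for clarity (assumption): ansHeapData is a min-heap over the k
   candidates, so its root holds the smallest of them; popping the root
   repeatedly and filling positions k-1 down to 0 yields the k items in
   descending order */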
for (int q = k - 1; q >= 0; q--) {
dOutput[stride * q] = ansHeapData.items[0].value;
indexOutput[stride * q] = ansHeapData.items[0].index;
ansHeapData.items[0] = ansHeapData.items[ansHeapData.count - 1];
ansHeapData.count--;
ansHeapData.Down(0);
}
}
else
{
for (int q = 0; q < k; ++q) {
dOutput[stride * q] = ansHeapData.items[q].value; dOutput[stride * q] = ansHeapData.items[q].value;
indexOutput[stride * q] = ansHeapData.items[q].index; indexOutput[stride * q] = ansHeapData.items[q].index;
} }
} }
}
} }
...@@ -803,8 +817,9 @@ get the top-k items along a given dimension ...@@ -803,8 +817,9 @@ get the top-k items along a given dimension
>> index - index of the top-k items >> index - index of the top-k items
>> dim - the dimension along which the sorting is performed >> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting >> k - how many items returned after sorting
>> isSorted - indicates whether the k items are returned in sorted order
*/ */
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted)
{ {
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched input tensors!"); CheckNTErrors((a->unitSize == b->unitSize), "Unmatched input tensors!");
CheckNTErrors((a->order == b->order), "Unmatched input tensors!"); CheckNTErrors((a->order == b->order), "Unmatched input tensors!");
...@@ -846,7 +861,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -846,7 +861,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
KernelTopK3<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> KernelTopK3<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)a->data, stride, strideNumA, blockNum, k, DTYPE_MIN, ((DTYPE*)a->data, stride, strideNumA, blockNum, k, DTYPE_MIN,
(DTYPE*)b->data, (int*)index->data); (DTYPE*)b->data, (int*)index->data, isSorted);
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
...@@ -882,6 +897,10 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -882,6 +897,10 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
KernelTopKRadixSelect<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> (goutput, stride, strideNumA, blockNum, k, DTYPE_MIN, (DTYPE *)b->data, (int *)index->data, stride * strideNumA * blockNum); KernelTopKRadixSelect<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> (goutput, stride, strideNumA, blockNum, k, DTYPE_MIN, (DTYPE *)b->data, (int *)index->data, stride * strideNumA * blockNum);
deconvert2floatV2 <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> ((unsigned int *)a->data, (float *)goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride); deconvert2floatV2 <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> ((unsigned int *)a->data, (float *)goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride);
if (isSorted)
{
ShowNTErrors("TODO!");
}
} }
} }
......
...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k); void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,10 +27,10 @@ ...@@ -27,10 +27,10 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k); void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted = false);
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k); void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k, bool isSorted = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
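A brief usage sketch (not part of the patch) for the extended interface; shapes, values, and names are illustrative, and isSorted keeps its default of false when omitted:

/* top-2 items of a (2, 4) tensor along dimension 1, returned in sorted order */
int sDim[2] = {2, 4};
int tDim[2] = {2, 2};
DTYPE sData[2][4] = { {0.3F, 0.9F, 0.1F, 0.5F},
                      {0.7F, 0.2F, 0.8F, 0.4F} };

XTensor * a = NewTensorV2(2, sDim);
XTensor * b = NewTensorV2(2, tDim);
XTensor * index = NewTensorV2(2, tDim, X_INT);
a->SetData(sData, 8);

TopK(*a, *b, *index, 1, 2, true);

delete a;
delete b;
delete index;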
......
...@@ -78,6 +78,8 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem) ...@@ -78,6 +78,8 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
if(m->mem == NULL) if(m->mem == NULL)
delete[] (char*)m->data; delete[] (char*)m->data;
else
m->mem->Release(m->data, m->GetDataSizeInChar(), m->signature);
m->dataHost = NULL; m->dataHost = NULL;
m->data = GPUData + p; m->data = GPUData + p;
...@@ -94,7 +96,36 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem) ...@@ -94,7 +96,36 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
#endif #endif
} }
/* copy the data from GPU memory to CPU memory */ /* copy the data from GPU memory to CPU memory (memory pool) */
void CudaGPUToCPUFlush(XTensor * tensor, int devID, XMem * CPUMem)
{
#ifdef USE_CUDA
CheckNTErrors((tensor->devID >= 0), "Cannot do cpu-flush on matrices that are already on CPU.");
/* compute the required memory size */
int size = 0;
if (tensor->isSparse)
size = sizeof(int) + (sizeof(int) + tensor->unitSize) * tensor->unitNumNonZero;
else
size = tensor->unitSize * tensor->unitNum;
char * CPUData = CPUMem != NULL ? (char*)CPUMem->Alloc(CPUMem->devID, size):
(char*)XMemAlloc(devID, size);
/* copy from GPU memory to CPU memory */
cudaMemcpy(CPUData, tensor->data, size, cudaMemcpyDeviceToHost);
if (tensor->dataHost != NULL)
delete[](char*)tensor->dataHost;
tensor->dataHost = NULL;
tensor->mem->Release(tensor->data, tensor->GetDataSizeInChar(), tensor->signature);
tensor->data = CPUData;
tensor->devID = CPUMem != NULL ? CPUMem->devID : devID;
tensor->mem = CPUMem;
#endif
}
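A hedged sketch (not part of the patch) of how the new overload might be called; gpuMem is an assumed, pre-built memory pool on GPU 0, and -1 stands for the CPU device:

void FlushExample(XMem * gpuMem)
{
    int dims[2] = {2, 4};
    XTensor t;

    /* a tensor allocated from the GPU memory pool */
    InitTensorV2(&t, 2, dims, X_FLOAT, 1.0F, 0, gpuMem);

    /* ... fill and use t on the GPU ... */

    /* flush the data back to the host; passing NULL for the CPU pool
       makes the function allocate ordinary CPU memory instead */
    CudaGPUToCPUFlush(&t, -1, NULL);
}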
/* copy the data from GPU memory to CPU memory (dataHost) and do not delete the data */
void CudaGPUToCPUFlush(XTensor * tensor) void CudaGPUToCPUFlush(XTensor * tensor)
{ {
CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type."); CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
......
...@@ -31,7 +31,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -31,7 +31,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* flush a list of XTensor to GPU memory */ /* flush a list of XTensor to GPU memory */
void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem); void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem);
/* copy the data from GPU memory to CPU memory */ /* copy the data from GPU memory to CPU memory (memory pool) */
void CudaGPUToCPUFlush(XTensor * tensor, int devID, XMem * CPUMem);
/* copy the data from GPU memory to CPU memory (dataHost) and do not delete the data */
void CudaGPUToCPUFlush(XTensor * tensor); void CudaGPUToCPUFlush(XTensor * tensor);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -293,7 +293,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -293,7 +293,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!"); CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
CheckNTErrors((gold != NULL), "The gold standard cannot be empty!"); CheckNTErrors((gold != NULL || lossName == NOLOSS), "The gold standard cannot be empty!");
if(leadDim < 0) if(leadDim < 0)
leadDim = y->order - 1; leadDim = y->order - 1;
......
...@@ -33,6 +33,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim); ...@@ -33,6 +33,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */ /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */
XTensor LogSoftmax(const XTensor &x, int leadDim); XTensor LogSoftmax(const XTensor &x, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y given) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim); void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y given) */ /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y given) */
......
...@@ -358,21 +358,21 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold, ...@@ -358,21 +358,21 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim); _CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossBuf); _ReduceSumAll(lossBuf, &loss);
if(reduceWay == REDUCE_MEAN) { if(reduceWay == REDUCE_MEAN) {
int nonZeroNum; DTYPE nonZeroNum;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = lossBuf->unitNum; nonZeroNum = (DTYPE)lossBuf->unitNum;
} }
else { else {
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem); XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp); _ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp); DelTensorBuf(tmp);
} }
loss = loss / (DTYPE)nonZeroNum; loss = loss / nonZeroNum;
} }
else if(reduceWay == REDUCE_SUM) { else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */ /* don't need to do anything */
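For reference, the mean reduction above (unchanged by this patch except that the counter is now accumulated as DTYPE rather than int) computes

loss = (\sum_i loss_i) / nonZeroNum, where nonZeroNum = \sum_i [padding_i != 0] when a padding tensor is given and nonZeroNum = lossBuf->unitNum otherwise.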
...@@ -675,8 +675,9 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, ...@@ -675,8 +675,9 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
if(padding != NULL) { if(padding != NULL) {
XTensor * tmp = NewTensor(padding); XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp); DTYPE nonZeroNum;
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); _ReduceSumAll(tmp, &nonZeroNum);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/nonZeroNum);
delete tmp; delete tmp;
} }
else { else {
......
...@@ -123,21 +123,21 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -123,21 +123,21 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
_CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim); _CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossBuf); _ReduceSumAll(lossBuf, &loss);
if(reduceWay == REDUCE_MEAN) { if(reduceWay == REDUCE_MEAN) {
int nonZeroNum; DTYPE nonZeroNum;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = lossBuf->unitNum; nonZeroNum = (DTYPE)lossBuf->unitNum;
} }
else { else {
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem); XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp); _ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp); DelTensorBuf(tmp);
} }
loss = loss / (DTYPE)nonZeroNum; loss = loss / nonZeroNum;
} }
else if(reduceWay == REDUCE_SUM) { else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */ /* don't need to do anything */
...@@ -199,8 +199,9 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, ...@@ -199,8 +199,9 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
if(padding != NULL) { if(padding != NULL) {
XTensor * tmp = NewTensor(padding); XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp); DTYPE nonZeroNum;
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); _ReduceSumAll(tmp, &nonZeroNum);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/nonZeroNum);
delete tmp; delete tmp;
} }
else { else {
......
...@@ -606,7 +606,7 @@ bool TestCopyIndexed4() ...@@ -606,7 +606,7 @@ bool TestCopyIndexed4()
/* /*
case 5: copy indexed sub-tensors case 5: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 1, In this case, (3, 2, 3) -> (3, 2, 4), dim = 2, indexSize = 2,
srcIndex = [0, 1], tgtIndex = [0, 2], copyNum = 2. srcIndex = [0, 1], tgtIndex = [0, 2], copyNum = 2.
*/ */
bool TestCopyIndexed5() bool TestCopyIndexed5()
...@@ -622,7 +622,7 @@ bool TestCopyIndexed5() ...@@ -622,7 +622,7 @@ bool TestCopyIndexed5()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
/* a output tensor of size (3, 2, 2) */ /* a output tensor of size (3, 2, 4) */
int tOrder = 3; int tOrder = 3;
int * tDimSize = new int[tOrder]; int * tDimSize = new int[tOrder];
tDimSize[0] = 3; tDimSize[0] = 3;
...@@ -749,6 +749,152 @@ bool TestCopyIndexed5() ...@@ -749,6 +749,152 @@ bool TestCopyIndexed5()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 6: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 4), dim = 2, indexSize = 2,
srcIndex = [0, 2], tgtIndex = [3, 0], copyNum = 1.
*/
bool TestCopyIndexed6()
{
/* a input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output tensor of size (3, 2, 4) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 4;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE tData[3][2][4] = { { {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F} },
{ {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F}},
{ {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F} } };
DTYPE answer[3][2][4] = { { {2.0F, 5.0F, 5.0F, 0.0F},
{3.0F, 5.0F, 5.0F, 2.0F} },
{ {4.0F, 5.0F, 5.0F, 1.0F},
{2.0F, 5.0F, 5.0F, 3.0F}},
{ {2.0F, 5.0F, 5.0F, -1.0F},
{0.0F, 5.0F, 5.0F, 1.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {3, 0};
int copyNum = 1;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t1 = NewTensorV2(tOrder, tDimSize);
XTensor * t2 = NewTensorV2(tOrder, tDimSize);
XTensor * sIndex = NewTensorV2(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensorV2(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s->SetData(sData, sUnitNum);
t1->SetData(tData, tUnitNum);
t2->SetData(tData, tUnitNum);
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
/* check results */
cpuTest = _CheckData(t1, answer, tUnitNum) &&
_CheckData(t2, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensorV2(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensorV2(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensorV2(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensorV2(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetData(tData, tUnitNum);
tGPU2->SetData(tData, tUnitNum);
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
/* check results */
gpuTest = _CheckData(tGPU1, answer, tUnitNum) &&
_CheckData(tGPU2, answer, tUnitNum);
/* destroy variables */
delete s;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -805,6 +951,15 @@ bool TestCopyIndexed() ...@@ -805,6 +951,15 @@ bool TestCopyIndexed()
else else
XPRINT(0, stdout, ">> case 5 passed!\n"); XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestCopyIndexed6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
case 1: matrix multiplication of the two tensors. case 1: matrix multiplication of the two tensors.
In this case, a=(2, 3), b=(2, 3) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS. In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMulBatched1() bool TestMatrixMulBatched1()
{ {
......
...@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
case 1: element-wise product of two tensors case 1: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0. In this case, (2, 2) * (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/ */
bool TestMultiply1() bool TestMultiply1()
{ {
...@@ -149,6 +149,131 @@ bool TestMultiply1() ...@@ -149,6 +149,131 @@ bool TestMultiply1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 3, 4) * (2, 1, 1) -> (2, 3, 4), alpha=0.
*/
bool TestMultiply2()
{
/* a source tensor of size (2, 3, 4) */
int sOrder1 = 3;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
sDimSize1[2] = 4;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 1, 1) */
int sOrder2 = 3;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
sDimSize2[1] = 1;
sDimSize2[2] = 1;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 3, 4) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 3;
tDimSize[2] = 4;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3][4] = { { {0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F} },
{ {3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F} } };
DTYPE sData2[2][1][1] = { { {1.0F} },
{ {-1.0F} } };
DTYPE answer[2][3][4] = { { {0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F} },
{ {-3.0F, -2.0F, -1.0F, 0.0F},
{0.0F, -1.0F, -2.0F, -3.0F},
{-3.0F, -2.0F, -1.0F, 0.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensorV2(sOrder1, sDimSize1);
XTensor * s2 = NewTensorV2(sOrder2, sDimSize2);
XTensor * tMe = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
tMe->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
/* call Multiply function */
MultiplyMe(*tMe, *s2, 0);
tUser = Multiply(*s1, *s2);
/* check results */
cpuTest = _CheckData(tMe, answer, tUnitNum, 1e-4F) &&
_CheckData(&tUser, answer, tUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensorV2(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensorV2(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
/* call Multiply function */
MultiplyMe(*tMeGPU, *sGPU2, 0);
tUserGPU = Multiply(*sGPU1, *sGPU2);
/* check results */
gpuTest = _CheckData(tMeGPU, answer, tUnitNum, 1e-4F) &&
_CheckData(&tUserGPU, answer, tUnitNum, 1e-4F);
/* destroy variables */
delete s1;
delete s2;
delete tMe;
delete sGPU1;
delete sGPU2;
delete tMeGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete tMe;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -170,6 +295,16 @@ bool TestMultiply() ...@@ -170,6 +295,16 @@ bool TestMultiply()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMultiply2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -141,6 +141,90 @@ bool TestReduceMax1() ...@@ -141,6 +141,90 @@ bool TestReduceMax1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: get the max value of the items along a dimension of the tensor; the result is a scalar tensor.
In this case,
(4) -> scalar, dim = 0
*/
bool TestReduceMax2()
{
/* a input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 5.0F, 2.0F, 3.0F};
DTYPE answer[1] = {5.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceMax function */
_ReduceMax(s, t, 0);
tUser = ReduceMax(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceMax function */
_ReduceMax(sGPU, tGPU, 0);
tUserGPU = ReduceMax(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -161,6 +245,15 @@ bool TestReduceMax() ...@@ -161,6 +245,15 @@ bool TestReduceMax()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMax2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -136,6 +136,85 @@ bool TestReduceMean1() ...@@ -136,6 +136,85 @@ bool TestReduceMean1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: get the mean value along a dimension of the tensor; the result is a scalar tensor */
bool TestReduceMean2()
{
/* a tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE answer[1] = {1.5F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceMean function */
_ReduceMean(s, t, 0);
tUser = ReduceMean(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceMean function */
_ReduceMean(sGPU, tGPU, 0);
tUserGPU = ReduceMean(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -156,6 +235,15 @@ bool TestReduceMean() ...@@ -156,6 +235,15 @@ bool TestReduceMean()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMean2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* other cases test */ ///* other cases test */
///* ///*
//TODO!! //TODO!!
......
...@@ -607,6 +607,89 @@ bool TestReduceSum6() ...@@ -607,6 +607,89 @@ bool TestReduceSum6()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 7: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
(4) -> scalar, dim = 0
*/
bool TestReduceSum7()
{
/* a tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a scalar */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE answer[1] = {6.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(s, t, 0);
tUser = ReduceSum(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 0);
tUserGPU = ReduceSum(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
...@@ -673,6 +756,15 @@ bool TestReduceSum() ...@@ -673,6 +756,15 @@ bool TestReduceSum()
else else
XPRINT(0, stdout, ">> case 6 passed!\n"); XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestReduceSum7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -55,7 +55,7 @@ bool TestReduceSumAll1() ...@@ -55,7 +55,7 @@ bool TestReduceSumAll1()
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
/* call ReduceSumAll function */ /* call ReduceSumAll function */
summation = _ReduceSumAll(s); summation = ReduceSumAllValue(*s);
/* check results */ /* check results */
cpuTest = (fabs(answer - summation) < 1e-4F); cpuTest = (fabs(answer - summation) < 1e-4F);
...@@ -71,7 +71,7 @@ bool TestReduceSumAll1() ...@@ -71,7 +71,7 @@ bool TestReduceSumAll1()
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
/* call ReduceSumAll function */ /* call ReduceSumAll function */
summation = _ReduceSumAll(sGPU); summation = ReduceSumAllValue(*sGPU);
/* check results */ /* check results */
gpuTest = (fabs(answer - summation) < 1e-4F); gpuTest = (fabs(answer - summation) < 1e-4F);
......
...@@ -240,6 +240,104 @@ bool TestReduceSumSquared2() ...@@ -240,6 +240,104 @@ bool TestReduceSumSquared2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 3: squared sum of the items along a dimension of the tensor; the result is a scalar tensor.
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
In this case, (4) -> scalar, dim = 0.
*/
bool TestReduceSumSquared3()
{
/* a input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
/* a scalar shift tensor */
int shiftOrder = 0;
int * shiftDimSize = new int[MAX_TENSOR_DIM_NUM];
int shiftUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE shiftData[1] = {-1.0F};
DTYPE answer[1] = {30.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor * shift = NewTensorV2(shiftOrder, shiftDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
shift->SetData(shiftData, shiftUnitNum);
t->SetZeroAll();
/* call ReduceSumSquared function */
_ReduceSumSquared(s, t, 0, shift);
tUser = ReduceSumSquared(*s, 0, *shift);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensorV2(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU->SetData(shiftData, shiftUnitNum);
tGPU->SetZeroAll();
/* call ReduceSumSquared function */
_ReduceSumSquared(sGPU, tGPU, 0, shiftGPU);
tUserGPU = ReduceSumSquared(*sGPU, 0, *shiftGPU);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete shift;
delete sGPU;
delete tGPU;
delete shiftGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete shift;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -264,10 +362,19 @@ bool TestReduceSumSquared() ...@@ -264,10 +362,19 @@ bool TestReduceSumSquared()
caseFlag = TestReduceSumSquared2(); caseFlag = TestReduceSumSquared2();
if (!caseFlag) { if (!caseFlag) {
returnFlag = false; returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n"); XPRINT(0, stdout, ">> case 2 failed!\n");
} }
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestReduceSumSquared3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */ /* other cases test */
/* /*
......
...@@ -132,6 +132,104 @@ bool TestReduceVariance1() ...@@ -132,6 +132,104 @@ bool TestReduceVariance1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: variance of the items along a dimension of the tensor; the result is a scalar tensor.
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
In this case, (4) -> scalar, dim = 0.
*/
bool TestReduceVariance2()
{
/* a input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
/* a mean scalar tensor */
int meanOrder = 0;
int * meanDimSize = new int[MAX_TENSOR_DIM_NUM];
int meanUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE meanData[1] = {1.5F};
DTYPE answer[1] = {1.25F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor * mean = NewTensorV2(meanOrder, meanDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
mean->SetData(meanData, meanUnitNum);
t->SetZeroAll();
/* call ReduceVariance function */
_ReduceVariance(s, t, 0, mean);
tUser = ReduceVariance(*s, 0, *mean);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * meanGPU = NewTensorV2(meanOrder, meanDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
meanGPU->SetData(meanData, meanUnitNum);
tGPU->SetZeroAll();
/* call ReduceVariance function */
_ReduceVariance(sGPU, tGPU, 0, meanGPU);
tUserGPU = ReduceVariance(*sGPU, 0, *meanGPU);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete mean;
delete sGPU;
delete tGPU;
delete meanGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete mean;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -152,6 +250,15 @@ bool TestReduceVariance() ...@@ -152,6 +250,15 @@ bool TestReduceVariance()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceVariance2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -32,7 +32,7 @@ spread a collection tensor to source tensor. ...@@ -32,7 +32,7 @@ spread a collection tensor to source tensor.
*/ */
bool TestSpread1() bool TestSpread1()
{ {
/* a input tensor of size (2, 4, 3) */ /* a input tensor of size (4, 4, 3) */
int sOrder = 3; int sOrder = 3;
int * sDimSize = new int[sOrder]; int * sDimSize = new int[sOrder];
sDimSize[0] = 4; sDimSize[0] = 4;
......
...@@ -215,6 +215,305 @@ bool TestSub2() ...@@ -215,6 +215,305 @@ bool TestSub2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: tensor subtraction c = a - b * \beta, where b is a scalar tensor */
bool TestSub3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a scalar */
int bOrder = 0;
int * bDimSize = new int[MAX_TENSOR_DIM_NUM];
int bUnitNum = 1;
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[1] = {-1.0F};
DTYPE beta = 2.0F;
DTYPE answer[2][4] = { {2.0F, 3.0F, 4.0F, 5.0F},
{6.0F, 7.0F, 8.0F, 9.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: tensor subtraction c = a - b * \beta, where b is a 1d tensor */
bool TestSub4()
{
/* a tensor of size (3, 4, 2) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 4;
aDimSize[2] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (3, 4, 2) */
int cOrder = 3;
int * cDimSize = new int[cOrder];
cDimSize[0] = 3;
cDimSize[1] = 4;
cDimSize[2] = 2;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[3][4][2] = { { {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} },
{ {0.0F, -1.0F}, {-2.0F, -3.0F}, {-4.0F, -5.0F}, {-6.0F, -7.0F} },
{ {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[3][4][2] = { { {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F} },
{ {2.0F, 1.0F}, {-2.0F, -3.0F}, {-6.0F, -7.0F}, {-10.0F, -11.0F} },
{ {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 5: tensor subtraction c = a - b * \beta, where b is a 1d tensor */
bool TestSub5()
{
/* a tensor of size (4, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 4;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (4, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 4;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F },
{4.0F, 5.0F, 6.0F, 7.0F },
{0.0F, -1.0F, -2.0F, -3.0F },
{-4.0F, -5.0F, -6.0F, -7.0F } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[4][4] = { {2.0F, 1.0F, 0.0F, -1.0F },
{6.0F, 5.0F, 4.0F, 3.0F },
{2.0F, -1.0F, -4.0F, -7.0F },
{-2.0F, -5.0F, -8.0F, -11.0F } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -244,6 +543,33 @@ bool TestSub() ...@@ -244,6 +543,33 @@ bool TestSub()
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSub3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSub4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSub5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "../core/utilities/CheckData.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
#include "TSubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 0);
_SubDim(cMe, b, 0);
cUser = SubDim(*a, *b, 0);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 0);
_SubDim(cMeGPU, bGPU, 0);
cUserGPU = SubDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 1);
_SubDim(cMe, b, 1);
cUser = SubDim(*a, *b, 1);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 1);
_SubDim(cMeGPU, bGPU, 1);
cUserGPU = SubDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SubDim Function */
bool TestSubDim()
{
XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSubDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSubDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __TEST_SUBDIM_H__
#define __TEST_SUBDIM_H__
#include "../core/arithmetic/SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SubDim Function */
bool TestSubDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUBDIM_H__
...@@ -215,6 +215,305 @@ bool TestSum2() ...@@ -215,6 +215,305 @@ bool TestSum2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: tensor summation c = a + b * \beta, where b is a scalar tensor */
bool TestSum3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a scalar */
int bOrder = 0;
int * bDimSize = new int[MAX_TENSOR_DIM_NUM];
int bUnitNum = 1;
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[1] = {-1.0F};
DTYPE beta = 2.0F;
DTYPE answer[2][4] = { {-2.0F, -1.0F, 0.0F, 1.0F},
{2.0F, 3.0F, 4.0F, 5.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: tensor summation c = a + b * \beta, where b is a 1d tensor */
bool TestSum4()
{
/* a tensor of size (3, 4, 2) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 4;
aDimSize[2] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (3, 4, 2) */
int cOrder = 3;
int * cDimSize = new int[cOrder];
cDimSize[0] = 3;
cDimSize[1] = 4;
cDimSize[2] = 2;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[3][4][2] = { { {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} },
{ {0.0F, -1.0F}, {-2.0F, -3.0F}, {-4.0F, -5.0F}, {-6.0F, -7.0F} },
{ {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[3][4][2] = { { {-2.0F, -1.0F}, {2.0F, 3.0F}, {6.0F, 7.0F}, {10.0F, 11.0F} },
{ {-2.0F, -3.0F}, {-2.0F, -3.0F}, {-2.0F, -3.0F}, {-2.0F, -3.0F} },
{ {-2.0F, -1.0F}, {2.0F, 3.0F}, {6.0F, 7.0F}, {10.0F, 11.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 5: tensor summation c = a + b * \beta, where b is a 1d tensor broadcast over a 2d tensor a */
bool TestSum5()
{
/* a tensor of size (4, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 4;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (4, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 4;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F },
{4.0F, 5.0F, 6.0F, 7.0F },
{0.0F, -1.0F, -2.0F, -3.0F },
{-4.0F, -5.0F, -6.0F, -7.0F } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[4][4] = { {-2.0F, 1.0F, 4.0F, 7.0F },
{2.0F, 5.0F, 8.0F, 11.0F },
{-2.0F, -1.0F, 0.0F, 1.0F },
{-6.0F, -5.0F, -4.0F, -3.0F } };
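/* expected result: b is broadcast over the rows of a, i.e., answer[i][j] = aData[i][j] + bData[j] * beta */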
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
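For readers of the new cases, a minimal caller-side sketch of the broadcasting summation they exercise (c = a + b * beta, with a scalar or lower-order b added to every matching slice of a); the shapes, the random initialization, and the function name below are illustrative only:
/* a sketch only: caller-side use of the broadcasting Sum tested by cases 3-5 */
void SumBroadcastSketch()
{
    int aDims[2] = {4, 4};
    int bDims[1] = {4};
    XTensor * a = NewTensorV2(2, aDims);
    XTensor * b = NewTensorV2(1, bDims);
    a->SetDataRand(-1.0F, 1.0F);
    b->SetDataRand(-1.0F, 1.0F);
    /* b is added to every row of a: c[i][j] = a[i][j] + b[j] * 2.0 */
    XTensor c = Sum(*a, *b, 2.0F);
    delete a;
    delete b;
}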
/* other cases */
/* /*
TODO!! TODO!!
...@@ -244,6 +543,33 @@ bool TestSum()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSum3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSum4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSum5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/* /*
TODO!! TODO!!
......