Commit 5bfbd041 by liyinqiao

Merge with the branch of huchi and fix bugs.

parent 63eee374
@@ -304,7 +304,7 @@ XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
         XTensor range2DTrans;
         range2D = Unsqueeze(range, 0, lenQ);
         range2DTrans = Transpose(range2D, 0, 1);
-        embMatrix = Sum(range2D, range2DTrans, -1);
+        embMatrix = Sum(range2D, range2DTrans, false, -1);
     }
     else {
         for (int i = 0; i < lenKV; i++)
...
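Note on this hunk: Sum now takes the inplace flag as its third parameter, so the old beta = -1 moves to fourth position and the call still computes range2D - range2DTrans. A stand-alone C++ sketch of the matrix this builds, assuming range holds the key positions 0 .. lenKV-1 (that assumption and the fixed sizes are illustrative, not taken from the diff):

#include <cstdio>

int main() {
    const int lenQ = 3, lenKV = 3;
    int range[lenKV] = {0, 1, 2};        // assumed key/value positions
    int emb[lenQ][lenKV];
    // range2D repeats range along rows; its transpose repeats it along
    // columns; Sum(..., beta = -1) subtracts, giving relative offsets.
    for (int i = 0; i < lenQ; i++)
        for (int j = 0; j < lenKV; j++)
            emb[i][j] = range[j] - range[i];
    for (int i = 0; i < lenQ; i++) {
        for (int j = 0; j < lenKV; j++)
            std::printf("%3d", emb[i][j]);
        std::printf("\n");
    }
    return 0;
}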
@@ -34,7 +34,7 @@ const int unusedOPs[] {
     MATH_SCALE, MATH_SCALEANDSHIFT,

     /* shape operators */
-    MOVEMENT_GATHER, SHAPE_UNSQUEEZE,
+    /*MOVEMENT_GATHER,*/ SHAPE_UNSQUEEZE,
     SHAPE_MERGE, SHAPE_SPLIT,

     /* reduce operators */
...
@@ -196,7 +196,19 @@ where i is the index of the item
 */
 XTensor Multiply(const XTensor &a, const XTensor &b, bool inplace, int leadingDim)
 {
-    XTensor c(&a);
+    XTensor c;
+
+    if (inplace) {
+        /* the result is stored into the input tensor */
+        int dims[MAX_TENSOR_DIM_NUM];
+        memcpy(&(dims[0]), &(a.dimSize[0]), sizeof(int) * a.order);
+        dims[0] = -dims[0];
+        InitTensor(&c, a.order, dims, a.dataType, a.devID, a.enableGrad);
+        c.data = a.data;
+    }
+    else {
+        InitTensorV2(&c, &a);
+    }
     c.SetTMPFlag();

     if (b.order == 0){
@@ -239,6 +251,9 @@ XTensor Multiply(const XTensor &a, const XTensor &b, bool inplace, int leadingDim)
         }
     }

+    XTensor* p = const_cast<XTensor*>(&a);
+    if (inplace)
+        p->data = NULL;
     return c;
 }
...
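The pattern introduced here repeats in Sub, Sum, SumDim, and ScaleAndShift below: c gets the input's shape with a negated first dimension (which appears to tell InitTensor to set up metadata without allocating storage, a NiuTrans convention), c.data then aliases a.data, and once the kernel has run the input's pointer is nulled so exactly one tensor owns the buffer. A minimal stand-alone sketch of that ownership transfer, using a hypothetical Buffer type rather than the NiuTrans.Tensor API:

#include <cassert>
#include <cstddef>
#include <cstdlib>

struct Buffer {
    float* data = nullptr;
    std::size_t size = 0;
};

// "Inplace" result: alias the input's storage, then null the input's
// pointer, mirroring the diff's `c.data = a.data; ... p->data = NULL;`.
Buffer MakeInplaceResult(Buffer& in) {
    Buffer out;
    out.data = in.data;   // the result reuses the input buffer
    out.size = in.size;
    in.data = nullptr;    // the input gives up ownership
    return out;
}

int main() {
    Buffer a;
    a.size = 4;
    a.data = static_cast<float*>(std::malloc(a.size * sizeof(float)));
    Buffer c = MakeInplaceResult(a);
    assert(a.data == nullptr && c.data != nullptr);
    std::free(c.data);    // only the result frees the storage
    return 0;
}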
@@ -89,12 +89,25 @@ make a new tensor c to keep the result and return it
 >> a - a tensor
 >> b - another tensor
+>> inplace - indicates whether the result will be placed in the input tensor
 >> beta - the scaling factor
 << return - the result of tensor subtraction
 */
-XTensor Sub(const XTensor & a, const XTensor & b, DTYPE beta)
+XTensor Sub(const XTensor & a, const XTensor & b, bool inplace, DTYPE beta)
 {
-    XTensor c(&a);
+    XTensor c;
+
+    if (inplace) {
+        /* the result is stored into the input tensor */
+        int dims[MAX_TENSOR_DIM_NUM];
+        memcpy(&(dims[0]), &(a.dimSize[0]), sizeof(int) * a.order);
+        dims[0] = -dims[0];
+        InitTensor(&c, a.order, dims, a.dataType, a.devID, a.enableGrad);
+        c.data = a.data;
+    }
+    else {
+        InitTensorV2(&c, &a);
+    }
     c.SetTMPFlag();

     if (b.order == 0){
@@ -129,6 +142,10 @@ XTensor Sub(const XTensor & a, const XTensor & b, DTYPE beta)
             ShowNTErrors("Something is wrong!");
         }
     }

+    XTensor* p = const_cast<XTensor*>(&a);
+    if (inplace)
+        p->data = NULL;
     return c;
 }
...
@@ -41,7 +41,7 @@ void SubMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
 tensor subtraction c = a - b * \beta
 make a new tensor c to keep the result and return it
 */
-XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
+XTensor Sub(const XTensor &a, const XTensor &b, bool inplace = false, DTYPE beta = (DTYPE)1.0);

 /* tensor subtraction c = a - b * \beta */
 void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
...
@@ -262,13 +262,27 @@ make a new tensor c to keep the result and return it
 >> a - a tensor
 >> b - another tensor
+>> inplace - indicates whether the result will be placed in the input tensor
 >> beta - the scaling factor
 << return - the result of tensor summation
 */
-XTensor Sum(const XTensor & a, const XTensor & b, DTYPE beta)
+XTensor Sum(const XTensor &a, const XTensor &b, bool inplace, DTYPE beta)
 {
-    XTensor c(&a);
+    XTensor c;
+
+    if (inplace) {
+        /* the result is stored into the input tensor */
+        int dims[MAX_TENSOR_DIM_NUM];
+        memcpy(&(dims[0]), &(a.dimSize[0]), sizeof(int) * a.order);
+        dims[0] = -dims[0];
+        InitTensor(&c, a.order, dims, a.dataType, a.devID, a.enableGrad);
+        c.data = a.data;
+    }
+    else {
+        InitTensorV2(&c, &a);
+    }
     c.SetTMPFlag();
+    c.enableGrad = a.enableGrad;

     if (b.order == 0){
         DTYPE shift = b.Get0D() * beta;
@@ -302,6 +316,10 @@ XTensor Sum(const XTensor & a, const XTensor & b, DTYPE beta)
             ShowNTErrors("Something is wrong!");
         }
     }

+    XTensor* p = const_cast<XTensor*>(&a);
+    if (inplace)
+        p->data = NULL;
     return c;
 }
...
@@ -43,7 +43,7 @@ void SumMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
 tensor summation c = a + b * \beta
 make a new tensor c to keep the result and return it
 */
-XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
+XTensor Sum(const XTensor &a, const XTensor &b, bool inplace = false, DTYPE beta = (DTYPE)1.0);

 /* tensor summation c = a + b * \beta */
 void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
...
@@ -154,12 +154,25 @@ i.e., a is summed with b by broadcasting
 >> a - a tensor
 >> b - another tensor whose size is equal to that of dimension n of a
 >> n - the dimension index
+>> inplace - indicates whether the result will be placed in the input tensor
 >> beta - the scaling factor
 << return - the result tensor by tensor summation
 */
-XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
+XTensor SumDim(const XTensor &a, const XTensor &b, int n, bool inplace, DTYPE beta)
 {
-    XTensor c(&a);
+    XTensor c;
+
+    if (inplace) {
+        /* the result is stored into the input tensor */
+        int dims[MAX_TENSOR_DIM_NUM];
+        memcpy(&(dims[0]), &(a.dimSize[0]), sizeof(int) * a.order);
+        dims[0] = -dims[0];
+        InitTensor(&c, a.order, dims, a.dataType, a.devID, a.enableGrad);
+        c.data = a.data;
+    }
+    else {
+        InitTensorV2(&c, &a);
+    }
     c.SetTMPFlag();

     n = MODX(n, a.order);
@@ -174,6 +187,9 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
         XLink::AddParamToHead(&c, beta);
     }

+    XTensor* p = const_cast<XTensor*>(&a);
+    if (inplace)
+        p->data = NULL;
     return c;
 }
...
@@ -40,7 +40,7 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
 /* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
    i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
-XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
+XTensor SumDim(const XTensor &a, const XTensor &b, int n, bool inplace = false, DTYPE beta = (DTYPE)1.0);

 /* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
    i.e., a is summed with b by broadcasting */
...
@@ -38,6 +38,49 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

+/*
+generate data items according to the method described in
+`Understanding the difficulty of training deep feedforward neural networks`
+- Glorot, X. & Bengio, Y. (2010), using a normal distribution.
+The resulting tensor will have values sampled from
+:math:`\mathcal{N}(0, \text{std}^2)` where
+.. math::
+    \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan\_in} + \text{fan\_out}}}
+Also known as Glorot initialization.
+>> tensor - the tensor whose data array would be initialized
+>> gain - an optional scaling factor
+*/
+void _SetDataXavierNormal(XTensor * tensor, DTYPE gain)
+{
+    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
+    CheckNTErrors(tensor->order >= 2, "the tensor dimension must be no less than 2!");
+
+    int fanIn = 1;
+    int fanOut = 1;
+
+    int order = tensor->order;
+    if (order == 2) {
+        fanIn = tensor->dimSize[1];
+        fanOut = tensor->dimSize[0];
+    }
+    else {
+        int numInputFmaps = tensor->dimSize[1];
+        int numOutputFmaps = tensor->dimSize[0];
+        int receptiveFieldSize = 0;
+        for (int i = 2; i < order; i++)
+            receptiveFieldSize += tensor->dimSize[i];
+        fanIn = numInputFmaps * receptiveFieldSize;
+        fanOut = numOutputFmaps * receptiveFieldSize;
+    }
+
+    DTYPE std = gain * (float)sqrt(2.0 / (float)(fanIn + fanOut));
+    tensor->SetDataRandn(0, std);
+}
+
 /*
 Fills the input Tensor or Variable with values according to the method described in
 "Understanding the difficulty of training deep feedforward neural networks" - Glorot, X. & Bengio, Y. (2010),
@@ -70,7 +113,7 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
         fanOut = numOutputFmaps * receptiveFieldSize;
     }

-    DTYPE std = gain * (float)sqrt(2.0 / (fanIn + fanOut));
+    DTYPE std = gain * (float)sqrt(2.0 / (float)(fanIn + fanOut));
     DTYPE a = (DTYPE)sqrt(3.0F) * std;
     tensor->SetDataRand(-a, a);
     //_SetDataRand(tensor, -finfout, finfout);
...
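Both initializers share std = gain * sqrt(2 / (fanIn + fanOut)); the one-line change to _SetDataFanInOut only makes the floating-point conversion explicit, so the computed value is unchanged. A stand-alone sanity check of the formula with hypothetical dimensions (plain C++, not library code):

#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical 2D weight of shape [fanOut = 512, fanIn = 256], gain = 1:
    int fanIn = 256, fanOut = 512;
    float gain = 1.0f;
    float stdDev = gain * std::sqrt(2.0f / (float)(fanIn + fanOut));
    // sqrt(2 / 768) ~= 0.0510; _SetDataXavierNormal draws from N(0, stdDev^2),
    // while _SetDataFanInOut draws from U(-sqrt(3) * stdDev, sqrt(3) * stdDev).
    std::printf("stdDev = %f\n", stdDev);
    return 0;
}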
@@ -27,6 +27,9 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

+/* generate data items with a Glorot initialization */
+void _SetDataXavierNormal(XTensor * tensor, DTYPE gain = 1.0F);
+
 /* generate data items with a xavier initialization */
 void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
...
@@ -153,11 +153,24 @@ b = a * scale + shift
 >> a - the input tensor
 >> scale - the scale factor
 >> shift - the shift factor
+>> inplace - indicates whether the result will be placed in the input tensor
 << return - the result of scaling and shifting all tensor entires
 */
-XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
+XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift, bool inplace)
 {
-    XTensor b(&a);
+    XTensor b;
+
+    if (inplace) {
+        /* the result is stored into the input tensor */
+        int dims[MAX_TENSOR_DIM_NUM];
+        memcpy(&(dims[0]), &(a.dimSize[0]), sizeof(int) * a.order);
+        dims[0] = -dims[0];
+        InitTensor(&b, a.order, dims, a.dataType, a.devID, a.enableGrad);
+        b.data = a.data;
+    }
+    else {
+        InitTensorV2(&b, &a);
+    }
     b.SetTMPFlag();

     if (scale == 1.0F)
@@ -178,6 +191,9 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
         }
     }

+    XTensor* p = const_cast<XTensor*>(&a);
+    if (inplace)
+        p->data = NULL;
     return b;
 }
...
@@ -55,7 +55,7 @@ scale and shift all tensor entires
 make a new tensor to keep the result and return it
 b = a * scale + shift
 */
-XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
+XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0, bool inplace = false);

 /*
 scale and shift all tensor entires
...
@@ -138,6 +138,7 @@ XTensor Transpose(const XTensor &a, const int i, const int j)
     float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
     XTensor b(order, dimSize, a.dataType, dr, a.devID, a.mem);
+    b.enableGrad = a.enableGrad;
     b.SetTMPFlag();

     /* call _Transpose function */
...
@@ -149,6 +149,7 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
     float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
     XTensor b(order, dimSize, a.dataType, dr, a.devID, a.mem);
+    b.enableGrad = a.enableGrad;
     b.SetTMPFlag();

     /* call _Unsqueeze function */
...
@@ -242,6 +242,7 @@ XTensor GetReduceTensor(const XTensor & input, int dim)
     XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
     output.SetTMPFlag();
+    delete[] dimSize;

     return output;
 }
...
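The added delete[] closes a small leak: dimSize is evidently heap-allocated earlier in the function (not shown in the hunk), used only to construct output, and was never freed on return. A stand-alone reproduction of the fixed pattern, with a hypothetical Shape type rather than the library's XTensor:

#include <vector>

struct Shape {
    std::vector<int> dims;
    Shape(int order, const int* dimSize) : dims(dimSize, dimSize + order) {}
};

// Mirrors GetReduceTensor: a temporary dim array feeds the constructor
// and must be released afterwards, as the added delete[] now does.
Shape MakeReduced(int order, const int* inDims) {
    int* dimSize = new int[order];
    for (int i = 0; i < order; i++)
        dimSize[i] = inDims[i];
    Shape out(order, dimSize);   // the constructor copies the values
    delete[] dimSize;            // without this, every call leaked the array
    return out;
}

int main() {
    int dims[3] = {4, 5, 6};
    Shape s = MakeReduced(3, dims);
    return (int)s.dims.size() - 3;   // 0 on success
}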
@@ -87,7 +87,7 @@ bool TestMultiply1()
     /* call Multiply function */
     _Multiply(s1, s2, t, 0, 0);
     _MultiplyMe(tMe, s2, 0, 0);
-    tUser = Multiply(*s1, *s2, 0);
+    tUser = Multiply(*s1, *s2, false, 0);

     /* check results */
     cpuTest = _CheckData(t, answer, tUnitNum, 1e-4F) &&
...
@@ -161,7 +161,7 @@ bool TestSub2()
     /* call Sub function */
     _Sub(a, b, c, beta);
     _SubMe(cMe, b, beta);
-    cUser = Sub(*a, *b, beta);
+    cUser = Sub(*a, *b, false, beta);

     /* check results */
     cpuTest = _CheckData(c, answer, unitNum, 1e-4F) &&
@@ -268,7 +268,7 @@ bool TestSub3()
     b->SetData(bData, bUnitNum);

     /* call Sum function */
-    cUser = Sub(*a, *b, beta);
+    cUser = Sub(*a, *b, false, beta);

     /* check results */
     cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
@@ -370,7 +370,7 @@ bool TestSub4()
     b->SetData(bData, bUnitNum);

     /* call Sum function */
-    cUser = Sub(*a, *b, beta);
+    cUser = Sub(*a, *b, false, beta);

     /* check results */
     cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
@@ -472,7 +472,7 @@ bool TestSub5()
     b->SetData(bData, bUnitNum);

     /* call Sum function */
-    cUser = Sub(*a, *b, beta);
+    cUser = Sub(*a, *b, false, beta);

     /* check results */
     cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -161,7 +161,7 @@ bool TestSum2()
     /* call Sum function */
     _Sum(a, b, c, beta);
     _SumMe(cMe, b, beta);
-    cUser = Sum(*a, *b, beta);
+    cUser = Sum(*a, *b, false, beta);

     /* check results */
     cpuTest = _CheckData(c, answer, unitNum, 1e-4F) &&
@@ -268,7 +268,7 @@ bool TestSum3()
     b->SetData(bData, bUnitNum);

     /* call Sum function */
-    cUser = Sum(*a, *b, beta);
+    cUser = Sum(*a, *b, false, beta);

     /* check results */
     cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
@@ -370,7 +370,7 @@ bool TestSum4()
     b->SetData(bData, bUnitNum);

     /* call Sum function */
-    cUser = Sum(*a, *b, beta);
+    cUser = Sum(*a, *b, false, beta);

     /* check results */
     cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
@@ -472,7 +472,7 @@ bool TestSum5()
     b->SetData(bData, bUnitNum);

     /* call Sum function */
-    cUser = Sum(*a, *b, beta);
+    cUser = Sum(*a, *b, false, beta);

     /* check results */
     cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...