Merge code with Yuhao branch and update the size of memory pool.

c22e2e31 · liyinqiao · 823abb4f · c22e2e31 · c22e2e31 · c22e2e31
Commit c22e2e31 authored Nov 02, 2019 by liyinqiao
--- a/source/tensor/Main.cpp
+++ b/source/tensor/Main.cpp
@@ -41,9 +41,6 @@ using namespace nts;

 void SmallTest();
 void TransposeTest();
-void LittleTest();
-void T2TTest();
-void T2TTest2();
 void PowerTest();

 int main( int argc, const char ** argv )
@@ -168,127 +165,5 @@ void TransposeTest()
    delete[] data;
 }

-void LittleTest()
-{
-    int a = 5000;
-    int b = 100000;
-    int c = a*b;
-    printf("%d\n", c);
-
-    exit(1);
-}

-void T2TTest()
-{
-    XTensor * input;
-    XTensor * weight;
-    XTensor * output;
-    XTensor * gold;
-    XTensor * dedy;
-    XTensor * dedx;
-    XTensor * dedxTmp;
-    XTensor * dedw;
-    XTensor * padding;
-
-    DTYPE loss;
-
-    int * dimSize = new int[2];
-    dimSize[0] = 256;
-    dimSize[1] = 10001;
-
-    int * dimSize2 = new int[3];
-    dimSize2[0] = 2;
-    dimSize2[1] = 31;
-    dimSize2[2] = 256;
-   
-    int * dimSize3 = new int[3];
-    dimSize3[0] = 2;
-    dimSize3[1] = 31;
-    dimSize3[2] = 10001;
-
-    int * dimSize4 = new int[2];
-    dimSize4[0] = 2;
-    dimSize4[1] = 31;
-
-    input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
-    weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
-    dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
-    gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
-    output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
-    dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
-    dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
-    dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
-    padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
-
-    //weight = NewTensor(2, dimSize);
-    //dedw = NewTensor(2, dimSize);
-    //input = NewTensor(3, dimSize2);
-    //gold = NewTensor(3, dimSize3);
-    //output = NewTensor(3, dimSize3);
-    //dedy = NewTensor(3, dimSize3);
-    //dedx = NewTensor(3, dimSize3);
-    //dedxTmp = NewTensor(3, dimSize3);
-    //padding = NewTensor(2, dimSize4);
-
-    myRead(input, "x.txt", "x");
-    myRead(weight, "w.txt", "w");
-    myRead(gold, "gold.txt", "gold");
-    myRead(padding, "padding.txt", "padding");
-
-    XTensor inter;
-    inter = MMul(*input, *weight);
-
-    _Softmax(&inter, output, 2);
-
-    //_LogMe(output);
-    loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
-
-    printf("loss: %f\n", loss);
-
-    _CrossEntropyBackward(dedy, output, gold, NULL);
-    //_CrossEntropyBackward(dedy, output, gold, NULL, padding);
-
-    myDump(dedy, "dedy.txt", "dedy");
-
-    _SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
-    _Sub(output, gold, dedxTmp);
-
-    myDump(dedx, "dedx.txt", "dedx");
-    dedx->Dump(stderr, "dedx", 200);
-    dedxTmp->Dump(stderr, "dedxTmp", 200);
-
-    input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
-    dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
-
-    _MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
-
-    myDump(dedw, "dedw.txt", "dedw");
-}
-
-void T2TTest2()
-{
-    int dimSize[3];
-    dimSize[0] = 161;
-    dimSize[1] = 47;
-    dimSize[2] = 10001;
-    XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
-    //XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
-
-    //myRead(probs, "probs.txt", " ");
-    _SetDataFixedFloat(probs, 1.0F);
-
-    probs->Reshape(1, probs->unitNum);
-
-    DTYPE sum = _ReduceSumAll(probs);
-    printf("%e\n", sum);
-
-    //XTensor tmp;
-    //tmp = IsNonZero(*probs);
-    //DTYPE nonZeroNum = ReduceSumAll(tmp);
-    //printf("%f\n", nonZeroNum);
-    //
-    //DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
-
-    //printf("%e\n", gpu);
-}

--- a/source/tensor/core/arithmetic/SubDim.cu
+++ b/source/tensor/core/arithmetic/SubDim.cu
@@ -39,7 +39,7 @@ where a is a tensor and b is a row vector
 */
 template <class T, bool betaFired>
 __global__
-    void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
+void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
 {
    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    int col = blockDim.x * blockIdx.x + threadIdx.x;
@@ -75,7 +75,7 @@ where a is a tensor and b is a colum vector
 */
 template <class T, bool betaFired>
 __global__
-    void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
+void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
 {
    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];


--- a/source/tensor/core/arithmetic/Sum.cpp
+++ b/source/tensor/core/arithmetic/Sum.cpp
@@ -78,7 +78,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
    else {
        if (!a->isSparse && !b->isSparse) {
            CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
-    
+
            if (a->dataType == DEFAULT_DTYPE &&
                b->dataType == DEFAULT_DTYPE &&
                c->dataType == DEFAULT_DTYPE)

--- a/source/tensor/core/arithmetic/SumDim.cu
+++ b/source/tensor/core/arithmetic/SumDim.cu
@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /* 
 tensor summation of a tensor and a row vector
-c = a + b * \beta
+c = a + b * \beta 
 where a is a tensor and b is a row vector
 >> a - pointer to the data array of a
 >> b - pointer to the data array of b

--- a/source/tensor/core/getandset/OnehotAndIndex.cpp
+++ b/source/tensor/core/getandset/OnehotAndIndex.cpp
@@ -209,4 +209,4 @@ XTensor IndexToOnehot(const XTensor & index, int size, float labelSmoothingP)
    return onehot;
 }

-} // namespace nts(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/getandset/OnehotAndIndex.cu
+++ b/source/tensor/core/getandset/OnehotAndIndex.cu
@@ -153,4 +153,4 @@ void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,

 #endif // USE_CUDA

-} // namespace nts(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/getandset/Select.cpp
+++ b/source/tensor/core/getandset/Select.cpp
@@ -45,8 +45,6 @@ void _Select(const XTensor * a, XTensor * c, int* index, int dim)
    int stride = 1;
    for (int i = dim + 1; i < a->order; i++)
        stride *= a->dimSize[i];
-
-    printf("\n%d %d\n", a->order - dim - 1,stride);
    int copyTimes = 1;
    for (int i = 0; i < dim; i++)
    {
@@ -94,12 +92,46 @@ void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim)
 }

 /*
+c = select(a)

+>> a - input tensor
+>> index - the selected index
+>> dim - the dimension along which the selection is performed
+<< return - the newly generated tensor holding the selected data
 */
-/*XTensor Select(const XTensor &a, int* index, int dim)
+XTensor Select(const XTensor &a, XTensor &index, int dim)
 {
+    int order = a.order;
+    int * dimSize = new int[order];

-}*/
+    CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
+
+    for (int i = 0; i < a.order; i++) {
+        if (i == dim) {
+            dimSize[i] = index.dimSize[0];
+        }
+        else
+            dimSize[i] = a.dimSize[i];
+    }
+
+    float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
+    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
+    c.SetTMPFlag();
+
+    /* call _Select function */
+    _Select(&a, &c, &index, dim);
+
+    /* tensor connection */
+    if (a.enableGrad) {
+        XLink::MakeLink(&a, &index, &c, GETANDSET_SELECT);
+        XLink::AddParamToHeadInt(&c, dim);
+    }
+
+    /* destroy variables */
+    delete[] dimSize;
+
+    return c;
+}

 /* 
 generate a tensor with selected data in range[low,high] along the given dimension 
@@ -192,10 +224,12 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
    _SelectRange(&a, &c, dim, low, high);

    /* tensor connection */
-    XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
-    XLink::AddParamToHeadInt(&c, dim);
-    XLink::AddParamToHeadInt(&c, low);
-    XLink::AddParamToHeadInt(&c, high);
+    if (a.enableGrad) {
+        XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
+        XLink::AddParamToHeadInt(&c, dim);
+        XLink::AddParamToHeadInt(&c, low);
+        XLink::AddParamToHeadInt(&c, high);
+    }

    /* destroy variables */
    delete[] dimSize;

--- a/source/tensor/core/getandset/Select.h
+++ b/source/tensor/core/getandset/Select.h
@@ -36,7 +36,7 @@ void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim);
 generate a tensor with selected data c = select(a) (returna a XTensor structure)
 make a new tensor to keep the result and return it
 */
-XTensor Select(const XTensor &a, XTensor &indexCPU);
+XTensor Select(const XTensor &a, XTensor &index, int dim);

 /* 
 generate a tensor with selected data in range[low,high] along the given dimension 

--- a/source/tensor/core/math/Binary.cpp
+++ b/source/tensor/core/math/Binary.cpp
@@ -78,7 +78,7 @@ void _funcName(const XTensor * a, XTensor * b, T num)                           
        _cudaFuncName(a, b, num);                                                    \
        return;                                                                      \
    }                                                                                \
-    CheckNTErrors((_IsSameShaped(a, b)),                                              \
+    CheckNTErrors((_IsSameShaped(a, b)),                                             \
                  "Input tensors should have the same data type!");                  \
    if (a->dataType == X_INT) {                                                      \
        int * d = (int*)a->data;                                                     \
@@ -113,7 +113,7 @@ void _funcName(const XTensor * a, XTensor * b, T num)                           
    if (a->devID >= 0) {                                                             \
        ShowNTErrors("No GPU devices support!")                                      \
    }                                                                                \
-    CheckNTErrors((_IsSameShaped(a, b)),                                              \
+    CheckNTErrors((_IsSameShaped(a, b)),                                             \
                  "Input tensors should have the same data type!");                  \
    if (a->dataType == X_INT) {                                                      \
        int * d = (int*)a->data;                                                     \
@@ -170,8 +170,8 @@ XTensor funcName(const XTensor &a, T num)                                       
    _funcName(&a, &b, num);                                                          \
    if(a.enableGrad){                                                                \
        XLink::MakeLink(&a, NULL, &b, operationId);                                  \
+        XLink::AddParamToHead(&b, num);                                              \
    }                                                                                \
-    XLink::AddParamToHead(&b, num);                                                  \
    return b;                                                                        \
 }                                                                                    \
 template XTensor funcName<int>(const XTensor&, int);                                 \
@@ -182,8 +182,8 @@ template XTensor funcName<double>(const XTensor&, double);
 template<class T>                                                                    \
 void funcName(const XTensor &a, XTensor &b, T num)                                   \
 {                                                                                    \
-    if (!b.isInit || !IsSameShaped(a, b)) {                                        \
-        InitTensorV2(&b, &a);                                                          \
+    if (!b.isInit || !IsSameShaped(a, b)) {                                          \
+        InitTensorV2(&b, &a);                                                        \
    }                                                                                \
    _funcName(&a, &b, num);                                                          \
    if (a.enableGrad) {                                                              \

--- a/source/tensor/core/math/Clip.cu
+++ b/source/tensor/core/math/Clip.cu
@@ -37,7 +37,7 @@ set each entry to its clip value (CUDA Kernel)
 >> size - size of the data array
 */
 __global__
-    void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
+void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;


--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
@@ -33,28 +33,6 @@ gather indexed sub-tensors

 >> s - the source tensor
 >> t - the target tensor
->> dim - the leading dimension to define "sub-tensors"
-         e.g., for a tensor of size (3, 2, 4) and dim = 2, 
-         we have 4 sub-tensors of size (3, 2)
->> srcIndex - index of the source sub-tensors
->> indexSize - length of srcIndex (and tgtIndex)
-*/
-void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
-{
-    int * tgtIndex = new int[indexSize];
-    for(int i = 0; i < indexSize; i++)
-        tgtIndex[i] = i;
-
-    _CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
-
-    delete[] tgtIndex;
-}
-
-/*
-gather indexed sub-tensors
-
->> s - the source tensor
->> t - the target tensor
 >> srcIndex - index of the source sub-tensors
 >> dim - the leading dimension to define "sub-tensors"
 e.g., for a tensor of size (3, 2, 4) and dim = 2,
@@ -143,7 +121,10 @@ XTensor Gather(XTensor &s, XTensor &index)
    _Gather(&s, &t, &index);

    /* tensor connection */
-    XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
+    if (s.enableGrad)
+    {
+        XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
+    }

    if(index.order > 1) {
        int * dims = new int[index.order + 1];

--- a/source/tensor/core/movement/Gather.cu
+++ b/source/tensor/core/movement/Gather.cu
@@ -75,7 +75,6 @@ gather indexed sub-tensors(cuda version)
 >> stride - stride of a data block
 >> strideNum - strideNum of a data block
 >> blockNum - block size of data
-
 */
 __global__
 void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)

--- a/source/tensor/core/movement/Gather.h
+++ b/source/tensor/core/movement/Gather.h
@@ -27,9 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* gather selected sub-tensors */
-void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
-
-/* gather selected sub-tensors */
 void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);

 /* gather selected sub-tensors accoding to the dimension */

--- a/source/tensor/core/movement/Spread.cpp
+++ b/source/tensor/core/movement/Spread.cpp
@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
    }
 }

-} // namespace nts(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/movement/Spread.cu
+++ b/source/tensor/core/movement/Spread.cu
@@ -416,4 +416,4 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI

 #endif // USE_CUDA

-} // namespace nts(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/reduce/ReduceMax.cpp
+++ b/source/tensor/core/reduce/ReduceMax.cpp
@@ -203,8 +203,11 @@ XTensor funcName(const XTensor & input, int dim)                                
    funcOp(&input, &output, dim);                                                                                   \
                                                                                                                    \
    /* tensor connection */                                                                                         \
-    XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);                                                       \
-    XLink::AddParamToHeadInt(&output, dim);                                                                         \
+    if(input.enableGrad)                                                                                            \
+    {                                                                                                               \
+        XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);                                                   \
+        XLink::AddParamToHeadInt(&output, dim);                                                                     \
+    }                                                                                                               \
                                                                                                                    \
    /* destroy variables */                                                                                         \
    delete[] dimSize;                                                                                               \

--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
@@ -742,7 +742,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
                                                              strideNum, blockNum, sp, power, isExp);
        }
    }
-    else if (stride != 1 && stride * blockNum > 4096){
+    else if (stride != 1 && stride * blockNum > 4096) {
        //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
        //unsigned int* goutput = (unsigned int *)input->data;
        //convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);

--- a/source/tensor/core/reduce/VectorBuffer.cpp
+++ b/source/tensor/core/reduce/VectorBuffer.cpp
@@ -20,7 +20,7 @@
 */

 #include "VectorBuffer.h"
-//#include "math.h"
+#include "math.h"
 namespace nts {
 /* data size for each buffer */
 int VectorBuffer::size()
@@ -172,7 +172,6 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
 VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
    for (int i = 0; i != a.size(); i++) {
        this->values[i] = MIN(a[i], this->values[i]);
-        printf("runhere");
    }
    return *this;
 }

--- a/source/tensor/core/reduce/VectorBuffer.h
+++ b/source/tensor/core/reduce/VectorBuffer.h
@@ -19,7 +19,6 @@
 * $Created by: ZHANG Yuhao (email: zhangyuhao@stu.neu.edu.cn) 2019-07-23
 */

-//#include <cstring>
 #include "../../XGlobal.h"

 namespace nts {

--- a/source/tensor/core/sort/TopK.cu
+++ b/source/tensor/core/sort/TopK.cu
@@ -828,7 +828,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
    else if (k < 22) workerNum = 128;
    else if (k < 44) workerNum = 64;
    else workerNum = 32;
-
+ 
    int cudaGrids[3];
    int cudaBlocks[3];


--- a/source/tensor/function/LogSoftmax.cpp
+++ b/source/tensor/function/LogSoftmax.cpp
@@ -74,7 +74,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
        int blockSize = 1;
        int blockNum = 1;

-        for (int i = leadDim + 1; i < y->order; i++)
+        for (int i = leadDim + 1; i < x->order; i++)
            stride *= y->dimSize[i];
        blockSize = stride * dimensionSize;
        blockNum = y->unitNum / blockSize;

--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
@@ -74,7 +74,7 @@ bool Test()
    wrong = !TestSumDim() || wrong;
    wrong = !TestTan() || wrong;
    wrong = !TestTranspose() || wrong;
-    //wrong = !TestTopK() || wrong;
+    wrong = !TestTopK() || wrong;
    wrong = !TestUnsqueeze() || wrong;
    wrong = !TestXMem() || wrong;