NiuTrans.Tensor / Commits

Commit b3fa7a37
authored Nov 28, 2018 by xuchen

cpu implementation of the gather and spread function

parent 2fea6615
Showing 18 changed files with 336 additions and 111 deletions
source/network/XBackwardShape.cpp            +1   -1
source/sample/transformer/T2TTrainer.cpp     +5   -4
source/tensor/XTensor.cpp                    +12  -1
source/tensor/XTensor.h                      +3   -0
source/tensor/core/getandset/SetData.cpp     +48  -2
source/tensor/core/getandset/SetData.cu      +49  -2
source/tensor/core/getandset/SetData.cuh     +4   -0
source/tensor/core/getandset/SetData.h       +1   -1
source/tensor/core/math/Unary.cu             +7   -3
source/tensor/core/math/Unary.h              +1   -1
source/tensor/core/movement/Gather.cpp       +18  -0
source/tensor/core/movement/Gather.cu        +10  -10
source/tensor/core/movement/Spread.cpp       +29  -20
source/tensor/core/movement/Spread.cu        +8   -58
source/tensor/core/movement/Spread.cuh       +6   -1
source/tensor/core/movement/Spread.h         +4   -3
source/tensor/test/TGather.cpp               +6   -4
source/tensor/test/TSpread.cpp               +124 -0
source/network/XBackwardShape.cpp

@@ -109,7 +109,7 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
     XTensor * input = income.tails[0];
     XNoder::MakeGrad(input);
 
-    _SpreadForGather(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
+    _Spread(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
 
     delete[] realSrcIndex;
     delete[] realTgtIndex;
source/sample/transformer/T2TTrainer.cpp

@@ -728,13 +728,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
     int seqSize = 0;
 
+    MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
     int * batchEncValues = new int[batchEnc->unitNum];
     MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
     MTYPE * goldOffsets = new MTYPE[gold->unitNum];
     MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
 
     /* need to improve the implementation */
     memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
 
     int wGold = 0;
     //fprintf(tf, "batch %d(%d)\n", tc++, sc);

@@ -745,9 +744,10 @@ int T2TTrainer::LoadBatchLM(FILE * file,
         for(int w = 0; w < len; w++){
             int num = buf[seqOffset[s] + w];
             //batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
+            batchEncOffsets[wCount] = batchEnc->GetOffset2D(s - seq, w);
+            batchEncValues[wCount] = num;
             //paddingEnc->Set2D(1.0F, s - seq, w);
             //paddingDec->Set2D(1.0F, s - seq, w);
-            batchEncValues[(s - seq) * dims[1] + w] = num;
             paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
             paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
             if(w > 0)

@@ -779,11 +779,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
         }
     }
 
-    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
+    batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount);
     paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
     paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
     gold->SetDataBatched(goldOffsets, 1.0F, wGold);
 
+    delete[] batchEncOffsets;
     delete[] batchEncValues;
     delete[] paddingEncOffsets;
     delete[] paddingDecOffsets;
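The change above replaces per-element Set2D/Set2DInt calls with parallel offset and value arrays that are written in one SetDataBatched call. A minimal sketch of that batching pattern on plain std::vector data, with a hypothetical rowMajorOffset helper standing in for XTensor::GetOffset2D (illustrative only, not part of the commit):

    #include <cstddef>
    #include <utility>
    #include <vector>

    /* hypothetical stand-in for XTensor::GetOffset2D on a row-major (rows, cols) tensor */
    static size_t rowMajorOffset(size_t cols, size_t row, size_t col) {
        return row * cols + col;
    }

    /* collect (row, col) positions and values first, then write them in one pass,
       mirroring how LoadBatchLM now fills batchEncOffsets/batchEncValues before
       calling SetDataBatched */
    void fillBatched(std::vector<float> & data, size_t cols,
                     const std::vector<std::pair<size_t, size_t>> & positions,
                     const std::vector<float> & values)
    {
        std::vector<size_t> offsets;
        offsets.reserve(positions.size());
        for (const auto & p : positions)
            offsets.push_back(rowMajorOffset(cols, p.first, p.second));

        /* the single batched write (conceptually what SetDataBatched does) */
        for (size_t i = 0; i < offsets.size(); i++)
            data[offsets[i]] = values[i];
    }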
source/tensor/XTensor.cpp

@@ -790,7 +790,7 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
 /*
 set tensor items with an array of offsets
 >> offsets - offset for each data item
->> values - value for each data item
+>> value - value for data items
 >> num - number of the data items
 */
 void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)

@@ -798,6 +798,17 @@ void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)
     _SetDataWithOffset(this, offsets, value, num);
 }
 
+/*
+set tensor items with an array of values
+>> offsets - offset for each data item
+>> values - value for each data item
+>> num - number of the data items
+*/
+void XTensor::SetDataBatched(MTYPE * offsets, void * values, int num)
+{
+    _SetDataWithOffsetAndValue(this, offsets, values, num);
+}
+
 /* check whether the data array is the same as the answer
 >> d - input data. it must be on CPU
 >> num - number of data items
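The new SetDataBatched(MTYPE * offsets, void * values, int num) overload writes values[i] into the tensor's raw data at element offset offsets[i], while the older overload writes one constant value at every offset. A minimal CPU-only sketch of the two behaviours on a plain float buffer (assumption: offsets index elements, not bytes, as in the loop added to SetData.cpp below):

    #include <cstdint>

    /* write one constant at each offset (counterpart of SetDataBatched(offsets, value, num)) */
    void setWithOffset(float * data, const int64_t * offsets, float value, int64_t num) {
        for (int64_t i = 0; i < num; i++)
            data[offsets[i]] = value;
    }

    /* write a per-item value at each offset (counterpart of the new overload) */
    void setWithOffsetAndValue(float * data, const int64_t * offsets, const float * values, int64_t num) {
        for (int64_t i = 0; i < num; i++)
            data[offsets[i]] = values[i];
    }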
source/tensor/XTensor.h

@@ -282,6 +282,9 @@ public:
     /* set tensor items with an array of offsets */
     void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
 
+    /* set tensor items with an array of values */
+    void SetDataBatched(MTYPE * offsets, void * values, int num);
+
     /* check whether the data array is the same as the answer */
     bool CheckData(const void * answer, int num, int beg = 0);
source/tensor/core/getandset/SetData.cpp

@@ -485,9 +485,55 @@ set the data with an array of values
 >> values - value for each data item
 >> num - number of the data items
 */
-void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, DTYPE * values, MTYPE num)
+void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
 {
-    ShowNTErrors("TODO!");
+    if (tensor->devID < 0) {
+        for (int i = 0; i < num; i++) {
+            if (tensor->dataType == X_INT)
+                *((int *)tensor->data + offsets[i]) = *((int *)values + i);
+            else if (tensor->dataType == X_FLOAT)
+                *((float *)tensor->data + offsets[i]) = *((float *)values + i);
+            else
+                ShowNTErrors("TO DO!!!");
+        }
+    }
+    else {
+#ifdef USE_CUDA
+        XMem * mem = tensor->mem;
+        MTYPE offsetSize = num * sizeof(MTYPE);
+        MTYPE valueSize;
+
+        if (tensor->dataType == X_INT)
+            valueSize = num * sizeof(int);
+        else if (tensor->dataType == X_FLOAT)
+            valueSize = num * sizeof(float);
+        else
+            ShowNTErrors("TO DO!!!");
+
+        MTYPE * offsetsCuda = mem != NULL ?
+                              (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
+                              (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
+        void * valuesCuda = mem != NULL ?
+                            mem->AllocBuf(mem->devID, valueSize) :
+                            XMemAlloc(tensor->devID, valueSize);
+
+        XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
+        XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
+
+        _CudaSetDataWithOffsetAndValue(tensor, offsetsCuda, valuesCuda, num);
+
+        if (mem != NULL) {
+            mem->ReleaseBuf(mem->devID, valueSize);
+            mem->ReleaseBuf(mem->devID, offsetSize);
+        }
+        else {
+            XMemFree(tensor->devID, offsetsCuda);
+            XMemFree(tensor->devID, valuesCuda);
+        }
+#else
+        ShowNTErrors("Please recompile the code with USE_CUDA");
+#endif
+    }
 }
 
 } // namespace nts(NiuTrans.Tensor)
source/tensor/core/getandset/SetData.cu

@@ -470,8 +470,8 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
 set the data with an array of offsets (kernel version)
 >> data - pointer to the data array
 >> offsets - offset for each data item
->> num - number of the data items
 >> value - value of the data items
+>> num - number of the data items
 */
 __global__
 void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)

@@ -487,8 +487,8 @@ void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE
 set the data with an array of offsets (cuda version)
 >> tensor - the tensor that keeps the data
 >> offsets - offset for each data item
->> num - number of the data items
 >> value - value of the data items
+>> num - number of the data items
 */
 void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num)
 {

@@ -510,4 +510,51 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num)
     BacktoCudaDev(tensor->devID, devIDBackup);
 }
 
+/*
+set the data with an array of offsets (kernel version)
+>> data - pointer to the data array
+>> offsets - offset for each data item
+>> value - value of the data items
+>> num - number of the data items
+>> dataType - the data type of the data and values
+*/
+__global__
+void _KernelSetDataWithOffset(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
+{
+    /* index */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < num) {
+        if (dataType == X_INT)
+            *((int *)data + offsets[i]) = *((int *)values + i);
+        else if (dataType == X_FLOAT)
+            *((float *)data + offsets[i]) = *((float *)values + i);
+    }
+}
+
+/*
+set the data with an array of values
+>> tensor - the tensor that keeps the data
+>> offsets - offset for each data item
+>> values - value of each data item
+>> num - number of the data items
+*/
+void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
+{
+    int gridSize[3];
+    int blockSize[3];
+
+    GDevs.GetCudaThread(tensor->devID, (int)num, gridSize, blockSize);
+
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);
+
+    int devIDBackup;
+    ProtectCudaDev(tensor->devID, devIDBackup);
+
+    _KernelSetDataWithOffset << <blocks, threads >> > (tensor->data, offsets, values, num, tensor->dataType);
+
+    BacktoCudaDev(tensor->devID, devIDBackup);
+}
+
 } // namespace nts(NiuTrans.Tensor)
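The host wrapper above asks GDevs.GetCudaThread for a launch configuration that covers num elements and then launches one thread per (offset, value) pair; the kernel's "if (i < num)" guard absorbs the overshoot. The usual arithmetic behind such a 1-D configuration is a ceiling division of the element count by the block size; a small sketch of that calculation (generic CUDA-launch arithmetic, not the project's GetCudaThread implementation):

    #include <cstdio>

    /* pick a 1-D grid so that blocks * threadsPerBlock >= num,
       relying on the kernel's "if (i < num)" guard for the last partial block */
    void pickLaunchConfig(long long num, int threadsPerBlock, long long & blocks) {
        blocks = (num + threadsPerBlock - 1) / threadsPerBlock;  /* ceiling division */
    }

    int main() {
        long long blocks = 0;
        pickLaunchConfig(1000000, 256, blocks);
        std::printf("blocks = %lld, threads per block = %d\n", blocks, 256);
        return 0;
    }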
source/tensor/core/getandset/SetData.cuh

@@ -52,6 +52,9 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
 /* set the data with an array of offsets */
 void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
 
+/* set the data with an array of values */
+void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * value, MTYPE num);
+
 } // namespace nts(NiuTrans.Tensor)
 
 #endif // __SETDATA_CUH__
\ No newline at end of file
source/tensor/core/getandset/SetData.h

@@ -64,7 +64,7 @@ void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation
 void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
 
 /* set the data with an array of values */
-void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, DTYPE * values, MTYPE num);
+void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num);
 
 } // namespace nts(NiuTrans.Tensor)
source/tensor/core/math/Unary.cu

@@ -26,7 +26,9 @@
 #include "Unary.h"
 #include "Unary.cuh"
 
-namespace nts {
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
 __device__
 DTYPE cudasquare(DTYPE x)

@@ -114,4 +116,6 @@ SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
 SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
 SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
 
-}
-\ No newline at end of file
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
source/tensor/core/math/Unary.h

@@ -25,7 +25,7 @@
 #include "../../XTensor.h"
 
-namespace nts {
+namespace nts { // namespace nts(NiuTrans.Tensor)
 
 /* set every entry to its absolute value */
 void _Absolute(const XTensor * a, XTensor * b);
source/tensor/core/movement/Gather.cpp

@@ -70,6 +70,24 @@ void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex)
         return;
     }
 #endif
 
+    int blockNum = 1;
+    int blockSize = 1;
+    int stride = 1;
+    int indexSize = 1;
+
+    stride = s->GetDim(-1);
+    indexSize = srcIndex->unitNum;
+
+    DTYPE * sData = (DTYPE*)s->data;
+    DTYPE * tData = (DTYPE*)t->data;
+    int * sIndexData = (int*)srcIndex->data;
+
+    for (int i = 0; i < indexSize; i++) {
+        int sIndex = sIndexData[i] * stride;
+        for (int j = 0; j < stride; j++)
+            tData[i * stride + j] = sData[sIndex + j];
+    }
 }
 
 /*
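The CPU path added here treats the source as blocks of stride elements (the size of the last dimension) and copies block srcIndex[i] of the source into block i of the target. A self-contained sketch of the same loop on plain arrays, with a tiny worked example (illustrative only, not part of the commit):

    #include <cstdio>

    /* copy row srcIndex[i] of a (rows x stride) source into row i of the target,
       mirroring the loop added to _Gather for the CPU case */
    void gatherRows(const float * src, float * tgt, const int * srcIndex,
                    int indexSize, int stride)
    {
        for (int i = 0; i < indexSize; i++) {
            int s = srcIndex[i] * stride;
            for (int j = 0; j < stride; j++)
                tgt[i * stride + j] = src[s + j];
        }
    }

    int main() {
        /* 3 x 3 source, gather rows 0 and 2 */
        float src[9] = {0, 0, 2,  2, 1, 3,  2, 2, 4};
        int index[2] = {0, 2};
        float tgt[6] = {0};
        gatherRows(src, tgt, index, 2, 3);
        for (int i = 0; i < 6; i++)
            std::printf("%g ", tgt[i]);   /* prints: 0 0 2 2 2 4 */
        std::printf("\n");
        return 0;
    }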
source/tensor/core/movement/Gather.cu

@@ -31,14 +31,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 gather indexed sub-tensors(cuda version)
->> source - the data pointer of the source tensor
->> target - the data pointer of the target tensor
->> srcIndex - the index of the source tensor
+>> sData - the data pointer of the source tensor
+>> tData - the data pointer of the target tensor
+>> sIndex - the index of the source tensor
 >> indexSize - the size of the srcIndex
 >> stride - stride of a data block
 */
 __global__
-void KernelGather(DTYPE * source, DTYPE * target, int * srcIndex, int indexSize, int stride)
+void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
 {
     __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
     __shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];

@@ -53,8 +53,8 @@ void KernelGather(DTYPE * source, DTYPE * target, int * srcIndex, int indexSize, int stride)
         return;
 
     if(threadIdx.y == 0){
-        sp[threadIdx.x] = source + srcIndex[i] * stride;
-        cp[threadIdx.x] = target + i * stride;
+        sp[threadIdx.x] = sData + sIndex[i] * stride;
+        cp[threadIdx.x] = tData + i * stride;
     }
 
     __syncthreads();

@@ -90,12 +90,12 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
     dim3 blocks(cudaGrids[0], cudaGrids[1]);
     dim3 threads(cudaBlocks[0], cudaBlocks[1]);
 
-    DTYPE * source = (DTYPE*)s->data;
-    DTYPE * target = (DTYPE*)t->data;
+    DTYPE * sData = (DTYPE*)s->data;
+    DTYPE * tData = (DTYPE*)t->data;
 
-    int * si = (int *)srcIndex->data;
+    int * sIndex = (int *)srcIndex->data;
 
-    KernelGather<<<blocks, threads >>>(source, target, si, indexSize, stride);
+    KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
 
     BacktoCudaDev(devID, devIDBackup);
 }
source/tensor/core/movement/Spread.cpp

@@ -143,31 +143,27 @@ And this is a special spread function for backward computation of gather function
        we have 4 sub-tensors of size (3, 2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and collIndex)
->> collIndex - index of the gathered sub-tensors
 */
 void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
-                      int * srcIndex, int indexSize, int * collIndex)
+                      int * srcIndex, int indexSize)
 {
     int order = source->order;
 
     CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
     CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
 
-    for(int i = 0; i < order; i++){
-        if(i < dim){
-            CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
+    for(int i = 0; i < order; i++){
+        if(i == dim){
+            CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
         }
-        else if(i > dim){
+        else {
             CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
         }
-        else {
-            CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
-        }
     }
 
 #ifdef USE_CUDA
     if(source->devID >= 0 && collection->devID >= 0) {
-        _CudaSpreadForGather(source, collection, dim, srcIndex, indexSize, collIndex);
+        _CudaSpreadForGather(source, collection, dim, srcIndex, indexSize);
         return;
     }
 #endif

@@ -190,7 +186,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
     for(int i = 0; i < indexSize; i++){
         int src = srcIndex[i];
-        int tgt = collIndex[i];
+        int tgt = i;
         DTYPE * s = sData + src * stride;
         DTYPE * c = cData + tgt * stride;
         _AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);

@@ -203,12 +199,7 @@ And this is a special spread function for backward computation of gather function
 >> source - the source tensor whose data would be modified
 >> collection - the collection whose data would be spread to source tensor
->> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (3, 2, 4) and dim = 2,
-       we have 4 sub-tensors of size (3, 2)
->> srcIndex - index of the source sub-tensors
->> indexSize - length of srcIndex (and collIndex)
->> collIndex - index of the gathered sub-tensors
+>> index - the tensor to save the index of the collection tensor
 */
 void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
 {

@@ -218,10 +209,10 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
     CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
 
     for(int i = 0; i < order; i++){
-        if(i < dim){
-            CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
+        if(i == dim){
+            CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
         }
-        else if(i > dim){
+        else {
             CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
         }
     }

@@ -232,7 +223,24 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
         return;
     }
 #endif
 
+    int blockNum = 1;
+    int blockSize = 1;
+    int stride = 1;
+    int indexSize = 1;
+
+    stride = source->GetDim(-1);
+    indexSize = index->unitNum;
+
+    DTYPE * sData = (DTYPE*)source->data;
+    DTYPE * cData = (DTYPE*)collection->data;
+    int * sIndexData = (int*)index->data;
+
+    for (int i = 0; i < indexSize; i++) {
+        int sIndex = sIndexData[i] * stride;
+        for (int j = 0; j < stride; j++)
+            sData[sIndex + j] += cData[i * stride + j];
+    }
 }
 
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
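The new CPU branch of _SpreadForGather is the reverse of the gather loop above: block i of the collection is accumulated (with +=, not =) into block index[i] of the source, which is what the backward pass of gather needs. This also explains why the collIndex argument could be dropped throughout this commit: after a gather, collection row i always came from source row srcIndex[i], so the collection index is simply i (the "int tgt = i;" change). A self-contained sketch using the same numbers as TestSpread2 below (illustrative only, not part of the commit):

    #include <cstdio>

    /* add row i of a (indexSize x stride) collection into row index[i] of the source,
       mirroring the CPU loop added to _SpreadForGather */
    void spreadForGatherRows(float * src, const float * coll, const int * index,
                             int indexSize, int stride)
    {
        for (int i = 0; i < indexSize; i++) {
            int s = index[i] * stride;
            for (int j = 0; j < stride; j++)
                src[s + j] += coll[i * stride + j];
        }
    }

    int main() {
        /* the 3 x 3 source, 2 x 3 collection and index {0, 2} used by TestSpread2 */
        float src[9]  = {0, 0, 2,  2, 1, 3,  2, 2, 4};
        float coll[6] = {0, -1, 2,  1, 2, 0};
        int index[2]  = {0, 2};
        spreadForGatherRows(src, coll, index, 2, 3);
        for (int i = 0; i < 9; i++)
            std::printf("%g ", src[i]);   /* prints: 0 -1 4 2 1 3 3 4 4 */
        std::printf("\n");
        return 0;
    }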
source/tensor/core/movement/Spread.cu

@@ -29,6 +29,8 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 
 #ifdef USE_CUDA
 
 /*
 This is core assignment for spread function.

@@ -240,54 +242,6 @@ Care of the operator "+=" instead of "=".
 >> stride - stride of a data block
 >> subtensorNum - number of sub-tensors
 >> srcIndex - index of the source sub-tensor
->> colIndex - index of the sub-tensor in the collection tensor
-*/
-__global__
-void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
-                                int blockSizeSrc, int blockSizeColl, int stride,
-                                int subtensorNum,
-                                int * srcIndex, int * colIndex)
-{
-    __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    __shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-
-    /* block id */
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    /* offset in each block */
-    int offset = blockDim.y * blockIdx.y + threadIdx.y;
-
-    int blockId = i % blockNum;
-    int subtensorId = i / blockNum;
-
-    if(subtensorId >= subtensorNum || offset >= stride)
-        return;
-
-    if(threadIdx.y == 0){
-        sp[threadIdx.x] = sData + srcIndex[subtensorId] * stride;
-        cp[threadIdx.x] = cData + colIndex[subtensorId] * stride;
-    }
-
-    __syncthreads();
-
-    DTYPE * s = sp[threadIdx.x] + blockSizeSrc * blockId;
-    DTYPE * c = cp[threadIdx.x] + blockSizeColl * blockId;
-
-    s[offset] += c[offset];
-}
-
-/*
-This is core assignment for backward computation of gather function.
-Care of the operator "+=" instead of "=".
-
->> sData - the data pointer of the source tensor
->> cData - the data pointer of collection tensor
->> blockNum - number of data blocks
->> blockSizeSrc - size of source data block
->> blockSizeColl - size of source data block
->> stride - stride of a data block
->> subtensorNum - number of sub-tensors
->> srcIndex - index of the source sub-tensor
 */
 __global__
 void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,

@@ -334,10 +288,9 @@ And this is a special spread function for backward computation of gather function
        we have 4 sub-tensors of size (3, 2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and collIndex)
->> collIndex - index of the gathered sub-tensors
 */
 void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
-                          int * srcIndex, int indexSize, int * collIndex)
+                          int * srcIndex, int indexSize)
 {
     int order = source->order;

@@ -372,7 +325,7 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
         DTYPE * cData = (DTYPE*)collection->data;
         for(int i = 0; i < indexSize; i++) {
             int src = srcIndex[i];
-            int tgt = collIndex[i];
+            int tgt = i;
             DTYPE * s = sData + src * stride;
             DTYPE * c = cData + tgt * stride;

@@ -384,12 +337,8 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
         int * si = mem != NULL ?
                    (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
                    (int*)XMemAlloc(source->devID, sizeof(int) * indexSize);
-        int * ci = mem != NULL ?
-                   (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                   (int*)XMemAlloc(collection->devID, sizeof(int) * indexSize);
 
         XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize);
-        XMemCopy(ci, collection->devID, collIndex, -1, sizeof(int) * indexSize);
 
         DTYPE * s = (DTYPE*)source->data;
         DTYPE * c = (DTYPE*)collection->data;

@@ -399,15 +348,13 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
         dim3 blocks(cudaGrids[0], cudaGrids[1]);
         dim3 threads(cudaBlocks[0], cudaBlocks[1]);
 
-        KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si, ci);
+        KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si);
 
         if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
-            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
         }
         else {
             XMemFree(source->devID, si);
-            XMemFree(collection->devID, ci);
         }
     }
 }

@@ -454,6 +401,8 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex)
     BacktoCudaDev(source->devID, devIDBackup);
 }
 
 #endif // USE_CUDA
 
 } // namespace nts(NiuTrans.Tensor)
 
 #endif // __SPREAD_CUH__
\ No newline at end of file
source/tensor/core/movement/Spread.cuh

@@ -26,17 +26,21 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 
 #ifdef USE_CUDA
 
 /* spread a collection tensor to source tensor (cuda version) */
 void _CudaSpread(XTensor * source, XTensor * collection, int dim,
                  int * srcIndex, int indexSize, int * collIndex);
 
 /* special spread function for backward computation of gather function (cuda version) */
 void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
-                          int * srcIndex, int indexSize, int * collIndex);
+                          int * srcIndex, int indexSize);
 
+/* special spread function for backward computation of gather function (cuda version) */
+void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex);
+
 #endif // USE_CUDA
 
 } // namespace nts(NiuTrans.Tensor)
 
 #endif // __SPREAD_CUH__
\ No newline at end of file
source/tensor/core/movement/Spread.h

@@ -32,12 +32,13 @@ void _Spread(XTensor * source, XTensor * collection, int dim,
 /* spread a collection tensor to source tensor (return a XTensor structure)
    make a new tensor to keep the result and return it */
-void Spread(XTensor * source, XTensor * collection, int dim,
-            int * srcIndex, int indexSize, int * collIndex);
+void Spread(XTensor * source, XTensor * collection,
+            XTensor * srcIndex, XTensor * collIndex, int dim);
 
 /* special spread function for backward computation of gather function */
 void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
-                      int * srcIndex, int indexSize, int * collIndex);
+                      int * srcIndex, int indexSize);
 
+/* special spread function for backward computation of gather function */
+void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index);
source/tensor/test/TGather.cpp

@@ -202,7 +202,7 @@ bool TestGather2()
     _Gather(sGPU, tGPU, dim, srcIndex, indexSize);
 
     /* check results */
-    gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
 
     /* destroy variables */
     delete s;

@@ -287,11 +287,11 @@ bool TestGather3()
     /* call Gather function */
     _Gather(s, t, dim, srcIndex, indexSize);
-    //tUser = Gather(*s, *index);
+    tUser = Gather(*s, *index);
 
     /* check results */
-    cpuTest = t->CheckData(answer, tUnitNum);
-    //tUser2.CheckData(answer, tUnitNum);
+    cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
 
 #ifdef USE_CUDA
     /* GPU test */

@@ -325,6 +325,7 @@ bool TestGather3()
     delete indexGPU;
     delete[] sDimSize;
     delete[] tDimSize;
+    delete[] indexDimSize;
 
     return cpuTest && gpuTest;
 #else

@@ -333,6 +334,7 @@ bool TestGather3()
     delete t;
     delete[] sDimSize;
     delete[] tDimSize;
+    delete[] indexDimSize;
 
     return cpuTest;
 #endif // USE_CUDA
source/tensor/test/TSpread.cpp

@@ -133,6 +133,130 @@ bool TestSpread1()
 #endif // USE_CUDA
 }
 
+/*
+case 2: test _SpreadForGather function
+spread a collection tensor to source tensor
+*/
+bool TestSpread2()
+{
+    /* a input tensor of size (3, 3) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 3;
+    sDimSize[1] = 3;
+
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+
+    /* a output tensor of size (2, 3) */
+    int tOrder = 2;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 2;
+    tDimSize[1] = 3;
+
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+
+    /* a index tensor of size (2) */
+    int indexOrder = 1;
+    int * indexDimSize = new int[indexOrder];
+    indexDimSize[0] = 2;
+
+    int indexUnitNum = 1;
+    for (int i = 0; i < indexOrder; i++)
+        indexUnitNum *= indexDimSize[i];
+
+    DTYPE sData[3][3] = { {0.0F, 0.0F, 2.0F},
+                          {2.0F, 1.0F, 3.0F},
+                          {2.0F, 2.0F, 4.0F} };
+
+    DTYPE tData[2][3] = { {0.0F, -1.0F, 2.0F},
+                          {1.0F, 2.0F, 0.0F} };
+
+    DTYPE answer[3][3] = { {0.0F, -1.0F, 4.0F},
+                           {2.0F, 1.0F, 3.0F},
+                           {3.0F, 4.0F, 4.0F} };
+
+    int dim = 0;
+    int indexSize = 2;
+    int srcIndex[2] = {0, 2};
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * s1 = NewTensor(sOrder, sDimSize);
+    XTensor * s2 = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT);
+
+    /* initialize variables */
+    s1->SetData(sData, sUnitNum);
+    s2->SetData(sData, sUnitNum);
+    t->SetData(tData, tUnitNum);
+    index->SetData(srcIndex, indexSize);
+
+    /* call _SpreadForGather function */
+    _SpreadForGather(s1, t, dim, srcIndex, indexSize);
+    _SpreadForGather(s2, t, index);
+
+    /* check results */
+    cpuTest = s1->CheckData(answer, tUnitNum) &&
+              s2->CheckData(answer, tUnitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensors */
+    XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
+
+    /* initialize variables */
+    sGPU1->SetData(sData, sUnitNum);
+    sGPU2->SetData(sData, sUnitNum);
+    tGPU->SetData(tData, tUnitNum);
+    indexGPU->SetData(srcIndex, indexSize);
+
+    /* call _SpreadForGather function */
+    _SpreadForGather(sGPU1, tGPU, dim, srcIndex, indexSize);
+    _SpreadForGather(sGPU2, tGPU, indexGPU);
+
+    /* check results */
+    gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
+              sGPU2->CheckData(answer, tUnitNum);
+
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete index;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete indexGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    delete[] indexDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    delete[] indexDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
 /* other cases */
 /*
 TODO!!