Bug fixed.

1. CPU support for the Gather function. 2. Fix the bugs in GPU Gather. 3. Minor error fixed.

Bug fixed.
1. CPU support for the Gather function. 2. Fix the bugs in GPU Gather. 3. Minor error fixed.
29c26ce6 · liyinqiao · fe5c5b85 · 29c26ce6 · 29c26ce6 · 29c26ce6
Commit 29c26ce6 authored Mar 18, 2020 by liyinqiao
--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
@@ -43,14 +43,43 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
    CheckNTErrors((s && t), "Invalid tensors!");
    CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
    CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
+    CheckNTErrors((srcIndex->dataType == X_INT), "The index tensor should be INT type!");
+    CheckNTErrors((srcIndex->order == s->order), "index's order should be the same with source's");
 #ifdef USE_CUDA
    if (s->devID >= 0 && t->devID >= 0) {
        _CudaGather(s, t, srcIndex, dim);
        return;
    }
 #endif
-    ShowNTErrors("TODO!");
+    int stride = 1;
-    return;
+    int blockNum = 1;
+    for (int i = dim + 1; i < s->order; ++i)
+    {
+        stride *= s->GetDim(i);
+    }
+    for (int i = 0; i < dim; ++i)
+    {
+        blockNum *= s->GetDim(i);
+    }
+    int indexStrideNum = srcIndex->GetDim(dim);
+    int srcStrideNum = stride * s->GetDim(dim);
+    int tgtBlockSize = stride * indexStrideNum;
+    DTYPE * sData = (DTYPE*)s->data;
+    DTYPE * tData = (DTYPE*)t->data;
+    int * sIndexData = (int*)srcIndex->data;
+    for (int blockIndex = 0; blockIndex < blockNum; ++blockIndex)
+    {
+        for (int i = 0; i < indexStrideNum; i++) {
+            for (int j = 0; j < stride; j++)
+            {
+                int sIndex = sIndexData[i * stride + blockIndex * indexStrideNum + j] * stride + blockIndex * srcStrideNum + j;
+                CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
+                int tIndex = i * stride + blockIndex * tgtBlockSize + j;
+                tData[tIndex] = sData[sIndex];
+            }
+        }
+    }
 }
 /*

--- a/source/tensor/core/movement/Gather.cu
+++ b/source/tensor/core/movement/Gather.cu
@@ -77,7 +77,7 @@ gather indexed sub-tensors(cuda version)
 >> blockNum - block size of data
 */
 __global__
-void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
+void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum, int srcStrideNum)
 {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
@@ -90,7 +90,7 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int st
    for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
        i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
        i += stride * blockDim.x) {
-        tData[i] = sData[sIndex[i]];
+        tData[i] = sData[sIndex[i] * stride + stride * srcStrideNum * blockIndex + offsetInBlock];
    }
 }
@@ -179,6 +179,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
    int blockNum = 1;
    int indexSize = srcIndex->unitNum;
    int strideNum = srcIndex->dimSize[dim];
+    int srcStrideNum = s->dimSize[dim];
    for (int i = 0; i < dim; i++)
        blockNum *= srcIndex->dimSize[i];
    for (int i = dim + 1; i < srcIndex->order; i++)
@@ -186,19 +187,33 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
    int * sIndex = NULL;
    if (srcIndex->devID < 0) {
+        int * sIndexData = (int*)srcIndex->data;
+        for (int i = 0; i < indexSize; i++) {
+            int srcIndexValue = sIndexData[i] * stride;
+            CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
+        }
        sIndex = mem != NULL ?
-            (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                  (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-            (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+                  (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
        XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
    }
-    else
+    else {
+        int * sIndexData = new int[sizeof(int) * indexSize];
+        XMemCopy(sIndexData, -1, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
+        for (int i = 0; i < indexSize; i++) {
+            int srcIndexValue = sIndexData[i] * stride;
+            CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
+        }
        sIndex = (int *)srcIndex->data;
+	   delete[] sIndexData;
+    }
    int cudaGrids[3];
    int cudaBlocks[3];
    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks);
+    KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum, srcStrideNum);
-    KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
 }
 #endif // USE_CUDA

--- a/source/tensor/test/TSpread.cpp
+++ b/source/tensor/test/TSpread.cpp
@@ -32,7 +32,7 @@ spread a collection tensor to source tensor.
 */
 bool TestSpread1()
 {
-    /* a input tensor of size (2, 4, 3) */
+    /* a input tensor of size (4, 4, 3) */
    int sOrder = 3;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 4;