Gather operation supports gathering data according to dimension.

1f4eecdd · 张裕浩 · ee86e91d · 1f4eecdd · 1f4eecdd · 1f4eecdd
Commit 1f4eecdd authored Oct 10, 2019 by 张裕浩
--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
@@ -55,6 +55,29 @@ gather indexed sub-tensors

 >> s - the source tensor
 >> t - the target tensor
+>> srcIndex - index of the source sub-tensors
+>> dim - the leading dimension to define "sub-tensors"
+e.g., for a tensor of size (3, 2, 4) and dim = 2,
+we have 4 sub-tensors of size (3, 2)
+*/
+void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
+{
+    /* srcIndex is dereferenced below, so it is validated together with s and t */
+    CheckNTErrors((s && t && srcIndex), "Invalid tensors!");
+    CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
+    CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
+#ifdef USE_CUDA
+    /* both tensors live on a GPU: hand the work over to the cuda version */
+    if (s->devID >= 0 && t->devID >= 0) {
+        _CudaGather(s, t, srcIndex, dim);
+        return;
+    }
+#endif
+    /* this overload has no CPU implementation; report that instead of
+       returning with t left untouched */
+    CheckNTErrors(false, "Gathering along a dimension is only implemented for GPU tensors!");
+}
+
+/*
+gather indexed sub-tensors
+
+>> s - the source tensor
+>> t - the target tensor
 >> srcIndex - the tensor to save the index of the source tensor
 */
 void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)

--- a/source/tensor/core/movement/Gather.cu
+++ b/source/tensor/core/movement/Gather.cu
@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
 /*
 gather indexed sub-tensors(cuda version)

+>> sData - the data pointer of the source tensor
+>> tData - the data pointer of the target tensor
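+>> sIndex - the index data: sIndex[i] is the offset in sData of the element copied to tData[i]
+>> stride - the distance (in elements) between neighbouring entries along the gathered dimension
+>> strideNum - the number of entries along the gathered dimension
+>> blockNum - the number of data blocks, each holding stride * strideNum elements
+(stride * strideNum * blockNum elements are processed in total; there is no indexSize argument)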
+
+*/
+__global__
+void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
+{
+    /* x runs along the gathered dimension, y enumerates every (block, offset-in-block) pair */
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    int idy = blockDim.y * blockIdx.y + threadIdx.y;
+    int blockIndex = idy / stride;
+    int offsetInBlock = idy % stride;
+
+    /* threads of the padded launch grid that fall outside the index tensor have no work */
+    if (idx >= strideNum || blockIndex >= blockNum)
+        return;
+
+    /* first element handled for this y position and one past the last one;
+       every visited i is begin + k * stride with k < strideNum, which keeps it
+       below stride * strideNum * blockNum */
+    int begin = stride * strideNum * blockIndex + offsetInBlock;
+    int end = begin + stride * strideNum;
+
+    /* idx is a grid-wide index, so the step must span the whole grid in x;
+       stepping by blockDim.x alone makes different x-blocks rewrite the same elements */
+    int step = stride * blockDim.x * gridDim.x;
+
+#pragma unroll
+    for (int i = begin + idx * stride; i < end; i += step) {
+        /* NOTE(review): sIndex[i] is used as an absolute offset into sData and is
+           not range-checked here - confirm that the caller guarantees valid indices */
+        tData[i] = sData[sIndex[i]];
+    }
+}
+
+/*
+gather indexed sub-tensors(cuda version)
+
 >> s - the source tensor
 >> t - the target tensor
 >> srcIndex - the tensor to save the index of the source tensor
@@ -117,6 +147,46 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
    BacktoCudaDev(devID, devIDBackup);
 }

+/*
+gather indexed sub-tensors(cuda version)
+
+>> s - the source tensor
+>> t - the target tensor
+>> srcIndex - the tensor to save the index of the source tensor
+>> dim - the leading dimension to define "sub-tensors"
+*/
+void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
+{
+    CheckNTErrors((s && t && srcIndex), "Invalid tensors!");
+    CheckNTErrors((dim >= 0 && dim < srcIndex->order), "Illegal dimension!");
+
+    /* the kernel reads s and writes t, so all device work is bound to the device of s;
+       srcIndex->devID is negative when the indices are kept on the host and
+       therefore cannot be used to address a GPU */
+    int devID = s->devID;
+    XMem * mem = s->mem;
+
+    /* split the shape of srcIndex around dim (dimSizeRDI is addressed in reversed order):
+       stride    - product of the dimension sizes below dimRDI
+       strideNum - size of the gathered dimension itself
+       blockNum  - product of the dimension sizes above dimRDI */
+    int dimRDI = srcIndex->order - dim - 1;
+    int stride = 1;
+    int indexSize = srcIndex->unitNum;
+    int strideNum = srcIndex->dimSizeRDI[dimRDI];
+    for (int i = 0; i < dimRDI; i++)
+        stride *= srcIndex->dimSizeRDI[i];
+
+    int blockNum = 1;
+    for (int i = dimRDI + 1; i < srcIndex->order; i++)
+        blockNum *= srcIndex->dimSizeRDI[i];
+
+    /* make devID the current device for the allocation, the copy and the launch */
+    int devIDBackup;
+    ProtectCudaDev(devID, devIDBackup);
+
+    /* indices kept on the host are staged in a temporary device buffer */
+    bool indexOnHost = srcIndex->devID < 0;
+    int * sIndex = NULL;
+    if (indexOnHost) {
+        /* s->mem may be NULL, so the fallback allocation must not go through mem */
+        sIndex = mem != NULL ?
+            (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+            (int*)XMemAlloc(devID, sizeof(int) * indexSize);
+        /* copy the index values (srcIndex->data), not the XTensor object itself */
+        XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
+    }
+    else
+        sIndex = (int *)srcIndex->data;
+
+    /* x covers the gathered dimension, y covers every (block, offset) pair */
+    int cudaGrids[3];
+    int cudaBlocks[3];
+    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks);
+
+    KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
+
+    /* hand the staging buffer back so that repeated calls do not leak device memory.
+       NOTE(review): assumes any later use of this memory is ordered after the kernel
+       (same stream, or a synchronizing free) - confirm */
+    if (indexOnHost) {
+        if (mem != NULL)
+            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+        else
+            XMemFree(devID, sIndex);
+    }
+
+    BacktoCudaDev(devID, devIDBackup);
+}
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/movement/Gather.cuh
+++ b/source/tensor/core/movement/Gather.cuh
@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* gather indexed sub-tensors(cuda version) */
 void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);

+void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex,int dim); /* gather indexed sub-tensors along dimension dim (cuda version) */
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/core/movement/Gather.h
+++ b/source/tensor/core/movement/Gather.h
@@ -32,6 +32,9 @@ void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
 /* gather selected sub-tensors */
 void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);

+/* gather selected sub-tensors according to the given dimension (dim defines the "sub-tensors") */
+void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
+
 /* gather selected sub-tensors (return an XTensor structure)
   make a new tensor to keep the result and return it */
 XTensor Gather(XTensor &s, XTensor &index);