Commit 1f4eecdd by 张裕浩

Gather operation supports gathering data according to dimension.

parent ee86e91d
......@@ -55,6 +55,29 @@ gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - index of the source sub-tensors
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim);
return;
}
#endif
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
......
......@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
/*
gather indexed sub-tensors(cuda version)
The data is viewed as blockNum blocks of (strideNum * stride) elements.
The y-coordinate of a thread selects one (block, offset in stride) pair and
the x-coordinate walks over the strideNum positions of that pair. For every
visited element i the kernel performs tData[i] = sData[sIndex[i]].
Expected launch layout: a 2D grid whose y-extent covers stride * blockNum;
any x-extent is valid because the x-direction uses a grid-wide step.
>> sData - the data pointer of the source tensor
>> tData - the data pointer of the target tensor
>> sIndex - for each target element, the offset of the source element to copy
>> stride - distance (in elements) between two neighbouring positions of a walk
>> strideNum - number of positions visited in a walk
>> blockNum - number of blocks, each of stride * strideNum elements
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;

    /* threads beyond the last (block, offset) pair have nothing to do */
    if (idy >= stride * blockNum)
        return;

    int blockIndex = idy / stride;
    int offsetInBlock = idy % stride;

    /* first element handled by this y-coordinate */
    int base = stride * strideNum * blockIndex + offsetInBlock;

    /* idx is a grid-wide index, so the step must be the grid-wide thread count
       in x; stepping by blockDim.x alone makes the threads of different
       x-blocks revisit the same elements */
    int step = blockDim.x * gridDim.x;

    for (int k = idx; k < strideNum; k += step) {
        int i = base + k * stride;
        tData[i] = sData[sIndex[i]];
    }
}
/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
......@@ -117,6 +147,46 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
BacktoCudaDev(devID, devIDBackup);
}
/*
gather indexed sub-tensors(cuda version)
The index tensor may be kept on the host (devID < 0); in that case its
content is copied to a temporary device buffer that is released after the
kernel has been launched.
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
>> dim - the leading dimension to define "sub-tensors"
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
    CheckNTErrors((s && t && srcIndex), "Invalid tensors!");
    CheckNTErrors((dim >= 0 && dim < srcIndex->order), "Illegal dimension to gather!");

    /* the kernel has to run on the device that holds the data; srcIndex->devID
       cannot be used for that because it is negative for a host-side index */
    int devID = s->devID;
    XMem * mem = s->mem;

    /* switch to the working device, as the other cuda routines of this file do */
    int devIDBackup;
    ProtectCudaDev(devID, devIDBackup);

    int dimRDI = srcIndex->order - dim - 1;
    int indexSize = srcIndex->unitNum;
    int strideNum = srcIndex->dimSizeRDI[dimRDI];

    /* product of the dimension sizes stored before dimRDI */
    int stride = 1;
    for (int i = 0; i < dimRDI; i++)
        stride *= srcIndex->dimSizeRDI[i];

    /* product of the dimension sizes stored after dimRDI */
    int blockNum = 1;
    for (int i = dimRDI + 1; i < srcIndex->order; i++)
        blockNum *= srcIndex->dimSizeRDI[i];

    bool indexOnHost = srcIndex->devID < 0;
    int * sIndex = NULL;

    if (indexOnHost) {
        /* mem may be NULL, so the plain allocation must not go through mem->devID */
        sIndex = mem != NULL ?
                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);

        /* copy the index values (not the tensor structure) from the host to the device */
        XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
    }
    else
        sIndex = (int *)srcIndex->data;

    int cudaGrids[3];
    int cudaBlocks[3];

    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride * blockNum, MAX_INT, cudaGrids, cudaBlocks);

    KernelGather<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);

    /* give the temporary index buffer back
       NOTE(review): assumes ReleaseBuf/XMemFree have their usual signatures
       (devID, size) and (devID, pointer) - confirm against XMem.h / XUtility.h */
    if (indexOnHost) {
        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
        else
            XMemFree(devID, sIndex);
    }

    BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather indexed sub-tensors(cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather indexed sub-tensors according to the dimension "dim" (cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex,int dim);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -32,6 +32,9 @@ void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors according to the dimension */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
/* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(XTensor &s, XTensor &index);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论