Commit 1f4eecdd by 张裕浩

Gather operation now supports gathering data along a given dimension.

parent ee86e91d
...@@ -55,6 +55,29 @@ gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - index of the source sub-tensors
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
    CheckNTErrors((s && t), "Invalid tensors!");
    CheckNTErrors(s->devID == t->devID, "The data must be kept on the same device!");
    CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");

#ifdef USE_CUDA
    if (s->devID >= 0 && t->devID >= 0) {
        _CudaGather(s, t, srcIndex, dim);
        return;
    }
#endif

    /* TODO: the CPU version of gathering along a given dimension is not implemented yet */
}
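For reference, the semantics that the CUDA path implements can be written as a plain CPU loop. This is only an illustration of what KernelGather (further below) computes, namely that every target element copies the source element at the flat offset stored in srcIndex; it is not the commit's CPU implementation, which is still a TODO.

/* illustrative CPU sketch (not part of this commit): mirrors
   tData[i] = sData[sIndex[i]] from KernelGather over the flattened data;
   it assumes srcIndex stores one flat source offset per target element */
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * index = (int*)srcIndex->data;
for (int i = 0; i < srcIndex->unitNum; i++)
    tData[i] = sData[index[i]];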
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
......
...@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
/*
gather indexed sub-tensors(cuda version)
>> sData - the data pointer of the source tensor
>> tData - the data pointer of the target tensor
>> sIndex - the index of the source tensor
>> stride - stride of a data block
>> strideNum - number of strides in a block (i.e., the size of the gathered dimension)
>> blockNum - number of data blocks
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;

    /* which data block this thread works on, and the offset inside that block */
    int blockIndex = idy / stride;
    int offsetInBlock = idy % stride;

    int size = stride * strideNum * blockNum;

    /* walk the gathered dimension: copy the indexed source element to the target */
#pragma unroll
    for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
         i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
         i += stride * blockDim.x) {
        tData[i] = sData[sIndex[i]];
    }
}
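To make stride / strideNum / blockNum concrete, here is a worked instance of the decomposition that _CudaGather (further below) feeds into this kernel, using the (3, 2, 4), dim = 2 example from the comment in Gather.cpp; the index tensor is assumed to have that same shape.

/* worked example (illustration only): order = 3, shape (3, 2, 4) stored
   as dimSizeRDI = {4, 2, 3}, and dim = 2 */
int order = 3;
int dimSizeRDI[3] = {4, 2, 3};
int dim = 2;

int dimRDI = order - dim - 1;              /* = 0 */
int stride = 1;                            /* no dimensions below dimRDI -> 1 */
int strideNum = dimSizeRDI[dimRDI];        /* = 4, the gathered dimension */
int blockNum = 1;
for (int i = dimRDI + 1; i < order; i++)
    blockNum *= dimSizeRDI[i];             /* = 2 * 3 = 6 */

/* so each of the stride * blockNum = 6 (idy) positions walks the strideNum = 4
   entries of the gathered dimension, i.e., the 4 sub-tensors of size (3, 2)
   mentioned in Gather.cpp */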
/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
...@@ -117,6 +147,46 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
    BacktoCudaDev(devID, devIDBackup);
}

/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
>> dim - the leading dimension to define "sub-tensors"
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
    int devID = s->devID;
    XMem * mem = s->mem;

    int dimRDI = srcIndex->order - dim - 1;
    int stride = 1;
    int indexSize = srcIndex->unitNum;
    int strideNum = srcIndex->dimSizeRDI[dimRDI];
    for (int i = 0; i < dimRDI; i++)
        stride *= srcIndex->dimSizeRDI[i];
    int blockNum = 1;
    for (int i = dimRDI + 1; i < srcIndex->order; i++)
        blockNum *= srcIndex->dimSizeRDI[i];

    int devIDBackup;
    ProtectCudaDev(devID, devIDBackup);

    /* copy the index to the GPU if it is kept on the CPU */
    int * sIndex = NULL;
    if (srcIndex->devID < 0) {
        sIndex = mem != NULL ?
                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);
        XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
    }
    else
        sIndex = (int *)srcIndex->data;

    int cudaGrids[3];
    int cudaBlocks[3];
    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride * blockNum, MAX_INT, cudaGrids, cudaBlocks);

    KernelGather<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);

    /* release the temporary index buffer if one was allocated */
    if (srcIndex->devID < 0) {
        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
        else
            XMemFree(devID, sIndex);
    }

    BacktoCudaDev(devID, devIDBackup);
}
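Below is a hedged, self-contained CUDA sketch that exercises KernelGather directly on a tiny example, outside of XTensor/XMem. It assumes DTYPE is float and that the kernel is visible in the translation unit; the shapes, index values, and launch configuration are made up purely for illustration.

/* illustration only: gather rows 3 and 1 of a 4 x 3 source into a 2 x 3 target,
   i.e., dim = 0, stride = 3, strideNum = 2, blockNum = 1 */
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    const int stride = 3, strideNum = 2, blockNum = 1;
    const int size = stride * strideNum * blockNum;           /* 6 target elements */

    float hS[12];                                             /* 4 x 3 source */
    for (int i = 0; i < 12; i++)
        hS[i] = (float)i;
    int hIdx[size] = {9, 10, 11, 3, 4, 5};                    /* flat source offsets */

    float *dS, *dT;
    int *dIdx;
    cudaMalloc(&dS, sizeof(hS));
    cudaMalloc(&dT, sizeof(float) * size);
    cudaMalloc(&dIdx, sizeof(hIdx));
    cudaMemcpy(dS, hS, sizeof(hS), cudaMemcpyHostToDevice);
    cudaMemcpy(dIdx, hIdx, sizeof(hIdx), cudaMemcpyHostToDevice);

    /* x covers the gathered dimension (strideNum), y covers stride * blockNum */
    KernelGather<<<dim3(1, 1), dim3(32, stride * blockNum)>>>(dS, dT, dIdx,
                                                              stride, strideNum, blockNum);

    float hT[size];
    cudaMemcpy(hT, dT, sizeof(float) * size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++)
        printf("%g ", hT[i]);                                 /* expected: 9 10 11 3 4 5 */
    printf("\n");

    cudaFree(dS); cudaFree(dT); cudaFree(dIdx);
    return 0;
}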
#endif // USE_CUDA

} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather indexed sub-tensors(cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather indexed sub-tensors along the given dimension (cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
#endif // USE_CUDA

} // namespace nts(NiuTrans.Tensor)
......
...@@ -32,6 +32,9 @@ void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors according to the dimension */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
/* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(XTensor &s, XTensor &index);
......