Commit 29c26ce6 by liyinqiao

Bug fixed.

1. CPU Support for Gather function.
2. Fix the bugs in GPU Gather.
3. Minor error fixed.
parent fe5c5b85
...@@ -43,14 +43,43 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -43,14 +43,43 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!"); CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!"); CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->dataType == X_INT), "The index tensor should be INT type!");
CheckNTErrors((srcIndex->order == s->order), "index's order should be the same with source's");
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) { if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim); _CudaGather(s, t, srcIndex, dim);
return; return;
} }
#endif #endif
ShowNTErrors("TODO!"); int stride = 1;
return; int blockNum = 1;
for (int i = dim + 1; i < s->order; ++i)
{
stride *= s->GetDim(i);
}
for (int i = 0; i < dim; ++i)
{
blockNum *= s->GetDim(i);
}
int indexStrideNum = srcIndex->GetDim(dim);
int srcStrideNum = stride * s->GetDim(dim);
int tgtBlockSize = stride * indexStrideNum;
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndexData = (int*)srcIndex->data;
for (int blockIndex = 0; blockIndex < blockNum; ++blockIndex)
{
for (int i = 0; i < indexStrideNum; i++) {
for (int j = 0; j < stride; j++)
{
int sIndex = sIndexData[i * stride + blockIndex * indexStrideNum + j] * stride + blockIndex * srcStrideNum + j;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
int tIndex = i * stride + blockIndex * tgtBlockSize + j;
tData[tIndex] = sData[sIndex];
}
}
}
} }
/* /*
......
...@@ -77,7 +77,7 @@ gather indexed sub-tensors(cuda version) ...@@ -77,7 +77,7 @@ gather indexed sub-tensors(cuda version)
>> blockNum - block size of data >> blockNum - block size of data
*/ */
__global__ __global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum) void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum, int srcStrideNum)
{ {
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y; int idy = blockDim.y * blockIdx.y + threadIdx.y;
...@@ -90,7 +90,7 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int st ...@@ -90,7 +90,7 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int st
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock; for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size; i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) { i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]]; tData[i] = sData[sIndex[i] * stride + stride * srcStrideNum * blockIndex + offsetInBlock];
} }
} }
...@@ -179,6 +179,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -179,6 +179,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int blockNum = 1; int blockNum = 1;
int indexSize = srcIndex->unitNum; int indexSize = srcIndex->unitNum;
int strideNum = srcIndex->dimSize[dim]; int strideNum = srcIndex->dimSize[dim];
int srcStrideNum = s->dimSize[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
blockNum *= srcIndex->dimSize[i]; blockNum *= srcIndex->dimSize[i];
for (int i = dim + 1; i < srcIndex->order; i++) for (int i = dim + 1; i < srcIndex->order; i++)
...@@ -186,19 +187,33 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -186,19 +187,33 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int * sIndex = NULL; int * sIndex = NULL;
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
int * sIndexData = (int*)srcIndex->data;
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ? sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize); (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize); XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
} }
else else {
int * sIndexData = new int[sizeof(int) * indexSize];
XMemCopy(sIndexData, -1, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = (int *)srcIndex->data; sIndex = (int *)srcIndex->data;
delete[] sIndexData;
}
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum, srcStrideNum);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -32,7 +32,7 @@ spread a collection tensor to source tensor. ...@@ -32,7 +32,7 @@ spread a collection tensor to source tensor.
*/ */
bool TestSpread1() bool TestSpread1()
{ {
/* a input tensor of size (2, 4, 3) */ /* a input tensor of size (4, 4, 3) */
int sOrder = 3; int sOrder = 3;
int * sDimSize = new int[sOrder]; int * sDimSize = new int[sOrder];
sDimSize[0] = 4; sDimSize[0] = 4;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论