Commit b3fa7a37 by xuchen

CPU implementation of the gather and spread functions

parent 2fea6615
......@@ -109,7 +109,7 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
_SpreadForGather(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
_Spread(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
delete[] realSrcIndex;
delete[] realTgtIndex;
......
......@@ -728,13 +728,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
int seqSize = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE * goldOffsets = new MTYPE[gold->unitNum];
MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
/* need to improve the implementation */
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
int wGold = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
......@@ -745,9 +744,10 @@ int T2TTrainer::LoadBatchLM(FILE * file,
for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w];
//batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
batchEncOffsets[wCount] = batchEnc->GetOffset2D(s - seq, w);
batchEncValues[wCount] = num;
//paddingEnc->Set2D(1.0F, s - seq, w);
//paddingDec->Set2D(1.0F, s - seq, w);
batchEncValues[(s - seq) * dims[1] + w] = num;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
if (w > 0)
......@@ -779,11 +779,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
}
}
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncOffsets;
delete[] batchEncValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
......
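The hunks above replace per-element Set2D/Set2DInt calls in LoadBatchLM with a collect-then-write pattern: the loop records flat offsets (via GetOffset2D) and values, and each tensor is then filled with a single SetDataBatched call. A minimal sketch of that pattern, assuming the XTensor.h include path; the function and variable names here are illustrative, not part of the commit:
// Minimal sketch of the batched-write pattern (assumed include path; names illustrative).
#include "XTensor.h"
using namespace nts;
void MarkPaddingBatched(XTensor * padding, int rows, int cols)
{
    MTYPE * offsets = new MTYPE[padding->unitNum];
    int count = 0;
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            offsets[count++] = padding->GetOffset2D(i, j);   // flat offset of element (i, j)
    padding->SetDataBatched(offsets, 1.0F, count);           // single batched write instead of per-element Set2D
    delete[] offsets;
}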
......@@ -790,7 +790,7 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
/*
set tensor items with an array of offsets
>> offsets - offset for each data item
>> values - value for each data item
>> value - value for data items
>> num - number of the data items
*/
void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)
......@@ -798,6 +798,17 @@ void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)
_SetDataWithOffset(this, offsets, value, num);
}
/*
set tensor items with an array of values
>> offsets - offset for each data item
>> values - value for each data item
>> num - number of the data items
*/
void XTensor::SetDataBatched(MTYPE * offsets, void * values, int num)
{
_SetDataWithOffsetAndValue(this, offsets, values, num);
}
/* check whether the data array is the same as the answer
>> d - input data. it must be on CPU
>> num - number of data items
......
......@@ -282,6 +282,9 @@ public:
/* set tensor items with an array of offsets */
void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
/* set tensor items with an array of values */
void SetDataBatched(MTYPE * offsets, void * values, int num);
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);
......
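The header now declares two SetDataBatched overloads: the existing one writes the same constant at every offset, and the one added in this commit takes a per-offset value array typed as void* and dispatched on the tensor's dataType. A minimal usage sketch, assuming the XTensor.h include path; the shapes and offsets are illustrative:
// Minimal usage sketch for the two SetDataBatched overloads (assumed include path).
#include "XTensor.h"
using namespace nts;
void SetDataBatchedExample()
{
    int dims[2] = {2, 3};
    XTensor * t = NewTensor(2, dims, X_FLOAT);      // a 2 x 3 float tensor
    MTYPE offsets[3] = {0, 4, 5};                   // flat offsets into the 6 elements
    t->SetDataBatched(offsets, 1.0F, 3);            // constant overload: write 1.0F at each offset
    float values[3] = {0.5F, -2.0F, 3.0F};
    t->SetDataBatched(offsets, values, 3);          // void* overload: write one value per offset
    delete t;
}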
......@@ -485,9 +485,55 @@ set the data with an array of values
>> values - value for each data item
>> num - number of the data items
*/
void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, DTYPE * values, MTYPE num)
void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
{
ShowNTErrors("TODO!");
if (tensor->devID < 0) {
for (int i = 0; i < num; i++) {
if (tensor->dataType == X_INT)
*((int *)tensor->data + offsets[i]) = *((int *)values + i);
else if (tensor->dataType == X_FLOAT)
*((float *)tensor->data + offsets[i]) = *((float *)values + i);
else
ShowNTErrors("TO DO!!!");
}
}
else {
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE offsetSize = num * sizeof(MTYPE);
MTYPE valueSize;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
_CudaSetDataWithOffsetAndValue(tensor, offsetsCuda, valuesCuda, num);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, offsetsCuda);
XMemFree(tensor->devID, valuesCuda);
}
#else
ShowNTErrors("Please recompile the code with USE_CUDA");
#endif
}
}
} // namespace nts(NiuTrans.Tensor)
......
......@@ -470,8 +470,8 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
set the data with an array of offsets (kernel version)
>> data - pointer to the data array
>> offsets - offset for each data item
>> num - number of the data items
>> value - value of the data items
>> num - number of the data items
*/
__global__
void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
......@@ -487,8 +487,8 @@ void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE
set the data with an array of offsets (cuda version)
>> tensor - the tensor that keeps the data
>> offsets - offset for each data item
>> num - number of the data items
>> value - value of the data items
>> num - number of the data items
*/
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num)
{
......@@ -510,4 +510,51 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set the data with an array of values (kernel version)
>> data - pointer to the data array
>> offsets - offset for each data item
>> values - value for each data item
>> num - number of the data items
>> dataType - the data type of the data and values
*/
__global__
void _KernelSetDataWithOffset(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
{
/* index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < num) {
if (dataType == X_INT)
*((int *)data + offsets[i]) = *((int *)values + i);
else if (dataType == X_FLOAT)
*((float *)data + offsets[i]) = *((float *)values + i);
}
}
/*
set the data with an array of values (cuda version)
>> tensor - the tensor that keeps the data
>> offsets - offset for each data item
>> values - value for each data item
>> num - number of the data items
*/
void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
{
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, (int)num, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > (tensor->data, offsets, values, num, tensor->dataType);
BacktoCudaDev(tensor->devID, devIDBackup);
}
} // namespace nts(NiuTrans.Tensor)
......@@ -52,6 +52,9 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
/* set the data with an array of values */
void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * value, MTYPE num);
} // namespace nts(NiuTrans.Tensor)
#endif // __SETDATA_CUH__
\ No newline at end of file
......@@ -64,7 +64,7 @@ void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation
void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
/* set the data with an array of values */
void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, DTYPE * values, MTYPE num);
void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -26,7 +26,9 @@
#include "Unary.h"
#include "Unary.cuh"
namespace nts {
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
__device__
DTYPE cudasquare(DTYPE x)
......@@ -114,4 +116,6 @@ SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
}
\ No newline at end of file
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -25,7 +25,7 @@
#include "../../XTensor.h"
namespace nts{
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* set every entry to its absolute value */
void _Absolute(const XTensor * a, XTensor * b);
......
......@@ -70,6 +70,24 @@ void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex)
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
int indexSize = 1;
stride = s->GetDim(-1);
indexSize = srcIndex->unitNum;
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndexData = (int*)srcIndex->data;
for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride;
for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j];
}
}
/*
......
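The CPU branch added to _Gather above treats the last dimension as the copy stride and copies one row per index entry from the source into consecutive rows of the target. A standalone sketch of the same loop on plain arrays; sizes and values are illustrative:
// Standalone sketch of the CPU gather loop (plain arrays, illustrative values).
#include <cstdio>
int main()
{
    const int stride = 3;                          // size of the last dimension
    float s[3 * 3] = { 0, 0, 2,
                       2, 1, 3,
                       2, 2, 4 };                  // source of shape (3, 3)
    int   index[2] = { 0, 2 };                     // rows to gather
    float t[2 * 3];                                // target of shape (2, 3)
    for (int i = 0; i < 2; i++)                    // same traversal as _Gather
        for (int j = 0; j < stride; j++)
            t[i * stride + j] = s[index[i] * stride + j];
    for (int i = 0; i < 2 * stride; i++)
        printf("%g ", t[i]);                       // prints: 0 0 2 2 2 4
    printf("\n");
    return 0;
}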
......@@ -31,14 +31,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
gather indexed sub-tensors(cuda version)
>> source - the data pointer of the source tensor
>> target - the data pointer of the target tensor
>> srcIndex - the index of the source tensor
>> sData - the data pointer of the source tensor
>> tData - the data pointer of the target tensor
>> sIndex - the index of the source tensor
>> indexSize - the size of sIndex
>> stride - stride of a data block
*/
__global__
void KernelGather(DTYPE * source, DTYPE * target, int * srcIndex, int indexSize, int stride)
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......@@ -53,8 +53,8 @@ void KernelGather(DTYPE * source, DTYPE * target, int * srcIndex, int indexSize,
return;
if(threadIdx.y == 0){
sp[threadIdx.x] = source + srcIndex[i] * stride;
cp[threadIdx.x] = target + i * stride;
sp[threadIdx.x] = sData + sIndex[i] * stride;
cp[threadIdx.x] = tData + i * stride;
}
__syncthreads();
......@@ -90,12 +90,12 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * source = (DTYPE*)s->data;
DTYPE * target = (DTYPE*)t->data;
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * si = (int *)srcIndex->data;
int * sIndex = (int *)srcIndex->data;
KernelGather<<<blocks, threads >>>(source, target, si, indexSize, stride);
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
BacktoCudaDev(devID, devIDBackup);
}
......
......@@ -143,31 +143,27 @@ And this is a special spread function for backward computation of gather functio
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex
>> collIndex - index of the gathered sub-tensors
*/
void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex)
int * srcIndex, int indexSize)
{
int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
for(int i = 0; i < order; i++){
if(i < dim){
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
for (int i = 0; i < order; i++){
if (i == dim) {
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
}
else if(i > dim){
else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
else{
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
}
}
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, dim, srcIndex, indexSize, collIndex);
_CudaSpreadForGather(source, collection, dim, srcIndex, indexSize);
return;
}
#endif
......@@ -190,7 +186,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
for(int i = 0; i < indexSize; i++){
int src = srcIndex[i];
int tgt = collIndex[i];
int tgt = i;
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
_AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
......@@ -203,12 +199,7 @@ And this is a special spread function for backward computation of gather functio
>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> collIndex - index of the gathered sub-tensors
>> index - the tensor to save the index of the collection tensor
*/
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
{
......@@ -218,10 +209,10 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
for(int i = 0; i < order; i++){
if(i < dim){
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
if(i == dim){
CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
}
else if(i > dim){
else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
}
......@@ -233,6 +224,23 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
int indexSize = 1;
stride = source->GetDim(-1);
indexSize = index->unitNum;
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
int * sIndexData = (int*)index->data;
for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride;
for (int j = 0; j < stride; j++)
sData[sIndex + j] += cData[i * stride + j];
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
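_SpreadForGather is the backward counterpart of gather, so the CPU loop added above accumulates with "+=" instead of assigning: each row of the collection tensor is added back to the source row named by its index entry. A standalone sketch on plain arrays, using the same numbers as TestSpread2 further below; illustrative only:
// Standalone sketch of the accumulation in _SpreadForGather (plain arrays).
#include <cstdio>
int main()
{
    const int stride = 3;                              // size of the last dimension
    float source[3 * 3]     = { 0, 0, 2,
                                2, 1, 3,
                                2, 2, 4 };
    float collection[2 * 3] = { 0, -1, 2,
                                1,  2, 0 };
    int   index[2] = { 0, 2 };                         // where each collected row is added back
    for (int i = 0; i < 2; i++)                        // note "+=", not "="
        for (int j = 0; j < stride; j++)
            source[index[i] * stride + j] += collection[i * stride + j];
    for (int i = 0; i < 3 * stride; i++)
        printf("%g ", source[i]);                      // prints: 0 -1 4 2 1 3 3 4 4
    printf("\n");
    return 0;
}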
......@@ -29,6 +29,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
This is core assignment for spread function.
......@@ -240,54 +242,6 @@ Care of the operator "+=" instead of "=".
>> stride - stride of a data block
>> subtensorNum - number of sub-tensors
>> srcIndex - index of the source sub-tensor
>> colIndex - index of the sub-tensor in the collection tensor
*/
__global__
void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride,
int subtensorNum,
int * srcIndex, int * colIndex)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
int blockId = i % blockNum;
int subtensorId = i / blockNum;
if(subtensorId >= subtensorNum || offset >= stride)
return;
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + srcIndex[subtensorId] * stride;
cp[threadIdx.x] = cData + colIndex[subtensorId] * stride;
}
__syncthreads();
DTYPE * s = sp[threadIdx.x] + blockSizeSrc * blockId;
DTYPE * c = cp[threadIdx.x] + blockSizeColl * blockId;
s[offset] += c[offset];
}
/*
This is core assignment for backward computation of gather function.
Care of the operator "+=" instead of "=".
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of collection data block
>> stride - stride of a data block
>> subtensorNum - number of sub-tensors
>> srcIndex - index of the source sub-tensor
*/
__global__
void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
......@@ -334,10 +288,9 @@ And this is a special spread function for backward computation of gather functio
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex
>> collIndex - index of the gathered sub-tensors
*/
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex)
int * srcIndex, int indexSize)
{
int order = source->order;
......@@ -372,7 +325,7 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
DTYPE * cData = (DTYPE*)collection->data;
for(int i = 0; i < indexSize; i++) {
int src = srcIndex[i];
int tgt = collIndex[i];
int tgt = i;
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
......@@ -384,12 +337,8 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(source->devID, sizeof(int) * indexSize);
int * ci = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(collection->devID, sizeof(int) * indexSize);
XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize);
XMemCopy(ci, collection->devID, collIndex, -1, sizeof(int) * indexSize);
DTYPE * s = (DTYPE*)source->data;
DTYPE * c = (DTYPE*)collection->data;
......@@ -399,15 +348,13 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si, ci);
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
}
else {
XMemFree(source->devID, si);
XMemFree(collection->devID, ci);
}
}
}
......@@ -454,6 +401,8 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
BacktoCudaDev(source->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SPREAD_CUH__
\ No newline at end of file
......@@ -26,17 +26,21 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* spread a collection tensor to source tensor (cuda version) */
void _CudaSpread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
/* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
int * srcIndex, int indexSize);
/* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SPREAD_CUH__
\ No newline at end of file
......@@ -32,12 +32,13 @@ void _Spread(XTensor * source, XTensor * collection, int dim,
/* spread a collection tensor to source tensor (return a XTensor structure)
make a new tensor to keep the result and return it */
void Spread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
void Spread(XTensor * source, XTensor * collection,
XTensor * srcIndex, XTensor * collIndex,
int dim);
/* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
int * srcIndex, int indexSize);
/* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index);
......
......@@ -202,7 +202,7 @@ bool TestGather2()
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......@@ -287,11 +287,11 @@ bool TestGather3()
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
//tUser = Gather(*s, *index);
tUser = Gather(*s, *index);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
//tUser2.CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -325,6 +325,7 @@ bool TestGather3()
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
......@@ -333,6 +334,7 @@ bool TestGather3()
delete t;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
......
......@@ -133,6 +133,130 @@ bool TestSpread1()
#endif // USE_CUDA
}
/*
case 2: test _SpreadForGather function
spread a collection tensor to source tensor
*/
bool TestSpread2()
{
/* a input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, 0.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{2.0F, 2.0F, 4.0F} };
DTYPE tData[2][3] = { {0.0F, -1.0F, 2.0F},
{1.0F, 2.0F, 0.0F} };
DTYPE answer[3][3] = { {0.0F, -1.0F, 4.0F},
{2.0F, 1.0F, 3.0F},
{3.0F, 4.0F, 4.0F} };
int dim = 0;
int indexSize = 2;
int srcIndex[2] = {0, 2};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum);
index->SetData(srcIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForGather(s1, t, dim, srcIndex, indexSize);
_SpreadForGather(s2, t, index);
/* check results */
cpuTest = s1->CheckData(answer, tUnitNum) &&
s2->CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
indexGPU->SetData(srcIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForGather(sGPU1, tGPU, dim, srcIndex, indexSize);
_SpreadForGather(sGPU2, tGPU, indexGPU);
/* check results */
gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
sGPU2->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete index;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......