Commit fa2ed07c by 张裕浩

clean code

parent ec71b1a9
......@@ -33,7 +33,8 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
use PTX code to reduce float data
*/
__device__ __forceinline__ float shfl_down_reduce_max(float input)
__device__ __forceinline__
float shflDownReduceMax(float input)
{
float output;
asm volatile(
......@@ -61,6 +62,38 @@ __device__ __forceinline__ float shfl_down_reduce_max(float input)
return output;
}
/*
use PTX code to reduce int data
*/
__device__ __forceinline__
int shflDownReduceMax(int input)
{
int output;
asm volatile(
"{"
".reg .s32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"setp.lt.s32 p, %1, r0; "
"@p mov.s32 %1,r0;"
"mov.s32 %0,%1;"
"}"
: "=r"(output) : "r"(input));
return output;
}
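/* For reference, a minimal sketch (ours, with the hypothetical name
   shflDownReduceMaxIntrinsic) of the same warp-level max reduction written
   with the __shfl_down_sync intrinsic (CUDA 9+) instead of inline PTX;
   its behaviour should match the routine above for a full warp. */
__device__ __forceinline__
int shflDownReduceMaxIntrinsic(int input)
{
    /* halve the exchange distance each step: 16, 8, 4, 2, 1 */
    for (int offset = 16; offset > 0; offset >>= 1)
        input = max(input, __shfl_down_sync(0xffffffff, input, offset));
    return input;
}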
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
......@@ -224,39 +257,18 @@ void KernelReduceMaxFast(DTYPE * input, DTYPE * output,
DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN;
value = MAX(value, value2);
value = shfl_down_reduce_max(value);
value = shflDownReduceMax(value);
if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
__syncthreads();
if (tid < 32)
{
if (tid < 32) {
if (tid < blockDim.y / 32)
value = data[tid];
else value = FLOAT_MIN;
value = shfl_down_reduce_max(value);
value = shflDownReduceMax(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
}
///* load data into the shared mem */
//data[tid] = MAX(value, value2);
//__syncthreads();
///* unroll the warp */
//if(goodSize >= 512) {if(tid < 256) {if(data[tid] < data[tid + 256]) data[tid] = data[tid + 256];} __syncthreads();}
//if(goodSize >= 256) {if(tid < 128) {if(data[tid] < data[tid + 128]) data[tid] = data[tid + 128];} __syncthreads();}
//if(goodSize >= 128) {if(tid < 64) {if(data[tid] < data[tid + 64]) data[tid] = data[tid + 64];} __syncthreads();}
//if(goodSize >= 64) {if(tid < 32) {if(data[tid] < data[tid + 32]) data[tid] = data[tid + 32];} __syncthreads();}
//if(goodSize >= 32) {if(tid < 16) {if(data[tid] < data[tid + 16]) data[tid] = data[tid + 16];} __syncthreads();}
//if(goodSize >= 16) {if(tid < 8) {if(data[tid] < data[tid + 8]) data[tid] = data[tid + 8];} __syncthreads();}
//if(goodSize >= 8) {if(tid < 4) {if(data[tid] < data[tid + 4]) data[tid] = data[tid + 4];} __syncthreads();}
//if(goodSize >= 4) {if(tid < 2) {if(data[tid] < data[tid + 2]) data[tid] = data[tid + 2];} __syncthreads();}
//if(goodSize >= 2) {if(tid < 1) {if(data[tid] < data[tid + 1]) data[tid] = data[tid + 1];} __syncthreads();}
///* write result for this block to the output array */
//if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
// output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
}
/*
......@@ -373,14 +385,15 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
op[offset] = max;
}
/*
allocate the warp number according to the GPU's SM count
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
int warpNum = 4;
if (vectorNum < 20 * 8)
{
if (vectorNum < 20 * 8){
warpNum = 8;
if (vectorNum < 20 * 4)
{
if (vectorNum < 20 * 4){
warpNum = 16;
if (warpNum < 20 * 2)
warpNum = 32;
......@@ -389,6 +402,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
int minWarpNum = vectorSize / 32;
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.y = 1;
grid.z = 1;
......@@ -397,39 +411,44 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
block.z = 1;
}
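/* Illustrative walk-through of the allocation above (our own numbers): with
   vectorNum = 100 and vectorSize = 200, the first branch fires (100 < 20 * 8)
   so warpNum becomes 8; minWarpNum = ceil(200 / 32) = 7, hence the final
   warpNum is min(8, 7) = 7 and the grid is (vectorNum, 1, 1) = (100, 1, 1). */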
/*
adjust threads.x so that we can use the warp-level optimization
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
{
if (threads.x > 1)
{
if (threads.x > 1) {
blocks.x *= threads.x;
threads.x = 1;
}
if (threads.y<32)
if (threads.y < 32)
threads.y = 32;
}
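/* Illustrative example: blocks = (4, 8), threads = (2, 16) becomes
   blocks = (8, 8), threads = (1, 32), i.e. the x-dimension work is folded
   into the grid and each block keeps at least one full warp along y. */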
/*
In some cases, we use fewer blocks to improve efficiency
*/
__global__
void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output,
int strideNum, int blockNum)
void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output, int strideNum, int blockNum)
{
int idx = threadIdx.x % 32;
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
int startIndex = idy * strideNum;
DTYPE threadMax = FLOAT_MIN;
for (int i = idx; i < strideNum; i += 32)
{
for (int i = idx; i < strideNum; i += 32) {
threadMax = max(input[startIndex + i], threadMax);
}
threadMax = shfl_down_reduce_max(threadMax);
if (idx == 0)
threadMax = shflDownReduceMax(threadMax);
if (idx == 0)
output[idy] = threadMax;
}
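/* Added note: in the kernel above each warp (32 consecutive threads) reduces one
   group of strideNum consecutive elements; idx is the lane id and idy selects the
   group, so the <<<blockNum / 4, 128>>> launch used below gives 4 groups per block. */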
/*
we use PTX code to reduce
*/
__global__
void KernelReduceMaxOp(DTYPE * input, DTYPE * output,
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum)
void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
int reducedStrideNum,int blockSize, int blockNum)
{
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
......@@ -447,26 +466,19 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output,
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
for (int it = j; it < strideNum; it += blockDim.y)
{
for (int it = j; it < strideNum; it += blockDim.y){
threadMax = max(inputData[it * stride + iOffset], threadMax);
}
__syncthreads();
//op reduce
/*threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 16, 32);
threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 8, 16);
threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 4, 8);
threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 2, 4);
threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 1, 2);*/
threadMax = shfl_down_reduce_max(threadMax);
threadMax = shflDownReduceMax(threadMax);
if ((tid & 0x1f) == 0) { data[tid / 32] = threadMax; }
__syncthreads();
if (tid < 32)
{
/* use one warp to reduce remaining data */
if (tid < 32){
if (tid < blockDim.y / 32)
threadMax = data[tid];
else threadMax = 0;
threadMax = shfl_down_reduce_max(threadMax);
threadMax = shflDownReduceMax(threadMax);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
}
......@@ -528,20 +540,18 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10)
{
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y > 128)
KernelReduceMaxOp << <grids, blocks >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
else
{
KernelReduceMaxOpLessBlocks << <blockNum / 4, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
if (blocks.y > 128) {
KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
}
else {
KernelReduceMaxOpLessBlocks <<<blockNum / 4, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
}
}
else
{
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
......@@ -565,7 +575,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceMax << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMax <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -574,7 +584,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<64> <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -583,7 +593,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<128> <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -592,7 +602,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<256> <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -601,7 +611,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<512> <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
else if (input->dataType == X_FLOAT16) {
......
......@@ -30,7 +30,8 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
use PTX code to reduce float data
*/
__device__ __forceinline__ float shfl_down_reduce_sum(float input)
__device__ __forceinline__
float shflDownReduceSum(float input)
{
float output;
asm volatile(
......@@ -54,7 +55,8 @@ __device__ __forceinline__ float shfl_down_reduce_sum(float input)
/*
use PTX code to reduce int data
*/
__device__ __forceinline__ int shfl_down_reduce_sum(int input)
__device__ __forceinline__
int shflDownReduceSum(int input)
{
int output;
asm volatile(
......@@ -326,47 +328,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
value = value + value2;
__syncthreads();
value = shfl_down_reduce_sum(value);
/*value += __shfl_down_sync(0x0000001F, value, 16, 32);
value += __shfl_down_sync(0x0000001F, value, 8, 16);
value += __shfl_down_sync(0x0000001F, value, 4, 8);
value += __shfl_down_sync(0x0000001F, value, 2, 4);
value += __shfl_down_sync(0x0000001F, value, 1, 2);*/
value = shflDownReduceSum(value);
if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
__syncthreads();
if (tid < 32)
{
if (tid < 32){
if (tid < blockDim.y / 32)
value = data[tid];
else value = 0;
value = shfl_down_reduce_sum(value);
value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
}
/*if (blockDim.y / 32 >= 16) { if (tid < 8) { data[tid] += data[tid + 8]; } __syncthreads(); }
if (blockDim.y / 32 >= 8) { if (tid < 4) { data[tid] += data[tid + 4]; } __syncthreads(); }
if (blockDim.y / 32 >= 4) { if (tid < 2) { data[tid] += data[tid + 2]; } __syncthreads(); }
if (blockDim.y / 32 >= 2) { if (tid < 1) { data[tid] += data[tid + 1]; } __syncthreads(); }*/
///* load data into the shared mem */
//data[tid] = value + value2;
//__syncthreads();
///* unroll the warp */
//if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
//if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
//if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
//if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
//if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
//if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
//if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
//if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
//if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}
///* write result for this block to the output array */
//if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
// output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
}
/*
......@@ -502,9 +474,12 @@ void KernelReduceSumFast(__half * input, __half * output,
#endif
}
__global__ void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output,
int stride, int strideNum,
DTYPE * shift, DTYPE power, bool isExp)
/*
if the data storage is discontinuous, use this way to reduce
*/
__global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride,
int strideNum, DTYPE * shift, DTYPE power, bool isExp)
{
//int idx = blockIdx.x * blockDim.x + threadIdx.x;
//int endIndex = (idx+1) * strideNum;
......@@ -515,11 +490,9 @@ __global__ void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * outpu
#pragma unroll
for (int i = stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
i += stride)
{
i += stride){
ans += input[i];
}
if (threadIdx.x == 0 && blockIdx.x == 0) printf("%d ", stride);
output[idx] = ans;
}
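/* Worked example for the loop above (illustrative, assuming blockIndex = idx / stride
   and offsetInBlock = idx % stride as in the neighbouring kernels): with stride = 4
   and strideNum = 3, the thread with idx = 5 gets blockIndex = 1, offsetInBlock = 1
   and sums input[13], input[17] and input[21] into output[5]. */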
......@@ -551,8 +524,7 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output,
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
for (int it = j; it < strideNum; it += blockDim.y)
{
for (int it = j; it < strideNum; it += blockDim.y){
DTYPE value = inputData[it * stride + iOffset] - bias[threadIdx.x];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
......@@ -569,34 +541,18 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output,
threadSum += value;
}
__syncthreads();
//op reduce
/*threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 16, 32);
threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 8, 16);
threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 4, 8);
threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 2, 4);
threadSum += __shfl_down_sync(0xFFFFFFFF, threadSum, 1, 2);*/
threadSum = shfl_down_reduce_sum(threadSum);
threadSum = shflDownReduceSum(threadSum);
if ((tid & 0x1f) == 0) { data[tid / 32] = threadSum; }
__syncthreads();
if (tid < 32)
{
if (tid < 32){
if (tid < blockDim.y / 32)
threadSum = data[tid];
else threadSum = 0;
threadSum = shfl_down_reduce_sum(threadSum);
threadSum = shflDownReduceSum(threadSum);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
}
/*if (blockDim.y / 32 >= 32) { if (tid < 16) { data[tid] += data[tid + 16]; } __syncthreads(); }
if (blockDim.y / 32 >= 16) { if (tid < 8) { data[tid] += data[tid + 8]; } __syncthreads(); }
if (blockDim.y / 32 >= 8) { if (tid < 4) { data[tid] += data[tid + 4]; } __syncthreads(); }
if (blockDim.y / 32 >= 4) { if (tid < 2) { data[tid] += data[tid + 2]; } __syncthreads(); }
if (blockDim.y / 32 >= 2) { if (tid < 1) { data[tid] += data[tid + 1]; } __syncthreads(); }
// write result for this block to the output array
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];*/
}
__global__
......@@ -612,8 +568,7 @@ void KernelReduceSumOpLessBlocks(DTYPE * input, DTYPE * output,
bias[threadIdx.x / 32] = shift != NULL ? shift[idy] : 0;
int startIndex = idy * strideNum;
DTYPE threadSum = 0;
for (int i = idx; i < strideNum; i += 32)
{
for (int i = idx; i < strideNum; i += 32) {
DTYPE value = input[startIndex + i] - bias[threadIdx.x / 32];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
......@@ -629,30 +584,20 @@ void KernelReduceSumOpLessBlocks(DTYPE * input, DTYPE * output,
if (isExp) value = exp(value);
threadSum += value;
}
threadSum = shfl_down_reduce_sum(threadSum);
threadSum = shflDownReduceSum(threadSum);
if (idx == 0)
output[idy] = threadSum;
/*__shared__ DTYPE idata[128];
idata[threadIdx.x] = threadSum;
__syncthreads();
if (idx < 16) { idata[threadIdx.x] += idata[threadIdx.x + 16]; }__syncthreads();
if (idx < 8) { idata[threadIdx.x ] += idata[threadIdx.x + 8]; }__syncthreads();
if (idx < 4) { idata[threadIdx.x ] += idata[threadIdx.x + 4]; }__syncthreads();
if (idx < 2) { idata[threadIdx.x ] += idata[threadIdx.x + 2]; }__syncthreads();
if (idx < 1) { idata[threadIdx.x ] += idata[threadIdx.x + 1]; }__syncthreads();
if (idx == 0)
output[idy] = idata[threadIdx.x];*/
}
//PyTorch allocates threads this way; it may hard-code the warp count according to the SM number (20 for the GTX 1080 and 1080 Ti), and it indeed gives better performance
/*
allocate the warp number according to the GPU's SM count
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
int warpNum = 4;
if (vectorNum < 20 * 8)
{
if (vectorNum < 20 * 8) {
warpNum = 8;
if (vectorNum < 20 * 4)
{
if (vectorNum < 20 * 4) {
warpNum = 16;
if (warpNum < 20 * 2)
warpNum = 32;
......@@ -661,6 +606,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
int minWarpNum = vectorSize / 32;
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.y = 1;
grid.z = 1;
......@@ -669,7 +615,9 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
block.z = 1;
}
//in this situation we use block.x * grid.x threads to handle one vector for continuous reads
/*
in this situation we use block.x * grid.x threads to handle one vector for continuous reads
*/
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
{
block.x = 512;
......@@ -681,10 +629,12 @@ inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& blo
grid.y = 1;
}
/*
adjust threads.x so that we can use the warp-level optimization
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
{
if (threads.x > 1)
{
if (threads.x > 1){
blocks.x *= threads.x;
threads.x = 1;
}
......@@ -757,33 +707,24 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10)
{
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y > 128)
KernelReduceSumOp << <grids, blocks >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else
{
KernelReduceSumOpLessBlocks << <blockNum / 4, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
}
//printf("grad %d %d thread %d %d\n", grids.x, grids.y, blocks.x, blocks.y);
KernelReduceSumOpLessBlocks <<<blockNum / 4, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
}
else if (stride != 1 && stride * blockNum > 4096)
{
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelReduceSumDiscontinuousStorage << <grid, block >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, sp, power, isExp);
KernelReduceSumDiscontinuousStorage <<<grid, block >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, sp, power, isExp);
}
else
{
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
......@@ -806,7 +747,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
KernelReduceSum <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -815,7 +756,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
KernelReduceSumFast<64> <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -824,7 +765,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
KernelReduceSumFast<128> <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -833,7 +774,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
KernelReduceSumFast<256> <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -842,7 +783,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
KernelReduceSumFast<512> <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
}
else if (input->dataType == X_FLOAT16) {
......@@ -872,7 +813,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
KernelReduceSum <<<blocks, threads >>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......
......@@ -433,53 +433,42 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
T minData = minValue;
int heapLimit = heap.count / 2;
if (heapLimit % 2 == 0 && heapLimit != 0) heapLimit -= 1;
for (int counter = heap.count - 1; counter >= heapLimit; --counter)
{
for (int counter = heap.count - 1; counter >= heapLimit; --counter) {
if (minData < heap.items[counter].value)
minData = heap.items[counter].value;
}
eachHeapMaxValue[threadIdx.y * blockDim.x + threadIdx.x] = minData;
//needs more optimization
if (i == 0)
{
if (i == 0) {
int threadLimit = (threadIdx.y + 1) * blockDim.x;
CudaXHeap<MIN_HEAP, T> chooseHeap(k, heapData + k * ((blockDim.x * blockDim.y) + threadIdx.y));
int counter = threadIdx.y * blockDim.x;
for (; counter < threadIdx.y * blockDim.x + k; ++counter)
{
for (; counter < threadIdx.y * blockDim.x + k; ++counter) {
chooseHeap.Push(counter, eachHeapMaxValue[counter]);
}
for (; counter < threadLimit; ++counter)
{
if (eachHeapMaxValue[counter]>chooseHeap.items[0].value)
{
for (; counter < threadLimit; ++counter) {
if (eachHeapMaxValue[counter]>chooseHeap.items[0].value) {
chooseHeap.ReplaceTop(counter, eachHeapMaxValue[counter]);
}
}
CudaXHeap<MIN_HEAP, T> ansHeapData(k, k - parameter, heapData + k * chooseHeap.items[0].index);
int miss = parameter;
for (counter = 1; counter < k; ++counter)
{
//printf("%f %d\n",chooseHeap.items[0].value,chooseHeap.items[0].index);
for (counter = 1; counter < k; ++counter) {
chooseHeap.items[0] = chooseHeap.items[chooseHeap.count - 1];
chooseHeap.count--;
chooseHeap.Down(0);
CudaHeapNode<T> * cmpHeapData = heapData + k * (chooseHeap.items[0].index);
int cmpHeapLimit = 0;
if (counter + heapLimit <= k - parameter)
{
if (counter + heapLimit <= k - parameter){
cmpHeapLimit = heapLimit;
}
//take the max data from the min heap, so start searching from the leaf nodes
for (int iterator = k - 1 - parameter; iterator >= cmpHeapLimit; --iterator)
{
if (miss > 0)
{
/* take the max data from the min heap, so start searching from the leaf nodes */
for (int iterator = k - 1 - parameter; iterator >= cmpHeapLimit; --iterator){
if (miss > 0){
ansHeapData.Push(cmpHeapData[iterator].index, cmpHeapData[iterator].value);
miss--;
}
else if (ansHeapData.items[0].value < cmpHeapData[iterator].value)
{
else if (ansHeapData.items[0].value < cmpHeapData[iterator].value){
ansHeapData.ReplaceTop(cmpHeapData[iterator].index, cmpHeapData[iterator].value);
}
}
......@@ -487,8 +476,7 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
int offset = stride * k * blockIndex + offsetInBlock;
T * dOutput = output + offset;
int * indexOutput = index + offset;
for (int q = 0; q < k; ++q)
{
for (int q = 0; q < k; ++q){
dOutput[stride * q] = ansHeapData.items[q].value;
indexOutput[stride * q] = ansHeapData.items[q].index;
}
......@@ -496,52 +484,61 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
}
__device__ __forceinline__ unsigned getLaneMaskLe() {
__device__ __forceinline__
unsigned getLaneMaskLe()
{
unsigned mask;
asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
return mask;
}
__device__ __forceinline__ int getLaneId() {
__device__ __forceinline__
int getLaneId()
{
int laneId;
asm("mov.s32 %0, %laneid;" : "=r"(laneId));
return laneId;
}
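/* Added note: %lanemask_le has one bit set for every lane id not greater than the
   calling lane, e.g. 0x0000003f for lane 5, so __popc(getLaneMaskLe() & vote)
   further below yields an inclusive prefix count of the lanes that voted true. */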
__device__ unsigned convert(float v)
__device__
unsigned convert(float v)
{
unsigned x = __float_as_int(v);
unsigned mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
return (x ^ mask);
}
__device__ float convert(unsigned int v)
__device__
float convert(unsigned int v)
{
float x = __uint_as_float(v);
return x;
}
__device__ float deconvert(unsigned int v) {
unsigned int mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
__device__
float deconvert(unsigned int v)
{
unsigned int mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
return __int_as_float(v ^ mask);
}
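/* Worked example of the order-preserving mapping above (illustrative):
   convert(-1.0f) = 0x407fffff, convert(0.0f) = 0x80000000 and
   convert(1.0f) = 0xbf800000, so comparing the converted values as unsigned
   integers matches the usual float ordering; deconvert() inverts the mapping. */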
__global__ void convert2uintV2(float* input, unsigned int *output, int stride, int strideNum, int blockNum, int size)
__global__
void convert2uintV2(float* input, unsigned int *output, int stride, int strideNum, int blockNum, int size)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//int strideNum = (int)strideNumSize;
//if (flag) strideNum = strideNumSize[idy];
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x)
{
i += stride * blockDim.x){
output[i] = convert(input[i]);
}
}
__global__ void deconvert2floatV2(unsigned int * input, float *output, int stride, int strideNum, int blockNum, int size)
__global__
void deconvert2floatV2(unsigned int * input, float *output, int stride, int strideNum, int blockNum, int size)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -552,13 +549,13 @@ __global__ void deconvert2floatV2(unsigned int * input, float *output, int strid
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x)
{
i += stride * blockDim.x){
output[i] = deconvert(input[i]);
}
}
__device__ void radixCount(unsigned int *data, int limit, int *pos_count, unsigned int mask, int mask_desire, unsigned int desire, int stride, int strideNum, int blockNum)
__device__
void radixCount(unsigned int *data, int limit, int *posCount, unsigned int mask, int maskDesire, unsigned int desire, int stride, int strideNum, int blockNum)
{
/*the idx th thread in one vector */
......@@ -569,149 +566,141 @@ __device__ void radixCount(unsigned int *data, int limit, int *pos_count, unsign
int offsetInBlock = idy% stride;
for (int j = idx*stride + stride * strideNum * blockIndex + offsetInBlock;
j< stride * strideNum * blockIndex + offsetInBlock + stride*strideNum && j<limit;
j += stride * WORKERSNUM)
{
// printf("idx:%d, idy:%d,j:%d,addpos:%d\n",idx,idy,j, (idy % WORKERSNUM)*blockDim.x + idx);
if ((data[j] & mask_desire) == desire)
{
if (data[j] & mask)
{
pos_count[(idy % (512 / WORKERSNUM))*blockDim.x + idx]++;
j += stride * WORKERSNUM) {
if ((data[j] & maskDesire) == desire) {
if (data[j] & mask) {
posCount[(idy % (512 / WORKERSNUM))*blockDim.x + idx]++;
}
}
// printf("Radix Count: %d Idx: %d,Idy: %d,end: %d\n", j,idx,idy, stride * strideNum * blockIndex + offsetInBlock + stride*strideNum);
}
}
//the thread count needs to be a multiple of 32
__device__ void gpu_check_warp(int *smem, bool in, int *carry, int *index)
/* We can use this approach to quickly check thread status within a warp;
note that the thread count needs to be a multiple of 32 */
__device__
void gpuCheckWarp(int *smem, bool in, int *carry, int *index)
{
int vote = __ballot_sync(0xffffffff, in);
*index = __popc(getLaneMaskLe() & vote);
*carry = __popc(vote);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int warp = idx / 32; //idx 0 -- blockDim.x
int warp_num = blockDim.x / 32;//how many warps each vector uses
if (getLaneId() == 0)
{
smem[warp + warp_num * threadIdx.y] = *carry; //save each warp carry
//printf("%d ", warp + warp_num * threadIdx.y);
int warp = idx / 32;
int warpNum = blockDim.x / 32;
if (getLaneId() == 0) {
/* save each warp carry */
smem[warp + warpNum * threadIdx.y] = *carry;
}
__syncthreads();
if (idx == 0) //use one thread to accumulate the per-warp carries for the whole row
{
for (int i = 1 + warp_num * threadIdx.y; i < warp_num * (threadIdx.y + 1); ++i)
{
/* use one thread to accumulate the per-warp carries for the whole row */
if (idx == 0) {
for (int i = 1 + warpNum * threadIdx.y; i < warpNum * (threadIdx.y + 1); ++i) {
smem[i] += smem[i - 1];
}
}
__syncthreads();
if (warp % warp_num)
{
*index += smem[warp_num * threadIdx.y + warp - 1];
if (warp % warpNum) {
*index += smem[warpNum * threadIdx.y + warp - 1];
}
*carry = smem[warp_num * threadIdx.y + warp_num - 1];
*carry = smem[warpNum * threadIdx.y + warpNum - 1];
}
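/* Added note on the outputs above: for a thread whose in flag is true, *index is its
   1-based rank among the true threads of the same threadIdx.y row and *carry is the
   total number of true threads in that row, which is how collectNumber below writes
   ans[base + (*index - 1) * stride] and then advances base by *carry * stride. */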
__device__ void collect_number(unsigned int *data, int stride, int strideNum, int limit, unsigned int pattern, float *ans, int *ansIndex, int k)
/*
collect the data bigger than the pattern and return it in ans
*/
__device__
void collectNumber(unsigned int *data, int stride, int strideNum, int limit,
unsigned int pattern, float *ans, int *ansIndex, int k)
{
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
__shared__ int smem[32]; //for counting each warp's temporary carry
/* for counting each warp's temporary carry */
__shared__ int smem[32];
int carry;
int index;
int vector_limit = stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
int alibn_strideNum = strideNum;
if (alibn_strideNum % blockDim.x) alibn_strideNum = alibn_strideNum + blockDim.x - (alibn_strideNum % blockDim.x);
int vector_alibn_limit = stride * strideNum * blockIndex + offsetInBlock + stride * alibn_strideNum;
int ans_array_index = stride * k * blockIndex + offsetInBlock;
int vectorLimit = stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
int alibnStrideNum = strideNum;
if (alibnStrideNum % blockDim.x) alibnStrideNum = alibnStrideNum + blockDim.x - (alibnStrideNum % blockDim.x);
int vectorAlibnLimit = stride * strideNum * blockIndex + offsetInBlock + stride * alibnStrideNum;
int ansArrayIndex = stride * k * blockIndex + offsetInBlock;
int ans_size = 0;
int ansSize = 0;
__syncthreads();
#pragma unroll
for (int i = idx*stride + stride * strideNum * blockIndex + offsetInBlock;
i < vector_alibn_limit;
i += stride * WORKERSNUM)
{
bool has_topk = false;
if (i < vector_limit&&data[i] > pattern)
{
has_topk = true;
}
gpu_check_warp(smem, has_topk, &carry, &index);
if (carry>0)
{
if (has_topk)
{
ans[ans_array_index + (index - 1) * stride] = deconvert(data[i]);
ansIndex[ans_array_index + (index - 1) * stride] = i - stride * strideNum * blockIndex;
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < vectorAlibnLimit; i += stride * WORKERSNUM){
bool hasTopk = false;
if (i < vectorLimit&&data[i] > pattern){
hasTopk = true;
}
gpuCheckWarp(smem, hasTopk, &carry, &index);
if (carry > 0) {
if (hasTopk) {
ans[ansArrayIndex + (index - 1) * stride] = deconvert(data[i]);
ansIndex[ansArrayIndex + (index - 1) * stride] = i - stride * strideNum * blockIndex;
}
ans_array_index += carry * stride;
ans_size += carry;
ansArrayIndex += carry * stride;
ansSize += carry;
}
__syncthreads();
}
if (ans_size < k)
{
int ramind_num = k - ans_size;
if (ansSize < k){
int ramindNum = k - ansSize;
#pragma unroll
for (int i = idx*stride + stride * strideNum * blockIndex + offsetInBlock;
i < vector_alibn_limit;
i += stride * WORKERSNUM)
{
bool has_topk = false;
if (i < vector_limit&&data[i] == pattern)
{
has_topk = true;
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock; i < vectorAlibnLimit; i += stride * WORKERSNUM) {
bool hasTopk = false;
if (i < vectorLimit && data[i] == pattern) {
hasTopk = true;
}
gpu_check_warp(smem, has_topk, &carry, &index);
if (carry>0)
{
int check_tmp_index = ans_array_index + (index - 1) * stride;
// to avoid overflowing the output boundary: for instance, if only one more index is needed but two indices qualify, we should filter out the larger one
if (has_topk && check_tmp_index <stride * k * blockIndex + offsetInBlock + stride * k)
{
ans[check_tmp_index] = deconvert(pattern);
ansIndex[check_tmp_index] = i - stride * strideNum * blockIndex;
gpuCheckWarp(smem, hasTopk, &carry, &index);
if (carry>0) {
int checkTmpIndex = ansArrayIndex + (index - 1) * stride;
/* to avoid overflowing the output boundary: for instance,
if only one more index is needed but two indices qualify, we should filter out the larger one */
if (hasTopk && checkTmpIndex <stride * k * blockIndex + offsetInBlock + stride * k) {
ans[checkTmpIndex] = deconvert(pattern);
ansIndex[checkTmpIndex] = i - stride * strideNum * blockIndex;
}
ramind_num -= carry;
ans_array_index += carry * stride;
if (ramind_num <= 0) break;
ramindNum -= carry;
ansArrayIndex += carry * stride;
if (ramindNum <= 0) break;
}
__syncthreads();
}
}
}
__device__ void collect_number_old(unsigned int *data, int n, int k, unsigned int pattern, unsigned int *ans, int *indexNum, int stride, int strideNum)
/*
This is the old way: one thread collects the numbers, which is very slow, so we drop it
*/
__device__
void collectNumberOld(unsigned int *data, int n, int k, unsigned int pattern, unsigned int *ans, int *indexNum, int stride, int strideNum)
{
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
int cot = 0;
for (int i = stride * strideNum * blockIndex + offsetInBlock, j = 0; j < strideNum; j++, i += stride)
{
if (data[i] > pattern)
{
for (int i = stride * strideNum * blockIndex + offsetInBlock, j = 0; j < strideNum; j++, i += stride) {
if (data[i] > pattern) {
ans[cot] = data[i];
indexNum[cot++] = j;
}
}
/* if cot < k, the remaining values must equal the desired pattern */
if (cot < k)
{
for (int i = cot; i < k; ++i)
{
/* if cot < k, the remaining values must equal the desired pattern */
if (cot < k) {
for (int i = cot; i < k; ++i) {
ans[i] = pattern;
}
//count the remaining indices whose data value equals the pattern
for (int i = stride * strideNum * blockIndex + offsetInBlock, j = 0; j < strideNum; j++, i += stride)
{
if (data[i] == pattern)
{
/* count the remaining indices whose data value equals the pattern */
for (int i = stride * strideNum * blockIndex + offsetInBlock, j = 0; j < strideNum; j++, i += stride) {
if (data[i] == pattern) {
indexNum[cot++] = j;
if (cot == k) break;
}
......@@ -719,8 +708,12 @@ __device__ void collect_number_old(unsigned int *data, int n, int k, unsigned in
}
}
/*
When k is very big, we cannot use shared memory for the calculation, so we use the radix select algorithm
*/
template<class T> __global__
void KernelTopKRadixSelect(unsigned int * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int* index, int limit)
void KernelTopKRadixSelect(unsigned int * input, int stride, int strideNum,
int blockNum, int k, T minValue, T * output, int* index, int limit)
{
/* the idx th thread in one vector */
int idx = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -733,73 +726,71 @@ void KernelTopKRadixSelect(unsigned int * input, int stride, int strideNum, int
if (idy >= stride *blockNum) return;
int mask_desire = 0;
int maskDesire = 0;
unsigned int mask = 0x80000000;
unsigned int desire = 0;
__shared__ int pos_count[32 * 32];
int tmp_k = k;
//if (idx == 0)
//printf("%d %d blockSize: <%d ,%d>\n", idx + blockDim.x*idy,idy, blockDim.x, blockDim.y);
__shared__ int posCount[32 * 32];
int tmpK = k;
int flag = 1;
#pragma unroll
for (int i = 0; i < 32; i++)
{
//we need to clear the shared memory on every loop iteration
for (int i = 0; i < 32; i++){
/* we need to clear the shared memory on every loop iteration */
pos_count[idx + blockDim.x*(idy % (512 / WORKERSNUM))] = 0;
posCount[idx + blockDim.x*(idy % (512 / WORKERSNUM))] = 0;
if (flag)
radixCount(input, stride*strideNum*blockNum, pos_count, mask, mask_desire, desire, stride, strideNum, blockNum);
radixCount(input, stride*strideNum*blockNum, posCount, mask, maskDesire, desire, stride, strideNum, blockNum);
__syncthreads();
int sumCount = 0;
#pragma unroll
for (int j = 0; j < WORKERSNUM; j++)
{
sumCount += pos_count[(idy % (512 / WORKERSNUM))*blockDim.x + j];
for (int j = 0; j < WORKERSNUM; j++) {
sumCount += posCount[(idy % (512 / WORKERSNUM))*blockDim.x + j];
}
__syncthreads();
if (tmp_k<sumCount)//this position should be 1
{
if (tmpK<sumCount) {
/* this position should be 1 */
desire = mask^desire;
}
else //reduce the remaining k; this bit position should be 0
{
tmp_k = tmp_k - sumCount;
if (tmp_k == 0)
{
desire = (~(mask_desire >> 1)) | desire;
// avoid Synchronize deadlock
else {
/* reduce the remaining k; this bit position should be 0 */
tmpK = tmpK - sumCount;
if (tmpK == 0){
desire = (~(maskDesire >> 1)) | desire;
/* avoid a synchronization deadlock: we can't use break, so we use a flag */
//break;
flag = 0;
}
}
mask_desire = mask^mask_desire;
maskDesire = mask^maskDesire;
mask = mask >> 1;
}
__syncthreads();
//if (idx == 0)
//{
// unsigned int* uintOutput = new unsigned int;
// int* tmpIndex = new int;
// //*******************something wrong***************************
// cudaMalloc((void **)&uintOutput, sizeof(unsigned int)* k);
// cudaMalloc((void **)&tmpIndex, sizeof(unsigned int)*k);
// //*************************************************************
// collect_number_old(input, limit, k, desire, uintOutput, tmpIndex, stride, strideNum);
// int blockIndex = idy / stride;
// int offsetInBlock = idy% stride;
// for (int i = stride * k * blockIndex + offsetInBlock, j = 0; j < k; j++, i += stride)
// {
// //for(int i = )
// output[i] = deconvert(uintOutput[j]);
// index[i] = tmpIndex[j];
// }
//}
//__syncthreads();
collect_number(input, stride, strideNum, limit, desire, output, index, k);
/* old way to collect number */
/*
if (idx == 0)
{
unsigned int* uintOutput = new unsigned int;
int* tmpIndex = new int;
//*******************something wrong***************************
cudaMalloc((void **)&uintOutput, sizeof(unsigned int)* k);
cudaMalloc((void **)&tmpIndex, sizeof(unsigned int)*k);
//*************************************************************
collectNumberOld(input, limit, k, desire, uintOutput, tmpIndex, stride, strideNum);
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
for (int i = stride * k * blockIndex + offsetInBlock, j = 0; j < k; j++, i += stride)
{
//for(int i = )
output[i] = deconvert(uintOutput[j]);
index[i] = tmpIndex[j];
}
}
__syncthreads();
*/
collectNumber(input, stride, strideNum, limit, desire, output, index, k);
}
/*
......@@ -828,13 +819,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int workerNum = blockNum < 16 ? 64 : 32; // should be tuned for better performance
/*adjust the thread number according to the size of k so that it fits in shared memory*/
int workerNum = blockNum < 16 ? 64 : 32;
/* adjust the thread number according to the size of k so that it fits in shared memory */
if (k< 6) workerNum = 512;
else if (k < 11) workerNum = 256;
else if (k < 22) workerNum = 128;
else if (k < 44) workerNum = 64;
else workerNum = 32;
int cudaGrids[3];
int cudaBlocks[3];
......@@ -842,22 +834,6 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
workerNum, stride * blockNum, MAX_INT,
cudaGrids, cudaBlocks);
/*for (int i = 0; i < 2; i++) {
if ((cudaBlocks[0] * cudaBlocks[1] + 1) * k * (a->unitSize + sizeof(int)) >= SHARED_MEMORY_SIZE) {
if (cudaBlocks[1] >= 2 && cudaBlocks[1] % 2 == 0) {
cudaBlocks[1] /= 2;
cudaGrids[1] *= 2;
}
}
if ((cudaBlocks[0] * cudaBlocks[1] + 1) * k * (a->unitSize + sizeof(int)) >= SHARED_MEMORY_SIZE) {
if (cudaBlocks[0] >= 2 && cudaBlocks[0] % 2 == 0) {
cudaBlocks[0] /= 2;
cudaGrids[0] *= 2;
}
}
}*/
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
......@@ -866,7 +842,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
cudaBlocks[1] = 1;
if ((cudaBlocks[0] * cudaBlocks[1] + 1) * k * (a->unitSize + sizeof(int)) < SHARED_MEMORY_SIZE) {
if (a->dataType == DEFAULT_DTYPE) {
KernelTopK2<DTYPE> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
KernelTopK3<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)a->data, stride, strideNumA, blockNum, k, DTYPE_MIN,
(DTYPE*)b->data, (int*)index->data);
}
......@@ -896,17 +872,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
GDevs.GetCudaThread2D(a->mem->devID,
workerNum, stride * blockNum, MAX_INT,
cudaGrids, cudaBlocks);
//printf("dim is %d %d %d %d\n", cudaGrids[0], cudaGrids[1], cudaBlocks[0], cudaBlocks[1]);
if (a->dataType == DEFAULT_DTYPE) {
unsigned int* goutput = (unsigned int *)a->data;
//both ways take almost the same time
convert2uintV2 << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((float*)a->data, goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride);
/* both ways take almost the same time to convert the data */
convert2uintV2 <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> ((float*)a->data, goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride);
//convert2uintV2 << <dim3(1, stride * blockNum), dim3(512,1) >> >((float*)a->data, goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride);
KernelTopKRadixSelect<DTYPE> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > (goutput, stride, strideNumA, blockNum, k, DTYPE_MIN, (DTYPE *)b->data, (int *)index->data, stride * strideNumA * blockNum);
deconvert2floatV2 << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((unsigned int *)a->data, (float *)goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride);
//int *indexTensorData = (int *)malloc(4 * strideNumA*blockNum*stride);
//cudaMemcpy(indexTensorData, index->data, sizeof(DTYPE)*index->unitNum, cudaMemcpyDeviceToHost);
KernelTopKRadixSelect<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> (goutput, stride, strideNumA, blockNum, k, DTYPE_MIN, (DTYPE *)b->data, (int *)index->data, stride * strideNumA * blockNum);
deconvert2floatV2 <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> ((unsigned int *)a->data, (float *)goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride);
}
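/* Added summary of the path above (illustrative): the float keys are first rewritten
   in place as order-preserving unsigned integers (convert2uintV2), KernelTopKRadixSelect
   then selects the top k of every group bit by bit into b->data and index->data, and
   deconvert2floatV2 finally restores the original float values in a->data. */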
}
......
......@@ -155,7 +155,11 @@ void KernelSoftmaxComputeTensor(__half * x, __half * max, __half * sum, __half *
}
}
__device__ __forceinline__ float broadCast(float input)
/*
use PTX code to broadcast float data
*/
__device__ __forceinline__
float broadcast(float input)
{
float output;
asm(
......@@ -167,28 +171,28 @@ __device__ __forceinline__ float broadCast(float input)
return output;
}
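/* For reference, a minimal intrinsic-based sketch (ours, with the hypothetical
   name broadcastIntrinsic) of the same lane-0 broadcast; the commented-out
   __shfl_sync calls further down use the same idea. */
__device__ __forceinline__
float broadcastIntrinsic(float input)
{
    /* every lane receives the value held by lane 0 of the warp */
    return __shfl_sync(0xffffffff, input, 0);
}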
/*
use warp-level broadcast to optimize the softmax computation
*/
__global__
void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * sum, DTYPE * output, int stride, int strideNum, int blockNum)
void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * sum, DTYPE * output,
int stride, int strideNum, int blockNum)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
int i2 = j % stride;
int blockSize = stride * strideNum;
if (j < stride * blockNum)
{
if (j < stride * blockNum) {
DTYPE sumData, maxData;
if (i % 32 == 0)
{
if (i % 32 == 0) {
sumData = sum[j];
maxData = max[j];
}
//sumData = __shfl_sync(0xffffffff,sumData, 0);
//maxData = __shfl_sync(0xffffffff,maxData, 0);
sumData = broadCast(sumData);
maxData = broadCast(maxData);
if (i < strideNum)
{
sumData = broadcast(sumData);
maxData = broadcast(maxData);
if (i < strideNum){
int offset = int(j / stride) * blockSize + i * stride + i2;
output[offset] = exp(input[offset] - maxData) / sumData;
}
......@@ -223,20 +227,18 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
int cudaGridSize[3];
int cudaBlockSize[3];
if (leadDim != 0 || dimensionSize <= 10)
{
//allocate thread num for old function
if (leadDim != 0 || dimensionSize <= 10){
/* allocate thread num for old function */
GDevs.GetCudaThread2D(x->devID, stride * blockNum, dimensionSize, MAX_INT, cudaGridSize, cudaBlockSize);
}
else
{
//allocate thread num for new function
else {
/* allocate thread num for new function */
GDevs.GetCudaThread2D(x->devID, dimensionSize, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
if (cudaBlockSize[0] < 32)
{
cudaBlockSize[0] = 32;//use at least a warp
if (cudaBlockSize[1] > 32)
{
if (cudaBlockSize[0] < 32) {
/* use at least a warp */
cudaBlockSize[0] = 32;
if (cudaBlockSize[1] > 32) {
cudaGridSize[1] = int(ceil(float(stride * blockNum) / 32));
cudaBlockSize[1] = 32;
}
......@@ -246,23 +248,21 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
ProtectCudaDev(x->devID, devIDBackup);
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
if (leadDim != 0 || dimensionSize <= 10)
{
KernelSoftmaxComputeTensor << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
if (leadDim != 0 || dimensionSize <= 10) {
KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
}
else
{
KernelSoftmaxComputeTensorUseBroadcast << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
stride, dimensionSize, blockNum);
else {
KernelSoftmaxComputeTensorUseBroadcast <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
stride, dimensionSize, blockNum);
}
}
else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
KernelSoftmaxComputeTensor<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>
((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
stride, dimensionSize, blockNum);
KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
stride, dimensionSize, blockNum);
}
else{
ShowNTErrors("TODO!");
......