/*
PyTorch allocates threads in a similar way; it appears to hard-code the choice based on
the SM count (20 on the GTX 1080 and 1080 Ti), and this does give better performance.
Here we pick the number of warps per vector according to the GPU's SM count.
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
    int warpNum = 4;
    if (vectorNum < 20 * 8) {
        warpNum = 8;
        if (vectorNum < 20 * 4) {
            warpNum = 16;
            if (vectorNum < 20 * 2)
                warpNum = 32;
...
    int minWarpNum = vectorSize / 32;
    if (vectorSize % 32 != 0) minWarpNum++;
    warpNum = min(warpNum, minWarpNum);

    grid.x = vectorNum;
    grid.y = 1;
    grid.z = 1;
...
    block.z = 1;
}
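/*
Illustration (hypothetical values, not from the original file): for 50 vectors of
length 200 on a 20-SM GPU such as the GTX 1080, 50 < 20 * 4 raises warpNum to 16,
but one vector only needs ceil(200 / 32) = 7 warps, so warpNum is clamped to 7 and
each vector gets its own grid.x entry.
*/
inline void exampleContinuousAllocation()
{
    dim3 grid, block;
    continuousStorageThreadAllocation(grid, block, /*vectorNum=*/50, /*vectorSize=*/200);
    /* here grid = (50, 1, 1); presumably the elided lines set block.x/block.y so the
       block covers the clamped warpNum * 32 threads */
}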
/*
in this case block.x * grid.x threads process one vector, so that reads are continuous (coalesced)
*/
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
...
/* map an order-preserving unsigned key back to the original float value */
__device__
float deconvert(unsigned int v)
{
    unsigned int mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
    return __int_as_float(v ^ mask);
}
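/*
The forward mapping convert() is not shown in this excerpt. Judging from deconvert()
above, it is the usual order-preserving float-to-uint transform used for radix select;
a minimal sketch (an assumption, not necessarily the original implementation):
*/
__device__
unsigned int convertSketch(float v)
{
    unsigned int u = (unsigned int)__float_as_int(v);
    /* negative floats: flip all bits; non-negative floats: set the sign bit,
       so unsigned comparison of the keys matches float comparison */
    unsigned int mask = (u & 0x80000000) ? 0xffffffff : 0x80000000;
    return u ^ mask;
}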
__global__
void convert2uintV2(float* input, unsigned int *output, int stride, int strideNum, int blockNum, int size)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int blockIndex = idy / stride;
    int offsetInBlock = idy % stride;
    #pragma unroll
    for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
         i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
         i += stride * blockDim.x) {
        output[i] = convert(input[i]);
    }
}
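/*
Reference only (hypothetical host-side sketch, not part of the original file; needs <cstring>):
the kernel above views the data as blockNum blocks, each holding strideNum rows of length
stride. Thread idy owns one (blockIndex, offsetInBlock) column, and idx strides over the
rows in steps of blockDim.x. The same traversal written sequentially:
*/
inline unsigned int hostConvertSketch(float v)
{
    unsigned int u;
    memcpy(&u, &v, sizeof(u));                       /* host stand-in for __float_as_int */
    return u ^ ((u & 0x80000000) ? 0xffffffffu : 0x80000000u);
}
inline void hostConvert2uintSketch(const float* input, unsigned int* output,
                                   int stride, int strideNum, int blockNum, int size)
{
    for (int blockIndex = 0; blockIndex < blockNum; blockIndex++)
        for (int row = 0; row < strideNum; row++)
            for (int offsetInBlock = 0; offsetInBlock < stride; offsetInBlock++) {
                int i = stride * strideNum * blockIndex + row * stride + offsetInBlock;
                if (i < size)
                    output[i] = hostConvertSketch(input[i]);
            }
}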
__global__
void deconvert2floatV2(unsigned int * input, float *output, int stride, int strideNum, int blockNum, int size)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
...
    #pragma unroll
    for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
         i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
         i += stride * blockDim.x) {
        output[i] = deconvert(input[i]);
    }
}
__device__
void radixCount(unsigned int *data, int limit, int *posCount, unsigned int mask, int maskDesire, unsigned int desire, int stride, int strideNum, int blockNum)
{
    /* the idx-th thread in one vector */
...
        for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
             i < vectorAlibnLimit; i += stride * WORKERSNUM) {
            bool hasTopk = false;
            if (i < vectorLimit && data[i] == pattern) {
                hasTopk = true;
            }
            gpuCheckWarp(smem, hasTopk, &carry, &index);
            if (carry > 0) {
                int checkTmpIndex = ansArrayIndex + (index - 1) * stride;
                /* avoid running past the output boundary: if, say, only one more index is
                   needed but two candidates match, the larger index must be dropped */
                if (hasTopk && checkTmpIndex < stride * k * blockIndex + offsetInBlock + stride * k) {
                    ans[checkTmpIndex] = deconvert(pattern);
                    ansIndex[checkTmpIndex] = i - stride * strideNum * blockIndex;
                }
                ramindNum -= carry;
                ansArrayIndex += carry * stride;
                if (ramindNum <= 0) break;
            }
            __syncthreads();
        }
    }
}
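/*
gpuCheckWarp() is not shown in this excerpt. From the way carry and index are used above,
it reports how many cooperating threads found a match (carry) and gives each matching
thread its 1-based rank among them (index); the real helper presumably aggregates over all
WORKERSNUM threads through smem. A minimal warp-only sketch of the idea (an assumption,
assuming warps run along threadIdx.x; not the original helper):
*/
__device__
void gpuCheckWarpSketch(bool hasTopk, int *carry, int *index)
{
    unsigned int ballot = __ballot_sync(0xffffffff, hasTopk);
    int lane = threadIdx.x & 31;
    *carry = __popc(ballot);                               /* matches in this warp */
    *index = __popc(ballot & ((1u << lane) - 1u)) + 1;     /* my rank among the matches */
}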
/*
This is the old approach: a single thread collects the numbers.
It is very slow, so it is no longer used.
*/
__device__
void collectNumberOld(unsigned int *data, int n, int k, unsigned int pattern, unsigned int *ans, int *indexNum, int stride, int strideNum)
{
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int blockIndex = idy / stride;
    int offsetInBlock = idy % stride;
    int cot = 0;
    for (int i = stride * strideNum * blockIndex + offsetInBlock, j = 0; j < strideNum; j++, i += stride) {
        if (data[i] > pattern) {
            ans[cot] = data[i];
            indexNum[cot++] = j;
        }
    }
    /* if cot < k, the remaining answers must all equal the desired pattern */
    if (cot < k) {
        for (int i = cot; i < k; ++i) {
            ans[i] = pattern;
        }
        /* collect the remaining indices; the values there must equal the pattern */
        for (int i = stride * strideNum * blockIndex + offsetInBlock, j = 0; j < strideNum; j++, i += stride) {
            if (data[i] == pattern) {
                indexNum[cot++] = j;
                if (cot == k) break;
            }
...
    }
}
/*
When k is very large, the candidates no longer fit in shared memory,
so we fall back to the radix select algorithm.
*/
template<class T> __global__
void KernelTopKRadixSelect(unsigned int * input, int stride, int strideNum,
                           int blockNum, int k, T minValue, T * output, int* index, int limit)
{
    /* the idx-th thread in one vector */
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
...
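/*
The elided kernel body follows the standard radix-select scheme that radixCount() supports:
scan the converted keys bit by bit from the most significant bit and narrow down the bit
pattern of the k-th largest element. A sequential reference of that idea (hypothetical,
for clarity only; the real kernel does the counting in parallel across threads):
*/
inline unsigned int radixSelectSketch(const unsigned int *keys, int n, int k)
{
    unsigned int desire = 0, maskDesire = 0;
    for (int bit = 31; bit >= 0; bit--) {
        unsigned int mask = 1u << bit;
        /* count candidates that match the prefix found so far and have this bit set */
        int count = 0;
        for (int i = 0; i < n; i++)
            if ((keys[i] & maskDesire) == desire && (keys[i] & mask))
                count++;
        if (count >= k)
            desire |= mask;   /* the k-th largest has this bit set */
        else
            k -= count;       /* it lies among the keys with this bit clear */
        maskDesire |= mask;
    }
    return desire;            /* bit pattern (converted form) of the k-th largest key */
}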