unsigned int mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
return __int_as_float(v ^ mask);
}
/*
convert each float element to an order-preserving unsigned int key
(via the sibling `convert` helper), element-by-element, in place layout.

Thread layout: y picks a (segment, offsetInBlock) pair; x grid-strides
along the segment. Every write is guarded by `i < size`.

>> input - device pointer to the source float data
>> output - device pointer to the converted unsigned int keys
>> stride - distance between consecutive elements of one segment
>> strideNum - number of elements per segment
>> blockNum - number of segments (unused here; kept for interface parity)
>> size - total number of elements in input/output
*/
__global__ void convert2uintV2(float* input, unsigned int *output, int stride, int strideNum, int blockNum, int size)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int blockIndex = idy / stride;
    int offsetInBlock = idy % stride;
    int segmentBase = stride * strideNum * blockIndex + offsetInBlock;
    int segmentEnd = segmentBase + stride * strideNum;
    /* proper grid-stride step: includes gridDim.x so different x-blocks
       cover disjoint elements. The previous step of stride * blockDim.x
       made blocks with blockIdx.x > 0 redundantly rewrite elements block 0
       already handled (same output values, duplicated work). */
#pragma unroll
    for (int i = idx * stride + segmentBase;
         i < segmentEnd && i < size;
         i += stride * blockDim.x * gridDim.x)
    {
        output[i] = convert(input[i]);
    }
}
/*
convert each order-preserving unsigned int key back to its float value
(via the sibling `deconvert` helper) — the inverse of convert2uintV2.

Thread layout: y picks a (segment, offsetInBlock) pair; x grid-strides
along the segment. Every write is guarded by `i < size`.

>> input - device pointer to the unsigned int keys
>> output - device pointer to the recovered float data
>> stride - distance between consecutive elements of one segment
>> strideNum - number of elements per segment
>> blockNum - number of segments (unused here; kept for interface parity)
>> size - total number of elements in input/output
*/
__global__ void deconvert2floatV2(unsigned int * input, float *output, int stride, int strideNum, int blockNum, int size)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int blockIndex = idy / stride;
    int offsetInBlock = idy % stride;
    int segmentBase = stride * strideNum * blockIndex + offsetInBlock;
    int segmentEnd = segmentBase + stride * strideNum;
    /* proper grid-stride step: includes gridDim.x so different x-blocks
       cover disjoint elements. The previous step of stride * blockDim.x
       made blocks with blockIdx.x > 0 redundantly rewrite elements block 0
       already handled (same output values, duplicated work). */
#pragma unroll
    for (int i = idx * stride + segmentBase;
         i < segmentEnd && i < size;
         i += stride * blockDim.x * gridDim.x)
    {
        output[i] = deconvert(input[i]);
    }
}
/* NOTE(review): this span looks like a corrupted/incomplete fragment. The
   `radixCount` signature below has no opening `{`, and the loop that follows
   references names not declared anywhere in view (idx, blockIndex,
   offsetInBlock, vector_alibn_limit, vector_limit, pattern, smem, carry,
   index, ans_array_index, ramind_num, ans, ansIndex, k, WORKERSNUM).
   Lines appear to be missing between the signature and the loop — confirm
   against the full file before touching any logic here. */
__device__ void radixCount(unsigned int *data, int limit, int *pos_count, unsigned int mask, int mask_desire, unsigned int desire, int stride, int strideNum, int blockNum)
/* grid-stride style scan over one segment; presumably collects the
   positions of elements equal to the selected pivot `pattern` — TODO confirm */
for (int i = idx*stride + stride * strideNum * blockIndex + offsetInBlock;
i < vector_alibn_limit;
i += stride * WORKERSNUM)
{
bool has_topk = false;
/* only in-range elements that match the pivot participate */
if (i < vector_limit&&data[i] == pattern)
{
has_topk = true;
}
/* warp-level vote/scan helper: `carry` = matches in this warp,
   `index` = this lane's rank among them — semantics assumed from usage */
gpu_check_warp(smem, has_topk, &carry, &index);
if (carry>0)
{
int check_tmp_index = ans_array_index + (index - 1) * stride;
/* guard against writing past the k answer slots: when more elements
   match the pivot than slots remain, drop the higher-ranked matches */
if (has_topk && check_tmp_index <stride * k * blockIndex + offsetInBlock + stride * k)
{
ans[check_tmp_index] = deconvert(pattern);
ansIndex[check_tmp_index] = i - stride * strideNum * blockIndex;
}
ramind_num -= carry;
ans_array_index += carry * stride;
/* NOTE(review): this break exits before the __syncthreads() below; if
   threads of the same block disagree on `ramind_num <= 0`, the barrier
   is reached by only part of the block — classic divergence hazard,
   verify all threads see the same ramind_num */
if (ramind_num <= 0) break;
}
__syncthreads();
}
}
}
/*
collect the top-k keys and their in-segment indices for one segment,
given the pivot `pattern` (the k-th largest converted key).

Pass 1 copies every key strictly greater than the pivot (with its index j).
If fewer than k were found, the remaining answer slots must hold the pivot
itself: pass 2 fills `ans` with `pattern` and records the indices of
pivot-valued elements until exactly k entries exist.

>> data - converted unsigned int keys
>> n - unused (kept for interface parity)
>> k - number of answers to produce
>> pattern - the pivot key selected by the radix search
>> ans - output keys (k entries)
>> indexNum - output in-segment indices (k entries)
>> stride - distance between consecutive elements of one segment
>> strideNum - number of elements per segment
*/
__device__ void collect_number_old(unsigned int *data, int n, int k, unsigned int pattern, unsigned int *ans, int *indexNum, int stride, int strideNum)
{
    int rowId = blockDim.y * blockIdx.y + threadIdx.y;
    int segId = rowId / stride;
    int segOffset = rowId % stride;
    int base = stride * strideNum * segId + segOffset;

    /* pass 1: everything strictly above the pivot belongs in the answer */
    int found = 0;
    for (int j = 0; j < strideNum; ++j)
    {
        unsigned int v = data[base + j * stride];
        if (v > pattern)
        {
            ans[found] = v;
            indexNum[found] = j;
            ++found;
        }
    }

    /* pass 2: pad the remaining slots with the pivot value and pick up
       the indices of pivot-valued elements until k entries exist */
    if (found < k)
    {
        for (int slot = found; slot < k; ++slot)
            ans[slot] = pattern;
        for (int j = 0; j < strideNum && found < k; ++j)
        {
            if (data[base + j * stride] == pattern)
                indexNum[found++] = j;
        }
    }
}
template<class T> __global__
void KernelTopKRadixSelect(unsigned int * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int* index, int limit)