Commit d6d35fab by 张裕浩

Use radix select for top-k when shared memory is insufficient

parent 002692e7
@@ -25,6 +25,7 @@
#include "TopK.h"
#include "TopK.cuh"
#include "Sort.cuh"
#define WORKERSNUM 64
namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -495,6 +496,312 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
}
__device__ __forceinline__ unsigned getLaneMaskLe() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
return mask;
}
__device__ __forceinline__ int getLaneId() {
int laneId;
asm("mov.s32 %0, %laneid;" : "=r"(laneId));
return laneId;
}
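/*
convert() below maps a float to an unsigned int so that unsigned comparison
preserves the original float ordering: positive values get the sign bit set,
negative values get all bits flipped; deconvert() is the inverse mapping.
For example, 1.0F (0x3F800000) maps to 0xBF800000 while -1.0F (0xBF800000)
maps to 0x407FFFFF, so the image of -1.0F compares less than that of 1.0F.
*/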
__device__ unsigned convert(float v)
{
unsigned x = __float_as_int(v);
unsigned mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
return (x ^ mask);
}
__device__ float convert(unsigned int v)
{
float x = __uint_as_float(v);
return x;
}
__device__ float deconvert(unsigned int v) {
unsigned int mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
return __int_as_float(v ^ mask);
}
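/*
the two kernels below apply convert()/deconvert() element-wise to a tensor
stored as (blockNum, strideNum, stride): idy selects a vector through its
block index and offset within the block, and the threads along x stride over
that vector's strideNum elements.
*/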
__global__ void convert2uintV2(float* input, unsigned int *output, int stride, int strideNum, int blockNum, int size)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//int strideNum = (int)strideNumSize;
//if (flag) strideNum = strideNumSize[idy];
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x)
{
output[i] = convert(input[i]);
}
}
__global__ void deconvert2floatV2(unsigned int * input, float *output, int stride, int strideNum, int blockNum, int size)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//int strideNum = (int)strideNumSize;
//if (flag) strideNum = strideNumSize[idy];
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x)
{
output[i] = deconvert(input[i]);
}
}
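/*
count, within the vector handled by (idx, idy), how many elements whose high
bits already match the fixed prefix (mask_desire, desire) also have the bit
selected by mask set; each thread accumulates its partial count into the
shared pos_count array, and the WORKERSNUM partial counts of a vector are
summed later in KernelTopKRadixSelect.
*/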
__device__ void radixCount(unsigned int *data, int limit, int *pos_count, unsigned int mask, int mask_desire, unsigned int desire, int stride, int strideNum, int blockNum)
{
/*the idx th thread in one vector */
int idx = blockDim.x * blockIdx.x + threadIdx.x;
/* the idy th vector in one tensor */
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
for (int j = idx*stride + stride * strideNum * blockIndex + offsetInBlock;
j< stride * strideNum * blockIndex + offsetInBlock + stride*strideNum && j<limit;
j += stride * WORKERSNUM)
{
// printf("idx:%d, idy:%d,j:%d,addpos:%d\n",idx,idy,j, (idy % WORKERSNUM)*blockDim.x + idx);
if ((data[j] & mask_desire) == desire)
{
if (data[j] & mask)
{
pos_count[(idy % (512 / WORKERSNUM))*blockDim.x + idx]++;
}
}
// printf("Radix Count: %d Idx: %d,Idy: %d,end: %d\n", j,idx,idy, stride * strideNum * blockIndex + offsetInBlock + stride*strideNum);
}
}
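/*
warp-level scan used by collect_number: __ballot_sync collects the predicate
of every lane, __popc over the lanes at or below the caller gives the
inclusive in-warp count, the per-warp totals are written to smem and serially
prefix-summed by one thread per row, and earlier warps' totals are added so
that *index becomes the caller's 1-based rank among the accepted elements of
its row and *carry the total number accepted in this round.
*/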
//the thread count must be a multiple of 32
__device__ void gpu_check_warp(int *smem, bool in, int *carry, int *index)
{
int vote = __ballot_sync(0xffffffff, in);
*index = __popc(getLaneMaskLe() & vote);
*carry = __popc(vote);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int warp = idx / 32; //idx 0 -- blockDim.x
int warp_num = blockDim.x / 32; //how many warps each vector uses
if (getLaneId() == 0)
{
smem[warp + warp_num * threadIdx.y] = *carry; //save each warp carry
//printf("%d ", warp + warp_num * threadIdx.y);
}
__syncthreads();
if (idx == 0) //use one thread per row to prefix-sum the per-warp carries
{
for (int i = 1 + warp_num * threadIdx.y; i < warp_num * (threadIdx.y + 1); ++i)
{
smem[i] += smem[i - 1];
}
}
__syncthreads();
if (warp % warp_num)
{
*index += smem[warp_num * threadIdx.y + warp - 1];
}
*carry = smem[warp_num * threadIdx.y + warp_num - 1];
}
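/*
gather the top-k results of each vector once the threshold pattern is known:
a first pass copies every element strictly greater than pattern (together
with its index) into ans/ansIndex, using gpu_check_warp to assign
conflict-free output slots; if fewer than k items were found, a second pass
fills the remaining slots with elements equal to pattern.
*/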
__device__ void collect_number(unsigned int *data, int stride, int strideNum, int limit, unsigned int pattern, float *ans, int *ansIndex, int k)
{
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
__shared__ int smem[32]; //holds each warp's temporary carry count
int carry;
int index;
int vector_limit = stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
int alibn_strideNum = strideNum;
if (alibn_strideNum % blockDim.x) alibn_strideNum = alibn_strideNum + blockDim.x - (alibn_strideNum % blockDim.x);
int vector_alibn_limit = stride * strideNum * blockIndex + offsetInBlock + stride * alibn_strideNum;
int ans_array_index = stride * k * blockIndex + offsetInBlock;
int ans_size = 0;
__syncthreads();
#pragma unroll
for (int i = idx*stride + stride * strideNum * blockIndex + offsetInBlock;
i < vector_alibn_limit;
i += stride * WORKERSNUM)
{
bool has_topk = false;
if (i < vector_limit&&data[i] > pattern)
{
has_topk = true;
}
gpu_check_warp(smem, has_topk, &carry, &index);
if (carry>0)
{
if (has_topk)
{
ans[ans_array_index + (index - 1) * stride] = deconvert(data[i]);
ansIndex[ans_array_index + (index - 1) * stride] = i - stride * strideNum * blockIndex;
}
ans_array_index += carry * stride;
ans_size += carry;
}
__syncthreads();
}
if (ans_size < k)
{
int ramind_num = k - ans_size;
#pragma unroll
for (int i = idx*stride + stride * strideNum * blockIndex + offsetInBlock;
i < vector_alibn_limit;
i += stride * WORKERSNUM)
{
bool has_topk = false;
if (i < vector_limit&&data[i] == pattern)
{
has_topk = true;
}
gpu_check_warp(smem, has_topk, &carry, &index);
if (carry>0)
{
int check_tmp_index = ans_array_index + (index - 1) * stride;
// guard against overrunning the output buffer: if only one slot remains but several candidates match, discard those with larger indices
if (has_topk && check_tmp_index <stride * k * blockIndex + offsetInBlock + stride * k)
{
ans[check_tmp_index] = deconvert(pattern);
ansIndex[check_tmp_index] = i - stride * strideNum * blockIndex;
}
ramind_num -= carry;
ans_array_index += carry * stride;
if (ramind_num <= 0) break;
}
__syncthreads();
}
}
}
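/*
serial single-thread version of the collection step: walk one vector, first
taking values strictly greater than pattern, then padding with values equal
to it until k items are collected; only referenced by the commented-out code
path in KernelTopKRadixSelect.
*/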
__device__ void collect_number_old(unsigned int *data, int n, int k, unsigned int pattern, unsigned int *ans, int *indexNum, int stride, int strideNum)
{
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
int cot = 0;
for (int i = stride * strideNum * blockIndex + offsetInBlock, j = 0; j < strideNum; j++, i += stride)
{
if (data[i] > pattern)
{
ans[cot] = data[i];
indexNum[cot++] = j;
}
}
/* if cot < k, the remaining slots must be filled with the desired pattern */
if (cot < k)
{
for (int i = cot; i < k; ++i)
{
ans[i] = pattern;
}
//collect the remaining indices whose values equal the pattern
for (int i = stride * strideNum * blockIndex + offsetInBlock, j = 0; j < strideNum; j++, i += stride)
{
if (data[i] == pattern)
{
indexNum[cot++] = j;
if (cot == k) break;
}
}
}
}
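/*
radix select: input already holds the convert()-ed unsigned representation.
The threshold (the k-th largest mapped value) is determined bit by bit from
the most significant bit downwards; in each of the 32 rounds the threads of
a vector count how many candidates matching the bits fixed so far have the
current bit set (radixCount); if that count exceeds the remaining k, the bit
of the threshold is 1, otherwise it is 0 and the remaining k is reduced by
the count. Afterwards collect_number gathers the k values and indices.
*/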
template<class T> __global__
void KernelTopKRadixSelect(unsigned int * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int* index, int limit)
{
/* the idx th thread in one vector */
int idx = blockDim.x * blockIdx.x + threadIdx.x;
/* the idy th vector in one tensor */
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//use optimization or not
//int strideNum =(int)strideNumSize;
//if (isOptimization) strideNum = strideNumSize[idy];
if (idy >= stride *blockNum) return;
int mask_desire = 0;
unsigned int mask = 0x80000000;
unsigned int desire = 0;
__shared__ int pos_count[32 * 32];
int tmp_k = k;
//if (idx == 0)
//printf("%d %d blockSize: <%d ,%d>\n", idx + blockDim.x*idy,idy, blockDim.x, blockDim.y);
int flag = 1;
#pragma unroll
for (int i = 0; i < 32; i++)
{
//the shared-memory counters must be cleared on every iteration
pos_count[idx + blockDim.x*(idy % (512 / WORKERSNUM))] = 0;
if (flag)
radixCount(input, stride*strideNum*blockNum, pos_count, mask, mask_desire, desire, stride, strideNum, blockNum);
__syncthreads();
int sumCount = 0;
#pragma unroll
for (int j = 0; j < WORKERSNUM; j++)
{
sumCount += pos_count[(idy % (512 / WORKERSNUM))*blockDim.x + j];
}
__syncthreads();
if (tmp_k < sumCount) //this bit of the threshold should be 1
{
desire = mask^desire;
}
else //shrink k; this bit of the threshold should be 0
{
tmp_k = tmp_k - sumCount;
if (tmp_k == 0)
{
desire = (~(mask_desire >> 1)) | desire;
// set a flag instead of breaking, to avoid a __syncthreads() deadlock
//break;
flag = 0;
}
}
mask_desire = mask^mask_desire;
mask = mask >> 1;
}
__syncthreads();
//if (idx == 0)
//{
// unsigned int* uintOutput = new unsigned int;
// int* tmpIndex = new int;
// //*******************something wrong***************************
// cudaMalloc((void **)&uintOutput, sizeof(unsigned int)* k);
// cudaMalloc((void **)&tmpIndex, sizeof(unsigned int)*k);
// //*************************************************************
// collect_number_old(input, limit, k, desire, uintOutput, tmpIndex, stride, strideNum);
// int blockIndex = idy / stride;
// int offsetInBlock = idy% stride;
// for (int i = stride * k * blockIndex + offsetInBlock, j = 0; j < k; j++, i += stride)
// {
// //for(int i = )
// output[i] = deconvert(uintOutput[j]);
// index[i] = tmpIndex[j];
// }
//}
//__syncthreads();
collect_number(input, stride, strideNum, limit, desire, output, index, k);
}
/*
get the top-k items along a given dimension
>> a - input tensor
@@ -570,20 +877,37 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
}
/* we resort to radix select if the data cannot fit inside the shared memory */
else {
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
dimSize[0] = -dimSize[0];
XTensor * indexA = new XTensor(a->order, dimSize, X_INT, 1.0F, a->devID, a->mem);
indexA->data = a->mem != NULL ? a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int)) : XMemAlloc(a->devID, a->unitNum * sizeof(int));
//int dimSize[MAX_TENSOR_DIM_NUM];
//memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
//dimSize[0] = -dimSize[0];
//XTensor * indexA = new XTensor(a->order, dimSize, X_INT, 1.0F, a->devID, a->mem);
//indexA->data = a->mem != NULL ? a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int)) : XMemAlloc(a->devID, a->unitNum * sizeof(int));
/* make the index tensor */
indexA->SetAscendingOrder(dim);
///* make the index tensor */
//indexA->SetAscendingOrder(dim);
_CudaSortBig(a, b, indexA, index, dim, k);
//_CudaSortBig(a, b, indexA, index, dim, k);
if (a->mem != NULL)
a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int));
delete indexA;
//if (a->mem != NULL)
// a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int));
//delete indexA;
int workerNum = WORKERSNUM;
GDevs.GetCudaThread2D(a->mem->devID,
workerNum, stride * blockNum, MAX_INT,
cudaGrids, cudaBlocks);
//printf("dim is %d %d %d %d\n", cudaGrids[0], cudaGrids[1], cudaBlocks[0], cudaBlocks[1]);
if (a->dataType == DEFAULT_DTYPE) {
unsigned int* goutput = (unsigned int *)a->data;
//the two launch configurations below take almost the same time
convert2uintV2<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
              ((float*)a->data, goutput, stride, strideNumA, blockNum, strideNumA * blockNum * stride);
//convert2uintV2<<<dim3(1, stride * blockNum), dim3(512, 1)>>>((float*)a->data, goutput, stride, strideNumA, blockNum, strideNumA * blockNum * stride);
KernelTopKRadixSelect<DTYPE><<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
              (goutput, stride, strideNumA, blockNum, k, DTYPE_MIN, (DTYPE *)b->data, (int *)index->data, stride * strideNumA * blockNum);
deconvert2floatV2<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
              ((unsigned int *)a->data, (float *)goutput, stride, strideNumA, blockNum, strideNumA * blockNum * stride);
//int *indexTensorData = (int *)malloc(4 * strideNumA*blockNum*stride);
//cudaMemcpy(indexTensorData, index->data, sizeof(DTYPE)*index->unitNum, cudaMemcpyDeviceToHost);
}
}
BacktoCudaDev(a->devID, devIDBackup);