//PyTorch allocates threads this way; the thresholds appear to be hard-coded against the SM count (20 on the GTX 1080 / 1080 Ti), and it does give better performance.
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
    // Fewer vectors -> more warps per block, so the SMs stay occupied.
    int warpNum = 4;
    if (vectorNum < 20 * 8)
    {
        warpNum = 8;
        if (vectorNum < 20 * 4)
        {
            warpNum = 16;
            if (vectorNum < 20 * 2)
                warpNum = 32;
        }
    }

    // Never launch more warps than the vector length needs: ceil(vectorSize / 32).
    int minWarpNum = vectorSize / 32;
    if (vectorSize % 32 != 0) minWarpNum++;
    warpNum = min(warpNum, minWarpNum);

    // One block per vector; warpNum * 32 threads cooperate on that vector.
    grid.x = vectorNum;
    grid.y = 1;
    grid.z = 1;
    block.x = 1;
    block.y = warpNum * 32;
    block.z = 1;
}
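
// A minimal usage sketch (the kernel name "reduceOneVectorPerBlock" and its
// arguments are hypothetical, not part of this file): the grid/block computed
// above would typically feed a one-vector-per-block kernel launch, e.g.
//
//   dim3 grid, block;
//   continuousStorageThreadAllocation(grid, block, vectorNum, vectorSize);
//   reduceOneVectorPerBlock<<<grid, block>>>(devInput, devOutput, vectorSize);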
//In this case block.x * grid.x threads together handle one vector, so reads stay contiguous (coalesced).
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)