Commit fa2ed07c by 张裕浩

clean code

parent ec71b1a9
@@ -155,7 +155,11 @@ void KernelSoftmaxComputeTensor(__half * x, __half * max, __half * sum, __half *
     }
 }
 
-__device__ __forceinline__ float broadCast(float input)
+/*
+use PTX code to broadcast float data
+*/
+__device__ __forceinline__
+float broadcast(float input)
 {
     float output;
     asm(
@@ -167,28 +171,28 @@ __device__ __forceinline__ float broadCast(float input)
     return output;
 }
 
+/*
+use warp broadcast to optimize softmax computing
+*/
 __global__
-void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * sum, DTYPE * output, int stride, int strideNum, int blockNum)
+void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * sum, DTYPE * output,
+                                            int stride, int strideNum, int blockNum)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int j = blockDim.y * blockIdx.y + threadIdx.y;
     int i2 = j % stride;
     int blockSize = stride * strideNum;
-    if (j < stride * blockNum)
-    {
+
+    if (j < stride * blockNum) {
         DTYPE sumData, maxData;
-        if (i % 32 == 0)
-        {
+        if (i % 32 == 0) {
             sumData = sum[j];
             maxData = max[j];
         }
-        //sumData = __shfl_sync(0xffffffff,sumData, 0);
-        //maxData = __shfl_sync(0xffffffff,maxData, 0);
-        sumData = broadCast(sumData);
-        maxData = broadCast(maxData);
-        if (i < strideNum)
-        {
+        sumData = broadcast(sumData);
+        maxData = broadcast(maxData);
+        if (i < strideNum){
             int offset = int(j / stride) * blockSize + i * stride + i2;
             output[offset] = exp(input[offset] - maxData) / sumData;
         }
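Note: the broadcast() calls introduced above take over from the __shfl_sync calls that were left commented out in the old version of this kernel. For reference, a minimal intrinsic-based sketch of the same warp-wide broadcast (the helper name broadcastWithShfl is made up here; __shfl_sync is the standard CUDA warp-shuffle intrinsic):

/* every lane of the warp reads lane 0's value, which is what the
   PTX-based broadcast() above is expected to do */
__device__ __forceinline__
float broadcastWithShfl(float input)
{
    /* mask 0xffffffff: all 32 lanes participate; source lane is 0 */
    return __shfl_sync(0xffffffff, input, 0);
}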
@@ -223,20 +227,18 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
     int cudaGridSize[3];
     int cudaBlockSize[3];
 
-    if (leadDim != 0 || dimensionSize <= 10)
-    {
-        //allocate thread num for old function
+    if (leadDim != 0 || dimensionSize <= 10){
+        /* allocate thread num for old function */
         GDevs.GetCudaThread2D(x->devID, stride * blockNum, dimensionSize, MAX_INT, cudaGridSize, cudaBlockSize);
     }
-    else
-    {
-        //allocate thread num for new function
+    else {
+        /* allocate thread num for new function */
         GDevs.GetCudaThread2D(x->devID, dimensionSize, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-        if (cudaBlockSize[0] < 32)
-        {
-            cudaBlockSize[0] = 32;//use at least a warp
-            if (cudaBlockSize[1] > 32)
-            {
+        if (cudaBlockSize[0] < 32) {
+            /* use at least a warp */
+            cudaBlockSize[0] = 32;
+
+            if (cudaBlockSize[1] > 32) {
                 cudaGridSize[1] = int(ceil(float(stride * blockNum) / 32));
                 cudaBlockSize[1] = 32;
             }
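Note: the adjustment above guarantees at least one full warp along x, so the warp broadcast covers every position of the softmax dimension; if y would then make the block too large, y is capped at 32 threads and the remaining stride * blockNum rows are pushed into the grid. A hypothetical standalone helper that mirrors this logic (sketch only, not part of this commit):

#include <cmath>

/* hypothetical helper mirroring the adjustment in _CudaSoftmaxSumMax:
   rows is the number of softmax columns to cover, i.e. stride * blockNum */
static void AlignBlockToWarp(int cudaGridSize[3], int cudaBlockSize[3], int rows)
{
    if (cudaBlockSize[0] < 32) {
        cudaBlockSize[0] = 32;                              /* use at least a warp along x */
        if (cudaBlockSize[1] > 32) {
            cudaGridSize[1] = int(ceil(float(rows) / 32));  /* push extra rows into the grid */
            cudaBlockSize[1] = 32;
        }
    }
}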
@@ -246,21 +248,19 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
     ProtectCudaDev(x->devID, devIDBackup);
 
     if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
-        if (leadDim != 0 || dimensionSize <= 10)
-        {
-            KernelSoftmaxComputeTensor << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
+        if (leadDim != 0 || dimensionSize <= 10) {
+            KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
                                         ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
                                         stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
         }
-        else
-        {
-            KernelSoftmaxComputeTensorUseBroadcast << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
+        else {
+            KernelSoftmaxComputeTensorUseBroadcast <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
                                                     ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
                                                     stride, dimensionSize, blockNum);
         }
     }
     else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
-        KernelSoftmaxComputeTensor<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>
+        KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
                                     ((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
                                     stride, dimensionSize, blockNum);
     }
...
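Note: both launch paths compute output[offset] = exp(input[offset] - max) / sum over a blockNum x strideNum x stride layout. A hypothetical CPU reference that uses the same offset arithmetic as KernelSoftmaxComputeTensorUseBroadcast, useful for spot-checking the GPU results (toy sizes chosen here for illustration):

#include <cmath>
#include <cstdio>

/* softmax along the strideNum dimension; column j covers block j / stride,
   inner position j % stride, and element i sits at base + i * stride */
static void SoftmaxReference(const float * x, float * y, int stride, int strideNum, int blockNum)
{
    int blockSize = stride * strideNum;
    for (int j = 0; j < stride * blockNum; j++) {
        int base = (j / stride) * blockSize + j % stride;

        float maxData = x[base];
        for (int i = 1; i < strideNum; i++)
            maxData = fmaxf(maxData, x[base + i * stride]);

        float sumData = 0.0f;
        for (int i = 0; i < strideNum; i++)
            sumData += expf(x[base + i * stride] - maxData);

        for (int i = 0; i < strideNum; i++)
            y[base + i * stride] = expf(x[base + i * stride] - maxData) / sumData;
    }
}

int main()
{
    float x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, y[8];
    SoftmaxReference(x, y, 2 /*stride*/, 4 /*strideNum*/, 1 /*blockNum*/);
    for (int k = 0; k < 8; k++)
        printf("%f\n", y[k]);
    return 0;
}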