Commit fa2ed07c by 张裕浩

clean code

parent ec71b1a9
@@ -155,7 +155,11 @@ void KernelSoftmaxComputeTensor(__half * x, __half * max, __half * sum, __half *
     }
 }
 
-__device__ __forceinline__ float broadCast(float input)
+/*
+use PTX code to broadcast float data
+*/
+__device__ __forceinline__
+float broadcast(float input)
 {
     float output;
     asm(
@@ -167,28 +171,28 @@ __device__ __forceinline__ float broadCast(float input)
     return output;
 }
 
+/*
+use warp broadcast to optimize softmax computing
+*/
 __global__
-void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * sum, DTYPE * output, int stride, int strideNum, int blockNum)
+void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * sum, DTYPE * output,
+                                            int stride, int strideNum, int blockNum)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int j = blockDim.y * blockIdx.y + threadIdx.y;
     int i2 = j % stride;
     int blockSize = stride * strideNum;
-    if (j < stride * blockNum)
-    {
+
+    if (j < stride * blockNum) {
         DTYPE sumData, maxData;
-        if (i % 32 == 0)
-        {
+        if (i % 32 == 0) {
             sumData = sum[j];
             maxData = max[j];
         }
-        //sumData = __shfl_sync(0xffffffff,sumData, 0);
-        //maxData = __shfl_sync(0xffffffff,maxData, 0);
-        sumData = broadCast(sumData);
-        maxData = broadCast(maxData);
-        if (i < strideNum)
-        {
+        sumData = broadcast(sumData);
+        maxData = broadcast(maxData);
+        if (i < strideNum){
             int offset = int(j / stride) * blockSize + i * stride + i2;
             output[offset] = exp(input[offset] - maxData) / sumData;
         }
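Note: the broadcast() calls introduced above take over from the __shfl_sync calls that were left commented out in the old version of this kernel. For reference, a minimal intrinsic-based sketch of the same warp-wide broadcast (the helper name broadcastWithShfl is made up here; __shfl_sync is the standard CUDA warp-shuffle intrinsic):

/* every lane of the warp reads lane 0's value, which is what the
   PTX-based broadcast() above is expected to do */
__device__ __forceinline__
float broadcastWithShfl(float input)
{
    /* mask 0xffffffff: all 32 lanes participate; source lane is 0 */
    return __shfl_sync(0xffffffff, input, 0);
}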
@@ -223,20 +227,18 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
     int cudaGridSize[3];
     int cudaBlockSize[3];
 
-    if (leadDim != 0 || dimensionSize <= 10)
-    {
-        //allocate thread num for old function
+    if (leadDim != 0 || dimensionSize <= 10){
+        /* allocate thread num for old function */
         GDevs.GetCudaThread2D(x->devID, stride * blockNum, dimensionSize, MAX_INT, cudaGridSize, cudaBlockSize);
     }
-    else
-    {
-        //allocate thread num for new function
+    else {
+        /* allocate thread num for new function */
         GDevs.GetCudaThread2D(x->devID, dimensionSize, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-        if (cudaBlockSize[0] < 32)
-        {
-            cudaBlockSize[0] = 32;//use at least a warp
-            if (cudaBlockSize[1] > 32)
-            {
+        if (cudaBlockSize[0] < 32) {
+            /* use at least a warp */
+            cudaBlockSize[0] = 32;
+
+            if (cudaBlockSize[1] > 32) {
                 cudaGridSize[1] = int(ceil(float(stride * blockNum) / 32));
                 cudaBlockSize[1] = 32;
             }
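Note: the adjustment above guarantees at least one full warp along x, so the warp broadcast covers every position of the softmax dimension; if y would then make the block too large, y is capped at 32 threads and the remaining stride * blockNum rows are pushed into the grid. A hypothetical standalone helper that mirrors this logic (sketch only, not part of this commit):

#include <cmath>

/* hypothetical helper mirroring the adjustment in _CudaSoftmaxSumMax:
   rows is the number of softmax columns to cover, i.e. stride * blockNum */
static void AlignBlockToWarp(int cudaGridSize[3], int cudaBlockSize[3], int rows)
{
    if (cudaBlockSize[0] < 32) {
        cudaBlockSize[0] = 32;                              /* use at least a warp along x */
        if (cudaBlockSize[1] > 32) {
            cudaGridSize[1] = int(ceil(float(rows) / 32));  /* push extra rows into the grid */
            cudaBlockSize[1] = 32;
        }
    }
}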
@@ -246,21 +248,19 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
     ProtectCudaDev(x->devID, devIDBackup);
 
     if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
-        if (leadDim != 0 || dimensionSize <= 10)
-        {
-            KernelSoftmaxComputeTensor << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
+        if (leadDim != 0 || dimensionSize <= 10) {
+            KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
                                         ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
                                         stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
         }
-        else
-        {
-            KernelSoftmaxComputeTensorUseBroadcast << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
+        else {
+            KernelSoftmaxComputeTensorUseBroadcast <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
                                                     ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
                                                     stride, dimensionSize, blockNum);
         }
     }
     else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
-        KernelSoftmaxComputeTensor<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>
+        KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
                                     ((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
                                     stride, dimensionSize, blockNum);
     }
...
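Note: both launch paths compute output[offset] = exp(input[offset] - max) / sum over a blockNum x strideNum x stride layout. A hypothetical CPU reference that uses the same offset arithmetic as KernelSoftmaxComputeTensorUseBroadcast, useful for spot-checking the GPU results (toy sizes chosen here for illustration):

#include <cmath>
#include <cstdio>

/* softmax along the strideNum dimension; column j covers block j / stride,
   inner position j % stride, and element i sits at base + i * stride */
static void SoftmaxReference(const float * x, float * y, int stride, int strideNum, int blockNum)
{
    int blockSize = stride * strideNum;
    for (int j = 0; j < stride * blockNum; j++) {
        int base = (j / stride) * blockSize + j % stride;

        float maxData = x[base];
        for (int i = 1; i < strideNum; i++)
            maxData = fmaxf(maxData, x[base + i * stride]);

        float sumData = 0.0f;
        for (int i = 0; i < strideNum; i++)
            sumData += expf(x[base + i * stride] - maxData);

        for (int i = 0; i < strideNum; i++)
            y[base + i * stride] = expf(x[base + i * stride] - maxData) / sumData;
    }
}

int main()
{
    float x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, y[8];
    SoftmaxReference(x, y, 2 /*stride*/, 4 /*strideNum*/, 1 /*blockNum*/);
    for (int k = 0; k < 8; k++)
        printf("%f\n", y[k]);
    return 0;
}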