针对不同情况调用不同的SoftMax计算函数

002692e7 · 张裕浩 · acc044b2 · 002692e7
Commit 002692e7 authored Aug 02, 2018 by 张裕浩
--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
@@ -223,32 +223,41 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
    int cudaGridSize[3];
    int cudaBlockSize[3];

-    //allocate thread num for old function
-    //GDevs.GetCudaThread2D(x->devID, stride * blockNum, dimensionSize, MAX_INT, cudaGridSize, cudaBlockSize);
-    //allocate thread num for new function
-    GDevs.GetCudaThread2D(x->devID, dimensionSize, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-    if (cudaBlockSize[0] < 32)
+    if (leadDim != 0 || dimensionSize <= 10)
    {
-        cudaBlockSize[0] = 32;//use at least a warp
-        if (cudaBlockSize[1] > 32)
+        //allocate thread num for old function
+        GDevs.GetCudaThread2D(x->devID, stride * blockNum, dimensionSize, MAX_INT, cudaGridSize, cudaBlockSize);
+    }
+    else
+    {
+        //allocate thread num for new function
+        GDevs.GetCudaThread2D(x->devID, dimensionSize, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+        if (cudaBlockSize[0] < 32)
        {
-            cudaGridSize[1] = int(ceil(float(stride * blockNum) / 32));
-            cudaBlockSize[1] = 32;
+            cudaBlockSize[0] = 32;//use at least a warp
+            if (cudaBlockSize[1] > 32)
+            {
+                cudaGridSize[1] = int(ceil(float(stride * blockNum) / 32));
+                cudaBlockSize[1] = 32;
+            }
        }
    }
    int devIDBackup;
    ProtectCudaDev(x->devID, devIDBackup);

    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
-        /*KernelSoftmaxComputeTensor<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>
-                                   ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data, 
-                                     stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
-                                     */
-
-        KernelSoftmaxComputeTensorUseBroadcast << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
-                                   ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
-                                     stride, dimensionSize, blockNum);
-        printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
+        if (leadDim != 0 || dimensionSize <= 10)
+        {
+            KernelSoftmaxComputeTensor << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
+                                        ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
+                                        stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
+        }
+        else
+        {
+            KernelSoftmaxComputeTensorUseBroadcast << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> >
+                                       ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
+                                         stride, dimensionSize, blockNum);
+        }
    }
    else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
        KernelSoftmaxComputeTensor<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>