Commit 6a3d713a by linye

no message

parent 30217de4
......@@ -169,6 +169,35 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
unsigned short temp = FloatToFloat16(alpha);
half alpha1 = *((half *)&temp);
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == 0.0F)
KernelMultiplyWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
else
KernelMultiplyWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == 0.0F)
KernelMultiplyWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
else
KernelMultiplyWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
......
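The float16 branch above converts alpha to its binary16 bit pattern with FloatToFloat16, reinterprets it as a half, and launches the same templated kernels as the float path. As a rough sketch of the kernel shape those launches imply (the parameter names and the simplified 1D indexing are assumptions for illustration, not NiuTensor's actual KernelMultiplyWithCol), a column-broadcast multiply could look like this; note that half arithmetic requires compute capability 5.3+:

#include <cuda_fp16.h>

/* sketch only: c = a * b, with b broadcast along one dimension of a,
   optionally accumulating alpha * c when alphaFired is true */
template <class T, bool alphaFired>
__global__
void SketchMultiplyWithCol(T * a, T * b, T * c,
                           int blockSize, int stride,
                           int stepSize, int blockNum, T alpha)
{
    /* stepSize == blockSize * stride; one thread per element of a and c */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= stepSize * blockNum)
        return;

    /* element i of a pairs with element (i / stride) % blockSize of b */
    int j = (i / stride) % blockSize;

    if (alphaFired)
        c[i] = a[i] * b[j] + alpha * c[i];
    else
        c[i] = a[i] * b[j];
}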
......@@ -33,8 +33,9 @@ set each entry to its negative value (CUDA Kernel)
>> b - pointer to the output data array
>> size - size of the data array
*/
template <class T>
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size)
void KernelNegate(T * a, T * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -42,26 +43,26 @@ void KernelNegate(DTYPE * a, DTYPE * b, int size)
b[i] = -a[i];
}
/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
b[i] = __hsub(__float2half(0), a[i]);
#else
if (i < size)
b[i] = __float2half(-__half2float(a[i]));
#endif
}
///*
//set each entry to its negative value (CUDA Kernel)
//This is for float16 computation
//>> a - pointer to the input data array
//>> b - pointer to the output data array
//>> size - size of the data array
//*/
//__global__
//void KernelNegate(__half * a, __half * b, int size)
//{
// int i = blockDim.x * blockIdx.x + threadIdx.x;
//
//#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
// if (i < size)
// b[i] = __hsub(__float2half(0), a[i]);
//#else
// if (i < size)
// b[i] = __float2half(-__half2float(a[i]));
//#endif
//}
/*
set each entry to its negative value
......
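The change above replaces the dedicated __half overload of KernelNegate (now commented out) with a single template. For __half, the statement b[i] = -a[i] relies on the half arithmetic operators that cuda_fp16.h only defines for compute capability 5.3 and up, which is exactly the case the retired overload guarded with __CUDA_ARCH__ >= 530; its pre-5.3 fallback through __float2half(-__half2float(a[i])) is not reproduced by the plain template. A minimal launch sketch for the float16 instantiation (the wrapper, grid size, and block size are placeholders of my own, not NiuTensor code):

#include <cuda_fp16.h>

/* templated kernel as reconstructed from the hunks above */
template <class T>
__global__
void KernelNegate(T * a, T * b, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        b[i] = -a[i];    /* for __half this needs sm_53+ half operators */
}

/* hypothetical host-side wrapper, for illustration only */
void NegateHalfSketch(__half * a, __half * b, int size)
{
    int threads = 256;
    int blocks = (size + threads - 1) / threads;
    KernelNegate<__half> <<<blocks, threads>>> (a, b, size);
}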
......@@ -238,8 +238,9 @@ set dE/dx = exp(y)
>> size - size of output
>> lossName - name of the loss function
*/
template <class T>
__global__
void KernelExpLoss(DTYPE * dedy, DTYPE * dedx, DTYPE * y, int size, LOSS_FUNCTION_NAME lossName)
void KernelExpLoss(T * dedy, T * dedx, T * y, int size, LOSS_FUNCTION_NAME lossName)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -270,36 +271,59 @@ dE/dx = dE/dy * dy/dx
>> size - size of input/output
>> lossName - name of the loss function
*/
template <class T, TENSOR_DATA_TYPE dataType>
__global__
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
void KernelLogSoftmaxBackwardDEDS(T * dedy, T * dedx, T * gold, T * y, T * x,
int size, LOSS_FUNCTION_NAME lossName)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
DTYPE r = 0;
/* dE/ds_j = exp(y_j) */
if (lossName == CROSSENTROPY)
r = -gold[i] + exp(y[i]);
/* dE/ds_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
r = -gold[i] + exp(y[i]);
else if (lossName == ONEHOTERROR) {
if (gold[i] == 1.0F)
r = -gold[i] + exp(y[i]);
else
if (dataType == X_FLOAT) {
DTYPE r = 0;
/* dE/ds_j = exp(y_j) */
if (lossName == CROSSENTROPY)
r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
/* dE/ds_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
else if (lossName == ONEHOTERROR) {
if ((DTYPE)gold[i] == 1.0)
r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
else
r = 0;
}
else {
r = dedy[i];
}
if (isnan(r))
r = 0;
if (isinf(r))
r = 0;
}
else {
r = dedy[i];
}
if (isnan(r))
r = 0;
if (isinf(r))
r = 0;
dedx[i] = r;
}
else if (dataType == X_FLOAT16) {
half r = 0;
/* dE/ds_j = exp(y_j) */
if (lossName == CROSSENTROPY)
r = -(half)gold[i] + (half)hexp(y[i]);
/* dE/ds_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
r = -(half)gold[i] + (half)hexp(y[i]);
else if (lossName == ONEHOTERROR) {
if ((half)gold[i] == (half)1.0)
r = -(half)gold[i] + (half)hexp(y[i]);
else
r = 0;
}
else {
r = dedy[i];
}
dedx[i] = r;
dedx[i] = r;
}
}
}
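Both data-type branches write the same quantity, r = exp(y_i) - gold_i, when no special case applies. For reference, that is the gradient of cross entropy taken through log-softmax, assuming the gold distribution sums to one:

% y is the log-softmax output, E the cross-entropy loss, g the gold distribution
y_j = x_j - \log \sum_k e^{x_k}, \qquad E = -\sum_j g_j \, y_j
\frac{\partial E}{\partial x_i}
    = -\sum_j g_j \left( \delta_{ij} - e^{y_i} \right)
    = -g_i + e^{y_i} \sum_j g_j
    = e^{y_i} - g_i

Note that, unlike the float branch, the float16 branch does not zero out NaN or Inf results; a guard there would need the half-precision classification intrinsics (for example __hisnan) rather than isnan/isinf.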
......@@ -320,11 +344,12 @@ dE/dx_j += -gold_j
>> gNonZeroNum - number of non-zero items in the sparse gold tensor
>> lossName - name of the loss function
*/
template <class T>
__global__
void KernelLogSoftmaxBackwardDEDSSparseByRow(DTYPE * dedy, DTYPE * dedx, void * gold, DTYPE * y, DTYPE * x,
void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T * y, T * x,
int rowNum, int colNum, int gNonZeroNum, LOSS_FUNCTION_NAME lossName)
{
int tupleSize = sizeof(int) + sizeof(DTYPE);
int tupleSize = sizeof(int) + sizeof(T);
int k = blockDim.x * blockIdx.x + threadIdx.x;
if (k < gNonZeroNum) {
......@@ -332,7 +357,7 @@ void KernelLogSoftmaxBackwardDEDSSparseByRow(DTYPE * dedy, DTYPE * dedx, void *
int key = *(int*)((char*)gold + tupleSize * k);
int ni = key / colNum;
int mi = key % colNum;
int value = *(DTYPE*)((char*)gold + tupleSize * k + sizeof(int));
T value = *(T*)((char*)gold + tupleSize * k + sizeof(int));
if (lossName == CROSSENTROPY)
dedx[colNum * ni + mi] += -value;
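The sparse gold tensor is consumed as a packed array of (key, value) tuples: each tuple occupies sizeof(int) + sizeof(T) bytes, the key encodes the row-major position (key = ni * colNum + mi), and the launches later in this diff skip one leading int before the tuples ((char*)gold->data + sizeof(int)). A hedged host-side sketch of that layout, with helper and variable names of my own choosing and the leading int assumed to be the non-zero count:

#include <cstring>
#include <vector>

/* packs [int header][int key | T value] x num; illustration only,
   not NiuTensor's actual sparse serializer */
template <class T>
std::vector<char> PackSparseGoldSketch(const int * keys, const T * values, int num)
{
    const int tupleSize = sizeof(int) + sizeof(T);
    std::vector<char> buf(sizeof(int) + num * tupleSize);
    std::memcpy(buf.data(), &num, sizeof(int));                 /* assumed count header */
    for (int k = 0; k < num; k++) {
        char * t = buf.data() + sizeof(int) + k * tupleSize;
        std::memcpy(t, &keys[k], sizeof(int));                  /* key = ni * colNum + mi */
        std::memcpy(t + sizeof(int), &values[k], sizeof(T));    /* gold value */
    }
    return buf;
}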
......@@ -421,6 +446,8 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
"Tensors used in log softmax are not on the same GPU.");
CheckNTErrors((gold != NULL), "No x gold standard is found!");
CheckNTErrors((lossName == CROSSENTROPY || lossName == SQUAREDERROR || lossName == NOLOSS),
"Unknown loss function.");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
......@@ -435,10 +462,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
CheckNTErrors((lossName == CROSSENTROPY || lossName == SQUAREDERROR || lossName == NOLOSS),
"Unknown loss function.");
if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
int cudaGridSize[3], cudaBlockSize[3];
......@@ -449,7 +473,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
GDevs.GetCudaThread(x->devID, x->unitNum, cudaGridSize, cudaBlockSize);
/* dE/ds_j = exp(y_j) */
KernelExpLoss <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
KernelExpLoss <DTYPE> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(DTYPE*)dedx->data,
(DTYPE*)y->data,
......@@ -459,7 +483,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
GDevs.GetCudaThread(x->devID, gold->unitNumNonZero, cudaGridSize, cudaBlockSize);
/* dE/ds_j += -gold_j */
KernelLogSoftmaxBackwardDEDSSparseByRow <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
KernelLogSoftmaxBackwardDEDSSparseByRow <DTYPE> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(DTYPE*)dedx->data,
(char*)gold->data + sizeof(int),
......@@ -474,7 +498,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
GDevs.GetCudaThread(x->devID, blockSize, cudaGridSize, cudaBlockSize);
/* dE/ds_j = -gold_j + exp(y_j) */
KernelLogSoftmaxBackwardDEDS <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
KernelLogSoftmaxBackwardDEDS <DTYPE, X_FLOAT> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(DTYPE*)dedx->data + k * blockSize,
(DTYPE*)gold->data + k * blockSize,
......@@ -508,6 +532,76 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
ShowNTErrors("TODO!");
}
}
else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
int cudaGridSize[3], cudaBlockSize[3];
if (lossName == CROSSENTROPY || lossName == SQUAREDERROR) {
if (gold->isSparse) {
CheckNTErrors((gold->order == 2), "TODO!")
CheckNTErrors((leadDim == 0), "TODO!");
GDevs.GetCudaThread(x->devID, x->unitNum, cudaGridSize, cudaBlockSize);
/* dE/ds_j = exp(y_j) */
KernelExpLoss <__half> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(__half*)dedx->data,
(__half*)y->data,
dimensionSize * stride,
lossName);
GDevs.GetCudaThread(x->devID, gold->unitNumNonZero, cudaGridSize, cudaBlockSize);
/* dE/ds_j += -gold_j */
KernelLogSoftmaxBackwardDEDSSparseByRow <__half> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(__half*)dedx->data,
(char*)gold->data + sizeof(int),
(__half*)y->data,
(__half*)x->data,
dedx->dimSize[0], dedx->dimSize[1], gold->unitNumNonZero, lossName);
}
else {
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
for (int k = 0; k < blockNum; k++) {
GDevs.GetCudaThread(x->devID, blockSize, cudaGridSize, cudaBlockSize);
/* dE/ds_j = -gold_j + exp(y_j) */
KernelLogSoftmaxBackwardDEDS <__half, X_FLOAT16> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(__half*)dedx->data + k * blockSize,
(__half*)gold->data + k * blockSize,
(__half*)y->data + k * blockSize,
(__half*)x->data + k * blockSize,
dimensionSize * stride, lossName);
}
}
if (padding != NULL) {
int n = leadDim;
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum / dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
}
else {
ShowNTErrors("TODO!");
}
}
else {
ShowNTErrors("TODO!");
}
......
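At the end of the float16 branch, gradients of padded positions are zeroed by temporarily flattening dedx to a 2D view of [unitNum / GetDim(n), GetDim(n)] and broadcasting the padding tensor over dimension 0 with _MultiplyDimMe, after which both shapes are restored. A conceptual host-side sketch of that masking step, written in plain C++ with float for brevity and assuming padding holds one 0/1 entry per row of the flattened view:

/* illustration only: zero the gradient rows that correspond to padding */
void MaskPaddedGradientsSketch(float * dedx, const float * padding,
                               int rowNum, int rowSize)
{
    for (int i = 0; i < rowNum; i++)
        for (int j = 0; j < rowSize; j++)
            dedx[i * rowSize + j] *= padding[i];
}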
......@@ -84,7 +84,7 @@ bool Test()
//wrong = !TestDropout() || wrong;
//wrong = !TestHardTanH() || wrong;
//wrong = !TestIdentity() || wrong;
//wrong = !TestLogSoftmax() || wrong;
wrong = !TestLogSoftmax() || wrong;
//wrong = !TestLoss() || wrong;
//wrong = !TestRectify() || wrong;
//wrong = !TestSigmoid() || wrong;
......