Support run NiuTensor on the older GPUs with Maxwell or more previous architectures.

1. Offer macro to set whether use the half precision in cuda codes. 2. Update the manuals.

Support run NiuTensor on the older GPUs with Maxwell or more previous architectures.
1. Offer macro to set whether use the half precision in cuda codes. 2. Update the manuals.
8178ba40 · liyinqiao · bb5eb7db · 8178ba40 · 8178ba40 · 8178ba40
Commit 8178ba40 authored Apr 29, 2020 by liyinqiao
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,10 @@ if (USE_CUDA)
    set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda") 
 endif()

+# 0 - Not use half precision in CUDA codes
+# 1 - Use, note the architecture of GPU must be newer than Pascal(including Pascal)
+set(USE_HALF_PRECISION 0)
+
 # use CMAKE_MACOSX_RPATH for macOS
 set(CMAKE_MACOSX_RPATH 1)

@@ -67,18 +71,33 @@ endfunction(my_add_executable)
 if(USE_CUDA)
    set(NIUTRANS_EXE "${NIUTRANS_EXE}.GPU")
    add_definitions(-DUSE_CUDA)
+    if(USE_HALF_PRECISION)
+        add_definitions(-DHALF_PRECISION)
+    endif()
    if(ON_WINDOWS)
        find_package(CUDA ${CUDA_VERSION} REQUIRED)
        add_compile_options(-Wno-dev)
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
        set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-maxrregcount=0 -m64 --disable-warnings -use_fast_math -DUSE_CUDA")
-        set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_60
-            -gencode=arch=compute_60,code=sm_60 
-            -gencode=arch=compute_61,code=sm_61 
-            -gencode=arch=compute_62,code=sm_62 
-            -gencode=arch=compute_70,code=sm_70 
-            -gencode=arch=compute_70,code=compute_70 
-        )
+        if(USE_HALF_PRECISION)
+            set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-DHALF_PRECISION")
+            set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_60
+                -gencode=arch=compute_60,code=sm_60
+                -gencode=arch=compute_61,code=sm_61
+                -gencode=arch=compute_62,code=sm_62
+                -gencode=arch=compute_70,code=sm_70
+                -gencode=arch=compute_70,code=compute_70)
+        else()
+            set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_30
+                -gencode=arch=compute_30,code=sm_30
+                -gencode=arch=compute_50,code=sm_50
+                -gencode=arch=compute_52,code=sm_52
+                -gencode=arch=compute_60,code=sm_60
+                -gencode=arch=compute_61,code=sm_61
+                -gencode=arch=compute_62,code=sm_62
+                -gencode=arch=compute_70,code=sm_70
+                -gencode=arch=compute_70,code=compute_70)
+        endif()
        set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
        include_directories("${CUDA_TOOLKIT_ROOT_DIR}/include")
@@ -93,13 +112,25 @@ if(USE_CUDA)
        find_package(CUDA ${CUDA_VERSION} REQUIRED)
        set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
        set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-            -gencode=arch=compute_60,code=sm_60 
-            -gencode=arch=compute_61,code=sm_61 
-            -gencode=arch=compute_62,code=sm_62 
-            -gencode=arch=compute_70,code=sm_70 
-            -gencode=arch=compute_70,code=compute_70 
-        )
+        if(USE_HALF_PRECISION)
+            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DHALF_PRECISION")
+            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
+                -gencode=arch=compute_60,code=sm_60
+                -gencode=arch=compute_61,code=sm_61
+                -gencode=arch=compute_62,code=sm_62
+                -gencode=arch=compute_70,code=sm_70
+                -gencode=arch=compute_70,code=compute_70)
+        else()
+            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_30
+                -gencode=arch=compute_30,code=sm_30
+                -gencode=arch=compute_50,code=sm_50
+                -gencode=arch=compute_52,code=sm_52
+                -gencode=arch=compute_60,code=sm_60
+                -gencode=arch=compute_61,code=sm_61
+                -gencode=arch=compute_62,code=sm_62
+                -gencode=arch=compute_70,code=sm_70
+                -gencode=arch=compute_70,code=compute_70)
+        endif()
        link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
        include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
        set(CUDA_LIB_PATH_ME "${CUDA_TOOLKIT_ROOT_DIR}/lib64/")

--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ NiuTensor工具包的安装方法目前支持CMake（跨平台：支持Windows
 在开始创建您的项目并使用NiuTensor工具包时，需要**注意**的是：

 * 所创建项目如在CPU上运行，我们的系统支持高性能的数学运算库，推荐安装[MKL](https://software.intel.com/en-us/mkl)或[OpenBLAS](http://www.openblas.net/)（目前CMake方式不支持MKL和OpenBLAS，如希望使用上述运算库，建议使用Visual Studio或Makefile的方式进行编译，后续CMake将提供对其的完整支持）。
-* 所创建项目如需在GPU上运行，需安装 [CUDA](https://developer.nvidia.com/cuda-downloads)，CUDA版本需求为9.0及以上，CUDA工具为创建高性能GPU加速应用程序提供了开发环境。
+* 所创建项目如需在GPU上运行，需安装 [CUDA](https://developer.nvidia.com/cuda-downloads)，CUDA版本需求为9.2及以上，CUDA工具为创建高性能GPU加速应用程序提供了开发环境。

 ### 编译文件的修改

@@ -35,7 +35,7 @@ NiuTensor工具包的安装方法目前支持CMake（跨平台：支持Windows

 - 打开CMakeLists.txt文件对其进行编辑。
 - 操作系统设置：若NiuTensor编译环境为Windows，则在`set(ON_WINDOWS 0)`中将`ON_WINDOWS`的值置为1；若编译环境为Linux或macOS，则将`ON_WINDOWS`的值置为0。
- 编译设备设置：若希望在CPU环境下编译使用NiuTensor工具包，则将`set(USE_CUDA 0)`中的`USE_CUDA`置为0即可；若希望在GPU环境下使用，则需将`USE_CUDA`置为1，同时在`set(CUDA_VERSION 9.0)`中设置CUDA版本号，在`set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")`中设置CUDA工具安装的根目录。
+- 编译设备设置：若希望在CPU环境下编译使用NiuTensor工具包，则将`set(USE_CUDA 0)`中的`USE_CUDA`置为0即可；若希望在GPU环境下使用，则需将`USE_CUDA`置为1，同时在`set(CUDA_VERSION 9.2)`中设置CUDA版本号，在`set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")`中设置CUDA工具安装的根目录；若希望使用GPU设备进行半精度操作，只需在`set(USE_HALF_PRECISION 0)`中将`USE_HALF_PRECISION`值置为1即可，但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持（可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。

 #### Makefile

@@ -43,7 +43,7 @@ NiuTensor工具包的安装方法目前支持CMake（跨平台：支持Windows

 - 打开Makefile文件对其进行编辑。
 - 操作系统设置：若NiuTensor编译环境为Windows或Linux，则在`OnMac = 0`中将`OnMac`的值置为0；若编译环境为macOS，则将`OnMac`的值置为1。
- 编译设备设置：若希望在CPU环境下编译使用NiuTensor工具包，则将`USE_CUDA = 0`中的`USE_CUDA`置为0即可；若希望在GPU环境下使用，则需将`USE_CUDA`置为1，同时在`CUDA_ROOT = /usr/local/cuda-9.0`中设置CUDA工具安装的根目录。
+- 编译设备设置：若希望在CPU环境下编译使用NiuTensor工具包，则将`USE_CUDA = 0`中的`USE_CUDA`置为0即可；若希望在GPU环境下使用，则需将`USE_CUDA`置为1，同时在`CUDA_ROOT = /usr/local/cuda`中设置CUDA工具安装的根目录。
 - 编译内容设置：若希望生成NiuTensor工具包的动态链接库，则将`dll = 0`中的`dll`置为1即可；若无需编译动态链接库，则将其置为0。

 ### 工具包的安装
@@ -156,4 +156,4 @@ NiuTensor张量计算库由东北大学自然语言处理实验室小牛开源

 ## 更新版本

-NiuTensor version 0.2.2 - 2020年4月14日
+NiuTensor version 0.2.3 - 2020年4月29日
--- a/doc/Configuration.md
+++ b/doc/Configuration.md
@@ -2,7 +2,7 @@

 ## 注意事项

-* 我们仅仅测试了VS2015和CUDA9.0之后的版本，对于之前的版本并不清楚是否存在问题。
+* 我们仅仅测试了VS2015和CUDA9.2之后的版本，对于之前的版本并不清楚是否存在问题。
 * VS2015版本可以直接使用，使用较新版本的VS（如VS2017）时，需要**安装组件“适用于桌面的 VC++ 2015.3 v14.00 (v140) 工具集”**。
 * 建议先安装Visual Studio再安装CUDA。安装CUDA时，建议不要勾选Visual Studio Integration，有时候可能会出错。CUDA安装完成后，解压CUDA安装文件（exe文件可以解压），在CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions路径下有四个文件，拷贝到下述路径中。


--- a/doc/manual.md
+++ b/doc/manual.md
 # NiuTensor张量计算库

 ## NiuTensor
-NiuTensor是小牛开源项目所开发的一个工具包，提供了完整的张量定义及计算功能，可以被用于深度学习相关研究及工业系统的开发。NiuTensor具有以下特点：
+NiuTensor是小牛开源项目所开发的一个轻量级工具包，提供了完整的张量定义及计算功能，可以被用于深度学习相关研究及工业系统的开发。NiuTensor具有以下特点：

 * 简单小巧，易于修改
-* c语言编写，代码高度优化
+* C语言编写，代码高度优化
 * 同时支持CPU和GPU设备
 * 丰富的张量计算接口
-* 支持C/C++、Python等调用方式
+* 支持C/C++调用方式

-## 安装NiuTensor
+## 安装方法

 NiuTensor工具包的安装方法目前支持CMake（跨平台：支持Windows、Linux以及macOS）、Visual Studio项目（Windows平台）以及Makefile（Linux以及macOS平台）三种编译方式，这里推荐使用CMake对工具包进行安装。

@@ -28,7 +28,7 @@ NiuTensor工具包的安装方法目前支持CMake（跨平台：支持Windows

 - 打开CMakeLists.txt文件对其进行编辑。
 - 操作系统设置：若NiuTensor编译环境为Windows，则在`set(ON_WINDOWS 0)`中将`ON_WINDOWS`的值置为1；若编译环境为Linux或macOS，则将`ON_WINDOWS`的值置为0。
- 编译设备设置：若希望在CPU环境下编译使用NiuTensor工具包，则将`set(USE_CUDA 0)`中的`USE_CUDA`置为0即可；若希望在GPU环境下使用，则需将`USE_CUDA`置为1，同时在`set(CUDA_VERSION 9.2)`中设置CUDA版本号，在`set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")`中设置CUDA工具安装的根目录。
+- 编译设备设置：若希望在CPU环境下编译使用NiuTensor工具包，则将`set(USE_CUDA 0)`中的`USE_CUDA`置为0即可；若希望在GPU环境下使用，则需将`USE_CUDA`置为1，同时在`set(CUDA_VERSION 9.2)`中设置CUDA版本号，在`set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")`中设置CUDA工具安装的根目录；若希望使用GPU设备进行半精度操作，只需在`set(USE_HALF_PRECISION 0)`中将`USE_HALF_PRECISION`值置为1即可，但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持（可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。

 #### Makefile


--- a/source/tensor/core/arithmetic/Div.cu
+++ b/source/tensor/core/arithmetic/Div.cu
@@ -183,6 +183,7 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
            }
        }
        else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
            int cudaGridSize[3];
            int cudaBlockSize[3];
            half alpha1 = __float2half(alpha);
@@ -208,6 +209,9 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
                            stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
                }
            }
+#else
+            ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
        }
        else {
            // TODO!!

--- a/source/tensor/core/arithmetic/DivDim.cu
+++ b/source/tensor/core/arithmetic/DivDim.cu
@@ -170,6 +170,7 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
        }
    }
    else if (a->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        half alpha1 = __float2half(alpha);
        if (stride > 1){
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
@@ -196,6 +197,9 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
        else {
            ShowNTErrors("Something is wrong!");
        }
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        ShowNTErrors("TODO!");

--- a/source/tensor/core/arithmetic/MultiplyDim.cu
+++ b/source/tensor/core/arithmetic/MultiplyDim.cu
@@ -171,6 +171,7 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, 
        }
    }
    else if (a->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        half alpha1 = __float2half(alpha);
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
@@ -197,6 +198,9 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, 
        else {
            ShowNTErrors("Something is wrong!");
        }
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        ShowNTErrors("TODO!");

--- a/source/tensor/core/arithmetic/Sum.cu
+++ b/source/tensor/core/arithmetic/Sum.cu
@@ -112,6 +112,7 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
                 b->dataType == X_FLOAT16 &&
                 c->dataType == X_FLOAT16)
        {
+#ifdef HALF_PRECISION
            int gridSize[3], blockSize[3];

            GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
@@ -121,6 +122,9 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
            half beta1 = __float2half(beta);

            KernelADD << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta1);
+#else
+            ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
        }
        else if (a->dataType == X_INT &&
                 b->dataType == X_INT &&

--- a/source/tensor/core/arithmetic/SumDim.cu
+++ b/source/tensor/core/arithmetic/SumDim.cu
@@ -172,6 +172,7 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
        }
    }
    else if (a->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        half beta1 = __float2half(beta);
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
@@ -198,6 +199,9 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
        else {
            ShowNTErrors("Something is wrong!");
        }
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        ShowNTErrors("TODO!");

--- a/source/tensor/core/math/Binary.cu
+++ b/source/tensor/core/math/Binary.cu
@@ -78,6 +78,7 @@ T1 BinaryCudaShift(T1 x, T2 num)
    return x + T1(num);
 }

+#ifdef HALF_PRECISION
 #define SIMPLE_BINARY_FUNCTION_GPU(funcName, origFunc)                              \
 template<class T1, class T2>                                                        \
 __global__                                                                          \
@@ -133,6 +134,62 @@ template void _Cuda##funcName<int>(const XTensor*, XTensor*, int);              
 template void _Cuda##funcName<float>(const XTensor*, XTensor*, float);              \
 template void _Cuda##funcName<__half>(const XTensor*, XTensor*, __half);            \
 template void _Cuda##funcName<double>(const XTensor*, XTensor*, double);            
+#else
+#define SIMPLE_BINARY_FUNCTION_GPU(funcName, origFunc)                              \
+template<class T1, class T2>                                                        \
+__global__                                                                          \
+void Kernel##funcName(T1 * a, T1 * b, int size, T2 num)                             \
+{                                                                                   \
+    int i = blockDim.x * blockIdx.x + threadIdx.x;                                  \
+                                                                                    \
+    if (i < size)                                                                   \
+        b[i] = (T1)origFunc((T1)a[i], (T2)num);                                     \
+}                                                                                   \
+                                                                                    \
+template<class T>                                                                   \
+void _Cuda##funcName(const XTensor * a, XTensor * b, T num)                         \
+{                                                                                   \
+    CheckNTErrors((_IsSameShaped(a, b)),                                            \
+                  "Input tensors should have the same type!");                      \
+    CheckNTErrors((a->isSparse == false), "TODO!");                                 \
+                                                                                    \
+    int gridSize[3];                                                                \
+    int blockSize[3];                                                               \
+                                                                                    \
+    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);                 \
+                                                                                    \
+    dim3 blocks(gridSize[0]);                                                       \
+    dim3 threads(blockSize[0]);                                                     \
+                                                                                    \
+    int devIDBackup;                                                                \
+    ProtectCudaDev(a->devID, devIDBackup);                                          \
+                                                                                    \
+    if (a->dataType == X_FLOAT) {                                                   \
+        Kernel##funcName<<<blocks, threads>>>                                       \
+                         ((float*)a->data, (float*)b->data, a->unitNum, (T)num);    \
+    }                                                                               \
+    else if (a->dataType == X_DOUBLE) {                                             \
+        Kernel##funcName<<<blocks, threads>>>                                       \
+                         ((double*)a->data, (double*)b->data, a->unitNum, (T)num);  \
+    }                                                                               \
+    else if (a->dataType == X_INT) {                                                \
+        Kernel##funcName<<<blocks, threads>>>                                       \
+                         ((int*)a->data, (int*)b->data, a->unitNum, (T)num);        \
+    }                                                                               \
+    else if (a->dataType == X_FLOAT16) {                                            \
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");            \
+    }                                                                               \
+    else {                                                                          \
+        ShowNTErrors("TODO!");                                                      \
+    }                                                                               \
+                                                                                    \
+    BacktoCudaDev(a->devID, devIDBackup);                                           \
+}                                                                                   \
+template void _Cuda##funcName<int>(const XTensor*, XTensor*, int);                  \
+template void _Cuda##funcName<float>(const XTensor*, XTensor*, float);              \
+template void _Cuda##funcName<__half>(const XTensor*, XTensor*, __half);            \
+template void _Cuda##funcName<double>(const XTensor*, XTensor*, double);
+#endif

 SIMPLE_BINARY_FUNCTION_GPU(Descale, BinaryCudaDescale)
 SIMPLE_BINARY_FUNCTION_GPU(Mod, BinaryCudaMod)

--- a/source/tensor/core/math/Clip.cu
+++ b/source/tensor/core/math/Clip.cu
@@ -80,9 +80,13 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
        KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
    }
    else if (a->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        half lower2 = __float2half(lower);
        half upper2 = __float2half(upper);
        KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower2, upper2, a->unitNum);
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else if (a->dataType == X_INT) {
        int lower2 = (int)lower;

--- a/source/tensor/core/math/Normalize.cu
+++ b/source/tensor/core/math/Normalize.cu
@@ -78,8 +78,10 @@ void KernelNormalize(T * input, T * output, T * mean, T * var,
                         sqrt((DTYPE)(iVar[threadIdx.x] + epsilon)) + (DTYPE)b[inBlockOffset];
    }
    else if (datatype == X_FLOAT16) {
+#if __CUDA_ARCH__ >= 600
        output[offset] = __hadd(__hdiv(__hmul(a[inBlockOffset], __hsub(input[offset], iMean[threadIdx.x])),
                         hsqrt(iVar[threadIdx.x] + epsilon)), __float2half(b[inBlockOffset]));
+#endif
    }
 }

@@ -132,11 +134,13 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
                                             stride, strideNum, blockNum);
    }
    else if (input->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        __half epsilon1 = __float2half(epsilon);
        KernelNormalize <__half, X_FLOAT16> <<<blocks, threads>>> ((__half*)input->data, (__half*)output->data,
                                             (__half*)mean->data, (__half*)var->data,
                                             (__half*)a->data, (__half*)b->data, epsilon1, 
-                                             stride, strideNum, blockNum);    
+                                             stride, strideNum, blockNum);
+#endif
    }

    BacktoCudaDev(a->devID, devIDBackup);

--- a/source/tensor/core/math/ScaleAndShift.cu
+++ b/source/tensor/core/math/ScaleAndShift.cu
@@ -118,6 +118,7 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
                KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
        }
        else if(a->dataType == X_FLOAT16){
+#ifdef HALF_PRECISION
            half scale2 = __float2half(scale);
            half shift2 = __float2half(shift);

@@ -129,6 +130,9 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
                KernelScaleAndShift<half, false, true><<<blocks, threads>>>((half*)a->data, (half*)b->data, a->unitNum, scale2, shift2);
            else
                KernelScaleAndShift<half, false, false><<<blocks, threads >>>((half*)a->data, (half*)b->data, a->unitNum, scale2, shift2);
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
        }
        else if (a->dataType == X_INT) {
            int scale2 = int(scale);

--- a/source/tensor/core/movement/Spread.cu
+++ b/source/tensor/core/movement/Spread.cu
@@ -409,9 +409,13 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
    }
    else if (source->dataType == X_FLOAT16 && collection->dataType == X_FLOAT16)
    {
+#ifdef HALF_PRECISION
        __half2 * sData = (__half2*)source->data;
        __half2 * cData = (__half2*)collection->data;
        KernelSpreadForGather<__half2> << <blocks, threads >> >(sData, cData, sIndex, indexSize, stride);
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        ShowNTErrors("Unsupported dataType!");

--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
@@ -553,7 +553,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)                
    }                                                                                                                                         \
    blockSize = stride * strideNum;                                                                                                           \
                                                                                                                                              \
-    int devID = input->devID;                                                                                                                 \                                                                                                                                              \
+    int devID = input->devID;                                                                                                                 \
    int devIDBackup;                                                                                                                          \
    ProtectCudaDev(input->devID, devIDBackup);                                                                                                \
                                                                                                                                              \

--- a/source/tensor/core/sort/TopK.cu
+++ b/source/tensor/core/sort/TopK.cu
@@ -864,9 +864,13 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, 
                                 (DTYPE*)b->data, (int*)index->data, isSorted);
        }
        else if (a->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
            KernelTopK3<__half> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
                                 ((__half*)a->data, stride, strideNumA, blockNum, k, DTYPE_MIN,
                                 (__half*)b->data, (int*)index->data, isSorted);
+#else
+            ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
        }
        else {
            ShowNTErrors("TODO!");

--- a/source/tensor/function/HardTanH.cu
+++ b/source/tensor/function/HardTanH.cu
@@ -76,7 +76,11 @@ void _CudaHardTanH(const XTensor * x, XTensor * y)
        KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
    }
    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((__half *)x->data, (__half *)y->data, x->unitNum);
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        //TODO!
@@ -148,12 +152,16 @@ void _CudaHardTanHBackward(XTensor * y, XTensor * x,
                                 x->unitNum);
    }
    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        /* dE/dx = dE/dy * dy/dx */
        KernelHardtanhBackward<<<dim3(gridSize[0]), dim3(blockSize[0])>>>
                               ((half*)dedy->data,
                                (half*)dedx->data,
                                (half*)x->data,
                                 x->unitNum);
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        ShowNTErrors("Unsupported dataType!");

--- a/source/tensor/function/LogSoftmax.cu
+++ b/source/tensor/function/LogSoftmax.cu
@@ -93,8 +93,10 @@ void KernelLogSoftmaxComputeByRow(T * x, T * max, T * sum, T * y, int rowNum, in
            y[key] = MAX(r, LOGPROB_MIN);
        }
        else if (dataType == X_FLOAT16) {
+#if __CUDA_ARCH__ >= 600
            half r = hlog((half)hexp(x[key] - inputMax[threadIdx.y]) / (half)inputSum[threadIdx.y]);
            y[key] = r;
+#endif
        }
    }
 }
@@ -152,8 +154,10 @@ void KernelLogSoftmaxComputeByCol(T * x, T * max, T * sum, T * y, int rowNum, in
            y[key] = MAX(r, LOGPROB_MIN);
        }
        else if (dataType == X_FLOAT16) {
+#if __CUDA_ARCH__ >= 600
            half r = hlog((half)hexp(x[key] - inputMax[threadIdx.y]) / (half)inputSum[threadIdx.y]);
            y[key] = r;
+#endif
        }
    }
 }
@@ -202,6 +206,7 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
        }
    }
    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        int gridSize[3], blockSize[3];
        int n = x->dimSize[0];
        int m = x->dimSize[1];
@@ -220,6 +225,9 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
            KernelLogSoftmaxComputeByCol<half, X_FLOAT16> <<<dim3(gridSize[0], gridSize[1]), dim3(blockSize[0], blockSize[1])>>>
                                       ((half*)x->data, maxData, sumData, (half*)y->data, n, m);
        }
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        ShowNTErrors("TODO!");

--- a/source/tensor/function/Rectify.cu
+++ b/source/tensor/function/Rectify.cu
@@ -67,8 +67,12 @@ void _CudaRectify(const XTensor * x, XTensor * y)
                        ((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
    }
    else if (x->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        KernelRectify<<<dim3(gridSize[0]), dim3(blockSize[0]) >> >
                        ((__half*)x->data, (__half*)y->data, x->unitNum);
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        // TODO!!
@@ -139,11 +143,15 @@ void _CudaRectifyBackward(XTensor * y, XTensor * x,
                                x->unitNum);
    }
    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
+#ifdef HALF_PRECISION
        KernelRectifyBackward<<<dim3(gridSize[0]), dim3(blockSize[0]) >> >
                              ((__half*)dedy->data,
                               (__half*)dedx->data,
                               (__half*)x->data,
                                x->unitNum);
+#else
+        ShowNTErrors("Recompile the code with HALF_PRECISION!");
+#endif
    }
    else {
        // TODO!!