Commit 9d33e210 by xuchen

NiuTrans.Tensor - version 0.1.0!

parents de548dd3 d294ac15
# NiuTrans.Tensor: A Tensor Computation Library
## NiuTrans.Tensor
NiuTrans.Tensor is a toolkit developed as part of the NiuTrans open-source project. It provides complete tensor definition and computation functionality and can be used both for deep-learning research and for building industrial systems. NiuTrans.Tensor has the following features:
* Small and simple, easy to modify
* Written in C/C++, with highly optimized code
* Supports both CPU and GPU devices
* A rich set of tensor computation interfaces
* Callable from C/C++, Python, and other languages
## Installation
Before creating your project with the NiuTrans.Tensor toolkit, note the following:
* If your project runs on CPUs, high-performance math libraries are supported; we recommend installing [MKL](https://software.intel.com/en-us/mkl) or [OpenBLAS](http://www.openblas.net/)
* If your project runs on GPUs, [CUDA](https://developer.nvidia.com/cuda-downloads) must be installed; CUDA 9.0 or later is required. The CUDA toolkit provides a development environment for building high-performance GPU-accelerated applications.
NiuTrans.Tensor is built from source; installation on Windows and Linux is described below.
### Windows
To use the NiuTrans.Tensor toolkit on Windows:
* First, include the NiuTrans.Tensor code in your project
* Reference three header files in your project: XTensor.h, CHeader.h (under core), and FHeader.h (under function):
    * XTensor.h provides the XTensor class on which all operations work
    * CHeader.h (under core) provides the tensor computation routines
    * FHeader.h (under function) provides the activation functions
* Use the namespace nts in your project; a minimal sketch follows below
In addition, please refer to [NiuTrans.Tensor environment configuration](http://47.105.50.196/NiuTrans/NiuTrans.Tensor/blob/linye/doc/Configuration.md) for the required environment setup.
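A minimal sketch of such a project file (the include paths assume the NiuTrans.Tensor sources are visible to the compiler; every call below appears in the toolkit's own tests):

    #include "XTensor.h"             // the XTensor class
    #include "core/CHeader.h"        // tensor operations, e.g., Clip
    #include "function/FHeader.h"    // activation functions, e.g., HardTanH

    using namespace nts;

    int main()
    {
        int dimSize[2] = {3, 2};
        XTensor * a = NewTensor(2, dimSize);    /* a 3 x 2 tensor (default float type) */
        a->SetDataRand(-5.0F, 5.0F);            /* uniform values in [-5, 5] */
        XTensor b = Clip(*a, -1.0F, 1.0F);      /* clip every entry into [-1, 1] */
        delete a;
        return 0;
    }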
### Linux
To use NiuTrans.Tensor on Linux, simply run make.sh; this generates tensorCPU and tensorGPU in the same directory, the CPU and GPU executables of NiuTrans.Tensor respectively. Taking the feed-forward neural language model as an example, the following command runs the provided test cases on a GPU:
>./tensorGPU -test
For more detailed usage, see the [NiuTrans.Tensor development documentation](http://47.104.97.237/niutrans/site/niutensor/index.html)
## Development Team
The NiuTrans.Tensor tensor computation library is developed jointly by the Natural Language Processing Lab at Northeastern University, NiuTrans (小牛翻译), and 小牛雅智. It aims to provide complete tensor definition and computation functionality for deep-learning research and industrial system development.
## Release History
NiuTrans.Tensor version 0.1.0 - August 3, 2018
# NiuTrans.Tensor Environment Configuration
## Notes
The latest CUDA release (9.2) does not yet support the latest version of VS2017. We therefore recommend using CUDA 9.0 or 9.1 with VS2015, or VS2017 with the v140 toolset installed.
## CUDA Configuration
With VS and CUDA installed and the environment variables set, the key CUDA options are listed below; all of them can be found under **Project -> Properties**.
>$(CUDA_PATH)\include
Add to **VC++ Directories -> Include Directories**.
>$(CUDA_PATH)\lib\Win32
Add to **VC++ Directories -> Library Directories**.
>cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
Add to **Linker -> Input -> Additional Dependencies**.
When done, right-click the project, open **Project Dependencies**, and select CUDA9.
Right-click each .cu file, open its properties, and set the item type to "CUDA C/C++" (it is easiest to search for the .cu files and set them all at once).
## Other Settings
Set **C/C++ -> General -> SDL checks** to No.
Under **C/C++ -> Preprocessor -> Preprocessor Definitions**, add
>USE_CUDA;USE_BLAS;WIN32;MKL;DEBUG;_CRT_SECURE_NO_WARNINGS;_CONSOLE;
Set **Linker -> System -> SubSystem** to Console.
Set **General -> Character Set** to Use Unicode Character Set.
Set the command-line arguments the executable needs under **Debugging -> Command Arguments**.
@@ -71,6 +71,10 @@ void XMathGrad::MakeGrad(XTensor * node)
        GradAbsolute(node);
    else if (operID == MATH_SIGN)
        GradSign(node);
+   else if (operID == MATH_ROUND)
+       GradRound(node);
+   else if (operID == MATH_CLIP)
+       GradClip(node);
    else if (operID == REDUCE_REDUCEMEAN)
        GradReduceMean(node);
    else if (operID == REDUCE_REDUCESUM)
@@ -725,7 +729,7 @@ void XMathGrad::GradNormalize(XTensor * node)
    XTensor * var = income.tails[2];
    XTensor * a = income.tails[3];
    XTensor * b = income.tails[4];
-   XTensor * c = NewTensor(a);
+   XTensor * c = NewTensor(var);
    XTensor * d = NewTensor(a);
    XTensor * e = NewTensor(a);
    XTensor * f = NewTensor(a);
@@ -733,11 +737,14 @@ void XMathGrad::GradNormalize(XTensor * node)
    XTensor * h = NewTensor(a);
    XTensor * i = NewTensor(a);
    XTensor * j = NewTensor(a);
-   XTensor * k = NewTensor(a);
-   XTensor * p = NewTensor(a);
-   XTensor * q = NewTensor(a);
+   XTensor * k = NewTensor(var);
+   XTensor * p = NewTensor(var);
+   XTensor * q = NewTensor(var);
    XTensor * r = NewTensor(a);
-   DTYPE epsilon = income.GetParam(0);
+   XTensor * x = NewTensor(mean);
+   XTensor * y = NewTensor(mean);
+   XTensor * z = NewTensor(mean);
+   DTYPE epsilon = income.GetParam(1);
    int dim = income.GetParamInt(0);
    int n = a->GetDim(dim);
@@ -756,7 +763,9 @@ void XMathGrad::GradNormalize(XTensor * node)
    /* dEdmean */
    _ScaleAndShift(f, g, -1.0F);
-   _Multiply(node->grad, g, mean->grad, 1.0F);
+   _ReduceSum(g, x, dim);
+   _ReduceSum(node->grad, y, dim);
+   _Multiply(y, x, mean->grad, 1.0F);

    /* dEdvar */
    _Unsqueeze(mean, h, dim, n);
@@ -764,8 +773,9 @@ void XMathGrad::GradNormalize(XTensor * node)
    _Multiply(a, i, j);
    _Power(var, k, -1.5F);
    _ScaleAndShift(k, p, -0.5F);
-   _Multiply(j, p, q);
-   _Multiply(node->grad, q, var->grad, 1.0F);
+   _ReduceSum(j, z, dim);
+   _Multiply(z, p, q);
+   _Multiply(y, q, var->grad, 1.0F);

    /* dEda */
    _Multiply(i, e, r);
@@ -788,6 +798,9 @@ void XMathGrad::GradNormalize(XTensor * node)
    delete p;
    delete q;
    delete r;
+   delete x;
+   delete y;
+   delete z;
}
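A note on the GradNormalize fix above: `mean` and `var` are reduced along dimension `dim`, so their gradients must have that reduced shape as well. The old code multiplied full-shape tensors straight into `mean->grad` and `var->grad`; the new code first applies `_ReduceSum` over `dim`, hence the intermediates `c`, `k`, `p`, `q` are now allocated with the shape of `var`, and `x`, `y`, `z` with the shape of `mean`. `epsilon` also moves from parameter slot 0 to slot 1, since slot 0 holds `dim`.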
@@ -844,6 +857,60 @@ void XMathGrad::GradSign(XTensor * node)
}

/*
gradient for round
for
c = round(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
*/
void XMathGrad::GradRound(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XNoder::MakeGrad(a);
b->SetZeroAll();
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
delete b;
}
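Round is piecewise constant, so dE/da is zero almost everywhere; `GradRound` therefore accumulates a zeroed tensor into `a->grad`, which keeps the usual gradient-accumulation pattern while leaving the values unchanged.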
/*
gradient for clip
for
c = clip(a, lower, upper)
we have
dE/da = 1 lower < a < upper
dE/da = 0 otherwise
>> node - the node (c) for backward computation
*/
void XMathGrad::GradClip(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1);
XNoder::MakeGrad(a);
_ClipBackward(node, a, node->grad, a->grad, lower, upper);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
gradient for reduceMean gradient for reduceMean
for for
c = reduceMean(a, dim) c = reduceMean(a, dim)
......
@@ -135,6 +135,14 @@ private:
    /* gradient for sign */
    static
    void GradSign(XTensor * node);
/* gradient for clip */
static
void GradClip(XTensor * node);
/* gradient for round */
static
void GradRound(XTensor * node);
};

}
......
@@ -37,7 +37,6 @@
using namespace nts;

-void SetDataTest();
void SmallTest();
void TransposeTest();
......
@@ -39,16 +39,26 @@ const char * GetOPName(int type)
        return "M_COS";
    else if (type == MATH_TAN)
        return "M_TAN";
+   else if (type == MATH_ROUND)
+       return "M_ROUND";
+   else if (type == MATH_CLIP)
+       return "M_CLIP";
+   else if (type == MATH_DIV)
+       return "M_DIV";
    else if (type == MATH_MATRIXMUL)
        return "M_MATRIXMUL";
    else if (type == MATH_MATRIXMULBATCHED)
        return "M_MATRIXMULBATCHED";
    else if (type == MATH_MULTIPLY)
        return "M_MULTIPLY";
-   else if (type == MATH_DIV)
-       return "M_DIV";
    else if (type == MATH_NEGATE)
        return "M_NEGATE";
+   else if (type == MATH_NORMALIZE)
+       return "M_NORMALIZE";
+   else if (type == MATH_POWER)
+       return "M_POWER";
+   else if (type == MATH_SCALEANDSHIFT)
+       return "M_SCALEANDSHIFT";
    else if (type == MATH_SIGN)
        return "M_SIGN";
    else if (type == MATH_SUM)
@@ -57,12 +67,6 @@ const char * GetOPName(int type)
        return "M_SUB";
    else if (type == MATH_SUMDIM)
        return "M_SUMDIM";
-   else if (type == MATH_NORMALIZE)
-       return "M_NORMALIZE";
-   else if (type == MATH_POWER)
-       return "M_POWER";
-   else if (type == MATH_SCALEANDSHIFT)
-       return "M_SCALEANDSHIFT";
    else if (type == REDUCE_REDUCEMAX)
        return "R_REDUCEMAX";
    else if (type == REDUCE_REDUCEMEAN)
......
@@ -30,28 +30,30 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* math operations */
#define MATH_BASE 0x00001000

#define MATH_ABSOLUTE MATH_BASE + 1
#define MATH_EXP MATH_ABSOLUTE + 1
#define MATH_LOG MATH_EXP + 1
#define MATH_SIN MATH_LOG + 1
#define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1
-#define MATH_NEGATE MATH_TAN + 1
-#define MATH_MATRIXMUL MATH_TAN + 1
+#define MATH_ROUND MATH_TAN + 1
+#define MATH_CLIP MATH_ROUND + 1
+#define MATH_DIV MATH_CLIP + 1
+#define MATH_MATRIXMUL MATH_DIV + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
-#define MATH_DIV MATH_MULTIPLY + 1
-#define MATH_SIGN MATH_DIV + 1
+#define MATH_NEGATE MATH_MULTIPLY + 1
+#define MATH_NORMALIZE MATH_NEGATE + 1
+#define MATH_POWER MATH_NORMALIZE + 1
+#define MATH_SCALEANDSHIFT MATH_POWER + 1
+#define MATH_SIGN MATH_SCALEANDSHIFT + 1
#define MATH_SUM MATH_SIGN + 1
#define MATH_SUB MATH_SUM + 1
#define MATH_SUMDIM MATH_SUB + 1
-#define MATH_NORMALIZE MATH_SUMDIM + 1
-#define MATH_POWER MATH_NORMALIZE + 1
-#define MATH_SCALEANDSHIFT MATH_POWER + 1
-#define REDUCE MATH_SCALEANDSHIFT + 1
+#define REDUCE MATH_SUMDIM + 1

#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
......
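Worth noting: in the old list both MATH_NEGATE and MATH_MATRIXMUL were defined as MATH_TAN + 1, so two different operators shared one ID. The reordering above removes that collision while keeping the IDs consecutive.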
@@ -599,25 +599,24 @@ set the tensor items by a uniform distribution in range [lower, upper]
void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
{
    // TODO: cuda code!!!!!!!
-   // TODO: replace float with DTYPE

    if (data == NULL)
        return;

    // srand((unsigned)time(0));
+   DTYPE variance = upper - lower;
    void * d = NULL;
    if (dataType == X_FLOAT) {
        d = new float[unitNum];
        for (int i = 0; i < unitNum; i++) {
-           DTYPE value = lower + (upper - lower) * (float)rand() / RAND_MAX;
+           DTYPE value = lower + variance * (float)rand() / RAND_MAX;
            *((float*)d + i) = value;
        }
    }
    else if (dataType == X_DOUBLE) {
        d = new double[unitNum];
        for (int i = 0; i < unitNum; i++) {
-           *((double*)d + i) = lower + (upper - lower) * rand() / RAND_MAX;
+           *((double*)d + i) = lower + variance * rand() / RAND_MAX;
        }
    }
    else {
@@ -627,15 +626,15 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
    SetData(d, unitNum);

    if (dataType == X_FLOAT) {
-       delete[](float*)d;
+       delete[] (float*)d;
    }
    else {
-       delete[](double*)d;
+       delete[] (double*)d;
    }
}

-/* a gauss distribution */
-double GaussRand()
+/* a gauss distribution (Box-Muller method) */
+double GaussRand(DTYPE mean, DTYPE standardDeviation)
{
    // TODO: cuda code!!!!!!!
@@ -645,8 +644,8 @@ double GaussRand()
    double pi = 3.141592654;

    if (phase == 0){
-       u = (rand() + 1) / (RAND_MAX + 1.0);
-       v = (rand() + 1) / (RAND_MAX + 1.0);
+       u = (rand() + 1.0) / (RAND_MAX + 1.0);
+       v = (rand() + 1.0) / (RAND_MAX + 1.0);
        z = sqrt(-2.0 * log(u))* sin(2.0 * pi * v);
    }
    else{
@@ -654,7 +653,7 @@ double GaussRand()
    }

    phase = 1 - phase;
-   return z;
+   return mean + (z * standardDeviation);
}
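For reference, the Box-Muller transform that `GaussRand` now wraps: two independent uniform samples $u, v \in (0, 1]$ yield a standard normal sample, which is then shifted and scaled by the requested mean and standard deviation:

$$z = \sqrt{-2\ln u}\,\sin(2\pi v), \qquad \mathrm{GaussRand}(\mu, \sigma) = \mu + \sigma z$$

The `rand() + 1` offset keeps $u$ strictly positive so that $\ln u$ stays finite, and the `phase` flag alternates between the two values the transform produces per pair of uniforms.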
@@ -665,7 +664,6 @@ set the tensor items by a normal distribution
void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
{
    // TODO: cuda code!!!!!!!
-   // TODO: replace float with DTYPE

    if (data == NULL)
        return;
@@ -675,13 +673,13 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
    if (dataType == X_FLOAT) {
        d = new float[unitNum];
        for (int i = 0; i < unitNum; i++) {
-           *((float*)d + i) = (float)GaussRand();
+           *((float*)d + i) = (float)GaussRand(mean, standardDeviation);
        }
    }
    else if (dataType == X_DOUBLE) {
        d = new double[unitNum];
        for (int i = 0; i < unitNum; i++) {
-           *((double*)d + i) = GaussRand();
+           *((double*)d + i) = GaussRand(mean, standardDeviation);
        }
    }
    else {
@@ -691,10 +689,10 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
    SetData(d, unitNum);

    if (dataType == X_FLOAT) {
-       delete[](float*)d;
+       delete[] (float*)d;
    }
    else {
-       delete[](double*)d;
+       delete[] (double*)d;
    }
}
......
@@ -46,6 +46,7 @@
#include "getandset/Select.h"
#include "getandset/SetData.h"
+#include "math/Clip.h"
#include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h"
......
@@ -76,7 +76,7 @@ XTensor Sign(const XTensor & a)
    XTensor b(&a);
    b.SetTMP();

-   /* call _ScaleAndShift function */
+   /* call _Sign function */
    _Sign(&a, &b);

    /* tensor connections */
......
@@ -214,34 +214,32 @@ void _SetDataFixedDouble(XTensor * tensor, double p)
}

/*
-generate data items with a uniform distribution in [low,high]
+generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized
->> low - lower value of the range
->> high - higher value of the range
+>> lower - lower value of the range
+>> upper - upper value of the range
*/
-void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
+void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
{
-   CheckNTErrors(high > low, "the high value must be greater than low value!");
+   CheckNTErrors(upper > lower, "the high value must be greater than low value!");

    if(tensor == NULL)
        return;

    /* CPU code */
    if(tensor->devID < 0){
-       DTYPE variance = high - low;
-       srand((unsigned)time(NULL));
+       DTYPE variance = upper - lower;

        if(tensor->dataType == X_FLOAT){
            float * d = (float*)tensor->data;
            for(int i = 0; i < tensor->unitNum; i++){
-               d[i] = variance * ((float)rand()/RAND_MAX) + low;
+               d[i] = variance * ((float)rand()/RAND_MAX) + lower;
            }
        }
        else if(tensor->dataType == X_DOUBLE){
            double * d = (double*)tensor->data;
            for(int i = 0; i < tensor->unitNum; i++){
-               d[i] = variance * ((double)rand()/RAND_MAX) + low;
+               d[i] = variance * ((double)rand()/RAND_MAX) + lower;
            }
        }
        else{
@@ -256,7 +254,7 @@ void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
    */
    else{
#ifdef USE_CUDA
-       _CudaSetDataRand(tensor, low, high);
+       _CudaSetDataRand(tensor, lower, upper);
#endif
        //XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
        //_SetDataRand(t2, low, high);
@@ -265,5 +263,17 @@ void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
    }
}
/*
generate data items with a normal distribution with specified mean and standard deviation
>> mean - mean or expectation of the distribution
>> standardDeviation - standard deviation of the distribution
*/
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation)
{
// TODO: rewrite it and add cuda code!!!!!!!
tensor->SetDataRandn(mean, standardDeviation);
}
} // namespace nts(NiuTrans.Tensor)
@@ -150,61 +150,20 @@ void _CudaSetDataFixedDouble(XTensor * tensor, double p)
}

-/*
-call curand_init function on each kernel with the same random seed
-and init the rng states
-*/
-__global__
-void KernelInitializeCurand(curandState * state, unsigned long seed)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    curand_init(seed, i, 0, &state[i]);
-}
-
-/* */
-__device__
-float GenerateFloat(curandState* globalState, int i)
-{
-    //copy state to local mem
-    curandState localState = globalState[i];
-    //apply uniform distribution with calculated random
-    float randNum = curand_uniform(&localState);
-    //update state
-    globalState[i] = localState;
-    //return value
-    return randNum;
-}
-
-/**/
-__device__
-double GenerateDouble(curandState* globalState, int i)
-{
-    //copy state to local mem
-    curandState localState = globalState[i];
-    //apply uniform distribution with calculated random
-    double randNum = curand_uniform_double(&localState);
-    //update state
-    globalState[i] = localState;
-    //return value
-    return randNum;
-}
-
/*
set data array with a uniform distribution in [low, high]
>> deviceStates - the state of curand
>> d - float datatype pointer to the data array
>> size - size of the array
->> low - low value of the range
->> high - high value of the range
+>> lower - low value of the range
+>> variance - the variance of the range
*/
__global__
-void KernelSetDataRandFloat(curandState* deviceStates, float * d, int size, DTYPE low, DTYPE variance)
+void KernelSetDataRandFloat(float * d, int size, DTYPE lower, DTYPE variance)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size) {
-       float randNum = GenerateFloat(deviceStates, i);
-       d[i] = randNum * variance + low;
+       d[i] = d[i] * variance + lower;
    }
}
/*
@@ -212,29 +171,28 @@ set data array with a uniform distribution in [low, high]
>> deviceStates - the state of curand
>> d - double datatype pointer to the data array
>> size - size of the array
->> low - low value of the range
->> high - high value of the range
+>> lower - low value of the range
+>> variance - the variance of the range
*/
__global__
-void KernelSetDataRandDouble(curandState* deviceStates, double * d, int size, DTYPE low, DTYPE variance)
+void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size){
-       double randNum = GenerateDouble(deviceStates, i);
-       d[i] = randNum * variance + low;
+       d[i] = d[i] * variance + lower;
    }
}
/*
-generate data items with a uniform distribution in [low,high]
+generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized
->> low - lower value of the range
->> high - higher value of the range
+>> lower - lower value of the range
+>> upper - upper value of the range
*/
-void _CudaSetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
+void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
{
-   CheckNTErrors(high > low, "the high value must be greater than low value!");
+   CheckNTErrors(upper > lower, "the high value must be greater than low value!");

    int gridSize[3];
    int blockSize[3];
@@ -247,15 +205,17 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-   curandState *deviceStates;
-   cudaMalloc(&deviceStates, sizeof(curandState));
-   DTYPE variance = high - low;
+   curandGenerator_t gen;
+   curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
+   curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
+   curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
+   curandDestroyGenerator(gen);
+   DTYPE variance = upper - lower;

-   KernelInitializeCurand<<<blocks, threads>>>(deviceStates, unsigned(time(NULL)));
    if (tensor->dataType == X_FLOAT)
-       KernelSetDataRandFloat<<<blocks, threads>>>(deviceStates, (float*)tensor->data, tensor->unitNum, low, variance);
+       KernelSetDataRandFloat<<<blocks, threads>>>((float*)tensor->data, tensor->unitNum, lower, variance);
    else if (tensor->dataType == X_DOUBLE)
-       KernelSetDataRandDouble<<<blocks, threads>>>(deviceStates, (double*)tensor->data, tensor->unitNum, low, variance);
+       KernelSetDataRandDouble<<<blocks, threads>>>((double*)tensor->data, tensor->unitNum, lower, variance);

    BacktoCudaDev(tensor->devID, devIDBackup);
}
......
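A caveat on the new `_CudaSetDataRand`: `curandGenerateUniform` fills single-precision values, yet it is invoked unconditionally on `tensor->data` before the type dispatch, so the `X_DOUBLE` branch would scale a float-filled buffer. A hedged sketch of what the dispatch would presumably need:

    /* hypothetical fix: choose the cuRAND call by data type */
    if (tensor->dataType == X_FLOAT)
        curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
    else if (tensor->dataType == X_DOUBLE)
        curandGenerateUniformDouble(gen, (double*)tensor->data, tensor->unitNum);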
@@ -37,8 +37,8 @@ void _CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _CudaSetDataFixedDouble(XTensor * tensor, double p);

-/* generate data items with a uniform distribution in [low,high] */
-void _CudaSetDataRand(XTensor * tensor, DTYPE low, DTYPE high);
+/* generate data items with a uniform distribution in [lower, upper] */
+void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);

} // namespace nts(NiuTrans.Tensor)
......
@@ -45,8 +45,8 @@ void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _SetDataFixedDouble(XTensor * tensor, double p);

-/* generate data items with a uniform distribution in [low,high] */
-void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high);
+/* generate data items with a uniform distribution in [lower, upper] */
+void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);

/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Clip.h"
#include "Clip.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its clip value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> lower - the lower border
>> upper - the upper border
*/
void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaClip(a, b, lower, upper);
return;
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
}
}
/*
set every entry to its clip value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
>> lower - the lower border
>> upper - the upper border
*/
void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper)
{
_Clip(a, a, lower, upper);
}
/*
set every entry to its clip value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
>> lower - the lower border
>> upper - the upper border
<< return - the clip value of the input tensor
*/
XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
{
XTensor b(&a);
b.SetTMP();
/* call _Clip function */
_Clip(&a, &b, lower, upper);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower);
XLink::AddParamToHead(&b, upper);
return b;
}
/*
backward computation
dE/dx = dE/dy * dy/dx
clip: y = upper   if x > upper
          x       if lower <= x <= upper
          lower   if x < lower
and dy/dx = 1 if lower <= x <= upper
            0 otherwise
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> lower - the lower border
>> upper - the upper border
*/
void _ClipBackward(XTensor * y, XTensor * x, XTensor * dedy, XTensor * dedx, DTYPE lower, DTYPE upper)
{
#ifdef USE_CUDA
if (x->devID >= 0) {
_CudaClipBackward(y, x, dedy, dedx, lower, upper);
return;
}
#endif
if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = y->unitNum;
/* dE/dx = dE/dy * dy/dx */
for (int i = 0; i < size; i++) {
DTYPE s = ip[i];
if (s > upper || s < lower)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
}
else
ShowNTErrors("TODO!");
}
} // namespace nts(NiuTrans.Tensor)
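A short usage sketch of the three variants (mirroring the toolkit's own TClip test; note that `Clip` attaches `lower` and `upper` to the output's XLink in that order, which is exactly the order `GradClip` reads them back with `GetParam(0)` and `GetParam(1)`):

    int dimSize[2] = {3, 2};
    XTensor * a = NewTensor(2, dimSize);
    XTensor * b = NewTensor(2, dimSize);
    a->SetDataRand(-5.0F, 5.0F);

    _Clip(a, b, -1.0F, 1.0F);          /* write the result into a pre-allocated b */
    _ClipMe(a, -1.0F, 1.0F);           /* clip a in place */
    XTensor c = Clip(*a, -1.0F, 1.0F); /* new tensor + link for backward (GradClip) */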
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Clip.h"
#include "Clip.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its clip value (CUDA Kernel)
>> a - pointer to input data array
>> b - pointer to output data array
>> lower - the lower border
>> upper - the upper border
>> size - size of the data array
*/
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (a[i] > upper)
b[i] = upper;
else if (a[i] < lower)
b[i] = lower;
else
b[i] = a[i];
}
}
/*
set each entry to its clip value with float16 data type value (CUDA Kernel)
This is for float16 computation
>> a - pointer to input data array
>> b - pointer to output data array
>> lower - the lower border
>> upper - the upper border
>> size - size of the data array
*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
{
return;
}
/*
set each entry to its clip value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> lower - the lower border
>> upper - the upper border
*/
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
/*
clip backward computation of dE/dx (Cuda kernel)
dy/dx = 1 if lower <= x <= upper
0 otherwise
>> dedy - dE/dy
>> dedx - dE/dx
>> y - output of the function
>> x - input of the function
>> lower - the lower border
>> upper - the upper border
>> size - size of the data array
*/
__global__
void KernelClipBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * y, DTYPE * x, DTYPE lower, DTYPE upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
DTYPE s = x[i];
if (s > upper || s < lower)
dedx[i] = 0;
else
dedx[i] = dedy[i];
}
}
/*
backward computation (Cuda version)
dE/dx = dE/dy * dy/dx
clip: y = upper   if x > upper
          x       if lower <= x <= upper
          lower   if x < lower
and dy/dx = 1 if lower <= x <= upper
            0 otherwise
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> lower - the lower border
>> upper - the upper border
*/
void _CudaClipBackward(XTensor * y, XTensor * x, XTensor * dedy, XTensor * dedx, DTYPE lower, DTYPE upper)
{
if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/dx = dE/dy * dy/dx */
KernelClipBackward <<<dim3(gridSize[0]), dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
(DTYPE*)y->data, (DTYPE*)x->data,
lower, upper,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#ifndef __CLIP_CUH__
#define __CLIP_CUH__
#include "Clip.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value */
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
/* backward of Clip function (CUDA Kernel) */
__global__
void KernelClipBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * y, DTYPE * x, DTYPE lower, DTYPE upper, int size);
/* backward of Clip function */
void _CudaClipBackward(XTensor * y, XTensor * x, XTensor * dedy, XTensor * dedx, DTYPE lower, DTYPE upper);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __CLIP_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#ifndef __CLIP_H__
#define __CLIP_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its clip value */
void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
/*
set every entry to its clip value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);
/*
set every entry to its clip value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);
/*
backward of Clip function
*/
void _ClipBackward(XTensor * y, XTensor * x, XTensor * dedy, XTensor * dedx, DTYPE lower, DTYPE upper);
} // namespace nts(NiuTrans.Tensor)
#endif // __CLIP_H__
@@ -64,6 +64,10 @@ SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
#else
/* define three macros separately, specifying the respective function names */
#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \
@@ -117,6 +121,10 @@ SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
#endif
}
@@ -5,51 +5,51 @@
namespace nts {

#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc)                      \
__global__                                                                 \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size)                      \
{                                                                          \
    int i = blockDim.x * blockIdx.x + threadIdx.x;                         \
                                                                           \
    if (i < size)                                                          \
        b[i] = (DTYPE)origFunc(a[i]);                                      \
}                                                                          \
__global__                                                                 \
void Kernel##funcName(__half * a, __half * b, int size)                    \
{                                                                          \
    return;                                                                \
}                                                                          \
void _Cuda##funcName(const XTensor * a, XTensor * b)                       \
{                                                                          \
    CheckNTErrors((XTensor::IsSameShaped(a, b)),                           \
                  "Input tensors should have the same type!");             \
    CheckNTErrors((a->isSparse == false), "TODO!");                        \
                                                                           \
    int gridSize[3];                                                       \
    int blockSize[3];                                                      \
                                                                           \
    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);        \
                                                                           \
    dim3 blocks(gridSize[0]);                                              \
    dim3 threads(blockSize[0]);                                            \
                                                                           \
    int devIDBackup;                                                       \
    ProtectCudaDev(a->devID, devIDBackup);                                 \
                                                                           \
    if (a->dataType == DEFAULT_DTYPE) {                                    \
        Kernel##funcName<<<blocks, threads>>>                              \
                        ((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);    \
    }                                                                      \
    else if (a->dataType == X_FLOAT16) {                                   \
        Kernel##funcName<<<blocks, threads>>>                              \
                        ((__half*)a->data, (__half*)b->data, a->unitNum);  \
    }                                                                      \
    else {                                                                 \
        ShowNTErrors("TODO!");                                             \
    }                                                                      \
                                                                           \
    BacktoCudaDev(a->devID, devIDBackup);                                  \
}

SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
@@ -57,5 +57,6 @@ SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
+SIMPLE_UNARY_FUNCTION_GPU(Round, round)
}
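Note that the `__half` kernel generated by this macro is an empty stub (it returns immediately), so calling these unary operations on X_FLOAT16 tensors currently leaves the output buffer untouched; the float16 `KernelClip` above has the same placeholder behavior.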
@@ -83,6 +83,15 @@ void KernelTan(__half * a, __half * b, int size);
/* set each entry to its tangent value */
void _CudaTan(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
__global__
void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
__global__
void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
void _CudaRound(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
@@ -104,5 +104,19 @@ make a new tensor to keep the result and return it
*/
XTensor Tan(const XTensor & a);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/*
set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _RoundMe(XTensor * a);
/*
set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Round(const XTensor & a);
}
#endif //end __UNARY_H__
@@ -116,8 +116,7 @@ void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif

-   if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
-   {
+   if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
        /* calculate dE/dy */
        if(lossName != NOLOSS)
            _LossBackward(dedy, gold, y, lossName);
......
@@ -156,6 +156,50 @@ void KernelSoftmaxComputeTensor(__half * x, __half * max, __half * sum, __half * ...
}

/*
use PTX code to broadcast float data
*/
__device__ __forceinline__
float broadcast(float input)
{
float output;
asm(
"{"
"shfl.idx.b32 %0,%1,0x0,0x1f;"
"}"
:"=f"(output) : "f"(input)
);
return output;
}
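An aside on `broadcast`: the inline PTX `shfl.idx.b32 %0,%1,0x0,0x1f` copies lane 0's register value to every lane of the warp. The same effect can be had with the warp-shuffle intrinsic (on sm_70+ the `_sync` form is required); a sketch:

    /* hypothetical equivalent using the CUDA intrinsic */
    __device__ __forceinline__
    float broadcastSync(float input)
    {
        return __shfl_sync(0xffffffff, input, 0);   /* take lane 0's value */
    }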
/*
use warp broadcast to optimize softmax computing
*/
__global__
void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * sum, DTYPE * output,
int stride, int strideNum, int blockNum)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
int i2 = j % stride;
int blockSize = stride * strideNum;
if (j < stride * blockNum) {
DTYPE sumData, maxData;
if (i % 32 == 0) {
sumData = sum[j];
maxData = max[j];
}
sumData = broadcast(sumData);
maxData = broadcast(maxData);
if (i < strideNum){
int offset = int(j / stride) * blockSize + i * stride + i2;
output[offset] = exp(input[offset] - maxData) / sumData;
}
}
}
/*
softmax y = e^x / \sum_{i} e^{x_i} (Cuda version)
>> x - x vector
>> y - result
@@ -183,20 +227,42 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s...
    int cudaGridSize[3];
    int cudaBlockSize[3];

-   GDevs.GetCudaThread2D(x->devID, stride * blockNum, dimensionSize, MAX_INT, cudaGridSize, cudaBlockSize);
+   if (leadDim != 0 || dimensionSize <= 10){
/* allocate thread num for old function */
GDevs.GetCudaThread2D(x->devID, stride * blockNum, dimensionSize, MAX_INT, cudaGridSize, cudaBlockSize);
}
else {
/* allocate thread num for new function */
GDevs.GetCudaThread2D(x->devID, dimensionSize, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
if (cudaBlockSize[0] < 32) {
/* use at least a warp */
cudaBlockSize[0] = 32;
if (cudaBlockSize[1] > 32) {
cudaGridSize[1] = int(ceil(float(stride * blockNum) / 32));
cudaBlockSize[1] = 32;
}
}
}
    int devIDBackup;
    ProtectCudaDev(x->devID, devIDBackup);

    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
-       KernelSoftmaxComputeTensor<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>
-                                  ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
-                                   stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
+       if (leadDim != 0 || dimensionSize <= 10) {
+           KernelSoftmaxComputeTensor<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>
+                                      ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
+                                       stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
+       }
+       else {
+           KernelSoftmaxComputeTensorUseBroadcast<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>
+                                                  ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
+                                                   stride, dimensionSize, blockNum);
+       }
    }
    else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
        KernelSoftmaxComputeTensor<<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>>
                                   ((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
                                    stride, dimensionSize, blockNum);
    }
    else{
        ShowNTErrors("TODO!");
......
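The thread-allocation branch above supports this: when the broadcast kernel is chosen, `cudaBlockSize[0]` is forced up to 32 so each row is handled by at least one full warp; `sumData` and `maxData` are loaded only by lane 0 (`i % 32 == 0`) and then distributed to the other lanes through `broadcast`.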
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#include "../XTensor.h"
#include "TClip.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Clip function.
Set every entry to its clip value.
*/
bool TestClip1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Clip function */
_Clip(a, b, -1.0, 1.0);
_ClipMe(aMe, -1.0, 1.0);
bUser = Clip(*a, -1.0, 1.0);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Clip function */
_Clip(aGPU, bGPU, -1.0, 1.0);
_ClipMe(aMeGPU, -1.0, 1.0);
bUserGPU = Clip(*aGPU, -1.0, 1.0);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Clip Function */
bool TestClip()
{
XPRINT(0, stdout, "[TEST Clip] set every entry to its clip value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestClip1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#ifndef __TEST_CLIP_H__
#define __TEST_CLIP_H__
#include "../core/math/Clip.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Clip Function */
extern "C"
bool TestClip();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_CLIP_H__
@@ -66,7 +66,9 @@ bool TestExp1()
    bUser = Exp(*a);

    /* check results */
-   cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
+   cpuTest = b->CheckData(answer, unitNum, 1e-4F) &&
+             aMe->CheckData(answer, unitNum, 1e-4F) &&
+             bUser.CheckData(answer, unitNum, 1e-4F);

#ifdef USE_CUDA
    /* GPU test */
@@ -88,7 +90,9 @@ bool TestExp1()
    bUserGPU = Exp(*aGPU);

    /* check results */
-   gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);
+   gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) &&
+             aMeGPU->CheckData(answer, unitNum, 1e-4F) &&
+             bUserGPU.CheckData(answer, unitNum, 1e-4F);

    /* destroy variables */
    delete a;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include "../core/math/Unary.h"
#include "TRound.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Round function.
Set every entry to its round value.
*/
bool TestRound1()
{
/* a tensor of size (3, 2) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 3;
dimSize[1] = 2;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[3][2] = { {1.3F, 2.7F},
{-1.3F, -2.7F},
{0.0F, 0.5F} };
DTYPE answer[3][2] = { {1.0F, 3.0F},
{-1.0F, -3.0F},
{0.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(order, dimSize);
XTensor * b = NewTensor(order, dimSize);
XTensor * aMe = NewTensor(order, dimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, unitNum);
aMe->SetData(aData, unitNum);
/* call Round function */
_Round(a, b);
_RoundMe(aMe);
bUser = Round(*a);
/* check results */
cpuTest = b->CheckData(answer, unitNum, 1e-4F) &&
aMe->CheckData(answer, unitNum, 1e-4F) &&
bUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
aMeGPU->SetData(aData, unitNum);
/* call Round function */
_Round(aGPU, bGPU);
_RoundMe(aMeGPU);
bUserGPU = Round(*aGPU);
/* check results */
gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) &&
aMeGPU->CheckData(answer, unitNum, 1e-4F) &&
bUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Round Function */
bool TestRound()
{
XPRINT(0, stdout, "[TEST Round] set every entry to its round value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestRound1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-03
*/
#ifndef __TEST_ROUND_H__
#define __TEST_ROUND_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Round Function */
bool TestRound();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_ROUND_H__
@@ -30,6 +30,7 @@ bool Test()
    XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");

    wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong;
    wrong = !TestConcatenate() || wrong;
    wrong = !TestConcatenateSolely() || wrong;
    wrong = !TestCos() || wrong;
@@ -53,6 +54,7 @@ bool Test()
    wrong = !TestReduceSum() || wrong;
    wrong = !TestReduceSumSquared() || wrong;
    wrong = !TestReduceVariance() || wrong;
wrong = !TestRound() || wrong;
    wrong = !TestScaleAndShift() || wrong;
    wrong = !TestSelect() || wrong;
    wrong = !TestSetAscendingOrder() || wrong;
@@ -68,7 +70,7 @@ bool Test()
    wrong = !TestSumDim() || wrong;
    wrong = !TestTan() || wrong;
    wrong = !TestTranspose() || wrong;
-   wrong = !TestTopK() || wrong;
+   //wrong = !TestTopK() || wrong;
    wrong = !TestUnsqueeze() || wrong;
    wrong = !TestXMem() || wrong;
......
@@ -23,6 +23,7 @@
#define __TEST_H__

#include "TAbsolute.h"
#include "TClip.h"
#include "TConcatenate.h" #include "TConcatenate.h"
#include "TConcatenateSolely.h" #include "TConcatenateSolely.h"
#include "TCos.h" #include "TCos.h"
...@@ -46,6 +47,7 @@ ...@@ -46,6 +47,7 @@
#include "TReduceSum.h" #include "TReduceSum.h"
#include "TReduceSumSquared.h" #include "TReduceSumSquared.h"
#include "TReduceVariance.h" #include "TReduceVariance.h"
#include "TRound.h"
#include "TScaleAndShift.h" #include "TScaleAndShift.h"
#include "TSelect.h" #include "TSelect.h"
#include "TSetAscendingOrder.h" #include "TSetAscendingOrder.h"
......