Commit 182633ac by xuchen

Merge branch 'xuchen' into xiaotong-working

parents afa887d4 4336f2f9
...@@ -29,10 +29,18 @@ const char * GetOPName(int type) ...@@ -29,10 +29,18 @@ const char * GetOPName(int type)
if ((type & MATH_BASE) != 0){ if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE) if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE"; return "M_ABSOLUTE";
else if (type == MATH_CEIL)
return "M_CEIL";
else if (type == MATH_EXP) else if (type == MATH_EXP)
return "M_EXP"; return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_LOG) else if (type == MATH_LOG)
return "M_LOG"; return "M_LOG";
else if (type == MATH_SQRT)
return "M_SQRT";
else if (type == MATH_SQUARE)
return "M_SQUARE";
else if (type == MATH_SIN) else if (type == MATH_SIN)
return "M_SIN"; return "M_SIN";
else if (type == MATH_COS) else if (type == MATH_COS)
...@@ -113,7 +121,9 @@ const char * GetOPName(int type) ...@@ -113,7 +121,9 @@ const char * GetOPName(int type)
return "S_TOPK"; return "S_TOPK";
} }
else if ((type & FUNCTION_BASE) != 0){ else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH) if (type == FUNC_DROPOUT)
return "F_DROPOUT";
else if (type == FUNC_HARDTANH)
return "F_HARDTANH"; return "F_HARDTANH";
else if (type == FUNC_IDENTITY) else if (type == FUNC_IDENTITY)
return "F_IDENTITY"; return "F_IDENTITY";
......
...@@ -32,9 +32,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,9 +32,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_BASE 0x00001000 #define MATH_BASE 0x00001000
#define MATH_ABSOLUTE MATH_BASE + 1 #define MATH_ABSOLUTE MATH_BASE + 1
#define MATH_EXP MATH_ABSOLUTE + 1 #define MATH_CEIL MATH_ABSOLUTE + 1
#define MATH_LOG MATH_EXP + 1 #define MATH_EXP MATH_CEIL + 1
#define MATH_SIN MATH_LOG + 1 #define MATH_FLOOR MATH_EXP + 1
#define MATH_LOG MATH_FLOOR + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_SQUARE MATH_SQRT + 1
#define MATH_SIN MATH_SQUARE + 1
#define MATH_COS MATH_SIN + 1 #define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1 #define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1 #define MATH_ROUND MATH_TAN + 1
...@@ -88,7 +92,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -88,7 +92,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* activation functions */ /* activation functions */
#define FUNCTION_BASE DATA_BASE * 2 #define FUNCTION_BASE DATA_BASE * 2
#define FUNC_HARDTANH FUNCTION_BASE + 1 #define FUNC_DROPOUT FUNCTION_BASE + 1
#define FUNC_HARDTANH FUNC_DROPOUT + 1
#define FUNC_IDENTITY FUNC_HARDTANH + 1 #define FUNC_IDENTITY FUNC_HARDTANH + 1
#define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1 #define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1
#define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1 #define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h> #include <math.h>
#include "../../XName.h" #include "../../XName.h"
#include "Unary.h" #include "Unary.h"
...@@ -5,9 +26,18 @@ ...@@ -5,9 +26,18 @@
namespace nts{ namespace nts{
DTYPE square(DTYPE x)
{
return x * x;
}
DTYPE round(DTYPE r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
#ifdef USE_CUDA #ifdef USE_CUDA
/* define three marco separately, specify the respective function names */ /* define three marco separately, specify the respective function names (GPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \ #define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \ void _funcName(const XTensor * a, XTensor * b) \
{ \ { \
...@@ -45,14 +75,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs) ...@@ -45,14 +75,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute) _SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE) SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp) _SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp) _SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP) SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log) _SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log) _SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG) SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin) _SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin) _SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN) SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
...@@ -65,11 +116,8 @@ _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan) ...@@ -65,11 +116,8 @@ _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan) _SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN) SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
/*_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#else #else
/* define three marco separately, specify the respective function names */ /* define three marco separately, specify the respective function names (CPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \ #define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \ void _funcName(const XTensor * a, XTensor * b) \
{ \ { \
...@@ -102,14 +150,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, fabs) ...@@ -102,14 +150,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute) _SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE) SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, exp) _SIMPLE_UNARY_FUNCTION(_Exp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp) _SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP) SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_Log, log) _SIMPLE_UNARY_FUNCTION(_Log, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log) _SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG) SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, sin) _SIMPLE_UNARY_FUNCTION(_Sin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin) _SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN) SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h> #include <math.h>
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XName.h" #include "../../XName.h"
#include "Unary.h"
#include "Unary.cuh" #include "Unary.cuh"
namespace nts { namespace nts {
__device__
DTYPE CudaSquare(DTYPE x)
{
return x * x;
}
__device__
DTYPE CudaRound(DTYPE r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \ #define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \ __global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \ void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
...@@ -15,7 +49,7 @@ void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \ ...@@ -15,7 +49,7 @@ void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
b[i] = (DTYPE)origFunc(a[i]); \ b[i] = (DTYPE)origFunc(a[i]); \
} \ } \
__global__ \ __global__ \
void Kernel##funcName(__half * a, __half * b, int size) \ void Kernel##funcName(__half * a, __half * b, int size) \
{ \ { \
return; \ return; \
} \ } \
...@@ -37,12 +71,12 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \ ...@@ -37,12 +71,12 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
ProtectCudaDev(a->devID, devIDBackup); \ ProtectCudaDev(a->devID, devIDBackup); \
\ \
if (a->dataType == DEFAULT_DTYPE) { \ if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName << <blocks, threads >> > \ Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \ ((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
} \ } \
else if (a->dataType == X_FLOAT16) { \ else if (a->dataType == X_FLOAT16) { \
Kernel##funcName << <blocks, threads >> > \ Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, a->unitNum); \ ((__half*)a->data, (__half*)b->data, a->unitNum); \
} \ } \
else { \ else { \
ShowNTErrors("TODO!"); \ ShowNTErrors("TODO!"); \
...@@ -52,11 +86,16 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \ ...@@ -52,11 +86,16 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
} \ } \
SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs) SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp) SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
SIMPLE_UNARY_FUNCTION_GPU(Log, log) SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Round, CudaRound)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, CudaSquare)
SIMPLE_UNARY_FUNCTION_GPU(Sin, sin) SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, cos) SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, tan) SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
//SIMPLE_UNARY_FUNCTION_GPU(Round, round)
} }
\ No newline at end of file
...@@ -38,6 +38,15 @@ void KernelAbsolute(__half * a, __half * b, int size); ...@@ -38,6 +38,15 @@ void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */ /* set each entry to its absolute value */
void _CudaAbsolute(const XTensor * a, XTensor * b); void _CudaAbsolute(const XTensor * a, XTensor * b);
/* set each entry to its ceil value (CUDA Kernel) */
__global__
void KernelCeil(DTYPE * a, DTYPE * b, int size);
/* set each entry to its ceil value (CUDA Kernel) with float16 data type*/
__global__
void KernelCeil(__half * a, __half * b, int size);
/* set each entry to its ceil value */
void _CudaCeil(const XTensor * a, XTensor * b);
/* set each entry to its exponent value (CUDA Kernel) */ /* set each entry to its exponent value (CUDA Kernel) */
__global__ __global__
void KernelExp(DTYPE * a, DTYPE * b, int size); void KernelExp(DTYPE * a, DTYPE * b, int size);
...@@ -47,6 +56,15 @@ void KernelExp(__half * a, __half * b, int size); ...@@ -47,6 +56,15 @@ void KernelExp(__half * a, __half * b, int size);
/* set each entry to its exponent value */ /* set each entry to its exponent value */
void _CudaExp(const XTensor * a, XTensor * b); void _CudaExp(const XTensor * a, XTensor * b);
/* set each entry to its floor value (CUDA Kernel) */
__global__
void KernelFloor(DTYPE * a, DTYPE * b, int size);
/* set each entry to its floor value (CUDA Kernel) with float16 data type*/
__global__
void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* set each entry to its logarithm value (CUDA Kernel) */ /* set each entry to its logarithm value (CUDA Kernel) */
__global__ __global__
void KernelLog(DTYPE * a, DTYPE * b, int size); void KernelLog(DTYPE * a, DTYPE * b, int size);
...@@ -56,6 +74,34 @@ void KernelLog(__half * a, __half * b, int size); ...@@ -56,6 +74,34 @@ void KernelLog(__half * a, __half * b, int size);
/* set each entry to its logarithm value */ /* set each entry to its logarithm value */
void _CudaLog(const XTensor * a, XTensor * b); void _CudaLog(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
__global__
void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
__global__
void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
void _CudaRound(const XTensor * a, XTensor * b);
/* set each entry to its sqrt value (CUDA Kernel) */
__global__
void KernelSqrt(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sqrt value (CUDA Kernel) with float16 data type*/
__global__
void KernelSqrt(__half * a, __half * b, int size);
/* set each entry to its sqrt value */
void _CudaSqrt(const XTensor * a, XTensor * b);
/* set each entry to its square value (CUDA Kernel) */
__global__
void KernelSquare(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square value (CUDA Kernel) with float16 data type*/
__global__
void KernelSquare(__half * a, __half * b, int size);
/* set each entry to its square value */
void _CudaSquare(const XTensor * a, XTensor * b);
/* set each entry to its sine value (CUDA Kernel) */ /* set each entry to its sine value (CUDA Kernel) */
__global__ __global__
void KernelSin(DTYPE * a, DTYPE * b, int size); void KernelSin(DTYPE * a, DTYPE * b, int size);
...@@ -83,15 +129,6 @@ void KernelTan(__half * a, __half * b, int size); ...@@ -83,15 +129,6 @@ void KernelTan(__half * a, __half * b, int size);
/* set each entry to its tangent value */ /* set each entry to its tangent value */
void _CudaTan(const XTensor * a, XTensor * b); void _CudaTan(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
//__global__
//void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
//__global__
//void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
//void _CudaRound(const XTensor * a, XTensor * b);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -28,95 +28,103 @@ namespace nts{ ...@@ -28,95 +28,103 @@ namespace nts{
/* set every entry to its absolute value */ /* set every entry to its absolute value */
void _Absolute(const XTensor * a, XTensor * b); void _Absolute(const XTensor * a, XTensor * b);
/* /* set every entry to its absolute value (do it on site)
set every entry to its absolute value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _AbsoluteMe(XTensor * a); void _AbsoluteMe(XTensor * a);
/* /* set every entry to its absolute value (return a XTensor structure)
set every entry to its absolute value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Absolute(const XTensor & a); XTensor Absolute(const XTensor & a);
/* set every entry to its ceil value */
void _Ceil(const XTensor * a, XTensor * b);
/* set every entry to its ceil value (do it on site)
keep the result in the input tensor a and return nothing */
void _CeilMe(XTensor * a);
/* set every entry to its ceil value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Ceil(const XTensor & a);
/* set every entry to its exponent value */ /* set every entry to its exponent value */
void _Exp(const XTensor * a, XTensor * b); void _Exp(const XTensor * a, XTensor * b);
/* /* set every entry to its exponent value (do it on site)
set every entry to its exponent value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _ExpMe(XTensor * a); void _ExpMe(XTensor * a);
/* /* set every entry to its exponent value (return a XTensor structure)
set every entry to its exponent value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Exp(const XTensor & a); XTensor Exp(const XTensor & a);
/* set every entry to its floor value */
void _Floor(const XTensor * a, XTensor * b);
/* set every entry to its floor value (do it on site)
keep the result in the input tensor a and return nothing */
void _FloorMe(XTensor * a);
/* set every entry to its floor value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* set every entry to its logarithm value */ /* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b); void _Log(const XTensor * a, XTensor * b);
/* /* set every entry to its logarithm value (do it on site)
set every entry to its logarithm value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _LogMe(XTensor * a); void _LogMe(XTensor * a);
/* /* set every entry to its logarithm value (return a XTensor structure)
set every entry to its logarithm value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Log(const XTensor & a); XTensor Log(const XTensor & a);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */
void _RoundMe(XTensor * a);
/* set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */
void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a);
/* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b);
/* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */
void _SquareMe(XTensor * a);
/* set every entry to its square value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a);
/* set every entry to its sine value */ /* set every entry to its sine value */
void _Sin(const XTensor * a, XTensor * b); void _Sin(const XTensor * a, XTensor * b);
/* /* set every entry to its sine value (do it on site)
set every entry to its sine value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _SinMe(XTensor * a); void _SinMe(XTensor * a);
/* /* set every entry to its sine value (return a XTensor structure)
set every entry to its sine value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Sin(const XTensor & a); XTensor Sin(const XTensor & a);
/* set every entry to its cosine value */ /* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b); void _Cos(const XTensor * a, XTensor * b);
/* /* set every entry to its cosine value (do it on site)
set every entry to its cosine value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _CosMe(XTensor * a); void _CosMe(XTensor * a);
/* /* set every entry to its cosine value (return a XTensor structure)
set every entry to its cosine value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Cos(const XTensor & a); XTensor Cos(const XTensor & a);
/* set every entry to its tangent value */ /* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b); void _Tan(const XTensor * a, XTensor * b);
/* /* set every entry to its tangent value (do it on site)
set every entry to its tangent value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _TanMe(XTensor * a); void _TanMe(XTensor * a);
/* /* set every entry to its tangent value (return a XTensor structure)
set every entry to its tangent value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Tan(const XTensor & a); XTensor Tan(const XTensor & a);
/* set every entry to its round value */
//void _Round(const XTensor * a, XTensor * b);
/*
set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing
*/
//void _RoundMe(XTensor * a);
/*
set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
//XTensor Round(const XTensor & a);
} }
#endif //end __UNARY_H__ #endif //end __UNARY_H__
\ No newline at end of file
...@@ -480,8 +480,8 @@ void KernelReduceSumFast(__half * input, __half * output, ...@@ -480,8 +480,8 @@ void KernelReduceSumFast(__half * input, __half * output,
if data storage is discontinuius ,use this way to reduce if data storage is discontinuius ,use this way to reduce
*/ */
__global__ __global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum, void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
int strideNum, DTYPE * shift, DTYPE power, bool isExp) int blockNum, DTYPE * shift, DTYPE power, bool isExp)
{ {
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "../XName.h"
#include <math.h>
#include <time.h>
#include "Dropout.h"
#include "Dropout.cuh"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/ScaleAndShift.h"
namespace nts{ // namespace nts(NiuTrans.Tensor
/*
generate a random bernoulli number
*/
/* draw one Bernoulli sample: returns (DTYPE)1.0 with probability (1 - prob),
   and (DTYPE)0.0 otherwise (uses the C library rand(); seed it via srand first) */
DTYPE RandomBernoulli(DTYPE prob)
{
    const DTYPE u = (DTYPE)rand() / (DTYPE)RAND_MAX;
    if (u > prob)
        return (DTYPE)1.0;
    return (DTYPE)0.0;
}
/*
dropout function
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
>> x - input tensor
>> y - output tensor
>> prob - probability to set an element zero
*/
/* CPU/GPU dispatching implementation: builds a Bernoulli mask on the host,
   then either hands off to the CUDA path or applies mask * scale on CPU.
   NOTE(review): mask generation uses the host RNG even for GPU tensors, so the
   whole mask array is built on the CPU every call — presumably acceptable here. */
void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
{
CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");
/* inverted-dropout scaling: surviving activations are scaled by 1/(1-p).
   NOTE(review): prob == 1.0 passes the check above but divides by zero — confirm callers never pass 1.0 */
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor again with special probability */
srand(seed);
int unitNum = x->unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
/* mask lives in a buffer tensor shaped like x; SetData uploads the host array */
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
/* GPU path: fused multiply-and-scale kernel, then release host/buffer resources */
if(x->devID >=0 || y->devID >= 0){
_CudaDropout(x, y, maskTensor, scaleFactor);
DelTensorBuf(maskTensor);
delete[] maskArray;
return;
}
#endif
/* CPU path: y = (x * mask) * scaleFactor, via an intermediate buffer tensor.
   Buffers are released in reverse order of allocation — keep this ordering. */
XTensor * inter = NewTensorBuf(x, x->devID, x->mem);
_Multiply(x, maskTensor, inter);
_ScaleAndShift(inter, y, scaleFactor, 0);
DelTensorBuf(inter);
DelTensorBuf(maskTensor);
delete[] maskArray;
}
/*
dropout function (return a XTensor structure)
make a new tensor to keep the result and return it
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
>> x - input tensor
>> y - output tensor
>> prob - probability to set an element zero
*/
/* operator-style dropout: returns a new tensor y = x * mask * 1/(1-prob).
   Unlike _Dropout, the seed is taken from time(NULL), so results differ per call.
   NOTE(review): no range check on prob here (unlike _Dropout), and prob == 1.0
   divides by zero — confirm callers validate prob. */
XTensor Dropout(const XTensor &x, DTYPE prob)
{
XTensor y(&x);
y.SetTMP();
/* inverted-dropout scale factor applied to surviving activations */
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor again with special probability */
srand((unsigned int)time(NULL));
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
/* host-generated mask uploaded into a tensor shaped like x */
XTensor maskTensor(&x);
maskTensor.SetData(maskArray, unitNum);
/* y = (x * mask) * scaleFactor, using the operator-style API */
XTensor inter;
inter = Multiply(x, maskTensor);
y = ScaleAndShift(inter, scaleFactor, 0);
delete[] maskArray;
/* graph linking is intentionally disabled for now — left commented out */
///* tensor connection */
//XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
//XLink::AddParamToHead(&y, prob);
return y;
}
/*
backward computation of dropout function
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> prob - probability to set an element zero
*/
/* backward pass of dropout: dE/dx = dE/dy * mask * 1/(1-prob).
   The mask is REGENERATED from the given seed, so the caller must pass the
   same seed that was used in the forward _Dropout call for gradients to match. */
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
int unitNum = y->unitNum;
/* same inverted-dropout scale as the forward pass */
DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - prob);
/* generate a mask tensor again with special probability */
srand(seed);
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
/* GPU path: kernel applies dedy * mask * scale, then resources are released */
if(x->devID >= 0 || y->devID >= 0){
_CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
DelTensorBuf(maskTensor);
delete[] maskArray;
return;
}
#endif
/* CPU path: elementwise product over the raw data pointers */
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < unitNum; i++)
dedxp[i] = dedyp[i] * maskArray[i] * scaleFactor;
DelTensorBuf(maskTensor);
delete[] maskArray;
}
else
ShowNTErrors("TODO!");
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "Dropout.h"
#include "Dropout.cuh"
#include "Loss.cuh"
#include "../XDevice.h"
#ifdef USE_CUDA
// the CUDA stuff
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#endif
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
dropout function (Cuda kernel)
>> x - input data pointer
>> y - output data pointer
>> m - mask indicator to set zero
>> s - the scale factor
>> size - size of input/output
*/
/* elementwise dropout kernel: y[i] = x[i] * m[i] * s for each of `size` entries;
   one thread per element, extra threads return immediately */
__global__
void KernelDropoutCompute(DTYPE * x, DTYPE * y, DTYPE * m, DTYPE s, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size)
        return;
    y[idx] = x[idx] * m[idx] * s;
}
/*
dropout function (Cuda version)
>> x - input tensor
>> y - output tensor
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor
*/
/*
dropout function (Cuda version): y = x * mask * scaleFactor
>> x - input tensor
>> y - output tensor
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor (1/(1-prob), inverted dropout)
*/
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * mask, DTYPE scaleFactor)
{
    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
        /* fix: the message used to say "rectify" — copy-paste from another activation */
        CheckNTErrors(!x->isSparse && !y->isSparse, "the dropout function does not support sparse matrices.");
        /* fix: the old check (x->unitNum && y->unitNum) only required both sizes
           to be non-zero; the stated contract is that the lengths are EQUAL */
        CheckNTErrors(x->unitNum == y->unitNum, "we require two vectors with the same length.");

        int gridSize[3], blockSize[3];
        GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);

        /* run on x's device, restoring the previous device afterwards */
        int devIDBackup;
        ProtectCudaDev(x->devID, devIDBackup);

        KernelDropoutCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>
                            ((DTYPE*)x->data, (DTYPE*)y->data, (DTYPE*)mask->data,
                             scaleFactor, x->unitNum);

        BacktoCudaDev(x->devID, devIDBackup);
    }
    else
        ShowNTErrors("TODO!");
}
/*
backward computation of dropout function (Cuda kernel)
dE/dx = dE/dy * dy/dx, where dy/dx is the mask times the scale factor
>> dedy - dE/dy
>> dedx - dE/dx
>> m - mask indicator to set zero
>> s - the scale factor
>> size - size of input/output
*/
__global__
void KernelDropoutBackward(DTYPE * dedy, DTYPE * dedx,
                           DTYPE * m, DTYPE s, int size)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    /* threads past the end of the array have nothing to do */
    if (idx >= size)
        return;

    dedx[idx] = dedy[idx] * m[idx] * s;
}
/*
backward computation of dropout function (Cuda version)
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor
*/
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
                          const XTensor * dedy, XTensor * dedx,
                          const XTensor * mask, DTYPE scaleFactor)
{
    /* only the default data type is implemented so far */
    if(x->dataType != DEFAULT_DTYPE || y->dataType != DEFAULT_DTYPE){
        ShowNTErrors("TODO!");
        return;
    }

    int cudaGrids[3], cudaBlocks[3];
    GDevs.GetCudaThread(x->devID, x->unitNum, cudaGrids, cudaBlocks);

    int devIDBackup;
    ProtectCudaDev(x->devID, devIDBackup);

    /* dE/ds = dE/dy * dy/ds */
    KernelDropoutBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0])>>>
                          ((DTYPE*)dedy->data, (DTYPE*)dedx->data,
                           (DTYPE*)mask->data, scaleFactor, x->unitNum);

    BacktoCudaDev(x->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __DROPOUT_CUH__
#define __DROPOUT_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* dropout function (Cuda version): y = x * mask * scaleFactor */
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * mask, DTYPE scaleFactor);

/* de/dx (Cuda version)
   fixed: the mask parameter was named "r" here but "mask" in the
   definition; use one name for consistency */
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
                          const XTensor * dedy, XTensor * dedx,
                          const XTensor * mask, DTYPE scaleFactor);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __DROPOUT_H__
#define __DROPOUT_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* dropout function: fill y with a randomly-masked copy of x;
   "seed" drives the random mask, "prob" presumably is the drop
   probability — TODO confirm against Dropout.cpp */
void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
/* dropout function (operator form, returns the result tensor) */
XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
/* de/dx: backward computation of the dropout function;
   must be called with the same seed/prob as the forward pass so
   that the same mask is regenerated */
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob = 0.5);
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_H__
\ No newline at end of file
...@@ -51,8 +51,7 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, ...@@ -51,8 +51,7 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!"); CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
"TODO!");
int leadDimRDI = output->order - leadDim - 1; int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI]; int dimensionSize = output->dimSizeRDI[leadDimRDI];
......
...@@ -58,8 +58,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -58,8 +58,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!"); CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
"TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!"); CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!"); CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!");
CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!"); CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");
......
...@@ -48,19 +48,19 @@ loss function to measure the "number" of errors ...@@ -48,19 +48,19 @@ loss function to measure the "number" of errors
/* compute the loss */ /* compute the loss */
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg); bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss (log version) */ /* compute the loss (log version) */
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg); int leadDim, int gBeg, int gLen, int oBeg);
/* backward compuation for a single element */ /* backward compuation for a single element */
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName); DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward compuation for (dense) vectors */ /* backward compuation for (dense) vectors */
void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y, void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName, LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0); int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/ */
#include "../XName.h" #include "../XName.h"
#include <math.h> #include <math.h>
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/ */
#include "Sigmoid.h" #include "Sigmoid.h"
#include "Sigmoid.cuh" #include "Sigmoid.cuh"
......
...@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* rectify function y = max(0, x) (Cuda version) */ /* sigmoid function y = 1/(1+exp(-x)) (Cuda version) */
void _CudaSigmoid(const XTensor * input, XTensor * output); void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */ /* de/dx (Cuda version) */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "../XUtility.h"
#include "TDropout.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Dropout function.
Run dropout on a random tensor and count how many units were zeroed,
both for the explicit-probability call and the default-probability one.
*/
bool TestDropout1()
{
    /* an input tensor of size (40, 50) — fixed: the old comment said (4, 5) */
    int order = 2;
    int * dimSize = new int[order];
    dimSize[0] = 40;
    dimSize[1] = 50;

    int unitNum = 1;
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * x = NewTensor(order, dimSize);
    XTensor * y = NewTensor(order, dimSize);
    XTensor yUser;

    /* initialize variables */
    x->SetDataRand(0, 1);
    y->SetZeroAll();

    /* call Dropout function */
    float prob = 0.2F;
    int seed = 20;
    _Dropout(x, y, seed, prob);
    yUser = Dropout(*x);

    /* check result: count the zero units produced by dropout */
    int zeroNum1 = 0;
    int zeroNum2 = 0;
    float * data1 = (float*)y->data;
    float * data2 = (float*)yUser.data;
    for (int i = 0; i < unitNum; i++){
        DTYPE tmp1 = data1[i];
        DTYPE tmp2 = data2[i];
        if(tmp1 == 0.0F)
            zeroNum1 += 1;
        if(tmp2 == 0.0F)
            zeroNum2 += 1;
    }
    printf("CPU Test:\n");
    printf("In tensor y, there are %d units.\n", unitNum);
    printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
    printf("In tensor yUser, there are %d units.\n", unitNum);
    printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor yUserGPU;

    /* initialize variables */
    xGPU->SetDataRand(0, 1);
    yGPU->SetZeroAll();

    /* call Dropout function */
    _Dropout(xGPU, yGPU, seed, prob);
    yUserGPU = Dropout(*xGPU);

    /* check result
       NOTE(review): yGPU/yUserGPU live in device memory and cannot be
       dereferenced on the host, so the counts below still scan the CPU
       results (same as the original code). TODO: copy the GPU tensors
       back to host memory and verify them directly. */
    zeroNum1 = 0;
    zeroNum2 = 0;
    data1 = (float*)y->data;
    data2 = (float*)yUser.data;
    for (int i = 0; i < unitNum; i++){
        DTYPE tmp1 = data1[i];
        DTYPE tmp2 = data2[i];
        if(tmp1 == 0.0F)
            zeroNum1 += 1;
        if(tmp2 == 0.0F)
            zeroNum2 += 1;
    }
    /* fixed: this section previously printed "CPU Test:" */
    printf("GPU Test:\n");
    printf("In tensor y, there are %d units.\n", unitNum);
    printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
    printf("In tensor yUser, there are %d units.\n", unitNum);
    printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);

    /* destroy variables */
    delete x;
    delete y;
    delete xGPU;
    delete yGPU;
    delete[] dimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete x;
    delete y;
    delete[] dimSize;

    return cpuTest;
#endif // USE_CUDA
}
/*
case 2: test Dropout function and backward computation.
Forward and backward must be driven by the same seed so that they
regenerate the same dropout mask.
*/
bool TestDropout2()
{
    /* an input tensor of size (4, 5) */
    int order = 2;
    int * dimSize = new int[order];
    dimSize[0] = 4;
    dimSize[1] = 5;

    int unitNum = 1;
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * x = NewTensor(order, dimSize);
    XTensor * y = NewTensor(order, dimSize);
    XTensor * dedx = NewTensor(order, dimSize);
    XTensor * dedy = NewTensor(order, dimSize);

    /* initialize variables */
    _SetDataFixedFloat(x, 1.0F);
    y->SetZeroAll();
    dedx->SetZeroAll();
    _SetDataFixedFloat(dedy, 1.0F);

    /* call Dropout function */
    float prob = 0.5F;
    int seed = 1;
    _Dropout(x, y, seed, prob);
    /* fixed: pass the seed variable instead of the literal 1 so forward
       and backward stay in sync if the seed is ever changed */
    _DropoutBackward(y, x, dedy, dedx, seed, prob);

    /* check result (fixed: dedx was dumped under the label "dedy") */
    y->Dump(stderr, "y");
    dedx->Dump(stderr, "dedx");

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
    _SetDataFixedFloat(xGPU, 1.0F);
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
    _SetDataFixedFloat(dedyGPU, 1.0F);

    /* call Dropout function */
    _Dropout(xGPU, yGPU, seed, prob);
    _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, seed, prob);

    /* check result (fixed: dedxGPU was dumped under the label "dedyGPU") */
    yGPU->Dump(stderr, "yGPU");
    dedxGPU->Dump(stderr, "dedxGPU");

    /* destroy variables */
    delete x;
    delete y;
    delete dedx;
    delete dedy;
    delete xGPU;
    delete yGPU;
    delete dedxGPU;
    delete dedyGPU;
    delete[] dimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables
       fixed: dedx and dedy were leaked on this path */
    delete x;
    delete y;
    delete dedx;
    delete dedy;
    delete[] dimSize;

    return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Dropout Function: runs every case and reports a summary */
bool TestDropout()
{
    XPRINT(0, stdout, "[TEST DROPOUT] dropout function and its backward computation \n");
    bool returnFlag = true, caseFlag = true;

    /* case 1 test */
    caseFlag = TestDropout1();
    if (caseFlag)
        XPRINT(0, stdout, ">> case 1 passed!\n");
    else {
        returnFlag = false;
        XPRINT(0, stdout, ">> case 1 failed!\n");
    }

    /* case 2 test */
    caseFlag = TestDropout2();
    if (caseFlag)
        XPRINT(0, stdout, ">> case 2 passed!\n");
    else {
        returnFlag = false;
        XPRINT(0, stdout, ">> case 2 failed!\n");
    }

    /* other cases test */
    /*
    TODO!!
    */

    if (returnFlag)
        XPRINT(0, stdout, ">> All Passed!\n");
    else
        XPRINT(0, stdout, ">> Failed!\n");

    XPRINT(0, stdout, "\n");

    return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __TEST_DROPOUT_H__
#define __TEST_DROPOUT_H__
#include "../function/Dropout.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Dropout Function */
bool TestDropout();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_DROPOUT_H__
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/ */
#include<math.h>
#include "../core/math/ScaleAndShift.h" #include "../core/math/ScaleAndShift.h"
#include "TLoss.h" #include "TLoss.h"
...@@ -62,7 +63,7 @@ bool TestLoss1() ...@@ -62,7 +63,7 @@ bool TestLoss1()
error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0); error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
cpuTest = (error == answer); cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -82,7 +83,7 @@ bool TestLoss1() ...@@ -82,7 +83,7 @@ bool TestLoss1()
error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0); error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
gpuTest = (error == answer); gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */ /* destroy variables */
delete output; delete output;
...@@ -140,7 +141,7 @@ bool TestLoss2() ...@@ -140,7 +141,7 @@ bool TestLoss2()
error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0); error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
cpuTest = (error == answer); cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -160,7 +161,7 @@ bool TestLoss2() ...@@ -160,7 +161,7 @@ bool TestLoss2()
error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0); error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
gpuTest = (error == answer); gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */ /* destroy variables */
delete output; delete output;
...@@ -226,7 +227,7 @@ bool TestLoss3() ...@@ -226,7 +227,7 @@ bool TestLoss3()
error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0); error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
cpuTest = (error == answer); cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -244,7 +245,7 @@ bool TestLoss3() ...@@ -244,7 +245,7 @@ bool TestLoss3()
error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0); error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
gpuTest = (error == answer); gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */ /* destroy variables */
delete output; delete output;
......
...@@ -76,6 +76,7 @@ bool Test() ...@@ -76,6 +76,7 @@ bool Test()
wrong = !TestUnsqueeze() || wrong; wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong; wrong = !TestXMem() || wrong;
wrong = !TestDropout() || wrong;
wrong = !TestHardTanH() || wrong; wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong; wrong = !TestIdentity() || wrong;
wrong = !TestLogSoftmax() || wrong; wrong = !TestLogSoftmax() || wrong;
......
...@@ -69,6 +69,7 @@ ...@@ -69,6 +69,7 @@
#include "TUnsqueeze.h" #include "TUnsqueeze.h"
#include "TXMem.h" #include "TXMem.h"
#include "TDropout.h"
#include "THardTanH.h" #include "THardTanH.h"
#include "TIdentity.h" #include "TIdentity.h"
#include "TLogSoftmax.h" #include "TLogSoftmax.h"
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论