Merge with li branch

f8a37184 · liyinqiao · 0ca350a3 · 9b11391e · f8a37184 · f8a37184
Commit f8a37184 authored Jul 12, 2018 by liyinqiao
--- a/doc/manual.md
+++ b/doc/manual.md
--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -542,14 +542,14 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
    if (dataType == X_FLOAT) {
        d = new float[unitNum];
        for (int i = 0; i < unitNum; i++) {
-            DTYPE value = lower + upper * (float)rand() / RAND_MAX;
+            DTYPE value = lower + (upper - lower) * (float)rand() / RAND_MAX;
            *((float*)d + i) = value;
        }
    }
    else if (dataType == X_DOUBLE) {
        d = new double[unitNum];
        for (int i = 0; i < unitNum; i++) {
-            *((double*)d + i) = rand() / RAND_MAX;
+            *((double*)d + i) = lower + (upper - lower) * rand() / RAND_MAX;
        }
    }
    else {
@@ -922,8 +922,10 @@ set the value of a cell
 >> index - index of the cell for each dimension
 >> 
 */
-bool XTensor::Set(DTYPE value, int * index, int size)
+bool XTensor::Set(DTYPE value, int index[], int size)
 {
+	CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
+
    return SetToDevice(devID, GetCell(index, size), value);
 }


--- a/source/tensor/core/arithmetic/Absolute.cpp
+++ b/source/tensor/core/arithmetic/Absolute.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XTensor.h"
+#include "Absolute.h"
+#include "Absolute.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+replace every entry of a tensor with its absolute value (in place)
+>> a - the tensor we are processing
+*/
+void Absolute(XTensor * a)
+{
+#ifdef USE_CUDA
+    /* tensors with a non-negative device id are handed to the CUDA routine */
+    if (a->devID >= 0) {
+        CudaAbsolute(a);
+        return;
+    }
+#endif
+
+    /* CPU path: only the default data type is handled here */
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+
+    DTYPE * values = (DTYPE*)a->data;
+    const int total = a->unitNum;
+    int k = 0;
+    while (k < total) {
+        values[k] = (DTYPE)fabs(values[k]);
+        k++;
+    }
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Absolute.cu
+++ b/source/tensor/core/arithmetic/Absolute.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XDevice.h"
+#include "../../XTensor.h"
+#include "Absolute.h"
+#include "Absolute.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+/*
+set each entry to its absolute value (CUDA Kernel)
+each thread handles at most one entry; threads whose index is
+not smaller than size leave the array untouched
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelAbsolute(DTYPE * d, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* guard against the tail of the last block */
+    if (idx >= size)
+        return;
+
+    d[idx] = fabs(d[idx]);
+}
+
+/*
+set each entry to its absolute value (CUDA Kernel)
+This is for float16 computation
+each thread handles at most one entry; threads whose index is
+not smaller than size leave the array untouched
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelAbsolute(__half * d, int size)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size) {
+        /* a half-precision value and its absolute value differ only in the
+           sign bit (the top bit of the 16-bit pattern), so we clear that bit
+           directly instead of relying on half arithmetic intrinsics that are
+           not available on every GPU architecture */
+        unsigned short * bits = (unsigned short*)d;
+        bits[i] = (unsigned short)(bits[i] & 0x7FFF);
+    }
+}
+
+/*
+set each entry to its absolute value (launches KernelAbsolute on the tensor data)
+>> a - the tensor
+*/
+extern "C"
+void CudaAbsolute(XTensor * a)
+{
+    CheckNTErrors((a->isSparse == false), "TODO!");
+
+    /* launch configuration for a->unitNum entries; only the first
+       component of each array is used below */
+    int cudaGrid[3];
+    int cudaBlock[3];
+    GDevs.GetCudaThread(a->devID, a->unitNum, cudaGrid, cudaBlock);
+
+    dim3 blocks(cudaGrid[0]);
+    dim3 threads(cudaBlock[0]);
+
+    /* NOTE(review): presumably switches to the device of the tensor and remembers
+       the previous one in devIDBackup - confirm against XDevice.h */
+    int devIDBackup;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    const bool isDefaultType = (a->dataType == DEFAULT_DTYPE);
+    const bool isHalfType = (a->dataType == X_FLOAT16);
+
+    if (isDefaultType) {
+        KernelAbsolute<<<blocks, threads>>>((DTYPE*)a->data, a->unitNum);
+    }
+    else if (isHalfType) {
+        KernelAbsolute<<<blocks, threads>>>((__half*)a->data, a->unitNum);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+
+    BacktoCudaDev(a->devID, devIDBackup);
+}
+
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Absolute.cuh
+++ b/source/tensor/core/arithmetic/Absolute.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "Absolute.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* set each entry to its absolute value (CUDA Kernel)
+   >> d - pointer to the data array
+   >> size - size of the data array */
+__global__
+void KernelAbsolute(DTYPE * d, int size);
+
+/* set each entry to its absolute value (CUDA Kernel) with float16 data type
+   >> d - pointer to the data array
+   >> size - size of the data array */
+__global__
+void KernelAbsolute(__half * d, int size);
+
+/* set each entry to its absolute value
+   >> a - the tensor */
+extern "C"
+void CudaAbsolute(XTensor * a);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Absolute.h
+++ b/source/tensor/core/arithmetic/Absolute.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#ifndef __ABSOLUTE_H__
+#define __ABSOLUTE_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* set every entry to its absolute value (the tensor is modified in place)
+   >> a - the tensor we are processing */
+extern "C"
+void Absolute(XTensor * a);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __ABSOLUTE_H__
--- a/source/tensor/core/arithmetic/MatrixMulBatched.cpp
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.cpp
@@ -89,9 +89,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
        void * ap = (char*)a->data + aRealBlockSize * p;
        void * bp = (char*)b->data + bRealBlockSize * p;
        void * cp = (char*)c->data + cRealBlockSize * p;
-        XTensor * ai = new XTensor(2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
-        XTensor * bi = new XTensor(2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
-        XTensor * ci = new XTensor(2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
+        XTensor * ai = NewTensor(2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
+        XTensor * bi = NewTensor(2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
+        XTensor * ci = NewTensor(2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
        ai->data = ap;
        bi->data = bp;
        ci->data = cp;

--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XTensor.h"
+#include "Sign.h"
+#include "Sign.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+replace every entry of a tensor with its sign value (in place):
+1 for entries greater than zero, 0 for entries equal to zero, -1 otherwise
+>> a - the tensor we are processing
+*/
+void Sign(XTensor * a)
+{
+#ifdef USE_CUDA
+    /* tensors with a non-negative device id are handed to the CUDA routine */
+    if (a->devID >= 0) {
+        CudaSign(a);
+        return;
+    }
+#endif
+
+    /* CPU path: only the default data type is handled here */
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+
+    DTYPE * values = (DTYPE*)a->data;
+    const int total = a->unitNum;
+    for (int k = 0; k < total; k++) {
+        const DTYPE v = values[k];
+        /* the order of the tests matters: whatever is neither > 0 nor == 0 becomes -1 */
+        values[k] = (v > 0) ? 1.0F : ((v == 0) ? 0.0F : -1.0F);
+    }
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.cu
+++ b/source/tensor/core/arithmetic/Sign.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XDevice.h"
+#include "../../XTensor.h"
+#include "Sign.h"
+#include "Sign.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+/*
+set each entry to its sign value (CUDA Kernel):
+1 for entries greater than zero, 0 for entries equal to zero, -1 otherwise
+each thread handles at most one entry
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelSign(DTYPE * d, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* guard against the tail of the last block */
+    if (idx >= size)
+        return;
+
+    const DTYPE v = d[idx];
+
+    /* the order of the tests matters: whatever is neither > 0 nor == 0 becomes -1 */
+    d[idx] = (v > 0) ? 1.0F : ((v == 0) ? 0.0F : -1.0F);
+}
+
+/*
+set each entry to its sign value (CUDA Kernel)
+This is for float16 computation
+the result follows the DTYPE kernel: 1 for entries greater than zero,
+0 for entries equal to zero (+0 or -0), -1 otherwise (negative values and NaN)
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelSign(__half * d, int size)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size) {
+        /* we work on the raw 16-bit half-precision pattern (1 sign bit,
+           5 exponent bits, 10 fraction bits) so that no half arithmetic
+           intrinsics are required */
+        unsigned short * bits = (unsigned short*)d;
+        const unsigned short raw = bits[i];
+        const unsigned short magnitude = (unsigned short)(raw & 0x7FFF);
+
+        if (magnitude == 0)
+            bits[i] = 0x0000;    /* +0 or -0 -> 0 */
+        else if ((raw & 0x8000) == 0 && magnitude <= 0x7C00)
+            bits[i] = 0x3C00;    /* positive, including +inf -> 1 */
+        else
+            bits[i] = 0xBC00;    /* negative or NaN -> -1 */
+    }
+}
+
+/*
+set each entry to its sign value (launches KernelSign on the tensor data)
+>> a - the tensor
+*/
+extern "C"
+void CudaSign(XTensor * a)
+{
+    CheckNTErrors((a->isSparse == false), "TODO!");
+
+    /* launch configuration for a->unitNum entries; only the first
+       component of each array is used below */
+    int cudaGrid[3];
+    int cudaBlock[3];
+    GDevs.GetCudaThread(a->devID, a->unitNum, cudaGrid, cudaBlock);
+
+    dim3 blocks(cudaGrid[0]);
+    dim3 threads(cudaBlock[0]);
+
+    /* NOTE(review): presumably switches to the device of the tensor and remembers
+       the previous one in devIDBackup - confirm against XDevice.h */
+    int devIDBackup;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    const bool isDefaultType = (a->dataType == DEFAULT_DTYPE);
+    const bool isHalfType = (a->dataType == X_FLOAT16);
+
+    if (isDefaultType) {
+        KernelSign<<<blocks, threads>>>((DTYPE*)a->data, a->unitNum);
+    }
+    else if (isHalfType) {
+        KernelSign<<<blocks, threads>>>((__half*)a->data, a->unitNum);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+
+    BacktoCudaDev(a->devID, devIDBackup);
+}
+
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Sign.cuh
+++ b/source/tensor/core/arithmetic/Sign.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "Sign.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* set each entry to its sign value (CUDA Kernel)
+   >> d - pointer to the data array
+   >> size - size of the data array */
+__global__
+void KernelSign(DTYPE * d, int size);
+
+/* set each entry to its sign value (CUDA Kernel) with float16 data type
+   >> d - pointer to the data array
+   >> size - size of the data array */
+__global__
+void KernelSign(__half * d, int size);
+
+/* set each entry to its sign value
+   >> a - the tensor */
+extern "C"
+void CudaSign(XTensor * a);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#ifndef __SIGN_H__
+#define __SIGN_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* set every entry to its sign value (the tensor is modified in place)
+   >> a - the tensor we are processing */
+extern "C"
+void Sign(XTensor * a);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __SIGN_H__
--- a/source/tensor/core/arithmetic/SumByColumnVT.cu
+++ b/source/tensor/core/arithmetic/SumByColumnVT.cu
@@ -52,7 +52,7 @@ void KernelADDByColumnVT(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int rowNum
        DTYPE * bp = b + (rowNum * k + row) * colNum;
        if (colNum % 4 == 0) {
            for (int i = 0; i < colNum; i += 4)
-                sum += bp[i] + bp[i + 1] + b[i + 2] + b[i + 3];
+                sum += bp[i] + bp[i + 1] + bp[i + 2] + bp[i + 3];
        }
        else if (colNum % 2 == 0) {
            for (int i = 0; i < colNum; i += 2)

--- a/source/tensor/core/getandset/ConvertDataType.cpp
+++ b/source/tensor/core/getandset/ConvertDataType.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XTensor.h"
+#include "ConvertDataType.h"
+#include "ConvertDataType.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+convert data type
+the entries of the input are cast one by one and written to the output.
+On CPUs only X_FLOAT -> X_INT and X_INT -> X_FLOAT are supported. Nothing
+is done if both tensors already have the same data type.
+>> input - input tensor
+>> output - output tensor
+*/
+void ConvertTensorDataType(XTensor * input, XTensor * output)
+{
+    /* NOTE(review): if XTensor::IsIdentical also compares the data types, this
+       check rejects every real conversion request - confirm against XTensor.cpp */
+    CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!");
+
+    /* nothing to convert */
+    if (input->dataType == output->dataType)
+        return;
+
+#ifdef USE_CUDA
+    /* run it on GPUs */
+    if (input->devID >= 0) {
+        CudaConvertDataType(input, output);
+        return;
+    }
+#endif
+
+    if (input->dataType == X_FLOAT && output->dataType == X_INT) {
+        float * inputData = (float*)input->data;
+        int * outputData = (int*)output->data;
+        for (int i = 0; i < input->unitNum; i++)
+            outputData[i] = (int)inputData[i];
+    }
+    else if (input->dataType == X_INT && output->dataType == X_FLOAT) {
+        int * inputData = (int*)input->data;
+        float * outputData = (float*)output->data;
+        for (int i = 0; i < input->unitNum; i++)
+            outputData[i] = (float)inputData[i];
+    }
+    else {
+        ShowNTErrors("Unsupported data types for conversion!");
+    }
+}
+
+/*
+convert data type
+This is the name declared in ConvertDataType.h. It simply forwards to
+ConvertTensorDataType so that the declared entry point has a definition.
+>> input - input tensor
+>> output - output tensor
+*/
+void ConvertDataType(XTensor * input, XTensor * output)
+{
+    ConvertTensorDataType(input, output);
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/getandset/ConvertDataType.cu
+++ b/source/tensor/core/getandset/ConvertDataType.cu
@@ -21,6 +21,7 @@

 #include "../../XTensor.h"
 #include "../../XDevice.h"
+#include "ConvertDataType.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -49,6 +50,24 @@ void KernelFloat16ToFloat(__half * s, float * t, int size)
    }
 }

+/*
+convert data type from X_FLOAT to X_INT (CUDA Kernel)
+each thread casts at most one entry
+>> inputData - the float array we read from
+>> outputData - the int array we write to
+>> size - number of entries to convert
+*/
+__global__
+void KernelFloatToInt(float * inputData, int * outputData, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* guard against the tail of the last block */
+    if (idx >= size)
+        return;
+
+    outputData[idx] = (int)(inputData[idx]);
+}
+
+/*
+convert data type from X_INT to X_FLOAT (CUDA Kernel)
+each thread casts at most one entry
+>> inputData - the int array we read from
+>> outputData - the float array we write to
+>> size - number of entries to convert
+*/
+__global__
+void KernelIntToFloat(int * inputData, float * outputData, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* guard against the tail of the last block */
+    if (idx >= size)
+        return;
+
+    outputData[idx] = (float)(inputData[idx]);
+}

 /* 
 data conversion (cuda code) 
@@ -88,6 +107,39 @@ void CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, 
    ProtectCudaDev(devID, devIDBackup);
 }

+/*
+convert data type (cuda code)
+only X_FLOAT -> X_INT and X_INT -> X_FLOAT are supported. Nothing is done
+if both tensors already have the same data type.
+>> input - input tensor
+>> output - output tensor
+*/
+void CudaConvertDataType(XTensor * input, XTensor * output)
+{
+    /* NOTE(review): if XTensor::IsIdentical also compares the data types, this
+       check rejects every real conversion request - confirm against XTensor.cpp */
+    CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!");
+
+    /* nothing to convert */
+    if (input->dataType == output->dataType)
+        return;
+
+    int gridSize[3];
+    int blockSize[3];
+
+    /* launch configuration for input->unitNum entries; only the first
+       component of each array is used below */
+    GDevs.GetCudaThread(input->devID, input->unitNum, gridSize, blockSize);
+
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);
+
+    int devIDBackup;
+    ProtectCudaDev(input->devID, devIDBackup);
+
+    if (input->dataType == X_FLOAT && output->dataType == X_INT) {
+        KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum);
+    }
+    else if (input->dataType == X_INT && output->dataType == X_FLOAT) {
+        KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum);
+    }
+    else {
+        ShowNTErrors("Unsupported data types for conversion!");
+    }
+
+    /* pair the ProtectCudaDev call above with BacktoCudaDev, as the other
+       CUDA launchers do (calling ProtectCudaDev a second time was a slip) */
+    BacktoCudaDev(input->devID, devIDBackup);
+}
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/getandset/ConvertDataType.cuh
+++ b/source/tensor/core/getandset/ConvertDataType.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "ConvertDataType.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* convert data type from X_FLOAT to X_FLOAT16 (CUDA Kernel) */
+__global__
+void KernelFloatToFloat16(float * s, __half * t, int size);
+
+/* convert data type from X_FLOAT16 to X_FLOAT (CUDA Kernel) */
+__global__
+void KernelFloat16ToFloat(__half * s, float * t, int size);
+
+/* convert data type from X_FLOAT to X_INT (CUDA Kernel)
+   >> inputData - the float array we read from
+   >> outputData - the int array we write to
+   >> size - number of entries to convert */
+__global__
+void KernelFloatToInt(float * inputData, int * outputData, int size);
+
+/* convert data type from X_INT to X_FLOAT (CUDA Kernel)
+   >> inputData - the int array we read from
+   >> outputData - the float array we write to
+   >> size - number of entries to convert */
+__global__
+void KernelIntToFloat(int * inputData, float * outputData, int size);
+
+/* convert data type (tensor-level interface, cuda code)
+   >> input - input tensor
+   >> output - output tensor */
+void CudaConvertDataType(XTensor * input, XTensor * output);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/getandset/ConvertDataType.h
+++ b/source/tensor/core/getandset/ConvertDataType.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#ifndef __CONVERTDATATYPE_H__
+#define __CONVERTDATATYPE_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* convert data type
+   >> input - input tensor
+   >> output - output tensor */
+void ConvertDataType(XTensor * input, XTensor * output);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __CONVERTDATATYPE_H__
--- a/source/tensor/core/math/Log.cpp
+++ b/source/tensor/core/math/Log.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XTensor.h"
+#include "Log.h"
+#include "Log.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+replace every entry of a tensor with its log value (in place)
+>> a - the tensor we are processing
+*/
+void Log(XTensor * a)
+{
+#ifdef USE_CUDA
+    /* tensors with a non-negative device id are handed to the CUDA routine */
+    if (a->devID >= 0) {
+        CudaLog(a);
+        return;
+    }
+#endif
+
+    /* CPU path: only the default data type is handled here */
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+
+    DTYPE * values = (DTYPE*)a->data;
+    const int total = a->unitNum;
+    int k = 0;
+    while (k < total) {
+        values[k] = (DTYPE)log(values[k]);
+        k++;
+    }
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/math/Log.cu
+++ b/source/tensor/core/math/Log.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XDevice.h"
+#include "../../XTensor.h"
+#include "Log.h"
+#include "Log.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+/*
+set each entry to its log value (CUDA Kernel)
+each thread handles at most one entry; threads whose index is
+not smaller than size leave the array untouched
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelLog(DTYPE * d, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* guard against the tail of the last block */
+    if (idx >= size)
+        return;
+
+    d[idx] = log(d[idx]);
+}
+
+/*
+set each entry to its log value (CUDA Kernel)
+This is for float16 computation
+the logarithm itself is computed in single precision and the result is
+rounded back to float16
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelLog(__half * d, int size)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size) {
+        /* go through float so that only the conversion intrinsics are needed,
+           not the half arithmetic that requires newer GPUs */
+        d[i] = __float2half(logf(__half2float(d[i])));
+    }
+}
+
+/*
+set each entry to its log value (launches KernelLog on the tensor data)
+>> a - the tensor
+*/
+extern "C"
+void CudaLog(XTensor * a)
+{
+    CheckNTErrors((a->isSparse == false), "TODO!");
+
+    /* launch configuration for a->unitNum entries; only the first
+       component of each array is used below */
+    int cudaGrid[3];
+    int cudaBlock[3];
+    GDevs.GetCudaThread(a->devID, a->unitNum, cudaGrid, cudaBlock);
+
+    dim3 blocks(cudaGrid[0]);
+    dim3 threads(cudaBlock[0]);
+
+    /* NOTE(review): presumably switches to the device of the tensor and remembers
+       the previous one in devIDBackup - confirm against XDevice.h */
+    int devIDBackup;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    const bool isDefaultType = (a->dataType == DEFAULT_DTYPE);
+    const bool isHalfType = (a->dataType == X_FLOAT16);
+
+    if (isDefaultType) {
+        KernelLog<<<blocks, threads>>>((DTYPE*)a->data, a->unitNum);
+    }
+    else if (isHalfType) {
+        KernelLog<<<blocks, threads>>>((__half*)a->data, a->unitNum);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+
+    BacktoCudaDev(a->devID, devIDBackup);
+}
+
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/math/Log.cuh
+++ b/source/tensor/core/math/Log.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "Log.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* set each entry to its log value (CUDA Kernel)
+   >> d - pointer to the data array
+   >> size - size of the data array */
+__global__
+void KernelLog(DTYPE * d, int size);
+
+/* set each entry to its log value (CUDA Kernel) with float16 data type
+   >> d - pointer to the data array
+   >> size - size of the data array */
+__global__
+void KernelLog(__half * d, int size);
+
+/* set each entry to its log value
+   >> a - the tensor */
+extern "C"
+void CudaLog(XTensor * a);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/math/Log.h
+++ b/source/tensor/core/math/Log.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#ifndef __LOG_H__
+#define __LOG_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* set every entry to its log value (the tensor is modified in place)
+   >> a - the tensor we are processing */
+extern "C"
+void Log(XTensor * a);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __LOG_H__
--- a/source/tensor/core/math/ScaleAndShift.cuh
+++ b/source/tensor/core/math/ScaleAndShift.cuh
@@ -37,6 +37,7 @@ __global__
 void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift);

 /* scale and shift all tensor entires b = a * scale + shift (cuda version) */
+extern "C" 
 void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift);

 #endif // USE_CUDA

--- a/source/tensor/core/movement/CopyBlocks.cpp
+++ b/source/tensor/core/movement/CopyBlocks.cpp
@@ -66,18 +66,19 @@ copy a number of blocks source source positions to target positions
 >> targetBlocks - target positions of the copy
 >> myMem - the memory pool
 */
-void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
-    if (myMem != NULL && myMem->devID >= 0) {
+    if (myMem != NULL)
+        CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
+
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem);
+        CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID);
 #else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
    }
    else {
-        int devID = myMem != NULL ? myMem->devID : -1;
-
        /* 
        The following code should be fine with GPUs, but too many
        kernel calls would slow down the system. We prefer to use

--- a/source/tensor/core/movement/CopyBlocks.h
+++ b/source/tensor/core/movement/CopyBlocks.h
@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);

 /* copy a number of blocks from source positions to target positions */
-void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/movement/CopyBlocksSelected.cu
+++ b/source/tensor/core/movement/CopyBlocksSelected.cu
@@ -70,28 +70,33 @@ copy a number of blocks from source positions to target positions (cuda version)
 >> targetBlocks - target positions of the copy
 >> myMem - memory pool
 */
-void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID) /* devID - device to run on (must be >= 0); myMem may be NULL */
 {
-    CheckNTErrors((myMem != NULL), "No memory pool!");
-    CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
+    CheckNTErrors((devID >= 0), "Wrong device to run!");
     CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
 
     /* copy the index to the GPU memory */
-    int * sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
-    int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
-    XMemCopy(sourceBlocksTMP, myMem->devID, sourceBlocks, -1, blockNum * sizeof(int));
-    XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
+    int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int)); /* pool buffer when a pool is given, otherwise a direct allocation on devID */
+    int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int)); /* NOTE(review): the pool path uses myMem->devID while the copies below use devID; this function does not check that they are equal */
+    XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int)); /* NOTE(review): source device id -1 is presumably the host (CPU) - confirm against XMemCopy */
+    XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
 
     int cudaGrids[3];
     int cudaBlocks[3];
 
-    GDevs.GetCudaThread2D(myMem->devID, blockSize / sizeof(DTYPE), blockNum, MAX_INT, cudaGrids, cudaBlocks);
+    GDevs.GetCudaThread2D(devID, blockSize / sizeof(DTYPE), blockNum, MAX_INT, cudaGrids, cudaBlocks); /* launch shape for (blockSize / sizeof(DTYPE)) x blockNum items */
 
     KernelCopyBlocksSelected << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                ((DTYPE*)source, blockSize / sizeof(DTYPE), sourceBlocksTMP, blockNum, (DTYPE*)target, targetBlocksTMP);
-
-    myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
-    myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+    
+    if (myMem != NULL) { /* give the two index buffers back the same way they were obtained */
+        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+    }
+    else {
+        XMemFree(devID, sourceBlocksTMP);
+        XMemFree(devID, targetBlocksTMP);
+    }
 }

 #endif // USE_CUDA

--- a/source/tensor/core/movement/CopyBlocksSelected.cuh
+++ b/source/tensor/core/movement/CopyBlocksSelected.cuh
@@ -34,7 +34,7 @@ void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks,

 /* copy a number of blocks form source positions to target positions (cuda version) */
 extern "C"
-void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);

 #endif // USE_CUDA


--- a/source/tensor/core/movement/CopyIndexed.cpp
+++ b/source/tensor/core/movement/CopyIndexed.cpp
@@ -84,7 +84,7 @@ bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSiz
        CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!");
    }

-    CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem);
+    CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID);

    delete[] realSrcIndex;
    delete[] realTgtIndex;

--- a/source/tensor/core/movement/CopyValues.h
+++ b/source/tensor/core/movement/CopyValues.h
@@ -27,8 +27,9 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* copy s to t */
+extern "C"
 bool CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);

 } // namespace nts(NiuTrans.Tensor)

-#endif // __COPYVALUES_H__
+#endif // __COPYVALUES_H__
\ No newline at end of file
--- a/source/tensor/function/Loss.cpp
+++ b/source/tensor/function/Loss.cpp
--- a/source/tensor/function/Loss.cu
+++ b/source/tensor/function/Loss.cu
@@ -22,6 +22,14 @@
 #include "Loss.h"
 #include "Loss.cuh"
 #include "../XDevice.h"
+#include "../core/math/Power.h"
+#include "../core/math/ScaleAndShift.h"
+#include "../core/math/Log.h"
+#include "../core/arithmetic/Negate.h"
+#include "../core/arithmetic/Sum.h"
+#include "../core/arithmetic/Multiply.h"
+#include "../core/reduce/ReduceSum.h"
+#include "../core/movement/CopyValues.h"

 namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -46,7 +54,126 @@ compute the loss
 DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
                       bool isLogOutput, int leadDim, int gBeg, int gLen, int yBeg)
 {
-    return 0;
+    CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
+    CheckNTErrors((XTensor::IsIdentical(gold, y)), "The input tensors must be of the same size!");
+    CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
+    CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
+    CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
+                         "TODO!");
+    CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
+    CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!");
+    CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");
+
+    if(isLogOutput)
+        return LossComputeForLogScale(gold, y, LFName, leadDim, gBeg, gLen, yBeg);
+
+    DTYPE error = 0.0F; /* stays 0 when LFName matches none of the branches below */
+
+    /* 
+    squared error 
+    loss = sum_{i} 0.5*(gold_i - output_i)^2
+    where gold_i is the gold standard and output_i is the model prediction
+    */
+    if(LFName == SQUAREDERROR){
+        XTensor * diff = NewTensor(gold->order, gold->dimSize, gold->dataType, gold->denseRatio, gold->devID, gold->mem);
+        _Sum(gold, y, diff, -1.0F);
+        Power(diff, 2.0F);
+        _ScaleAndShiftMe(diff, 0.5F, 0.0F);
+
+        int reduceTimes = diff->order; /* one ReduceSum pass per dimension of the element-wise result */
+        for (int i = 0; i < reduceTimes; i++) {
+            int diffOrder = diff->order - 1;
+            int * diffDimSize = new int[diffOrder];
+            memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
+            XTensor * diffNew = NewTensor(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
+            int reducePlace = diff->dimSize[0] == 1 ? 1 : 0; /* reduce along dim 1 when dim 0 has size 1, else along dim 0 */
+            ReduceSum(diff, diffNew, reducePlace);
+            if (diffNew->order == 1) { /* reshape a 1-D result to (1, n) */
+                diffNew->order = 2;
+                diffNew->dimSize[1] = diffNew->dimSize[0];
+                diffNew->dimSize[0] = 1;
+                diffNew->dimSizeRDI[1] = 1;
+            }
+            delete diff;
+            diff = diffNew;
+            delete[] diffDimSize; /* allocated with new[], so it must be released with delete[] */
+        }
+        error = diff->Get2D(0, 0);
+        delete diff;
+    }
+
+    /* 
+    cross entropy
+    loss = sum_{i} (-gold_i * log(output_i))
+    where gold and output are distributions 
+    */
+    if(LFName == CROSSENTROPY){
+        XTensor * diff = NewTensor(y->order, y->dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
+        CopyValues(y, diff);
+        Log(diff);
+        _Multiply(gold, diff, diff);
+        Negate(diff);
+
+        int reduceTimes = diff->order;
+        for (int i = 0; i < reduceTimes; i++) {
+            int diffOrder = diff->order - 1;
+            int * diffDimSize = new int[diffOrder];
+            memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
+            XTensor * diffNew = NewTensor(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
+            int reducePlace = diff->dimSize[0] == 1 ? 1 : 0;
+            ReduceSum(diff, diffNew, reducePlace);
+            if (diffNew->order == 1) {
+                diffNew->order = 2;
+                diffNew->dimSize[1] = diffNew->dimSize[0];
+                diffNew->dimSize[0] = 1;
+                diffNew->dimSizeRDI[1] = 1;
+            }
+            delete diff;
+            diff = diffNew;
+            delete[] diffDimSize; /* allocated with new[] */
+        }
+        error = diff->Get2D(0, 0);
+        delete diff;
+    }
+    
+    /*
+    one hot error
+    loss = sum_{i} e_i 
+    where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, 
+          e_i = 0 otherwise
+    */
+    if(LFName == ONEHOTERROR){
+        XTensor * diff = NewTensor(gold->order, gold->dimSize, gold->dataType, gold->denseRatio, gold->devID, gold->mem);
+        XTensor * yOnehot = NewTensor(y->order, y->dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
+        CopyValues(y, yOnehot);
+        _Multiply(gold, y, yOnehot);
+        _Sum(gold, yOnehot, diff, -1.0F);
+        Power(diff, 2.0F);
+        _ScaleAndShiftMe(diff, 0.5F, 0.0F);
+
+        int reduceTimes = diff->order;
+        for (int i = 0; i < reduceTimes; i++) {
+            int diffOrder = diff->order - 1;
+            int * diffDimSize = new int[diffOrder];
+            memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
+            XTensor * diffNew = NewTensor(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
+            int reducePlace = diff->dimSize[0] == 1 ? 1 : 0;
+            ReduceSum(diff, diffNew, reducePlace);
+            if (diffNew->order == 1) {
+                diffNew->order = 2;
+                diffNew->dimSize[1] = diffNew->dimSize[0];
+                diffNew->dimSize[0] = 1;
+                diffNew->dimSizeRDI[1] = 1;
+            }
+            delete diff;
+            diff = diffNew;
+            delete[] diffDimSize; /* allocated with new[] */
+        }
+        error = diff->Get2D(0, 0);
+        delete diff;
+        delete yOnehot;
+    }
+    return error;
 
     // TODO: call cuda kernels for computing the errors
 }
@@ -140,13 +267,25 @@ backward compuation for cross entropy (Cuda kernel)
 >> size - size of the vector (dedy)
 */
 extern "C" __global__ 
-void KernelLossBackwardCrossEntropy(DTYPE * dedy, DTYPE * t, DTYPE * y, int size)
+void KernelLossBackwardCrossEntropy(DTYPE * dedy, DTYPE * t, DTYPE * y, int tBeg, int tLen, int yBeg, int blockNum, int stride, int dimensionSize)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= stride * dimensionSize * blockNum) /* valid indices are 0 .. total-1, so the thread with i == total must exit too */
+        return;
 
-    if (i < size){
+    int blockNumIndex = i / (stride * dimensionSize);  /* which block of (stride * dimensionSize) items i falls in */
+    int blockNumTail = i % (stride * dimensionSize);   /* offset of i inside that block */
+    int dimensionSizeIndex = blockNumTail / stride;    /* position along the dimension of size dimensionSize */
+    int strideIndex = blockNumTail % stride;           /* position inside one stride */
+
+    if (dimensionSizeIndex >= tLen) /* only the first tLen positions along that dimension are written */
+        return;
+
+    dedy[blockNumIndex * stride * dimensionSize + strideIndex + stride * (yBeg + dimensionSizeIndex)] = -t[blockNumIndex * stride * dimensionSize + 
+        strideIndex + stride * (tBeg + dimensionSizeIndex)] / y[blockNumIndex * stride * dimensionSize + strideIndex + stride * (yBeg + dimensionSizeIndex)];
+    /*if (i < size){
         dedy[i] =  -t[i]/y[i];
-    }
+    }*/
 }

 /* 
@@ -193,9 +332,11 @@ void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                      LOSS_FUNCTION_NAME LFName, 
                      int leadDim, int tBeg, int tLen, int yBeg)
 {
+    CheckNTErrors((tLen <= y->unitNum), "Illegal input length!");
    CheckNTErrors((XTensor::IsIdentical(t, y)&& XTensor::IsIdentical(dedy, y)), 
                        "The input tensors must be of the same size!");
-    CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[1] == 1), "TODO!");
+    CheckNTErrors(((dedy->devID == t->devID) && (dedy->devID == y->devID)), "Tensor must be on the same device!");
+    CheckNTErrors((t->order > leadDim), "Illegal leading dimension!");
    CheckNTErrors((t->dataType == DEFAULT_DTYPE && 
                         y->dataType == DEFAULT_DTYPE && 
                         dedy->dataType == DEFAULT_DTYPE),
@@ -208,21 +349,25 @@ void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                        "The vectors must be on the same GPU.");
    CheckNTErrors((tBeg == yBeg), "TODO!");

-    int leadDimRDI = y->order - leadDim - 1;
+    int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
    if(leadDimRDI < 0){
-        leadDimRDI = y->dimSizeRDI[y->order - 1];
+        leadDimRDI = y->order - 1;
        tBeg = 0;
        yBeg = 0;
        tLen = y->dimSizeRDI[leadDimRDI];
    }

+    int dimensionSize = y->dimSizeRDI[leadDimRDI];
    int stride = 1;
    int blockSize = 1;
+    int blockNum = 1;
    int size = 1;

    for(int i = 0; i < leadDimRDI; i++)
        stride *= y->dimSizeRDI[i];
    size = tLen * stride;
+    blockSize = stride * dimensionSize;
+    blockNum = y->unitNum / blockSize;

    int cudaGridSize[3], cudaBlockSize[3];

@@ -265,7 +410,7 @@ void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
            ShowNTErrors("TODO!");
        }
        else if(size == y->unitNum){
-            KernelLossBackwardCrossEntropy<<<blocks, threads>>>(dedyp, tp, yp, tLen);
+            KernelLossBackwardCrossEntropy<<<blocks, threads>>>(dedyp, tp, yp, tBeg, tLen, yBeg, blockNum, stride, dimensionSize);
        }
        else{
            KernelLossBackwardCrossEntropyBlock<<<blocks, threads>>>(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);

--- a/source/tensor/function/Rectify.cu
+++ b/source/tensor/function/Rectify.cu
@@ -97,7 +97,7 @@ void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, 
    if (i < size){
        DTYPE s = x[i];
        if(s >= 0)
-            dedx[i] = 1;
+            dedx[i] = dedy[i];
        else
            dedx[i] = 0;
    }

--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
@@ -248,7 +248,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                       "Unknown loss function.");

        if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
-            ShowNTErrors("TODO!");
+            _Sum(y, gold, dedx, -1.0F);
        }
        else if(lossName == ONEHOTERROR){
            ShowNTErrors("TODO!");

--- a/source/tensor/test/TConcatenate.cpp
+++ b/source/tensor/test/TConcatenate.cpp
@@ -483,9 +483,9 @@ bool TestConcatenate4()
    delete sGPU1;
    delete sGPU2;
    delete tGPU;
-    delete[] sDimSize1;
-    delete[] sDimSize2;
-    delete[] tDimSize;
+    //delete[] sDimSize1;
+    //delete[] sDimSize2;
+    //delete[] tDimSize;

 	return cpuTest && gpuTest;
 #else

--- a/source/tensor/test/TIdentity.cpp
+++ b/source/tensor/test/TIdentity.cpp
@@ -30,15 +30,15 @@ Identity function: y = x
 */
 bool TestIdentity1()
 {
-    /* a input tensor of size (2, 3) */
-    int sOrder = 2;
-    int * sDimSize = new int[sOrder];
-    sDimSize[0] = 2;
-    sDimSize[1] = 3;
+    /* a tensor of size (2, 3) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 2;
+    dimSize[1] = 3;

-    int sUnitNum = 1;
-    for (int i = 0; i < sOrder; i++)
-        sUnitNum *= sDimSize[i];
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];

    DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F}, 
                          {0.5F, 0.7F, 1.4F} };
@@ -49,47 +49,50 @@ bool TestIdentity1()
    bool cpuTest = true;

    /* create tensors */
-    XTensor * x = NewTensor(sOrder, sDimSize);
-    XTensor * y = NewTensor(sOrder, sDimSize);
+    XTensor * x = NewTensor(order, dimSize);
+    XTensor * y = NewTensor(order, dimSize);

    /* initialize variables */
-    x->SetData(xData, sUnitNum);
+    x->SetData(xData, unitNum);
    y->SetZeroAll();

    /* call Identity function */
    Identity(x, y);
    
    /* check result */
-    cpuTest = y->CheckData(answer, sUnitNum);
+    cpuTest = y->CheckData(answer, unitNum);

 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
-    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
-    xGPU->SetData(xData, sUnitNum);
+    xGPU->SetData(xData, unitNum);
    yGPU->SetZeroAll();

    /* call Identity function */
    Identity(xGPU, yGPU);
    
    /* check result */
-    gpuTest = yGPU->CheckData(answer, sUnitNum);
+    gpuTest = yGPU->CheckData(answer, unitNum);

    /* destroy variables */
-    delete x, y;
-    delete xGPU, yGPU;
-    delete[] sDimSize;
+    delete x;
+    delete y;
+    delete xGPU;
+    delete yGPU;
+    delete[] dimSize;

    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete x, y;
-    delete[] sDimSize;
+    delete x;
+    delete y;
+    delete[] dimSize;

    return cpuTest;
 #endif // USE_CUDA
@@ -98,35 +101,39 @@ bool TestIdentity1()
 /* 
 case 2: test IdentityBackward function.
 IdentityBackward function: dE/dx = dE/dy * dy/dx = dE/dy
+In this case, lossName=CROSSENTROPY.
 */
 bool TestIdentity2()
 {
-    int sOrder = 2;
-    int * sDimSize = new int[sOrder];
-    sDimSize[0] = 1;
-    sDimSize[1] = 3;
-
-    int sUnitNum = 1;
-    for (int i = 0; i < sOrder; i++)
-        sUnitNum *= sDimSize[i];
-
-    DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
-    DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
-    DTYPE dedxAnswer[3] = {0.090031F, 0.244728F, -0.334759F};
+    /* a tensor of size (1, 3) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 1;
+    dimSize[1] = 3;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];
+
+    DTYPE xData[3] = {1.0F, 1.0F, 2.0F};
+    DTYPE gData[3] = {0.0F, 0.0F, 1.0F};
+    DTYPE yAnswer[3] = {1.0F, 1.0F, 2.0F};
+    DTYPE dedyAnswer[3] = {0.0F, 0.0F, -0.5F};
+    DTYPE dedxAnswer[3] = {0.0F, 0.0F, -0.5F};

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
-    XTensor * x = NewTensor(sOrder, sDimSize);
-    XTensor * y = NewTensor(sOrder, sDimSize);
-    XTensor * g = NewTensor(sOrder, sDimSize);
-    XTensor * dedy = NewTensor(sOrder, sDimSize);
-    XTensor * dedx = NewTensor(sOrder, sDimSize);
+    XTensor * x = NewTensor(order, dimSize);
+    XTensor * y = NewTensor(order, dimSize);
+    XTensor * g = NewTensor(order, dimSize);
+    XTensor * dedy = NewTensor(order, dimSize);
+    XTensor * dedx = NewTensor(order, dimSize);

    /* initialize variables */
-    x->SetData(xData, sUnitNum);
-    g->SetData(gData, sUnitNum);
+    x->SetData(xData, unitNum);
+    g->SetData(gData, unitNum);
    y->SetZeroAll();
    dedx->SetZeroAll();
    dedy->SetZeroAll();
@@ -138,22 +145,24 @@ bool TestIdentity2()
    IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
    
    /* check result */
-    cpuTest = dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
+    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
+              && dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
+              && dedy->CheckData(dedyAnswer, unitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

        /* create tensors */
-    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
-    xGPU->SetData(xData, sUnitNum);
-    gGPU->SetData(gData, sUnitNum);
+    xGPU->SetData(xData, unitNum);
+    gGPU->SetData(gData, unitNum);
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
    dedyGPU->SetZeroAll();
@@ -165,7 +174,9 @@ bool TestIdentity2()
    IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
    
    /* check result */
-    gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
+    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
+              && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
+              && dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);

    /* destroy variables */
    delete x;
@@ -178,7 +189,7 @@ bool TestIdentity2()
    delete gGPU;
    delete dedxGPU;
    delete dedyGPU;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest && gpuTest;
 #else
@@ -188,7 +199,7 @@ bool TestIdentity2()
    delete g;
    delete dedx;
    delete dedy;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest;
 #endif // USE_CUDA

--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
--- a/source/tensor/test/TLoss.cpp
+++ b/source/tensor/test/TLoss.cpp
@@ -20,15 +20,15 @@
 */

 #include "../core/math/ScaleAndShift.h"
-#include "../function/Loss.h"
+#include "TLoss.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* 
-case 1: test LossCompute function 
+case 1: test LossCompute function.
 In this case, Loss function name = SQUAREDERROR.
 loss = sum_{i} 0.5*(t_i - y_i)^2, 
-where t_i is the gold standard and y_i is the model output
+where t_i is the gold standard and y_i is the model output.
 */
 bool TestLoss1()
 {
@@ -102,10 +102,10 @@ bool TestLoss1()
 }

 /* 
-case 2: test LossCompute function 
+case 2: test LossCompute function.
 In this case, Loss function name = CROSSENTROPY.
 loss = sum_{i} (-t_i * log(y_i))
-where t_i is the gold standard and y_i is the model output
+where t_i is the gold standard and y_i is the model output.
 */
 bool TestLoss2()
 {
@@ -179,10 +179,10 @@ bool TestLoss2()
 }

 /* 
-case 3: test LossCompute function 
+case 3: test LossCompute function.
 In this case, Loss function name = ONEHOTERROR.
 loss = sum_{i} e_i
-where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise
+where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise.
 */
 bool TestLoss3()
 {

--- a/source/tensor/test/TMatrixMulBatched.cpp
+++ b/source/tensor/test/TMatrixMulBatched.cpp
@@ -19,6 +19,7 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
 */

+#include "../XTensor.h"
 #include "TMatrixMulBatched.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -105,7 +106,7 @@ bool TestMatrixMulBatched1()
    
    /* check results */
    gpuTest = tGPU->CheckData(answer, tUnitNum);
-
+    
    /* destroy variables */
    delete s1;
    delete s2;

--- a/source/tensor/test/TRectify.cpp
+++ b/source/tensor/test/TRectify.cpp
@@ -29,25 +29,15 @@ In this case, y = max(0, x)
 */
 bool TestRectify1()
 {
-    /* a x tensor of size (2, 3) */
-    int xOrder = 2;
-    int * xDimSize = new int[xOrder];
-    xDimSize[0] = 2;
-    xDimSize[1] = 3;
-
-    int xUnitNum = 1;
-    for (int i = 0; i < xOrder; i++)
-        xUnitNum *= xDimSize[i];
-
-    /* a y tensor of size (2, 3) */
-    int yOrder = 2;
-    int * yDimSize = new int[yOrder];
-    yDimSize[0] = 2;
-    yDimSize[1] = 3;
-
-    int yUnitNum = 1;
-    for (int i = 0; i < yOrder; i++)
-        yUnitNum *= yDimSize[i];
+    /* a tensor of size (2, 3) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 2;
+    dimSize[1] = 3;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];

    DTYPE xData[2][3] = { {0.0F, -1.0F, 2.0F},
                          {3.0F, -4.0F, -5.0F} };
@@ -58,52 +48,50 @@ bool TestRectify1()
    bool cpuTest = true;

    /* create tensors */
-    XTensor * x = NewTensor(xOrder, xDimSize);
-    XTensor * y = NewTensor(yOrder, yDimSize);
+    XTensor * x = NewTensor(order, dimSize);
+    XTensor * y = NewTensor(order, dimSize);

    /* initialize variables */
-    x->SetData(xData, xUnitNum);
+    x->SetData(xData, unitNum);
    y->SetZeroAll();

    /* call Rectify function */
    Rectify(x, y);

    /* check results */
-    cpuTest = y->CheckData(answer, yUnitNum);
+    cpuTest = y->CheckData(answer, unitNum);

 #ifdef USE_CUDA
 	/* GPU test */
 	bool gpuTest = true;

 	/* create tensor */
-	XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * yGPU = NewTensor(yOrder, yDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

 	/* Initialize variables */
-	xGPU->SetData(xData, xUnitNum);
+	xGPU->SetData(xData, unitNum);
 	yGPU->SetZeroAll();

 	/* call Rectify function */
 	Rectify(xGPU, yGPU);

 	/* check results */
-	gpuTest = yGPU->CheckData(answer, yUnitNum);
+	gpuTest = yGPU->CheckData(answer, unitNum);

 	/* destroy variables */
 	delete x;
    delete y;
    delete xGPU;
    delete yGPU;
-	delete[] xDimSize;
-    delete[] yDimSize;
+	delete[] dimSize;

 	return cpuTest && gpuTest;
 #else
 	/* destroy variables */
 	delete x;
    delete y;
-	delete[] xDimSize;
-    delete[] yDimSize;
+	delete[] dimSize;

 	return cpuTest;
 #endif // USE_CUDA
@@ -117,73 +105,83 @@ In this case, lossName=CROSSENTROPY.
 */
 bool TestRectify2()
 {
-	/* a x tensor of size (2, 3) */
-	int xOrder = 2;
-	int * xDimSize = new int[xOrder];
-	xDimSize[0] = 2;
-	xDimSize[1] = 3;
+	/* a tensor of size (2, 3) */
+	int order = 2;
+	int * dimSize = new int[order];
+	dimSize[0] = 2;
+	dimSize[1] = 3;

-	int xUnitNum = 1;
-	for (int i = 0; i < xOrder; i++)
-		xUnitNum *= xDimSize[i];
+	int unitNum = 1;
+	for (int i = 0; i < order; i++)
+		unitNum *= dimSize[i];

 	DTYPE xData[2][3] = { {1.0F, 1.0F, 2.0F},
 	                      {2.0F, 4.0F, 5.0F} };
-	DTYPE yData[2][3] = { {1.0F, 1.0F, 2.0F},
-	                      {2.0F, 4.0F, 5.0F} };
 	DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
 	                         {1.0F, 1.0F, 1.0F} };
-	DTYPE dedyData[2][3] = { {-1.0F, -1.0F, -0.5F},
-	                         {-0.5F, -0.25F, -0.2F} };
-	DTYPE answer[2][3] = { {-1.0F, -1.0F, -0.5F},
-	                       {-0.5F, -0.25F, -0.2F} };
+    DTYPE yAnswer[2][3] = { {1.0F, 1.0F, 2.0F},
+	                        {2.0F, 4.0F, 5.0F} };
+	DTYPE dedyAnswer[2][3] = { {-1.0F, -1.0F, -0.5F},
+	                           {-0.5F, -0.25F, -0.2F} };
+	DTYPE dedxAnswer[2][3] = { {-1.0F, -1.0F, -0.5F},
+	                           {-0.5F, -0.25F, -0.2F} };

 	/* CPU test */
 	bool cpuTest = true;

 	/* create tensors */
-	XTensor * x = NewTensor(xOrder, xDimSize);
-	XTensor * y = NewTensor(xOrder, xDimSize);
-	XTensor * gold = NewTensor(xOrder, xDimSize);
-	XTensor * dedy = NewTensor(xOrder, xDimSize);
-	XTensor * dedx = NewTensor(xOrder, xDimSize);
+	XTensor * x = NewTensor(order, dimSize);
+	XTensor * y = NewTensor(order, dimSize);
+	XTensor * gold = NewTensor(order, dimSize);
+	XTensor * dedy = NewTensor(order, dimSize);
+	XTensor * dedx = NewTensor(order, dimSize);

 	/* initialize variables */
-	x->SetData(xData, xUnitNum);
-	y->SetData(yData, xUnitNum);
-	gold->SetData(goldData, xUnitNum);
-	dedy->SetData(dedyData, xUnitNum);
+	x->SetData(xData, unitNum);
+	gold->SetData(goldData, unitNum);
+	y->SetZeroAll();
+	dedy->SetZeroAll();
 	dedx->SetZeroAll();

+    /* call Rectify function */
+    Rectify(x, y);
+
 	/* call RectifyBackward function */
-	RectifyBackward(gold, y, x, dedy, dedx, NOLOSS);
+	RectifyBackward(gold, y, x, dedy, dedx, CROSSENTROPY);

 	/* check results */
-	cpuTest = dedx->CheckData(answer, xUnitNum);
+    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
+              && dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
+              && dedy->CheckData(dedyAnswer, unitNum, 1e-4F);

 #ifdef USE_CUDA
 	/* GPU test */
 	bool gpuTest = true;

 	/* create tensors */
-	XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * yGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * goldGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * dedyGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * dedxGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

 	/* initialize variables */
-	xGPU->SetData(xData, xUnitNum);
-	yGPU->SetData(yData, xUnitNum);
-	goldGPU->SetData(goldData, xUnitNum);
-	dedyGPU->SetData(dedyData, xUnitNum);
+	xGPU->SetData(xData, unitNum);
+	goldGPU->SetData(goldData, unitNum);
+	yGPU->SetZeroAll();
+	dedyGPU->SetZeroAll();
 	dedxGPU->SetZeroAll();
+    
+    /* call Rectify function */
+    Rectify(xGPU, yGPU);

 	/* call rectifybackward function */
-	RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, NOLOSS);
+	RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
    
 	/* check results */
-	gpuTest = dedxGPU->CheckData(answer, xUnitNum);
+    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
+              && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
+              && dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);

 	/* destroy variables */
    delete x;
@@ -196,7 +194,7 @@ bool TestRectify2()
    delete dedyGPU;
    delete dedxGPU;
    delete goldGPU;
-	delete[] xDimSize;
+	delete[] dimSize;

 	return cpuTest && gpuTest;
 #else
@@ -206,7 +204,7 @@ bool TestRectify2()
    delete dedy;
    delete dedx;
    delete gold;
-	delete[] xDimSize;
+	delete[] dimSize;

 	return cpuTest;
 #endif // USE_CUDA
@@ -220,7 +218,7 @@ TODO!!
 /* test for Rectify Function */
 bool TestRectify()
 {
-    XPRINT(0, stdout, "[TEST RECTIFY] test rectify and its backward computation \n");
+    XPRINT(0, stdout, "[TEST RECTIFY] rectify function and its backward computation \n");
    bool returnFlag = true, caseFlag = true;

    /* case 1 test */

--- a/source/tensor/test/TSetAscendingOrder.cpp
+++ b/source/tensor/test/TSetAscendingOrder.cpp
@@ -23,8 +23,7 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* case 1: set the cell to the ascending order along a given dimension.
-*/
+/* case 1: set the cell to the ascending order along a given dimension. */
 bool TestSetAscendingOrder1()
 {
    /* a input tensor of size (2, 4) */
@@ -50,7 +49,6 @@ bool TestSetAscendingOrder1()
    s->SetZeroAll();

    /* call SetAscendingOrder function */
-
    s->SetAscendingOrder(1);
    
    /* check results */

--- a/source/tensor/test/TSetData.cpp
+++ b/source/tensor/test/TSetData.cpp
@@ -23,7 +23,10 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* case 1: set the cell to the ascending order along a given dimension. */
+/* 
+case 1: test SetDataRand function.
+set the tensor items by a uniform distribution in range [lower, upper]. 
+*/
 bool TestSetData1()
 {
    /* a input tensor of size (2, 4) */
@@ -44,7 +47,7 @@ bool TestSetData1()
    /* create tensors */
    XTensor * s = NewTensor(sOrder, sDimSize);

-    /* call SetData function */
+    /* call SetDataRand function */
    s->SetDataRand(0.0, 1.0);
    
    /* check results */

--- a/source/tensor/test/TSigmoid.cpp
+++ b/source/tensor/test/TSigmoid.cpp
@@ -25,102 +25,71 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* 
-case 1: test Sigmoid function and SigmoidBackward function.
+case 1: test Sigmoid function.
 sigmoid function: y = 1/(1+exp(-x))
-backward computation: dE/ds = dE/dy * dy/dx
 */
 bool TestSigmoid1()
 {
    /* a input tensor of size (3) */
-    int sOrder = 1;
-    int * sDimSize = new int[sOrder];
-    sDimSize[0] = 3;
+    int order = 1;
+    int * dimSize = new int[order];
+    dimSize[0] = 3;

-    int sUnitNum = 1;
-    for (int i = 0; i < sOrder; i++)
-        sUnitNum *= sDimSize[i];
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];

    DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
-    DTYPE gData[3] = {0.4F, 0.8F, 1.0F};
-    DTYPE dedyData[3] = {-0.8F, -1.094F, -1.135F};
-    DTYPE yAnswer[3] = {0.5F, 0.731F, 0.881F};
-    DTYPE dedxAnswer[3] = {-0.2F, -0.215F, -0.119F};
+    DTYPE answer[3] = {0.5F, 0.7311F, 0.8808F};

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
-    XTensor * x = NewTensor(sOrder, sDimSize);
-    XTensor * y = NewTensor(sOrder, sDimSize);
-    XTensor * g = NewTensor(sOrder, sDimSize);
-    XTensor * dedy = NewTensor(sOrder, sDimSize);
-    XTensor * dedx = NewTensor(sOrder, sDimSize);
+    XTensor * x = NewTensor(order, dimSize);
+    XTensor * y = NewTensor(order, dimSize);

    /* initialize variables */
-    x->SetData(xData, sUnitNum);
-    g->SetData(gData, sUnitNum);
-    dedy->SetData(dedyData, sUnitNum);
+    x->SetData(xData, unitNum);
    y->SetZeroAll();
-    dedx->SetZeroAll();

    /* call Sigmoid function */
    Sigmoid(x, y);

-    /* call SigmoidBackward function */
-    SigmoidBackward(g, y, x, dedy, dedx, NOLOSS);
-    
    /* check result */
-    cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
+    cpuTest = y->CheckData(answer, unitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

        /* create tensors */
-    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
-    xGPU->SetData(xData, sUnitNum);
-    gGPU->SetData(gData, sUnitNum);
-    dedyGPU->SetData(dedyData, sUnitNum);
+    xGPU->SetData(xData, unitNum);
    yGPU->SetZeroAll();
-    dedxGPU->SetZeroAll();

    /* call Sigmoid function */
    Sigmoid(xGPU, yGPU);

-    /* call SigmoidBackward function */
-    SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NOLOSS);
-    
    /* check result */
-    gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
+    gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);

    /* destroy variables */
    delete x;
    delete y;
-    delete g;
-    delete dedx;
-    delete dedy;
    delete xGPU;
    delete yGPU;
-    delete gGPU;
-    delete dedxGPU;
-    delete dedyGPU;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete x;
    delete y;
-    delete g;
-    delete dedx;
-    delete dedy;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest;
 #endif // USE_CUDA
@@ -129,70 +98,72 @@ bool TestSigmoid1()
 /* 
 case 2: test Sigmoid function and SigmoidBackward function.
 sigmoid function: y = 1/(1+exp(-x))
-backward computation: dE/ds = dE/dy * dy/dx
+backward computation: 
+dE/dx = dE/dy * dy/dx
+dy/dx = y * (1 - y)
+In this case, LossName=CROSSENTROPY.
 */
 bool TestSigmoid2()
 {
    /* a input tensor of size (3) */
-    int sOrder = 1;
-    int * sDimSize = new int[sOrder];
-    sDimSize[0] = 3;
+    int order = 1;
+    int * dimSize = new int[order];
+    dimSize[0] = 3;

-    int sUnitNum = 1;
-    for (int i = 0; i < sOrder; i++)
-        sUnitNum *= sDimSize[i];
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];

    DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
    DTYPE gData[3] = {0.4F, 0.8F, 1.0F};
-    DTYPE dedyData[3] = {-0.8F, -1.094F, -1.135F};
-    DTYPE yAnswer[3] = {0.5F, 0.731F, 0.881F};
-    DTYPE dedxAnswer[3] = {-0.2F, -0.215F, -0.119F};
+    DTYPE yAnswer[3] = {0.5F, 0.7311F, 0.8808F};
+    DTYPE dedyAnswer[3] = {-0.8F, -1.0943F, -1.1353F};
+    DTYPE dedxAnswer[3] = {-0.2F, -0.2151F, -0.1192F};

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
-    XTensor * x = NewTensor(sOrder, sDimSize);
-    XTensor * y = NewTensor(sOrder, sDimSize);
-    XTensor * g = NewTensor(sOrder, sDimSize);
-    XTensor * dedy = NewTensor(sOrder, sDimSize);
-    XTensor * dedx = NewTensor(sOrder, sDimSize);
+    XTensor * x = NewTensor(order, dimSize);
+    XTensor * y = NewTensor(order, dimSize);
+    XTensor * g = NewTensor(order, dimSize);
+    XTensor * dedy = NewTensor(order, dimSize);
+    XTensor * dedx = NewTensor(order, dimSize);

    /* initialize variables */
-    x->SetData(xData, sUnitNum);
-    g->SetData(gData, sUnitNum);
-    dedy->SetZeroAll();
+    x->SetData(xData, unitNum);
+    g->SetData(gData, unitNum);
    y->SetZeroAll();
+    dedy->SetZeroAll();
    dedx->SetZeroAll();

    /* call Sigmoid function */
    Sigmoid(x, y);

-    /* initialize variables */
-    dedy->SetData(dedyData, sUnitNum);
-
    /* call SigmoidBackward function */
    SigmoidBackward(g, y, x, dedy, dedx, CROSSENTROPY);
    
    /* check result */
-    cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
+    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
+              && dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
+              && dedy->CheckData(dedyAnswer, unitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

        /* create tensors */
-    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
-    xGPU->SetData(xData, sUnitNum);
-    gGPU->SetData(gData, sUnitNum);
-    dedyGPU->SetZeroAll();
+    xGPU->SetData(xData, unitNum);
+    gGPU->SetData(gData, unitNum);
    yGPU->SetZeroAll();
+    dedyGPU->SetZeroAll();
    dedxGPU->SetZeroAll();

    /* call Sigmoid function */
@@ -202,8 +173,9 @@ bool TestSigmoid2()
    SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
    
    /* check result */
-    gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
-
+    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
+              && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
+              && dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
    /* destroy variables */
    delete x;
    delete y;
@@ -215,7 +187,7 @@ bool TestSigmoid2()
    delete gGPU;
    delete dedxGPU;
    delete dedyGPU;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest && gpuTest;
 #else
@@ -225,7 +197,7 @@ bool TestSigmoid2()
    delete g;
    delete dedx;
    delete dedy;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest;
 #endif // USE_CUDA
@@ -251,6 +223,16 @@ bool TestSigmoid()
    }
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");
+    
+    /* case 2 test */
+    caseFlag = TestSigmoid2();
+
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");

    /* other cases test */
    /*

--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
@@ -31,68 +31,69 @@ softmax function: y = e^x / \sum_{i} e^{x_i}
 */
 bool TestSoftmax1()
 {
-    /* a input tensor of size (2, 3) */
-    int sOrder = 2;
-    int * sDimSize = new int[sOrder];
-    sDimSize[0] = 2;
-    sDimSize[1] = 3;
+    /* a tensor of size (2, 3) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 2;
+    dimSize[1] = 3;

-    int sUnitNum = 1;
-    for (int i = 0; i < sOrder; i++)
-        sUnitNum *= sDimSize[i];
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];

    DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F}, 
                          {0.5F, 0.7F, 1.4F} };
-    DTYPE answer[2][3] = { {0.09003057F, 0.24472848F, 0.66524094F}, 
-                           {0.21362929F, 0.2609274F , 0.52544326F} };
+    DTYPE answer[2][3] = { {0.0900F, 0.2447F, 0.6652F}, 
+                           {0.2136F, 0.2609F, 0.5254F} };

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
-    XTensor * x = NewTensor(sOrder, sDimSize);
-    XTensor * y = NewTensor(sOrder, sDimSize);
+    XTensor * x = NewTensor(order, dimSize);
+    XTensor * y = NewTensor(order, dimSize);

    /* initialize variables */
-    x->SetData(xData, sUnitNum);
+    x->SetData(xData, unitNum);
    y->SetZeroAll();

    /* call Softmax function */
    Softmax(x, y, 1);
    
    /* check result */
-    cpuTest = y->CheckData(answer, sUnitNum);
+    cpuTest = y->CheckData(answer, unitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
-    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
-    xGPU->SetData(xData, sUnitNum);
+    xGPU->SetData(xData, unitNum);
    yGPU->SetZeroAll();

    /* call Softmax function */
    Softmax(xGPU, yGPU, 1);
    
    /* check result */
-    gpuTest = yGPU->CheckData(answer, sUnitNum);
+    gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);

    /* destroy variables */
    delete x;
    delete y;
    delete xGPU;
    delete yGPU;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete x, y;
-    delete[] sDimSize;
+    delete x;
+    delete y;
+    delete[] dimSize;

    return cpuTest;
 #endif // USE_CUDA
@@ -101,62 +102,66 @@ bool TestSoftmax1()
 /* 
 case 2: test SoftmaxBackward function.
 SoftmaxBackward function: dE/dx_j = -gold_j + y_j
+In this case, LossName=CROSSENTROPY.
 */
 bool TestSoftmax2()
 {
    /* a input tensor of size (2, 3) */
-    int sOrder = 2;
-    int * sDimSize = new int[sOrder];
-    sDimSize[0] = 1;
-    sDimSize[1] = 3;
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 1;
+    dimSize[1] = 3;

-    int sUnitNum = 1;
-    for (int i = 0; i < sOrder; i++)
-        sUnitNum *= sDimSize[i];
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];

    DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
    DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
-    DTYPE dedxAnswer[3] = {0.090031F, 0.244728F, -0.334759F};
+    DTYPE yAnswer[1][3] = { {0.0900F, 0.2447F, 0.6652F} };
+    DTYPE dedxAnswer[1][3] = {0.0900F, 0.2447F, -0.3347F};

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
-    XTensor * x = NewTensor(sOrder, sDimSize);
-    XTensor * y = NewTensor(sOrder, sDimSize);
-    XTensor * g = NewTensor(sOrder, sDimSize);
-    XTensor * dedy = NewTensor(sOrder, sDimSize);
-    XTensor * dedx = NewTensor(sOrder, sDimSize);
+    XTensor * x = NewTensor(order, dimSize);
+    XTensor * y = NewTensor(order, dimSize);
+    XTensor * g = NewTensor(order, dimSize);
+    XTensor * dedy = NewTensor(order, dimSize);
+    XTensor * dedx = NewTensor(order, dimSize);

    /* initialize variables */
-    x->SetData(xData, sUnitNum);
-    g->SetData(gData, sUnitNum);
+    x->SetData(xData, unitNum);
+    g->SetData(gData, unitNum);
    y->SetZeroAll();
    dedx->SetZeroAll();
    dedy->SetZeroAll();

    /* call Softmax function */
    Softmax(x, y, 1);
-
+    
+    /* call SoftmaxBackward function */
    SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
    
    /* check result */
-    cpuTest = dedx->CheckData(dedxAnswer, sUnitNum);
+    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
+              && dedx->CheckData(dedxAnswer, unitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

        /* create tensors */
-    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
-    xGPU->SetData(xData, sUnitNum);
-    gGPU->SetData(gData, sUnitNum);
+    xGPU->SetData(xData, unitNum);
+    gGPU->SetData(gData, unitNum);
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
    dedyGPU->SetZeroAll();
@@ -168,7 +173,8 @@ bool TestSoftmax2()
    SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
    
    /* check result */
-    gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum);
+    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
+              && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);

    /* destroy variables */
    delete x;
@@ -181,7 +187,7 @@ bool TestSoftmax2()
    delete gGPU;
    delete dedxGPU;
    delete dedyGPU;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest && gpuTest;
 #else
@@ -191,7 +197,7 @@ bool TestSoftmax2()
    delete g;
    delete dedx;
    delete dedy;
-    delete[] sDimSize;
+    delete[] dimSize;

    return cpuTest;
 #endif // USE_CUDA

--- a/source/tensor/test/TSplit.cpp
+++ b/source/tensor/test/TSplit.cpp
@@ -181,14 +181,20 @@ bool TestSplit2()
    gpuTest = tGPU->CheckData(answer, tUnitNum);

    /* destroy variables */
-	delete s, t, sGPU, tGPU;
-	delete[] sDimSize, tDimSize;
+	delete s;
+    delete t;
+    delete sGPU;
+    delete tGPU;
+	delete[] sDimSize;
+	delete[] tDimSize;

 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-	delete s, t;
-	delete[] sDimSize, tDimSize;
+	delete s;
+    delete t;
+	delete[] sDimSize;
+	delete[] tDimSize;

 	return cpuTest;
 #endif // USE_CUDA
@@ -295,14 +301,25 @@ bool TestSplit3()
 	gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);

    /* destroy variables */
-	delete s, t1, t2, sGPU, tGPU1, tGPU2;
-	delete[] sDimSize, tDimSize1, tDimSize2;
+	delete s;
+    delete t1;
+    delete t2;
+    delete sGPU;
+    delete tGPU1;
+    delete tGPU2;
+	delete[] sDimSize;
+	delete[] tDimSize1;
+	delete[] tDimSize2;

 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s, t1, t2;
-	delete[] sDimSize, tDimSize1, tDimSize2;
+	delete s;
+    delete t1;
+    delete t2;
+	delete[] sDimSize;
+	delete[] tDimSize1;
+	delete[] tDimSize2;

 	return cpuTest;
 #endif // USE_CUDA

--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
@@ -31,12 +31,12 @@ bool Test()

    wrong = !TestConcatenate() || wrong;
    wrong = !TestConcatenateSolely() || wrong;
-    //wrong = !TestCopyIndexed() || wrong;
+    wrong = !TestCopyIndexed() || wrong;
    wrong = !TestCopyValues() || wrong;
    wrong = !TestMatrixMul() || wrong;
    wrong = !TestMatrixMul2D() || wrong;
    wrong = !TestMatrixMul2DParallel() || wrong;
-    //wrong = !TestMatrixMulBatched() || wrong;
+    wrong = !TestMatrixMulBatched() || wrong;
    wrong = !TestMatrixMulBatchedCPU() || wrong;
    wrong = !TestMerge() || wrong;
    wrong = !TestMultiply() || wrong;
@@ -56,18 +56,18 @@ bool Test()
    wrong = !TestSplit() || wrong;
    wrong = !TestSum() || wrong;
    wrong = !TestSumByColumnTV() || wrong;
-    //wrong = !TestSumByColumnVT() || wrong;
+    wrong = !TestSumByColumnVT() || wrong;
    wrong = !TestTopK() || wrong;
    wrong = !TestUnsqueeze() || wrong;
    wrong = !TestXMem() || wrong;

-    //wrong = !TestHardTanH() || wrong;
-    //wrong = !TestIdentity() || wrong;
-    //wrong = !TestLogSoftmax() || wrong;
-    //wrong = !TestLoss() || wrong;
-    //wrong = !TestRectify() || wrong;
-    //wrong = !TestSigmoid() || wrong;
-    //wrong = !TestSoftmax() || wrong;
+    wrong = !TestHardTanH() || wrong;
+    wrong = !TestIdentity() || wrong;
+    wrong = !TestLogSoftmax() || wrong;
+    wrong = !TestLoss() || wrong;
+    wrong = !TestRectify() || wrong;
+    wrong = !TestSigmoid() || wrong;
+    wrong = !TestSoftmax() || wrong;

    /* other test */
    /*