Commit 42f995ae by liyinqiao

Bug Fixed in test.

parent 100f4611
@@ -377,7 +377,7 @@ void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
CheckNTErrors((tLen < y->unitNum), "Illegal input length!");
CheckNTErrors((XTensor::IsIdentical(t, y) && XTensor::IsIdentical(dedy, y)),
"The input tensors must be of the same size!");
CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((t->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
"TODO!");
...
@@ -19,23 +19,18 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
*/
#include "TConcatenate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: concatenate a list of tensors along a given dimension.
* In this case, 2 * (2, 1) -> (2, 2), dim=1.
*/
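/* Illustration with hypothetical values (the actual sData arrays are elided
* in this excerpt): concatenating s1 = [[1], [2]] and s2 = [[3], [4]] along
* dim = 1 pairs the rows up, giving t = [[1, 3], [2, 4]] of size (2, 2);
* along dim = 0 the same inputs would instead stack into a (4, 1) tensor.
*/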
bool TestConcatenate1()
{
/* create list */
XList * sList = new XList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
@@ -45,7 +40,7 @@ bool TestConcatenate1()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 1) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
@@ -55,7 +50,7 @@ bool TestConcatenate1()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
@@ -86,11 +81,11 @@ bool TestConcatenate1()
t->SetZeroAll();
/* add tensors to list */
sList->Add(s1);
sList->Add(s2);
/* call Concatenate function */
Concatenate(sList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
@@ -99,9 +94,6 @@ bool TestConcatenate1()
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -112,40 +104,55 @@ bool TestConcatenate1()
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* clear list */
sList->Clear();
/* add tensors to list */
sList->Add(sGPU1);
sList->Add(sGPU2);
/* call Concatenate function */
Concatenate(sList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: concatenate a list of tensors along a given dimension.
* In this case, 2 * (2, 1) -> (4, 1), dim=0.
*/
bool TestConcatenate2()
{
/* create list */
XList * sList = new XList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
@@ -155,7 +162,7 @@ bool TestConcatenate2()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 1) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
@@ -165,7 +172,7 @@ bool TestConcatenate2()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (4, 1) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 4;
@@ -198,11 +205,11 @@ bool TestConcatenate2()
t->SetZeroAll();
/* add tensors to list */
sList->Add(s1);
sList->Add(s2);
/* call Concatenate function */
Concatenate(sList, t, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
@@ -211,9 +218,6 @@ bool TestConcatenate2()
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -223,40 +227,56 @@ bool TestConcatenate2()
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* clear list */
sList->Clear();
/* add tensors to list */
sList->Add(sGPU1);
sList->Add(sGPU2);
/* call Concatenate function */
Concatenate(sList, tGPU, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 3: concatenate a list of tensors along a given dimension.
* In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
*/
bool TestConcatenate3()
{
/* create list */
XList * sList = new XList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
@@ -266,7 +286,7 @@ bool TestConcatenate3()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
@@ -276,7 +296,7 @@ bool TestConcatenate3()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
@@ -307,11 +327,11 @@ bool TestConcatenate3()
t->SetZeroAll();
/* add tensors to list */
sList->Add(s1);
sList->Add(s2);
/* call Concatenate function */
Concatenate(sList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
@@ -320,9 +340,6 @@ bool TestConcatenate3()
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -332,37 +349,53 @@ bool TestConcatenate3()
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* clear list */
sList->Clear();
/* add tensors to list */
sList->Add(sGPU1);
sList->Add(sGPU2);
/* call Concatenate function */
Concatenate(sList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: concatenate two tensors along a given dimension.
* In this case, (2, 1), (2, 2) -> (2, 3), dim=1.
*/
bool TestConcatenate4()
{
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
@@ -372,7 +405,7 @@ bool TestConcatenate4()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
@@ -382,7 +415,7 @@ bool TestConcatenate4()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
@@ -412,7 +445,7 @@ bool TestConcatenate4()
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call Concatenate function */
Concatenate(s1, s2, t, 1);
/* check results */
@@ -432,21 +465,32 @@ bool TestConcatenate4()
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call Concatenate function */
Concatenate(sGPU1, sGPU2, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
@@ -461,7 +505,7 @@ TODO!!
extern "C"
bool TestConcatenate()
{
XPRINT(0, stdout, "[TEST CONCATENATE] concatenate a list of tensors or two tensors along a given dimension \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
...
@@ -19,23 +19,19 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
*/
#include "TConcatenateSolely.h"
#include "../XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: concatenate a list of tensors along a given dimension
* In this case, 2 * (2, 1) -> (2, 2), dim=1.
*/
bool TestConcatenateSolely1()
{
/* create list */
XList * sList = new XList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
@@ -45,7 +41,7 @@ bool TestConcatenateSolely1()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 1) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
@@ -55,7 +51,7 @@ bool TestConcatenateSolely1()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
@@ -86,11 +82,11 @@ bool TestConcatenateSolely1()
t->SetZeroAll();
/* add tensors to list */
sList->Add(s1);
sList->Add(s2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
@@ -99,9 +95,6 @@ bool TestConcatenateSolely1()
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -111,40 +104,56 @@ bool TestConcatenateSolely1()
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* clear list */
sList->Clear();
/* add tensors to list */
sList->Add(sGPU1);
sList->Add(sGPU2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: concatenate a list of tensors along a given dimension
* In this case, 2 * (2, 1) -> (4, 1), dim=0.
*/
bool TestConcatenateSolely2()
{
/* create list */
XList * sList = new XList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
@@ -154,7 +163,7 @@ bool TestConcatenateSolely2()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 1) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
@@ -164,7 +173,7 @@ bool TestConcatenateSolely2()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (4, 1) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 4;
@@ -197,11 +206,11 @@ bool TestConcatenateSolely2()
t->SetZeroAll();
/* add tensors to list */
sList->Add(s1);
sList->Add(s2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, t, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
@@ -210,9 +219,6 @@ bool TestConcatenateSolely2()
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -222,40 +228,56 @@ bool TestConcatenateSolely2()
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* clear list */
sList->Clear();
/* add tensors to list */
sList->Add(sGPU1);
sList->Add(sGPU2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, tGPU, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 3: concatenate a list of tensors along a given dimension
* In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
*/
bool TestConcatenateSolely3()
{
/* create list */
XList * sList = new XList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
@@ -265,7 +287,7 @@ bool TestConcatenateSolely3()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
@@ -275,7 +297,7 @@ bool TestConcatenateSolely3()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
@@ -306,11 +328,11 @@ bool TestConcatenateSolely3()
t->SetZeroAll();
/* add tensors to list */
sList->Add(s1);
sList->Add(s2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
@@ -319,9 +341,6 @@ bool TestConcatenateSolely3()
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -331,26 +350,42 @@ bool TestConcatenateSolely3()
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* clear list */
sList->Clear();
/* add tensors to list */
sList->Add(sGPU1);
sList->Add(sGPU2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete sList;
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
@@ -365,7 +400,7 @@ TODO!!
extern "C"
bool TestConcatenateSolely()
{
XPRINT(0, stdout, "[TEST CONCATENATESOLELY] concatenate a list of tensors along a given dimension \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/
#include "TCopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: copy indexed sub-tensors
* In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2,
* srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1.
*/
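/* In other words, for each (i, j) the copy maps slice srcIndex[k] of the
* source along dim 2 to slice tgtIndex[k] of the target:
* t[i][j][0] = s[i][j][0] and t[i][j][1] = s[i][j][2],
* which is how the answer array below follows from sData.
*/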
bool TestCopyIndexed1()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2, 2) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0, -1.0, 2.0},
{2.0, 1.0, 3.0} },
{ {1.0, 2.0, 4.0},
{3.0, 1.0, 2.0}},
{ {-1.0, 3.0, 2.0},
{1.0, -1.0, 0.0} } };
DTYPE answer[3][2][2] = { { {0.0, 2.0},
{2.0, 3.0} },
{ {1.0, 4.0},
{3.0, 2.0}},
{ {-1.0, 2.0},
{1.0, 0.0} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {0, 1};
int copyNum = 1;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call CopyIndexed function */
CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call CopyIndexed function */
CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for CopyIndexed Function */
extern "C"
bool TestCopyIndexed()
{
XPRINT(0, stdout, "[TEST CopyIndexed] copy indexed sub-tensors \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestCopyIndexed1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/
#ifndef __TEST_COPYINDEXED_H__
#define __TEST_COPYINDEXED_H__
#include "../core/CopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for CopyIndexed Function */
extern "C"
bool TestCopyIndexed();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_COPYINDEXED_H__
@@ -19,26 +19,25 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/
#include "../XUtility.h"
#include "TCopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: copy tensor s to tensor t */
bool TestCopyValues1()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE scaleFactor = 2.0;
DTYPE shiftFactor = 0.5;
@@ -47,51 +46,54 @@ bool TestCopyValues1()
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(sOrder, sDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call CopyValues function */
CopyValues(s, t);
/* check results */
cpuTest = t->CheckData(s->data, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetData(sData, sUnitNum);
/* call CopyValues function */
CopyValues(sGPU, tGPU);
/* check results */
DTYPE * dataGPU = (DTYPE*)sGPU->data;
int size = sUnitNum * sGPU->unitSize;
char * dataCPU = new char[size];
XMemCopy(dataCPU, -1, dataGPU, sGPU->devID, size);
gpuTest = tGPU->CheckData(dataCPU, sUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
@@ -106,7 +108,7 @@ TODO!!
extern "C"
bool TestCopyValues()
{
XPRINT(0, stdout, "[TEST CopyValues] copy tensor s to tensor t \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
...
@@ -19,10 +19,7 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20
*/
#include "THardTanH.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: hard tanh function */
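/* Reminder of the standard piecewise definition of hard tanh assumed here:
* y = 1 if x > 1
* y = x if -1 <= x <= 1
* y = -1 if x < -1
*/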
@@ -68,7 +65,7 @@ bool TestHardTanH1()
HardTanH(x, y);
/* check results */
cpuTest = y->CheckData(answer, yUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
@@ -86,7 +83,7 @@ bool TestHardTanH1()
HardTanH(xGPU, yGPU);
/* check results */
gpuTest = yGPU->CheckData(answer, yUnitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete xGPU;
delete yGPU;
...
@@ -19,7 +19,6 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-29
*/
#include "../XUtility.h"
#include "TIdentity.h"
@@ -110,7 +109,7 @@ bool TestIdentity2()
DTYPE xData[1][3] = { {0.0, 1.0, 2.0} };
DTYPE gData[1][3] = { {0.0, 0.0, 1.0} };
DTYPE dedxAnswer[3] = {0.090031, 0.244728, -0.334759};
/* CPU test */
bool cpuTest = true;
@@ -132,31 +131,11 @@ bool TestIdentity2()
/* call Identity function */
Identity(x, y);
/* call IdentityBackward function */
IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
/* check result */
cpuTest = dedx->CheckData(dedxAnswer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
@@ -179,44 +158,33 @@ bool TestIdentity2()
/* call Identity function */
Identity(xGPU, yGPU);
/* call IdentityBackward function */
IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
/* check result */
gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum);
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] sDimSize;
return cpuTest;
@@ -232,7 +200,7 @@ bool TestIdentity2()
extern "C"
bool TestIdentity()
{
XPRINT(0, stdout, "[TEST Identity] identity function and its backward computation \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
@@ -245,15 +213,15 @@ bool TestIdentity()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
///* case 2 test */
//caseFlag = TestIdentity2();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 2 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
*/
#include "../XUtility.h"
#include "TLogSoftmax.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test LogSoftmax function.
* LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
*/
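/* Worked example for the first row of xData below: with x = [0, 1, 2],
* \sum_{i} e^{x_i} = 1 + 2.71828 + 7.38906 = 11.10734, and
* log(11.10734) = 2.4076, so y = x - 2.4076 = [-2.4076, -1.4076, -0.4076],
* which is the first row of the answer array.
*/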
bool TestLogSoftmax1()
{
/* an input tensor of size (2, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE xData[2][3] = { {0.0, 1.0, 2.0},
{0.5, 0.7, 1.4} };
DTYPE answer[2][3] = { {-2.4076, -1.4076, -0.4076},
{-1.5435, -1.3435, -0.6435} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(sOrder, sDimSize);
XTensor * y = NewTensor(sOrder, sDimSize);
/* initialize variables */
x->SetData(xData, sUnitNum);
y->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(x, y, 1);
/* check result */
cpuTest = y->CheckData(answer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, sUnitNum);
yGPU->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(xGPU, yGPU, 1);
/* check result */
gpuTest = yGPU->CheckData(answer, sUnitNum);
/* destroy variables */
delete x;
delete y;
delete xGPU;
delete yGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: test LogSoftmaxBackward function.
* dE/dx = dE/dy * dy/dx
* log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
*/
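/* Note on the expected gradient: for the cross-entropy loss
* E = -\sum_{i} g_i * y_i with y = log softmax(x), the combined gradient
* simplifies to dE/dx_i = softmax(x)_i - g_i = exp(y_i) - g_i.
* For x = [0, 1, 2] and g = [0.5, 0.8, 1.5] below,
* softmax(x) = [0.0900, 0.2447, 0.6652], giving
* dE/dx = [-0.4100, -0.5553, -0.8348], i.e. dedxAnswer.
*/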
bool TestLogSoftmax2()
{
/* an input tensor of size (3) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE xData[3] = {0.0, 1.0, 2.0};
DTYPE gData[3] = {0.5, 0.8, 1.5};
DTYPE yAnswer[3] = {-2.4076, -1.4076, -0.4076};
DTYPE dedxAnswer[3] = {-0.409969, -0.555272, -0.834759};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(sOrder, sDimSize);
XTensor * y = NewTensor(sOrder, sDimSize);
XTensor * g = NewTensor(sOrder, sDimSize);
XTensor * dedy = NewTensor(sOrder, sDimSize);
XTensor * dedx = NewTensor(sOrder, sDimSize);
/* initialize variables */
x->SetData(xData, sUnitNum);
g->SetData(gData, sUnitNum);
y->SetZeroAll();
dedx->SetZeroAll();
dedy->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(x, y, 0);
/* call LogSoftmaxBackward function */
LogSoftmaxBackward(g, y, x, dedy, dedx, 0, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, sUnitNum);
gGPU->SetData(gData, sUnitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(xGPU, yGPU, 0);
/* call LogSoftmaxBackward function */
LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 0, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 3: test LogSoftmaxBackward function.
* dE/dx = dE/dy * dy/dx
* log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
*/
bool TestLogSoftmax3()
{
/* a tensor of size (1, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 1;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE xData[1][3] = { {0.0, 1.0, 2.0} };
DTYPE gData[1][3] = { {0.5, 0.8, 1.5} };
DTYPE yAnswer[1][3] = { {-2.4076, -1.4076, -0.4076} };
DTYPE dedxAnswer[1][3] = { {-0.409969, -0.555272, -0.834759} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(sOrder, sDimSize);
XTensor * y = NewTensor(sOrder, sDimSize);
XTensor * g = NewTensor(sOrder, sDimSize);
XTensor * dedy = NewTensor(sOrder, sDimSize);
XTensor * dedx = NewTensor(sOrder, sDimSize);
/* initialize variables */
x->SetData(xData, sUnitNum);
g->SetData(gData, sUnitNum);
y->SetZeroAll();
dedx->SetZeroAll();
dedy->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, sUnitNum);
gGPU->SetData(gData, sUnitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for LogSoftmax Function */
extern "C"
bool TestLogSoftmax()
{
XPRINT(0, stdout, "[TEST LogSoftmax] test log softmax function and its backward computation \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestLogSoftmax1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
///* case 2 test */
//caseFlag = TestLogSoftmax2();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 2 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestLogSoftmax3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
*/
#ifndef __TEST_LOGSOFTMAX_H__
#define __TEST_LOGSOFTMAX_H__
#include "../function/LogSoftmax.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for LogSoftmax Function */
extern "C"
bool TestLogSoftmax();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_LOGSOFTMAX_H__
@@ -19,91 +19,240 @@
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/
#include "../core/ScaleAndShift.h"
#include "../function/Loss.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test LossCompute function
* In this case, Loss function name = SQUAREDERROR.
* loss = sum_{i} 0.5*(t_i - y_i)^2,
* where t_i is the gold standard and y_i is the model output
*/
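/* Sanity check on the expected value (derived from the code below): the
* ScaleAndShift calls turn the zero tensors into output = all 1s and
* gold = all 2s over 10 units, so
* loss = 10 * 0.5 * (2 - 1)^2 = 5.0, which is the answer tested against.
*/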
bool TestLoss1()
{
/* a tensor of size (10, 1) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 10;
dimSize[1] = 1;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
DTYPE answer = 5.0F;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
/* initialize variables */
output->SetZeroAll();
gold->SetZeroAll();
ScaleAndShift(output, 1, 1);
ScaleAndShift(gold, 1, 2);
DTYPE error;
error = LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
outputGPU->SetZeroAll();
goldGPU->SetZeroAll();
ScaleAndShift(outputGPU, 1, 1);
ScaleAndShift(goldGPU, 1, 2);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
/* destroy variables */
delete output;
delete gold;
delete outputGPU;
delete goldGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete output;
delete gold;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: test LossCompute function
* In this case, Loss function name = CROSSENTROPY.
* loss = sum_{i} (-t_i * log(y_i))
* where t_i is the gold standard and y_i is the model output
*/
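/* Sanity check on the expected value: output = all 1s and gold = all 2s
* (via the ScaleAndShift calls below), so loss = sum_{i} (-2 * log(1)) = 0
* because log(1) = 0, matching answer = 0.0F.
*/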
bool TestLoss2()
{
/* a tensor of size (10, 1) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 10;
dimSize[1] = 1;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
DTYPE answer = 0.0F;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
/* initialize variables */
output->SetZeroAll();
gold->SetZeroAll();
ScaleAndShift(output, 1, 1);
ScaleAndShift(gold, 1, 2);
DTYPE error;
error = LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
outputGPU->SetZeroAll();
goldGPU->SetZeroAll();
ScaleAndShift(outputGPU, 1, 1);
ScaleAndShift(goldGPU, 1, 2);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
/* destroy variables */
delete output;
delete gold;
delete outputGPU;
delete goldGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete output;
delete gold;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 3: test LossCompute function
* In this case, Loss function name = ONEHOTERROR.
* loss = sum_{i} e_i
* where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise
*/
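/* Sanity check on the expected value: goldData below has t_i = 1 at two
* positions, where the output is 0.5, so
* loss = 2 * 0.5 * (1 - 0.5)^2 = 0.25, which is the answer tested against.
*/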
bool TestLoss3()
{
/* a tensor of size (5, 1) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 5;
dimSize[1] = 1;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE outputData[5][1] = { {0.5},
{0.5},
{0.5},
{0.5},
{0.5} };
DTYPE goldData[5][1] = { {1.0},
{1.0},
{0.0},
{0.0},
{0.0} };
/* CPU test */
bool cpuTest = true;
DTYPE answer = 0.25F;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
/* initialize variables */
output->SetData(outputData, unitNum);
gold->SetData(goldData, unitNum);
DTYPE error;
error = LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensor */ /* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */ /* Initialize variables */
DTYPE* aGPU_data = (DTYPE*)aGPU->data; outputGPU->SetData(outputData, unitNum);
for (int i = 0; i < unitNum; i++) goldGPU->SetData(goldData, unitNum);
*aGPU_data++ = 1;
DTYPE* bGPU_data = (DTYPE*)bGPU->data;
for (int i = 0; i < unitNum; i++)
*bGPU_data++ = 1;
error = LossCompute(a, b, SQUAREDERROR, false, 1, 0, dimSize[0], 0);
printf("%d", error);
/* call reduce max function */
//ReduceMax(aGPU, reduce_aGPU, 0);
//ReduceMax(bGPU, reduce_bGPU, 1);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
gpuTest = true; gpuTest = (error == answer);
/* destroy variables */ /* destroy variables */
delete aGPU, bGPU; delete output;
delete gold;
delete outputGPU;
delete goldGPU;
delete[] dimSize; delete[] dimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete a; delete output;
delete b; delete gold;
delete[] dimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
...@@ -113,11 +262,11 @@ bool TestLoss1()
TODO!!
*/
/* test for Loss Function */
extern "C"
bool TestLoss()
{
XPRINT(0, stdout, "[TEST Loss] compute the loss \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
...@@ -129,6 +278,23 @@ extern "C"
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestLoss2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
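/* case 3 test */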
caseFlag = TestLoss3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
///* other cases test */
///*
//TODO!!
...@@ -145,4 +311,4 @@ extern "C"
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -26,9 +26,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Loss Function */
extern "C"
bool TestLoss();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_LOSS_H__
...@@ -19,13 +19,12 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#include "TMatrixMULBatchedCPU.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication in batch mode (CPU code).
* In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMulBatchedCPU1()
{
...@@ -110,18 +109,12 @@ bool TestMatrixMulBatchedCPU1()
MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
/* check results */
cpuTest = c1->CheckData(answer1, cUnitNum) && c2->CheckData(answer2, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU1 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aGPU2 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
...@@ -137,32 +130,56 @@ bool TestMatrixMulBatchedCPU1()
bGPU2->SetData(bData2, aUnitNum);
cGPU1->SetZeroAll();
cGPU2->SetZeroAll();
/* clear list */
aList->Clear();
bList->Clear();
cList->Clear();
/* add tensors to list */
aList->Add(aGPU1);
aList->Add(aGPU2);
bList->Add(bGPU1);
bList->Add(bGPU2);
cList->Add(cGPU1);
cList->Add(cGPU2);
/* call MatrixMULBatchedCPU function */
MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
/* check results */
gpuTest = cGPU1->CheckData(answer1, cUnitNum) && gpuTest;
gpuTest = cGPU2->CheckData(answer2, cUnitNum) && gpuTest;
/* destroy variables */
delete a1;
delete a2;
delete b1;
delete b2;
delete c1;
delete c2;
delete aGPU1;
delete aGPU2;
delete bGPU1;
delete bGPU2;
delete cGPU1;
delete cGPU2;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a1;
delete a2;
delete b1;
delete b2;
delete c1;
delete c2;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
...@@ -177,7 +194,7 @@ bool TestMatrixMulBatchedCPU1()
extern "C"
bool TestMatrixMulBatchedCPU()
{
XPRINT(0, stdout, "[TEST MATRIXMULBATCHEDCPU] matrix multiplication in batch mode (CPU code) \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
...@@ -190,15 +207,6 @@ bool TestMatrixMulBatchedCPU()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
...
...@@ -19,7 +19,6 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-14
*/
#include "TMatrixMul.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -59,13 +58,13 @@ bool TestMatrixMul1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
{-4.0, 5.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
/* CPU test */
bool cpuTest = true;
...@@ -167,14 +166,14 @@ bool TestMatrixMul2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2] = { {1.0, -4.0},
{2.0, 5.0},
{3.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
/* CPU test */
bool cpuTest = true;
...@@ -280,30 +279,30 @@ bool TestMatrixMul3()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0},
{2.0, 1.0, 3.0} },
{ {1.0, 2.0, 4.0},
{3.0, 1.0, 2.0} },
{ {-1.0, 3.0, 2.0},
{1.0, -1.0, 0.0} } };
DTYPE sData2[2][3][2] = { { {1.0, 2.0},
{-4.0, 3.0},
{2.0, 6.0} },
{ {1.0, 2.0},
{3.0, 4.0},
{5.0, 6.0} } };
DTYPE answer[3][2][2][2] = { { { {8.0, 9.0},
{4.0, 25.0} },
{ {7.0, 8.0},
{20.0, 26.0} } },
{ { {1.0, 32.0},
{3.0, 21.0} },
{ {27.0, 34.0},
{16.0, 22.0} } },
{ { {-9.0, 19.0},
{5.0, -1.0} },
{ {18.0, 22.0},
{-2.0, -2.0} } } };
/* CPU test */
bool cpuTest = true;
...@@ -407,21 +406,21 @@ bool TestMatrixMul4()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0},
{2.0, 1.0, 3.0} },
{ {1.0, 2.0, 4.0},
{3.0, 1.0, 2.0} },
{ {-1.0, 3.0, 2.0},
{1.0, -1.0, 0.0} } };
DTYPE sData2[3][2] = { {1.0, 2.0},
{3.0, 4.0},
{5.0, 6.0} };
DTYPE answer[3][2][2] = { { {7.0, 8.0},
{20.0, 26.0} },
{ {27.0, 34.0},
{16.0, 22.0} },
{ {18.0, 22.0},
{-2.0, -2.0} } };
/* CPU test */
bool cpuTest = true;
...
...@@ -19,13 +19,12 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#include "TMatrixMul2D.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication (for 2d tensors).
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
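/* an illustrative product (not the data used below): for A = {{1, 0, 2}, {0, 1, 1}} of size
(2, 3) and B = {{1, 2}, {3, 4}, {5, 6}} of size (3, 2), c[i][j] = sum_k A[i][k] * B[k][j]
gives c = {{11, 14}, {8, 10}}. */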
bool TestMatrixMul2D1()
{
...@@ -107,22 +106,33 @@ bool TestMatrixMul2D1()
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: matrix multiplication (for 2d tensors).
* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
* transposedA=X_TRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul2D2()
{
...@@ -205,14 +215,25 @@ bool TestMatrixMul2D2()
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
extern "C" extern "C"
bool TestMatrixMul2D() bool TestMatrixMul2D()
{ {
XPRINT(0, stdout, "[TEST MATRIXMUL2D] -------------\n"); XPRINT(0, stdout, "[TEST MATRIXMUL2D] matrix multiplication (for 2d tensors) \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TMatrixMul2DParallel.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication (for 2d tensors) with multi-threading.
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
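/* a worked check of the data used below: t[0][0] = 1*0 + 2*1 + 3*2 = 8,
t[0][1] = 1*(-1) + 2*2 + 3*1 = 6, t[1][0] = -4*0 + 5*1 + 6*2 = 17,
and t[1][1] = -4*(-1) + 5*2 + 6*1 = 20, matching the answer array. */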
bool TestMatrixMul2DParallel1()
{
/* a source tensor of size (2, 3) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
{-4.0, 5.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MatrixMul2DParallel function */
MatrixMul2DParallel(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
}
/* case 2: matrix multiplication (for 2d tensors) with multi-threading.
* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
* transposedA=X_TRANS, transposedB=X_NOTRANS.
*/
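/* a worked check of the data used below: with transposedA=X_TRANS,
s1^T = {{1, 2, 3}, {-4, 5, 6}}, so s1^T * s2 yields the same product as case 1,
namely {{8, 6}, {17, 20}}. */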
bool TestMatrixMul2DParallel2()
{
/* a source tensor of size (3, 2) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 3;
sDimSize1[1] = 2;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2] = { {1.0, -4.0},
{2.0, 5.0},
{3.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MatrixMul2DParallel function */
MatrixMul2DParallel(s1, X_TRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
}
/* other cases */
/*
TODO!!
*/
/* test for MatrixMul2DParallel Function */
extern "C"
bool TestMatrixMul2DParallel()
{
XPRINT(0, stdout, "[TEST MatrixMul2DParallel] matrix multiplication (for 2d tensors) with multi-threading \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestMatrixMul2DParallel1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMatrixMul2DParallel2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_MATRIXMUL2DPARALLEL_H__
#define __TEST_MATRIXMUL2DPARALLEL_H__
#include "../core/MatrixMul2DParallel.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MatrixMul2DParallel Function */
extern "C"
bool TestMatrixMul2DParallel();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_MATRIXMUL2DPARALLEL_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#include "TMatrixMULBatched.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication of the two tensors.
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
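/* a worked check of the data used below: t[0][0] = 1*0 + 2*1 + 3*2 = 8 and
t[1][1] = -4*(-1) + 5*2 + 6*1 = 20, consistent with the answer array. */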
bool TestMatrixMulBatched1()
{
/* a source tensor of size (2, 3) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
{-4.0, 5.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MatrixMulBatched function */
MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call MatrixMulBatched function */
MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: matrix multiplication of the two tensors.
* In this case, a=(2, 2, 3), b=(2, 3, 2) -> c=(2, 2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
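/* a worked check for batch 0 of the data used below:
c[0][0][0] = 0*1 + (-1)*(-4) + 2*2 = 8, c[0][0][1] = 0*2 + (-1)*3 + 2*6 = 9,
c[0][1][0] = 2*1 + 1*(-4) + 3*2 = 4, c[0][1][1] = 2*2 + 1*3 + 3*6 = 25,
matching answer[0] = {{8, 9}, {4, 25}}. */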
bool TestMatrixMulBatched2()
{
/* a source tensor of size (2, 2, 3) */
int sOrder1 = 3;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 2;
sDimSize1[2] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 3, 2) */
int sOrder2 = 3;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
sDimSize2[1] = 3;
sDimSize2[2] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2, 2) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
tDimSize[2] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][2][3] = { { {0.0, -1.0, 2.0},
{2.0, 1.0, 3.0} },
{ {1.0, 2.0, 4.0},
{3.0, 1.0, 2.0} } };
DTYPE sData2[2][3][2] = { { {1.0, 2.0},
{-4.0, 3.0},
{2.0, 6.0} },
{ {1.0, 2.0},
{3.0, 4.0},
{5.0, 6.0} } };
DTYPE answer[2][2][2] = { { {8.0, 9.0},
{4.0, 25.0} },
{ {27.0, 34.0},
{16.0, 22.0} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MatrixMulBatched function */
MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call MatrixMulBatched function */
MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for MatrixMulBatched Function */
extern "C"
bool TestMatrixMulBatched()
{
XPRINT(0, stdout, "[TEST MATRIXMULBATCHED] matrix multiplication of the two tensors \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestMatrixMulBatched1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMatrixMulBatched2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#ifndef __TEST_MATRIXMULBATCHED_H__
#define __TEST_MATRIXMULBATCHED_H__
#include "../core/MatrixMulBatched.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MatrixMulBatched Function */
extern "C"
bool TestMatrixMulBatched();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_MATRIXMULBATCHED_H__
...@@ -88,21 +88,29 @@ bool TestMerge1()
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: transform a tensor by merging it along with a dimension.
* In this case,
* (2, 2, 3) -> (4, 3), whereToMerge=1, leadingDim=0;
* (2, 2, 3) -> (2, 6), whereToMerge=2, leadingDim=0.
*/
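/* a concrete view of the two merges below: the source holds two (2, 3) slices along
the leading dimension; whereToMerge=1 stacks them along the row dimension to give
a (4, 3) tensor, while whereToMerge=2 concatenates them along the column dimension
to give a (2, 6) tensor. */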
bool TestMerge2()
{
...@@ -118,40 +126,55 @@ bool TestMerge2()
sUnitNum *= sDimSize[i];
/* a target tensor of size (4, 3) */
int tOrder1 = 2;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 4;
tDimSize1[1] = 3;
int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
tUnitNum1 *= tDimSize1[i];
/* a target tensor of size (2, 6) */
int tOrder2 = 2;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
tDimSize2[1] = 6;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][2][3] = { { {0.0, 1.0, 2.0},
{4.0, 5.0, 6.0} },
{ {-1.0, 2.0, 3.0},
{-4.0, -5.0, -6.0} } };
DTYPE answer1[4][3] = { {0.0, 1.0, 2.0},
{4.0, 5.0, 6.0},
{-1.0, 2.0, 3.0},
{-4.0, -5.0, -6.0} };
DTYPE answer2[2][6] = { {0.0, 1.0, 2.0, -1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, -4.0, -5.0, -6.0} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
/* initialize variables */
s->SetData(sData, sUnitNum);
t1->SetZeroAll();
t2->SetZeroAll();
/* call merge function */
Merge(s, t1, 1, 0);
Merge(s, t2, 2, 0);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
...@@ -159,121 +182,50 @@ bool TestMerge2()
/* create tensor */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* call merge function */
Merge(sGPU, tGPU1, 1, 0);
Merge(sGPU, tGPU2, 2, 0);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
delete t1;
delete t2;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t1;
delete t2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA
}
/* case 3: merge small tensors into a big tensor.
In this case, 2 * (2, 4) -> (4, 4), whereToMerge=0.
*/
bool TestMerge3()
{
/* create list */
XList * smallList = new XList();
...@@ -358,24 +310,36 @@ bool TestMerge4()
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
delete smallList;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize;
delete[] tDimSize;
delete smallList;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: merge small tensors into a big tensor.
In this case, 2 * (2, 4) -> (2, 8), whereToMerge=1.
*/
bool TestMerge4()
{
/* create list */
XList * smallList = new XList();
...@@ -458,15 +422,27 @@ bool TestMerge5()
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
delete smallList;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize;
delete[] tDimSize;
delete smallList;
return cpuTest;
#endif // USE_CUDA
...@@ -481,7 +457,7 @@ bool TestMerge5()
extern "C"
bool TestMerge()
{
XPRINT(0, stdout, "[TEST MERGE] transform a tensor by merging it along with a dimension or merge small tensors into a big tensor\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
...@@ -521,15 +497,6 @@ bool TestMerge()
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
...
...@@ -19,17 +19,16 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
*/
#include "TMultiplyElementWise.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: element-wise product of two tensors
* c(i) = a(i)*b(i) + \alpha * c(i)
* In this case, (2, 1) (2, 1) -> (2, 1), leadingDim=0, alpha=0.
*/
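/* a worked check of the data used below: with alpha = 0, c(i) = a(i) * b(i),
so c = {0.0 * 2.0, 1.0 * 3.0} = {0.0, 3.0}, matching the answer array. */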
bool TestMultiplyElementWise1()
{
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
...@@ -39,7 +38,7 @@ bool TestMultiplyElementWise1()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 1) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
...@@ -49,7 +48,7 @@ bool TestMultiplyElementWise1()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 1) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
...@@ -59,9 +58,12 @@ bool TestMultiplyElementWise1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][1] = { {2.0},
{3.0} };
DTYPE answer[2][1] = { {0.0},
{3.0} };
/* CPU test */
bool cpuTest = true;
...@@ -76,7 +78,7 @@ bool TestMultiplyElementWise1()
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MultiplyElementWise function */
MultiplyElementWise(s1, s2, t, 0);
/* check results */
...@@ -96,32 +98,44 @@ bool TestMultiplyElementWise1()
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call MultiplyElementWise function */
MultiplyElementWise(sGPU1, sGPU2, tGPU, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: element-wise product of two tensors
* c(i) = a(i)*b(i) + \alpha * c(i)
* In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/
bool TestMultiplyElementWise2()
{
/* a source tensor of size (2, 2) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
...@@ -131,7 +145,7 @@ bool TestMultiplyElementWise2()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
...@@ -141,7 +155,7 @@ bool TestMultiplyElementWise2()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
...@@ -171,7 +185,7 @@ bool TestMultiplyElementWise2()
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MultiplyElementWise function */
MultiplyElementWise(s1, s2, t, 0);
/* check results */
...@@ -191,32 +205,43 @@ bool TestMultiplyElementWise2()
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call MultiplyElementWise function */
MultiplyElementWise(sGPU1, sGPU2, tGPU, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
* In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=1, alpha=0.
*/
bool TestMultiplyElementWise3()
{
/* a source tensor of size (2, 2) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
...@@ -226,7 +251,7 @@ bool TestMultiplyElementWise3()
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
...@@ -236,7 +261,7 @@ bool TestMultiplyElementWise3()
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
...@@ -266,7 +291,7 @@ bool TestMultiplyElementWise3()
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MultiplyElementWise function */
MultiplyElementWise(s1, s2, t, 1);
/* check results */
...@@ -286,21 +311,32 @@ bool TestMultiplyElementWise3()
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call MultiplyElementWise function */
MultiplyElementWise(sGPU1, sGPU2, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
...@@ -315,7 +351,7 @@ TODO!!
extern "C" extern "C"
bool TestMultiplyElementWise() bool TestMultiplyElementWise()
{ {
XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] -------------\n"); XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -19,15 +19,13 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
*/
#include "TNegate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set every entry to its minus value */
bool TestNegate1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
...@@ -53,12 +51,12 @@ bool TestNegate1()
/* initialize variables */
a->SetData(aData, aUnitNum);
/* call Negate function */
Negate(a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
...@@ -69,14 +67,15 @@ bool TestNegate1()
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call Negate function */
Negate(aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete aGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
...@@ -92,7 +91,7 @@ bool TestNegate1()
/* case 2: set every entry to its minus value */
bool TestNegate2()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
...@@ -118,7 +117,7 @@ bool TestNegate2()
/* initialize variables */
a->SetData(aData, aUnitNum);
/* call Negate function */
Negate(a);
/* check results */
...@@ -134,14 +133,15 @@ bool TestNegate2()
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call Negate function */
Negate(aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete aGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
...@@ -163,7 +163,7 @@ TODO!!
extern "C" extern "C"
bool TestNegate() bool TestNegate()
{ {
XPRINT(0, stdout, "[TEST NEGATE] -------------\n"); XPRINT(0, stdout, "[TEST NEGATE] set every entry to its minus value \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -19,17 +19,17 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20
*/
#include "TNormalize.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: normalize the data with normal distribution.
* For an input x, y = a * (x - mean) / sqrt(variance + \epsilon) + b,
* where a and b are the scale and bias respectively,
* and \epsilon is the adjustment parameter.
*/
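/* a worked check of the expected values used below (dim=0, \epsilon=0, a=1, b=0):
answer[0][0] = (1.0 - 1.0) / sqrt(1.0) = 0.0, answer[0][1] = (2.0 - 1.5) / sqrt(1.0) = 0.5,
and answer[1][2] = (3.5 - 2.0) / sqrt(4.0) = 0.75. */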
bool TestNormalize1()
{
/* a source tensor of size (2, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
...@@ -39,7 +39,7 @@ bool TestNormalize1()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a target tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
...@@ -49,7 +49,7 @@ bool TestNormalize1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a mean tensor of size (3) */
int meanOrder = 1;
int * meanDimSize = new int[meanOrder];
meanDimSize[0] = 3;
...@@ -58,7 +58,7 @@ bool TestNormalize1()
for (int i = 0; i < meanOrder; i++)
meanUnitNum *= meanDimSize[i];
/* a variance tensor of size (3) */
int varOrder = 1;
int * varDimSize = new int[varOrder];
varDimSize[0] = 3;
...@@ -67,7 +67,7 @@ bool TestNormalize1()
for (int i = 0; i < varOrder; i++)
varUnitNum *= varDimSize[i];
/* a scale tensor of size (2, 3) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
...@@ -77,7 +77,7 @@ bool TestNormalize1()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a bias tensor of size (2, 3) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
...@@ -87,41 +87,39 @@ bool TestNormalize1()
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE sData[2][3] = { {1.0, 2.0, 3.0},
{1.5, 2.5, 3.5} };
DTYPE meanData[3] = {1.0, 1.5, 2.0};
DTYPE varData[3] = {1.0, 1.0, 4.0};
DTYPE aData[2][3] = { {1.0, 1.0, 1.0},
{1.0, 1.0, 1.0} };
DTYPE answer[2][3] = { {0.0, 0.5, 0.5},
{0.5, 1.0, 0.75} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * mean = NewTensor(meanOrder, meanDimSize);
XTensor * var = NewTensor(varOrder, varDimSize);
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
mean->SetData(meanData, meanUnitNum);
var->SetData(varData, varUnitNum);
a->SetData(aData, aUnitNum);
b->SetZeroAll();
t->SetZeroAll();
/* call Normalize function */
Normalize(s, t, 0, mean, var, a, b, 0.0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum, 1e-4, 0);
#ifdef USE_CUDA
/* GPU test */
...@@ -140,24 +138,50 @@ bool TestNormalize1()
meanGPU->SetData(meanData, meanUnitNum); meanGPU->SetData(meanData, meanUnitNum);
varGPU->SetData(varData, varUnitNum); varGPU->SetData(varData, varUnitNum);
aGPU->SetData(aData, aUnitNum); aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum); bGPU->SetZeroAll();
tGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call normalize function */ /* call Normalize function */
Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0); Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum); gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4, 0);
/* destroy variables */ /* destroy variables */
delete s, t, mean, var, a, b, sGPU, tGPU, meanGPU, varGPU, aGPU, bGPU; delete s;
delete[] sDimSize, tDimSize, meanDimSize, varDimSize, aDimSize, bDimSize; delete t;
delete mean;
delete var;
delete a;
delete b;
delete sGPU;
delete tGPU;
delete meanGPU;
delete varGPU;
delete aGPU;
delete bGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
delete[] varDimSize;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s, t, mean, var, a, b; delete s;
delete[] sDimSize, tDimSize, meanDimSize, varDimSize, aDimSize, bDimSize; delete t;
delete mean;
delete var;
delete a;
delete b;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
delete[] varDimSize;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -172,7 +196,7 @@ TODO!! ...@@ -172,7 +196,7 @@ TODO!!
extern "C" extern "C"
bool TestNormalize() bool TestNormalize()
{ {
XPRINT(0, stdout, "[TEST NORMALIZE] -------------\n"); XPRINT(0, stdout, "[TEST NORMALIZE] normalized the data with normal distribution \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -19,9 +19,8 @@ ...@@ -19,9 +19,8 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
*/ */
#include "../XTensor.h" #include "../XUtility.h"
#include "../XDevice.h" #include "TPower.h"
#include "../core/Power.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the power(a, p) /* case 1: get the power(a, p)
...@@ -29,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
*/ */
bool TestPower1() bool TestPower1()
{ {
/* a tensor of size 3 * 2 */ /* a tensor of size (3, 2) */
int aOrder = 2; int aOrder = 2;
int * aDimSize = new int[aOrder]; int * aDimSize = new int[aOrder];
aDimSize[0] = 3; aDimSize[0] = 3;
...@@ -55,11 +54,11 @@ bool TestPower1() ...@@ -55,11 +54,11 @@ bool TestPower1()
/* initialize variables */ /* initialize variables */
a->SetData(aData, aUnitNum); a->SetData(aData, aUnitNum);
/* call power function */ /* call Power function */
Power(a, 2.0); Power(a, 2.0);
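/* Editorial note: Power works in place here, mapping each item a_i to
   pow(a_i, 2.0), i.e. every element of a is squared (for instance 3.0
   would become 9.0), so the answer tensor holds the squared input. */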
/* check results */ /* check results */
cpuTest = a->CheckData(answer, aUnitNum); cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -75,10 +74,11 @@ bool TestPower1() ...@@ -75,10 +74,11 @@ bool TestPower1()
Power(aGPU, 2.0); Power(aGPU, 2.0);
/* check results */ /* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 0.0001F); gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete a, aGPU; delete a;
delete aGPU;
delete[] aDimSize; delete[] aDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
...@@ -96,7 +96,7 @@ bool TestPower1() ...@@ -96,7 +96,7 @@ bool TestPower1()
*/ */
bool TestPower2() bool TestPower2()
{ {
/* a tensor of size 3 * 2 */ /* a tensor of size (3, 2) */
int aOrder = 2; int aOrder = 2;
int * aDimSize = new int[aOrder]; int * aDimSize = new int[aOrder];
aDimSize[0] = 3; aDimSize[0] = 3;
...@@ -122,11 +122,11 @@ bool TestPower2() ...@@ -122,11 +122,11 @@ bool TestPower2()
/* initialize variables */ /* initialize variables */
a->SetData(aData, aUnitNum); a->SetData(aData, aUnitNum);
/* call power function */ /* call Power function */
Power(a, 1.0); Power(a, 1.0);
/* check results */ /* check results */
cpuTest = a->CheckData(answer, aUnitNum); cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -138,14 +138,15 @@ bool TestPower2() ...@@ -138,14 +138,15 @@ bool TestPower2()
/* Initialize variables */ /* Initialize variables */
aGPU->SetData(aData, aUnitNum); aGPU->SetData(aData, aUnitNum);
/* call power function */ /* call Power function */
Power(aGPU, 1.0); Power(aGPU, 1.0);
/* check results */ /* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum); gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete a, aGPU; delete a;
delete aGPU;
delete[] aDimSize; delete[] aDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
...@@ -163,7 +164,7 @@ bool TestPower2() ...@@ -163,7 +164,7 @@ bool TestPower2()
*/ */
bool TestPower3() bool TestPower3()
{ {
/* a tensor of size 3 * 2 */ /* a tensor of size (3, 2) */
int aOrder = 2; int aOrder = 2;
int * aDimSize = new int[aOrder]; int * aDimSize = new int[aOrder];
aDimSize[0] = 3; aDimSize[0] = 3;
...@@ -189,11 +190,11 @@ bool TestPower3() ...@@ -189,11 +190,11 @@ bool TestPower3()
/* initialize variables */ /* initialize variables */
a->SetData(aData, aUnitNum); a->SetData(aData, aUnitNum);
/* call power function */ /* call Power function */
Power(a, 0.0); Power(a, 0.0);
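/* Editorial note: with p = 0.0 every item becomes pow(a_i, 0.0) = 1.0
   (assuming the kernel follows the usual pow(x, 0) = 1 convention,
   including for x = 0), so the expected answer is an all-ones tensor. */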
/* check results */ /* check results */
cpuTest = a->CheckData(answer, aUnitNum); cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -205,14 +206,15 @@ bool TestPower3() ...@@ -205,14 +206,15 @@ bool TestPower3()
/* Initialize variables */ /* Initialize variables */
aGPU->SetData(aData, aUnitNum); aGPU->SetData(aData, aUnitNum);
/* call power function */ /* call Power function */
Power(aGPU, 0.0); Power(aGPU, 0.0);
/* check results */ /* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum); gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete a, aGPU; delete a;
delete aGPU;
delete[] aDimSize; delete[] aDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
...@@ -234,7 +236,7 @@ TODO!! ...@@ -234,7 +236,7 @@ TODO!!
extern "C" extern "C"
bool TestPower() bool TestPower()
{ {
XPRINT(0, stdout, "[TEST POWER] -------------\n"); XPRINT(0, stdout, "[TEST POWER] get the power(a, p) \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -19,15 +19,15 @@ ...@@ -19,15 +19,15 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
*/ */
#include "../XTensor.h" #include "TRectify.h"
#include "../XDevice.h"
#include "../function/Rectify.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: rectify function y = max(0, x) */ /* case 1: test rectify function
* y = max(0, x)
*/
bool TestRectify1() bool TestRectify1()
{ {
/* a x tensor of size 2 * 3 */ /* an x tensor of size (2, 3) */
int xOrder = 2; int xOrder = 2;
int * xDimSize = new int[xOrder]; int * xDimSize = new int[xOrder];
xDimSize[0] = 2; xDimSize[0] = 2;
...@@ -37,7 +37,7 @@ bool TestRectify1() ...@@ -37,7 +37,7 @@ bool TestRectify1()
for (int i = 0; i < xOrder; i++) for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i]; xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */ /* a y tensor of size (2, 3) */
int yOrder = 2; int yOrder = 2;
int * yDimSize = new int[yOrder]; int * yDimSize = new int[yOrder];
yDimSize[0] = 2; yDimSize[0] = 2;
...@@ -63,7 +63,7 @@ bool TestRectify1() ...@@ -63,7 +63,7 @@ bool TestRectify1()
x->SetData(xData, xUnitNum); x->SetData(xData, xUnitNum);
y->SetZeroAll(); y->SetZeroAll();
/* call rectify function */ /* call Rectify function */
Rectify(x, y); Rectify(x, y);
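/* Editorial note: Rectify applies y_i = max(0, x_i) elementwise, so
   positive inputs pass through unchanged and negative inputs are
   clamped to zero. */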
/* check results */ /* check results */
...@@ -81,159 +81,40 @@ bool TestRectify1() ...@@ -81,159 +81,40 @@ bool TestRectify1()
xGPU->SetData(xData, xUnitNum); xGPU->SetData(xData, xUnitNum);
yGPU->SetZeroAll(); yGPU->SetZeroAll();
/* call rectify function */ /* call Rectify function */
Rectify(xGPU, yGPU); Rectify(xGPU, yGPU);
/* check results */ /* check results */
gpuTest = yGPU->CheckData(answer, yUnitNum); gpuTest = yGPU->CheckData(answer, yUnitNum);
/* destroy variables */ /* destroy variables */
delete x, y, xGPU, yGPU; delete x;
delete[] xDimSize, yDimSize; delete y;
delete xGPU;
delete yGPU;
delete[] xDimSize;
delete[] yDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete x, y; delete x;
delete[] xDimSize, yDimSize; delete y;
delete[] xDimSize;
delete[] yDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: backward computation dE/dx = dE/dy * dy/dx rectified: y = max(0, x) /* case 2: backward computation
* dE/dx = dE/dy * dy/dx
* rectified: y = max(0, x)
* In this case, lossName=CROSSENTROPY. * In this case, lossName=NOLOSS.
*/ */
bool TestRectify2() bool TestRectify2()
{ {
/* a x tensor of size 2 * 3 */ /* an x tensor of size (2, 3) */
int xOrder = 2;
int * xDimSize = new int[xOrder];
xDimSize[0] = 2;
xDimSize[1] = 3;
int xUnitNum = 1;
for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */
int yOrder = 2;
int * yDimSize = new int[yOrder];
yDimSize[0] = 2;
yDimSize[1] = 3;
int yUnitNum = 1;
for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */
int goldOrder = 2;
int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2;
goldDimSize[1] = 3;
int goldUnitNum = 1;
for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */
int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2;
dedyDimSize[1] = 3;
int dedyUnitNum = 1;
for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */
int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2;
dedxDimSize[1] = 3;
int dedxUnitNum = 1;
for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE yData[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-1.0F, -1.0F, -0.5F},
{-0.5F, -0.25F, -0.2F} };
DTYPE answer[2][3] = { {-1.0F, -1.0F, -0.5F},
{-0.5F, -0.25F, -0.2F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(xOrder, xDimSize);
XTensor * y = NewTensor(yOrder, yDimSize);
XTensor * gold = NewTensor(goldOrder, goldDimSize);
XTensor * dedy = NewTensor(dedyOrder, dedyDimSize);
XTensor * dedx = NewTensor(dedxOrder, dedxDimSize);
/* initialize variables */
x->SetData(xData, xUnitNum);
y->SetData(yData, yUnitNum);
gold->SetData(goldData, goldUnitNum);
dedy->SetData(dedyData, dedyUnitNum);
dedx->SetZeroAll();
/* call rectifybackward function */
RectifyBackward(gold, y, x, dedy, dedx, CROSSENTROPY);
/* check results */
cpuTest = dedx->CheckData(answer, dedxUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(yOrder, yDimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(goldOrder, goldDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(dedyOrder, dedyDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(dedxOrder, dedxDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, xUnitNum);
yGPU->SetData(yData, yUnitNum);
goldGPU->SetData(goldData, goldUnitNum);
dedyGPU->SetData(dedyData, dedyUnitNum);
dedxGPU->SetZeroAll();
/* call rectifybackward function */
RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
/* check results */
gpuTest = dedxGPU->CheckData(answer, dedxUnitNum);
/* destroy variables */
delete x, y, dedy, dedx, gold, xGPU, yGPU, dedyGPU, dedxGPU, goldGPU;
delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x, y, dedy, dedx, gold;
delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 3: backward computation dE/dx = dE/dy * dy/dx rectified: y = max(0, x)
* In this case, lossName=SQUAREDERROR.
*/
bool TestRectify3()
{
/* a x tensor of size 2 * 3 */
int xOrder = 2; int xOrder = 2;
int * xDimSize = new int[xOrder]; int * xDimSize = new int[xOrder];
xDimSize[0] = 2; xDimSize[0] = 2;
...@@ -243,79 +124,39 @@ bool TestRectify3() ...@@ -243,79 +124,39 @@ bool TestRectify3()
for (int i = 0; i < xOrder; i++) for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i]; xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */
int yOrder = 2;
int * yDimSize = new int[yOrder];
yDimSize[0] = 2;
yDimSize[1] = 3;
int yUnitNum = 1;
for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */
int goldOrder = 2;
int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2;
goldDimSize[1] = 3;
int goldUnitNum = 1;
for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */
int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2;
dedyDimSize[1] = 3;
int dedyUnitNum = 1;
for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */
int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2;
dedxDimSize[1] = 3;
int dedxUnitNum = 1;
for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {1.0, 1.0, 2.0}, DTYPE xData[2][3] = { {1.0, 1.0, 2.0},
{2.0, 4.0, 5.0} }; {2.0, 4.0, 5.0} };
DTYPE yData[2][3] = { {1.0, 1.0, 2.0}, DTYPE yData[2][3] = { {1.0, 1.0, 2.0},
{2.0, 4.0, 5.0} }; {2.0, 4.0, 5.0} };
DTYPE goldData[2][3] = { {1.0, 1.0, 1.0}, DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
{1.0, 1.0, 1.0} }; {1.0, 1.0, 1.0} };
DTYPE dedyData[2][3] = { {0.0, 0.0, 1.0}, DTYPE dedyData[2][3] = { {-1.0, -1.0, -0.5},
{1.0, 3.0, 4.0} }; {-0.5, -0.25, -0.2} };
DTYPE answer[2][3] = { {0.0, 0.0, 1.0}, DTYPE answer[2][3] = { {-1.0, -1.0, -0.5},
{1.0, 3.0, 4.0} }; {-0.5, -0.25, -0.2} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * x = NewTensor(xOrder, xDimSize); XTensor * x = NewTensor(xOrder, xDimSize);
XTensor * y = NewTensor(yOrder, yDimSize); XTensor * y = NewTensor(xOrder, xDimSize);
XTensor * gold = NewTensor(goldOrder, goldDimSize); XTensor * gold = NewTensor(xOrder, xDimSize);
XTensor * dedy = NewTensor(dedyOrder, dedyDimSize); XTensor * dedy = NewTensor(xOrder, xDimSize);
XTensor * dedx = NewTensor(dedxOrder, dedxDimSize); XTensor * dedx = NewTensor(xOrder, xDimSize);
/* initialize variables */ /* initialize variables */
x->SetData(xData, xUnitNum); x->SetData(xData, xUnitNum);
y->SetData(yData, yUnitNum); y->SetData(yData, xUnitNum);
gold->SetData(goldData, goldUnitNum); gold->SetData(goldData, xUnitNum);
dedy->SetData(dedyData, dedyUnitNum); dedy->SetData(dedyData, xUnitNum);
dedx->SetZeroAll(); dedx->SetZeroAll();
/* call rectifybackward function */ /* call RectifyBackward function */
RectifyBackward(gold, y, x, dedy, dedx, CROSSENTROPY); RectifyBackward(gold, y, x, dedy, dedx, NOLOSS);
/* check results */ /* check results */
cpuTest = dedx->CheckData(answer, dedxUnitNum); cpuTest = dedx->CheckData(answer, xUnitNum);
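/* Editorial sketch of why answer == dedyData: for y = max(0, x), dy/dx is 1
   where x > 0 and 0 elsewhere. Every entry of xData is positive, so
   dE/dx = dE/dy * 1 = dE/dy; with lossName = NOLOSS the provided dedy is
   expected to be used directly rather than recomputed from gold and y. */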
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -323,160 +164,46 @@ bool TestRectify3() ...@@ -323,160 +164,46 @@ bool TestRectify3()
/* create tensors */ /* create tensors */
XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0); XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(yOrder, yDimSize, X_FLOAT, 1.0F, 0); XTensor * yGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(goldOrder, goldDimSize, X_FLOAT, 1.0F, 0); XTensor * goldGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(dedyOrder, dedyDimSize, X_FLOAT, 1.0F, 0); XTensor * dedyGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(dedxOrder, dedxDimSize, X_FLOAT, 1.0F, 0); XTensor * dedxGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
xGPU->SetData(xData, xUnitNum); xGPU->SetData(xData, xUnitNum);
yGPU->SetData(yData, yUnitNum); yGPU->SetData(yData, xUnitNum);
goldGPU->SetData(goldData, goldUnitNum); goldGPU->SetData(goldData, xUnitNum);
dedyGPU->SetData(dedyData, dedyUnitNum); dedyGPU->SetData(dedyData, xUnitNum);
dedxGPU->SetZeroAll(); dedxGPU->SetZeroAll();
/* call rectifybackward function */ /* call RectifyBackward function */
RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY); RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, NOLOSS);
/* check results */ /* check results */
gpuTest = dedxGPU->CheckData(answer, dedxUnitNum); gpuTest = dedxGPU->CheckData(answer, xUnitNum);
/* destroy variables */ /* destroy variables */
delete x, y, dedy, dedx, gold, xGPU, yGPU, dedyGPU, dedxGPU, goldGPU; delete x;
delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize; delete y;
delete dedy;
delete dedx;
delete gold;
delete xGPU;
delete yGPU;
delete dedyGPU;
delete dedxGPU;
delete goldGPU;
delete[] xDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete x, y, dedy, dedx, gold; delete x;
delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize; delete y;
delete dedy;
return cpuTest; delete dedx;
#endif // USE_CUDA delete gold;
} delete[] xDimSize;
/* case 4: backward computation dE/dx = dE/dy * dy/dx rectified: y = max(0, x)
* In this case, lossName=ONEHOTERROR.
*/
bool TestRectify4()
{
/* a x tensor of size 2 * 3 */
int xOrder = 2;
int * xDimSize = new int[xOrder];
xDimSize[0] = 2;
xDimSize[1] = 3;
int xUnitNum = 1;
for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */
int yOrder = 2;
int * yDimSize = new int[yOrder];
yDimSize[0] = 2;
yDimSize[1] = 3;
int yUnitNum = 1;
for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */
int goldOrder = 2;
int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2;
goldDimSize[1] = 3;
int goldUnitNum = 1;
for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */
int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2;
dedyDimSize[1] = 3;
int dedyUnitNum = 1;
for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */
int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2;
dedxDimSize[1] = 3;
int dedxUnitNum = 1;
for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {1.0, 1.0, -2.0},
{2.0, 4.0, 5.0} };
DTYPE yData[2][3] = { {1.0, 1.0, 0.0},
{2.0, 4.0, 5.0} };
DTYPE goldData[2][3] = { {1.0, 0.0, 1.0},
{1.0, 1.0, 0.0} };
DTYPE dedyData[2][3] = { {0.0, 0.0, -1.0},
{1.0, 3.0, 0.0} };
DTYPE answer[2][3] = { {0.0, 0.0, 0.0},
{1.0, 3.0, 0.0} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(xOrder, xDimSize);
XTensor * y = NewTensor(yOrder, yDimSize);
XTensor * gold = NewTensor(goldOrder, goldDimSize);
XTensor * dedy = NewTensor(dedyOrder, dedyDimSize);
XTensor * dedx = NewTensor(dedxOrder, dedxDimSize);
/* initialize variables */
x->SetData(xData, xUnitNum);
y->SetData(yData, yUnitNum);
gold->SetData(goldData, goldUnitNum);
dedy->SetData(dedyData, dedyUnitNum);
dedx->SetZeroAll();
/* call rectifybackward function */
RectifyBackward(gold, y, x, dedy, dedx, ONEHOTERROR);
/* check results */
cpuTest = dedx->CheckData(answer, dedxUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(yOrder, yDimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(goldOrder, goldDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(dedyOrder, dedyDimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(dedxOrder, dedxDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, xUnitNum);
yGPU->SetData(yData, yUnitNum);
goldGPU->SetData(goldData, goldUnitNum);
dedyGPU->SetData(dedyData, dedyUnitNum);
dedxGPU->SetZeroAll();
/* call rectifybackward function */
RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
/* check results */
gpuTest = dedxGPU->CheckData(answer, dedxUnitNum);
/* destroy variables */
delete x, y, dedy, dedx, gold, xGPU, yGPU, dedyGPU, dedxGPU, goldGPU;
delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x, y, dedy, dedx, gold;
delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -491,7 +218,7 @@ TODO!! ...@@ -491,7 +218,7 @@ TODO!!
extern "C" extern "C"
bool TestRectify() bool TestRectify()
{ {
XPRINT(0, stdout, "[TEST RECTIFY] -------------\n"); XPRINT(0, stdout, "[TEST RECTIFY] test rectify and its backward computation \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
...@@ -514,26 +241,6 @@ bool TestRectify() ...@@ -514,26 +241,6 @@ bool TestRectify()
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestRectify3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestRectify4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -16,246 +16,153 @@ ...@@ -16,246 +16,153 @@
*/ */
/* /*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30
*/ */
#include "../XTensor.h" #include "TReduceMax.h"
#include "../XDevice.h"
#include "../core/ReduceMax.h"
namespace nts { // namespace nt(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */ /* case 1: get the max value of the items along a dimension of the tensor.
bool TestReduceMax1() * In this case,
{ (2, 4) -> (4), dim = 0
/* a tensor of size 2 * 4 */ (2, 4) -> (2), dim = 1
int order = 2; */
int order_reduce = 1; bool TestReduceMax1()
int * dimSize = new int[order]; {
dimSize[0] = 2; /* an input tensor of size (2, 4) */
dimSize[1] = 4; int sOrder = 2;
int * sDimSize = new int[sOrder];
int unitNum = 1; sDimSize[0] = 2;
for (int i = 0; i < order; i++) sDimSize[1] = 4;
unitNum *= dimSize[i];
/* a tensor of size 4 */ int sUnitNum = 1;
int * dimSize_reduce_a = new int[order_reduce]; for (int i = 0; i < sOrder; i++)
dimSize_reduce_a[0] = 4; sUnitNum *= sDimSize[i];
int unitNum_a = 1; /* an output tensor of size (4) */
for (int i = 0; i < order_reduce; i++) int tOrder1 = 1;
unitNum_a *= dimSize_reduce_a[i]; int * tDimSize1 = new int[tOrder1];
/* a tensor of size 2 */ tDimSize1[0] = 4;
int * dimSize_reduce_b = new int[order_reduce];
dimSize_reduce_b[0] = 2; int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
int unitNum_b = 1; tUnitNum1 *= tDimSize1[i];
for (int i = 0; i < order_reduce; i++)
unitNum_b *= dimSize_reduce_b[i]; /* an output tensor of size (2) */
int tOrder2 = 1;
int * tDimSize2 = new int[tOrder2];
DTYPE aData[2][4] = { { 0.0, 1.0, 2.0, 3.0 }, tDimSize2[0] = 2;
{ 4.0, 5.0, 6.0, 7.0 } };
DTYPE bData[2][4] = { { 1.0, -1.0, -3.0, -5.0 }, int tUnitNum2 = 1;
{ -7.0, -9.0, -11.0, -13.0 } }; for (int i = 0; i < tOrder2; i++)
DTYPE answer_a[4] = { 4.0, 5.0, 6.0, 7.0 }; tUnitNum2 *= tDimSize2[i];
DTYPE answer_b[2] = { 1.0, -7.0 };
DTYPE sData[2][4] = { {0.0, 5.0, 2.0, 3.0},
/* CPU test */ {4.0, 1.0, 6.0, 7.0} };
bool cpuTest = true; DTYPE answer1[4] = {4.0, 5.0, 6.0, 7.0};
DTYPE answer2[2] = {5.0, 7.0};
/* create tensors */
XTensor * a = NewTensor(order, dimSize); /* CPU test */
XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a); bool cpuTest = true;
XTensor * b = NewTensor(order, dimSize);
XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b); /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
/* initialize variables */ XTensor * t1 = NewTensor(tOrder1, tDimSize1);
a->SetData(aData, unitNum); XTensor * t2 = NewTensor(tOrder2, tDimSize2);
b->SetData(bData, unitNum);
/* initialize variables */
s->SetData(sData, sUnitNum);
/* call reduce max function */ t1->SetZeroAll();
ReduceMax(a, reduce_a, 0); t2->SetZeroAll();
ReduceMax(b, reduce_b, 1);
/* call ReduceMax function */
//DTYPE* reduce_a_data = (DTYPE*)reduce_a->data; ReduceMax(s, t1, 0);
//for (int i = 0; i < unitNum_a; i++) ReduceMax(s, t2, 1);
// printf("%f ", *reduce_a_data++);
//printf("\n"); /* check results */
//DTYPE* reduce_b_data = (DTYPE*)reduce_b->data; cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
//for (int i = 0; i < unitNum_b; i++)
// printf("%f ", *reduce_b_data++);
/* check results */
cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
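/* Sanity sketch (not part of the test): a naive reference for dim = 0 over
   the (2, 4) input takes the max over rows for each column,
       ref1[j] = MAX(sData[0][j], sData[1][j]);  -> {4.0, 5.0, 6.0, 7.0}
   which is answer1, while dim = 1 takes the max within each row,
   giving {5.0, 7.0} = answer2. */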
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensor */ /* create tensors */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT); XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
/* initialize variables */
/* Initialize variables */ sGPU->SetData(sData, sUnitNum);
aGPU->SetData(aData, unitNum); tGPU1->SetZeroAll();
bGPU->SetData(bData, unitNum); tGPU2->SetZeroAll();
/* call reduce max function */ /* call ReduceMax function */
ReduceMax(aGPU, reduce_aGPU, 0); ReduceMax(sGPU, tGPU1, 0);
ReduceMax(bGPU, reduce_bGPU, 1); ReduceMax(sGPU, tGPU2, 1);
/* check results */ /* check results */
gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b); gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
/* destroy variables */ /* destroy variables */
delete aGPU, bGPU, reduce_aGPU, reduce_bGPU; delete s;
delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b; delete t1;
return cpuTest && gpuTest; delete t2;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete a; delete s;
delete b; delete t1;
return cpuTest; delete t2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
bool TestReduceMaxForLargescale()
{
/* a tensor of size 10000 * 500 */
int order = 2;
int order_reduce = 1;
int * dimSize = new int[order];
dimSize[0] = 10000;
dimSize[1] = 500;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* a tensor of size 500 */
int * dimSize_reduce_a = new int[order_reduce];
dimSize_reduce_a[0] = 500;
int unitNum_a = 1; /* other cases */
for (int i = 0; i < order_reduce; i++) /*
unitNum_a *= dimSize_reduce_a[i]; TODO!!
/* a tensor of size 10000 */ */
int * dimSize_reduce_b = new int[order_reduce];
dimSize_reduce_b[0] = 10000;
int unitNum_b = 1;
for (int i = 0; i < order_reduce; i++)
unitNum_b *= dimSize_reduce_b[i];
DTYPE * data = new DTYPE[5000000];
DTYPE * tmp = data;
for (int i = 0; i < unitNum; i++)
*tmp++ = 1;
DTYPE answer_a[500];
for (int i = 0; i < unitNum_a; i++)
answer_a[i] = 1;
DTYPE answer_b[10000];
for (int i = 0; i < unitNum_b; i++)
answer_b[i] = 1;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(order, dimSize);
XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
XTensor * b = NewTensor(order, dimSize);
XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
/* initialize variables */
a->SetData(data, unitNum);
b->SetData(data, unitNum);
/* call reduce max function */
ReduceMax(a, reduce_a, 0);
ReduceMax(b, reduce_b, 1);
/* check results */
cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
/* Initialize variables */
aGPU->SetData(data, unitNum);
bGPU->SetData(data, unitNum);
/* call reduce max function */
ReduceMax(aGPU, reduce_aGPU, 0);
ReduceMax(bGPU, reduce_bGPU, 1);
/* check results */
gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
/* destroy variables */ /* test for ReduceMax Function */
delete aGPU, bGPU, reduce_aGPU, reduce_bGPU; extern "C"
delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b; bool TestReduceMax()
return cpuTest && gpuTest; {
#else XPRINT(0, stdout, "[TEST ReduceMax] get the max value of the items along a dimension of the tensor\n");
/* destroy variables */ bool returnFlag = true, caseFlag = true;
delete a;
delete b; /* case 1 test */
return cpuTest; caseFlag = TestReduceMax1();
#endif // USE_CUDA if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
} }
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases */ /* other cases test */
/* /*
TODO!! TODO!!
*/ */
/* test for Sum Function */ if (returnFlag) {
extern "C" XPRINT(0, stdout, ">> All Passed!\n");
bool TestReduceMax() }
{ else
XPRINT(0, stdout, "[TEST ReduceMax]\n"); XPRINT(0, stdout, ">> Failed!\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestReduceMax1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMaxForLargescale();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* other cases test */
///*
//TODO!!
//*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n"); XPRINT(0, stdout, "\n");
return returnFlag; return returnFlag;
} }
} // namespace nt(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
*/ */
/* /*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30
*/ */
#ifndef __TEST_REDUCEMAX_H__ #ifndef __TEST_REDUCEMAX_H__
...@@ -24,12 +24,11 @@ ...@@ -24,12 +24,11 @@
#include "../core/ReduceMax.h" #include "../core/ReduceMax.h"
namespace nts { // namespace nt(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for ReduceMax Function */ /* test for ReduceMax Function */
extern "C" extern "C"
bool TestReduceMax(); bool TestReduceMax();
} // namespace nt(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __TEST_REDUCEMAX_H__ #endif // __TEST_REDUCEMAX_H__
...@@ -19,244 +19,246 @@ ...@@ -19,244 +19,246 @@
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/ */
#include "../XTensor.h" #include "TReduceMean.h"
#include "../XDevice.h"
#include "../core/ReduceMean.h"
#include "../core/ReduceMax.h"
#include "../core/ReduceSum.h"
namespace nts { // namespace nt(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */ /* case 1: get the mean value along a dimension of the tensor */
bool TestReduceMean1()
bool TestReduceMean1() {
{ /* a tensor of size (2, 4) */
/* a tensor of size 2 * 4 */ int sOrder = 2;
int order = 2; int * sDimSize = new int[sOrder];
int order_reduce = 1; sDimSize[0] = 2;
int * dimSize = new int[order]; sDimSize[1] = 4;
dimSize[0] = 2;
dimSize[1] = 4; int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
int unitNum = 1; sUnitNum *= sDimSize[i];
for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; /* a tensor of size (4) */
/* a tensor of size 4 */ int tOrder1 = 1;
int * dimSize_reduce_a = new int[order_reduce]; int * tDimSize1 = new int[tOrder1];
dimSize_reduce_a[0] = 4; tDimSize1[0] = 4;
int unitNum_a = 1; int tUnitNum1 = 1;
for (int i = 0; i < order_reduce; i++) for (int i = 0; i < tOrder1; i++)
unitNum_a *= dimSize_reduce_a[i]; tUnitNum1 *= tDimSize1[i];
/* a tensor of size 2 */
int * dimSize_reduce_b = new int[order_reduce]; /* a tensor of size (2) */
dimSize_reduce_b[0] = 2; int tOrder2 = 1;
int * tDimSize2 = new int[tOrder2];
int unitNum_b = 1; tDimSize2[0] = 2;
for (int i = 0; i < order_reduce; i++)
unitNum_b *= dimSize_reduce_b[i]; int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE aData[2][4] = { { 0.0, 1.0, 2.0, 3.0 },
{ 4.0, 5.0, 6.0, 7.0 } }; DTYPE sData[2][4] = { { 0.0, 1.0, 2.0, 3.0 },
DTYPE bData[2][4] = { { 1.0, -1.0, -3.0, -5.0 }, { 4.0, 5.0, 6.0, 7.0 } };
{ -7.0, -9.0, -11.0, -13.0 } }; DTYPE answer1[4] = {2.0, 3.0, 4.0, 5.0};
DTYPE answer_a[4] = { 2.0, 3.0, 4.0, 5.0 }; DTYPE answer2[2] = {1.5, 5.5};
DTYPE answer_b[2] = { -2.0, -10.0 };
/* CPU test */
/* CPU test */ bool cpuTest = true;
bool cpuTest = true;
/* create tensors */
/* create tensors */ XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * a = NewTensor(order, dimSize); XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a); XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor * b = NewTensor(order, dimSize);
XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b); /* initialize variables */
s->SetData(sData, sUnitNum);
/* initialize variables */ t1->SetZeroAll();
a->SetData(aData, unitNum); t2->SetZeroAll();
b->SetData(bData, unitNum);
/* call ReduceMean function */
ReduceMean(s, t1, 0);
/* call reduce mean function */ ReduceMean(s, t2, 1);
ReduceMean(a, reduce_a, 0);
ReduceMean(b, reduce_b, 1); /* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
//DTYPE* reduce_a_data = (DTYPE*)reduce_a->data;
//for (int i = 0; i < unitNum_a; i++)
// printf("%f ", *reduce_a_data++);
//printf("\n");
//DTYPE* reduce_b_data = (DTYPE*)reduce_b->data;
//for (int i = 0; i < unitNum_b; i++)
// printf("%f ", *reduce_b_data++);
/* check results */
cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
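/* Sanity sketch (not part of the test): along dim 0 the column means of
   sData are {(0+4)/2, (1+5)/2, (2+6)/2, (3+7)/2} = {2, 3, 4, 5} = answer1;
   along dim 1 the row means are {6/4, 22/4} = {1.5, 5.5} = answer2. */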
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensor */ /* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT); XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
/* Initialize variables */
/* Initialize variables */ sGPU->SetData(sData, sUnitNum);
aGPU->SetData(aData, unitNum); tGPU1->SetZeroAll();
bGPU->SetData(bData, unitNum); tGPU2->SetZeroAll();
/* call reduce mean function */ /* call ReduceMean function */
ReduceMean(aGPU, reduce_aGPU, 0); ReduceMean(sGPU, tGPU1, 0);
ReduceMean(bGPU, reduce_bGPU, 1); ReduceMean(sGPU, tGPU2, 1);
/* check results */ /* check results */
gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b); gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
/* destroy variables */ /* destroy variables */
delete aGPU, bGPU, reduce_aGPU, reduce_bGPU; delete s;
delete dimSize, dimSize_reduce_a, dimSize_reduce_b; delete t1;
return cpuTest && gpuTest; delete t2;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete a; delete s;
delete b; delete t1;
return cpuTest; delete t2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
bool TestReduceMeanForLargescale() bool TestReduceMeanForLargescale()
{ {
/* a tensor of size 10000 * 500 */ /* a tensor of size 10000 * 500 */
int order = 2; int order = 2;
int order_reduce = 1; int order_reduce = 1;
int * dimSize = new int[order]; int * dimSize = new int[order];
dimSize[0] = 10000; dimSize[0] = 10000;
dimSize[1] = 500; dimSize[1] = 500;
int unitNum = 1; int unitNum = 1;
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; unitNum *= dimSize[i];
/* a tensor of size 500 */ /* a tensor of size 500 */
int * dimSize_reduce_a = new int[order_reduce]; int * dimSize_reduce_a = new int[order_reduce];
dimSize_reduce_a[0] = 500; dimSize_reduce_a[0] = 500;
int unitNum_a = 1; int unitNum_a = 1;
for (int i = 0; i < order_reduce; i++) for (int i = 0; i < order_reduce; i++)
unitNum_a *= dimSize_reduce_a[i]; unitNum_a *= dimSize_reduce_a[i];
/* a tensor of size 10000 */ /* a tensor of size 10000 */
int * dimSize_reduce_b = new int[order_reduce]; int * dimSize_reduce_b = new int[order_reduce];
dimSize_reduce_b[0] = 10000; dimSize_reduce_b[0] = 10000;
int unitNum_b = 1; int unitNum_b = 1;
for (int i = 0; i < order_reduce; i++) for (int i = 0; i < order_reduce; i++)
unitNum_b *= dimSize_reduce_b[i]; unitNum_b *= dimSize_reduce_b[i];
DTYPE * data = new DTYPE[5000000]; DTYPE * data = new DTYPE[5000000];
DTYPE * tmp = data; DTYPE * tmp = data;
for (int i = 0; i < unitNum; i++) for (int i = 0; i < unitNum; i++)
*tmp++ = 1; *tmp++ = 1;
DTYPE answer_a[500]; DTYPE answer_a[500];
for (int i = 0; i < unitNum_a; i++) for (int i = 0; i < unitNum_a; i++)
answer_a[i] = 1; answer_a[i] = 1;
DTYPE answer_b[10000]; DTYPE answer_b[10000];
for (int i = 0; i < unitNum_b; i++) for (int i = 0; i < unitNum_b; i++)
answer_b[i] = 1; answer_b[i] = 1;
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * a = NewTensor(order, dimSize); XTensor * a = NewTensor(order, dimSize);
XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a); XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
XTensor * b = NewTensor(order, dimSize); XTensor * b = NewTensor(order, dimSize);
XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b); XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
/* initialize variables */ /* initialize variables */
a->SetData(data, unitNum); a->SetData(data, unitNum);
b->SetData(data, unitNum); b->SetData(data, unitNum);
/* call reduce max function */ /* call reduce max function */
ReduceMean(a, reduce_a, 0); ReduceMean(a, reduce_a, 0);
ReduceMean(b, reduce_b, 1); ReduceMean(b, reduce_b, 1);
/* check results */ /* check results */
cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b); cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensor */ /* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT); XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT); XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
/* Initialize variables */ /* Initialize variables */
aGPU->SetData(data, unitNum); aGPU->SetData(data, unitNum);
bGPU->SetData(data, unitNum); bGPU->SetData(data, unitNum);
/* call reduce max function */ /* call reduce max function */
ReduceMean(aGPU, reduce_aGPU, 0); ReduceMean(aGPU, reduce_aGPU, 0);
ReduceMean(bGPU, reduce_bGPU, 1); ReduceMean(bGPU, reduce_bGPU, 1);
/* check results */ /* check results */
gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b); gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
/* destroy variables */ /* destroy variables */
delete aGPU, bGPU, reduce_aGPU, reduce_bGPU; delete aGPU; delete bGPU; delete reduce_aGPU; delete reduce_bGPU;
delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b; delete[] dimSize; delete[] dimSize_reduce_a; delete[] dimSize_reduce_b;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete a; delete a;
delete b; delete b;
return cpuTest;
return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for ReduceMean Function */
extern "C"
bool TestReduceMean()
{
XPRINT(0, stdout, "[TEST ReduceMean] get the mean value along a dimension of the tensor \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestReduceMean1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMeanForLargescale();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
} }
/* other cases */ else
/* XPRINT(0, stdout, ">> case 2 passed!\n");
TODO!!
*/ ///* other cases test */
///*
/* test for Sum Function */ //TODO!!
extern "C" //*/
bool TestReduceMean()
{ if (returnFlag) {
XPRINT(0, stdout, "[TEST ReduceMean]\n"); XPRINT(0, stdout, ">> All Passed!\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestReduceMean1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMeanForLargescale();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* other cases test */
///*
//TODO!!
//*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
} }
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nt(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -19,106 +19,111 @@ ...@@ -19,106 +19,111 @@
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/ */
#include "../XTensor.h" #include "TReduceSum.h"
#include "../XDevice.h"
#include "../core/ReduceMean.h" namespace nts { // namespace nts(NiuTrans.Tensor)
#include "../core/ReduceMax.h" /* case 1: sum the items along a dimension of the tensor.
#include "../core/ReduceSum.h" * In this case,
(2, 4) -> (4), dim = 0
namespace nts { // namespace nt(NiuTrans.Tensor) (2, 4) -> (2), dim = 1
/* case 1 */ */
bool TestReduceSum1() bool TestReduceSum1()
{ {
/* a tensor of size 2 * 4 */ /* a tensor of size (2, 4) */
int order = 2; int sOrder = 2;
int order_reduce = 1; int * sDimSize = new int[sOrder];
int * dimSize = new int[order]; sDimSize[0] = 2;
dimSize[0] = 2; sDimSize[1] = 4;
dimSize[1] = 4;
int sUnitNum = 1;
int unitNum = 1; for (int i = 0; i < sOrder; i++)
for (int i = 0; i < order; i++) sUnitNum *= sDimSize[i];
unitNum *= dimSize[i];
/* a tensor of size 4 */ /* a tensor of size (4) */
int * dimSize_reduce_a = new int[order_reduce]; int tOrder1 = 1;
dimSize_reduce_a[0] = 4; int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 4;
int unitNum_a = 1;
for (int i = 0; i < order_reduce; i++) int tUnitNum1 = 1;
unitNum_a *= dimSize_reduce_a[i]; for (int i = 0; i < tOrder1; i++)
/* a tensor of size 2 */ tUnitNum1 *= tDimSize1[i];
int * dimSize_reduce_b = new int[order_reduce];
dimSize_reduce_b[0] = 2; /* a tensor of size (2) */
int tOrder2 = 1;
int unitNum_b = 1; int * tDimSize2 = new int[tOrder2];
for (int i = 0; i < order_reduce; i++) tDimSize2[0] = 2;
unitNum_b *= dimSize_reduce_b[i];
int tUnitNum2 = 1;
DTYPE aData[2][4] = { { 0.0, 1.0, 2.0, 3.0 }, for (int i = 0; i < tOrder2; i++)
{ 4.0, 5.0, 6.0, 7.0 } }; tUnitNum2 *= tDimSize2[i];
DTYPE bData[2][4] = { { 1.0, -1.0, -3.0, -5.0 },
{ -7.0, -9.0, -11.0, -13.0 } }; DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
DTYPE answer_a[4] = { 4.0, 6.0, 8.0, 10.0 }; {4.0, 5.0, 6.0, 7.0} };
DTYPE answer_b[2] = { -8.0, -40.0 }; DTYPE answer1[4] = {4.0, 6.0, 8.0, 10.0};
DTYPE answer2[2] = {6.0, 22.0};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * a = NewTensor(order, dimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a); XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * b = NewTensor(order, dimSize); XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
/* initialize variables */ /* initialize variables */
a->SetData(aData, unitNum); s->SetData(sData, sUnitNum);
b->SetData(bData, unitNum); t1->SetZeroAll();
t2->SetZeroAll();
/* call reduce sum function */ /* call ReduceSum function */
ReduceSum(a, reduce_a, 0); ReduceSum(s, t1, 0);
ReduceSum(b, reduce_b, 1); ReduceSum(s, t2, 1);
//DTYPE* reduce_a_data = (DTYPE*)reduce_a->data;
//for (int i = 0; i < unitNum_a; i++)
// printf("%f ", *reduce_a_data++);
//printf("\n");
//DTYPE* reduce_b_data = (DTYPE*)reduce_b->data;
//for (int i = 0; i < unitNum_b; i++)
// printf("%f ", *reduce_b_data++);
/* check results */ /* check results */
cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b); cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
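/* Sanity sketch (not part of the test): along dim 0 the column sums of
   sData are {0+4, 1+5, 2+6, 3+7} = {4, 6, 8, 10} = answer1; along dim 1
   the row sums are {0+1+2+3, 4+5+6+7} = {6, 22} = answer2. */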
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensor */ /* create tensors */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT, 1.0F, 0);
/* Initialize variables */ /* initialize variables */
aGPU->SetData(aData, unitNum); sGPU->SetData(sData, sUnitNum);
bGPU->SetData(bData, unitNum); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* call reduce sum function */ /* call ReduceSum function */
ReduceSum(aGPU, reduce_aGPU, 0); ReduceSum(sGPU, tGPU1, 0);
ReduceSum(bGPU, reduce_bGPU, 1); ReduceSum(sGPU, tGPU2, 1);
/* check results */ /* check results */
gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b); gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
/* destroy variables */ /* destroy variables */
delete aGPU, bGPU, reduce_aGPU, reduce_bGPU; delete s;
delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b; delete t1;
delete t2;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete a; delete s;
delete b; delete t1;
delete t2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
...@@ -127,7 +132,7 @@ bool TestReduceSumForLargescale() ...@@ -127,7 +132,7 @@ bool TestReduceSumForLargescale()
{ {
/* a tensor of size 10000 * 500 */ /* a tensor of size 10000 * 500 */
int order = 2; int order = 2;
int order_reduce = 1; int orderReduce = 1;
int * dimSize = new int[order]; int * dimSize = new int[order];
dimSize[0] = 10000; dimSize[0] = 10000;
dimSize[1] = 500; dimSize[1] = 500;
...@@ -136,18 +141,18 @@ bool TestReduceSumForLargescale() ...@@ -136,18 +141,18 @@ bool TestReduceSumForLargescale()
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; unitNum *= dimSize[i];
/* a tensor of size 500 */ /* a tensor of size 500 */
int * dimSize_reduce_a = new int[order_reduce]; int * dimSize_reduce_a = new int[orderReduce];
dimSize_reduce_a[0] = 500; dimSize_reduce_a[0] = 500;
int unitNum_a = 1; int unitNum_a = 1;
for (int i = 0; i < order_reduce; i++) for (int i = 0; i < orderReduce; i++)
unitNum_a *= dimSize_reduce_a[i]; unitNum_a *= dimSize_reduce_a[i];
/* a tensor of size 10000 */ /* a tensor of size 10000 */
int * dimSize_reduce_b = new int[order_reduce]; int * dimSize_reduce_b = new int[orderReduce];
dimSize_reduce_b[0] = 10000; dimSize_reduce_b[0] = 10000;
int unitNum_b = 1; int unitNum_b = 1;
for (int i = 0; i < order_reduce; i++) for (int i = 0; i < orderReduce; i++)
unitNum_b *= dimSize_reduce_b[i]; unitNum_b *= dimSize_reduce_b[i];
DTYPE * data = new DTYPE[5000000]; DTYPE * data = new DTYPE[5000000];
...@@ -166,9 +171,9 @@ bool TestReduceSumForLargescale() ...@@ -166,9 +171,9 @@ bool TestReduceSumForLargescale()
/* create tensors */ /* create tensors */
XTensor * a = NewTensor(order, dimSize); XTensor * a = NewTensor(order, dimSize);
XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a); XTensor * reduce_a = NewTensor(orderReduce, dimSize_reduce_a);
XTensor * b = NewTensor(order, dimSize); XTensor * b = NewTensor(order, dimSize);
XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b); XTensor * reduce_b = NewTensor(orderReduce, dimSize_reduce_b);
/* initialize variables */ /* initialize variables */
a->SetData(data, unitNum); a->SetData(data, unitNum);
...@@ -186,9 +191,9 @@ bool TestReduceSumForLargescale() ...@@ -186,9 +191,9 @@ bool TestReduceSumForLargescale()
/* create tensor */ /* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT); XTensor * reduce_aGPU = NewTensor(orderReduce, dimSize_reduce_a, X_FLOAT);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT); XTensor * reduce_bGPU = NewTensor(orderReduce, dimSize_reduce_b, X_FLOAT);
/* Initialize variables */ /* Initialize variables */
aGPU->SetData(data, unitNum); aGPU->SetData(data, unitNum);
...@@ -222,7 +227,7 @@ TODO!! ...@@ -222,7 +227,7 @@ TODO!!
extern "C" extern "C"
bool TestReduceSum() bool TestReduceSum()
{ {
XPRINT(0, stdout, "[TEST ReduceSum]\n"); XPRINT(0, stdout, "[TEST ReduceSum] sum the items along a dimension of the tensor.\n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
...@@ -259,4 +264,4 @@ bool TestReduceSum() ...@@ -259,4 +264,4 @@ bool TestReduceSum()
return returnFlag; return returnFlag;
} }
} // namespace nt(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -24,13 +24,13 @@ ...@@ -24,13 +24,13 @@
#include "../core/ReduceSum.h" #include "../core/ReduceSum.h"
namespace nts { // namespace nt(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for ReduceSum Function */ /* test for ReduceSum Function */
extern "C" extern "C"
bool TestReduceSum(); bool TestReduceSum();
} // namespace nt(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __TEST_REDUCESUM_H__ #endif // __TEST_REDUCESUM_H__
......
...@@ -19,33 +19,35 @@ ...@@ -19,33 +19,35 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/ */
#include "../XTensor.h"
#include "TReduceSumSquared.h" #include "TReduceSumSquared.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */ /* case 1: squared sum of the items along a dimension of the tensor.
* For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
* In this case, (2, 4) -> (4), dim = 0.
*/
bool TestReduceSumSquared1() bool TestReduceSumSquared1()
{ {
/* a input tensor of size 2 * 4 */ /* an input tensor of size (2, 4) */
int inputOrder = 2; int sOrder = 2;
int * inputDimSize = new int[inputOrder]; int * sDimSize = new int[sOrder];
inputDimSize[0] = 2; sDimSize[0] = 2;
inputDimSize[1] = 4; sDimSize[1] = 4;
int inputUnitNum = 1; int sUnitNum = 1;
for (int i = 0; i < inputOrder; i++) for (int i = 0; i < sOrder; i++)
inputUnitNum *= inputDimSize[i]; sUnitNum *= sDimSize[i];
/* a output tensor of size 4 */ /* an output tensor of size (4) */
int outputOrder = 1; int tOrder = 1;
int * outputDimSize = new int[outputOrder]; int * tDimSize = new int[tOrder];
outputDimSize[0] = 4; tDimSize[0] = 4;
int outputUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < outputOrder; i++) for (int i = 0; i < tOrder; i++)
outputUnitNum *= outputDimSize[i]; tUnitNum *= tDimSize[i];
/* a shift tensor of size 4 */ /* a shift tensor of size (4) */
int shiftOrder = 1; int shiftOrder = 1;
int * shiftDimSize = new int[shiftOrder]; int * shiftDimSize = new int[shiftOrder];
shiftDimSize[0] = 4; shiftDimSize[0] = 4;
...@@ -54,8 +56,8 @@ bool TestReduceSumSquared1() ...@@ -54,8 +56,8 @@ bool TestReduceSumSquared1()
for (int i = 0; i < shiftOrder; i++) for (int i = 0; i < shiftOrder; i++)
shiftUnitNum *= shiftDimSize[i]; shiftUnitNum *= shiftDimSize[i];
DTYPE inputData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} }; {4.0, 5.0, 6.0, 7.0} };
DTYPE shiftData[4] = {1.0, -1.0, -1.0, 0.0}; DTYPE shiftData[4] = {1.0, -1.0, -1.0, 0.0};
DTYPE answer[4] = {10.0, 40.0, 58.0, 58.0}; DTYPE answer[4] = {10.0, 40.0, 58.0, 58.0};
...@@ -63,51 +65,164 @@ bool TestReduceSumSquared1() ...@@ -63,51 +65,164 @@ bool TestReduceSumSquared1()
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * input = NewTensor(inputOrder, inputDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * output = NewTensor(outputOrder, outputDimSize); XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * shift = NewTensor(shiftOrder, shiftDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
shift->SetData(shiftData, shiftUnitNum);
t->SetZeroAll();
/* call ReduceSumSquared function */
ReduceSumSquared(s, t, 0, shift);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
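/* Worked check (editorial sketch): for dim = 0 each column j reduces to
   sum_i (s[i][j] - shift[j])^2, e.g. column 0: (0-1)^2 + (4-1)^2 = 10 and
   column 1: (1+1)^2 + (5+1)^2 = 40, matching answer = {10, 40, 58, 58}. */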
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensor(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU->SetData(shiftData, shiftUnitNum);
tGPU->SetZeroAll();
/* call ReduceSumSquared function */
ReduceSumSquared(sGPU, tGPU, 0, shiftGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete shift;
delete sGPU;
delete tGPU;
delete shiftGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete shift;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: squared sum of the items along a dimension of the tensor.
* For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
* In this case, (2, 4) -> (2), dim = 1.
*/
bool TestReduceSumSquared2()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (2) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a shift tensor of size (2) */
int shiftOrder = 1;
int * shiftDimSize = new int[shiftOrder];
shiftDimSize[0] = 2;
int shiftUnitNum = 1;
for (int i = 0; i < shiftOrder; i++)
shiftUnitNum *= shiftDimSize[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE shiftData[2] = {-1.0, 1.0};
DTYPE answer[2] = {30.0, 86.0};
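/* expected results for dim = 1: each row is reduced with its own shift,
 * e.g. row 0 with shift -1: (0 + 1)^2 + (1 + 1)^2 + (2 + 1)^2 + (3 + 1)^2 = 30 */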
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * shift = NewTensor(shiftOrder, shiftDimSize); XTensor * shift = NewTensor(shiftOrder, shiftDimSize);
/* initialize variables */ /* initialize variables */
input->SetData(inputData, inputUnitNum); s->SetData(sData, sUnitNum);
shift->SetData(shiftData, shiftUnitNum); shift->SetData(shiftData, shiftUnitNum);
output->SetZeroAll(); t->SetZeroAll();
/* call ReduceSumSquared function */ /* call ReduceSumSquared function */
ReduceSumSquared(input, output, 0, shift); ReduceSumSquared(s, t, 1, shift);
/* check results */ /* check results */
cpuTest = output->CheckData(answer, outputUnitNum); cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensors */ /* create tensors */
XTensor * inputGPU = NewTensor(inputOrder, inputDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * outputGPU = NewTensor(outputOrder, outputDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensor(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0); XTensor * shiftGPU = NewTensor(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
inputGPU->SetData(inputData, inputUnitNum); sGPU->SetData(sData, sUnitNum);
shiftGPU->SetData(shiftData, shiftUnitNum); shiftGPU->SetData(shiftData, shiftUnitNum);
outputGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call ReduceSumSquared function */ /* call ReduceSumSquared function */
ReduceSumSquared(inputGPU, outputGPU, 0, shiftGPU); ReduceSumSquared(sGPU, tGPU, 1, shiftGPU);
/* check results */ /* check results */
gpuTest = output->CheckData(answer, outputUnitNum); gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete input, output, shift; delete s;
delete inputGPU, outputGPU, shiftGPU; delete t;
delete[] inputDimSize, outputDimSize, shiftDimSize; delete shift;
delete sGPU;
delete tGPU;
delete shiftGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete input, output, shift; delete s;
delete[] inputDimSize, outputDimSize, shiftDimSize; delete t;
delete shift;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -122,7 +237,7 @@ TODO!! ...@@ -122,7 +237,7 @@ TODO!!
extern "C" extern "C"
bool TestReduceSumSquared() bool TestReduceSumSquared()
{ {
XPRINT(0, stdout, "[TEST ReduceSumSquared]\n"); XPRINT(0, stdout, "[TEST ReduceSumSquared] squared sum of the items along a dimension of the tensor\n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
...@@ -133,6 +248,15 @@ bool TestReduceSumSquared() ...@@ -133,6 +248,15 @@ bool TestReduceSumSquared()
} }
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceSumSquared2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */ /* other cases test */
/* /*
......
...@@ -19,33 +19,35 @@ ...@@ -19,33 +19,35 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/ */
#include "../XTensor.h"
#include "TReduceVariance.h" #include "TReduceVariance.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */ /* case 1: variance of the items along a dimension of the tensor.
* For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
* In this case, (2, 4) -> (4), dim = 0.
*/
bool TestReduceVariance1() bool TestReduceVariance1()
{ {
/* a input tensor of size 2 * 4 */ /* an input tensor of size (2, 4) */
int inputOrder = 2; int sOrder = 2;
int * inputDimSize = new int[inputOrder]; int * sDimSize = new int[sOrder];
inputDimSize[0] = 2; sDimSize[0] = 2;
inputDimSize[1] = 4; sDimSize[1] = 4;
int inputUnitNum = 1; int sUnitNum = 1;
for (int i = 0; i < inputOrder; i++) for (int i = 0; i < sOrder; i++)
inputUnitNum *= inputDimSize[i]; sUnitNum *= sDimSize[i];
/* a output tensor of size 1 */ /* an output tensor of size (4) */
int outputOrder = 1; int tOrder = 1;
int * outputDimSize = new int[outputOrder]; int * tDimSize = new int[tOrder];
outputDimSize[0] = 4; tDimSize[0] = 4;
int outputUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < outputOrder; i++) for (int i = 0; i < tOrder; i++)
outputUnitNum *= outputDimSize[i]; tUnitNum *= tDimSize[i];
/* a shift tensor of size 1 */ /* a mean tensor of size (4) */
int meanOrder = 1; int meanOrder = 1;
int * meanDimSize = new int[meanOrder]; int * meanDimSize = new int[meanOrder];
meanDimSize[0] = 4; meanDimSize[0] = 4;
...@@ -54,61 +56,70 @@ bool TestReduceVariance1() ...@@ -54,61 +56,70 @@ bool TestReduceVariance1()
for (int i = 0; i < meanOrder; i++) for (int i = 0; i < meanOrder; i++)
meanUnitNum *= meanDimSize[i]; meanUnitNum *= meanDimSize[i];
DTYPE inputData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE meanData[4] = {2.0, 3.0, 4.0, 5.0}; DTYPE meanData[4] = {2.0F, 3.0F, 4.0F, 5.0F};
DTYPE answer[4] = {4.0, 4.0, 4.0, 4.0}; DTYPE answer[4] = {4.0F, 4.0F, 4.0F, 4.0F};
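/* expected results for dim = 0,
 * e.g. column 0 with mean 2: ((0 - 2)^2 + (4 - 2)^2) / 2 = 4 */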
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * input = NewTensor(inputOrder, inputDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * output = NewTensor(outputOrder, outputDimSize); XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * mean = NewTensor(meanOrder, meanDimSize); XTensor * mean = NewTensor(meanOrder, meanDimSize);
/* initialize variables */ /* initialize variables */
input->SetData(inputData, inputUnitNum); s->SetData(sData, sUnitNum);
mean->SetData(meanData, meanUnitNum); mean->SetData(meanData, meanUnitNum);
output->SetZeroAll(); t->SetZeroAll();
/* call ReduceVariance function */ /* call ReduceVariance function */
ReduceVariance(input, output, 0, mean); ReduceVariance(s, t, 0, mean);
/* check results */ /* check results */
cpuTest = output->CheckData(answer, outputUnitNum); cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensors */ /* create tensors */
XTensor * inputGPU = NewTensor(inputOrder, inputDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * outputGPU = NewTensor(outputOrder, outputDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * meanGPU = NewTensor(meanOrder, meanDimSize, X_FLOAT, 1.0F, 0); XTensor * meanGPU = NewTensor(meanOrder, meanDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
inputGPU->SetData(inputData, inputUnitNum); sGPU->SetData(sData, sUnitNum);
meanGPU->SetData(meanData, meanUnitNum); meanGPU->SetData(meanData, meanUnitNum);
outputGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call ReduceVariance function */ /* call ReduceVariance function */
ReduceVariance(inputGPU, outputGPU, 0, meanGPU); ReduceVariance(sGPU, tGPU, 0, meanGPU);
/* check results */ /* check results */
gpuTest = output->CheckData(answer, outputUnitNum); gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete input, output, mean; delete s;
delete inputGPU, outputGPU, meanGPU; delete t;
delete[] inputDimSize, outputDimSize, meanDimSize; delete mean;
delete sGPU;
delete tGPU;
delete meanGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete input, output, mean; delete s;
delete[] inputDimSize, outputDimSize, meanDimSize; delete t;
delete mean;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -123,7 +134,7 @@ TODO!! ...@@ -123,7 +134,7 @@ TODO!!
extern "C" extern "C"
bool TestReduceVariance() bool TestReduceVariance()
{ {
XPRINT(0, stdout, "[TEST ReduceVariance]\n"); XPRINT(0, stdout, "[TEST ReduceVariance] variance of the items along a dimension of the tensor\n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/ */
#ifndef __TEST_REDUCEVARIANCE_H__ #ifndef __TEST_REDUCEVARIANCE_H__
......
...@@ -19,27 +19,28 @@ ...@@ -19,27 +19,28 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/ */
#include "../XTensor.h"
#include "TScaleAndShift.h" #include "TScaleAndShift.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */ /* case 1: scale and shift all tensor entries.
* p = p * scale + shift
*/
bool TestScaleAndShift1() bool TestScaleAndShift1()
{ {
/* a input tensor of size 2 * 4 */ /* an input tensor of size (2, 4) */
int inputOrder = 2; int sOrder = 2;
int * inputDimSize = new int[inputOrder]; int * sDimSize = new int[sOrder];
inputDimSize[0] = 2; sDimSize[0] = 2;
inputDimSize[1] = 4; sDimSize[1] = 4;
int inputUnitNum = 1; int sUnitNum = 1;
for (int i = 0; i < inputOrder; i++) for (int i = 0; i < sOrder; i++)
inputUnitNum *= inputDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE inputData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {0.5, 2.5, 4.5, 6.5}, DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
{8.5, 10.5, 12.5, 14.5} }; {8.5F, 10.5F, 12.5F, 14.5F} };
DTYPE scaleFactor = 2.0; DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 0.5; DTYPE shiftFactor = 0.5F;
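/* each entry p maps to p * scaleFactor + shiftFactor, e.g. 0.0 * 2.0 + 0.5 = 0.5 */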
...@@ -48,43 +49,43 @@ bool TestScaleAndShift1() ...@@ -48,43 +49,43 @@ bool TestScaleAndShift1()
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * input = NewTensor(inputOrder, inputDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
/* initialize variables */ /* initialize variables */
input->SetData(inputData, inputUnitNum); s->SetData(sData, sUnitNum);
/* call ScaleAndShift function */ /* call ScaleAndShift function */
ScaleAndShift(input, scaleFactor, shiftFactor); ScaleAndShift(s, scaleFactor, shiftFactor);
/* check results */ /* check results */
cpuTest = input->CheckData(answer, inputUnitNum); cpuTest = s->CheckData(answer, sUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensors */ /* create tensors */
XTensor * inputGPU = NewTensor(inputOrder, inputDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
inputGPU->SetData(inputData, inputUnitNum); sGPU->SetData(sData, sUnitNum);
/* call ScaleAndShift function */ /* call ScaleAndShift function */
ScaleAndShift(inputGPU, scaleFactor, shiftFactor); ScaleAndShift(sGPU, scaleFactor, shiftFactor);
/* check results */ /* check results */
gpuTest = inputGPU->CheckData(answer, inputUnitNum); gpuTest = sGPU->CheckData(answer, sUnitNum);
/* destroy variables */ /* destroy variables */
delete input; delete s;
delete inputGPU; delete sGPU;
delete[] inputDimSize; delete[] sDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete input; delete s;
delete[] inputDimSize; delete[] sDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -99,7 +100,7 @@ TODO!! ...@@ -99,7 +100,7 @@ TODO!!
extern "C" extern "C"
bool TestScaleAndShift() bool TestScaleAndShift()
{ {
XPRINT(0, stdout, "[TEST ScaleAndShift]\n"); XPRINT(0, stdout, "[TEST ScaleAndShift] scale and shift all tensor entires\n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-04
*/ */
#include "TSelect.h" #include "TSelect.h"
...@@ -25,10 +25,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -25,10 +25,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SelectRange function. /* case 1: test SelectRange function.
* It can generate a tensor with selected data * It can generate a tensor with selected data
* in the range [low, high) along the given dimension. * in the range [low, high) along the given dimension.
* In this case, (2, 2, 4) -> (2, 2, 2), dim = 2, low = 1, high = 3.
*/ */
bool TestSelect1() bool TestSelect1()
{ {
/* a input tensor of size (2, 4) */ /* an input tensor of size (2, 2, 4) */
int sOrder = 3; int sOrder = 3;
int * sDimSize = new int[sOrder]; int * sDimSize = new int[sOrder];
sDimSize[0] = 2; sDimSize[0] = 2;
...@@ -39,23 +40,25 @@ bool TestSelect1() ...@@ -39,23 +40,25 @@ bool TestSelect1()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
/* a output tensor of size (2, 2) */ /* an output tensor of size (2, 2, 2) */
int tOrder = 3; int tOrder = 3;
int * tDimSize = new int[tOrder]; int * tDimSize = new int[tOrder];
tDimSize[0] = 2; tDimSize[0] = 2;
tDimSize[1] = 1; tDimSize[1] = 2;
tDimSize[2] = 4; tDimSize[2] = 2;
int tUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData[2][2][4] = { { {0.0, 1.0, 2.0, 3.0}, DTYPE sData[2][2][4] = { { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }, {4.0F, 5.0F, 6.0F, 7.0F} },
{ {1.0, 2.0, 3.0, 4.0}, { {1.0F, 2.0F, 3.0F, 4.0F},
{5.0, 6.0, 7.0, 8.0} } }; {5.0F, 6.0F, 7.0F, 8.0F} } };
DTYPE answer[2][1][4] = { { {4.0, 5.0, 6.0, 7.0} }, DTYPE answer[2][2][2] = { { {1.0F, 2.0F},
{ {5.0, 6.0, 7.0, 8.0} } }; {5.0F, 6.0F} },
{ {2.0F, 3.0F},
{6.0F, 7.0F} } };
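/* SelectRange(s, 2, 1, 3, t) keeps indices 1 and 2 along dim 2,
 * i.e. the two middle entries of every row */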
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -69,7 +72,7 @@ bool TestSelect1() ...@@ -69,7 +72,7 @@ bool TestSelect1()
t->SetZeroAll(); t->SetZeroAll();
/* call SelectRange function */ /* call SelectRange function */
SelectRange(s, 1, 1, 2, t); SelectRange(s, 2, 1, 3, t);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum); cpuTest = t->CheckData(answer, tUnitNum);
...@@ -121,7 +124,7 @@ TODO!! ...@@ -121,7 +124,7 @@ TODO!!
extern "C" extern "C"
bool TestSelect() bool TestSelect()
{ {
XPRINT(0, stdout, "[TEST Select] scale and shift all tensor entires\n"); XPRINT(0, stdout, "[TEST Select] generate a tensor with seleccted data in range[low,high] along the given dimension \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-04
*/ */
#ifndef __TEST_SELECT_H__ #ifndef __TEST_SELECT_H__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSetAscendingOrder.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set the cells to ascending order along a given dimension.
*/
bool TestSetAscendingOrder1()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
int answer[2][4] = { {0, 1, 2, 3},
{0, 1, 2, 3} };
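/* after SetAscendingOrder(1), every cell holds its own index along dim 1 */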
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize, X_INT);
/* initialize variables */
s->SetZeroAll();
/* call SetAscendingOrder function */
s->SetAscendingOrder(1);
/* check results */
cpuTest = s->CheckData(answer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU->SetZeroAll();
/* call SetAscendingOrder function */
sGPU->SetAscendingOrder(1);
/* check results */
gpuTest = sGPU->CheckData(answer, sUnitNum);
/* destroy variables */
delete s;
delete sGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SetAscendingOrder Function */
extern "C"
bool TestSetAscendingOrder()
{
XPRINT(0, stdout, "[TEST SetAscendingOrder] set the cell to the ascending order along a given dimension \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSetAscendingOrder1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_SETASCENDINGORDER_H__
#define __TEST_SETASCENDINGORDER_H__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SetAscendingOrder Function */
extern "C"
bool TestSetAscendingOrder();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SETASCENDINGORDER_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set the tensor data with random values in the range [0.0, 1.0).
*/
bool TestSetData1()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE answer[2][4] = {0};
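/* SetDataRand(0.0, 1.0) draws values from [0.0, 1.0), so every entry
 * should fall within the 1.0F tolerance of the zero answer used below */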
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
/* call SetDataRand function */
s->SetDataRand(0.0, 1.0);
/* check results */
cpuTest = s->CheckData(answer, sUnitNum, 1.0F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* call SetDataRand function */
sGPU->SetDataRand(0.0, 1.0);
gpuTest = sGPU->CheckData(answer, sUnitNum, 1.0F);
/* destroy variables */
delete s;
delete sGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SetData Function */
extern "C"
bool TestSetData()
{
XPRINT(0, stdout, "[TEST SetData] set the data of tensor \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSetData1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_SETDATA_H__
#define __TEST_SETDATA_H__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SetData Function */
extern "C"
bool TestSetData();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SETDATA_H__
...@@ -19,7 +19,6 @@ ...@@ -19,7 +19,6 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-19 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-19
*/ */
#include "../XTensor.h"
#include "../XUtility.h" #include "../XUtility.h"
#include "TSigmoid.h" #include "TSigmoid.h"
...@@ -41,7 +40,9 @@ bool TestSigmoid1() ...@@ -41,7 +40,9 @@ bool TestSigmoid1()
DTYPE xData[3] = {0.0F, 1.0F, 2.0F}; DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[3] = {0.4F, 0.8F, 1.0F}; DTYPE gData[3] = {0.4F, 0.8F, 1.0F};
DTYPE answer[3]; DTYPE dedyData[3] = {-0.8F, -1.094F, -1.135F};
DTYPE yAnswer[3] = {0.5F, 0.731F, 0.881F};
DTYPE dedxAnswer[3] = {-0.2F, -0.215F, -0.119F};
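/* a worked check of the expected values, assuming SigmoidBackward
 * computes de/dx_i = de/dy_i * y_i * (1 - y_i):
 * y = sigmoid(x) = {0.5, 0.731, 0.881},
 * e.g. de/dx_0 = -0.8 * 0.5 * (1 - 0.5) = -0.2 */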
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -56,41 +57,18 @@ bool TestSigmoid1() ...@@ -56,41 +57,18 @@ bool TestSigmoid1()
/* initialize variables */ /* initialize variables */
x->SetData(xData, sUnitNum); x->SetData(xData, sUnitNum);
g->SetData(gData, sUnitNum); g->SetData(gData, sUnitNum);
dedy->SetData(dedyData, sUnitNum);
y->SetZeroAll(); y->SetZeroAll();
dedx->SetZeroAll(); dedx->SetZeroAll();
/* call Sigmoid function */ /* call Sigmoid function */
Sigmoid(x, y); Sigmoid(x, y);
/* cross_entropy: de/dy_i = -t_i / y_i */
DTYPE dedyData[3];
DTYPE * yProcessedData = (DTYPE*)y->data;
for (int i = 0; i < sUnitNum; i++)
dedyData[i] = - gData[i] / yProcessedData[i];
/* initialize variables */
dedy->SetData(dedyData, sUnitNum);
for (int i = 0; i < sUnitNum; i++)
answer[i] = dedyData[i] * yProcessedData[i] * (1 - yProcessedData[i]);
/* call SigmoidBackward function */ /* call SigmoidBackward function */
SigmoidBackward(g, y, x, dedy, dedx, NOLOSS); SigmoidBackward(g, y, x, dedy, dedx, NOLOSS);
/* check result */ /* check result */
printf("CPU Test:\n"); cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
printf("Computer de/dx:");
DTYPE * checkData = (DTYPE*)dedx->data;
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", checkData[i]);
}
printf("\n");
printf("Real de/dx:");
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", answer[i]);
}
printf("\n");
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -106,57 +84,40 @@ bool TestSigmoid1() ...@@ -106,57 +84,40 @@ bool TestSigmoid1()
/* initialize variables */ /* initialize variables */
xGPU->SetData(xData, sUnitNum); xGPU->SetData(xData, sUnitNum);
gGPU->SetData(gData, sUnitNum); gGPU->SetData(gData, sUnitNum);
dedyGPU->SetData(dedyData, sUnitNum);
yGPU->SetZeroAll(); yGPU->SetZeroAll();
dedxGPU->SetZeroAll(); dedxGPU->SetZeroAll();
/* call Sigmoid function */ /* call Sigmoid function */
Sigmoid(xGPU, yGPU); Sigmoid(xGPU, yGPU);
/* cross_entropy: de/dy_i = -t_i / y_i */
void * yProcessedDataGPU = (DTYPE*)yGPU->data;
int size = sUnitNum * yGPU->unitSize;
DTYPE * copy = new DTYPE[size];
XMemCopy(copy, -1, yProcessedDataGPU, yGPU->devID, size);
for (int i = 0; i < sUnitNum; i++) {
dedyData[i] = - gData[i] / *copy++;
}
/* initialize variables */
dedyGPU->SetData(dedyData, sUnitNum);
for (int i = 0; i < sUnitNum; i++)
answer[i] = dedyData[i] * yProcessedData[i] * (1 - yProcessedData[i]);
/* call SigmoidBackward function */ /* call SigmoidBackward function */
SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NOLOSS); SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NOLOSS);
/* check result */ /* check result */
printf("\nGPU Test:\n"); gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
printf("Computer de/dx:");
checkData = (DTYPE*)dedxGPU->data;
size = sUnitNum * dedxGPU->unitSize;
DTYPE * copy1 = new DTYPE[size];
XMemCopy(copy1, -1, checkData, dedxGPU->devID, size);
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", copy1[i]);
}
printf("\n");
printf("Real de/dx:");
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", answer[i]);
}
printf("\n\n");
/* destroy variables */ /* destroy variables */
delete x, y, g, dedx, dedy; delete x;
delete xGPU, yGPU, gGPU, dedxGPU, dedyGPU; delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] sDimSize; delete[] sDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete x, y, g, dedx, dedy; delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] sDimSize; delete[] sDimSize;
return cpuTest; return cpuTest;
...@@ -180,7 +141,9 @@ bool TestSigmoid2() ...@@ -180,7 +141,9 @@ bool TestSigmoid2()
DTYPE xData[3] = {0.0F, 1.0F, 2.0F}; DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[3] = {0.4F, 0.8F, 1.0F}; DTYPE gData[3] = {0.4F, 0.8F, 1.0F};
DTYPE answer[3] = {0.0F, 0.0F, 0.0F}; DTYPE dedyData[3] = {-0.8F, -1.094F, -1.135F};
DTYPE yAnswer[3] = {0.5F, 0.731F, 0.881F};
DTYPE dedxAnswer[3] = {-0.2F, -0.215F, -0.119F};
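/* with CROSSENTROPY the dedy tensor is left zeroed: SigmoidBackward is
 * expected to derive de/dy_i = -g_i / y_i itself, so dedyData is unused
 * here and de/dx matches the case 1 values */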
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -195,29 +158,21 @@ bool TestSigmoid2() ...@@ -195,29 +158,21 @@ bool TestSigmoid2()
/* initialize variables */ /* initialize variables */
x->SetData(xData, sUnitNum); x->SetData(xData, sUnitNum);
g->SetData(gData, sUnitNum); g->SetData(gData, sUnitNum);
dedy->SetZeroAll();
y->SetZeroAll(); y->SetZeroAll();
dedx->SetZeroAll(); dedx->SetZeroAll();
/* call Sigmoid function */ /* call Sigmoid function */
Sigmoid(x, y); Sigmoid(x, y);
/* initialize variables */
dedy->SetData(dedyData, sUnitNum);
/* call SigmoidBackward function */ /* call SigmoidBackward function */
SigmoidBackward(g, y, x, dedy, dedx, CROSSENTROPY); SigmoidBackward(g, y, x, dedy, dedx, CROSSENTROPY);
/* check result */ /* check result */
printf("CPU Test:\n"); cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
printf("Computer de/dx:");
DTYPE * checkData = (DTYPE*)dedx->data;
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", checkData[i]);
}
printf("\n");
printf("Real de/dx:");
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", answer[i]);
}
printf("\n");
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -233,6 +188,7 @@ bool TestSigmoid2() ...@@ -233,6 +188,7 @@ bool TestSigmoid2()
/* initialize variables */ /* initialize variables */
xGPU->SetData(xData, sUnitNum); xGPU->SetData(xData, sUnitNum);
gGPU->SetData(gData, sUnitNum); gGPU->SetData(gData, sUnitNum);
dedyGPU->SetZeroAll();
yGPU->SetZeroAll(); yGPU->SetZeroAll();
dedxGPU->SetZeroAll(); dedxGPU->SetZeroAll();
...@@ -243,32 +199,29 @@ bool TestSigmoid2() ...@@ -243,32 +199,29 @@ bool TestSigmoid2()
SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY); SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
/* check result */ /* check result */
printf("\nGPU Test:\n"); gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
printf("Computer de/dx:");
checkData = (DTYPE*)dedxGPU->data;
int size = sUnitNum * dedxGPU->unitSize;
DTYPE * copy1 = new DTYPE[size];
XMemCopy(copy1, -1, checkData, dedxGPU->devID, size);
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", copy1[i]);
}
printf("\n");
printf("Real de/dx:");
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", answer[i]);
}
printf("\n\n");
/* destroy variables */ /* destroy variables */
delete x, y, g, dedx, dedy; delete x;
delete xGPU, yGPU, gGPU, dedxGPU, dedyGPU; delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] sDimSize; delete[] sDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete x, y, g, dedx, dedy; delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] sDimSize; delete[] sDimSize;
return cpuTest; return cpuTest;
...@@ -284,7 +237,7 @@ bool TestSigmoid2() ...@@ -284,7 +237,7 @@ bool TestSigmoid2()
extern "C" extern "C"
bool TestSigmoid() bool TestSigmoid()
{ {
XPRINT(0, stdout, "[TEST SIGMOID] -------------\n"); XPRINT(0, stdout, "[TEST SIGMOID] sigmoid function and its backward computation \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -59,21 +59,7 @@ bool TestSoftmax1() ...@@ -59,21 +59,7 @@ bool TestSoftmax1()
Softmax(x, y, 1); Softmax(x, y, 1);
/* check result */ /* check result */
printf("CPU Test:\n"); cpuTest = y->CheckData(answer, sUnitNum);
printf("Softmax Result:");
DTYPE * checkData = (DTYPE*)y->data;
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", checkData[i]);
}
printf("\n");
printf("Real Result:");
for (int i = 0; i < sDimSize[0]; i++) {
for (int j = 0; j < sDimSize[1]; j++) {
printf("\t%f", answer[i][j]);
}
}
printf("\n");
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -91,28 +77,13 @@ bool TestSoftmax1() ...@@ -91,28 +77,13 @@ bool TestSoftmax1()
Softmax(xGPU, yGPU, 1); Softmax(xGPU, yGPU, 1);
/* check result */ /* check result */
printf("\nGPU Test:\n"); gpuTest = yGPU->CheckData(answer, sUnitNum);
printf("Computer de/dx:");
checkData = (DTYPE*)yGPU->data;
int size = sUnitNum * yGPU->unitSize;
DTYPE * copy = new DTYPE[size];
XMemCopy(copy, -1, checkData, yGPU->devID, size);
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", copy[i]);
}
printf("\n");
printf("Real Result:");
for (int i = 0; i < sDimSize[0]; i++) {
for (int j = 0; j < sDimSize[1]; j++) {
printf("\t%f", answer[i][j]);
}
}
printf("\n");
/* destroy variables */ /* destroy variables */
delete x, y; delete x;
delete xGPU, yGPU; delete y;
delete xGPU;
delete yGPU;
delete[] sDimSize; delete[] sDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
...@@ -130,6 +101,7 @@ bool TestSoftmax1() ...@@ -130,6 +101,7 @@ bool TestSoftmax1()
*/ */
bool TestSoftmax2() bool TestSoftmax2()
{ {
/* an input tensor of size (1, 3) */
int sOrder = 2; int sOrder = 2;
int * sDimSize = new int[sOrder]; int * sDimSize = new int[sOrder];
sDimSize[0] = 1; sDimSize[0] = 1;
...@@ -141,7 +113,7 @@ bool TestSoftmax2() ...@@ -141,7 +113,7 @@ bool TestSoftmax2()
DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} }; DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} }; DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
DTYPE answer[3] = {0.090031F, 0.244728F, -0.334759F}; DTYPE dedxAnswer[3] = {0.090031F, 0.244728F, -0.334759F};
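/* a worked check, assuming the CROSSENTROPY branch computes de/dx = y - g:
 * y = softmax({0, 1, 2}) = {0.090031, 0.244728, 0.665241},
 * so de/dx = {0.090031, 0.244728, -0.334759} */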
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -163,31 +135,10 @@ bool TestSoftmax2() ...@@ -163,31 +135,10 @@ bool TestSoftmax2()
/* call Softmax function */ /* call Softmax function */
Softmax(x, y, 1); Softmax(x, y, 1);
/* check result */
printf("CPU Test:\n");
printf("Softmax Result:");
DTYPE * checkData = (DTYPE*)y->data;
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", checkData[i]);
}
printf("\n");
/* call SoftmaxBackward function */
SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY); SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
/* check result */ /* check result */
printf("Computer de/dx:"); cpuTest = dedx->CheckData(dedxAnswer, sUnitNum);
checkData = (DTYPE*)dedx->data;
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", checkData[i]);
}
printf("\n");
printf("Real de/dx:");
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", answer[i]);
}
printf("\n");
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -210,44 +161,33 @@ bool TestSoftmax2() ...@@ -210,44 +161,33 @@ bool TestSoftmax2()
/* call Softmax function */ /* call Softmax function */
Softmax(xGPU, yGPU, 1); Softmax(xGPU, yGPU, 1);
/* check result */
printf("\nGPU Test:\n");
printf("Softmax Result:");
checkData = (DTYPE*)y->data;
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", checkData[i]);
}
printf("\n");
/* call SoftmaxBackward function */ /* call SoftmaxBackward function */
SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY); SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
/* check result */ /* check result */
printf("Computer de/dx:"); gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum);
checkData = (DTYPE*)dedxGPU->data;
int size = sUnitNum * dedxGPU->unitSize;
DTYPE * copy = new DTYPE[size];
XMemCopy(copy, -1, checkData, dedxGPU->devID, size);
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", copy[i]);
}
printf("\n");
printf("Real de/dx:");
for (int i = 0; i < sUnitNum; i++) {
printf("\t%f", answer[i]);
}
printf("\n");
/* destroy variables */ /* destroy variables */
delete x, y, g, dedx, dedy; delete x;
delete xGPU, yGPU, gGPU, dedxGPU, dedyGPU; delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] sDimSize; delete[] sDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete x, y, g, dedx, dedy; delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] sDimSize; delete[] sDimSize;
return cpuTest; return cpuTest;
...@@ -263,7 +203,7 @@ bool TestSoftmax2() ...@@ -263,7 +203,7 @@ bool TestSoftmax2()
extern "C" extern "C"
bool TestSoftmax() bool TestSoftmax()
{ {
XPRINT(0, stdout, "[TEST SOFTMAX] -------------\n"); XPRINT(0, stdout, "[TEST SOFTMAX] softmax function and its backward computation \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -19,15 +19,13 @@ ...@@ -19,15 +19,13 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
*/ */
#include "../XTensor.h" #include "TSort.h"
#include "../XDevice.h"
#include "../core/Sort.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */ /* case 1: sort the tensor along a given dimension (dim = 0) */
bool TestSort1() bool TestSort1()
{ {
/* a tensor of size 2 * 4 */ /* a tensor of size (2, 4) */
int order = 2; int order = 2;
int * dimSize = new int[order]; int * dimSize = new int[order];
dimSize[0] = 2; dimSize[0] = 2;
...@@ -37,33 +35,25 @@ bool TestSort1() ...@@ -37,33 +35,25 @@ bool TestSort1()
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; unitNum *= dimSize[i];
DTYPE aData[2][4] = { { 0.0, 1.0, 2.0, 3.0 }, DTYPE aData[2][4] = { { 0.0F, 1.0F, 2.0F, 3.0F },
{ 4.0, 5.0, 6.0, 7.0 } }; { 4.0F, 5.0F, 6.0F, 7.0F } };
DTYPE answer[2][4] = { { 4.0, 5.0, 6.0, 7.0 }, DTYPE answer[2][4] = { { 4.0F, 5.0F, 6.0F, 7.0F },
{ 0.0, 1.0, 2.0, 3.0 } }; { 0.0F, 1.0F, 2.0F, 3.0F } };
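/* Sort here orders the values in descending order along dim 0, so the two
 * rows swap; the X_INT tensor b receives the original row indices */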
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * a = NewTensor(order, dimSize); XTensor * a = NewTensor(order, dimSize);
XTensor * b = NewTensor(order, dimSize); XTensor * b = NewTensor(order, dimSize, X_INT);
b->dataType = X_INT;
/* initialize variables */ /* initialize variables */
a->SetData(aData, unitNum); a->SetData(aData, unitNum);
b->SetZeroAll();
/* call sort function */ /* call Sort function */
Sort(a, b, 0); Sort(a, b, 0);
DTYPE* check1 = (DTYPE*)a->data;
for (int i = 0; i < 8; i++)
printf("%f ", *check1++);
printf("\n");
int* check2 = (int*)b->data;
for (int i = 0; i < 8; i++)
printf("%d ", *check2++);
printf("\n");
/* check results */
cpuTest = a->CheckData(answer, unitNum); cpuTest = a->CheckData(answer, unitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -71,11 +61,12 @@ bool TestSort1() ...@@ -71,11 +61,12 @@ bool TestSort1()
bool gpuTest = true; bool gpuTest = true;
/* create tensor */ /* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * bGPU = NewTensor(order, dimSize, X_INT, 1.0F, 0);
bGPU->dataType = X_INT;
/* Initialize variables */ /* Initialize variables */
aGPU->SetData(aData, unitNum); aGPU->SetData(aData, unitNum);
bGPU->SetZeroAll();
/* call Sort function */ /* call Sort function */
Sort(aGPU, bGPU, 0); Sort(aGPU, bGPU, 0);
...@@ -84,21 +75,26 @@ bool TestSort1() ...@@ -84,21 +75,26 @@ bool TestSort1()
gpuTest = aGPU->CheckData(answer, unitNum); gpuTest = aGPU->CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a, b, aGPU, bGPU; delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] dimSize; delete[] dimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete a; delete a;
delete b; delete b;
delete[] dimSize; delete[] dimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
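/* case 2: sort the tensor along a given dimension (dim = 1) */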
bool TestSort2() bool TestSort2()
{ {
/* a tensor of size 2 * 4 */ /* a tensor of size (2, 4) */
int order = 2; int order = 2;
int * dimSize = new int[order]; int * dimSize = new int[order];
dimSize[0] = 2; dimSize[0] = 2;
...@@ -118,22 +114,14 @@ bool TestSort2() ...@@ -118,22 +114,14 @@ bool TestSort2()
/* create tensors */ /* create tensors */
XTensor * a = NewTensor(order, dimSize); XTensor * a = NewTensor(order, dimSize);
XTensor * b = NewTensor(order, dimSize); XTensor * b = NewTensor(order, dimSize, X_INT);
b->dataType = X_INT;
/* initialize variables */ /* initialize variables */
a->SetData(aData, unitNum); a->SetData(aData, unitNum);
/* call sort function */ /* call Sort function */
Sort(a, b, 1); Sort(a, b, 1);
DTYPE* check1 = (DTYPE*)a->data;
for (int i = 0; i < 8; i++)
printf("%f ", *check1++);
printf("\n");
int* check2 = (int*)b->data;
for (int i = 0; i < 8; i++)
printf("%d ", *check2++);
printf("\n");
/* check results */ /* check results */
cpuTest = a->CheckData(answer, unitNum); cpuTest = a->CheckData(answer, unitNum);
...@@ -142,9 +130,9 @@ bool TestSort2() ...@@ -142,9 +130,9 @@ bool TestSort2()
bool gpuTest = true; bool gpuTest = true;
/* create tensor */ /* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT); XTensor * bGPU = NewTensor(order, dimSize, X_INT, 1.0F, 0);
bGPU->dataType = X_INT;
/* Initialize variables */ /* Initialize variables */
aGPU->SetData(aData, unitNum); aGPU->SetData(aData, unitNum);
...@@ -155,27 +143,33 @@ bool TestSort2() ...@@ -155,27 +143,33 @@ bool TestSort2()
gpuTest = aGPU->CheckData(answer, unitNum); gpuTest = aGPU->CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a, b, aGPU, bGPU; delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] dimSize; delete[] dimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete a; delete a;
delete b; delete b;
delete[] dimSize; delete[] dimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
*/ */
/* test for Sum Function */ /* test for Sort Function */
extern "C" extern "C"
bool TestSort() bool TestSort()
{ {
XPRINT(0, stdout, "[TEST SORT]\n"); XPRINT(0, stdout, "[TEST SORT] sort the tensor along a given dimension \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
...@@ -195,6 +189,7 @@ bool TestSort() ...@@ -195,6 +189,7 @@ bool TestSort()
} }
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -26,9 +26,9 @@ ...@@ -26,9 +26,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Sum Function */ /* test for Sort Function */
extern "C" extern "C"
bool TestSort(); bool TestSort();
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUM_H__ #endif // __TEST_SORT_H__
...@@ -19,15 +19,13 @@ ...@@ -19,15 +19,13 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
*/ */
#include "../XTensor.h" #include "TSum.h"
#include "../XDevice.h"
#include "../core/Sum.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */ /* case 1: tensor summation, with the result written back into a */
bool TestSum1() bool TestSum1()
{ {
/* a tensor of size 2 * 4 */ /* a tensor of size (2, 4) */
int order = 2; int order = 2;
int * dimSize = new int[order]; int * dimSize = new int[order];
dimSize[0] = 2; dimSize[0] = 2;
...@@ -80,14 +78,19 @@ bool TestSum1() ...@@ -80,14 +78,19 @@ bool TestSum1()
gpuTest = aGPU->CheckData(answer, unitNum); gpuTest = aGPU->CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a, b, aGPU, bGPU; delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] dimSize; delete[] dimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete a; delete a;
delete b; delete b;
delete[] dimSize; delete[] dimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
...@@ -95,7 +98,7 @@ bool TestSum1() ...@@ -95,7 +98,7 @@ bool TestSum1()
/* case 2 */ /* case 2: tensor summation with a separate result tensor c */
bool TestSum2() bool TestSum2()
{ {
/* a tensor of size 2 * 4 */ /* a tensor of size (2, 4) */
int order = 2; int order = 2;
int * dimSize = new int[order]; int * dimSize = new int[order];
dimSize[0] = 2; dimSize[0] = 2;
...@@ -153,8 +156,14 @@ bool TestSum2() ...@@ -153,8 +156,14 @@ bool TestSum2()
gpuTest = cGPU->CheckData(answer, unitNum); gpuTest = cGPU->CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a, b, c, aGPU, bGPU, cGPU; delete a;
delete b;
delete c;
delete aGPU;
delete bGPU;
delete cGPU;
delete[] dimSize; delete[] dimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
...@@ -162,6 +171,7 @@ bool TestSum2() ...@@ -162,6 +171,7 @@ bool TestSum2()
delete b; delete b;
delete c; delete c;
delete[] dimSize; delete[] dimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SumByColumnTV function
* sum of a tensor and a vector (column vector)
* in a column by column manner
*/
bool TestSumByColumnTV1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 1) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 1;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE bData[2][1] = { {1.0},
{0.0} };
DTYPE answer[2][4] = { {1.0, 2.0, 3.0, 4.0},
{4.0, 5.0, 6.0, 7.0} };
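/* b is broadcast over the columns: row 0 gains +1.0 and row 1 gains +0.0 */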
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(cOrder, cDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call SumByColumnTV function */
SumByColumnTV(a, b, c);
/* check results */
cpuTest = c->CheckData(answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SumByColumnTV function */
SumByColumnTV(aGPU, bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete aGPU;
delete bGPU;
delete cGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: test SumByColumnTV function
* sum of a tensor and a vector (column vector)
* in a column by column manner
*/
bool TestSumByColumnTV2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 1) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 1;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE bData[2][1] = { {1.0},
{0.0} };
DTYPE answer[2][4] = { {1.0, 2.0, 3.0, 4.0},
{4.0, 5.0, 6.0, 7.0} };
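/* same data as case 1, but the two-tensor form writes the sum back into a in place */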
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call SumByColumnTV function */
SumByColumnTV(a, b);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call SumByColumnTV function */
SumByColumnTV(aGPU, bGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SumByColumnTV Function */
extern "C"
bool TestSumByColumnTV()
{
XPRINT(0, stdout, "[TEST SumByColumnTV] sum of a tensor and a vector (column vector) in a column by column manner \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSumByColumnTV1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSumByColumnTV2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_SUMBYCOLUMNTV_H__
#define __TEST_SUMBYCOLUMNTV_H__
#include "../core/SumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SumByColumnTV Function */
extern "C"
bool TestSumByColumnTV();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUMBYCOLUMNTV_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SumByColumnVT function
* sum of a vector (column vector) and a tensor
* in a column by column manner
*/
bool TestSumByColumnVT1()
{
/* a tensor of size (2, 1) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 1;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 4) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (2, 1) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 1;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][1] = { {1.0},
{0.0} };
DTYPE bData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE answer[2][1] = { {7.0},
{22.0} };
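/* each entry of a accumulates the sum of its row of b:
 * 1.0 + (0 + 1 + 2 + 3) = 7 and 0.0 + (4 + 5 + 6 + 7) = 22 */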
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(cOrder, cDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SumByColumnVT function */
SumByColumnVT(a, b, c);
/* check results */
cpuTest = c->CheckData(answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SumByColumnVT function */
SumByColumnVT(aGPU, bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete aGPU;
delete bGPU;
delete cGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: test SumByColumnVT function
* sum of a vector (column vector) and a tensor
* in a column by column manner
*/
bool TestSumByColumnVT2()
{
/* a tensor of size (2, 1) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 1;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 4) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][1] = { {1.0},
{0.0} };
DTYPE bData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE answer[2][1] = { {7.0},
{22.0} };
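/* same data as case 1, but the two-tensor form accumulates the result into a in place */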
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call SumByColumnVT function */
SumByColumnVT(a, b);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call SumByColumnVT function */
SumByColumnVT(aGPU, bGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SumByColumnVT Function */
extern "C"
bool TestSumByColumnVT()
{
XPRINT(0, stdout, "[TEST SumByColumnVT] sum of a vector (column vector) and a tensor in a column by column manner \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSumByColumnVT1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSumByColumnVT2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_SUMBYCOLUMNVT_H__
#define __TEST_SUMBYCOLUMNVT_H__
#include "../core/SumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SumByColumnVT Function */
extern "C"
bool TestSumByColumnVT();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUMBYCOLUMNVT_H__
...@@ -19,146 +19,243 @@ ...@@ -19,146 +19,243 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/ */
#include "../XTensor.h"
#include "TTopK.h" #include "TTopK.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */ /* case 1: get the top-k items along a given dimension.
* In this case,
* (2, 4) -> (2, 4), dim = 0, k = 2
* (2, 4) -> (2, 4), dim = 1, k = 4
*/
bool TestTopK1() bool TestTopK1()
{ {
/* a input tensor of size 2 * 4 */ /* an input tensor of size (2, 4) */
int inputOrder = 2; int sOrder = 2;
int * inputDimSize = new int[inputOrder]; int * sDimSize = new int[sOrder];
inputDimSize[0] = 2; sDimSize[0] = 2;
inputDimSize[1] = 4; sDimSize[1] = 4;
int inputUnitNum = 1; int sUnitNum = 1;
for (int i = 0; i < inputOrder; i++) for (int i = 0; i < sOrder; i++)
inputUnitNum *= inputDimSize[i]; sUnitNum *= sDimSize[i];
/* a output tensor of size 2 * 4 */ /* an output tensor of size (2, 4) */
int outputOrder = 2; int tOrder = 2;
int * outputDimSize = new int[outputOrder]; int * tDimSize = new int[tOrder];
outputDimSize[0] = 2; tDimSize[0] = 2;
outputDimSize[1] = 4; tDimSize[1] = 4;
int outputUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < outputOrder; i++) for (int i = 0; i < tOrder; i++)
outputUnitNum *= outputDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE inputData[2][4] = { {5.0, 1.0, 2.0, 8.0}, DTYPE sData[2][4] = { {5.0, 1.0, 2.0, 8.0},
{4.0, 3.0, 7.0, 6.0} }; {4.0, 3.0, 7.0, 6.0} };
DTYPE outputAnswerA[2][4] = { {5.0, 3.0, 7.0, 8.0},
{4.0, 1.0, 2.0, 6.0} }; DTYPE tAnswer1[2][4] = { {5.0, 3.0, 7.0, 8.0},
int indexAnswerA[2][4] = { {0, 1, 1, 0}, {4.0, 1.0, 2.0, 6.0} };
{1, 0, 0, 1} }; int indexAnswer1[2][4] = { {0, 1, 1, 0},
{1, 0, 0, 1} };
DTYPE outputAnswerB[2][4] = { {8.0, 5.0, 2.0, 1.0},
{7.0, 6.0, 4.0, 3.0} }; DTYPE tAnswer2[2][4] = { {8.0, 5.0, 2.0, 1.0},
int indexAnswerB[2][4] = { {3, 0, 2, 1}, {7.0, 6.0, 4.0, 3.0} };
int indexAnswer2[2][4] = { {3, 0, 2, 1},
{2, 3, 0, 1} }; {2, 3, 0, 1} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * input = NewTensor(inputOrder, inputDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * outputA = NewTensor(outputOrder, outputDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * outputB = NewTensor(outputOrder, outputDimSize); XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * indexA = NewTensor(outputOrder, outputDimSize, X_INT); XTensor * index1 = NewTensor(tOrder, tDimSize, X_INT);
XTensor * indexB = NewTensor(outputOrder, outputDimSize, X_INT); XTensor * index2 = NewTensor(tOrder, tDimSize, X_INT);
/* initialize variables */ /* initialize variables */
input->SetData(inputData, inputUnitNum); s->SetData(sData, sUnitNum);
outputA->SetZeroAll(); t1->SetZeroAll();
outputB->SetZeroAll(); t2->SetZeroAll();
indexA->SetZeroAll(); index1->SetZeroAll();
indexB->SetZeroAll(); index2->SetZeroAll();
/* call TopK function */ /* call TopK function */
int dim = 0; int dim = 0;
int k = inputDimSize[dim]; int k = sDimSize[dim];
TopK(input, outputA, indexA, dim, k); TopK(s, t1, index1, dim, k);
dim = 1; dim = 1;
k = inputDimSize[dim]; k = sDimSize[dim];
TopK(input, outputB, indexB, dim, k); TopK(s, t2, index2, dim, k);
// {
// /* CPU check data */
// printf("CPU output data:\n");
// XTensor * tensor1 = outputA;
//DTYPE * checkData = (DTYPE*)tensor1->data;
//for (int i = 0; i < tensor1->unitNum; i++)
// printf("%.2f\t", *checkData++);
//printf("\n");
// }
// {
// /* CPU index data */
// printf("CPU index data:\n");
// XTensor * tensor1 = index;
//int * checkData = (int*)tensor1->data;
//for (int i = 0; i < tensor1->unitNum; i++)
// printf("%d\t", *checkData++);
//printf("\n");
// }
/* check results */ /* check results */
cpuTest = outputA->CheckData(outputAnswerA, outputUnitNum) && cpuTest = t1->CheckData(tAnswer1, tUnitNum) &&
outputB->CheckData(outputAnswerB, outputUnitNum) && t2->CheckData(tAnswer2, tUnitNum) &&
indexA->CheckData(indexAnswerA, outputUnitNum) && index1->CheckData(indexAnswer1, tUnitNum) &&
indexB->CheckData(indexAnswerB, outputUnitNum); index2->CheckData(indexAnswer2, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensors */ /* create tensors */
XTensor * inputGPU = NewTensor(inputOrder, inputDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * outputGPUA = NewTensor(outputOrder, outputDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * outputGPUB = NewTensor(outputOrder, outputDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU2 = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPUA = NewTensor(outputOrder, outputDimSize, X_INT, 1.0F, 0); XTensor * indexGPU1 = NewTensor(tOrder, tDimSize, X_INT, 1.0F, 0);
XTensor * indexGPUB = NewTensor(outputOrder, outputDimSize, X_INT, 1.0F, 0); XTensor * indexGPU2 = NewTensor(tOrder, tDimSize, X_INT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
inputGPU->SetData(inputData, inputUnitNum); sGPU->SetData(sData, sUnitNum);
outputGPUA->SetZeroAll(); tGPU1->SetZeroAll();
outputGPUB->SetZeroAll(); tGPU2->SetZeroAll();
indexGPUA->SetZeroAll(); indexGPU1->SetZeroAll();
indexGPUB->SetZeroAll(); indexGPU2->SetZeroAll();
/* call TopK function */ /* call TopK function */
dim = 0; dim = 0;
k = inputDimSize[dim]; k = sDimSize[dim];
TopK(inputGPU, outputGPUA, indexGPUA, dim, k); TopK(sGPU, tGPU1, indexGPU1, dim, k);
dim = 1; dim = 1;
k = inputDimSize[dim]; k = sDimSize[dim];
TopK(inputGPU, outputGPUB, indexGPUB, dim, k); TopK(sGPU, tGPU2, indexGPU2, dim, k);
/* check results */ /* check results */
gpuTest = outputGPUA->CheckData(outputAnswerA, outputUnitNum) && gpuTest = tGPU1->CheckData(tAnswer1, tUnitNum) &&
outputGPUB->CheckData(outputAnswerB, outputUnitNum) && tGPU2->CheckData(tAnswer2, tUnitNum) &&
indexGPUA->CheckData(indexAnswerA, outputUnitNum) && indexGPU1->CheckData(indexAnswer1, tUnitNum) &&
indexGPUB->CheckData(indexAnswerB, outputUnitNum); indexGPU2->CheckData(indexAnswer2, tUnitNum);
/* destroy variables */ /* destroy variables */
delete input, outputA, outputB, indexA, indexB; delete s;
delete inputGPU, outputGPUA, outputGPUB, indexGPUA, indexGPUB; delete t1;
delete[] inputDimSize, outputDimSize; delete t2;
delete index1;
delete index2;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete indexGPU1;
delete indexGPU2;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete input, outputA, outputB, indexA, indexB; delete s;
delete t1;
delete t2;
delete index1;
delete index2;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
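/* A standalone sketch of the behaviour these cases check, assuming TopK
 * returns, for each slice along dim, the k largest values in descending
 * order together with their positions in the source slice. The helper is
 * hypothetical (not part of the library) and handles the row case
 * (dim = 1) of a 2-D tensor only. */
static void TopKRowRef(const DTYPE * row, int n, int k,
                       DTYPE * values, int * indices)
{
    /* simple O(n * k) selection; assumes n <= 16 for this sketch */
    bool used[16] = {false};
    for (int i = 0; i < k; i++) {
        int best = -1;
        for (int j = 0; j < n; j++)
            if (!used[j] && (best < 0 || row[j] > row[best]))
                best = j;                /* largest unused item so far */
        used[best] = true;
        values[i] = row[best];           /* i-th largest value */
        indices[i] = best;               /* its position in the source */
    }
}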
/* case 2: get the top-k items along a given dimension.
* In this case,
* (2, 4) -> (2, 2), dim = 1, k = 2
*/
bool TestTopK2()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
delete[] inputDimSize, outputDimSize; int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[2][4] = { {5.0, 1.0, 2.0, 8.0},
{4.0, 3.0, 7.0, 6.0} };
DTYPE tAnswer[2][2] = { {8.0, 5.0},
{7.0, 6.0} };
int indexAnswer[2][2] = { {3, 0},
{2, 3} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(tOrder, tDimSize, X_INT);
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
index->SetZeroAll();
/* call TopK function */
int dim = 1;
int k = tDimSize[dim];
TopK(s, t, index, dim, k);
/* check results */
cpuTest = t->CheckData(tAnswer, tUnitNum) && index->CheckData(indexAnswer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(tOrder, tDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
indexGPU->SetZeroAll();
/* call TopK function */
dim = 1;
k = tDimSize[dim];
TopK(sGPU, tGPU, indexGPU, dim, k);
/* check results */
gpuTest = tGPU->CheckData(tAnswer, tUnitNum) && indexGPU->CheckData(indexAnswer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete index;
delete sGPU;
delete tGPU;
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete index;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
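/* Worked check of case 2 against the TopKRowRef sketch above: for row
 * {5, 1, 2, 8} with k = 2 the two largest values are {8, 5} at indices
 * {3, 0}, and for row {4, 3, 7, 6} they are {7, 6} at {2, 3}, matching
 * tAnswer and indexAnswer. */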
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -168,9 +265,9 @@ TODO!! ...@@ -168,9 +265,9 @@ TODO!!
extern "C" extern "C"
bool TestTopK() bool TestTopK()
{ {
XPRINT(0, stdout, "[TEST TopK]\n"); XPRINT(0, stdout, "[TEST TopK] get the top-k items along a given dimension\n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
caseFlag = TestTopK1(); caseFlag = TestTopK1();
if (!caseFlag) { if (!caseFlag) {
...@@ -179,6 +276,15 @@ bool TestTopK() ...@@ -179,6 +276,15 @@ bool TestTopK()
} }
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestTopK2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
......
...@@ -24,8 +24,10 @@ ...@@ -24,8 +24,10 @@
#include "../XList.h" #include "../XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: transform a tensor by merging it along with a dimension. /* case 1: insert a dimension by copying the blocks x times (where x is the size of the inserted dimension).
* In this case, (3, 2) -> (6), whereToUnsqueeze=1, leadingDim=0. * In this case,
* (2, 3) -> (2, 2, 3), dim=1, dSize=2
* (2, 3) -> (2, 3, 2), dim=2, dSize=2
*/ */
bool TestUnsqueeze1() bool TestUnsqueeze1()
{ {
...@@ -39,46 +41,60 @@ bool TestUnsqueeze1() ...@@ -39,46 +41,60 @@ bool TestUnsqueeze1()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
/* a target tensor of size (6, ) */ /* a target tensor of size (2, 2, 3) */
int tOrder = 3; int tOrder1 = 3;
int * tDimSize = new int[tOrder]; int * tDimSize1 = new int[tOrder1];
tDimSize[0] = 2; tDimSize1[0] = 2;
tDimSize[1] = 3; tDimSize1[1] = 2;
tDimSize[2] = 2; tDimSize1[2] = 3;
int tUnitNum = 1; int tUnitNum1 = 1;
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder1; i++)
tUnitNum *= tDimSize[i]; tUnitNum1 *= tDimSize1[i];
/* a target tensor of size (2, 3, 2) */
int tOrder2 = 3;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
tDimSize2[1] = 3;
tDimSize2[2] = 2;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][3] = { {0.0, 1.0, 2.0}, DTYPE sData[2][3] = { {0.0, 1.0, 2.0},
{3.0, 4.0, 5.0} }; {3.0, 4.0, 5.0} };
DTYPE answer[2][2][3] = { { {0.0, 1.0, 2.0}, DTYPE answer1[2][2][3] = { { {0.0, 1.0, 2.0},
{3.0, 4.0, 5.0} }, {0.0, 1.0, 2.0} },
{ {0.0, 1.0, 2.0}, { {3.0, 4.0, 5.0},
{3.0, 4.0, 5.0} } }; {3.0, 4.0, 5.0} } };
DTYPE new_answer[2][3][2] = { { {0.0, 0.0}, DTYPE answer2[2][3][2] = { { {0.0, 0.0},
{1.0, 1.0}, {1.0, 1.0},
{2.0, 2.0} }, {2.0, 2.0} },
{ {3.0, 3.0}, { {3.0, 3.0},
{4.0, 4.0}, {4.0, 4.0},
{5.0, 5.0} } }; {5.0, 5.0} } };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
/* call Unsqueeze function */ /* call Unsqueeze function */
Unsqueeze(s, t, 2, 2); Unsqueeze(s, t1, 1, 2);
Unsqueeze(s, t2, 2, 2);
/* check results */ /* check results */
cpuTest = t->CheckData(new_answer, tUnitNum); cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -86,27 +102,41 @@ bool TestUnsqueeze1() ...@@ -86,27 +102,41 @@ bool TestUnsqueeze1()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* call Unsqueeze function */ /* call Unsqueeze function */
Unsqueeze(sGPU, tGPU, 2, 2); Unsqueeze(sGPU, tGPU1, 1, 2);
Unsqueeze(sGPU, tGPU2, 2, 2);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(new_answer, tUnitNum); gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
/* destroy variables */ /* destroy variables */
delete s, t, sGPU, tGPU; delete s;
delete[] sDimSize, tDimSize; delete t1;
delete t2;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s, t; delete s;
delete[] sDimSize, tDimSize; delete t1;
delete t2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
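/* A standalone sketch of the semantics this case checks: Unsqueeze inserts
 * a new dimension of size dSize at position dim and fills it by copying,
 * so each source block is repeated dSize times along the new axis. The
 * helpers are hypothetical (not part of the library) and cover the two
 * 2-D settings used above. */
static void UnsqueezeDim1Ref(const DTYPE * s, DTYPE * t,
                             int d0, int d1, int dSize)
{
    /* (d0, d1) -> (d0, dSize, d1): each row of s is copied dSize times */
    for (int i = 0; i < d0; i++)
        for (int r = 0; r < dSize; r++)
            for (int j = 0; j < d1; j++)
                t[(i * dSize + r) * d1 + j] = s[i * d1 + j];
}
static void UnsqueezeDim2Ref(const DTYPE * s, DTYPE * t,
                             int d0, int d1, int dSize)
{
    /* (d0, d1) -> (d0, d1, dSize): each element of s is copied dSize times */
    for (int i = 0; i < d0; i++)
        for (int j = 0; j < d1; j++)
            for (int r = 0; r < dSize; r++)
                t[(i * d1 + j) * dSize + r] = s[i * d1 + j];
}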
...@@ -121,7 +151,7 @@ bool TestUnsqueeze1() ...@@ -121,7 +151,7 @@ bool TestUnsqueeze1()
extern "C" extern "C"
bool TestUnsqueeze() bool TestUnsqueeze()
{ {
XPRINT(0, stdout, "[TEST Unsqueeze] -------------\n"); XPRINT(0, stdout, "[TEST Unsqueeze] insert a dimension by copying the blocks for x times\n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -31,9 +31,12 @@ bool Test() ...@@ -31,9 +31,12 @@ bool Test()
wrong = !TestConcatenate() || wrong; wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong; wrong = !TestConcatenateSolely() || wrong;
//wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong; wrong = !TestCopyValues() || wrong;
wrong = !TestMatrixMul() || wrong; wrong = !TestMatrixMul() || wrong;
wrong = !TestMatrixMul2D() || wrong; wrong = !TestMatrixMul2D() || wrong;
wrong = !TestMatrixMul2DParallel() || wrong;
//wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMatrixMulBatchedCPU() || wrong; wrong = !TestMatrixMulBatchedCPU() || wrong;
wrong = !TestMerge() || wrong; wrong = !TestMerge() || wrong;
wrong = !TestMultiplyElementWise() || wrong; wrong = !TestMultiplyElementWise() || wrong;
...@@ -47,18 +50,23 @@ bool Test() ...@@ -47,18 +50,23 @@ bool Test()
wrong = !TestReduceVariance() || wrong; wrong = !TestReduceVariance() || wrong;
wrong = !TestScaleAndShift() || wrong; wrong = !TestScaleAndShift() || wrong;
wrong = !TestSelect() || wrong; wrong = !TestSelect() || wrong;
wrong = !TestSetAscendingOrder() || wrong;
wrong = !TestSetData() || wrong;
wrong = !TestSort() || wrong; wrong = !TestSort() || wrong;
wrong = !TestSplit() || wrong; wrong = !TestSplit() || wrong;
wrong = !TestSum() || wrong; wrong = !TestSum() || wrong;
wrong = !TestSumByColumnTV() || wrong;
//wrong = !TestSumByColumnVT() || wrong;
wrong = !TestTopK() || wrong; wrong = !TestTopK() || wrong;
wrong = !TestUnsqueeze() || wrong; wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong; wrong = !TestXMem() || wrong;
//wrong = !TestHardTanH() || wrong; //wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong; wrong = !TestIdentity() || wrong;
//wrong = !TestLogSoftmax() || wrong;
//wrong = !TestLoss() || wrong; //wrong = !TestLoss() || wrong;
//wrong = !TestRectify() || wrong; //wrong = !TestRectify() || wrong;
wrong = !TestSigmoid() || wrong; //wrong = !TestSigmoid() || wrong;
//wrong = !TestSoftmax() || wrong; //wrong = !TestSoftmax() || wrong;
/* other test */ /* other test */
......
...@@ -24,9 +24,12 @@ ...@@ -24,9 +24,12 @@
#include "TConcatenate.h" #include "TConcatenate.h"
#include "TConcatenateSolely.h" #include "TConcatenateSolely.h"
#include "TCopyIndexed.h"
#include "TCopyValues.h" #include "TCopyValues.h"
#include "TMatrixMul.h" #include "TMatrixMul.h"
#include "TMatrixMul2D.h" #include "TMatrixMul2D.h"
#include "TMatrixMul2DParallel.h"
#include "TMatrixMulBatched.h"
#include "TMatrixMULBatchedCPU.h" #include "TMatrixMULBatchedCPU.h"
#include "TMerge.h" #include "TMerge.h"
#include "TMultiplyElementWise.h" #include "TMultiplyElementWise.h"
...@@ -40,15 +43,20 @@ ...@@ -40,15 +43,20 @@
#include "TReduceVariance.h" #include "TReduceVariance.h"
#include "TScaleAndShift.h" #include "TScaleAndShift.h"
#include "TSelect.h" #include "TSelect.h"
#include "TSetAscendingOrder.h"
#include "TSetData.h"
#include "TSort.h" #include "TSort.h"
#include "TSplit.h" #include "TSplit.h"
#include "TSum.h" #include "TSum.h"
#include "TSumByColumnTV.h"
#include "TSumByColumnVT.h"
#include "TTopK.h" #include "TTopK.h"
#include "TUnsqueeze.h" #include "TUnsqueeze.h"
#include "TXMem.h" #include "TXMem.h"
#include "THardTanH.h" #include "THardTanH.h"
#include "TIdentity.h" #include "TIdentity.h"
#include "TLogSoftmax.h"
#include "TLoss.h" #include "TLoss.h"
#include "TRectify.h" #include "TRectify.h"
#include "TSigmoid.h" #include "TSigmoid.h"
......