Bug fixes: 1. MatrixMul, Select, Sort, TopK, Loss; 2. Add other tests.

100f4611 · liyinqiao · a3a7145f · 100f4611 · 100f4611 · 100f4611
Commit 100f4611 authored Jul 07, 2018 by liyinqiao
--- a/source/core/MatrixMul.cpp
+++ b/source/core/MatrixMul.cpp
@@ -58,12 +58,12 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
        "Input tensors must have a order > 2!");

-    int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-    int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-    int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-    int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-    int cn = c->dimSize[0];
-    int cm = c->dimSize[1];
+    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int cn = c->dimSizeRDI[1];
+    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
        "Unmatched tensors in multiplication!");
@@ -79,13 +79,13 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    int cBlockNum = 1;

    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order]), "Incorrect tensor sizes!");
        aBlockNum *= a->dimSizeRDI[i];
        cBlockNum *= a->dimSizeRDI[i];
    }

    for (int i = 2; i < b->order; i++) {
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + a->order]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        bBlockNum *= b->dimSizeRDI[i];
        cBlockNum *= b->dimSizeRDI[i];
    }
@@ -93,9 +93,9 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSize[0], a->dimSize[1] };
-    int bDimSize[2] = { -b->dimSize[0], b->dimSize[1] };
-    int cDimSize[2] = { -c->dimSize[0], c->dimSize[1] };
+    int aDimSize[2] = { a->dimSizeRDI[1], a->dimSizeRDI[0] };
+    int bDimSize[2] = { b->dimSizeRDI[1], b->dimSizeRDI[0] };
+    int cDimSize[2] = { c->dimSizeRDI[1], c->dimSizeRDI[0] };

    bool isSparseMul = false;


--- a/source/core/MatrixMulBatched.cpp
+++ b/source/core/MatrixMulBatched.cpp
@@ -52,12 +52,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
        "Input tensors must have a order > 2!");

-    int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-    int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-    int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-    int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-    int cn = c->dimSize[0];
-    int cm = c->dimSize[1];
+    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int cn = c->dimSizeRDI[1];
+    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
        "Unmatched tensors in multiplication!");
@@ -79,9 +79,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[0], a->dimSizeRDI[1] };
-    int bDimSize[2] = { -b->dimSizeRDI[0], b->dimSizeRDI[1] };
-    int cDimSize[2] = { -c->dimSizeRDI[0], c->dimSizeRDI[1] };
+    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
+    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
+    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };

    for (int p = 0; p < blockNum; p++) {
        void * ap = (char*)a->data + aRealBlockSize * p;
@@ -106,8 +106,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
        int devIDBackup;
        ProtectCudaDev(a->devID, devIDBackup);

-        CudaBLASMatrixMULList(a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID),
-                              aList, transposedA,
+        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
+        CudaBLASMatrixMULList(handle,
+                              aList, transposedA,
                              bList, transposedB,
                              cList, aList->count,
                              alpha, beta);

--- a/source/core/Select.cpp
+++ b/source/core/Select.cpp
@@ -47,23 +47,28 @@ void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
    for(int i = 0; i < a->order; i++){
        if(i == dim){
            CheckNTErrors(low > 0 && low < a->dimSize[dim], "Illegal range specified!");
-            CheckNTErrors(high > 0 && high < a->dimSize[dim], "Illegal range specified!");
+            CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!");
        }
        else{
            CheckNTErrors(a->dimSize[i] == c->dimSize[i], "The size of the dimensions should be same!");
        }
    }

+    int dimRDI = a->order - dim - 1;
    int stride = 1;
-    for(int i = 0; i < dim; i++)
+    for(int i = 0; i < dimRDI; i++)
        stride *= a->dimSizeRDI[i];

+    int copyTimes = 1;
+    for (int i = dimRDI + 1; i < a->order; i++) 
+        copyTimes *= a->dimSizeRDI[i];
+
    int blockSize = stride * (high - low) * a->unitSize;
    int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
    int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
    char * s = (char*)a->data + stride * low * a->unitSize;
    char * t = (char*)c->data;
-    for(int i = 0; i < high - low; i++){
+    for(int i = 0; i < copyTimes; i++){
        XMemCopy(t, c->devID, s, a->devID, blockSize);
        s += stepSizeS;
        t += stepSizeT;

--- a/source/core/Sort.cu
+++ b/source/core/Sort.cu
@@ -235,10 +235,6 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
    int m = GetNextPower2(strideNum);
    int n = stride * blockNum;

-    /* recheck */
-    /*void * buf = mem->AllocBuf(mem->devID, n * m * a->unitSize);
-    void * bufIndex = (indexA != NULL && indexB != NULL) ? mem->AllocBuf(mem->devID, n * m * sizeof(int)) : NULL;*/
-    /* change by liyinqiao */
    void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
    void * bufIndex = NULL;
    if (indexA != NULL && indexB != NULL) {
@@ -294,11 +290,6 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
        KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
                                      (bufIndex, indexB->data, m, n, stride, k, blockNum);

-    /* recheck */
-    /*mem->ReleaseBuf(mem->devID, n * m * a->unitSize);
-    if (indexA != NULL && indexB != NULL)
-        mem->ReleaseBuf(mem->devID, n * m * sizeof(int));*/
-    /* change by liyinqiao */
    if (mem != NULL)
        mem->ReleaseBuf(a->devID, n * m * a->unitSize);
    else

--- a/source/core/TopK.cu
+++ b/source/core/TopK.cu
@@ -20,6 +20,7 @@
 */

 #include "../XDevice.h"
+#include "../XUtility.h"
 #include "../XTensor.h"
 #include "TopK.h"
 #include "TopK.cuh"
@@ -393,7 +394,7 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
    int cudaGrids[3];
    int cudaBlocks[3];

-    GDevs.GetCudaThread2D(a->mem->devID,
+    GDevs.GetCudaThread2D(a->devID,
        workerNum, stride * blockNum, MAX_INT,
        cudaGrids, cudaBlocks);

@@ -434,14 +435,17 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
        dimSize[0] = -dimSize[0];
        XTensor * indexA = new XTensor(a->order, dimSize, X_INT, 1.0F, a->mem);
-        indexA->data = a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int));
+        indexA->data = a->mem != NULL ? a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int)) : XMemAlloc(a->devID, a->unitNum * sizeof(int));

        /* make the index tensor */
        indexA->SetAscendingOrder(dim);

        CudaSortBig(a, b, indexA, index, dim, k);

-        a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int));
+        if (a->mem != NULL)
+            a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int));
+        else
+            XMemFree(a->devID, indexA->data);
        delete indexA;
    }


--- a/source/function/Loss.cpp
+++ b/source/function/Loss.cpp
@@ -374,15 +374,15 @@ void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                  LOSS_FUNCTION_NAME LFName, 
                  int leadDim, int tBeg, int tLen, int yBeg)
 {
-    CheckNTErrors((tLen >= 0 && tLen < y->unitNum), "Illegal input length!");
+    CheckNTErrors((tLen < y->unitNum), "Illegal input length!");
    CheckNTErrors((XTensor::IsIdentical(t, y)&& XTensor::IsIdentical(dedy, y)), 
                        "The input tensors must be of the same size!");
-    CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[1] == 1), "TODO!");
+    //CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[0] == 1), "TODO!");
    CheckNTErrors((t->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
    CheckNTErrors((t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
                         "TODO!");

-    int leadDimRDI = y->order - leadDim - 1;
+    int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
    if(leadDimRDI < 0){
        leadDimRDI = y->dimSizeRDI[y->order - 1];
        tBeg = 0;

--- a/source/test/TMatrixMul.cpp
+++ b/source/test/TMatrixMul.cpp
@@ -24,8 +24,8 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* case 1: matrix multiplication. 
-* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS,
-  transposedB=X_NOTRANS.
+* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
+* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul1()
 {
@@ -59,13 +59,13 @@ bool TestMatrixMul1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
-                           {-4.0, 5.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0}, 
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
-                           {17.0, 20.0} };
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
+                           {-4.0F, 5.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -107,22 +107,33 @@ bool TestMatrixMul1()
    gpuTest = tGPU->CheckData(answer, tUnitNum);

    /* destroy variables */
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;

    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;

    return cpuTest;
 #endif // USE_CUDA
 }

 /* case 2: matrix multiplication. 
-* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), transposedA=X_TRANS,
-  transposedB=X_NOTRANS.
+* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
+* transposedA=X_TRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2()
 {
@@ -136,7 +147,7 @@ bool TestMatrixMul2()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];

-    /* a source tensor of size (2, 3) */
+    /* a source tensor of size (3, 2) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 3;
@@ -156,14 +167,14 @@ bool TestMatrixMul2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[3][2] = { {1.0, -4.0},
-                           {2.0, 5.0},
-                           {3.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0},
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0},
-                           {17.0, 20.0} };
+    DTYPE sData1[3][2] = { {1.0F, -4.0F},
+                           {2.0F, 5.0F},
+                           {3.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F},
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F},
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -205,22 +216,33 @@ bool TestMatrixMul2()
    gpuTest = tGPU->CheckData(answer, tUnitNum);

    /* destroy variables */
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;

    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;

    return cpuTest;
 #endif // USE_CUDA
 }

 /* case 3: matrix multiplication. 
-* In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2), transposedA=X_NOTRANS,
-  transposedB=X_NOTRANS.
+* In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2), 
+* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul3()
 {
@@ -258,20 +280,30 @@ bool TestMatrixMul3()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0},
-                                {2.0, 1.0, 3.0} },
-                              { {1.0, 2.0, 4.0}, 
-                                {3.0, 1.0, 2.0}},
-                              { {-1.0, 3.0, 2.0}, 
-                                {1.0, -1.0, 0.0} } };
-    DTYPE sData2[2][3][2] = { { {1.0, 2.0},
-                                {-4.0, 3.0},
-                                {2.0, 6.0} },
-                              { {1.0, 2.0},
-                                {-4.0, 3.0},
-                                {2.0, 6.0} } };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
-                           {17.0, 20.0} };
+    DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
+                                {2.0F, 1.0F, 3.0F} },
+                              { {1.0F, 2.0F, 4.0F}, 
+                                {3.0F, 1.0F, 2.0F}},
+                              { {-1.0F, 3.0F, 2.0F}, 
+                                {1.0F, -1.0F, 0.0F} } };
+    DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
+                                {-4.0F, 3.0F},
+                                {2.0F, 6.0F} },
+                              { {1.0F, 2.0F},
+                                {3.0F, 4.0F},
+                                {5.0F, 6.0F} } };
+    DTYPE answer[3][2][2][2] = { { { {8.0F, 9.0F}, 
+                                     {4.0F, 25.0F} },
+                                   { {7.0F, 8.0F},
+                                     {20.0F, 26.0F} } },
+                                 { { {1.0F, 32.0F},
+                                     {3.0F, 21.0F} },
+                                   { {27.0F, 34.0F}, 
+                                     {16.0F, 22.0F} } },
+                                 { { {-9.0F, 19.0F},
+                                     {5.0F, -1.0F} },
+                                   { {18.0F, 22.0F}, 
+                                     {-2.0F, -2.0F} } } };

    /* CPU test */
    bool cpuTest = true;
@@ -289,17 +321,123 @@ bool TestMatrixMul3()
    /* call MatrixMul function */
    MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);

-    XPRINT(0, stdout, "\ntarget data\n[");
-	DTYPE* check_data = (DTYPE*)t->data;
-	for (int i = 0; i < tUnitNum; i++)
-		printf("%f ", *check_data++);
-	printf("]\n");
+    /* check results */
+    cpuTest = t->CheckData(answer, tUnitNum);

-    int * size = new int(tOrder);
-    size = t->dimSize;
-    for (int i = 0; i < tOrder; i++) {
-        printf("size %d: %d\n", i, *size++);
-    }
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
+
+    /* call MatrixMul function */
+    MatrixMul(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
+
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
+
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/* case 4: matrix multiplication. 
+* In this case, a=(3, 2, 3), b=(3, 2) -> c=(3, 2, 2), 
+* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+*/
+bool TestMatrixMul4()
+{
+    /* a source tensor of size (3, 2, 3) */
+    int sOrder1 = 3;
+    int * sDimSize1 = new int[sOrder1];
+    sDimSize1[0] = 3;
+    sDimSize1[1] = 2;
+    sDimSize1[2] = 3;
+
+    int sUnitNum1 = 1;
+    for (int i = 0; i < sOrder1; i++)
+        sUnitNum1 *= sDimSize1[i];
+
+    /* a source tensor of size (3, 2) */
+    int sOrder2 = 2;
+    int * sDimSize2 = new int[sOrder2];
+    sDimSize2[0] = 3;
+    sDimSize2[1] = 2;
+
+    int sUnitNum2 = 1;
+    for (int i = 0; i < sOrder2; i++)
+        sUnitNum2 *= sDimSize2[i];
+
+    /* a target tensor of size (3, 2, 2) */
+    int tOrder = 3;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 3;
+    tDimSize[1] = 2;
+    tDimSize[2] = 2;
+
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+
+    DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
+                                {2.0F, 1.0F, 3.0F} },
+                              { {1.0F, 2.0F, 4.0F}, 
+                                {3.0F, 1.0F, 2.0F}},
+                              { {-1.0F, 3.0F, 2.0F}, 
+                                {1.0F, -1.0F, 0.0F} } };
+    DTYPE sData2[3][2] = { {1.0F, 2.0F},
+                           {3.0F, 4.0F},
+                           {5.0F, 6.0F} };
+    DTYPE answer[3][2][2] = { { {7.0F, 8.0F},
+                                {20.0F, 26.0F} },
+                              { {27.0F, 34.0F}, 
+                                 {16.0F, 22.0F} },
+                              { {18.0F, 22.0F}, 
+                                {-2.0F, -2.0F} } };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * s1 = NewTensor(sOrder1, sDimSize1);
+    XTensor * s2 = NewTensor(sOrder2, sDimSize2);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+
+    /* initialize variables */
+    s1->SetData(sData1, sUnitNum1);
+    s2->SetData(sData2, sUnitNum2);
+    t->SetZeroAll();
+
+    /* call MatrixMul function */
+    MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);

    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
@@ -325,14 +463,25 @@ bool TestMatrixMul3()
    gpuTest = tGPU->CheckData(answer, tUnitNum);

    /* destroy variables */
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;

    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;

    return cpuTest;
 #endif // USE_CUDA
@@ -348,7 +497,7 @@ bool TestMatrixMul3()
 extern "C"
 bool TestMatrixMul()
 {
-    XPRINT(0, stdout, "[TEST MATRIXMUL] -------------\n");
+    XPRINT(0, stdout, "[TEST MATRIXMUL] matrix multiplication \n");
    bool returnFlag = true, caseFlag = true;

    /* case 1 test */
@@ -370,14 +519,23 @@ bool TestMatrixMul()
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");

-    ///* case 3 test */
-    //caseFlag = TestMatrixMul3();
-    //if (!caseFlag) {
-    //    returnFlag = false;
-    //    XPRINT(0, stdout, ">> case 3 failed!\n");
-    //}
-    //else
-    //    XPRINT(0, stdout, ">> case 3 passed!\n");
+    /* case 3 test */
+    caseFlag = TestMatrixMul3();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 3 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 3 passed!\n");
+    
+    /* case 4 test */
+    caseFlag = TestMatrixMul4();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 4 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 4 passed!\n");

    /* other cases test */
    /*

--- a/source/test/Test.cpp
+++ b/source/test/Test.cpp
@@ -31,6 +31,7 @@ bool Test()

    wrong = !TestConcatenate() || wrong;
    wrong = !TestConcatenateSolely() || wrong;
+    wrong = !TestCopyValues() || wrong;
    wrong = !TestMatrixMul() || wrong;
    wrong = !TestMatrixMul2D() || wrong;
    wrong = !TestMatrixMulBatchedCPU() || wrong;
@@ -42,12 +43,19 @@ bool Test()
    wrong = !TestReduceMax() || wrong;
    wrong = !TestReduceMean() || wrong;
    wrong = !TestReduceSum() || wrong;
+    wrong = !TestReduceSumSquared() || wrong;
+    wrong = !TestReduceVariance() || wrong;
+    wrong = !TestScaleAndShift() || wrong;
+    wrong = !TestSelect() || wrong;
    wrong = !TestSort() || wrong;
    wrong = !TestSplit() || wrong;
    wrong = !TestSum() || wrong;
+    wrong = !TestTopK() || wrong;
+    wrong = !TestUnsqueeze() || wrong;
    wrong = !TestXMem() || wrong;

    //wrong = !TestHardTanH() || wrong;
+    wrong = !TestIdentity() || wrong;
    //wrong = !TestLoss() || wrong;
    //wrong = !TestRectify() || wrong;
    wrong = !TestSigmoid() || wrong;

--- a/source/test/Test.h
+++ b/source/test/Test.h
@@ -24,6 +24,7 @@

 #include "TConcatenate.h"
 #include "TConcatenateSolely.h"
+#include "TCopyValues.h"
 #include "TMatrixMul.h"
 #include "TMatrixMul2D.h"
 #include "TMatrixMULBatchedCPU.h"
@@ -35,12 +36,19 @@
 #include "TReduceMax.h"
 #include "TReduceMean.h"
 #include "TReduceSum.h"
+#include "TReduceSumSquared.h"
+#include "TReduceVariance.h"
+#include "TScaleAndShift.h"
+#include "TSelect.h"
 #include "TSort.h"
 #include "TSplit.h"
 #include "TSum.h"
+#include "TTopK.h"
+#include "TUnsqueeze.h"
 #include "TXMem.h"

 #include "THardTanH.h"
+#include "TIdentity.h"
 #include "TLoss.h"
 #include "TRectify.h"
 #include "TSigmoid.h"