Bug fixes: 1. Fix bugs in MatrixMul, Select, Sort, TopK, and Loss; 2. Add other tests.

100f4611 · liyinqiao · a3a7145f · 100f4611 · 100f4611 · 100f4611
Commit 100f4611 authored Jul 07, 2018 by liyinqiao
--- a/source/core/MatrixMul.cpp
+++ b/source/core/MatrixMul.cpp
@@ -58,12 +58,12 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
        "Input tensors must have a order > 2!");

-    int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-    int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-    int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-    int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-    int cn = c->dimSize[0];
-    int cm = c->dimSize[1];
+    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int cn = c->dimSizeRDI[1];
+    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
        "Unmatched tensors in multiplication!");
@@ -79,13 +79,13 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    int cBlockNum = 1;

    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order]), "Incorrect tensor sizes!");
        aBlockNum *= a->dimSizeRDI[i];
        cBlockNum *= a->dimSizeRDI[i];
    }

    for (int i = 2; i < b->order; i++) {
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + a->order]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        bBlockNum *= b->dimSizeRDI[i];
        cBlockNum *= b->dimSizeRDI[i];
    }
@@ -93,9 +93,9 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSize[0], a->dimSize[1] };
-    int bDimSize[2] = { -b->dimSize[0], b->dimSize[1] };
-    int cDimSize[2] = { -c->dimSize[0], c->dimSize[1] };
+    int aDimSize[2] = { a->dimSizeRDI[1], a->dimSizeRDI[0] };
+    int bDimSize[2] = { b->dimSizeRDI[1], b->dimSizeRDI[0] };
+    int cDimSize[2] = { c->dimSizeRDI[1], c->dimSizeRDI[0] };

    bool isSparseMul = false;


--- a/source/core/MatrixMulBatched.cpp
+++ b/source/core/MatrixMulBatched.cpp
@@ -52,12 +52,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
        "Input tensors must have a order > 2!");

-    int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-    int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-    int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-    int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-    int cn = c->dimSize[0];
-    int cm = c->dimSize[1];
+    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int cn = c->dimSizeRDI[1];
+    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
        "Unmatched tensors in multiplication!");
@@ -79,9 +79,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[0], a->dimSizeRDI[1] };
-    int bDimSize[2] = { -b->dimSizeRDI[0], b->dimSizeRDI[1] };
-    int cDimSize[2] = { -c->dimSizeRDI[0], c->dimSizeRDI[1] };
+    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
+    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
+    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };

    for (int p = 0; p < blockNum; p++) {
        void * ap = (char*)a->data + aRealBlockSize * p;
@@ -106,7 +106,8 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
        int devIDBackup;
        ProtectCudaDev(a->devID, devIDBackup);

-        CudaBLASMatrixMULList(a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID),
+        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
+        CudaBLASMatrixMULList(handle,
 							  aList, transposedA,
                              bList, transposedB,
                              cList, aList->count,

--- a/source/core/Select.cpp
+++ b/source/core/Select.cpp
@@ -47,23 +47,28 @@ void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
    for(int i = 0; i < a->order; i++){
        if(i == dim){
            CheckNTErrors(low > 0 && low < a->dimSize[dim], "Illegal range specified!");
-            CheckNTErrors(high > 0 && high < a->dimSize[dim], "Illegal range specified!");
+            CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!");
        }
        else{
            CheckNTErrors(a->dimSize[i] == c->dimSize[i], "The size of the dimensions should be same!");
        }
    }

+    int dimRDI = a->order - dim - 1;
    int stride = 1;
-    for(int i = 0; i < dim; i++)
+    for(int i = 0; i < dimRDI; i++)
        stride *= a->dimSizeRDI[i];

+    int copyTimes = 1;
+    for (int i = dimRDI + 1; i < a->order; i++) 
+        copyTimes *= a->dimSizeRDI[i];
+
    int blockSize = stride * (high - low) * a->unitSize;
    int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
    int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
    char * s = (char*)a->data + stride * low * a->unitSize;
    char * t = (char*)c->data;
-    for(int i = 0; i < high - low; i++){
+    for(int i = 0; i < copyTimes; i++){
        XMemCopy(t, c->devID, s, a->devID, blockSize);
        s += stepSizeS;
        t += stepSizeT;

--- a/source/core/Sort.cu
+++ b/source/core/Sort.cu
@@ -235,10 +235,6 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
    int m = GetNextPower2(strideNum);
    int n = stride * blockNum;

-    /* recheck */
-    /*void * buf = mem->AllocBuf(mem->devID, n * m * a->unitSize);
-    void * bufIndex = (indexA != NULL && indexB != NULL) ? mem->AllocBuf(mem->devID, n * m * sizeof(int)) : NULL;*/
-    /* change by liyinqiao */
    void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
    void * bufIndex = NULL;
    if (indexA != NULL && indexB != NULL) {
@@ -294,11 +290,6 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
        KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
                                      (bufIndex, indexB->data, m, n, stride, k, blockNum);

-    /* recheck */
-    /*mem->ReleaseBuf(mem->devID, n * m * a->unitSize);
-    if (indexA != NULL && indexB != NULL)
-        mem->ReleaseBuf(mem->devID, n * m * sizeof(int));*/
-    /* change by liyinqiao */
    if (mem != NULL)
        mem->ReleaseBuf(a->devID, n * m * a->unitSize);
    else

--- a/source/core/TopK.cu
+++ b/source/core/TopK.cu
@@ -20,6 +20,7 @@
 */

 #include "../XDevice.h"
+#include "../XUtility.h"
 #include "../XTensor.h"
 #include "TopK.h"
 #include "TopK.cuh"
@@ -393,7 +394,7 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
    int cudaGrids[3];
    int cudaBlocks[3];

-    GDevs.GetCudaThread2D(a->mem->devID,
+    GDevs.GetCudaThread2D(a->devID,
        workerNum, stride * blockNum, MAX_INT,
        cudaGrids, cudaBlocks);

@@ -434,14 +435,17 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
        dimSize[0] = -dimSize[0];
        XTensor * indexA = new XTensor(a->order, dimSize, X_INT, 1.0F, a->mem);
-        indexA->data = a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int));
+        indexA->data = a->mem != NULL ? a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int)) : XMemAlloc(a->devID, a->unitNum * sizeof(int));

        /* make the index tensor */
        indexA->SetAscendingOrder(dim);

        CudaSortBig(a, b, indexA, index, dim, k);

+        if (a->mem != NULL)
            a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int));
+        else
+            XMemFree(a->devID, indexA->data);
        delete indexA;
    }


--- a/source/function/Loss.cpp
+++ b/source/function/Loss.cpp
@@ -374,15 +374,15 @@ void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                  LOSS_FUNCTION_NAME LFName, 
                  int leadDim, int tBeg, int tLen, int yBeg)
 {
-    CheckNTErrors((tLen >= 0 && tLen < y->unitNum), "Illegal input length!");
+    CheckNTErrors((tLen < y->unitNum), "Illegal input length!");
    CheckNTErrors((XTensor::IsIdentical(t, y)&& XTensor::IsIdentical(dedy, y)), 
                        "The input tensors must be of the same size!");
-    CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[1] == 1), "TODO!");
+    //CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[0] == 1), "TODO!");
    CheckNTErrors((t->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
    CheckNTErrors((t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
                         "TODO!");

-    int leadDimRDI = y->order - leadDim - 1;
+    int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
    if(leadDimRDI < 0){
        leadDimRDI = y->dimSizeRDI[y->order - 1];
        tBeg = 0;

--- a/source/test/TMatrixMul.cpp
+++ b/source/test/TMatrixMul.cpp
--- a/source/test/Test.cpp
+++ b/source/test/Test.cpp
@@ -31,6 +31,7 @@ bool Test()

    wrong = !TestConcatenate() || wrong;
    wrong = !TestConcatenateSolely() || wrong;
+    wrong = !TestCopyValues() || wrong;
    wrong = !TestMatrixMul() || wrong;
    wrong = !TestMatrixMul2D() || wrong;
    wrong = !TestMatrixMulBatchedCPU() || wrong;
@@ -42,12 +43,19 @@ bool Test()
    wrong = !TestReduceMax() || wrong;
    wrong = !TestReduceMean() || wrong;
    wrong = !TestReduceSum() || wrong;
+    wrong = !TestReduceSumSquared() || wrong;
+    wrong = !TestReduceVariance() || wrong;
+    wrong = !TestScaleAndShift() || wrong;
+    wrong = !TestSelect() || wrong;
    wrong = !TestSort() || wrong;
    wrong = !TestSplit() || wrong;
    wrong = !TestSum() || wrong;
+    wrong = !TestTopK() || wrong;
+    wrong = !TestUnsqueeze() || wrong;
    wrong = !TestXMem() || wrong;

    //wrong = !TestHardTanH() || wrong;
+    wrong = !TestIdentity() || wrong;
    //wrong = !TestLoss() || wrong;
    //wrong = !TestRectify() || wrong;
    wrong = !TestSigmoid() || wrong;

--- a/source/test/Test.h
+++ b/source/test/Test.h
@@ -24,6 +24,7 @@

 #include "TConcatenate.h"
 #include "TConcatenateSolely.h"
+#include "TCopyValues.h"
 #include "TMatrixMul.h"
 #include "TMatrixMul2D.h"
 #include "TMatrixMULBatchedCPU.h"
@@ -35,12 +36,19 @@
 #include "TReduceMax.h"
 #include "TReduceMean.h"
 #include "TReduceSum.h"
+#include "TReduceSumSquared.h"
+#include "TReduceVariance.h"
+#include "TScaleAndShift.h"
+#include "TSelect.h"
 #include "TSort.h"
 #include "TSplit.h"
 #include "TSum.h"
+#include "TTopK.h"
+#include "TUnsqueeze.h"
 #include "TXMem.h"

 #include "THardTanH.h"
+#include "TIdentity.h"
 #include "TLoss.h"
 #include "TRectify.h"
 #include "TSigmoid.h"