Code for FNNLM inference

2bb8754f · xiaotong · f31bc3fb · 2bb8754f · 2bb8754f
Commit 2bb8754f authored Jul 26, 2018 by xiaotong
--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -1106,6 +1106,7 @@ void Test(const char * test, const char * result, FNNModel &model)
        /* the gold standard */
        XTensor gold;

+        if (!autoDiff) {
            /* prepare an empty network for building the fnn */
            FNNNet net;

@@ -1118,6 +1119,10 @@ void Test(const char * test, const char * result, FNNModel &model)

            /* forward computation */
            Forward(inputs, output, model, net);
+        }
+        else {
+            ForwardAutoDiff(inputs, output, model);
+        }

        /* prediction probabilities */
        XTensor probs;

--- a/source/tensor/core/arithmetic/MatrixMulBatched.cpp
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.cpp
@@ -57,6 +57,11 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    CheckNTErrors((a->order == b->order && a->order == c->order), 
                  "Input tensor and output tensor must have same order!");

+    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
+        _MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
+        return;
+    }
+
    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
@@ -213,83 +218,15 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
        blockNum *= a->dimSizeRDI[i];
    }

-    XList * aList = new XList(10);
-    XList * bList = new XList(10);
-    XList * cList = new XList(10);
-    int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
-    int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
-    int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
-
-    XTensor * tensorBuf = new XTensor[blockNum * 3];
-    XTensor * aBuf = tensorBuf;
-    XTensor * bBuf = tensorBuf + blockNum;
-    XTensor * cBuf = tensorBuf + blockNum * 2;
-
-    for (int p = 0; p < blockNum; p++) {
-        void * ap = (char*)a->data + aRealBlockSize * p;
-        void * bp = (char*)b->data + bRealBlockSize * p;
-        void * cp = (char*)c->data + cRealBlockSize * p;
-        XTensor * ai = aBuf + p;
-        XTensor * bi = bBuf + p;
-        XTensor * ci = cBuf + p;
-        InitTensor(ai, 2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
-        InitTensor(bi, 2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
-        InitTensor(ci, 2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
-        ai->data = ap;
-        bi->data = bp;
-        ci->data = cp;
-        aList->Add(ai);
-        bList->Add(bi);
-        cList->Add(ci);
-    }
-
-    if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
-#ifdef USE_CUDA
-        CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
-                      "The code must be run on the same GPU!");
-        
-        int devIDBackup;
-        ProtectCudaDev(a->devID, devIDBackup);
-
    cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
-        _CudaBLASMatrixMULList(handle,
-							   aList, transposedA,
-                               bList, transposedB,
-                               cList, aList->count,
-                               alpha, beta);
-
-        BacktoCudaDev(a->devID, devIDBackup);
-#else
-        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
-#endif
-    }
-    else {
-        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-        _MatrixMULBatchedCPU(aList, transposedA,
-                             bList, transposedB,
-                             cList, alpha, beta);
-    }
-
-    for (int i = 0; i < aList->count; i++) {
-        XTensor * ai = (XTensor*)aList->GetItem(i);
-        ai->data = NULL;;
-    }
-
-    for (int i = 0; i < bList->count; i++) {
-        XTensor * bi = (XTensor*)bList->GetItem(i);
-        bi->data = NULL;
-    }
+    _CudaBLASMatrixMULBatchedStrided(handle,
+                                    a->data, transposedA, a->dataType, aBlockSize,
+                                    b->data, transposedB, b->dataType, bBlockSize,
+                                    c->data, c->dataType, cBlockSize, blockNum,
+                                    a->dimSizeRDI[1], a->dimSizeRDI[0],
+                                    b->dimSizeRDI[1], b->dimSizeRDI[0],
+                                    c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);

-    for (int i = 0; i < cList->count; i++) {
-        XTensor * ci = (XTensor*)cList->GetItem(i);
-        ci->data = NULL;
-    }
-
-    delete[] tensorBuf;
-
-    delete aList;
-    delete bList;
-    delete cList;
 }

 /*