Emmay / NiuTrans.Tensor / Commits
Commit f31bc3fb, authored Jul 26, 2018 by xiaotong
tmp code for batched matrix mul
parent f74b1c17
Showing 3 changed files with 199 additions and 22 deletions:

source/sample/fnnlm/FNNLM.cpp                        +55  -18
source/tensor/core/arithmetic/MatrixMulBatched.cpp   +136 -4
source/tensor/core/arithmetic/MatrixMulBatched.h     +8   -0
source/sample/fnnlm/FNNLM.cpp (view file @ f31bc3fb)
@@ -153,43 +153,80 @@ load arguments
*/
void LoadArgs(int argc, const char ** argv, FNNModel &model)
{
    fprintf(stderr, "args: \n");
    for(int i = 0; i < argc; i++){
        if(!strcmp(argv[i], "-train") && i + 1 < argc){
            strcpy(trainFN, argv[i + 1]);
            fprintf(stderr, " -train=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-model") && i + 1 < argc){
            strcpy(modelFN, argv[i + 1]);
            fprintf(stderr, " -model=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-test") && i + 1 < argc){
            strcpy(testFN, argv[i + 1]);
            fprintf(stderr, " -test=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-output") && i + 1 < argc){
            strcpy(outputFN, argv[i + 1]);
            fprintf(stderr, " -output=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-n") && i + 1 < argc){
            model.n = atoi(argv[i + 1]);
            fprintf(stderr, " -n=%d\n", model.n);
        }
        if(!strcmp(argv[i], "-esize") && i + 1 < argc){
            model.eSize = atoi(argv[i + 1]);
            fprintf(stderr, " -esize=%d\n", model.eSize);
        }
        if(!strcmp(argv[i], "-vsize") && i + 1 < argc){
            model.vSize = atoi(argv[i + 1]);
            fprintf(stderr, " -vsize=%d\n", model.vSize);
        }
        if(!strcmp(argv[i], "-hdepth") && i + 1 < argc){
            model.hDepth = atoi(argv[i + 1]);
            fprintf(stderr, " -hdepth=%d\n", model.hDepth);
        }
        if(!strcmp(argv[i], "-hsize") && i + 1 < argc){
            model.hSize = atoi(argv[i + 1]);
            fprintf(stderr, " -hsize=%d\n", model.hSize);
        }
        if(!strcmp(argv[i], "-lrate") && i + 1 < argc){
            learningRate = (float)atof(argv[i + 1]);
            fprintf(stderr, " -lrate=%f\n", learningRate);
        }
        if(!strcmp(argv[i], "-nstep") && i + 1 < argc){
            nStep = atoi(argv[i + 1]);
            fprintf(stderr, " -nstep=%d\n", nStep);
        }
        if(!strcmp(argv[i], "-nepoch") && i + 1 < argc){
            nEpoch = atoi(argv[i + 1]);
            fprintf(stderr, " -nepoch=%d\n", nEpoch);
        }
        if(!strcmp(argv[i], "-minmax") && i + 1 < argc){
            minmax = (float)fabs(atof(argv[i + 1]));
            fprintf(stderr, " -minmax=%f\n", minmax);
        }
        if(!strcmp(argv[i], "-batch") && i + 1 < argc){
            sentBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -batch=%d\n", sentBatch);
        }
        if(!strcmp(argv[i], "-wbatch") && i + 1 < argc){
            wordBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -wbatch=%d\n", wordBatch);
        }
        if(!strcmp(argv[i], "-shuffle")){
            shuffled = true;
            fprintf(stderr, " -shuffle=true\n");
        }
        if(!strcmp(argv[i], "-autodiff")){
            autoDiff = true;
            fprintf(stderr, " -autodiff=true\n");
        }
        if(!strcmp(argv[i], "-dev") && i + 1 < argc){
            model.devID = atoi(argv[i + 1]);
            fprintf(stderr, " -dev=%d\n", model.devID);
        }
    }

    for(int i = 0; i < argc; i++){
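The change above uses one pattern throughout LoadArgs: scan argv for a flag, read the value that follows it (guarding against a missing value), and echo the parsed setting to stderr. A minimal standalone sketch of that pattern follows; the local variables and the example command line are illustrative only and are not part of FNNLM.cpp.

#include <cstdio>
#include <cstring>
#include <cstdlib>

int main()
{
    /* a hypothetical command line: ./fnnlm -train train.txt -n 5 -shuffle */
    const char * argv[] = { "./fnnlm", "-train", "train.txt", "-n", "5", "-shuffle" };
    int argc = 6;

    char trainFN[1024] = { 0 };   /* illustrative locals; FNNLM.cpp keeps these as globals/model fields */
    int  n = 0;
    bool shuffled = false;

    fprintf(stderr, "args: \n");
    for(int i = 0; i < argc; i++){
        /* flag with a value: check that the value exists before reading it */
        if(!strcmp(argv[i], "-train") && i + 1 < argc){
            strcpy(trainFN, argv[i + 1]);
            fprintf(stderr, " -train=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-n") && i + 1 < argc){
            n = atoi(argv[i + 1]);
            fprintf(stderr, " -n=%d\n", n);
        }
        /* flag without a value: its presence alone switches the option on */
        if(!strcmp(argv[i], "-shuffle")){
            shuffled = true;
            fprintf(stderr, " -shuffle=true\n");
        }
    }

    fprintf(stderr, "parsed: train=%s n=%d shuffle=%d\n", trainFN, n, shuffled ? 1 : 0);
    return 0;
}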
source/tensor/core/arithmetic/MatrixMulBatched.cpp (view file @ f31bc3fb)
@@ -64,8 +64,7 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
                  "Unmatched tensors in multiplication!");

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
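The check in this hunk encodes the usual matrix-shape rule: with each block of a being an x am and each block of b being bn x bm (after any requested transposition), the product requires am == bn, and the corresponding block of c must be an x bm. A small worked example with concrete sizes, in plain C++ and independent of XTensor:

#include <cassert>

int main()
{
    /* a block of a is 4 x 3 and a block of b is 3 x 5, neither transposed */
    int an = 4, am = 3;
    int bn = 3, bm = 5;

    /* then the corresponding block of c = a * b must be 4 x 5 */
    int cn = 4, cm = 5;

    /* the same condition _MatrixMulBatched checks for every block */
    assert(am == bn && an == cn && bm == cm);
    return 0;
}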
@@ -134,8 +133,141 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    else {
        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
        _MatrixMULBatchedCPU(aList, transposedA, bList, transposedB, cList, alpha, beta);
    }

    for (int i = 0; i < aList->count; i++) {
        XTensor * ai = (XTensor*)aList->GetItem(i);
        ai->data = NULL;
    }

    for (int i = 0; i < bList->count; i++) {
        XTensor * bi = (XTensor*)bList->GetItem(i);
        bi->data = NULL;
    }

    for (int i = 0; i < cList->count; i++) {
        XTensor * ci = (XTensor*)cList->GetItem(i);
        ci->data = NULL;
    }

    delete[] tensorBuf;
    delete aList;
    delete bList;
    delete cList;
}

/*
matrix multiplication of the two tensors
optimized for GPU

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
                  "Input tensors must have a order >= 2!");
    CheckNTErrors((a->order == b->order && a->order == c->order),
                  "Input tensor and output tensor must have same order!");
    CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0,
                  "The tensors must be on GPUs");

    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
                  "Unmatched tensors in multiplication!");

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
    int aRealBlockSize = aBlockSize * a->unitSize;
    int bRealBlockSize = bBlockSize * b->unitSize;
    int cRealBlockSize = cBlockSize * c->unitSize;
    int blockNum = 1;

    for (int i = 2; i < a->order; i++) {
        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        blockNum *= a->dimSizeRDI[i];
    }

    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);

    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };

    XTensor * tensorBuf = new XTensor[blockNum * 3];
    XTensor * aBuf = tensorBuf;
    XTensor * bBuf = tensorBuf + blockNum;
    XTensor * cBuf = tensorBuf + blockNum * 2;

    for (int p = 0; p < blockNum; p++) {
        void * ap = (char*)a->data + aRealBlockSize * p;
        void * bp = (char*)b->data + bRealBlockSize * p;
        void * cp = (char*)c->data + cRealBlockSize * p;

        XTensor * ai = aBuf + p;
        XTensor * bi = bBuf + p;
        XTensor * ci = cBuf + p;

        InitTensor(ai, 2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
        InitTensor(bi, 2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
        InitTensor(ci, 2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);

        ai->data = ap;
        bi->data = bp;
        ci->data = cp;

        aList->Add(ai);
        bList->Add(bi);
        cList->Add(ci);
    }

    if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
#ifdef USE_CUDA
        CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
                      "The code must be run on the same GPU!");

        int devIDBackup;
        ProtectCudaDev(a->devID, devIDBackup);

        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
        _CudaBLASMatrixMULList(handle, aList, transposedA, bList, transposedB, cList,
                               aList->count, alpha, beta);

        BacktoCudaDev(a->devID, devIDBackup);
#else
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
    }
    else {
        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
        _MatrixMULBatchedCPU(aList, transposedA, bList, transposedB, cList, alpha, beta);
    }

    for (int i = 0; i < aList->count; i++) {
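The loop over p above treats a tensor of order greater than 2 as blockNum consecutive 2-D blocks: blockNum is the product of every dimension beyond the last two (in the reversed dimSizeRDI order), and block p of a tensor starts at byte offset realBlockSize * p from its data pointer. A small self-contained sketch of that arithmetic for a hypothetical float tensor holding 8 batched 4 x 3 matrices; the shape is made up for illustration only.

#include <cstdio>

int main()
{
    /* hypothetical shape: 8 batched 4 x 3 float matrices          */
    /* dimSizeRDI stores the dimensions in reverse order: {3, 4, 8} */
    int dimSizeRDI[3] = { 3, 4, 8 };
    int order    = 3;
    int unitSize = sizeof(float);                        /* 4 bytes per element */

    /* one 2-D block holds dimSizeRDI[0] * dimSizeRDI[1] elements */
    int blockSize     = dimSizeRDI[0] * dimSizeRDI[1];   /* 12 elements */
    int realBlockSize = blockSize * unitSize;            /* 48 bytes    */

    /* blockNum multiplies every dimension beyond the last two */
    int blockNum = 1;
    for (int i = 2; i < order; i++)
        blockNum *= dimSizeRDI[i];                       /* 8 blocks    */

    for (int p = 0; p < blockNum; p++)
        printf("block %d starts at byte offset %d\n", p, realBlockSize * p);

    return 0;
}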
source/tensor/core/arithmetic/MatrixMulBatched.h (view file @ f31bc3fb)
@@ -37,6 +37,14 @@ where trans() returns the transposed matrix if the flag is fired
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                       const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                       XTensor * c,
                       DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
optimized for GPU
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c,
                          DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

/*
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
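Both declarations default to alpha = 1.0 and beta = 0, so by default c is simply overwritten with the product. A tiny self-contained illustration of the c = a * b * alpha + c * beta update on plain 2 x 2 float arrays, with no XTensor involved, just to show what the two coefficients do:

#include <cstdio>

int main()
{
    float a[2][2] = { {1, 2}, {3, 4} };
    float b[2][2] = { {5, 6}, {7, 8} };
    float c[2][2] = { {1, 1}, {1, 1} };   /* existing contents of c */

    float alpha = 1.0F;   /* scales the product a * b                 */
    float beta  = 0.0F;   /* scales the old contents of c;            */
                          /* beta = 0 (the default) discards them     */

    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            float sum = 0.0F;
            for (int k = 0; k < 2; k++)
                sum += a[i][k] * b[k][j];
            c[i][j] = sum * alpha + c[i][j] * beta;
        }
    }

    for (int i = 0; i < 2; i++)
        printf("%g %g\n", c[i][0], c[i][1]);   /* 19 22 / 43 50 with the defaults */

    return 0;
}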