better code of MatrixMul batched

a027f72e · xiaotong · 5c0d8bfd · a027f72e · a027f72e · 5c0d8bfd
Commit a027f72e authored Jul 27, 2018 by xiaotong
--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
@@ -262,12 +262,16 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
    }
 #ifdef USE_CUDA
    else if (devIDT >= 0 && devIDS < 0) {
-        CheckNTErrors((cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice) == cudaSuccess),
+        cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice);
-                            "cudaMemcpy2D error (cudaMemcpyHostToDevice)");
+        if(error != cudaSuccess){
+            ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
+        }
    }
    else if (devIDT < 0 && devIDS >= 0) {
-        CheckNTErrors((cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost) == cudaSuccess),
+        cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost);
-            "cudaMemcpy error (cudaMemcpyDeviceToHost)");
+        if(error != cudaSuccess){
+            ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
+        }
    }
    else {
        cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice);

--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -43,7 +43,6 @@
 #include "arithmetic/MatrixMul2DMultiTheading.h"
 #include "arithmetic/MatrixMul2DParallel.h"
 #include "arithmetic/MatrixMulBatched.h"
-#include "arithmetic/MatrixMULBatchedCPU.h"
 #include "shape/Merge.h"
 #include "shape/MergeBlockLists.h"
 #include "arithmetic/Multiply.h"

--- a/source/tensor/core/arithmetic/MatrixMULBatchedCPU.cpp
+++ b/source/tensor/core/arithmetic/MatrixMULBatchedCPU.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-#include "../../XTensor.h"
-#include "MatrixMULBatchedCPU.h"
-#include "MatrixMul2D.h"
-#include "XTensorBLAS.h"
-namespace nts { // namespace nts(NiuTrans.Tensor)
-/*
-matrix multiplication in batch mode (BLAS)
-c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
->> a - list of input matrices (2d tensors)
->> transposedA - indicate whether the matrix a is transposed
->> b - another list of input matrices (2d tensors)
->> transposedB - indicate whether the matrix b is transposed
->> c - output matrix (2d tensor)
->> alpha - scalar
->> beta - scalar
-*/
-void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
-                          const XList * b, MATRIX_TRANS_TYPE transposedB,
-                          XList * c, DTYPE alpha, DTYPE beta)
-{
-    CheckNTErrors(a && b && c, "Empty input lists!");
-    CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
-    if (a->count == 0)
-        return;
-    bool isUniform = true;
-    for (int i = 1; i < a->count; i++) {
-        XTensor * aim = (XTensor*)a->GetItem(i - 1);
-        XTensor * bim = (XTensor*)b->GetItem(i - 1);
-        XTensor * cim = (XTensor*)c->GetItem(i - 1);
-        XTensor * ai = (XTensor*)a->GetItem(i);
-        XTensor * bi = (XTensor*)b->GetItem(i);
-        XTensor * ci = (XTensor*)c->GetItem(i);
-        if (!XTensor::IsSameShaped(aim, ai) ||
-            !XTensor::IsSameShaped(bim, bi) ||
-            !XTensor::IsSameShaped(cim, ci))
-        {
-            isUniform = false;
-            break;
-        }
-    }
-    for (int i = 0; i < a->count; i++) {
-        XTensor * ai = (XTensor*)a->GetItem(i);
-        XTensor * bi = (XTensor*)b->GetItem(i);
-        XTensor * ci = (XTensor*)c->GetItem(i);
-        CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
-        CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
-        CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
-#ifdef USE_BLAS
-        if (useBLAS)
-            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
-        else
-            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
-#else
-        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
-#endif
-    }
-    //}
-}
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/MatrixMULBatchedCPU.h
+++ b/source/tensor/core/arithmetic/MatrixMULBatchedCPU.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-#ifndef __MATRIXMULBATCHEDCPU_H__
-#define __MATRIXMULBATCHEDCPU_H__
-#include "../../XTensor.h"
-namespace nts { // namespace nts(NiuTrans.Tensor)
-/* matrix multiplication in batch mode (CPU code) */
-void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, 
-                          XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
-} // namespace nts(NiuTrans.Tensor)
-#endif // __MATRIXMULBATCHEDCPU_H__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/MatrixMul.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul.cpp
@@ -24,8 +24,8 @@
 #include "../../XName.h"
 #include "MatrixMul.h"
 #include "MatrixMul2D.h"
-#include "MatrixMULBatchedCPU.h"
 #include "XTensorBLAS.h"
+#include "MatrixMulBatched.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -156,7 +156,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    }
    else {
        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-        _MatrixMULBatchedCPU(aList, transposedA,
+        _MatrixMulBatchedCPU(aList, transposedA,
                             bList, transposedB,
                             cList, alpha, beta);
    }

--- a/source/tensor/core/arithmetic/MatrixMulBatched.cpp
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.cpp
--- a/source/tensor/core/arithmetic/MatrixMulBatched.h
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.h
@@ -43,7 +43,21 @@ matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * b
 optimized for GPU
 */
 void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
-                          XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
+                          XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
+/*
+matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
+CPU code
+*/
+void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, 
+                          XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
+/*
+matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
+CPU code
+*/
+void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, 
+                          XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
 /*
 matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha

--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
--- a/source/tensor/test/TMatrixMULBatchedCPU.cpp
+++ b/source/tensor/test/TMatrixMULBatchedCPU.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-/*
-* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
-*/
-#include "TMatrixMULBatchedCPU.h"
-namespace nts { // namespace nts(NiuTrans.Tensor)
-/* 
-case 1: matrix multiplication in batch mode (CPU code). 
-In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
-*/
-bool TestMatrixMulBatchedCPU1()
-{
-    /* create list */
-    XList * aList = new XList();
-    XList * bList = new XList();
-    XList * cList = new XList();
-    /* a source tensor of size (2, 3) */
-    int aOrder = 2;
-    int * aDimSize = new int[aOrder];
-    aDimSize[0] = 2;
-    aDimSize[1] = 3;
-    int aUnitNum = 1;
-    for (int i = 0; i < aOrder; i++)
-        aUnitNum *= aDimSize[i];
-    /* a source tensor of size (3, 2) */
-    int bOrder = 2;
-    int * bDimSize = new int[bOrder];
-    bDimSize[0] = 3;
-    bDimSize[1] = 2;
-    int bUnitNum = 1;
-    for (int i = 0; i < bOrder; i++)
-        bUnitNum *= bDimSize[i];
-    /* a target tensor of size (2, 2) */
-    int cOrder = 2;
-    int * cDimSize = new int[cOrder];
-    cDimSize[0] = 2;
-    cDimSize[1] = 2;
-    int cUnitNum = 1;
-    for (int i = 0; i < cOrder; i++)
-        cUnitNum *= cDimSize[i];
-    DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
-                           {-4.0F, 5.0F, 6.0F} };
-    DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
-                           {-4.0F, 3.0F, 2.0F} };
-    DTYPE bData1[3][2] = { {0.0F, -1.0F},
-                           {1.0F, 2.0F}, 
-                           {2.0F, 1.0F} };
-    DTYPE bData2[3][2] = { {0.0F, 1.0F},
-                           {3.0F, 2.0F}, 
-                           {2.0F, 1.0F} };
-    DTYPE answer1[2][2] = { {8.0F, 6.0F}, 
-                            {17.0F, 20.0F} };
-    DTYPE answer2[2][2] = { {-12.0F, -6.0F}, 
-                            {13.0F, 4.0F} };
-    /* CPU test */
-    bool cpuTest = true;
-    /* create tensors */
-    XTensor * a1 = NewTensor(aOrder, aDimSize);
-    XTensor * a2 = NewTensor(aOrder, aDimSize);
-    XTensor * b1 = NewTensor(bOrder, bDimSize);
-    XTensor * b2 = NewTensor(bOrder, bDimSize);
-    XTensor * c1 = NewTensor(cOrder, cDimSize);
-    XTensor * c2 = NewTensor(cOrder, cDimSize);
-    /* initialize variables */
-    a1->SetData(aData1, aUnitNum);
-    a2->SetData(aData2, aUnitNum);
-    b1->SetData(bData1, aUnitNum);
-    b2->SetData(bData2, aUnitNum);
-    c1->SetZeroAll();
-    c2->SetZeroAll();
-    /* add tensors to list */
-    aList->Add(a1);
-    aList->Add(a2);
-    bList->Add(b1);
-    bList->Add(b2);
-    cList->Add(c1);
-    cList->Add(c2);
-    /* call MatrixMULBatchedCPU function */
-    _MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
-    /* check results */
-    cpuTest = c1->CheckData(answer1, cUnitNum) && c2->CheckData(answer2, cUnitNum);
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-    /* create tensors */
-    XTensor * aGPU1 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * aGPU2 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU1 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU2 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * cGPU1 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * cGPU2 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
-    /* initialize variables */
-    aGPU1->SetData(aData1, aUnitNum);
-    aGPU2->SetData(aData2, aUnitNum);
-    bGPU1->SetData(bData1, aUnitNum);
-    bGPU2->SetData(bData2, aUnitNum);
-    cGPU1->SetZeroAll();
-    cGPU2->SetZeroAll();
-    /* clear list */
-    aList->Clear();
-    bList->Clear();
-    cList->Clear();
-    /* add tensors to list */
-    aList->Add(aGPU1);
-    aList->Add(aGPU2);
-    bList->Add(bGPU1);
-    bList->Add(bGPU2);
-    cList->Add(cGPU1);
-    cList->Add(cGPU2);
-    /* call MatrixMULBatchedCPU function */
-    _MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
-    /* check results */
-    gpuTest = cGPU1->CheckData(answer1, cUnitNum) && gpuTest;
-    gpuTest = cGPU2->CheckData(answer2, cUnitNum) && gpuTest;
-    /* destroy variables */
-    delete a1;
-    delete a2;
-    delete b1;
-    delete b2;
-    delete c1;
-    delete c2;
-    delete aGPU1;
-    delete aGPU2;
-    delete bGPU1;
-    delete bGPU2;
-    delete cGPU1;
-    delete cGPU2;
-    delete[] aDimSize;
-    delete[] bDimSize;
-    delete[] cDimSize;
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete a1;
-    delete a2;
-    delete b1;
-    delete b2;
-    delete c1;
-    delete c2;
-    delete[] aDimSize;
-    delete[] bDimSize;
-    delete[] cDimSize;
-    return cpuTest;
-#endif // USE_CUDA
-}
-/* other cases */
-/*
-    TODO!!
-*/
-/* test for MatrixMulBatchedCPU Function */
-extern "C"
-bool TestMatrixMulBatchedCPU()
-{
-    XPRINT(0, stdout, "[TEST MATRIXMULBATCHEDCPU] matrix multiplication in batch mode (CPU code) \n");
-    bool returnFlag = true, caseFlag = true;
-    /* case 1 test */
-    caseFlag = TestMatrixMulBatchedCPU1();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 1 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 1 passed!\n");
-    /* other cases test */
-    /*
-    TODO!!
-    */
-    if (returnFlag) {
-        XPRINT(0, stdout, ">> All Passed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> Failed!\n");
-    XPRINT(0, stdout, "\n");
-    return returnFlag;
-}
-} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TMatrixMULBatchedCPU.h
+++ b/source/tensor/test/TMatrixMULBatchedCPU.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-/*
-* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
-*/
-#ifndef __TEST_MATRIXMULBATCHEDCPU_H__
-#define __TEST_MATRIXMULBATCHEDCPU_H__
-#include "../core/arithmetic/MatrixMULBatchedCPU.h"
-namespace nts { // namespace nts(NiuTrans.Tensor)
-/* test for MatrixMulBatchedCPU Function */
-extern "C"
-bool TestMatrixMulBatchedCPU();
-} // namespace nts(NiuTrans.Tensor)
-#endif // __TEST_MATRIXMULBATCHEDCPU_H__
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
@@ -40,7 +40,6 @@ bool Test()
    wrong = !TestMatrixMul2D() || wrong;
    wrong = !TestMatrixMul2DParallel() || wrong;
    wrong = !TestMatrixMulBatched() || wrong;
-    wrong = !TestMatrixMulBatchedCPU() || wrong;
    wrong = !TestMerge() || wrong;
    wrong = !TestMultiply() || wrong;
    wrong = !TestNegate() || wrong;

--- a/source/tensor/test/Test.h
+++ b/source/tensor/test/Test.h
@@ -33,7 +33,6 @@
 #include "TMatrixMul2D.h"
 #include "TMatrixMul2DParallel.h"
 #include "TMatrixMulBatched.h"
-#include "TMatrixMULBatchedCPU.h"
 #include "TMerge.h"
 #include "TMultiply.h"
 #include "TNegate.h"