Commit a027f72e by xiaotong

better code of MatrixMul batched

parent 5c0d8bfd
......@@ -262,12 +262,16 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
}
#ifdef USE_CUDA
else if (devIDT >= 0 && devIDS < 0) {
CheckNTErrors((cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice) == cudaSuccess),
"cudaMemcpy2D error (cudaMemcpyHostToDevice)");
cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
}
}
else if (devIDT < 0 && devIDS >= 0) {
CheckNTErrors((cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost) == cudaSuccess),
"cudaMemcpy error (cudaMemcpyDeviceToHost)");
cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
}
else {
cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice);
......
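The hunk above replaces the CheckNTErrors macro call with an explicit cudaError_t check, so the copy itself is no longer buried inside a macro argument. A minimal self-contained sketch of the same pattern, assuming only the CUDA runtime API (the checkedMemcpy2D helper name is ours, not the library's):

#include <cstdio>
#include <cuda_runtime.h>

/* hypothetical helper showing the explicit-check pattern: run the copy,
   then report the CUDA error string on failure instead of asserting
   inside a macro */
static bool checkedMemcpy2D(void * dst, size_t dPitch,
                            const void * src, size_t sPitch,
                            size_t width, size_t height, cudaMemcpyKind kind)
{
    cudaError_t error = cudaMemcpy2D(dst, dPitch, src, sPitch,
                                     width, height, kind);
    if (error != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy2D error: %s\n", cudaGetErrorString(error));
        return false;
    }
    return true;
}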
......@@ -43,7 +43,6 @@
#include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/MatrixMULBatchedCPU.h"
#include "shape/Merge.h"
#include "shape/MergeBlockLists.h"
#include "arithmetic/Multiply.h"
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "MatrixMULBatchedCPU.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication in batch mode (BLAS)
c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> a - list of input matrices (2d tensors)
>> transposedA - indicates whether each matrix in a is transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicates whether each matrix in b is transposed
>> c - list of output matrices (2d tensors)
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
if (a->count == 0)
return;
bool isUniform = true;
for (int i = 1; i < a->count; i++) {
XTensor * aim = (XTensor*)a->GetItem(i - 1);
XTensor * bim = (XTensor*)b->GetItem(i - 1);
XTensor * cim = (XTensor*)c->GetItem(i - 1);
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!XTensor::IsSameShaped(aim, ai) ||
!XTensor::IsSameShaped(bim, bi) ||
!XTensor::IsSameShaped(cim, ci))
{
isUniform = false;
break;
}
}
for (int i = 0; i < a->count; i++) {
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __MATRIXMULBATCHEDCPU_H__
#define __MATRIXMULBATCHEDCPU_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMULBATCHEDCPU_H__
\ No newline at end of file
......@@ -24,8 +24,8 @@
#include "../../XName.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h"
#include "MatrixMulBatched.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -156,9 +156,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
_MatrixMulBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
for (int i = 0; i < aList->count; i++) {
......
......@@ -23,8 +23,8 @@
#include "../../XDevice.h"
#include "../../XName.h"
#include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -57,10 +57,42 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0)
_MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
return;
}
else
_MatrixMulBatchedCPU(a, transposedA, b, transposedB, c, alpha, beta);
}
/*
matrix multiplication of the two tensors
optimized for GPU
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
{
#ifdef USE_CUDA
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order >= 2!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
......@@ -85,88 +117,20 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSizeRDI[i];
}
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
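/* note: the leading negative dimension size below appears to be the
   library's convention for "build the tensor header without allocating
   data"; the data pointers are attached manually in the loop (our
   reading, not documented in this diff) */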
int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
XTensor * tensorBuf = new XTensor[blockNum * 3];
XTensor * aBuf = tensorBuf;
XTensor * bBuf = tensorBuf + blockNum;
XTensor * cBuf = tensorBuf + blockNum * 2;
for (int p = 0; p < blockNum; p++) {
void * ap = (char*)a->data + aRealBlockSize * p;
void * bp = (char*)b->data + bRealBlockSize * p;
void * cp = (char*)c->data + cRealBlockSize * p;
XTensor * ai = aBuf + p;
XTensor * bi = bBuf + p;
XTensor * ci = cBuf + p;
InitTensor(ai, 2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
InitTensor(bi, 2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
InitTensor(ci, 2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
ai->data = ap;
bi->data = bp;
ci->data = cp;
aList->Add(ai);
bList->Add(bi);
cList->Add(ci);
}
if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
#ifdef USE_CUDA
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The code must be run on the same GPU!");
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
_CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
alpha, beta);
BacktoCudaDev(a->devID, devIDBackup);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
_CudaBLASMatrixMULBatchedStrided(handle,
a->data, transposedA, a->dataType, aBlockSize,
b->data, transposedB, b->dataType, bBlockSize,
c->data, c->dataType, cBlockSize, blockNum,
a->dimSizeRDI[1], a->dimSizeRDI[0],
b->dimSizeRDI[1], b->dimSizeRDI[0],
c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
#endif
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
for (int i = 0; i < aList->count; i++) {
XTensor * ai = (XTensor*)aList->GetItem(i);
ai->data = NULL;
}
for (int i = 0; i < bList->count; i++) {
XTensor * bi = (XTensor*)bList->GetItem(i);
bi->data = NULL;
}
for (int i = 0; i < cList->count; i++) {
XTensor * ci = (XTensor*)cList->GetItem(i);
ci->data = NULL;
}
delete[] tensorBuf;
delete aList;
delete bList;
delete cList;
}
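The replacement path above hands the whole batch to cuBLAS in a single call instead of building per-block tensor lists. _CudaBLASMatrixMULBatchedStrided is the library's wrapper; assuming DTYPE is float and the blocks are laid out contiguously, it presumably lowers to the standard cublasSgemmStridedBatched routine, sketched here (the function and parameter names below are ours, and the row-major-to-column-major mapping is illustrative, not the library's exact code):

#include <cublas_v2.h>

/* hedged sketch: one strided-batched SGEMM over blockNum contiguous
   blocks; cuBLAS is column-major, so the row-major product C = A * B
   is issued as C^T = B^T * A^T by swapping the operands */
void stridedBatchedSgemmSketch(cublasHandle_t handle,
                               const float * a, int aRows, int aCols, long long aStride,
                               const float * b, int bCols, long long bStride,
                               float * c, long long cStride,
                               int blockNum, float alpha, float beta)
{
    cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                              bCols, aRows, aCols,   /* m, n, k */
                              &alpha,
                              b, bCols, bStride,     /* B^T is bCols x aCols */
                              a, aCols, aStride,     /* A^T is aCols x aRows */
                              &beta,
                              c, bCols, cStride,     /* C^T is bCols x aRows */
                              blockNum);
}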
/*
matrix multiplication of the two tensors
optimized for GPU
optimized for CPU
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
......@@ -180,21 +144,19 @@ where trans() returns the transposed matrix if the flag is fired
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
XTensor * c, DTYPE alpha, DTYPE beta)
{
#ifdef USE_CUDA
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order >= 2!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
......@@ -219,16 +181,90 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSizeRDI[i];
}
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
_CudaBLASMatrixMULBatchedStrided(handle,
a->data, transposedA, a->dataType, aBlockSize,
b->data, transposedB, b->dataType, bBlockSize,
c->data, c->dataType, cBlockSize, blockNum,
a->dimSizeRDI[1], a->dimSizeRDI[0],
b->dimSizeRDI[1], b->dimSizeRDI[0],
c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
XTensor * ci = NewTensor2D(cDimSize[0], cDimSize[1], c->dataType, c->devID, c->mem);
for (int i = 0; i < blockNum; i++) {
ai->data = (char*)a->data + i * aRealBlockSize;
bi->data = (char*)b->data + i * bRealBlockSize;
ci->data = (char*)c->data + i * cRealBlockSize;
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
}
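/* ai/bi/ci only alias blocks inside a/b/c, so the data pointers are
   cleared before delete to keep the destructors from freeing memory
   they do not own */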
ai->data = NULL;
bi->data = NULL;
ci->data = NULL;
delete ai;
delete bi;
delete ci;
}
/*
matrix multiplication in batch mode for list inputs (BLAS)
c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> a - list of input matrices (2d tensors)
>> transposedA - indicates whether each matrix in a is transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicates whether each matrix in b is transposed
>> c - list of output matrices (2d tensors)
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
if (a->count == 0)
return;
bool isUniform = true;
for (int i = 1; i < a->count; i++) {
XTensor * aim = (XTensor*)a->GetItem(i - 1);
XTensor * bim = (XTensor*)b->GetItem(i - 1);
XTensor * cim = (XTensor*)c->GetItem(i - 1);
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!XTensor::IsSameShaped(aim, ai) ||
!XTensor::IsSameShaped(bim, bi) ||
!XTensor::IsSameShaped(cim, ci))
{
isUniform = false;
break;
}
}
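/* note: isUniform is computed above but not used in this path; it
   presumably guards a uniform-shape fast path elsewhere (our reading,
   not stated in this commit) */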
for (int i = 0; i < a->count; i++) {
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
}
}
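For orientation, a minimal call sequence for the list-based routine, based on the test case at the end of this commit (NewTensor, SetData, XList, and X_NOTRANS are the library's own names; stack-allocating the XLists here is our assumption):

int dimsA[2] = {2, 3}, dimsB[2] = {3, 2}, dimsC[2] = {2, 2};
XTensor * a = NewTensor(2, dimsA);
XTensor * b = NewTensor(2, dimsB);
XTensor * c = NewTensor(2, dimsC);
DTYPE aData[2][3] = { {1.0F, 2.0F, 3.0F}, {-4.0F, 5.0F, 6.0F} };
DTYPE bData[3][2] = { {0.0F, -1.0F}, {1.0F, 2.0F}, {2.0F, 1.0F} };
a->SetData(aData, 6);
b->SetData(bData, 6);
c->SetZeroAll();

XList aList, bList, cList;
aList.Add(a);
bList.Add(b);
cList.Add(c);

/* c_i = a_i * b_i for every triple in the lists */
_MatrixMulBatchedCPU(&aList, X_NOTRANS, &bList, X_NOTRANS, &cList);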
/*
......
......@@ -43,7 +43,21 @@ matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * b
optimized for GPU
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
optimized for CPU
*/
void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for CPU
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
......
......@@ -90,8 +90,8 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int tStep = n * tPitch;
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
}
}
else {
......@@ -229,8 +229,8 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
}
}
/* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#include "TMatrixMULBatchedCPU.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: matrix multiplication in batch mode (CPU code).
In this case, aList=2*(2, 3), bList=2*(3, 2) -> cList=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMulBatchedCPU1()
{
/* create list */
XList * aList = new XList();
XList * bList = new XList();
XList * cList = new XList();
/* a source tensor of size (2, 3) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 3;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a source tensor of size (3, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 3;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a target tensor of size (2, 2) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 2;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
{-4.0F, 3.0F, 2.0F} };
DTYPE bData1[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE bData2[3][2] = { {0.0F, 1.0F},
{3.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer1[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
DTYPE answer2[2][2] = { {-12.0F, -6.0F},
{13.0F, 4.0F} };
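/* sanity check of answer1's first row: 1*0 + 2*1 + 3*2 = 8 and
   1*(-1) + 2*2 + 3*1 = 6 */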
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a1 = NewTensor(aOrder, aDimSize);
XTensor * a2 = NewTensor(aOrder, aDimSize);
XTensor * b1 = NewTensor(bOrder, bDimSize);
XTensor * b2 = NewTensor(bOrder, bDimSize);
XTensor * c1 = NewTensor(cOrder, cDimSize);
XTensor * c2 = NewTensor(cOrder, cDimSize);
/* initialize variables */
a1->SetData(aData1, aUnitNum);
a2->SetData(aData2, aUnitNum);
b1->SetData(bData1, bUnitNum);
b2->SetData(bData2, bUnitNum);
c1->SetZeroAll();
c2->SetZeroAll();
/* add tensors to list */
aList->Add(a1);
aList->Add(a2);
bList->Add(b1);
bList->Add(b2);
cList->Add(c1);
cList->Add(c2);
/* call MatrixMULBatchedCPU function */
_MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
/* check results */
cpuTest = c1->CheckData(answer1, cUnitNum) && c2->CheckData(answer2, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU1 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aGPU2 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU1 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU2 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU1 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU2 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
aGPU1->SetData(aData1, aUnitNum);
aGPU2->SetData(aData2, aUnitNum);
bGPU1->SetData(bData1, bUnitNum);
bGPU2->SetData(bData2, bUnitNum);
cGPU1->SetZeroAll();
cGPU2->SetZeroAll();
/* clear list */
aList->Clear();
bList->Clear();
cList->Clear();
/* add tensors to list */
aList->Add(aGPU1);
aList->Add(aGPU2);
bList->Add(bGPU1);
bList->Add(bGPU2);
cList->Add(cGPU1);
cList->Add(cGPU2);
/* call MatrixMULBatchedCPU function */
_MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
/* check results */
gpuTest = cGPU1->CheckData(answer1, cUnitNum) && gpuTest;
gpuTest = cGPU2->CheckData(answer2, cUnitNum) && gpuTest;
/* destroy variables */
delete a1;
delete a2;
delete b1;
delete b2;
delete c1;
delete c2;
delete aGPU1;
delete aGPU2;
delete bGPU1;
delete bGPU2;
delete cGPU1;
delete cGPU2;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a1;
delete a2;
delete b1;
delete b2;
delete c1;
delete c2;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for MatrixMulBatchedCPU Function */
extern "C"
bool TestMatrixMulBatchedCPU()
{
XPRINT(0, stdout, "[TEST MATRIXMULBATCHEDCPU] matrix multiplication in batch mode (CPU code) \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestMatrixMulBatchedCPU1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#ifndef __TEST_MATRIXMULBATCHEDCPU_H__
#define __TEST_MATRIXMULBATCHEDCPU_H__
#include "../core/arithmetic/MatrixMULBatchedCPU.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MatrixMulBatchedCPU Function */
extern "C"
bool TestMatrixMulBatchedCPU();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_MATRIXMULBATCHEDCPU_H__
......@@ -40,7 +40,6 @@ bool Test()
wrong = !TestMatrixMul2D() || wrong;
wrong = !TestMatrixMul2DParallel() || wrong;
wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMatrixMulBatchedCPU() || wrong;
wrong = !TestMerge() || wrong;
wrong = !TestMultiply() || wrong;
wrong = !TestNegate() || wrong;
......
......@@ -33,7 +33,6 @@
#include "TMatrixMul2D.h"
#include "TMatrixMul2DParallel.h"
#include "TMatrixMulBatched.h"
#include "TMatrixMULBatchedCPU.h"
#include "TMerge.h"
#include "TMultiply.h"
#include "TNegate.h"
......