Commit a027f72e by xiaotong

better code of MatrixMul batched

parent 5c0d8bfd
...@@ -262,12 +262,16 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit ...@@ -262,12 +262,16 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
} }
#ifdef USE_CUDA #ifdef USE_CUDA
else if (devIDT >= 0 && devIDS < 0) { else if (devIDT >= 0 && devIDS < 0) {
CheckNTErrors((cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice) == cudaSuccess), cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice);
"cudaMemcpy2D error (cudaMemcpyHostToDevice)"); if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
}
} }
else if (devIDT < 0 && devIDS >= 0) { else if (devIDT < 0 && devIDS >= 0) {
CheckNTErrors((cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost) == cudaSuccess), cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost);
"cudaMemcpy error (cudaMemcpyDeviceToHost)"); if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
} }
else { else {
cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice); cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice);
......
...@@ -43,7 +43,6 @@ ...@@ -43,7 +43,6 @@
#include "arithmetic/MatrixMul2DMultiTheading.h" #include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h" #include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h" #include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/MatrixMULBatchedCPU.h"
#include "shape/Merge.h" #include "shape/Merge.h"
#include "shape/MergeBlockLists.h" #include "shape/MergeBlockLists.h"
#include "arithmetic/Multiply.h" #include "arithmetic/Multiply.h"
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "MatrixMULBatchedCPU.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication in batch mode (CPU code, optionally backed by BLAS)
c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
The three lists must have the same length; an empty batch is a no-op.
>> a - list of input matrices (2d tensors)
>> transposedA - indicate whether the matrices in a are transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrices in b are transposed
>> c - list of output matrices (2d tensors), one per (a_i, b_i) pair
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
                          const XList * b, MATRIX_TRANS_TYPE transposedB,
                          XList * c, DTYPE alpha, DTYPE beta)
{
    CheckNTErrors(a && b && c, "Empty input lists!");
    CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");

    if (a->count == 0)
        return;

    /* NOTE: an earlier version scanned the lists to compute an "isUniform" flag
       (all items sharing one shape), but the flag was never read afterwards,
       so the scan has been dropped. Every item is multiplied independently. */

    for (int i = 0; i < a->count; i++) {
        XTensor * ai = (XTensor*)a->GetItem(i);
        XTensor * bi = (XTensor*)b->GetItem(i);
        XTensor * ci = (XTensor*)c->GetItem(i);

        /* fail with a message instead of dereferencing a null item below */
        CheckNTErrors((ai && bi && ci), "Empty tensor in the input lists!");
        CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
        CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
        CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");

#ifdef USE_BLAS
        /* the BLAS path is taken only when compiled in AND enabled at run time */
        if (useBLAS) {
            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
            continue;
        }
#endif
        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
    }
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __MATRIXMULBATCHEDCPU_H__
#define __MATRIXMULBATCHEDCPU_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication in batch mode (CPU code)
a, b and c are lists of 2d tensors; the i-th item of c receives the product
of the i-th items of a and b. alpha defaults to 1 and beta to 0.
*/
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMULBATCHEDCPU_H__
\ No newline at end of file
...@@ -24,8 +24,8 @@ ...@@ -24,8 +24,8 @@
#include "../../XName.h" #include "../../XName.h"
#include "MatrixMul.h" #include "MatrixMul.h"
#include "MatrixMul2D.h" #include "MatrixMul2D.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
#include "MatrixMulBatched.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -156,7 +156,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -156,7 +156,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
} }
else { else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
_MatrixMULBatchedCPU(aList, transposedA, _MatrixMulBatchedCPU(aList, transposedA,
bList, transposedB, bList, transposedB,
cList, alpha, beta); cList, alpha, beta);
} }
......
...@@ -23,8 +23,8 @@ ...@@ -23,8 +23,8 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XName.h" #include "../../XName.h"
#include "MatrixMulBatched.h" #include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
#include "MatrixMul2D.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -57,10 +57,42 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -57,10 +57,42 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order == b->order && a->order == c->order), CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!"); "Input tensor and output tensor must have same order!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0)
_MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta); _MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
return; else
} _MatrixMulBatchedCPU(a, transposedA, b, transposedB, c, alpha, beta);
}
/*
matrix multiplication of the two tensors
optimized for GPU
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
{
#ifdef USE_CUDA
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order >= 2!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1]; int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0]; int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
...@@ -85,88 +117,20 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -85,88 +117,20 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSizeRDI[i]; blockNum *= a->dimSizeRDI[i];
} }
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
XTensor * tensorBuf = new XTensor[blockNum * 3];
XTensor * aBuf = tensorBuf;
XTensor * bBuf = tensorBuf + blockNum;
XTensor * cBuf = tensorBuf + blockNum * 2;
for (int p = 0; p < blockNum; p++) {
void * ap = (char*)a->data + aRealBlockSize * p;
void * bp = (char*)b->data + bRealBlockSize * p;
void * cp = (char*)c->data + cRealBlockSize * p;
XTensor * ai = aBuf + p;
XTensor * bi = bBuf + p;
XTensor * ci = cBuf + p;
InitTensor(ai, 2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
InitTensor(bi, 2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
InitTensor(ci, 2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
ai->data = ap;
bi->data = bp;
ci->data = cp;
aList->Add(ai);
bList->Add(bi);
cList->Add(ci);
}
if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
#ifdef USE_CUDA
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The code must be run on the same GPU!");
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID); cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
_CudaBLASMatrixMULList(handle, _CudaBLASMatrixMULBatchedStrided(handle,
aList, transposedA, a->data, transposedA, a->dataType, aBlockSize,
bList, transposedB, b->data, transposedB, b->dataType, bBlockSize,
cList, aList->count, c->data, c->dataType, cBlockSize, blockNum,
alpha, beta); a->dimSizeRDI[1], a->dimSizeRDI[0],
b->dimSizeRDI[1], b->dimSizeRDI[0],
BacktoCudaDev(a->devID, devIDBackup); c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif #endif
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
for (int i = 0; i < aList->count; i++) {
XTensor * ai = (XTensor*)aList->GetItem(i);
ai->data = NULL;;
}
for (int i = 0; i < bList->count; i++) {
XTensor * bi = (XTensor*)bList->GetItem(i);
bi->data = NULL;
}
for (int i = 0; i < cList->count; i++) {
XTensor * ci = (XTensor*)cList->GetItem(i);
ci->data = NULL;
}
delete[] tensorBuf;
delete aList;
delete bList;
delete cList;
} }
/* /*
matrix multiplication of the two tensors matrix multiplication of the two tensors
optimized for GPU optimized for CPU
for each 2-dimensional data array in a (denoted as ai) and for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have each 2-dimensional data array in b (denoted as bi), we have
...@@ -180,21 +144,19 @@ where trans() returns the transposed matrix if the flag is fired ...@@ -180,21 +144,19 @@ where trans() returns the transposed matrix if the flag is fired
>> c - where we keep a*b >> c - where we keep a*b
>> alpha - a coefficient >> alpha - a coefficient
>> beta - another coefficient >> beta - another coefficient
>> parallelRunner - parallel processing module
*/ */
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner) XTensor * c, DTYPE alpha, DTYPE beta)
{ {
#ifdef USE_CUDA CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType), CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!"); "Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2), CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order >= 2!"); "Input tensors must have a order >= 2!");
CheckNTErrors((a->order == b->order && a->order == c->order), CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!"); "Input tensor and output tensor must have same order!");
CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1]; int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0]; int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
...@@ -219,16 +181,90 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -219,16 +181,90 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSizeRDI[i]; blockNum *= a->dimSizeRDI[i];
} }
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID); int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
_CudaBLASMatrixMULBatchedStrided(handle, int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
a->data, transposedA, a->dataType, aBlockSize, int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
b->data, transposedB, b->dataType, bBlockSize,
c->data, c->dataType, cBlockSize, blockNum, XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
a->dimSizeRDI[1], a->dimSizeRDI[0], XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
b->dimSizeRDI[1], b->dimSizeRDI[0], XTensor * ci = NewTensor2D(cDimSize[0], cDimSize[1], c->dataType, c->devID, c->mem);
c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
for (int i = 0; i < blockNum; i++) {
ai->data = (char*)a->data + i * aRealBlockSize;
bi->data = (char*)b->data + i * bRealBlockSize;
ci->data = (char*)c->data + i * cRealBlockSize;
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif #endif
}
ai->data = NULL;
bi->data = NULL;
ci->data = NULL;
delete ai;
delete bi;
delete ci;
}
/*
matrix multiplication in batch mode for list inputs (BLAS)
c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> a - list of input matrices (2d tensors)
>> transposedA - indicate whether the matrix a is transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrix b is transposed
>> c - output matrix (2d tensor)
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
if (a->count == 0)
return;
bool isUniform = true;
for (int i = 1; i < a->count; i++) {
XTensor * aim = (XTensor*)a->GetItem(i - 1);
XTensor * bim = (XTensor*)b->GetItem(i - 1);
XTensor * cim = (XTensor*)c->GetItem(i - 1);
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!XTensor::IsSameShaped(aim, ai) ||
!XTensor::IsSameShaped(bim, bi) ||
!XTensor::IsSameShaped(cim, ci))
{
isUniform = false;
break;
}
}
for (int i = 0; i < a->count; i++) {
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
}
} }
/* /*
......
...@@ -43,7 +43,21 @@ matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * b ...@@ -43,7 +43,21 @@ matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * b
optimized for GPU optimized for GPU
*/ */
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
optimized for CPU
alpha defaults to 1 and beta to 0
*/
void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for CPU
a, b and c are lists of tensors; alpha defaults to 1 and beta to 0
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/* /*
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#include "TMatrixMULBatchedCPU.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: matrix multiplication in batch mode (CPU code).
In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
Returns true when both products match the expected answers on host tensors and,
when USE_CUDA is defined, on the second set of tensors as well.
*/
bool TestMatrixMulBatchedCPU1()
{
    /* create lists; the tensors added to them are deleted explicitly below */
    XList * aList = new XList();
    XList * bList = new XList();
    XList * cList = new XList();

    /* a source tensor of size (2, 3) */
    const int aOrder = 2;
    int aDimSize[aOrder] = {2, 3};
    const int aUnitNum = aDimSize[0] * aDimSize[1];

    /* a source tensor of size (3, 2) */
    const int bOrder = 2;
    int bDimSize[bOrder] = {3, 2};
    const int bUnitNum = bDimSize[0] * bDimSize[1];

    /* a target tensor of size (2, 2) */
    const int cOrder = 2;
    int cDimSize[cOrder] = {2, 2};
    const int cUnitNum = cDimSize[0] * cDimSize[1];

    DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
                           {-4.0F, 5.0F, 6.0F} };
    DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
                           {-4.0F, 3.0F, 2.0F} };
    DTYPE bData1[3][2] = { {0.0F, -1.0F},
                           {1.0F, 2.0F},
                           {2.0F, 1.0F} };
    DTYPE bData2[3][2] = { {0.0F, 1.0F},
                           {3.0F, 2.0F},
                           {2.0F, 1.0F} };
    DTYPE answer1[2][2] = { {8.0F, 6.0F},
                            {17.0F, 20.0F} };
    DTYPE answer2[2][2] = { {-12.0F, -6.0F},
                            {13.0F, 4.0F} };

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * a1 = NewTensor(aOrder, aDimSize);
    XTensor * a2 = NewTensor(aOrder, aDimSize);
    XTensor * b1 = NewTensor(bOrder, bDimSize);
    XTensor * b2 = NewTensor(bOrder, bDimSize);
    XTensor * c1 = NewTensor(cOrder, cDimSize);
    XTensor * c2 = NewTensor(cOrder, cDimSize);

    /* initialize variables (b tensors are sized by bUnitNum, not aUnitNum) */
    a1->SetData(aData1, aUnitNum);
    a2->SetData(aData2, aUnitNum);
    b1->SetData(bData1, bUnitNum);
    b2->SetData(bData2, bUnitNum);
    c1->SetZeroAll();
    c2->SetZeroAll();

    /* add tensors to list */
    aList->Add(a1);
    aList->Add(a2);
    bList->Add(b1);
    bList->Add(b2);
    cList->Add(c1);
    cList->Add(c2);

    /* call MatrixMULBatchedCPU function */
    _MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);

    /* check results */
    cpuTest = c1->CheckData(answer1, cUnitNum) && c2->CheckData(answer2, cUnitNum);

    /* stays true when the GPU part is compiled out, so the result is cpuTest alone */
    bool gpuTest = true;

#ifdef USE_CUDA
    /* GPU test */

    /* create tensors (the trailing 0 is presumably the device id - TODO confirm) */
    XTensor * aGPU1 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * aGPU2 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * bGPU1 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
    XTensor * bGPU2 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
    XTensor * cGPU1 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
    XTensor * cGPU2 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
    aGPU1->SetData(aData1, aUnitNum);
    aGPU2->SetData(aData2, aUnitNum);
    bGPU1->SetData(bData1, bUnitNum);
    bGPU2->SetData(bData2, bUnitNum);
    cGPU1->SetZeroAll();
    cGPU2->SetZeroAll();

    /* clear list */
    aList->Clear();
    bList->Clear();
    cList->Clear();

    /* add tensors to list */
    aList->Add(aGPU1);
    aList->Add(aGPU2);
    bList->Add(bGPU1);
    bList->Add(bGPU2);
    cList->Add(cGPU1);
    cList->Add(cGPU2);

    /* call MatrixMULBatchedCPU function */
    /* NOTE(review): the CPU entry point is invoked on these tensors as in the
       original test - confirm that this is the intended coverage. */
    _MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);

    /* check results */
    gpuTest = cGPU1->CheckData(answer1, cUnitNum) && gpuTest;
    gpuTest = cGPU2->CheckData(answer2, cUnitNum) && gpuTest;

    /* destroy GPU variables */
    delete aGPU1;
    delete aGPU2;
    delete bGPU1;
    delete bGPU2;
    delete cGPU1;
    delete cGPU2;
#endif // USE_CUDA

    /* destroy variables */
    delete a1;
    delete a2;
    delete b1;
    delete b2;
    delete c1;
    delete c2;

    /* the lists themselves were previously leaked */
    delete aList;
    delete bList;
    delete cList;

    return cpuTest && gpuTest;
}
/* other cases */
/*
TODO!!
*/
/*
test driver for the MatrixMulBatchedCPU function:
runs every test case, prints a pass/fail line per case plus a summary,
and returns true only if all cases passed
*/
extern "C"
bool TestMatrixMulBatchedCPU()
{
    XPRINT(0, stdout, "[TEST MATRIXMULBATCHEDCPU] matrix multiplication in batch mode (CPU code) \n");

    bool allPassed = true;

    /* case 1 test */
    const bool case1Passed = TestMatrixMulBatchedCPU1();
    if (case1Passed) {
        XPRINT(0, stdout, ">> case 1 passed!\n");
    }
    else {
        allPassed = false;
        XPRINT(0, stdout, ">> case 1 failed!\n");
    }

    /* other cases test */
    /*
    TODO!!
    */

    /* summary */
    if (!allPassed) {
        XPRINT(0, stdout, ">> Failed!\n");
    }
    else {
        XPRINT(0, stdout, ">> All Passed!\n");
    }

    XPRINT(0, stdout, "\n");

    return allPassed;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#ifndef __TEST_MATRIXMULBATCHEDCPU_H__
#define __TEST_MATRIXMULBATCHEDCPU_H__
#include "../core/arithmetic/MatrixMULBatchedCPU.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
test for MatrixMulBatchedCPU Function
returns true when all test cases pass; declared with C linkage
*/
extern "C"
bool TestMatrixMulBatchedCPU();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_MATRIXMULBATCHEDCPU_H__
...@@ -40,7 +40,6 @@ bool Test() ...@@ -40,7 +40,6 @@ bool Test()
wrong = !TestMatrixMul2D() || wrong; wrong = !TestMatrixMul2D() || wrong;
wrong = !TestMatrixMul2DParallel() || wrong; wrong = !TestMatrixMul2DParallel() || wrong;
wrong = !TestMatrixMulBatched() || wrong; wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMatrixMulBatchedCPU() || wrong;
wrong = !TestMerge() || wrong; wrong = !TestMerge() || wrong;
wrong = !TestMultiply() || wrong; wrong = !TestMultiply() || wrong;
wrong = !TestNegate() || wrong; wrong = !TestNegate() || wrong;
......
...@@ -33,7 +33,6 @@ ...@@ -33,7 +33,6 @@
#include "TMatrixMul2D.h" #include "TMatrixMul2D.h"
#include "TMatrixMul2DParallel.h" #include "TMatrixMul2DParallel.h"
#include "TMatrixMulBatched.h" #include "TMatrixMulBatched.h"
#include "TMatrixMULBatchedCPU.h"
#include "TMerge.h" #include "TMerge.h"
#include "TMultiply.h" #include "TMultiply.h"
#include "TNegate.h" #include "TNegate.h"
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论