Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
杨迪
NiuTrans.Tensor
Commits
a027f72e
Commit
a027f72e
authored
Jul 27, 2018
by
xiaotong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
better code of MatrixMul batched
parent
5c0d8bfd
显示空白字符变更
内嵌
并排
正在显示
12 个修改的文件
包含
155 行增加
和
487 行删除
+155
-487
source/tensor/XUtility.cpp
+8
-4
source/tensor/core/CHeader.h
+0
-1
source/tensor/core/arithmetic/MatrixMULBatchedCPU.cpp
+0
-86
source/tensor/core/arithmetic/MatrixMULBatchedCPU.h
+0
-36
source/tensor/core/arithmetic/MatrixMul.cpp
+2
-2
source/tensor/core/arithmetic/MatrixMulBatched.cpp
+130
-94
source/tensor/core/arithmetic/MatrixMulBatched.h
+15
-1
source/tensor/core/shape/Split.cpp
+0
-0
source/tensor/test/TMatrixMULBatchedCPU.cpp
+0
-227
source/tensor/test/TMatrixMULBatchedCPU.h
+0
-34
source/tensor/test/Test.cpp
+0
-1
source/tensor/test/Test.h
+0
-1
没有找到文件。
source/tensor/XUtility.cpp
查看文件 @
a027f72e
...
@@ -262,12 +262,16 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
...
@@ -262,12 +262,16 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
}
}
#ifdef USE_CUDA
#ifdef USE_CUDA
else
if
(
devIDT
>=
0
&&
devIDS
<
0
)
{
else
if
(
devIDT
>=
0
&&
devIDS
<
0
)
{
CheckNTErrors
((
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyHostToDevice
)
==
cudaSuccess
),
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyHostToDevice
);
"cudaMemcpy2D error (cudaMemcpyHostToDevice)"
);
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy2D error (cudaMemcpyHostToDevice)"
);
}
}
}
else
if
(
devIDT
<
0
&&
devIDS
>=
0
)
{
else
if
(
devIDT
<
0
&&
devIDS
>=
0
)
{
CheckNTErrors
((
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToHost
)
==
cudaSuccess
),
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToHost
);
"cudaMemcpy error (cudaMemcpyDeviceToHost)"
);
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToHost)"
);
}
}
}
else
{
else
{
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToDevice
);
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToDevice
);
...
...
source/tensor/core/CHeader.h
查看文件 @
a027f72e
...
@@ -43,7 +43,6 @@
...
@@ -43,7 +43,6 @@
#include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/MatrixMULBatchedCPU.h"
#include "shape/Merge.h"
#include "shape/Merge.h"
#include "shape/MergeBlockLists.h"
#include "shape/MergeBlockLists.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/Multiply.h"
...
...
source/tensor/core/arithmetic/MatrixMULBatchedCPU.cpp
deleted
100644 → 0
查看文件 @
5c0d8bfd
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "MatrixMULBatchedCPU.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/*
matrix multiplication in batch mode (CPU code)
c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
Each triple (a_i, b_i, c_i) is handed to _MatrixMULCPU when the code is built
with USE_BLAS and the useBLAS flag is set, and to _MatrixMul2D otherwise.
>> a - list of input matrices (2d tensors)
>> transposedA - indicate whether the matrices in a are transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrices in b are transposed
>> c - list of output matrices (2d tensors), one per input pair
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
                          const XList * b, MATRIX_TRANS_TYPE transposedB,
                          XList * c, DTYPE alpha, DTYPE beta)
{
    CheckNTErrors(a && b && c, "Empty input lists!");
    CheckNTErrors(a->count == b->count && a->count == c->count,
                  "Input lists must be of the same size!");

    /* nothing to do for empty lists */
    if (a->count == 0)
        return;

    /* An earlier version scanned neighbouring items with XTensor::IsSameShaped
       to fill an "isUniform" flag, but the flag was never read afterwards.
       The scan has been dropped.
       NOTE(review): assumes IsSameShaped has no side effects - confirm. */

    for (int i = 0; i < a->count; i++) {
        XTensor * ai = (XTensor*)a->GetItem(i);
        XTensor * bi = (XTensor*)b->GetItem(i);
        XTensor * ci = (XTensor*)c->GetItem(i);

        /* every list item has to be a plain matrix */
        CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
        CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
        CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");

#ifdef USE_BLAS
        if (useBLAS)
            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
        else
            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
    }
}
}
//
namespace
nts
(
NiuTrans
.
Tensor
)
\ No newline at end of file
source/tensor/core/arithmetic/MatrixMULBatchedCPU.h
deleted
100644 → 0
查看文件 @
5c0d8bfd
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __MATRIXMULBATCHEDCPU_H__
#define __MATRIXMULBATCHEDCPU_H__
#include "../../XTensor.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */
void
_MatrixMULBatchedCPU
(
const
XList
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XList
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XList
*
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
);
}
// namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMULBATCHEDCPU_H__
\ No newline at end of file
source/tensor/core/arithmetic/MatrixMul.cpp
查看文件 @
a027f72e
...
@@ -24,8 +24,8 @@
...
@@ -24,8 +24,8 @@
#include "../../XName.h"
#include "../../XName.h"
#include "MatrixMul.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h"
#include "XTensorBLAS.h"
#include "MatrixMulBatched.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
@@ -156,7 +156,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
...
@@ -156,7 +156,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
}
else
{
else
{
CheckNTErrors
((
a
->
dataType
==
DEFAULT_DTYPE
),
"TODO!"
);
CheckNTErrors
((
a
->
dataType
==
DEFAULT_DTYPE
),
"TODO!"
);
_MatrixM
UL
BatchedCPU
(
aList
,
transposedA
,
_MatrixM
ul
BatchedCPU
(
aList
,
transposedA
,
bList
,
transposedB
,
bList
,
transposedB
,
cList
,
alpha
,
beta
);
cList
,
alpha
,
beta
);
}
}
...
...
source/tensor/core/arithmetic/MatrixMulBatched.cpp
查看文件 @
a027f72e
...
@@ -23,8 +23,8 @@
...
@@ -23,8 +23,8 @@
#include "../../XDevice.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../../XName.h"
#include "MatrixMulBatched.h"
#include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
@@ -57,10 +57,42 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
...
@@ -57,10 +57,42 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
"Input tensor and output tensor must have same order!"
);
"Input tensor and output tensor must have same order!"
);
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
_MatrixMulBatchedGPU
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
);
_MatrixMulBatchedGPU
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
);
return
;
else
}
_MatrixMulBatchedCPU
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
);
}
/*
matrix multiplication of the two tensors
optimized for GPU
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
*/
void
_MatrixMulBatchedGPU
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
*
c
,
DTYPE
alpha
,
DTYPE
beta
)
{
#ifdef USE_CUDA
CheckNTErrors
((
a
&&
b
&&
c
),
"Empty input tensors!"
);
CheckNTErrors
((
a
->
dataType
==
b
->
dataType
&&
a
->
dataType
==
c
->
dataType
),
"Input tensors should have the same data type!"
);
CheckNTErrors
((
a
->
order
>=
2
&&
b
->
order
>=
2
&&
c
->
order
>=
2
),
"Input tensors must have a order >= 2!"
);
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
"Input tensor and output tensor must have same order!"
);
CheckNTErrors
(
a
->
devID
>=
0
&&
b
->
devID
>=
0
&&
c
->
devID
>=
0
,
"The tensors must be on GPUs"
);
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSizeRDI
[
0
]
:
a
->
dimSizeRDI
[
1
];
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSizeRDI
[
0
]
:
a
->
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSizeRDI
[
1
]
:
a
->
dimSizeRDI
[
0
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSizeRDI
[
1
]
:
a
->
dimSizeRDI
[
0
];
...
@@ -85,88 +117,20 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
...
@@ -85,88 +117,20 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum
*=
a
->
dimSizeRDI
[
i
];
blockNum
*=
a
->
dimSizeRDI
[
i
];
}
}
XList
*
aList
=
new
XList
(
10
);
XList
*
bList
=
new
XList
(
10
);
XList
*
cList
=
new
XList
(
10
);
int
aDimSize
[
2
]
=
{
-
a
->
dimSizeRDI
[
1
],
a
->
dimSizeRDI
[
0
]};
int
bDimSize
[
2
]
=
{
-
b
->
dimSizeRDI
[
1
],
b
->
dimSizeRDI
[
0
]};
int
cDimSize
[
2
]
=
{
-
c
->
dimSizeRDI
[
1
],
c
->
dimSizeRDI
[
0
]};
XTensor
*
tensorBuf
=
new
XTensor
[
blockNum
*
3
];
XTensor
*
aBuf
=
tensorBuf
;
XTensor
*
bBuf
=
tensorBuf
+
blockNum
;
XTensor
*
cBuf
=
tensorBuf
+
blockNum
*
2
;
for
(
int
p
=
0
;
p
<
blockNum
;
p
++
)
{
void
*
ap
=
(
char
*
)
a
->
data
+
aRealBlockSize
*
p
;
void
*
bp
=
(
char
*
)
b
->
data
+
bRealBlockSize
*
p
;
void
*
cp
=
(
char
*
)
c
->
data
+
cRealBlockSize
*
p
;
XTensor
*
ai
=
aBuf
+
p
;
XTensor
*
bi
=
bBuf
+
p
;
XTensor
*
ci
=
cBuf
+
p
;
InitTensor
(
ai
,
2
,
aDimSize
,
a
->
dataType
,
a
->
denseRatio
,
a
->
devID
,
a
->
mem
);
InitTensor
(
bi
,
2
,
bDimSize
,
b
->
dataType
,
b
->
denseRatio
,
b
->
devID
,
b
->
mem
);
InitTensor
(
ci
,
2
,
cDimSize
,
c
->
dataType
,
c
->
denseRatio
,
c
->
devID
,
c
->
mem
);
ai
->
data
=
ap
;
bi
->
data
=
bp
;
ci
->
data
=
cp
;
aList
->
Add
(
ai
);
bList
->
Add
(
bi
);
cList
->
Add
(
ci
);
}
if
(
a
->
devID
>=
0
&&
b
->
devID
>=
0
&&
c
->
devID
>=
0
)
{
#ifdef USE_CUDA
CheckNTErrors
((
a
->
devID
==
b
->
devID
&&
a
->
devID
==
c
->
devID
),
"The code must be run on the same GPU!"
);
int
devIDBackup
;
ProtectCudaDev
(
a
->
devID
,
devIDBackup
);
cublasHandle_t
*
handle
=
a
->
mem
!=
NULL
?
a
->
mem
->
GetCublasHandle
()
:
GDevs
.
GetCudaHandle
(
a
->
devID
);
cublasHandle_t
*
handle
=
a
->
mem
!=
NULL
?
a
->
mem
->
GetCublasHandle
()
:
GDevs
.
GetCudaHandle
(
a
->
devID
);
_CudaBLASMatrixMULList
(
handle
,
_CudaBLASMatrixMULBatchedStrided
(
handle
,
aList
,
transposedA
,
a
->
data
,
transposedA
,
a
->
dataType
,
aBlockSize
,
bList
,
transposedB
,
b
->
data
,
transposedB
,
b
->
dataType
,
bBlockSize
,
cList
,
aList
->
count
,
c
->
data
,
c
->
dataType
,
cBlockSize
,
blockNum
,
alpha
,
beta
);
a
->
dimSizeRDI
[
1
],
a
->
dimSizeRDI
[
0
],
b
->
dimSizeRDI
[
1
],
b
->
dimSizeRDI
[
0
],
BacktoCudaDev
(
a
->
devID
,
devIDBackup
);
c
->
dimSizeRDI
[
1
],
c
->
dimSizeRDI
[
0
],
alpha
,
beta
);
#else
ShowNTErrors
(
"Please specify USE_CUDA and recompile the code!"
);
#endif
#endif
}
else
{
CheckNTErrors
((
a
->
dataType
==
DEFAULT_DTYPE
),
"TODO!"
);
_MatrixMULBatchedCPU
(
aList
,
transposedA
,
bList
,
transposedB
,
cList
,
alpha
,
beta
);
}
for
(
int
i
=
0
;
i
<
aList
->
count
;
i
++
)
{
XTensor
*
ai
=
(
XTensor
*
)
aList
->
GetItem
(
i
);
ai
->
data
=
NULL
;;
}
for
(
int
i
=
0
;
i
<
bList
->
count
;
i
++
)
{
XTensor
*
bi
=
(
XTensor
*
)
bList
->
GetItem
(
i
);
bi
->
data
=
NULL
;
}
for
(
int
i
=
0
;
i
<
cList
->
count
;
i
++
)
{
XTensor
*
ci
=
(
XTensor
*
)
cList
->
GetItem
(
i
);
ci
->
data
=
NULL
;
}
delete
[]
tensorBuf
;
delete
aList
;
delete
bList
;
delete
cList
;
}
}
/*
/*
matrix multiplication of the two tensors
matrix multiplication of the two tensors
optimized for
G
PU
optimized for
C
PU
for each 2-dimensional data array in a (denoted as ai) and
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
each 2-dimensional data array in b (denoted as bi), we have
...
@@ -180,21 +144,19 @@ where trans() returns the transposed matrix if the flag is fired
...
@@ -180,21 +144,19 @@ where trans() returns the transposed matrix if the flag is fired
>> c - where we keep a*b
>> c - where we keep a*b
>> alpha - a coefficient
>> alpha - a coefficient
>> beta - another coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
*/
void
_MatrixMulBatched
G
PU
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
void
_MatrixMulBatched
C
PU
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
*
c
,
DTYPE
alpha
,
DTYPE
beta
,
XPRunner
*
parallelRunner
)
XTensor
*
c
,
DTYPE
alpha
,
DTYPE
beta
)
{
{
#ifdef USE_CUDA
CheckNTErrors
((
a
&&
b
&&
c
),
"Empty input tensors!"
);
CheckNTErrors
((
a
&&
b
&&
c
),
"Empty input tensors!"
);
CheckNTErrors
((
a
->
dataType
==
b
->
dataType
&&
a
->
dataType
==
c
->
dataType
),
CheckNTErrors
((
a
->
dataType
==
b
->
dataType
&&
a
->
dataType
==
c
->
dataType
),
"Input tensors should have the same data type!"
);
"Input tensors should have the same data type!"
);
CheckNTErrors
((
a
->
order
>=
2
&&
b
->
order
>=
2
&&
c
->
order
>=
2
),
CheckNTErrors
((
a
->
order
>=
2
&&
b
->
order
>=
2
&&
c
->
order
>=
2
),
"Input tensors must have a order >= 2!"
);
"Input tensors must have a order >= 2!"
);
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
"Input tensor and output tensor must have same order!"
);
"Input tensor and output tensor must have same order!"
);
CheckNTErrors
(
a
->
devID
>=
0
&&
b
->
devID
>=
0
&&
c
->
devID
>=
0
,
"The tensors must be on GPUs"
);
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSizeRDI
[
0
]
:
a
->
dimSizeRDI
[
1
];
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSizeRDI
[
0
]
:
a
->
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSizeRDI
[
1
]
:
a
->
dimSizeRDI
[
0
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSizeRDI
[
1
]
:
a
->
dimSizeRDI
[
0
];
...
@@ -219,16 +181,90 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
...
@@ -219,16 +181,90 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum
*=
a
->
dimSizeRDI
[
i
];
blockNum
*=
a
->
dimSizeRDI
[
i
];
}
}
cublasHandle_t
*
handle
=
a
->
mem
!=
NULL
?
a
->
mem
->
GetCublasHandle
()
:
GDevs
.
GetCudaHandle
(
a
->
devID
);
int
aDimSize
[
2
]
=
{
-
a
->
dimSizeRDI
[
1
],
a
->
dimSizeRDI
[
0
]};
_CudaBLASMatrixMULBatchedStrided
(
handle
,
int
bDimSize
[
2
]
=
{
-
b
->
dimSizeRDI
[
1
],
b
->
dimSizeRDI
[
0
]};
a
->
data
,
transposedA
,
a
->
dataType
,
aBlockSize
,
int
cDimSize
[
2
]
=
{
-
c
->
dimSizeRDI
[
1
],
c
->
dimSizeRDI
[
0
]};
b
->
data
,
transposedB
,
b
->
dataType
,
bBlockSize
,
c
->
data
,
c
->
dataType
,
cBlockSize
,
blockNum
,
XTensor
*
ai
=
NewTensor2D
(
aDimSize
[
0
],
aDimSize
[
1
],
a
->
dataType
,
a
->
devID
,
a
->
mem
);
a
->
dimSizeRDI
[
1
],
a
->
dimSizeRDI
[
0
],
XTensor
*
bi
=
NewTensor2D
(
bDimSize
[
0
],
bDimSize
[
1
],
b
->
dataType
,
b
->
devID
,
b
->
mem
);
b
->
dimSizeRDI
[
1
],
b
->
dimSizeRDI
[
0
],
XTensor
*
ci
=
NewTensor2D
(
cDimSize
[
0
],
cDimSize
[
1
],
c
->
dataType
,
c
->
devID
,
c
->
mem
);
c
->
dimSizeRDI
[
1
],
c
->
dimSizeRDI
[
0
],
alpha
,
beta
);
for
(
int
i
=
0
;
i
<
blockNum
;
i
++
)
{
ai
->
data
=
(
char
*
)
a
->
data
+
i
*
aRealBlockSize
;
bi
->
data
=
(
char
*
)
b
->
data
+
i
*
bRealBlockSize
;
ci
->
data
=
(
char
*
)
c
->
data
+
i
*
cRealBlockSize
;
#ifdef USE_BLAS
if
(
useBLAS
)
_MatrixMULCPU
(
ai
,
transposedA
,
bi
,
transposedB
,
ci
,
alpha
,
beta
);
else
_MatrixMul2D
(
ai
,
transposedA
,
bi
,
transposedB
,
ci
,
alpha
,
beta
);
#else
_MatrixMul2D
(
ai
,
transposedA
,
bi
,
transposedB
,
ci
,
alpha
,
beta
);
#endif
#endif
}
ai
->
data
=
NULL
;
bi
->
data
=
NULL
;
ci
->
data
=
NULL
;
delete
ai
;
delete
bi
;
delete
ci
;
}
/*
matrix multiplication in batch mode for list inputs (BLAS)
c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> a - list of input matrices (2d tensors)
>> transposedA - indicate whether the matrix a is transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrix b is transposed
>> c - output matrix (2d tensor)
>> alpha - scalar
>> beta - scalar
*/
void
_MatrixMulBatchedCPU
(
const
XList
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XList
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XList
*
c
,
DTYPE
alpha
,
DTYPE
beta
)
{
CheckNTErrors
(
a
&&
b
&&
c
,
"Empty input lists!"
);
CheckNTErrors
(
a
->
count
==
b
->
count
&&
a
->
count
==
c
->
count
,
"Input lists must be of the same size!"
);
if
(
a
->
count
==
0
)
return
;
bool
isUniform
=
true
;
for
(
int
i
=
1
;
i
<
a
->
count
;
i
++
)
{
XTensor
*
aim
=
(
XTensor
*
)
a
->
GetItem
(
i
-
1
);
XTensor
*
bim
=
(
XTensor
*
)
b
->
GetItem
(
i
-
1
);
XTensor
*
cim
=
(
XTensor
*
)
c
->
GetItem
(
i
-
1
);
XTensor
*
ai
=
(
XTensor
*
)
a
->
GetItem
(
i
);
XTensor
*
bi
=
(
XTensor
*
)
b
->
GetItem
(
i
);
XTensor
*
ci
=
(
XTensor
*
)
c
->
GetItem
(
i
);
if
(
!
XTensor
::
IsSameShaped
(
aim
,
ai
)
||
!
XTensor
::
IsSameShaped
(
bim
,
bi
)
||
!
XTensor
::
IsSameShaped
(
cim
,
ci
))
{
isUniform
=
false
;
break
;
}
}
for
(
int
i
=
0
;
i
<
a
->
count
;
i
++
)
{
XTensor
*
ai
=
(
XTensor
*
)
a
->
GetItem
(
i
);
XTensor
*
bi
=
(
XTensor
*
)
b
->
GetItem
(
i
);
XTensor
*
ci
=
(
XTensor
*
)
c
->
GetItem
(
i
);
CheckNTErrors
((
ai
->
order
==
2
),
"2d tensor (i.e., matrix) is required!"
);
CheckNTErrors
((
bi
->
order
==
2
),
"2d tensor (i.e., matrix) is required!"
);
CheckNTErrors
((
ci
->
order
==
2
),
"2d tensor (i.e., matrix) is required!"
);
#ifdef USE_BLAS
if
(
useBLAS
)
_MatrixMULCPU
(
ai
,
transposedA
,
bi
,
transposedB
,
ci
,
alpha
,
beta
);
else
_MatrixMul2D
(
ai
,
transposedA
,
bi
,
transposedB
,
ci
,
alpha
,
beta
);
#else
_MatrixMul2D
(
ai
,
transposedA
,
bi
,
transposedB
,
ci
,
alpha
,
beta
);
#endif
}
}
}
/*
/*
...
...
source/tensor/core/arithmetic/MatrixMulBatched.h
查看文件 @
a027f72e
...
@@ -43,7 +43,21 @@ matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * b
...
@@ -43,7 +43,21 @@ matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * b
optimized for GPU
optimized for GPU
*/
*/
void
_MatrixMulBatchedGPU
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
void
_MatrixMulBatchedGPU
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
*
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
,
XPRunner
*
parallelRunner
=
NULL
);
XTensor
*
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
);
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
optimized for CPU
*/
void
_MatrixMulBatchedCPU
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
*
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
);
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for CPU
*/
void
_MatrixMulBatchedCPU
(
const
XList
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XList
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XList
*
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
);
/*
/*
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
...
...
source/tensor/core/shape/Split.cpp
查看文件 @
a027f72e
source/tensor/test/TMatrixMULBatchedCPU.cpp
deleted
100644 → 0
查看文件 @
5c0d8bfd
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#include "TMatrixMULBatchedCPU.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/*
case 1: matrix multiplication in batch mode (CPU code).
In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
<< return - true if every checked result matches the expected answer
*/
bool TestMatrixMulBatchedCPU1()
{
    /* create the lists that carry the operands into the batched routine */
    XList * aList = new XList();
    XList * bList = new XList();
    XList * cList = new XList();

    /* a source tensor of size (2, 3) */
    int aOrder = 2;
    int aDimSize[2] = {2, 3};
    int aUnitNum = 1;
    for (int i = 0; i < aOrder; i++)
        aUnitNum *= aDimSize[i];

    /* a source tensor of size (3, 2) */
    int bOrder = 2;
    int bDimSize[2] = {3, 2};
    int bUnitNum = 1;
    for (int i = 0; i < bOrder; i++)
        bUnitNum *= bDimSize[i];

    /* a target tensor of size (2, 2) */
    int cOrder = 2;
    int cDimSize[2] = {2, 2};
    int cUnitNum = 1;
    for (int i = 0; i < cOrder; i++)
        cUnitNum *= cDimSize[i];

    DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
                           {-4.0F, 5.0F, 6.0F} };
    DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
                           {-4.0F, 3.0F, 2.0F} };
    DTYPE bData1[3][2] = { {0.0F, -1.0F},
                           {1.0F, 2.0F},
                           {2.0F, 1.0F} };
    DTYPE bData2[3][2] = { {0.0F, 1.0F},
                           {3.0F, 2.0F},
                           {2.0F, 1.0F} };
    DTYPE answer1[2][2] = { {8.0F, 6.0F},
                            {17.0F, 20.0F} };
    DTYPE answer2[2][2] = { {-12.0F, -6.0F},
                            {13.0F, 4.0F} };

    /* CPU test */

    /* create tensors */
    XTensor * a1 = NewTensor(aOrder, aDimSize);
    XTensor * a2 = NewTensor(aOrder, aDimSize);
    XTensor * b1 = NewTensor(bOrder, bDimSize);
    XTensor * b2 = NewTensor(bOrder, bDimSize);
    XTensor * c1 = NewTensor(cOrder, cDimSize);
    XTensor * c2 = NewTensor(cOrder, cDimSize);

    /* initialize variables (the b tensors are sized by bUnitNum, not aUnitNum) */
    a1->SetData(aData1, aUnitNum);
    a2->SetData(aData2, aUnitNum);
    b1->SetData(bData1, bUnitNum);
    b2->SetData(bData2, bUnitNum);
    c1->SetZeroAll();
    c2->SetZeroAll();

    /* add tensors to list */
    aList->Add(a1);
    aList->Add(a2);
    bList->Add(b1);
    bList->Add(b2);
    cList->Add(c1);
    cList->Add(c2);

    /* call MatrixMULBatchedCPU function */
    _MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);

    /* check results */
    bool cpuTest = c1->CheckData(answer1, cUnitNum) &&
                   c2->CheckData(answer2, cUnitNum);

    bool passed = cpuTest;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors on device 0 */
    XTensor * aGPU1 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * aGPU2 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * bGPU1 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
    XTensor * bGPU2 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
    XTensor * cGPU1 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
    XTensor * cGPU2 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
    aGPU1->SetData(aData1, aUnitNum);
    aGPU2->SetData(aData2, aUnitNum);
    bGPU1->SetData(bData1, bUnitNum);
    bGPU2->SetData(bData2, bUnitNum);
    cGPU1->SetZeroAll();
    cGPU2->SetZeroAll();

    /* clear list */
    aList->Clear();
    bList->Clear();
    cList->Clear();

    /* add tensors to list */
    aList->Add(aGPU1);
    aList->Add(aGPU2);
    bList->Add(bGPU1);
    bList->Add(bGPU2);
    cList->Add(cGPU1);
    cList->Add(cGPU2);

    /* call MatrixMULBatchedCPU function
       NOTE(review): this hands device tensors to the CPU routine, exactly as
       the original test did - confirm that this is intended */
    _MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);

    /* check results */
    gpuTest = cGPU1->CheckData(answer1, cUnitNum) && gpuTest;
    gpuTest = cGPU2->CheckData(answer2, cUnitNum) && gpuTest;

    passed = cpuTest && gpuTest;

    /* destroy the device tensors */
    delete aGPU1;
    delete aGPU2;
    delete bGPU1;
    delete bGPU2;
    delete cGPU1;
    delete cGPU2;
#endif // USE_CUDA

    /* destroy the host tensors */
    delete a1;
    delete a2;
    delete b1;
    delete b2;
    delete c1;
    delete c2;

    /* release the list containers, which were leaked before.
       NOTE(review): assumes XList does not delete the items it holds - confirm */
    delete aList;
    delete bList;
    delete cList;

    return passed;
}
/* other cases */
/*
TODO!!
*/
/*
test driver for the MatrixMulBatchedCPU function
runs every test case, prints one status line per case plus a summary,
and reports whether all of them succeeded
<< return - true if all cases passed, false otherwise
*/
extern "C"
bool TestMatrixMulBatchedCPU()
{
    XPRINT(0, stdout, "[TEST MATRIXMULBATCHEDCPU] matrix multiplication in batch mode (CPU code)\n");

    bool allPassed = true;

    /* case 1 test */
    bool casePassed = TestMatrixMulBatchedCPU1();
    if (casePassed) {
        XPRINT(0, stdout, ">> case 1 passed!\n");
    }
    else {
        allPassed = false;
        XPRINT(0, stdout, ">> case 1 failed!\n");
    }

    /* other cases test */
    /*
    TODO!!
    */

    /* overall summary */
    if (!allPassed) {
        XPRINT(0, stdout, ">> Failed!\n");
    }
    else {
        XPRINT(0, stdout, ">> All Passed!\n");
    }

    XPRINT(0, stdout, "\n");

    return allPassed;
}
}
// namespace nts(NiuTrans.Tensor)
source/tensor/test/TMatrixMULBatchedCPU.h
deleted
100644 → 0
查看文件 @
5c0d8bfd
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
*/
#ifndef __TEST_MATRIXMULBATCHEDCPU_H__
#define __TEST_MATRIXMULBATCHEDCPU_H__
#include "../core/arithmetic/MatrixMULBatchedCPU.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/* test for MatrixMulBatchedCPU Function */
extern
"C"
bool
TestMatrixMulBatchedCPU
();
}
// namespace nts(NiuTrans.Tensor)
#endif // __TEST_MATRIXMULBATCHEDCPU_H__
source/tensor/test/Test.cpp
查看文件 @
a027f72e
...
@@ -40,7 +40,6 @@ bool Test()
...
@@ -40,7 +40,6 @@ bool Test()
wrong
=
!
TestMatrixMul2D
()
||
wrong
;
wrong
=
!
TestMatrixMul2D
()
||
wrong
;
wrong
=
!
TestMatrixMul2DParallel
()
||
wrong
;
wrong
=
!
TestMatrixMul2DParallel
()
||
wrong
;
wrong
=
!
TestMatrixMulBatched
()
||
wrong
;
wrong
=
!
TestMatrixMulBatched
()
||
wrong
;
wrong
=
!
TestMatrixMulBatchedCPU
()
||
wrong
;
wrong
=
!
TestMerge
()
||
wrong
;
wrong
=
!
TestMerge
()
||
wrong
;
wrong
=
!
TestMultiply
()
||
wrong
;
wrong
=
!
TestMultiply
()
||
wrong
;
wrong
=
!
TestNegate
()
||
wrong
;
wrong
=
!
TestNegate
()
||
wrong
;
...
...
source/tensor/test/Test.h
查看文件 @
a027f72e
...
@@ -33,7 +33,6 @@
...
@@ -33,7 +33,6 @@
#include "TMatrixMul2D.h"
#include "TMatrixMul2D.h"
#include "TMatrixMul2DParallel.h"
#include "TMatrixMul2DParallel.h"
#include "TMatrixMulBatched.h"
#include "TMatrixMulBatched.h"
#include "TMatrixMULBatchedCPU.h"
#include "TMerge.h"
#include "TMerge.h"
#include "TMultiply.h"
#include "TMultiply.h"
#include "TNegate.h"
#include "TNegate.h"
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论