Emmay / NiuTrans.Tensor / Commits
Commit f31bc3fb, authored Jul 26, 2018 by xiaotong
tmp code for batched matrix mul
parent f74b1c17
Showing 3 changed files with 199 additions and 22 deletions:

source/sample/fnnlm/FNNLM.cpp                        +55  -18
source/tensor/core/arithmetic/MatrixMulBatched.cpp   +136 -4
source/tensor/core/arithmetic/MatrixMulBatched.h     +8   -0
source/sample/fnnlm/FNNLM.cpp (view file @ f31bc3fb)
@@ -153,43 +153,80 @@ load arguments
*/
void LoadArgs(int argc, const char ** argv, FNNModel &model)
{
    fprintf(stderr, "args: \n");
    for(int i = 0; i < argc; i++){
        if(!strcmp(argv[i], "-train") && i + 1 < argc){
            strcpy(trainFN, argv[i + 1]);
            fprintf(stderr, " -train=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-model") && i + 1 < argc){
            strcpy(modelFN, argv[i + 1]);
            fprintf(stderr, " -model=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-test") && i + 1 < argc){
            strcpy(testFN, argv[i + 1]);
            fprintf(stderr, " -test=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-output") && i + 1 < argc){
            strcpy(outputFN, argv[i + 1]);
            fprintf(stderr, " -output=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-n") && i + 1 < argc){
            model.n = atoi(argv[i + 1]);
            fprintf(stderr, " -n=%d\n", model.n);
        }
        if(!strcmp(argv[i], "-esize") && i + 1 < argc){
            model.eSize = atoi(argv[i + 1]);
            fprintf(stderr, " -esize=%d\n", model.eSize);
        }
        if(!strcmp(argv[i], "-vsize") && i + 1 < argc){
            model.vSize = atoi(argv[i + 1]);
            fprintf(stderr, " -vsize=%d\n", model.vSize);
        }
        if(!strcmp(argv[i], "-hdepth") && i + 1 < argc){
            model.hDepth = atoi(argv[i + 1]);
            fprintf(stderr, " -hdepth=%d\n", model.hDepth);
        }
        if(!strcmp(argv[i], "-hsize") && i + 1 < argc){
            model.hSize = atoi(argv[i + 1]);
            fprintf(stderr, " -hsize=%d\n", model.hSize);
        }
        if(!strcmp(argv[i], "-lrate") && i + 1 < argc){
            learningRate = (float)atof(argv[i + 1]);
            fprintf(stderr, " -lrate=%f\n", learningRate);
        }
        if(!strcmp(argv[i], "-nstep") && i + 1 < argc){
            nStep = atoi(argv[i + 1]);
            fprintf(stderr, " -nstep=%d\n", nStep);
        }
        if(!strcmp(argv[i], "-nepoch") && i + 1 < argc){
            nEpoch = atoi(argv[i + 1]);
            fprintf(stderr, " -nepoch=%d\n", nEpoch);
        }
        if(!strcmp(argv[i], "-minmax") && i + 1 < argc){
            minmax = (float)fabs(atof(argv[i + 1]));
            fprintf(stderr, " -minmax=%f\n", minmax);
        }
        if(!strcmp(argv[i], "-batch") && i + 1 < argc){
            sentBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -batch=%d\n", sentBatch);
        }
        if(!strcmp(argv[i], "-wbatch") && i + 1 < argc){
            wordBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -wbatch=%d\n", wordBatch);
        }
        if(!strcmp(argv[i], "-shuffle")){
            shuffled = true;
            fprintf(stderr, " -shuffle=true\n");
        }
        if(!strcmp(argv[i], "-autodiff")){
            autoDiff = true;
            fprintf(stderr, " -autodiff=true\n");
        }
        if(!strcmp(argv[i], "-dev") && i + 1 < argc){
            model.devID = atoi(argv[i + 1]);
            fprintf(stderr, " -dev=%d\n", model.devID);
        }
    }

    for(int i = 0; i < argc; i++){
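The change above uses one pattern throughout LoadArgs: scan argv for a flag, read the value that follows it (guarding against a missing value), and echo the parsed setting to stderr. A minimal standalone sketch of that pattern follows; the local variables and the example command line are illustrative only and are not part of FNNLM.cpp.

#include <cstdio>
#include <cstring>
#include <cstdlib>

int main()
{
    /* a hypothetical command line: ./fnnlm -train train.txt -n 5 -shuffle */
    const char * argv[] = { "./fnnlm", "-train", "train.txt", "-n", "5", "-shuffle" };
    int argc = 6;

    char trainFN[1024] = { 0 };   /* illustrative locals; FNNLM.cpp keeps these as globals/model fields */
    int  n = 0;
    bool shuffled = false;

    fprintf(stderr, "args: \n");
    for(int i = 0; i < argc; i++){
        /* flag with a value: check that the value exists before reading it */
        if(!strcmp(argv[i], "-train") && i + 1 < argc){
            strcpy(trainFN, argv[i + 1]);
            fprintf(stderr, " -train=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-n") && i + 1 < argc){
            n = atoi(argv[i + 1]);
            fprintf(stderr, " -n=%d\n", n);
        }
        /* flag without a value: its presence alone switches the option on */
        if(!strcmp(argv[i], "-shuffle")){
            shuffled = true;
            fprintf(stderr, " -shuffle=true\n");
        }
    }

    fprintf(stderr, "parsed: train=%s n=%d shuffle=%d\n", trainFN, n, shuffled ? 1 : 0);
    return 0;
}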
source/tensor/core/arithmetic/MatrixMulBatched.cpp (view file @ f31bc3fb)
@@ -64,8 +64,7 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
                  "Unmatched tensors in multiplication!");

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
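The check in this hunk encodes the usual matrix-shape rule: with each block of a being an x am and each block of b being bn x bm (after any requested transposition), the product requires am == bn, and the corresponding block of c must be an x bm. A small worked example with concrete sizes, in plain C++ and independent of XTensor:

#include <cassert>

int main()
{
    /* a block of a is 4 x 3 and a block of b is 3 x 5, neither transposed */
    int an = 4, am = 3;
    int bn = 3, bm = 5;

    /* then the corresponding block of c = a * b must be 4 x 5 */
    int cn = 4, cm = 5;

    /* the same condition _MatrixMulBatched checks for every block */
    assert(am == bn && an == cn && bm == cm);
    return 0;
}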
@@ -134,8 +133,141 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    else {
        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
        _MatrixMULBatchedCPU(aList, transposedA, bList, transposedB, cList, alpha, beta);
    }

    for (int i = 0; i < aList->count; i++) {
        XTensor * ai = (XTensor*)aList->GetItem(i);
        ai->data = NULL;
    }

    for (int i = 0; i < bList->count; i++) {
        XTensor * bi = (XTensor*)bList->GetItem(i);
        bi->data = NULL;
    }

    for (int i = 0; i < cList->count; i++) {
        XTensor * ci = (XTensor*)cList->GetItem(i);
        ci->data = NULL;
    }

    delete[] tensorBuf;
    delete aList;
    delete bList;
    delete cList;
}

/*
matrix multiplication of the two tensors
optimized for GPU

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
                  "Input tensors must have a order >= 2!");
    CheckNTErrors((a->order == b->order && a->order == c->order),
                  "Input tensor and output tensor must have same order!");
    CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0,
                  "The tensors must be on GPUs");

    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
                  "Unmatched tensors in multiplication!");

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
    int aRealBlockSize = aBlockSize * a->unitSize;
    int bRealBlockSize = bBlockSize * b->unitSize;
    int cRealBlockSize = cBlockSize * c->unitSize;
    int blockNum = 1;

    for (int i = 2; i < a->order; i++) {
        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        blockNum *= a->dimSizeRDI[i];
    }

    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);

    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };

    XTensor * tensorBuf = new XTensor[blockNum * 3];
    XTensor * aBuf = tensorBuf;
    XTensor * bBuf = tensorBuf + blockNum;
    XTensor * cBuf = tensorBuf + blockNum * 2;

    for (int p = 0; p < blockNum; p++) {
        void * ap = (char*)a->data + aRealBlockSize * p;
        void * bp = (char*)b->data + bRealBlockSize * p;
        void * cp = (char*)c->data + cRealBlockSize * p;

        XTensor * ai = aBuf + p;
        XTensor * bi = bBuf + p;
        XTensor * ci = cBuf + p;

        InitTensor(ai, 2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
        InitTensor(bi, 2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
        InitTensor(ci, 2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);

        ai->data = ap;
        bi->data = bp;
        ci->data = cp;

        aList->Add(ai);
        bList->Add(bi);
        cList->Add(ci);
    }

    if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
#ifdef USE_CUDA
        CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
                      "The code must be run on the same GPU!");

        int devIDBackup;
        ProtectCudaDev(a->devID, devIDBackup);

        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
        _CudaBLASMatrixMULList(handle, aList, transposedA, bList, transposedB, cList,
                               aList->count, alpha, beta);

        BacktoCudaDev(a->devID, devIDBackup);
#else
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
    }
    else {
        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
        _MatrixMULBatchedCPU(aList, transposedA, bList, transposedB, cList, alpha, beta);
    }

    for (int i = 0; i < aList->count; i++) {
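The loop over p above treats a tensor of order greater than 2 as blockNum consecutive 2-D blocks: blockNum is the product of every dimension beyond the last two (in the reversed dimSizeRDI order), and block p of a tensor starts at byte offset realBlockSize * p from its data pointer. A small self-contained sketch of that arithmetic for a hypothetical float tensor holding 8 batched 4 x 3 matrices; the shape is made up for illustration only.

#include <cstdio>

int main()
{
    /* hypothetical shape: 8 batched 4 x 3 float matrices          */
    /* dimSizeRDI stores the dimensions in reverse order: {3, 4, 8} */
    int dimSizeRDI[3] = { 3, 4, 8 };
    int order    = 3;
    int unitSize = sizeof(float);                        /* 4 bytes per element */

    /* one 2-D block holds dimSizeRDI[0] * dimSizeRDI[1] elements */
    int blockSize     = dimSizeRDI[0] * dimSizeRDI[1];   /* 12 elements */
    int realBlockSize = blockSize * unitSize;            /* 48 bytes    */

    /* blockNum multiplies every dimension beyond the last two */
    int blockNum = 1;
    for (int i = 2; i < order; i++)
        blockNum *= dimSizeRDI[i];                       /* 8 blocks    */

    for (int p = 0; p < blockNum; p++)
        printf("block %d starts at byte offset %d\n", p, realBlockSize * p);

    return 0;
}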
source/tensor/core/arithmetic/MatrixMulBatched.h (view file @ f31bc3fb)
@@ -37,6 +37,14 @@ where trans() returns the transposed matrix if the flag is fired
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                       const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                       XTensor * c,
                       DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
optimized for GPU
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c,
                          DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

/*
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
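Both declarations default to alpha = 1.0 and beta = 0, so by default c is simply overwritten with the product. A tiny self-contained illustration of the c = a * b * alpha + c * beta update on plain 2 x 2 float arrays, with no XTensor involved, just to show what the two coefficients do:

#include <cstdio>

int main()
{
    float a[2][2] = { {1, 2}, {3, 4} };
    float b[2][2] = { {5, 6}, {7, 8} };
    float c[2][2] = { {1, 1}, {1, 1} };   /* existing contents of c */

    float alpha = 1.0F;   /* scales the product a * b                 */
    float beta  = 0.0F;   /* scales the old contents of c;            */
                          /* beta = 0 (the default) discards them     */

    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            float sum = 0.0F;
            for (int k = 0; k < 2; k++)
                sum += a[i][k] * b[k][j];
            c[i][j] = sum * alpha + c[i][j] * beta;
        }
    }

    for (int i = 0; i < 2; i++)
        printf("%g %g\n", c[i][0], c[i][1]);   /* 19 22 / 43 50 with the defaults */

    return 0;
}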