NiuTrans.Tensor · Commit f5149a15
authored Oct 30, 2019 by liyinqiao

Merge with Yuhao branch (with little bit change).

parent f0b49d6d
Showing 56 changed files with 1285 additions and 1067 deletions (+1285 -1067).
source/tensor/Main.cpp  +3 -2
source/tensor/XGlobal.cpp  +0 -8
source/tensor/XGlobal.h  +0 -2
source/tensor/XMem.cpp  +2 -2
source/tensor/XTensor.cpp  +8 -21
source/tensor/XTensor.h  +0 -3
source/tensor/core/arithmetic/Div.cpp  +7 -10
source/tensor/core/arithmetic/Div.cu  +8 -9
source/tensor/core/arithmetic/MatrixMul.cpp  +61 -60
source/tensor/core/arithmetic/MatrixMulBatched.cpp  +40 -40
source/tensor/core/arithmetic/MulAndShift.cpp  +17 -16
source/tensor/core/arithmetic/Multiply.cpp  +8 -11
source/tensor/core/arithmetic/Multiply.cu  +10 -11
source/tensor/core/arithmetic/SumDim.cpp  +0 -14
source/tensor/core/arithmetic/SumDim.cu  +4 -4
source/tensor/core/getandset/OnehotAndIndex.cpp  +41 -0
source/tensor/core/getandset/OnehotAndIndex.h  +3 -0
source/tensor/core/getandset/Select.cpp  +84 -11
source/tensor/core/getandset/Select.h  +4 -1
source/tensor/core/math/Normalize.cpp  +8 -9
source/tensor/core/math/Normalize.cu  +5 -6
source/tensor/core/movement/CopyInGrid.cpp  +2 -3
source/tensor/core/movement/CopyIndexed.cpp  +21 -19
source/tensor/core/movement/Gather.cpp  +69 -7
source/tensor/core/movement/Gather.cu  +69 -0
source/tensor/core/movement/Gather.cuh  +2 -0
source/tensor/core/movement/Gather.h  +6 -0
source/tensor/core/reduce/ReduceMax.cpp  +166 -181
source/tensor/core/reduce/ReduceMax.cu  +414 -387
source/tensor/core/reduce/ReduceMax.cuh  +3 -0
source/tensor/core/reduce/ReduceMax.h  +8 -2
source/tensor/core/reduce/ReduceMean.cpp  +1 -2
source/tensor/core/reduce/ReduceSum.cpp  +13 -14
source/tensor/core/reduce/ReduceSum.cu  +26 -25
source/tensor/core/reduce/ReduceVariance.cpp  +1 -2
source/tensor/core/reduce/VectorBuffer.cpp  +11 -1
source/tensor/core/reduce/VectorBuffer.h  +4 -1
source/tensor/core/shape/ConcatenateSolely.cpp  +12 -13
source/tensor/core/shape/IsSameShaped.cpp  +1 -1
source/tensor/core/shape/Merge.cpp  +19 -21
source/tensor/core/shape/Merge.h  +0 -3
source/tensor/core/shape/Split.cpp  +18 -20
source/tensor/core/shape/Unsqueeze.cpp  +7 -8
source/tensor/core/shape/Unsqueeze.cu  +3 -4
source/tensor/core/sort/Sort.cpp  +6 -7
source/tensor/core/sort/Sort.cu  +8 -9
source/tensor/core/sort/TopK.cpp  +12 -13
source/tensor/core/sort/TopK.cu  +6 -7
source/tensor/core/utilities/SetAscendingOrder.cpp  +6 -7
source/tensor/core/utilities/SetAscendingOrder.cu  +7 -8
source/tensor/function/LogSoftmax.cpp  +15 -17
source/tensor/function/LogSoftmax.cu  +3 -4
source/tensor/function/Loss.cpp  +14 -17
source/tensor/function/Loss.cu  +10 -11
source/tensor/function/Softmax.cpp  +6 -9
source/tensor/function/Softmax.cu  +3 -4
source/tensor/Main.cpp
@@ -30,8 +30,9 @@
 #include "XDevice.h"
 #include "./test/Test.h"
 #include "./core/CHeader.h"
-#include "./loss/CrossEntropy.h"
+#include "./XBLAS.h"
+#include "./core/sort/TopK.h"
+#include "./core/movement/Gather.h"
 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
 //#include <crtdbg.h>
...
source/tensor/XGlobal.cpp
@@ -50,14 +50,6 @@ int CONST_MINUSONE = -1;
 bool CONST_TRUE = true;
 int verboseLevel = 0;
-bool useBLAS = false;
-#ifdef USE_CUDA
-bool useCUDA = true;
-#else
-bool useCUDA = false;
-#endif
 FILE * tmpLog = NULL;
 double myTime = 0;
...
source/tensor/XGlobal.h
@@ -135,8 +135,6 @@ extern bool CONST_TRUE;
 #define NIUTRANSNNDEBUG
 extern int verboseLevel;
-extern bool useBLAS;
-extern bool useCUDA;
 #define FFLUSH(FILEH) \
 { \
...
source/tensor/XMem.cpp
@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
     if (freeMem >= MILLION * 512){
         *myBufSize = MILLION * 128;
         if (freeMem >= MILLION * 1024) {
-            *myBufSize = MILLION * 256;
+            *myBufSize = MILLION * 128;
             if (freeMem >= MILLION * 2048)
-                *myBufSize = MILLION * 512;
+                *myBufSize = MILLION * 128;
         }
     }
 }
...
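The change above caps the memory-pool buffer at 128 MB whenever at least 512 MB of free device memory is reported; previously the buffer grew to 256 MB (at 1 GB free) and 512 MB (at 2 GB free). A minimal standalone sketch of the resulting policy, assuming MILLION means 2^20 bytes and using a plain integer in place of the library's MTYPE:

// Sketch of the buffer-size policy after this commit (not the library's API).
long long GetBufferSizeSketch(long long freeMem)
{
    const long long MILLION = 1024 * 1024;   // assumed meaning of MILLION in XMem
    long long bufSize = 0;
    if (freeMem >= MILLION * 512)
        bufSize = MILLION * 128;             // was 256 MB (>= 1 GB free) or 512 MB (>= 2 GB free)
    return bufSize;
}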
source/tensor/XTensor.cpp
@@ -266,7 +266,6 @@ void XTensor::Init()
     devID = -1;
     order = -1;
     memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
-    memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
     dataType = DEFAULT_DTYPE;
     unitSize = sizeof(float);
     unitNum = 0;
@@ -314,7 +313,6 @@ void XTensor::ShallowCopy(const XTensor &tensor)
     order = tensor.order;
     enableGrad = tensor.enableGrad;
     memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
-    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
     dataType = tensor.dataType;
     unitSize = tensor.unitSize;
     unitNum = tensor.unitNum;
@@ -533,7 +531,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
 bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
 {
     if (a == NULL || b == NULL)
         return false;
     if ((a->order - 1) != b->order)
@@ -570,7 +568,6 @@ void XTensor::SetDim(int * myDimSize)
 {
     for (int i = 0; i < order; i++) {
         dimSize[i] = myDimSize[i];
-        dimSizeRDI[order - i - 1] = myDimSize[i];
     }
 }
@@ -598,20 +595,17 @@ reshape the tensor
 void XTensor::Reshape(const int myOrder, const int * myDimSize)
 {
     int dims[MAX_TENSOR_DIM_NUM];
-    int dimsRDI[MAX_TENSOR_DIM_NUM];
     int num = 1;
     for (int i = 0; i < myOrder; i++){
         num *= myDimSize[i];
         dims[i] = abs(myDimSize[i]);
-        dimsRDI[myOrder - i - 1] = dims[i];
     }
     CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!");
     order = myOrder;
     memcpy(dimSize, dims, sizeof(int) * order);
-    memcpy(dimSizeRDI, dimsRDI, sizeof(int) * order);
 }
 /*
@@ -997,18 +991,12 @@ void * XTensor::GetCell(int index[], int size) const
 {
     CheckNTErrors((size == order), "Illegal index!");
-    int * indexRDI = new int[size];
-    for (int i = 0; i < size; i++)
-        indexRDI[size - i - 1] = index[i];
-    int offset = indexRDI[size - 1];
-    for (int i = size - 2; i >= 0; i--){
-        CheckNTErrors((indexRDI[i] < dimSizeRDI[i]), "Index is out of range!");
-        offset = offset * dimSizeRDI[i] + indexRDI[i];
-    }
-    delete[] indexRDI;
+    int offset = index[0];
+    for (int i = 1; i < size; ++i){
+        CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
+        offset = offset * dimSize[i] + index[i];
+    }
     if (isSparse){
         DTYPE value;
         void * p;
@@ -1469,7 +1457,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
     bool zeroData = false;
     for (int i = 0; i < order; i++){
         dimSize[i] = abs(myDimSize[i]);
-        dimSizeRDI[order - i - 1] = dimSize[i];
         if (myDimSize[i] < 0)
             filledData = false;
         if (myDimSize[i] == 0)
@@ -1668,7 +1655,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
     if (isSparse) {
         int num = 0;
         for (int i = 0; i < order; i++)
-            num *= dimSizeRDI[i];
+            num *= dimSize[i];
         num = int(num * denseRatio + 1);
         int tupleSize = sizeof(int) + sizeof(DTYPE);
         int size = sizeof(int) + tupleSize * (num);
@@ -1880,8 +1867,8 @@ void XTensor::Read(FILE * file, const char * label)
     int ds[MAX_TENSOR_DIM_NUM];
     for (int i = 0; i < order; i++) {
-        ds[i] = key % dimSizeRDI[i];
-        key /= dimSizeRDI[i];
+        ds[i] = key % dimSize[i];
+        key /= dimSize[i];
     }
     Set(value, ds);
 }
...
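The GetCell hunk replaces the reversed-dimension-indexing (RDI) offset computation with a direct row-major one over dimSize. A small self-contained sketch of that addressing scheme (a hypothetical helper, not part of the library):

#include <cassert>

// Row-major linear offset of an element, matching the new XTensor::GetCell logic.
int RowMajorOffset(const int index[], const int dimSize[], int order)
{
    assert(order > 0);
    int offset = index[0];
    for (int i = 1; i < order; ++i) {
        assert(index[i] < dimSize[i]);            // bounds check, as CheckNTErrors does above
        offset = offset * dimSize[i] + index[i];
    }
    return offset;                                // element index; multiply by unitSize for a byte offset
}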
source/tensor/XTensor.h
@@ -100,9 +100,6 @@ public:
     /* size of each dimension */
     int dimSize[MAX_TENSOR_DIM_NUM];
-    /* size of each dimension by Reversed Dimension Indexing (RDI) Mode */
-    int dimSizeRDI[MAX_TENSOR_DIM_NUM];
     /* data unit - data type for every cell */
     TENSOR_DATA_TYPE dataType;
...
source/tensor/core/arithmetic/Div.cpp
@@ -49,9 +49,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
                   "Unmatched tensors!");
     CheckDev(a->devID, b->devID);
-    int leadingDimRDI = a->order - leadingDim - 1;
 #ifdef USE_CUDA
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
         _CudaDiv(a, b, c, alpha, leadingDim);
@@ -64,17 +61,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
source/tensor/core/arithmetic/Div.cu
@@ -122,7 +122,6 @@ where i is the item index
 */
 void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
     CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
     CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
source/tensor/core/arithmetic/MatrixMul.cpp
@@ -77,18 +77,18 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
         return;
     }
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
@@ -96,24 +96,25 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     int bBlockNum = 1;
     int cBlockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
-        aBlockNum *= a->dimSizeRDI[i];
-        cBlockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors(a->dimSize[i] == c->dimSize[i], "Incorrect tensor sizes!");
+        aBlockNum *= a->dimSize[i];
+        cBlockNum *= a->dimSize[i];
     }
-    for (int i = 2; i < b->order; i++) {
-        CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
-        bBlockNum *= b->dimSizeRDI[i];
-        cBlockNum *= b->dimSizeRDI[i];
+    for (int i = 0; i < b->order - 2; i++) {
+        CheckNTErrors(b->dimSize[i] == c->dimSize[i - 2 + a->order], "Incorrect tensor sizes!");
+        bBlockNum *= b->dimSize[i];
+        cBlockNum *= b->dimSize[i];
     }
     TensorList * aList = new TensorList(10);
     TensorList * bList = new TensorList(10);
     TensorList * cList = new TensorList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
+    int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
+    int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
+    int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };
     bool isSparseMul = false;
@@ -215,20 +216,20 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     if (!(a->order >= 2 && b->order >= 2 && c->order >= 2))
         return false;
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
     int order = a->order + b->order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a->order; i++)
-        dimSize[sub++] = a->dimSizeRDI[a->order + 1 - i];
-    for (int i = 2; i < b->order; i++)
-        dimSize[sub++] = b->dimSizeRDI[b->order + 1 - i];
+    for (int i = 0; i < a->order - 2; i++)
+        dimSize[sub++] = a->dimSize[i];
+    for (int i = 0; i < b->order - 2; i++)
+        dimSize[sub++] = b->dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -271,20 +272,20 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -318,20 +319,20 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     if (!c.isInit || !CheckMMulShape(&a, transposedA, &b, transposedB, &c)) {
-        int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-        int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-        int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-        int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+        int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+        int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+        int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+        int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
         CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
         int order = a.order + b.order - 2;
         int sub = 0;
         int * dimSize = new int[order];
-        for (int i = 2; i < a.order; i++)
-            dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-        for (int i = 2; i < b.order; i++)
-            dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+        for (int i = 0; i < a.order - 2; i++)
+            dimSize[sub++] = a.dimSize[i];
+        for (int i = 0; i < b.order - 2; i++)
+            dimSize[sub++] = b.dimSize[i];
         dimSize[sub++] = an;
         dimSize[sub++] = bm;
@@ -370,20 +371,20 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -416,20 +417,20 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
     if (!c.isInit || !CheckMMulShape(&a, X_NOTRANS, &b, X_NOTRANS, &c)) {
-        int an = a.dimSizeRDI[1];
-        int am = a.dimSizeRDI[0];
-        int bn = b.dimSizeRDI[1];
-        int bm = b.dimSizeRDI[0];
+        int an = a.dimSize[a.order - 2];
+        int am = a.dimSize[a.order - 1];
+        int bn = b.dimSize[b.order - 2];
+        int bm = b.dimSize[b.order - 1];
         CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
         int order = a.order + b.order - 2;
         int sub = 0;
         int * dimSize = new int[order];
-        for (int i = 2; i < a.order; i++)
-            dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-        for (int i = 2; i < b.order; i++)
-            dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+        for (int i = 0; i < a.order - 2; i++)
+            dimSize[sub++] = a.dimSize[i];
+        for (int i = 0; i < b.order - 2; i++)
+            dimSize[sub++] = b.dimSize[i];
         dimSize[sub++] = an;
         dimSize[sub++] = bm;
...
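A pattern that repeats throughout this file: the matrix dimensions of an operand are now read from its two trailing axes of dimSize rather than from dimSizeRDI[0] and dimSizeRDI[1]. A minimal sketch of that convention (hypothetical helper; X_TRANS-style transposition is modeled here with a bool):

// Rows/cols of the matrix held in the last two axes of a row-major shape,
// as an/am/bn/bm are derived in the hunks above.
struct MatrixDims { int rows; int cols; };

MatrixDims TrailingMatrixDims(const int dimSize[], int order, bool transposed)
{
    int rows = dimSize[order - 2];
    int cols = dimSize[order - 1];
    if (transposed)                 // a transposed operand swaps the two roles
        return { cols, rows };
    return { rows, cols };
}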
source/tensor/core/arithmetic/MatrixMulBatched.cpp
@@ -95,27 +95,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");
     CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }
     int devIDBackup = 0;
@@ -126,9 +126,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                             a->data, transposedA, a->dataType, aBlockSize,
                             b->data, transposedB, b->dataType, bBlockSize,
                             c->data, c->dataType, cBlockSize, blockNum,
-                            a->dimSizeRDI[1], a->dimSizeRDI[0],
-                            b->dimSizeRDI[1], b->dimSizeRDI[0],
-                            c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
+                            a->dimSize[a->order - 2], a->dimSize[a->order - 1],
+                            b->dimSize[b->order - 2], b->dimSize[b->order - 1],
+                            c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);
     BacktoCudaDev(a->devID, devIDBackup);
 #endif
@@ -164,32 +164,32 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }
-    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
+    int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
+    int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
+    int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };
     XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
     XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
@@ -292,10 +292,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
@@ -350,10 +350,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
...
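The batched variants likewise now count matrices by multiplying every axis except the two trailing ones. A one-function sketch of that block count (hypothetical helper, mirroring the rewritten blockNum loops):

// Number of 2-D matrices stacked in a batched operand of the given shape.
int BatchedBlockNum(const int dimSize[], int order)
{
    int blockNum = 1;
    for (int i = 0; i < order - 2; i++)   // skip the trailing matrix axes
        blockNum *= dimSize[i];
    return blockNum;
}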
source/tensor/core/arithmetic/MulAndShift.cpp
@@ -71,20 +71,21 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
     CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
-    int xn = x.dimSizeRDI[1];
-    int xm = x.dimSizeRDI[0];
-    int wn = w.dimSizeRDI[1];
-    int wm = w.dimSizeRDI[0];
+    int xn = x.dimSize[x.order - 2];
+    int xm = x.dimSize[x.order - 1];
+    int wn = w.dimSize[w.order - 2];
+    int wm = w.dimSize[w.order - 1];
     CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
     int order = x.order + w.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < x.order; i++)
-        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
-    for (int i = 2; i < w.order; i++)
-        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    for (int i = 0; i < x.order - 2; i++)
+        dimSize[sub++] = x.dimSize[i];
+    for (int i = 0; i < w.order - 2; i++)
+        dimSize[sub++] = w.dimSize[i];
     dimSize[sub++] = xn;
     dimSize[sub++] = wm;
@@ -148,18 +149,18 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
-    int xn = transposedA == X_TRANS ? x.dimSizeRDI[0] : x.dimSizeRDI[1];
-    int xm = transposedA == X_TRANS ? x.dimSizeRDI[1] : x.dimSizeRDI[0];
-    int wn = transposedB == X_TRANS ? w.dimSizeRDI[0] : w.dimSizeRDI[1];
-    int wm = transposedB == X_TRANS ? w.dimSizeRDI[1] : w.dimSizeRDI[0];
+    int xn = transposedA == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
+    int xm = transposedA == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
+    int wn = transposedB == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
+    int wm = transposedB == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
     int order = x.order + w.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < x.order; i++)
-        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
-    for (int i = 2; i < w.order; i++)
-        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    for (int i = 0; i < x.order - 2; i++)
+        dimSize[sub++] = x.dimSize[i];
+    for (int i = 0; i < w.order - 2; i++)
+        dimSize[sub++] = w.dimSize[i];
     dimSize[sub++] = xn;
     dimSize[sub++] = wm;
...
source/tensor/core/arithmetic/Multiply.cpp
@@ -49,9 +49,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
                   "Unmatched tensors!");
     CheckDev(a->devID, b->devID);
-    int leadingDimRDI = a->order - leadingDim - 1;
 #ifdef USE_CUDA
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
         _CudaMultiply(a, b, c, alpha, leadingDim);
@@ -64,18 +61,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
source/tensor/core/arithmetic/Multiply.cu
@@ -122,26 +122,25 @@ where i is the item index
 */
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
-    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
+    CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
                   "Unmatched tensors in multiplication!");
-    CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
source/tensor/core/arithmetic/SumDim.cpp
@@ -70,20 +70,6 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
         return;
     }
-    /*int dims[MAX_TENSOR_DIM_NUM];
-    for(int i = 0; i < a->order; i++)
-        dims[i] = 1;
-    dims[n] = a->GetDim(n);
-    XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
-    _CopyValues(b, b2);
-    _SumBroadcast(a, b2, c, beta);
-    DelTensor(b2);
-    return;*/
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
 #ifdef USE_CUDA
         _CudaSumDim(a, b, c, n, beta);
...
source/tensor/core/arithmetic/SumDim.cu
@@ -87,17 +87,17 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
     int col = colIndex % colNum;
     int block = colIndex / colNum;
-    if (row >= rowNum || block >= blockNum)
+    if(row >= rowNum || block >= blockNum)
         return;
-    if (threadIdx.x == 0)
+    if(threadIdx.x == 0)
         bv[threadIdx.y] = b[row];
     __syncthreads();
     int offset = block * blockSize + row * colNum + col;
-    if (betaFired)
+    if(betaFired)
         c[offset] = a[offset] + bv[threadIdx.y] * beta;
     else
         c[offset] = a[offset] + bv[threadIdx.y];
...
source/tensor/core/getandset/OnehotAndIndex.cpp
@@ -139,6 +139,47 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
 }
+/*
+convert index tensor to onehot tensor
+>> index - index tensor, which value is an integer num
+>> onehot - onehot tensor, which value is 0 or 1
+>> size - the last dimension size of the onehot tensor
+*/
+void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP)
+{
+    /*CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
+    CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
+
+    onehot->SetZeroAll();
+
+#ifdef USE_CUDA
+    if (onehot->devID >= 0) {
+        delete[] cudaIndex;
+        return;
+    }
+#endif
+
+    int blockNum = n;
+    int stride = size;
+
+    int * indexData = (int *)index;
+    int * onehotData = (int *)onehot->data;
+
+    for (int i = 0; i < blockNum; i++) {
+        int id = indexData[i];
+        int * od = onehotData + i * stride;
+        od[id] = 1;
+    }*/
+
+    XTensor * cudaIndex = NewTensor1D(n, X_INT, onehot->devID);
+    cudaIndex->SetData(index, n);
+    _IndexToOnehot(cudaIndex, onehot, size, labelSmoothingP);
+    delete[] cudaIndex;
+}
 /*
 convert onehot tensor to index tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
...
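The added overload wraps a plain int array in a temporary 1-D X_INT tensor and forwards to the existing XTensor-based _IndexToOnehot. A hedged usage sketch (the sizes and index values are invented; NewTensor2D and SetZeroAll are used here as they appear elsewhere in this diff, and exact default arguments may differ by version):

#include "OnehotAndIndex.h"   // header extended in this commit
using namespace nts;

// Hypothetical call site: turn 3 class ids into 3 one-hot rows of width 10.
void IndexToOnehotExample()
{
    int index[3] = { 2, 5, 7 };                        // one class id per sample
    XTensor * onehot = NewTensor2D(3, 10, X_INT, -1);  // CPU tensor (devID = -1)
    onehot->SetZeroAll();
    _IndexToOnehot(index, 3, onehot, 10, 0.0F);        // 0.0F: no label smoothing
    delete onehot;
}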
source/tensor/core/getandset/OnehotAndIndex.h
@@ -36,6 +36,9 @@ XTensor OnehotToIndex(const XTensor & onehot, int num);
 /* convert index tensor to onehot tensor */
 void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
+
+/* convert index tensor to onehot tensor */
+void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP);
 /* convert index tensor to onehot tensor (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);
...
source/tensor/core/getandset/Select.cpp
@@ -25,6 +25,82 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+generate a tensor with selected data in index along the given dimension
+
+c = select(a)
+
+>> a - input tensor
+>> c - result tensor
+>> index - the selected index
+>> dim - the dimension along with which we do the job
+*/
+void _Select(const XTensor * a, XTensor * c, int* index, int dim)
+{
+    CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
+    CheckNTErrors(a->order == c->order, "The input and output tensors must in the same order!");
+    CheckNTErrors(dim >= 0 && dim < a->order, "The input dimension is out of bounds!");
+    CheckNTErrors(a->dataType == c->dataType, "The tensor must be of the same data type!");
+
+    int stride = 1;
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];
+    printf("\n%d %d\n", a->order - dim - 1, stride);
+    int copyTimes = 1;
+    for (int i = 0; i < dim; i++)
+    {
+        copyTimes *= a->dimSize[i];
+    }
+    int cot = c->dimSize[dim];
+    int blockSize = stride * a->unitSize;
+    int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
+    int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
+    char * s = (char*)a->data;
+    char * t = (char*)c->data;
+    for (int i = 0; i < copyTimes; i++)
+    {
+        for (int j = 0; j < cot; ++j)
+        {
+            XMemCopy(t + j * blockSize, c->devID, s + index[j] * blockSize, a->devID, blockSize);
+        }
+        s += stepSizeS;
+        t += stepSizeT;
+    }
+}
+
+/*
+generate a tensor with selected data in index along the given dimension
+
+c = select(a)
+
+>> a - input tensor
+>> c - result tensor
+>> index - the selected index
+>> dim - the dimension along with which we do the job
+*/
+void _Select(const XTensor * a, XTensor * c, XTensor * index, int dim)
+{
+    if (index->devID >= 0)
+    {
+        int * indexCPU = new int[index->unitNum];
+        XMemCopy(indexCPU, -1, index->data, index->devID, index->unitNum * sizeof(int));
+        _Select(a, c, indexCPU, dim);
+        delete[] indexCPU;
+    }
+    else
+    {
+        _Select(a, c, (int *)index->data, dim);
+    }
+}
+
+/*
+*/
+/*XTensor Select(const XTensor &a, int* index, int dim)
+{
+}*/
 /*
 generate a tensor with selected data in range[low,high] along the given dimension
@@ -58,13 +134,12 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
     }
     int stride = 1;
-    int dimRDI = a->order - dim - 1;
-    for (int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];
     int copyTimes = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        copyTimes *= a->dimSizeRDI[i];
+    for (int i = 0; i < dim; i++)
+        copyTimes *= a->dimSize[i];
     int blockSize = stride * (high - low) * a->unitSize;
     int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
@@ -117,12 +192,10 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
     _SelectRange(&a, &c, dim, low, high);
     /* tensor connection */
-    if (a.enableGrad) {
-        XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
-        XLink::AddParamToHeadInt(&c, dim);
-        XLink::AddParamToHeadInt(&c, low);
-        XLink::AddParamToHeadInt(&c, high);
-    }
+    XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
+    XLink::AddParamToHeadInt(&c, dim);
+    XLink::AddParamToHeadInt(&c, low);
+    XLink::AddParamToHeadInt(&c, high);
     /* destroy variables */
     delete[] dimSize;
...
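The new _Select copies, for each outer block, the index-selected slices along dim from a into c, so c must already be shaped with c->dimSize[dim] equal to the number of picked indices. A hedged usage sketch (shape and indices invented for illustration; NewTensor2D and X_FLOAT are used as they appear elsewhere in this diff):

#include "Select.h"   // declares both _Select overloads added above
using namespace nts;

// Hypothetical call site: keep columns 0 and 2 of a 4 x 3 matrix (dim = 1).
void SelectExample()
{
    XTensor * a = NewTensor2D(4, 3, X_FLOAT, -1);   // source on CPU; data left unset here
    XTensor * c = NewTensor2D(4, 2, X_FLOAT, -1);   // c->dimSize[1] == number of picked indices
    int picked[2] = { 0, 2 };
    _Select(a, c, picked, 1);                       // c[:,0] = a[:,0], c[:,1] = a[:,2]
    delete a;
    delete c;
}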
source/tensor/core/getandset/Select.h
@@ -27,7 +27,10 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* generate a tensor with selected data c = select(a) */
-void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
+void _Select(const XTensor * a, XTensor * c, int* index, int dim);
+
+/* generate a tensor with selected data c = select(a) */
+void _Select(const XTensor * a, XTensor * c, XTensor * index, int dim);
 /*
 generate a tensor with selected data c = select(a) (returna a XTensor structure)
...
source/tensor/core/math/Normalize.cpp
...
@@ -47,26 +47,25 @@ void _Normalize(const XTensor * input, XTensor * output, int dim,
                 const XTensor * mean, const XTensor * var,
                 const XTensor * a, const XTensor * b, DTYPE epsilon)
 {
-    int dimRDI = input->order - dim - 1;
     CheckNTErrors((_IsSameShaped(input, output)), "Unmatched input tensors!");
     CheckNTErrors((_IsSameShaped(a, b)), "Unmatched input tensors");
     CheckNTErrors((_IsSameShaped(mean, var)), "Unmatched input tensors");
     CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
-    CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
+    CheckNTErrors((dim >= 0 && dim < input->order), "Incorrect reduction dimension!");
     CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");

     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i]), "Wrong size!");
-            stride *= input->dimSizeRDI[i];
-        }
-        else if (i > dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i - 1]), "Wrong size!");
-            blockNum *= input->dimSizeRDI[i];
-        }
+        if (i < dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i]), "Wrong size!");
+            blockNum *= input->dimSize[i];
+        }
+        else if (i > dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i - 1]), "Wrong size!");
+            stride *= input->dimSize[i];
+        }
     }
     blockSize = stride * strideNum;
...
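The switch from dimSizeRDI[dimRDI] to dimSize[dim] throughout these files relies on one decomposition of a row-major tensor around the reduced dimension. A small sketch of that decomposition is shown below; it assumes row-major storage (as XTensor's dimSize ordering implies) and uses made-up names for illustration.

    #include <cassert>

    // For a row-major tensor with shape dims[0..order-1] and dimension `dim`:
    //   blockNum  = prod(dims[0..dim-1])      (slices before dim)
    //   strideNum = dims[dim]                 (length along dim)
    //   stride    = prod(dims[dim+1..])       (contiguous elements per step of dim)
    // so element (b, j, k) lives at offset (b * strideNum + j) * stride + k.
    void Decompose(const int* dims, int order, int dim,
                   int& stride, int& strideNum, int& blockNum)
    {
        assert(dim >= 0 && dim < order);
        stride = 1; blockNum = 1; strideNum = dims[dim];
        for (int i = 0; i < dim; i++)         blockNum *= dims[i];
        for (int i = dim + 1; i < order; i++) stride   *= dims[i];
    }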
source/tensor/core/math/Normalize.cu
...
@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
 {
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");

-    int dimRDI = input->order - dim - 1;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i > dim)
+            stride *= input->dimSize[i];
+        else if (i < dim)
+            blockNum *= input->dimSize[i];
     }

     int cudaGridSize[3];
...
source/tensor/core/movement/CopyInGrid.cpp
...
@@ -41,12 +41,11 @@ void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int
 {
     CheckNTErrors((_IsSameShaped(s, t)), "Unmatched tensors!");

-    int blockDimRDI = s->order - blockDim - 1;
     int blockSize = 1;
     int blockNum = blockNumInGrid;
     int gridNum = 1;
-    for (int i = 0; i < blockDimRDI; i++)
-        blockSize *= s->dimSizeRDI[i];
+    for (int i = blockDim; i < s->order; i++)
+        blockSize *= s->dimSize[i];

     CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
     gridNum = s->unitNum / (blockSize * blockNum);
...
source/tensor/core/movement/CopyIndexed.cpp
...
@@ -53,26 +53,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
     CheckNTErrors(dim < s->order && dim < t->order, "A too larget dimension specified!");
     CheckNTErrors(s->unitSize == t->unitSize, "Unmatched tensors!");

-    int dimRDI = s->order - dim - 1;
     int blockSizeSrc = 1;
     int blockSizeTgt = 1;
     int blockNumSrc = 1;
     int blockNumTgt = 1;
-    int leadDimSizeSrc = s->dimSizeRDI[dimRDI];
-    int leadDimSizeTgt = t->dimSizeRDI[dimRDI];
+    int leadDimSizeSrc = s->dimSize[dim];
+    int leadDimSizeTgt = t->dimSize[dim];
     int indexOffsetNum = 1;

-    for (int i = 0; i < dimRDI; i++) {
-        blockSizeSrc *= s->dimSizeRDI[i];
-        blockSizeTgt *= t->dimSizeRDI[i];
-    }
-    for (int i = dimRDI; i < s->order; i++)
-        blockNumSrc *= s->dimSizeRDI[i];
-    for (int i = dimRDI; i < t->order; i++)
-        blockNumTgt *= t->dimSizeRDI[i];
+    for (int i = dim + 1; i < s->order; i++) {
+        blockSizeSrc *= s->dimSize[i];
+    }
+    for (int i = dim + 1; i < t->order; i++) {
+        blockSizeTgt *= t->dimSize[i];
+    }
+    for (int i = 0; i <= dim; i++) {
+        blockNumSrc *= s->dimSize[i];
+        blockNumTgt *= t->dimSize[i];
+    }

     CheckNTErrors(blockSizeSrc == blockSizeTgt, "Unmatched tensors!");
-    indexOffsetNum = blockNumSrc / s->dimSizeRDI[dimRDI];
+    indexOffsetNum = blockNumSrc / s->dimSize[dim];

     int realIndexSize = indexOffsetNum * indexSize * copyNum;
     int * realSrcIndex = new int[realIndexSize];
...
@@ -219,14 +221,14 @@ make a new tensor to keep the result and return it
 >> s - the source tensor
 >> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (4, 2, 3) and dim = 0,
-       we have 4 sub-tensors of size (2, 3)
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3,2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index,
-       e.g., for srcIndex = [0,1] and copyNum = 2,
-       we actually copy the source sub-tensors 0, 1, 1 and 2
+       e.g., for srcIndex = [1,4] and copyNum = 2,
+       we actually copy the source sub-tensors 1, 2, 4, 5
 << return - the result of copying indexed sub-tensors
 */
 XTensor CopyIndexed(const XTensor & s, int dim,
...
@@ -277,14 +279,14 @@ make a new tensor to keep the result and return it
 >> s - the source tensor
 >> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (4, 2, 3) and dim = 0,
-       we have 4 sub-tensors of size (2, 3)
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3,2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index,
-       e.g., for srcIndex = [0,1] and copyNum = 2,
-       we actually copy the source sub-tensors 0, 1, 1 and 2
+       e.g., for srcIndex = [1,4] and copyNum = 2,
+       we actually copy the source sub-tensors 1, 2, 4, 5
 << return - the result of copying indexed sub-tensors
 */
 XTensor CopyIndexed(const XTensor & s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
...
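The updated copyNum comment can be checked with a tiny sketch that expands srcIndex into the full list of copied sub-tensor indices; the helper name here is made up for illustration and is not part of the library.

    #include <cstdio>
    #include <vector>

    // Each entry i in srcIndex contributes the consecutive sub-tensor indices
    // i, i+1, ..., i+copyNum-1, matching the semantics described in the comment.
    std::vector<int> ExpandCopies(const std::vector<int>& srcIndex, int copyNum)
    {
        std::vector<int> expanded;
        for (int i : srcIndex)
            for (int c = 0; c < copyNum; c++)
                expanded.push_back(i + c);
        return expanded;
    }

    int main()
    {
        // srcIndex = [1,4] and copyNum = 2 -> sub-tensors 1, 2, 4, 5 are copied
        for (int i : ExpandCopies({1, 4}, 2))
            printf("%d ", i);
        printf("\n");
        return 0;
    }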
source/tensor/core/movement/Gather.cpp
...
@@ -33,6 +33,51 @@ gather indexed sub-tensors
 >> s - the source tensor
 >> t - the target tensor
+>> dim - the leading dimension to define "sub-tensors"
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3, 2)
+>> srcIndex - index of the source sub-tensors
+>> indexSize - length of srcIndex (and tgtIndex)
+*/
+void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
+{
+    int * tgtIndex = new int[indexSize];
+    for (int i = 0; i < indexSize; i++)
+        tgtIndex[i] = i;
+
+    _CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
+
+    delete[] tgtIndex;
+}
+
+/*
+gather indexed sub-tensors
+
+>> s - the source tensor
+>> t - the target tensor
+>> srcIndex - index of the source sub-tensors
+>> dim - the leading dimension to define "sub-tensors"
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3, 2)
+*/
+void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
+{
+    CheckNTErrors((s && t), "Invalid tensors!");
+    CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
+    CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
+
+#ifdef USE_CUDA
+    if (s->devID >= 0 && t->devID >= 0) {
+        _CudaGather(s, t, srcIndex, dim);
+        return;
+    }
+#endif
+}
+
+/*
+gather indexed sub-tensors
+
 >> srcIndex - the tensor to save the index of the source tensor
 */
 void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
...
@@ -79,10 +124,15 @@ XTensor Gather(XTensor &s, XTensor &index)
     CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");

-    int order = index.order + 1;
+    int order = s.order;
     int * dimSize = new int[order];
-    memcpy(dimSize, index.dimSize, index.order * sizeof(int));
-    dimSize[index.order] = s.GetDim(-1);
+    for (int i = 0; i < s.order; i++) {
+        if (i == dim)
+            dimSize[i] = index.unitNum;
+        else
+            dimSize[i] = s.dimSize[i];
+    }

     float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
     XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
...
@@ -93,11 +143,22 @@ XTensor Gather(XTensor &s, XTensor &index)
     _Gather(&s, &t, &index);

     /* tensor connection */
-    XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
+    if (s.enableGrad) {
+        XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
+    }

-    return t;
+    if (index.order > 1) {
+        int * dims = new int[index.order + 1];
+        memcpy(dims, index.dimSize, index.order * sizeof(int));
+        dims[index.order] = t.GetDim(-1);
+
+        XTensor tt;
+        tt = Reshape(t, index.order + 1, dims);
+        delete[] dims;
+
+        return tt;
+    }
+    else {
+        return t;
+    }
 }

 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
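A minimal usage sketch of the reshaping behaviour of Gather with a higher-order index is shown below; the sizes are invented, the index is assumed to be filled with valid row ids, and the InitTensor2D helper is assumed from the library's usual tensor-initialization API.

    // Gathering rows of a 2-D embedding table with a 2-D index: with the logic
    // above, the result keeps the index shape and appends the last dimension of
    // the source, i.e. (batch, len) x (vocab, emb) -> (batch, len, emb).
    void GatherExample()
    {
        XTensor table;                        // (vocab, emb)
        XTensor index;                        // (batch, len), integer row ids
        InitTensor2D(&table, 1000, 64, X_FLOAT);
        InitTensor2D(&index, 8, 32, X_INT);
        // ... fill index with values in [0, 1000) ...

        XTensor rows = Gather(table, index); // expected shape: (8, 32, 64)
    }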
source/tensor/core/movement/Gather.cu
...
@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
 /*
 gather indexed sub-tensors(cuda version)

+>> sData - the data pointer of the source tensor
+>> tData - the data pointer of the target tensor
+>> sIndex - the index of the source tensor
+>> indexSize - the size of the srcIndex
+>> stride - stride of a data block
+>> strideNum - strideNum of a data block
+>> blockNum - block size of data
+*/
+__global__
+void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
+{
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    int idy = blockDim.y * blockIdx.y + threadIdx.y;
+
+    int blockIndex = idy / stride;
+    int offsetInBlock = idy % stride;
+
+    int size = stride * strideNum * blockNum;
+
+#pragma unroll
+    for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
+         i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
+         i += stride * blockDim.x) {
+        tData[i] = sData[sIndex[i]];
+    }
+}
+
+/*
+gather indexed sub-tensors(cuda version)
+
 >> s - the source tensor
 >> t - the target tensor
 >> srcIndex - the tensor to save the index of the source tensor
...
@@ -117,6 +147,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
     BacktoCudaDev(devID, devIDBackup);
 }

+/*
+gather indexed sub-tensors(cuda version)
+
+>> s - the source tensor
+>> t - the target tensor
+>> srcIndex - the tensor to save the index of the source tensor
+>> dim - the leading dimension to define "sub-tensors"
+*/
+void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
+{
+    int devID = srcIndex->devID;
+    XMem * mem = s->mem;
+
+    int stride = 1;
+    int blockNum = 1;
+    int indexSize = srcIndex->unitNum;
+    int strideNum = srcIndex->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= srcIndex->dimSize[i];
+    for (int i = dim + 1; i < srcIndex->order; i++)
+        stride *= srcIndex->dimSize[i];
+
+    int * sIndex = NULL;
+    if (srcIndex->devID < 0) {
+        sIndex = mem != NULL ?
+                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
+    }
+    else
+        sIndex = (int *)srcIndex->data;
+
+    int cudaGrids[3];
+    int cudaBlocks[3];
+    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride * blockNum, MAX_INT, cudaGrids, cudaBlocks);
+
+    KernelGather <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
+}
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
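Under the assumption that the index tensor stores one flat source offset per output element, the new KernelGather is equivalent to the host-side reference loop below (a sketch only; bounds checking and type handling are omitted).

    // Every output element i copies the source element at the flat offset sIndex[i].
    void GatherByFlatIndexCPU(const float* sData, float* tData,
                              const int* sIndex, int outputSize)
    {
        for (int i = 0; i < outputSize; i++)
            tData[i] = sData[sIndex[i]];
    }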
source/tensor/core/movement/Gather.cuh
...
@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* gather indexed sub-tensors(cuda version) */
 void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);

+void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
...
source/tensor/core/movement/Gather.h
...
@@ -27,8 +27,14 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* gather selected sub-tensors */
+void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
+
+/* gather selected sub-tensors */
 void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);

+/* gather selected sub-tensors accoding to the dimension */
+void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
+
 /* gather selected sub-tensors (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor Gather(XTensor &s, XTensor &index);
...
source/tensor/core/reduce/ReduceMax.cpp
...
@@ -35,122 +35,143 @@ get the max value of the items along a dimension of the tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on
 */
-void _ReduceMax(const XTensor * input, XTensor * output, int dim)
-{
-    CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
-                  "This code must be run on the same device!");
-    CheckNTErrors((input && output), "Empty input or output tensors!");
-    CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
-    CheckNTErrors((input->order > dim && dim >= 0), "Illegal dimension to reduce!");
-    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
-
-    int dimRDI = input->order - dim - 1;
-    CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
-    ...
-    if (input->devID >= 0) {
-#ifdef USE_CUDA
-        _CudaReduceMax(input, output, dim);
-#endif
-    }
-    else {
-        CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
-        int stride = 1;
-        int strideNum = input->dimSizeRDI[dimRDI];
-        ...
-        if (input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32) {
-            ...
-            vecBuf[0] = vecBuf[0].maxData(VectorBuffer::loadu(ptr + 0 * vecBufLength));
-            ...
-        }
-        else {
-            ...
-            DTYPE max = DTYPE_MIN;
-            ...
-        }
-    }
-}
+#define _REDUCE_CPU_FUNCTION(_funcCPUName, _vectorOp, _reduceOp) \
+void _funcCPUName(const XTensor * input, XTensor * output, int dim) \
+{ \
+    CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)), \
+                  "This code must be run on the same device!"); \
+    CheckNTErrors((input && output), "Empty input or output tensors!"); \
+    CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!"); \
+    CheckNTErrors((input->order > dim && dim >= 0), "Illegal dimension to reduce!"); \
+    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!"); \
+    CheckNTErrors(dim < input->order, "Wrong dimension!"); \
+ \
+    for (int i = 0; i < input->order; i++) { \
+        if (i < dim) { \
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i]), \
+                          "Unmatched tensors!"); \
+        } \
+        else if (i > dim) { \
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), \
+                          "Unmatched tensors!"); \
+        } \
+    } \
+ \
+    CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!"); \
+    int stride = 1; \
+    int strideNum = input->dimSize[dim]; \
+    int blockSize = 1; \
+    int blockNum = 1; \
+    for (int i = 0; i < input->order; i++) { \
+        if (i > dim) \
+            stride *= input->dimSize[i]; \
+        else if (i < dim) \
+            blockNum *= input->dimSize[i]; \
+    } \
+    blockSize = stride * strideNum; \
+ \
+    if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){ \
+        int vecBufLength = 32 / sizeof(DTYPE); \
+        if (dim == input->order - 1) { \
+            /* data is contiguous in dim 0 */ \
+            for (int i = 0; i < blockNum; i++) { \
+                DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
+                DTYPE * op = (DTYPE*)output->data + i; \
+                VectorBuffer vecBuf[4]; \
+                for (int j = 0; j < 4; j++) { \
+                    vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength); \
+                } \
+                for (int j = 1; j < strideNum / 32; j++) { \
+                    const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); \
+                    vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
+                    vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
+                    vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
+                    vecBuf[3] = vecBuf[3]._vectorOp(VectorBuffer::loadu(ptr + 3 * vecBufLength)); \
+                } \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[1]); \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[2]); \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[3]); \
+                DTYPE maxN = vecBuf[0][0]; \
+                for (int k = 1; k < vecBufLength; k++) { \
+                    maxN = _reduceOp(maxN, vecBuf[0][k]); \
+                } \
+                *op = maxN; \
+            } \
+        } \
+        else { \
+            /* data is separated */ \
+            for(int i = 0; i < blockNum; i++){ \
+                for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ \
+                    DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
+                    DTYPE * op = (DTYPE*)output->data + stride * i; \
+                    VectorBuffer vecBuf[4]; \
+                    for(int k = 0; k < 4; k++){ \
+                        vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE)); \
+                    } \
+                    for(int k = 1; k < strideNum; k++){ \
+                        DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength; \
+                        vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
+                        vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
+                        vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
+                        vecBuf[3] = vecBuf[3]._vectorOp(VectorBuffer::loadu(ptr + 3 * vecBufLength)); \
+                    } \
+                    for(int k = 0; k < 4; k++){ \
+                        for(int l = 0; l < vecBufLength; l++) \
+                            *(op + j * 32 + 8 * k + l) = vecBuf[k][l]; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    /* run vector buffer */ \
+    else{ \
+        for(int k = 0; k < blockNum; k++){ \
+            DTYPE * ip = (DTYPE*)input->data + blockSize * k; \
+            DTYPE * op = (DTYPE*)output->data + stride * k; \
+            for(int i = 0; i < stride; i++){ \
+                DTYPE * ipe = ip + blockSize; \
+                DTYPE tmpData = *(ip + i); \
+                for(DTYPE * ipb = ip + i + stride; ipb < ipe; ipb += stride){ \
+                    DTYPE v = *ipb; \
+                    tmpData = _reduceOp(tmpData, v); \
+                } \
+                *(op + i) = tmpData; \
+            } \
+        } \
+    } \
+}
+
+_REDUCE_CPU_FUNCTION(reduceMaxCPU, maxData, MAX)
+_REDUCE_CPU_FUNCTION(reduceMinCPU, minData, MIN)
+
+#ifdef USE_CUDA
+#define _REDUCE_FUNCTION(_funcName, _cudaFuncName) \
+void _funcName(const XTensor * input, XTensor * output, int dim) \
+{ \
+    if(input->devID >= 0){ \
+        _cudaFuncName(input, output, dim); \
+    } \
+    else{ \
+        reduceMaxCPU(input, output, dim); \
+    } \
+}
+_REDUCE_FUNCTION(_ReduceMax, _CudaReduceMax)
+_REDUCE_FUNCTION(_ReduceMin, _CudaReduceMin)
+#else
+#define _REDUCE_FUNCTION(_funcName, reduceNameCPU) \
+void _funcName(const XTensor * input, XTensor * output, int dim) \
+{ \
+    CheckNTErrors((input->devID < 0), "This code must be run on the CPU!"); \
+    reduceNameCPU(input, output, dim); \
+}
+_REDUCE_FUNCTION(_ReduceMax, reduceMaxCPU)
+_REDUCE_FUNCTION(_ReduceMin, reduceMinCPU)
+#endif

 /*
 get the max value of the items along a dimension of the tensor (return an XTensor structure).
...
@@ -160,74 +181,38 @@ make a new tensor to keep the result and return it
 >> dim - the dimension where the reduction is performed on
 << return - the max value of the items along a dimension of the tensor
 */
-XTensor ReduceMax(const XTensor &input, int dim)
-{
-    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-    ...
-    /* call _ReduceMax function */
-    _ReduceMax(&input, &output, dim);
-    ...
-}
-
-/*
-get the max value of the items along a dimension of the tensor
-
->> input - the input tensor
->> output - the output tensor
->> dim - the dimension where the reduction is performed on
-*/
-void ReduceMax(const XTensor &input, XTensor &output, int dim)
-{
-    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-
-    if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
-        ...
-        InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
-        ...
-    }
-
-    /* call _ReduceMax function */
-    _ReduceMax(&input, &output, dim);
-
-    if (input.enableGrad) {
-        /* tensor connections */
-        XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
-        XLink::AddParamToHeadInt(&output, dim);
-    }
-}
+#define REDUCE_FUNCTION(funcName, funcOp) \
+XTensor funcName(const XTensor & input, int dim) \
+{ \
+    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!"); \
+ \
+    int order = input.order - 1; \
+    int * dimSize = new int[order]; \
+    for(int i = 0; i < order; i++){ \
+        if(i < dim) \
+            dimSize[i] = input.dimSize[i]; \
+        else if(i >= dim) \
+            dimSize[i] = input.dimSize[i + 1]; \
+    } \
+ \
+    float dr = (!input.isSparse) ? 1.0F : input.denseRatio; \
+    XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem); \
+    output.SetTMPFlag(); \
+ \
+    /* call _ReduceMax function */ \
+    funcOp(&input, &output, dim); \
+ \
+    /* tensor connection */ \
+    XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX); \
+    XLink::AddParamToHeadInt(&output, dim); \
+ \
+    /* destroy variables */ \
+    delete[] dimSize; \
+ \
+    return output; \
+}
+
+REDUCE_FUNCTION(ReduceMax, _ReduceMax)
+REDUCE_FUNCTION(ReduceMin, _ReduceMin)

 } // namespace nts(NiuTrans.Tensor)
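The scalar fallback path of the CPU reduce macro above walks a blockNum x strideNum x stride decomposition of the input; a plain C++ reference of that path, using std::max in place of the _reduceOp parameter and container types chosen only for illustration, is sketched here.

    #include <algorithm>
    #include <vector>

    // Reduce a row-major tensor over dimension `dim` by taking the maximum.
    // Output shape equals the input shape with dims[dim] removed.
    std::vector<float> ReduceMaxRef(const std::vector<float>& in,
                                    const std::vector<int>& dims, int dim)
    {
        int stride = 1, blockNum = 1, strideNum = dims[dim];
        for (int i = dim + 1; i < (int)dims.size(); i++) stride   *= dims[i];
        for (int i = 0; i < dim; i++)                    blockNum *= dims[i];

        std::vector<float> out((size_t)blockNum * stride);
        for (int b = 0; b < blockNum; b++)
            for (int k = 0; k < stride; k++) {
                const float* ip = in.data() + (size_t)b * strideNum * stride;
                float m = ip[k];
                for (int j = 1; j < strideNum; j++)
                    m = std::max(m, ip[j * stride + k]);
                out[(size_t)b * stride + k] = m;
            }
        return out;
    }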
source/tensor/core/reduce/ReduceMax.cu
查看文件 @
f5149a15
...
@@ -33,67 +33,75 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -33,67 +33,75 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
/*
use PTX code to reduce float data
use PTX code to reduce float data
*/
*/
__device__ __forceinline__
#define SHLFUNCFLOAT(funcName, reducePTXOp) \
float shflDownReduceMax(float input)
__device__ __forceinline__ \
{
float funcName(float input) \
float output;
{ \
asm volatile(
float output; \
"{"
asm volatile( \
".reg .f32 r0;"
"{" \
".reg .pred p;"
".reg .f32 r0;" \
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
".reg .pred p;" \
"setp.lt.f32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"@p mov.f32 %1,r0;" \
"setp.lt.f32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"@p mov.f32 %1,r0;" \
"setp.lt.f32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"@p mov.f32 %1,r0;" \
"setp.lt.f32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"@p mov.f32 %1,r0;" \
"setp.lt.f32 p, %1, r0; "
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p, %1, r0; " \
"mov.f32 %0,%1;"
"@p mov.f32 %1,r0;" \
"}"
"mov.f32 %0,%1;" \
: "=f"(output) : "f"(input));
"}" \
return output;
: "=f"(output) : "f"(input)); \
return output; \
}
}
SHLFUNCFLOAT(shflDownReduceMax, lt)
SHLFUNCFLOAT(shflDownReduceMin, gt)
/*
/*
use PTX code to reduce int data
use PTX code to reduce int data
*/
*/
__device__ __forceinline__
#define SHLFUNCINT(funcName, reducePTXOp) \
int shflDownReduceMax(int input)
__device__ __forceinline__ \
{
int funcName(int input) \
int output;
{ \
asm volatile(
int output; \
"{"
asm volatile( \
".reg .s32 r0;"
"{" \
".reg .pred p;"
".reg .s32 r0;" \
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
".reg .pred p;" \
"setp.lt.s32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"@p mov.s32 %1,r0;" \
"setp.lt.s32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"@p mov.s32 %1,r0;" \
"setp.lt.s32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"@p mov.s32 %1,r0;" \
"setp.lt.s32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"@p mov.s32 %1,r0;" \
"setp.lt.s32 p, %1, r0; "
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p, %1, r0; " \
"mov.s32 %0,%1;"
"@p mov.s32 %1,r0;" \
"}"
"mov.s32 %0,%1;" \
: "=r"(output) : "r"(input));
"}" \
return output;
: "=r"(output) : "r"(input)); \
return output; \
}
}
SHLFUNCINT(shflDownReduceMax, lt)
SHLFUNCINT(shflDownReduceMin, gt)
/*
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
Given a block of data, we go over each dimension i in the stride and we have
...
@@ -108,48 +116,52 @@ crossing of the i-th columne and the j-th row.
...
@@ -108,48 +116,52 @@ crossing of the i-th columne and the j-th row.
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
>> blockNum - how many blocks
*/
*/
__global__
#define KERNELREDUCEFUN3(funName, opName, initData) \
void KernelReduceMax(DTYPE * input, DTYPE * output,
__global__ \
int stride, int strideNum, int reducedStrideNum,
void funName(DTYPE * input, DTYPE * output, \
int blockSize, int blockNum)
int stride, int strideNum, int reducedStrideNum, \
{
int blockSize, int blockNum) \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
{ \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2]; \
int idx = threadIdx.x * blockDim.y + threadIdx.y;
\
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
int idx = threadIdx.x * blockDim.y + threadIdx.y; \
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; \
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y; \
if(i >= stride * blockNum)
\
return;
if(i >= stride * blockNum) \
return; \
__syncthreads();
\
__syncthreads(); \
int k = i / stride;
\
int iOffset = i % stride;
int k = i / stride; \
int iOffset = i % stride; \
DTYPE value = (i < stride * blockNum && j < strideNum) ?
\
input[blockSize * k + stride * j + iOffset] : FLOAT_MIN;
DTYPE value = (i < stride * blockNum && j < strideNum) ? \
input[blockSize * k + stride * j + iOffset] : initData; \
/* load data into the shared mem */
\
iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
/* load data into the shared mem */ \
iData[threadIdx.x * blockDim.y + threadIdx.y] = value; \
__syncthreads();
\
__syncthreads(); \
/* do reduction in shared mem */
\
for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){
/* do reduction in shared mem */ \
if(threadIdx.y < s && iData[idx] < iData[idx + s]){
for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){ \
iData[idx] = iData[idx + s];
if(threadIdx.y < s){ \
}
iData[idx] = opName(iData[idx + s], iData[idx]); \
} \
__syncthreads();
\
}
__syncthreads(); \
} \
/* write result for this block to the output array */
\
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
/* write result for this block to the output array */ \
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) \
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y]; \
\
}
}
KERNELREDUCEFUN3(KernelReduceMax, MAX, FLOAT_MIN)
KERNELREDUCEFUN3(KernelReduceMin, MIN, MAX_FLOAT)
/*
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
Given a block of data, we go over each dimension i in the stride and we have
...
@@ -231,48 +243,52 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve
...
@@ -231,48 +243,52 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
>> blockNum - how many blocks
*/
*/
template <unsigned int goodSize> __global__
#define KERNELREDUCEFUN4(funName, opName, opFuncName, initData) \
void KernelReduceMaxFast(DTYPE * input, DTYPE * output,
template <unsigned int goodSize> __global__ \
int stride, int strideNum, int reducedStrideNum,
void funName(DTYPE * input, DTYPE * output, \
int blockSize, int blockNum)
int stride, int strideNum, int reducedStrideNum, \
{
int blockSize, int blockNum) \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
{ \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK]; \
unsigned int tid = threadIdx.y;
\
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
unsigned int tid = threadIdx.y; \
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y; \
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
if(i >= stride * blockNum)
\
return;
if(i >= stride * blockNum) \
return; \
__syncthreads();
\
__syncthreads(); \
/* first level reduction */
\
int k = i / stride;
/* first level reduction */ \
int iOffset = i % stride;
int k = i / stride; \
int iOffset = i % stride; \
DTYPE * data = iData + threadIdx.x * blockDim.y;
\
DTYPE * inputData = input + k * blockSize;
DTYPE * data = iData + threadIdx.x * blockDim.y; \
DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : FLOAT_MIN;
DTYPE * inputData = input + k * blockSize; \
DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN;
DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : initData; \
DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: initData; \
value = MAX(value, value2);
\
value = shflDownReduceMax(value);
value = opName(value, value2); \
if ((tid & 0x1f) == 0)
value = opFuncName(value); \
data[tid / 32] = value;
if ((tid & 0x1f) == 0) \
__syncthreads();
data[tid / 32] = value; \
__syncthreads(); \
if (tid < 32) {
\
if (tid < blockDim.y / 32)
if (tid < 32) { \
value = data[tid];
if (tid < blockDim.y / 32) \
else
value = data[tid]; \
value = FLOAT_MIN;
else \
value = shflDownReduceMax(value);
value = initData; \
if (tid == 0 && blockIdx.y < reducedStrideNum)
value = opFuncName(value); \
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
if (tid == 0 && blockIdx.y < reducedStrideNum) \
}
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value; \
} \
}
}
KERNELREDUCEFUN4(KernelReduceMaxFast, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN4(KernelReduceMinFast, MIN, shflDownReduceMin, MAX_FLOAT)
/*
/*
reduce a tensor to another that keeps the max value along a dimension - fast version
reduce a tensor to another that keeps the max value along a dimension - fast version
>> input - the input array (representing a tensor)
>> input - the input array (representing a tensor)
...
@@ -372,14 +388,12 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
...
@@ -372,14 +388,12 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
int stride4 = stride3 + stride;
int stride4 = stride3 + stride;
for(int k = 0; k < blockSize; k += stride4){
for(int k = 0; k < blockSize; k += stride4){
DTYPE m = MAX(MAX(ip[k], ip[k + stride]), MAX(ip[k + stride2], ip[k + stride3]));
DTYPE m = MAX(MAX(ip[k], ip[k + stride]), MAX(ip[k + stride2], ip[k + stride3]));
if(max < m)
max = MAX(max, m);
max = m;
}
}
}
}
else{
else{
for(int k = 0; k < blockSize; k += stride)
for (int k = 0; k < blockSize; k += stride)
if(max < ip[k])
max = MAX(max, ip[k]);
max = ip[k];
}
}
__syncthreads();
__syncthreads();
...
@@ -429,66 +443,75 @@ inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
...
@@ -429,66 +443,75 @@ inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
/*
/*
In some case,we use less block to imporve efficiency
In some case,we use less block to imporve efficiency
*/
*/
__global__
#define KERNELREDUCEFUN2(funName, opName, opFuncName, initData) \
void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output, int strideNum, int blockNum)
__global__ \
{
void funName(DTYPE * input, DTYPE * output, int strideNum, int blockNum) \
int idx = threadIdx.x % 32;
{ \
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
int idx = threadIdx.x % 32; \
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32; \
int startIndex = idy * strideNum;
\
DTYPE threadMax = FLOAT_MIN;
int startIndex = idy * strideNum; \
for (int i = idx; i < strideNum; i += 32) {
DTYPE threadMax = initData; \
threadMax = max(input[startIndex + i], threadMax);
for (int i = idx; i < strideNum; i += 32) { \
}
threadMax = opName(input[startIndex + i], threadMax); \
threadMax = shflDownReduceMax(threadMax);
} \
if (idx == 0)
threadMax = opFuncName(threadMax); \
output[idy] = threadMax;
if (idx == 0) \
output[idy] = threadMax; \
}
}
KERNELREDUCEFUN2(KernelReduceMaxOpLessBlocks, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN2(KernelReduceMinOpLessBlocks, MIN, shflDownReduceMin, MAX_FLOAT)
/*
/*
we use PTX code reduce
we use PTX code reduce
*/
*/
__global__
#define KERNELREDUCEFUN1(funName, opName, opFuncName, initData) \
void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
__global__ \
int reducedStrideNum,int blockSize, int blockNum)
void funName(DTYPE * input, DTYPE * output,int stride, int strideNum, \
{
int reducedStrideNum,int blockSize, int blockNum) \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
{ \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32]; \
unsigned int tid = threadIdx.y;
\
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int tid = threadIdx.y; \
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y; \
if (i >= stride * blockNum)
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
return;
if (i >= stride * blockNum) \
return; \
/* first level reduction */
\
int k = i / stride;
/* first level reduction */ \
int iOffset = i % stride;
int k = i / stride; \
int iOffset = i % stride; \
DTYPE threadMax = FLOAT_MIN;
\
DTYPE threadMax = initData; \
DTYPE * data = iData + threadIdx.x * blockDim.y;
\
DTYPE * inputData = input + k * blockSize;
DTYPE * data = iData + threadIdx.x * blockDim.y; \
for (int it = j; it < strideNum; it += blockDim.y){
DTYPE * inputData = input + k * blockSize; \
threadMax = max(inputData[it * stride + iOffset], threadMax);
for (int it = j; it < strideNum; it += blockDim.y){ \
}
threadMax = opName(inputData[it * stride + iOffset], threadMax); \
} \
__syncthreads();
\
threadMax = shflDownReduceMax(threadMax);
__syncthreads(); \
if ((tid & 0x1f) == 0)
threadMax = opFuncName(threadMax); \
data[tid / 32] = threadMax;
if ((tid & 0x1f) == 0) \
data[tid / 32] = threadMax; \
__syncthreads();
\
/* use one warp to reduce remaining data */
__syncthreads(); \
if (tid < 32){
/* use one warp to reduce remaining data */ \
if (tid < blockDim.y / 32)
if (tid < 32){ \
threadMax = data[tid];
if (tid < blockDim.y / 32) \
else threadMax = FLOAT_MIN;
threadMax = data[tid]; \
threadMax = shflDownReduceMax(threadMax);
else threadMax = initData; \
if (tid == 0 && blockIdx.y < reducedStrideNum)
threadMax = opFuncName(threadMax); \
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
if (tid == 0 && blockIdx.y < reducedStrideNum) \
}
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax; \
} \
}
}
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
/*
get the max-valued items along a dimension of the tensor (cuda version).
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
For a 1-dimensional data array a,
...
@@ -497,203 +520,207 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
...
@@ -497,203 +520,207 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
>> output - the output tensor
>> output - the output tensor
>> dim - which dimension to reduce
>> dim - which dimension to reduce
*/
*/
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
#define _CUDAREDUCE(_funcName, _reduceFunc1, _reduceFunc2, _reduceFunc3, _reduceFun4) \
{
void _funcName(const XTensor * input, XTensor * output, int dim) \
CheckNTErrors(input && output, "Empty input or output tensors!");
{ \
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
CheckNTErrors(input && output, "Empty input or output tensors!"); \
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!"); \
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!"); \
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!"); \
int dimRDI = input->order - dim - 1;
\
for(int i = 0; i < input->order; i++){
for(int i = 0; i < input->order; i++){ \
if(i < dimRDI){
if(i < dim){ \
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!"); \
}
} \
else if(i > dimRDI){
else if(i > dim){ \
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!"); \
}
} \
}
} \
\
int cudaGridSize[3];
int cudaGridSize[3]; \
int cudaBlockSize[3];
int cudaBlockSize[3]; \
int iter = 0;
int iter = 0; \
int stride = 1;
int stride = 1; \
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim]; \
int blockSize = 1;
int blockSize = 1; \
int blockNum = 1;
int blockNum = 1; \
\
for (int i = 0; i < input->order; i++) {
for (int i = 0; i < input->order; i++) { \
if (i < dimRDI)
if (i < dim) \
stride *= input->dimSizeRDI[i];
blockNum *= input->dimSize[i]; \
else if (i > dimRDI)
else if (i > dim) \
blockNum *= input->dimSizeRDI[i];
stride *= input->dimSize[i]; \
}
} \
blockSize = stride * strideNum;
blockSize = stride * strideNum; \
\
int devID = input->devID;
int devID = input->devID; \
XMem * mem = input->mem;
XMem * mem = input->mem; \
\
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
\
int bufSize = sizeof(DTYPE) * cudaGridSize[0] * stride * blockNum * 2;
int bufSize = sizeof(DTYPE) * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize); \
DTYPE * buf1 = buf;
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
\
int devIDBackup;
int devIDBackup; \
ProtectCudaDev(input->devID, devIDBackup);
ProtectCudaDev(input->devID, devIDBackup); \
\
if (stride == 1 && blockNum >= 10) {
if (stride == 1 && blockNum >= 10) { \
dim3 grids;
dim3 grids; \
dim3 blocks;
dim3 blocks; \
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum); \
if (blocks.y >= 128) {
if (blocks.y >= 128) { \
KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
_reduceFunc1 <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum); \
}
} \
else {
else { \
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1; \
else blockNum = blockNum / 4;
else blockNum = blockNum / 4; \
KernelReduceMaxOpLessBlocks <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
_reduceFunc2 <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum); \
}
} \
}
} \
else {
else { \
do {
do { \
if (input->dataType == DEFAULT_DTYPE) {
if (input->dataType == DEFAULT_DTYPE) { \
DTYPE * iData = NULL;
DTYPE * iData = NULL; \
DTYPE * oData = NULL;
DTYPE * oData = NULL; \
if (iter == 0) {
if (iter == 0) { \
iData = (DTYPE*)input->data;
iData = (DTYPE*)input->data; \
oData = buf1;
oData = buf1; \
}
} \
else if (iter % 2 == 1) {
else if (iter % 2 == 1) { \
iData = buf1;
iData = buf1; \
oData = buf2;
oData = buf2; \
}
} \
else {
else { \
iData = buf2;
iData = buf2; \
oData = buf1;
oData = buf1; \
}
} \
\
/* unroll the reduction procedure. The code is messy but it is faster. */
/* unroll the reduction procedure. The code is messy but it is faster. */ \
if (strideNum < 32) {
if (strideNum < 32) { \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFunc3 <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
else if (strideNum < 128) {
else if (strideNum < 128) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!"); \
adjustThreadForUseWarpOptimization(blocks, threads);
adjustThreadForUseWarpOptimization(blocks, threads); \
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFun4<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
else if (strideNum < 256) {
else if (strideNum < 256) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!"); \
adjustThreadForUseWarpOptimization(blocks, threads);
adjustThreadForUseWarpOptimization(blocks, threads); \
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFun4<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
else if (strideNum < 512) {
else if (strideNum < 512) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!"); \
adjustThreadForUseWarpOptimization(blocks, threads);
adjustThreadForUseWarpOptimization(blocks, threads); \
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFun4<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
else {
else { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!"); \
adjustThreadForUseWarpOptimization(blocks, threads);
adjustThreadForUseWarpOptimization(blocks, threads); \
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFun4<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
}
} \
else if (input->dataType == X_FLOAT16) {
else if (input->dataType == X_FLOAT16) { \
    __half * buf1ft16 = (__half *)buf1; \
    __half * buf2ft16 = (__half *)buf2; \
    __half * iData = NULL; \
    __half * oData = NULL; \
    if (iter == 0) { \
        iData = (__half*)input->data; \
        oData = buf1ft16; \
    } \
    else if (iter % 2 == 1) { \
        iData = buf1ft16; \
        oData = buf2ft16; \
    } \
    else { \
        iData = buf2ft16; \
        oData = buf1ft16; \
    } \
    \
    /* unroll the reduction procedure. The code is messy but it is faster. */ \
    if (strideNum < 32) { \
        GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    else if (strideNum < 128) { \
        GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!"); \
        KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    else if (strideNum < 256) { \
        GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!"); \
        KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    else if (strideNum < 512) { \
        GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!"); \
        KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    else { \
        GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!"); \
        KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    } \
    \
    strideNum = cudaGridSize[0]; \
    blockSize = cudaGridSize[0]; \
    \
    iter++; \
    \
    } while (strideNum > 1); \
    } \
    \
    BacktoCudaDev(input->devID, devIDBackup); \
    \
    if (mem != NULL) \
        mem->ReleaseBuf(mem->devID, bufSize); \
    else \
        XMemFree(input->devID, buf); \
}

_CUDAREDUCE(_CudaReduceMax, KernelReduceMaxOp, KernelReduceMaxOpLessBlocks, KernelReduceMax, KernelReduceMaxFast)
_CUDAREDUCE(_CudaReduceMin, KernelReduceMinOp, KernelReduceMinOpLessBlocks, KernelReduceMin, KernelReduceMinFast)

#endif // USE_CUDA
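The two _CUDAREDUCE instantiations above stamp out a max and a min reduction from one shared body. As a minimal, self-contained sketch of the same macro technique (hypothetical names, plain host-side C++, not the library's kernels):

    #include <algorithm>
    #include <vector>
    #include <cstdio>

    /* The reduction body is written once and instantiated twice; the combining
       operation is a macro parameter. REDUCE_IMPL, ReduceMaxCPU and ReduceMinCPU
       are illustrative names, not part of NiuTrans.Tensor. */
    #define REDUCE_IMPL(funcName, combine)                      \
    void funcName(const float * data, int n, float * result)    \
    {                                                           \
        float r = data[0];                                      \
        for (int i = 1; i < n; i++)                             \
            r = combine(r, data[i]);                            \
        *result = r;                                            \
    }

    REDUCE_IMPL(ReduceMaxCPU, std::max)
    REDUCE_IMPL(ReduceMinCPU, std::min)

    int main()
    {
        std::vector<float> v = {3.0f, -1.5f, 7.25f, 0.0f};
        float hi = 0, lo = 0;
        ReduceMaxCPU(v.data(), (int)v.size(), &hi);
        ReduceMinCPU(v.data(), (int)v.size(), &lo);
        printf("max = %g, min = %g\n", hi, lo);   /* max = 7.25, min = -1.5 */
        return 0;
    }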
source/tensor/core/reduce/ReduceMax.cuh

@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* get the max-valued items along a dimension of the tensor (cuda version) */
 void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);

+/* get the min-valued items along a dimension of the tensor (cuda version) */
+void _CudaReduceMin(const XTensor * input, XTensor * output, int dim);
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
source/tensor/core/reduce/ReduceMax.h

@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* get the max value of the items along a dimension of the tensor. */
 void _ReduceMax(const XTensor * input, XTensor * output, int dim);

+/* get the min value of the items along a dimension of the tensor. */
+void _ReduceMin(const XTensor * input, XTensor * output, int dim);
+
 /*
 get the max value of the items along a dimension of the tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
 */
 XTensor ReduceMax(const XTensor &input, int dim);

-/* get the max value of the items along a dimension of the tensor. */
-void ReduceMax(const XTensor &input, XTensor &output, int dim);
+/*
+get the min value of the items along a dimension of the tensor (return an XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor ReduceMin(const XTensor &input, int dim);

 } // namespace nts(NiuTrans.Tensor)
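A short usage sketch of the paired interfaces declared above. It assumes only what the declarations state (each call builds a new tensor with the reduced result); tensor construction is left out, so this is an illustration, not library documentation.

    #include "ReduceMax.h"
    using namespace nts;

    /* sketch only: "a" is assumed to be an initialized 2-D XTensor (rows x cols) */
    void ReduceExample(const XTensor &a)
    {
        XTensor rowMax = ReduceMax(a, 1);   /* per-row maximum over the column dimension */
        XTensor rowMin = ReduceMin(a, 1);   /* per-row minimum, the interface added here */
    }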
source/tensor/core/reduce/ReduceMean.cpp

@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
 {
     CheckNTErrors((input->order > dim), "Illegal dimension specified!");

-    int dimRDI = input->order - dim - 1;
-    int num = input->dimSizeRDI[dimRDI];
+    int num = input->dimSize[dim];

     _ReduceSum(input, output, dim);
     _ScaleAndShiftMe(output, (DTYPE)1/num, 0);
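Throughout this commit the reverse-dimension access dimSizeRDI[order - dim - 1] is replaced by the direct access dimSize[dim]. Assuming dimSizeRDI simply stores the dimension sizes in reverse order, which is what every substitution in these hunks implies, the two accesses are interchangeable; a tiny sketch of that invariant:

    #include <cassert>

    /* sketch: dimSizeRDI is assumed to be dimSize reversed, so the old
       dimSizeRDI[dimRDI] access and the new dimSize[dim] access agree */
    void CheckRdiEquivalence(const int * dimSize, const int * dimSizeRDI, int order, int dim)
    {
        int dimRDI = order - dim - 1;               /* the index the old code computed */
        assert(dimSizeRDI[dimRDI] == dimSize[dim]);
    }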
source/tensor/core/reduce/ReduceSum.cpp

@@ -54,15 +54,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
     CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
     CheckNTErrors((shift == NULL || _IsSameShaped(output, shift)), "Incorrect shift tensor size!");

-    int dimRDI = input->order - dim - 1;
-    CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
+    CheckNTErrors(dim < input->order, "Wrong dimension!");

     for(int i = 0; i < input->order; i++){
-        if(i < dimRDI){
-            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
+        if(i < dim){
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i]), "Unmatched tensors!");
         }
-        else if(i > dimRDI){
-            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
+        else if(i > dim){
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), "Unmatched tensors!");
         }
     }

@@ -75,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");

     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;

     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i < dim)
+            blockNum *= input->dimSize[i];
+        else if (i > dim)
+            stride *= input->dimSize[i];
     }
     blockSize = stride * strideNum;

-    if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
+    if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
         int vecBufLength = 32 / sizeof(DTYPE);
-        if(dimRDI == 0){
+        if(dim == input->order - 1){
             //data is contiguous in dim 0
             for(int i = 0; i < blockNum; i++){
                 // stride = 1

@@ -123,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
         }
         else{
             //data is separated
             for(int i = 0; i < blockNum; i++){
-                for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
+                for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
                     DTYPE * ip = (DTYPE*)input->data + blockSize * i;
                     DTYPE * op = (DTYPE*)output->data + stride * i;
                     DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
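The stride/blockNum/blockSize computation above recurs in most of the files below. A standalone sketch of what it produces after the change to direct indexing (hypothetical helper, not library code): dimensions before dim count the blocks, dimensions after dim give the in-block stride, and blockSize covers one block including the reduced dimension.

    #include <vector>

    struct ReduceLayout { int stride, strideNum, blockNum, blockSize; };

    /* sketch: mirrors the loop pattern used by _ReduceSum and friends */
    ReduceLayout GetReduceLayout(const std::vector<int> & dimSize, int dim)
    {
        ReduceLayout r{1, dimSize[dim], 1, 0};
        for (int i = 0; i < (int)dimSize.size(); i++) {
            if (i < dim)
                r.blockNum *= dimSize[i];   /* dimensions before dim index the blocks */
            else if (i > dim)
                r.stride *= dimSize[i];     /* dimensions after dim are contiguous within a block */
        }
        r.blockSize = r.stride * r.strideNum;
        return r;
    }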
source/tensor/core/reduce/ReduceSum.cu

@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
     CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");

-    int dimRDI = input->order - dim - 1;
     for(int i = 0; i < input->order; i++){
-        if(i < dimRDI){
-            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
+        if(i < dim){
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
         }
-        else if(i > dimRDI){
-            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
+        else if(i > dim){
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
         }
     }

@@ -709,31 +708,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     int cudaBlockSize[3];
     int iter = 0;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;

     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i < dim)
+            blockNum *= input->dimSize[i];
+        else if (i > dim)
+            stride *= input->dimSize[i];
     }
     blockSize = stride * strideNum;

     int devID = input->devID;
-    XMem * mem = input->mem;
-
-    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-
-    int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
-    DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
-    DTYPE * buf1 = buf;
-    DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
-    DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
-
-    int devIDBackup;
-    ProtectCudaDev(input->devID, devIDBackup);
+    int devIDBackup;
+    ProtectCudaDev(devID, devIDBackup);
+
+    DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;

     if (stride == 1 && blockNum >= 10) {
         dim3 grids;

@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
                          strideNum, blockNum, sp, power, isExp);
     }
     else {
+        XMem * mem = input->mem;
+
+        GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+
+        int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
+        DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
+        DTYPE * buf1 = buf;
+        DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
         do {
             if (input->dataType == DEFAULT_DTYPE) {
                 DTYPE * iData = NULL;

@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
             iter++;

         } while (strideNum > 1);
+
+        if (mem != NULL)
+            mem->ReleaseBuf(mem->devID, bufSize);
+        else
+            XMemFree(devID, buf);
     }

-    ProtectCudaDev(input->devID, devIDBackup);
-
-    if (mem != NULL)
-        mem->ReleaseBuf(mem->devID, bufSize);
-    else
-        XMemFree(input->devID, buf);
+    BacktoCudaDev(devID, devIDBackup);
 }

 #endif // USE_CUDA
source/tensor/core/reduce/ReduceVariance.cpp

@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
 */
 void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
 {
-    int dimRDI = input->order - dim - 1;
-    int num = input->dimSizeRDI[dimRDI];
+    int num = input->dimSize[dim];
     _ReduceSum(input, output, dim, mean, 2.0F);
     _ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
 }
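For reference, the identity the function implements, variance = 1/n * sum_i (a_i - mean)^2, as a plain scalar sketch (not library code), mirroring the power-2 ReduceSum followed by the 1/num scale:

    /* sketch: scalar version of _ReduceVariance's decomposition */
    float Variance(const float * a, int n)
    {
        float mean = 0.0f;
        for (int i = 0; i < n; i++)
            mean += a[i];
        mean /= n;

        float sum = 0.0f;
        for (int i = 0; i < n; i++)
            sum += (a[i] - mean) * (a[i] - mean);   /* the power = 2.0F reduction */

        return sum / n;                             /* the _ScaleAndShiftMe(output, 1/num, 0) step */
    }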
source/tensor/core/reduce/VectorBuffer.cpp

@@ -20,7 +20,7 @@
 */

 #include "VectorBuffer.h"
+//#include "math.h"
 namespace nts {
 /* data size for each buffer */
 int VectorBuffer::size()

@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
     return *this;
 }

+/* compute the min of two buffers */
+VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
+    for (int i = 0; i != a.size(); i++) {
+        this->values[i] = MIN(a[i], this->values[i]);
+        printf("runhere");
+    }
+    return *this;
+}
+
 }/* end of the nts (NiuTrans.Tensor) namespace */
 \ No newline at end of file
source/tensor/core/reduce/VectorBuffer.h

@@ -20,7 +20,6 @@
 */
 //#include <cstring>
-#include <math.h>
 #include "../../XGlobal.h"

 namespace nts {

@@ -49,5 +48,8 @@ public:
     /* compute the max of two buffers */
     VectorBuffer maxData(const VectorBuffer &a);

+    /* compute the min of two buffers */
+    VectorBuffer minData(const VectorBuffer &a);
 };
 }
 \ No newline at end of file
source/tensor/core/shape/ConcatenateSolely.cpp

@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
     CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");

     int catDimSize = 0;
-    int dimRDI = big->order - dim - 1;

     for (int i = 0; i < smalls->count; i++) {
         XTensor * tensor = (XTensor*)smalls->GetItem(i);
         CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
         for (int j = 0; j < big->order; j++) {
-            if (j != dimRDI) {
-                CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!");
+            if (j != dim) {
+                CheckNTErrors((big->dimSize[j] == tensor->dimSize[j]), "Unmatched tensor sizes!");
             }
             else {
-                catDimSize += tensor->dimSizeRDI[j];
+                catDimSize += tensor->dimSize[j];
             }
         }
     }

-    CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!");
+    CheckNTErrors((catDimSize == big->dimSize[dim]), "Unmatched tensor sizes!");

     int stride = 1;
-    for (int i = 0; i < dimRDI; i++)
-        stride *= big->dimSizeRDI[i];
-
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < big->order; i++)
-        blockNum *= big->dimSizeRDI[i];
+    for (int i = 0; i < dim; i++)
+        blockNum *= big->dimSize[i];
+    for (int i = dim + 1; i < big->order; i++)
+        stride *= big->dimSize[i];

     int offset = 0;

@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
     if (smalls->count <= MIN_TENSOR_CAT_NUM) {
         for (int i = 0; i < smalls->count; i++) {
             XTensor * tensor = (XTensor*)smalls->GetItem(i);
-            int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
-            int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize;
+            int sPitch = stride * tensor->dimSize[dim] * tensor->unitSize;
+            int tPitch = stride * big->dimSize[dim] * big->unitSize;
             int mSize = sPitch;
             int n = blockNum;
             XMemCopy2D((char*)big->data + offset, tPitch, big->devID,

@@ -89,7 +88,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
         int * blockSizes = new int[smalls->count];
         for (int i = 0; i < smalls->count; i++) {
             XTensor * tensor = (XTensor*)smalls->GetItem(i);
-            blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
+            blockSizes[i] = stride * tensor->dimSize[dim] * tensor->unitSize;
             sourceArrays->Add((char*)tensor->data);
         }
source/tensor/core/shape/IsSameShaped.cpp

@@ -39,7 +39,7 @@ bool _IsSameShaped(const XTensor * a, const XTensor * b)
         return false;

     for(int i = 0; i < a->order; i++){
-        if(a->dimSizeRDI[i] != b->dimSizeRDI[i])
+        if(a->dimSize[i] != b->dimSize[i])
             return false;
     }
source/tensor/core/shape/Merge.cpp

@@ -46,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
     if (leadingDim < 0)
         leadingDim = 0;

-    int whereToMergeRDI = s->order - whereToMerge - 1;
-    int leadingDimRDI = s->order - leadingDim - 1;
-    if (leadingDimRDI < 0)
-        leadingDimRDI = s->order - 1;
+    if (leadingDim >= s->order)
+        leadingDim = leadingDim - s->order;

     CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
     CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),

@@ -57,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
     CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
     CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
-    CheckNTErrors((leadingDimRDI > whereToMergeRDI), "Invalid leading dimension!");
+    CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");

     for (int i = 0; i < s->order; i++) {
-        if (i == whereToMergeRDI) {
-            CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
+        if (i == whereToMerge) {
+            CheckNTErrors((t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]),
                           "Unmatched tensor sizes!");
         }
-        else if (i < leadingDimRDI){
-            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
+        else if (i < leadingDim){
+            CheckNTErrors((s->dimSize[i] == t->dimSize[i]),
                           "Unmatched tensor sizes!");
         }
-        else if (i > leadingDimRDI) {
-            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]),
+        else if (i > leadingDim) {
+            CheckNTErrors((s->dimSize[i] == t->dimSize[i - 1]),
                           "Unmatched tensor sizes!");
         }
     }

@@ -78,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
     int blockNum = 1;
     int gridSize = 1;
     int gridNum = 1;
-    int mergedNum = s->dimSizeRDI[leadingDimRDI];
+    int mergedNum = s->dimSize[leadingDim];

     for (int i = 0; i < s->order; i++) {
-        if (i <= leadingDimRDI) {
-            if (i <= whereToMergeRDI)
-                blockSize *= s->dimSizeRDI[i];
+        if (i >= leadingDim) {
+            if (i >= whereToMerge)
+                blockSize *= s->dimSize[i];
             else
-                blockNum *= s->dimSizeRDI[i];
+                blockNum *= s->dimSize[i];
         }
     }

@@ -122,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
     if (!isOnSameDevice)
         dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);

-    int blockNumInMerge = s->dimSizeRDI[leadingDimRDI];
+    int blockNumInMerge = s->dimSize[leadingDim];
     int splitSizeInGrid = gridSize / blockNumInMerge;
     int realBlockSize = blockSize * t->unitSize;

@@ -311,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
     int mergedNum = smalls->count;

     XTensor * s0 = smalls->GetItem(0);
-    int whereToMergeRDI = s0->order - whereToMerge - 1;
     for (int i = 0; i < s0->order; i++) {
-        if (i <= whereToMergeRDI)
-            blockSize *= s0->dimSizeRDI[i];
+        if (i >= whereToMerge)
+            blockSize *= s0->dimSize[i];
         else
-            blockNum *= s0->dimSizeRDI[i];
+            blockNum *= s0->dimSize[i];
     }

     CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!");
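A standalone sketch of what the rewritten loop in _Merge computes (hypothetical helper, not library code): dimensions at or after whereToMerge contribute to blockSize, while dimensions from leadingDim up to whereToMerge contribute to blockNum.

    #include <vector>

    /* sketch: mirrors the i >= leadingDim / i >= whereToMerge branches above */
    void MergeBlocks(const std::vector<int> & dimSize, int whereToMerge, int leadingDim,
                     int & blockSize, int & blockNum)
    {
        blockSize = 1;
        blockNum  = 1;
        for (int i = 0; i < (int)dimSize.size(); i++) {
            if (i >= leadingDim) {
                if (i >= whereToMerge)
                    blockSize *= dimSize[i];   /* the contiguous part of each merged block */
                else
                    blockNum *= dimSize[i];    /* how many such blocks there are */
            }
        }
    }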
source/tensor/core/shape/Merge.h

@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge);
 /* merge two tensors into a big tensor (return an XTensor structure) */
 XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);

-void Merge(const XTensor &smallA, const XTensor &smallB, XTensor &t, int whereToMerge);
-
 } // namespace nts(NiuTrans.Tensor)

 #endif // __MERGE_H__
 \ No newline at end of file
source/tensor/core/shape/Split.cpp

@@ -31,7 +31,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
-transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
+transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)

 >> s - the source tensor
 >> t - the target tensor (for return)

@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
     CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
     CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
-    CheckNTErrors((t->dimSizeRDI[t->order - 1] == splitNum), "Incorrect tensor sizes!");
+    CheckNTErrors((t->dimSize[0] == splitNum), "Incorrect tensor sizes!");

-    int whereToSplitRDI = s->order - whereToSplit - 1;
     for (int i = 0; i < s->order; i++) {
-        if (i == whereToSplitRDI) {
-            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum),
+        if (i == whereToSplit) {
+            CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1] * splitNum),
                           "Unmatched tensor sizes!");
         }
         else {
-            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
+            CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1]),
                           "Unmatched tensor sizes!");
         }
     }

     /* for the case that we split the last dimension. Actually
-       (N, M) and (3, N/3, M) have the same memory layout */
-    if (s->order - 1 == whereToSplitRDI) {
+       (N, M) and (N, M/3, 3) have the same memory layout */
+    if (0 == whereToSplit) {
         XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
         return;
     }

@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < s->order; i++) {
-        if (i == whereToSplitRDI) {
-            blockSize *= s->dimSizeRDI[i] / splitNum;
+        if (i == whereToSplit) {
+            blockSize *= s->dimSize[i] / splitNum;
             blockNum *= splitNum;
         }
-        else if (i < whereToSplitRDI)
-            blockSize *= s->dimSizeRDI[i];
+        else if (i > whereToSplit)
+            blockSize *= s->dimSize[i];
         else
-            blockNum *= s->dimSizeRDI[i];
+            blockNum *= s->dimSize[i];
     }

     CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");

@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
 }

 /*
-transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
+transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) (return an XTensor structure)
 make a new tensor to keep the result and return it

 >> s - the source tensor

@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
     CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
     CheckNTErrors((smalls->count > 0), "Wrong input!");

-    int whereToSplitRDI = big->order - whereToSplit - 1;
     bool uniform = true;

     for (int i = 0; i < smalls->count; i++) {

@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < big->order; i++) {
-        if (i == whereToSplitRDI) {
-            blockSize *= big->dimSizeRDI[i] / splitNum;
+        if (i == whereToSplit) {
+            blockSize *= big->dimSize[i] / splitNum;
             blockNum *= splitNum;
         }
-        else if (i < whereToSplitRDI)
-            blockSize *= big->dimSizeRDI[i];
+        else if (i > whereToSplit)
+            blockSize *= big->dimSize[i];
         else
-            blockNum *= big->dimSizeRDI[i];
+            blockNum *= big->dimSize[i];
     }

     CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
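Based only on the size checks in the new _Split above (t->dimSize[0] == splitNum, and s->dimSize[i] matching t->dimSize[i + 1], divided by splitNum at the split dimension), a sketch of the implied shape relation as a hypothetical validator; this is an illustration of the checks, not library code.

    #include <vector>

    /* sketch: returns true when (source, target) shapes satisfy the checks in _Split */
    bool CheckSplitShapes(const std::vector<int> & s, const std::vector<int> & t,
                          int whereToSplit, int splitNum)
    {
        if (t.size() != s.size() + 1 || t[0] != splitNum)
            return false;
        for (int i = 0; i < (int)s.size(); i++) {
            /* the split dimension shrinks by a factor of splitNum; others are kept */
            int expected = (i == whereToSplit) ? t[i + 1] * splitNum : t[i + 1];
            if (s[i] != expected)
                return false;
        }
        return true;
    }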
source/tensor/core/shape/Unsqueeze.cpp

@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
     CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
     CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");

-    int dimRDI = b->order - dim - 1;
     for (int i = 0; i < b->order; i++) {
-        if (i < dimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i]), "Unmatched tensors!");
+        if (i < dim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i]), "Unmatched tensors!");
         }
-        else if (i > dimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i - 1] == b->dimSizeRDI[i]), "Unmatched tensors!");
+        else if (i > dim) {
+            CheckNTErrors((a->dimSize[i - 1] == b->dimSize[i]), "Unmatched tensors!");
         }
         else {
-            CheckNTErrors((dSize == b->dimSizeRDI[i]), "Unmatched tensors!");
+            CheckNTErrors((dSize == b->dimSize[i]), "Unmatched tensors!");
         }
     }

@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
     int blockNumA = 1;
     int blockNumB = 1;

-    for (int i = 0; i < dimRDI; i++)
-        blockSize *= a->dimSizeRDI[i];
+    for (int i = dim; i < a->order; i++)
+        blockSize *= a->dimSize[i];

     realBlockSize = blockSize * a->unitSize;
source/tensor/core/shape/Unsqueeze.cu

@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
     int blockSize = 1;
     int blockNumA = 1;
     int blockNumB = 1;

-    int dimRDI = b->order - dim - 1;
-    for (int i = 0; i < dimRDI; i++)
-        blockSize *= a->dimSizeRDI[i];
+    for (int i = dim; i < a->order; i++)
+        blockSize *= a->dimSize[i];

     blockNumA = a->unitNum / blockSize;
     blockNumB = b->unitNum / blockSize;

@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
     int devIDBackup = 0;
     ProtectCudaDev(a->devID, devIDBackup);

-    if (dimRDI == 0) {
+    if (dim == b->order - 1) {
         GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);

         if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
source/tensor/core/sort/Sort.cpp

@@ -47,7 +47,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
     CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
     CheckNTErrors((index->dataType == X_INT), "Wrong data type!");

-    int dimRDI = a->order - dim - 1;
     /* make the index tensor */
     SetAscendingOrder(*index, dim);

@@ -60,13 +59,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
     }
     else {
         int stride = 1;
-        int strideNum = a->dimSizeRDI[dimRDI];
-        for (int i = 0; i < dimRDI; i++)
-            stride *= a->dimSizeRDI[i];
-
         int blockNum = 1;
-        for (int i = dimRDI + 1; i < a->order; i++)
-            blockNum *= a->dimSizeRDI[i];
+        int strideNum = a->dimSize[dim];
+        for (int i = 0; i < dim; i++)
+            blockNum *= a->dimSize[i];
+        for (int i = dim + 1; i < a->order; i++)
+            stride *= a->dimSize[i];
         int blockSize = stride * strideNum;

         _CopyValues(a, b);
source/tensor/core/sort/Sort.cu

@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
     CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");

-    int dimRDI = a->order - dim - 1;
-    if (k < 0 || k > b->dimSizeRDI[dimRDI])
-        k = b->dimSizeRDI[dimRDI];
+    if (k < 0 || k > b->dimSize[dim])
+        k = b->dimSize[dim];

     XMem * mem = a->mem;

     int stride = 1;
-    int strideNum = a->dimSizeRDI[dimRDI];
-    for (int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
-
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    int strideNum = a->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= a->dimSize[i];
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int m = GetNextPower2(strideNum);
     int n = stride * blockNum;
source/tensor/core/sort/TopK.cpp

@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
     CheckNTErrors(index == NULL || a->order == index->order, "Unmatched input tensors!");
     CheckNTErrors(index->dataType == X_INT, "Wrong data type!");

-    int dimRDI = a->order - dim - 1;
     for (int i = 0; i < a->order; i++) {
-        if (i == dimRDI) {
-            CheckNTErrors(b->dimSizeRDI[i] == k, "A too large K");
-            CheckNTErrors(index == NULL || index->dimSizeRDI[i] == k, "Wrong size!");
+        if (i == dim) {
+            CheckNTErrors((b->dimSize[i] == k), "A too large K");
+            CheckNTErrors((index == NULL || index->dimSize[i] == k), "Wrong size!");
         }
         else {
-            CheckNTErrors(b->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!");
-            CheckNTErrors(index == NULL || index->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!");
+            CheckNTErrors((b->dimSize[i] == a->dimSize[i]), "Wrong size!");
+            CheckNTErrors((index == NULL || index->dimSize[i] == a->dimSize[i]), "Wrong size!");
         }
     }

@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");

     int stride = 1;
-    int strideNumA = a->dimSizeRDI[dimRDI];
-    int strideNumB = b->dimSizeRDI[dimRDI];
-    for (int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
-
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    int strideNumA = a->dimSize[dim];
+    int strideNumB = b->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= a->dimSize[i];
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int blockSizeA = stride * strideNumA;
     int blockSizeB = stride * strideNumB;
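A usage sketch consistent with the shape checks in _TopK above: b and index must match a in every dimension except dim, where their size is k. Tensor construction is assumed to have happened elsewhere; this is an illustration, not library documentation.

    #include "TopK.h"
    using namespace nts;

    /* sketch: a has shape (batch, vocab); b and index are pre-built with shape (batch, k) */
    void TopKExample(const XTensor * a, XTensor * b, XTensor * index)
    {
        int dim = 1;                   /* take the top-k along the vocab dimension */
        int k = 5;
        _TopK(a, b, index, dim, k);    /* b receives the values, index the positions */
    }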
source/tensor/core/sort/TopK.cu

@@ -812,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
     CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
     CheckNTErrors((b->dimSize[dim] == k), "A too large K");

-    int dimRDI = a->order - dim - 1;
     int stride = 1;
-    int strideNumA = a->dimSizeRDI[dimRDI];
-    for (int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
-
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    int strideNumA = a->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= a->dimSize[i];
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int workerNum = blockNum < 16 ? 64 : 32;

     /* adjust the thread number according to the size of k to fit the shared memory size */
source/tensor/core/utilities/SetAscendingOrder.cpp

@@ -47,7 +47,6 @@ void SetAscendingOrder(XTensor & tensor, int dim)
         return;
     }

-    int dimRDI = tensor.order - dim - 1;
     if(tensor.devID >= 0){
 #ifdef USE_CUDA
         CudaSetAscendingOrder(&tensor, dim);

@@ -57,13 +56,13 @@ void SetAscendingOrder(XTensor & tensor, int dim)
     }
     else {
         int stride = 1;
-        int strideNum = tensor.dimSizeRDI[dimRDI];
-        for(int i = 0; i < dimRDI; i++)
-            stride *= tensor.dimSizeRDI[i];
-
         int blockNum = 1;
-        for(int i = dimRDI + 1; i < tensor.order; i++)
-            blockNum *= tensor.dimSizeRDI[i];
+        int strideNum = tensor.dimSize[dim];
+        for(int i = 0; i < dim; i++)
+            blockNum *= tensor.dimSize[i];
+        for(int i = dim + 1; i < tensor.order; i++)
+            stride *= tensor.dimSize[i];

         for(int k = 0; k < blockNum; k++){
             for(int j = 0; j < strideNum; j++){
source/tensor/core/utilities/SetAscendingOrder.cu

@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
 {
     CheckNTErrors((a->dataType == X_INT), "TODO!");

-    int dimRDI = a->order - dim - 1;
     int stride = 1;
-    int strideNum = a->dimSizeRDI[dimRDI];
-    for(int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
-
     int blockNum = 1;
-    for(int i = dimRDI + 1; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    int strideNum = a->dimSize[dim];
+    for(int i = 0; i < dim; i++)
+        blockNum *= a->dimSize[i];
+    for(int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int gridSize[3];
     int blockSize[3];
source/tensor/function/LogSoftmax.cpp

@@ -50,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
         return;
     }

-    int leadDimRDI = x->order - leadDim - 1;
     if(!x->isSparse && !y->isSparse &&
        x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
     {

@@ -70,13 +69,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
         XTensor * blockMax = NULL;
         XTensor * blockSum = NULL;

-        int dimensionSize = y->dimSizeRDI[leadDimRDI];
+        int dimensionSize = y->dimSize[leadDim];
         int stride = 1;
         int blockSize = 1;
         int blockNum = 1;

-        for(int i = 0; i < leadDimRDI; i++)
-            stride *= y->dimSizeRDI[i];
+        for(int i = leadDim + 1; i < y->order; i++)
+            stride *= y->dimSize[i];
         blockSize = stride * dimensionSize;
         blockNum = y->unitNum / blockSize;

@@ -87,7 +86,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
         _ReduceSum(x, sum, leadDim, max, 1.0F, true);

         if(x->devID >= 0){
-            if(leadDimRDI == 0){
+            if(leadDim == x->order - 1){
                 blockSize = y->unitNum;
                 blockNum = 1;
                 blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);

@@ -138,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
                 blockMax->data = mp;
                 blockSum->data = sp;
 #ifdef USE_CUDA
-                if(leadDimRDI == 0)
+                if(leadDim == x->order - 1)
                     _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
                 else
                     _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);

@@ -299,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     if(leadDim < 0)
         leadDim = y->order - 1;

-    int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
     if(gold->devID >= 0){
         _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);

@@ -307,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     }
 #endif

-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;

-    for(int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for(int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;

@@ -339,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             int key = gold->GetKeyInSparse(i);
             DTYPE value = gold->GetInSparse(i);

             int offset = key;

-            if (dedx->dimSizeRDI[0] != gm) {
+            if (dedx->dimSize[dedx->order - 1] != gm) {
                 int mi = key % gm;
                 int ni = key / gm;
-                int key2 = ni * dedx->dimSizeRDI[0] + mi;
+                int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
                 offset = key2;
             }

             if (key >= 0 && key < size)

@@ -396,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             int key = gold->GetKeyInSparse(i);
             DTYPE value = gold->GetInSparse(i);

             int offset = key;

-            if (dedx->dimSizeRDI[0] != gm) {
+            if (dedx->dimSize[dedx->order - 1] != gm) {
                 int mi = key % gm;
                 int ni = key / gm;
-                int key2 = ni * dedx->dimSizeRDI[0] + mi;
+                int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
                 offset = key2;
             }

             if (key >= 0 && key < size)

@@ -431,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     /* for columns with no xs we set dE/ds = 0 */
     if (gold != NULL && gold->isSparse) {
         CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!");
-        if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSizeRDI[0]) {
+        if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSize[dedx->order - 1]) {
             int gn = gold->dimSize[0];
             int gm = gold->dimSize[1];
-            int sm = dedx->dimSizeRDI[0];
-            int sn = dedx->dimSizeRDI[1];
+            int sm = dedx->dimSize[dedx->order - 1];
+            int sn = dedx->dimSize[dedx->order - 2];
             int * flags = new int[sm];
             memset(flags, 0, sizeof(int) * sm);
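The CPU and CUDA paths above both follow the standard numerically stable recipe: subtract the per-block maximum, take the log of the summed exponentials, and subtract again. A scalar sketch of that recipe over one row (plain C++, not library code):

    #include <algorithm>
    #include <cmath>

    /* sketch: stable log-softmax of one row, mirroring the ReduceMax / ReduceSum
       pattern used by _LogSoftmax */
    void LogSoftmaxRow(const float * x, float * y, int n)
    {
        float mx = x[0];
        for (int i = 1; i < n; i++)
            mx = std::max(mx, x[i]);          /* the ReduceMax step */

        float sum = 0.0f;
        for (int i = 0; i < n; i++)
            sum += std::exp(x[i] - mx);       /* the ReduceSum(exp(x - max)) step */

        float logSum = std::log(sum);
        for (int i = 0; i < n; i++)
            y[i] = x[i] - mx - logSum;        /* log softmax */
    }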
source/tensor/function/LogSoftmax.cu

@@ -385,13 +385,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                   "Tensors used in log softmax are not on the same GPU.");
     CheckNTErrors((gold != NULL), "No x gold standard is found!");

-    int leadDimRDI = y->order - leadDim - 1;
-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;

-    for (int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for (int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
source/tensor/function/Loss.cpp

@@ -50,18 +50,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
     if (output->devID < 0) {
         CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!");
         CheckNTErrors((_IsSameShaped(gold, output)), "The input tensors must be of the same size!");
-        CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
+        CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1), "TODO!");
         CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
         CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");

-        int leadDimRDI = output->order - leadDim - 1;
-        int dimensionSize = output->dimSizeRDI[leadDimRDI];
+        int dimensionSize = output->dimSize[leadDim];
         int stride = 1;
         int blockSize = 1;
         int blockNum = 1;

-        for (int i = 0; i < leadDimRDI; i++)
-            stride *= output->dimSizeRDI[i];
+        for (int i = leadDim + 1; i < output->order; i++)
+            stride *= output->dimSize[i];
         blockSize = stride * dimensionSize;
         blockNum = output->unitNum / blockSize;

@@ -207,18 +206,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
 {
     CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!");
     CheckNTErrors(_IsSameShaped(gold, output), "The input tensors must be of the same size!");
-    CheckNTErrors(gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1, "TODO!");
+    CheckNTErrors(gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1, "TODO!");
     CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!");
     CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");

-    int leadDimRDI = output->order - leadDim - 1;
-    int dimensionSize = output->dimSizeRDI[leadDimRDI];
+    int dimensionSize = output->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;

-    for (int i = 0; i < leadDimRDI; i++)
-        stride *= output->dimSizeRDI[i];
+    for (int i = leadDim + 1; i < output->order; i++)
+        stride *= output->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = output->unitNum / blockSize;

@@ -409,21 +407,20 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
     CheckNTErrors(t->order > leadDim, "Illegal leading dimension!");
     CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!");

-    int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
-    if(leadDimRDI < 0){
-        leadDimRDI = y->order - 1;
+    if (leadDim < 0) {
+        leadDim = 0;
         tBeg = 0;
         yBeg = 0;
-        tLen = y->dimSizeRDI[leadDimRDI];
+        tLen = y->dimSize[leadDim];
     }

-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;

-    for (int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for (int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
source/tensor/function/Loss.cu
查看文件 @
f5149a15
...
@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
...
@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
{
{
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSize
RDI[0] == 1 && y->dimSizeRDI[0
] == 1), "TODO!");
CheckNTErrors((gold->dimSize
[gold->order - 1] == 1 && y->dimSize[y->order - 1
] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
...
@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
...
@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
}
delete diff;
delete diff;
diff = diffNew;
diff = diffNew;
...
@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
...
@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
}
delete diff;
delete diff;
diff = diffNew;
diff = diffNew;
...
@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
...
@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
}
delete diff;
delete diff;
diff = diffNew;
diff = diffNew;
...
@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
...
@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
"The vectors must be on the same GPU.");
"The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!");
CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
if (leadDim < 0) {
if(leadDimRDI < 0){
leadDim = 0;
leadDimRDI = y->order - 1;
tBeg = 0;
tBeg = 0;
yBeg = 0;
yBeg = 0;
tLen = y->dimSize
RDI[leadDimRDI
];
tLen = y->dimSize
[leadDim
];
}
}
int dimensionSize = y->dimSize
RDI[leadDimRDI
];
int dimensionSize = y->dimSize
[leadDim
];
int stride = 1;
int stride = 1;
int blockSize = 1;
int blockSize = 1;
int blockNum = 1;
int blockNum = 1;
int size = 1;
int size = 1;
for(int i =
0; i < leadDimRDI
; i++)
for(int i =
leadDim + 1; i < y->order
; i++)
stride *= y->dimSize
RDI
[i];
stride *= y->dimSize[i];
size = tLen * stride;
size = tLen * stride;
blockSize = stride * dimensionSize;
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
blockNum = y->unitNum / blockSize;
...
...
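In the _CudaLossCompute hunks above, the first check now reads the last dimension directly (dimSizeRDI[0] is by definition dimSize[order - 1]), and the reshape of diffNew into a 1 x n row vector now writes dimSize[diffNew->order - 2], which for a 2-D tensor is the same slot as the old dimSizeRDI[1], namely dimSize[0]. A minimal sketch of that reshape on a plain shape array (an assumed stand-in for the XTensor fields):

#include <cassert>

int main()
{
    // Hypothetical 1-D shape of length n being promoted to a 1 x n row
    // vector, mirroring the diffNew reshape in _CudaLossCompute.
    int order = 1;
    int dimSize[2] = {8, 0};          // n = 8

    order = 2;
    dimSize[1] = dimSize[0];          // n moves to the column dimension
    dimSize[0] = 1;                   // the row dimension becomes 1

    // Old code: dimSizeRDI[1] = 1;  new code: dimSize[order - 2] = 1;
    // for order == 2 both address dimSize[0].
    dimSize[order - 2] = 1;

    assert(dimSize[0] == 1 && dimSize[1] == 8);
    return 0;
}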
source/tensor/function/Softmax.cpp  View file @ f5149a15
...
@@ -41,7 +41,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     if(leadDim < 0)
         leadDim = x->order - 1;
-    int leadDimRDI = x->order - leadDim - 1;
     if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
         int * dimSize = new int[x->order - 1];
         for(int i = 0; i < x->order; i++){
...
@@ -71,13 +70,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     else{
         CheckNTErrors((x->dataType == DEFAULT_DTYPE), "TODO!");
-        int dimensionSize = y->dimSizeRDI[leadDimRDI];
+        int dimensionSize = y->dimSize[leadDim];
         int stride = 1;
         int blockSize = 1;
         int blockNum = 1;
-        for(int i = 0; i < leadDimRDI; i++)
-            stride *= y->dimSizeRDI[i];
+        for(int i = leadDim + 1; i < y->order; i++)
+            stride *= y->dimSize[i];
         blockSize = stride * dimensionSize;
         blockNum = y->unitNum / blockSize;
...
@@ -207,8 +206,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     if(leadDim < 0)
         leadDim = y->order - 1;
-    int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
     if(y->devID >= 0){
         _CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
...
@@ -216,12 +213,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     }
 #endif
-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;
-    for(int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for(int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
...
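The stride / blockSize / blockNum decomposition computed in _Softmax and _SoftmaxBackward views the tensor as blockNum blocks, each containing dimensionSize slices of length stride along the leading dimension. A minimal CPU sketch (not the NiuTrans.Tensor implementation; it assumes dense row-major float data) of walking that layout to normalize along leadDim:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Apply a softmax along leadDim of a dense row-major tensor, walking the
// same blockNum / dimensionSize / stride layout used in _Softmax.
void softmaxAlongLeadDim(std::vector<float>& data, const std::vector<int>& dims, int leadDim)
{
    int order = (int)dims.size();
    int dimensionSize = dims[leadDim];
    int stride = 1;
    for (int i = leadDim + 1; i < order; i++)
        stride *= dims[i];
    int blockSize = stride * dimensionSize;
    int blockNum = (int)data.size() / blockSize;

    for (int b = 0; b < blockNum; b++) {
        for (int s = 0; s < stride; s++) {
            float* base = data.data() + b * blockSize + s;
            float maxv = base[0];
            for (int k = 1; k < dimensionSize; k++)
                maxv = std::max(maxv, base[k * stride]);
            float sum = 0.0f;
            for (int k = 0; k < dimensionSize; k++) {
                base[k * stride] = std::exp(base[k * stride] - maxv);
                sum += base[k * stride];
            }
            for (int k = 0; k < dimensionSize; k++)
                base[k * stride] /= sum;
        }
    }
}

int main()
{
    std::vector<int> dims = {2, 3};                  // a 2 x 3 tensor
    std::vector<float> data = {1, 2, 3, 4, 5, 6};
    softmaxAlongLeadDim(data, dims, 1);              // softmax over each row
    for (float v : data)
        std::printf("%.4f ", v);
    std::printf("\n");
    return 0;
}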
source/tensor/function/Softmax.cu  View file @ f5149a15
...
@@ -226,14 +226,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
     CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
     CheckNTErrors((_IsSameShaped(x, y)), "Input tensors must be of the same size!");
-    int leadDimRDI = y->order - leadDim - 1;
-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;
-    for(int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for(int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
...