Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
杨迪
NiuTrans.Tensor
Commits
f5149a15
Commit
f5149a15
authored
Oct 30, 2019
by
liyinqiao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Merge with Yuhao branch (with little bit change).
parent
f0b49d6d
全部展开
显示空白字符变更
内嵌
并排
正在显示
56 个修改的文件
包含
695 行增加
和
489 行删除
+695
-489
source/tensor/Main.cpp
+3
-2
source/tensor/XGlobal.cpp
+0
-8
source/tensor/XGlobal.h
+0
-2
source/tensor/XMem.cpp
+2
-2
source/tensor/XTensor.cpp
+8
-21
source/tensor/XTensor.h
+0
-3
source/tensor/core/arithmetic/Div.cpp
+7
-10
source/tensor/core/arithmetic/Div.cu
+8
-9
source/tensor/core/arithmetic/MatrixMul.cpp
+61
-60
source/tensor/core/arithmetic/MatrixMulBatched.cpp
+40
-40
source/tensor/core/arithmetic/MulAndShift.cpp
+17
-16
source/tensor/core/arithmetic/Multiply.cpp
+8
-11
source/tensor/core/arithmetic/Multiply.cu
+10
-11
source/tensor/core/arithmetic/SumDim.cpp
+0
-14
source/tensor/core/arithmetic/SumDim.cu
+3
-3
source/tensor/core/getandset/OnehotAndIndex.cpp
+41
-0
source/tensor/core/getandset/OnehotAndIndex.h
+3
-0
source/tensor/core/getandset/Select.cpp
+80
-7
source/tensor/core/getandset/Select.h
+4
-1
source/tensor/core/math/Normalize.cpp
+8
-9
source/tensor/core/math/Normalize.cu
+5
-6
source/tensor/core/movement/CopyInGrid.cpp
+2
-3
source/tensor/core/movement/CopyIndexed.cpp
+21
-19
source/tensor/core/movement/Gather.cpp
+67
-5
source/tensor/core/movement/Gather.cu
+69
-0
source/tensor/core/movement/Gather.cuh
+2
-0
source/tensor/core/movement/Gather.h
+6
-0
source/tensor/core/reduce/ReduceMax.cpp
+0
-0
source/tensor/core/reduce/ReduceMax.cu
+0
-0
source/tensor/core/reduce/ReduceMax.cuh
+3
-0
source/tensor/core/reduce/ReduceMax.h
+8
-2
source/tensor/core/reduce/ReduceMean.cpp
+1
-2
source/tensor/core/reduce/ReduceSum.cpp
+13
-14
source/tensor/core/reduce/ReduceSum.cu
+24
-23
source/tensor/core/reduce/ReduceVariance.cpp
+1
-2
source/tensor/core/reduce/VectorBuffer.cpp
+11
-1
source/tensor/core/reduce/VectorBuffer.h
+4
-1
source/tensor/core/shape/ConcatenateSolely.cpp
+12
-13
source/tensor/core/shape/IsSameShaped.cpp
+1
-1
source/tensor/core/shape/Merge.cpp
+19
-21
source/tensor/core/shape/Merge.h
+0
-3
source/tensor/core/shape/Split.cpp
+18
-20
source/tensor/core/shape/Unsqueeze.cpp
+7
-8
source/tensor/core/shape/Unsqueeze.cu
+3
-4
source/tensor/core/sort/Sort.cpp
+6
-7
source/tensor/core/sort/Sort.cu
+8
-9
source/tensor/core/sort/TopK.cpp
+12
-13
source/tensor/core/sort/TopK.cu
+6
-7
source/tensor/core/utilities/SetAscendingOrder.cpp
+6
-7
source/tensor/core/utilities/SetAscendingOrder.cu
+6
-7
source/tensor/function/LogSoftmax.cpp
+15
-17
source/tensor/function/LogSoftmax.cu
+3
-4
source/tensor/function/Loss.cpp
+14
-17
source/tensor/function/Loss.cu
+10
-11
source/tensor/function/Softmax.cpp
+6
-9
source/tensor/function/Softmax.cu
+3
-4
没有找到文件。
source/tensor/Main.cpp
查看文件 @
f5149a15
...
...
@@ -30,8 +30,9 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
#include "./XBLAS.h"
#include "./core/sort/TopK.h"
#include "./core/movement/Gather.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
...
...
source/tensor/XGlobal.cpp
查看文件 @
f5149a15
...
...
@@ -50,14 +50,6 @@ int CONST_MINUSONE = -1;
bool
CONST_TRUE
=
true
;
int
verboseLevel
=
0
;
bool
useBLAS
=
false
;
#ifdef USE_CUDA
bool
useCUDA
=
true
;
#else
bool
useCUDA
=
false
;
#endif
FILE
*
tmpLog
=
NULL
;
double
myTime
=
0
;
...
...
source/tensor/XGlobal.h
查看文件 @
f5149a15
...
...
@@ -135,8 +135,6 @@ extern bool CONST_TRUE;
#define NIUTRANSNNDEBUG
extern
int
verboseLevel
;
extern
bool
useBLAS
;
extern
bool
useCUDA
;
#define FFLUSH(FILEH) \
{ \
...
...
source/tensor/XMem.cpp
查看文件 @
f5149a15
...
...
@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
if
(
freeMem
>=
MILLION
*
512
){
*
myBufSize
=
MILLION
*
128
;
if
(
freeMem
>=
MILLION
*
1024
)
{
*
myBufSize
=
MILLION
*
256
;
*
myBufSize
=
MILLION
*
128
;
if
(
freeMem
>=
MILLION
*
2048
)
*
myBufSize
=
MILLION
*
512
;
*
myBufSize
=
MILLION
*
128
;
}
}
}
...
...
source/tensor/XTensor.cpp
查看文件 @
f5149a15
...
...
@@ -266,7 +266,6 @@ void XTensor::Init()
devID
=
-
1
;
order
=
-
1
;
memset
(
dimSize
,
0
,
sizeof
(
int
)
*
MAX_TENSOR_DIM_NUM
);
memset
(
dimSizeRDI
,
0
,
sizeof
(
int
)
*
MAX_TENSOR_DIM_NUM
);
dataType
=
DEFAULT_DTYPE
;
unitSize
=
sizeof
(
float
);
unitNum
=
0
;
...
...
@@ -314,7 +313,6 @@ void XTensor::ShallowCopy(const XTensor &tensor)
order
=
tensor
.
order
;
enableGrad
=
tensor
.
enableGrad
;
memcpy
(
dimSize
,
tensor
.
dimSize
,
sizeof
(
int
)
*
MAX_TENSOR_DIM_NUM
);
memcpy
(
dimSizeRDI
,
tensor
.
dimSizeRDI
,
sizeof
(
int
)
*
MAX_TENSOR_DIM_NUM
);
dataType
=
tensor
.
dataType
;
unitSize
=
tensor
.
unitSize
;
unitNum
=
tensor
.
unitNum
;
...
...
@@ -533,7 +531,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
bool
XTensor
::
IsReduceShaped
(
const
XTensor
*
a
,
const
XTensor
*
b
,
int
dim
)
{
if
(
a
==
NULL
||
b
==
NULL
)
if
(
a
==
NULL
||
b
==
NULL
)
return
false
;
if
((
a
->
order
-
1
)
!=
b
->
order
)
...
...
@@ -570,7 +568,6 @@ void XTensor::SetDim(int * myDimSize)
{
for
(
int
i
=
0
;
i
<
order
;
i
++
)
{
dimSize
[
i
]
=
myDimSize
[
i
];
dimSizeRDI
[
order
-
i
-
1
]
=
myDimSize
[
i
];
}
}
...
...
@@ -598,20 +595,17 @@ reshape the tensor
void
XTensor
::
Reshape
(
const
int
myOrder
,
const
int
*
myDimSize
)
{
int
dims
[
MAX_TENSOR_DIM_NUM
];
int
dimsRDI
[
MAX_TENSOR_DIM_NUM
];
int
num
=
1
;
for
(
int
i
=
0
;
i
<
myOrder
;
i
++
){
num
*=
myDimSize
[
i
];
dims
[
i
]
=
abs
(
myDimSize
[
i
]);
dimsRDI
[
myOrder
-
i
-
1
]
=
dims
[
i
];
}
CheckNTErrors
(
abs
(
num
)
==
unitNum
,
"Wrong size found when we reshape the tensor!"
);
order
=
myOrder
;
memcpy
(
dimSize
,
dims
,
sizeof
(
int
)
*
order
);
memcpy
(
dimSizeRDI
,
dimsRDI
,
sizeof
(
int
)
*
order
);
}
/*
...
...
@@ -997,18 +991,12 @@ void * XTensor::GetCell(int index[], int size) const
{
CheckNTErrors
((
size
==
order
),
"Illegal index!"
);
int
*
indexRDI
=
new
int
[
size
];
for
(
int
i
=
0
;
i
<
size
;
i
++
)
indexRDI
[
size
-
i
-
1
]
=
index
[
i
];
int
offset
=
indexRDI
[
size
-
1
];
for
(
int
i
=
size
-
2
;
i
>=
0
;
i
--
){
CheckNTErrors
((
indexRDI
[
i
]
<
dimSizeRDI
[
i
]),
"Index is out of range!"
);
offset
=
offset
*
dimSizeRDI
[
i
]
+
indexRDI
[
i
];
int
offset
=
index
[
0
];
for
(
int
i
=
1
;
i
<
size
;
++
i
){
CheckNTErrors
((
index
[
i
]
<
dimSize
[
i
]),
"Index is out of range!"
);
offset
=
offset
*
dimSize
[
i
]
+
index
[
i
];
}
delete
[]
indexRDI
;
if
(
isSparse
){
DTYPE
value
;
void
*
p
;
...
...
@@ -1469,7 +1457,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool
zeroData
=
false
;
for
(
int
i
=
0
;
i
<
order
;
i
++
){
dimSize
[
i
]
=
abs
(
myDimSize
[
i
]);
dimSizeRDI
[
order
-
i
-
1
]
=
dimSize
[
i
];
if
(
myDimSize
[
i
]
<
0
)
filledData
=
false
;
if
(
myDimSize
[
i
]
==
0
)
...
...
@@ -1668,7 +1655,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
if
(
isSparse
)
{
int
num
=
0
;
for
(
int
i
=
0
;
i
<
order
;
i
++
)
num
*=
dimSize
RDI
[
i
];
num
*=
dimSize
[
i
];
num
=
int
(
num
*
denseRatio
+
1
);
int
tupleSize
=
sizeof
(
int
)
+
sizeof
(
DTYPE
);
int
size
=
sizeof
(
int
)
+
tupleSize
*
(
num
);
...
...
@@ -1880,8 +1867,8 @@ void XTensor::Read(FILE * file, const char * label)
int
ds
[
MAX_TENSOR_DIM_NUM
];
for
(
int
i
=
0
;
i
<
order
;
i
++
)
{
ds
[
i
]
=
key
%
dimSize
RDI
[
i
];
key
/=
dimSize
RDI
[
i
];
ds
[
i
]
=
key
%
dimSize
[
i
];
key
/=
dimSize
[
i
];
}
Set
(
value
,
ds
);
}
...
...
source/tensor/XTensor.h
查看文件 @
f5149a15
...
...
@@ -100,9 +100,6 @@ public:
/* size of each dimension */
int
dimSize
[
MAX_TENSOR_DIM_NUM
];
/* size of each dimension by Reversed Dimension Indexing (RDI) Mode */
int
dimSizeRDI
[
MAX_TENSOR_DIM_NUM
];
/* data unit - data type for every cell */
TENSOR_DATA_TYPE
dataType
;
...
...
source/tensor/core/arithmetic/Div.cpp
查看文件 @
f5149a15
...
...
@@ -49,9 +49,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
"Unmatched tensors!"
);
CheckDev
(
a
->
devID
,
b
->
devID
);
int
leadingDimRDI
=
a
->
order
-
leadingDim
-
1
;
#ifdef USE_CUDA
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
_CudaDiv
(
a
,
b
,
c
,
alpha
,
leadingDim
);
...
...
@@ -64,17 +61,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
int
blockSizeB
=
1
;
int
blockSizeC
=
1
;
int
blockNum
=
1
;
int
dimensionSizeA
=
a
->
dimSize
RDI
[
leadingDimRDI
];
int
dimensionSizeB
=
b
->
dimSize
RDI
[
leadingDimRDI
];
int
dimensionSizeC
=
c
->
dimSize
RDI
[
leadingDimRDI
];
int
dimensionSizeA
=
a
->
dimSize
[
leadingDim
];
int
dimensionSizeB
=
b
->
dimSize
[
leadingDim
];
int
dimensionSizeC
=
c
->
dimSize
[
leadingDim
];
for
(
int
i
=
0
;
i
<
a
->
order
;
i
++
)
{
if
(
i
!=
leadingDim
RDI
)
{
CheckNTErrors
((
a
->
dimSize
RDI
[
i
]
==
b
->
dimSizeRDI
[
i
]
&&
a
->
dimSizeRDI
[
i
]
==
c
->
dimSizeRDI
[
i
]),
if
(
i
!=
leadingDim
)
{
CheckNTErrors
((
a
->
dimSize
[
i
]
==
b
->
dimSize
[
i
]
&&
a
->
dimSize
[
i
]
==
c
->
dimSize
[
i
]),
"Unmatched tensors!"
);
}
if
(
i
<
leadingDimRDI
)
stride
*=
a
->
dimSize
RDI
[
i
];
if
(
i
>
leadingDim
)
stride
*=
a
->
dimSize
[
i
];
}
blockSizeA
=
stride
*
dimensionSizeA
;
...
...
source/tensor/core/arithmetic/Div.cu
查看文件 @
f5149a15
...
...
@@ -122,7 +122,6 @@ where i is the item index
*/
void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
...
...
@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSize
RDI[leadingDimRDI
];
int dimensionSizeB = b->dimSize
RDI[leadingDimRDI
];
int dimensionSizeC = c->dimSize
RDI[leadingDimRDI
];
int dimensionSizeA = a->dimSize
[leadingDim
];
int dimensionSizeB = b->dimSize
[leadingDim
];
int dimensionSizeC = c->dimSize
[leadingDim
];
for (int i = 0; i < a->order; i++) {
if (i != leadingDim
RDI
) {
CheckNTErrors((a->dimSize
RDI[i] == b->dimSizeRDI
[i] &&
a->dimSize
RDI[i] == c->dimSizeRDI
[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize
[i] == b->dimSize
[i] &&
a->dimSize
[i] == c->dimSize
[i]),
"Unmatched tensors!");
}
if (i
< leadingDimRDI
)
stride *= a->dimSize
RDI
[i];
if (i
> leadingDim
)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
...
...
source/tensor/core/arithmetic/MatrixMul.cpp
查看文件 @
f5149a15
...
...
@@ -77,18 +77,18 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
return
;
}
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSize
RDI
[
0
]
:
a
->
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSize
RDI
[
1
]
:
a
->
dimSizeRDI
[
0
];
int
bn
=
transposedB
==
X_TRANS
?
b
->
dimSize
RDI
[
0
]
:
b
->
dimSizeRDI
[
1
];
int
bm
=
transposedB
==
X_TRANS
?
b
->
dimSize
RDI
[
1
]
:
b
->
dimSizeRDI
[
0
];
int
cn
=
c
->
dimSize
RDI
[
1
];
int
cm
=
c
->
dimSize
RDI
[
0
];
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSize
[
a
->
order
-
1
]
:
a
->
dimSize
[
a
->
order
-
2
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSize
[
a
->
order
-
2
]
:
a
->
dimSize
[
a
->
order
-
1
];
int
bn
=
transposedB
==
X_TRANS
?
b
->
dimSize
[
b
->
order
-
1
]
:
b
->
dimSize
[
b
->
order
-
2
];
int
bm
=
transposedB
==
X_TRANS
?
b
->
dimSize
[
b
->
order
-
2
]
:
b
->
dimSize
[
b
->
order
-
1
];
int
cn
=
c
->
dimSize
[
c
->
order
-
2
];
int
cm
=
c
->
dimSize
[
c
->
order
-
1
];
CheckNTErrors
((
am
==
bn
&&
an
==
cn
&&
bm
==
cm
),
"Unmatched tensors in multiplication!"
);
int
aBlockSize
=
a
->
dimSize
RDI
[
0
]
*
a
->
dimSizeRDI
[
1
];
int
bBlockSize
=
b
->
dimSize
RDI
[
0
]
*
b
->
dimSizeRDI
[
1
];
int
cBlockSize
=
c
->
dimSize
RDI
[
0
]
*
c
->
dimSizeRDI
[
1
];
int
aBlockSize
=
a
->
dimSize
[
a
->
order
-
1
]
*
a
->
dimSize
[
a
->
order
-
2
];
int
bBlockSize
=
b
->
dimSize
[
b
->
order
-
1
]
*
b
->
dimSize
[
b
->
order
-
2
];
int
cBlockSize
=
c
->
dimSize
[
c
->
order
-
1
]
*
c
->
dimSize
[
c
->
order
-
2
];
int
aRealBlockSize
=
aBlockSize
*
a
->
unitSize
;
int
bRealBlockSize
=
bBlockSize
*
b
->
unitSize
;
int
cRealBlockSize
=
cBlockSize
*
c
->
unitSize
;
...
...
@@ -96,24 +96,25 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int
bBlockNum
=
1
;
int
cBlockNum
=
1
;
for
(
int
i
=
2
;
i
<
a
->
order
;
i
++
)
{
CheckNTErrors
(
a
->
dimSizeRDI
[
i
]
==
c
->
dimSizeRDI
[
i
-
2
+
b
->
order
],
"Incorrect tensor sizes!"
);
aBlockNum
*=
a
->
dimSizeRDI
[
i
];
cBlockNum
*=
a
->
dimSizeRDI
[
i
];
for
(
int
i
=
0
;
i
<
a
->
order
-
2
;
i
++
)
{
CheckNTErrors
(
a
->
dimSize
[
i
]
==
c
->
dimSize
[
i
],
"Incorrect tensor sizes!"
);
aBlockNum
*=
a
->
dimSize
[
i
];
cBlockNum
*=
a
->
dimSize
[
i
];
}
for
(
int
i
=
2
;
i
<
b
->
order
;
i
++
)
{
CheckNTErrors
(
b
->
dimSize
RDI
[
i
]
==
c
->
dimSizeRDI
[
i
],
"Incorrect tensor sizes!"
);
bBlockNum
*=
b
->
dimSize
RDI
[
i
];
cBlockNum
*=
b
->
dimSize
RDI
[
i
];
for
(
int
i
=
0
;
i
<
b
->
order
-
2
;
i
++
)
{
CheckNTErrors
(
b
->
dimSize
[
i
]
==
c
->
dimSize
[
i
-
2
+
a
->
order
],
"Incorrect tensor sizes!"
);
bBlockNum
*=
b
->
dimSize
[
i
];
cBlockNum
*=
b
->
dimSize
[
i
];
}
TensorList
*
aList
=
new
TensorList
(
10
);
TensorList
*
bList
=
new
TensorList
(
10
);
TensorList
*
cList
=
new
TensorList
(
10
);
int
aDimSize
[
2
]
=
{
-
a
->
dimSize
RDI
[
1
],
a
->
dimSizeRDI
[
0
]
};
int
bDimSize
[
2
]
=
{
-
b
->
dimSize
RDI
[
1
],
b
->
dimSizeRDI
[
0
]
};
int
cDimSize
[
2
]
=
{
-
c
->
dimSize
RDI
[
1
],
c
->
dimSizeRDI
[
0
]
};
int
aDimSize
[
2
]
=
{
-
a
->
dimSize
[
a
->
order
-
2
],
a
->
dimSize
[
a
->
order
-
1
]
};
int
bDimSize
[
2
]
=
{
-
b
->
dimSize
[
b
->
order
-
2
],
b
->
dimSize
[
b
->
order
-
1
]
};
int
cDimSize
[
2
]
=
{
-
c
->
dimSize
[
c
->
order
-
2
],
c
->
dimSize
[
c
->
order
-
1
]
};
bool
isSparseMul
=
false
;
...
...
@@ -215,20 +216,20 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if
(
!
(
a
->
order
>=
2
&&
b
->
order
>=
2
&&
c
->
order
>=
2
))
return
false
;
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSize
RDI
[
0
]
:
a
->
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSize
RDI
[
1
]
:
a
->
dimSizeRDI
[
0
];
int
bn
=
transposedB
==
X_TRANS
?
b
->
dimSize
RDI
[
0
]
:
b
->
dimSizeRDI
[
1
];
int
bm
=
transposedB
==
X_TRANS
?
b
->
dimSize
RDI
[
1
]
:
b
->
dimSizeRDI
[
0
];
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSize
[
a
->
order
-
1
]
:
a
->
dimSize
[
a
->
order
-
2
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSize
[
a
->
order
-
2
]
:
a
->
dimSize
[
a
->
order
-
1
];
int
bn
=
transposedB
==
X_TRANS
?
b
->
dimSize
[
b
->
order
-
1
]
:
b
->
dimSize
[
b
->
order
-
2
];
int
bm
=
transposedB
==
X_TRANS
?
b
->
dimSize
[
b
->
order
-
2
]
:
b
->
dimSize
[
b
->
order
-
1
];
CheckNTErrors
(
am
==
bn
,
"Unmatched tensors in multiplication!"
);
int
order
=
a
->
order
+
b
->
order
-
2
;
int
sub
=
0
;
int
*
dimSize
=
new
int
[
order
];
for
(
int
i
=
2
;
i
<
a
->
order
;
i
++
)
dimSize
[
sub
++
]
=
a
->
dimSize
RDI
[
a
->
order
+
1
-
i
];
for
(
int
i
=
2
;
i
<
b
->
order
;
i
++
)
dimSize
[
sub
++
]
=
b
->
dimSize
RDI
[
b
->
order
+
1
-
i
];
for
(
int
i
=
0
;
i
<
a
->
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
a
->
dimSize
[
i
];
for
(
int
i
=
0
;
i
<
b
->
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
b
->
dimSize
[
i
];
dimSize
[
sub
++
]
=
an
;
dimSize
[
sub
++
]
=
bm
;
...
...
@@ -271,20 +272,20 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors
(
a
.
dataType
==
b
.
dataType
,
"Input tensors should have the same data type!"
);
CheckNTErrors
(
a
.
order
>=
2
&&
b
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
int
an
=
transposedA
==
X_TRANS
?
a
.
dimSize
RDI
[
0
]
:
a
.
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
.
dimSize
RDI
[
1
]
:
a
.
dimSizeRDI
[
0
];
int
bn
=
transposedB
==
X_TRANS
?
b
.
dimSize
RDI
[
0
]
:
b
.
dimSizeRDI
[
1
];
int
bm
=
transposedB
==
X_TRANS
?
b
.
dimSize
RDI
[
1
]
:
b
.
dimSizeRDI
[
0
];
int
an
=
transposedA
==
X_TRANS
?
a
.
dimSize
[
a
.
order
-
1
]
:
a
.
dimSize
[
a
.
order
-
2
];
int
am
=
transposedA
==
X_TRANS
?
a
.
dimSize
[
a
.
order
-
2
]
:
a
.
dimSize
[
a
.
order
-
1
];
int
bn
=
transposedB
==
X_TRANS
?
b
.
dimSize
[
b
.
order
-
1
]
:
b
.
dimSize
[
b
.
order
-
2
];
int
bm
=
transposedB
==
X_TRANS
?
b
.
dimSize
[
b
.
order
-
2
]
:
b
.
dimSize
[
b
.
order
-
1
];
CheckNTErrors
(
am
==
bn
,
"Unmatched tensors in multiplication!"
);
int
order
=
a
.
order
+
b
.
order
-
2
;
int
sub
=
0
;
int
*
dimSize
=
new
int
[
order
];
for
(
int
i
=
2
;
i
<
a
.
order
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSize
RDI
[
a
.
order
+
1
-
i
];
for
(
int
i
=
2
;
i
<
b
.
order
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSize
RDI
[
b
.
order
+
1
-
i
];
for
(
int
i
=
0
;
i
<
a
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSize
[
i
];
for
(
int
i
=
0
;
i
<
b
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSize
[
i
];
dimSize
[
sub
++
]
=
an
;
dimSize
[
sub
++
]
=
bm
;
...
...
@@ -318,20 +319,20 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
if
(
!
c
.
isInit
||
!
CheckMMulShape
(
&
a
,
transposedA
,
&
b
,
transposedB
,
&
c
))
{
int
an
=
transposedA
==
X_TRANS
?
a
.
dimSize
RDI
[
0
]
:
a
.
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
.
dimSize
RDI
[
1
]
:
a
.
dimSizeRDI
[
0
];
int
bn
=
transposedB
==
X_TRANS
?
b
.
dimSize
RDI
[
0
]
:
b
.
dimSizeRDI
[
1
];
int
bm
=
transposedB
==
X_TRANS
?
b
.
dimSize
RDI
[
1
]
:
b
.
dimSizeRDI
[
0
];
int
an
=
transposedA
==
X_TRANS
?
a
.
dimSize
[
a
.
order
-
1
]
:
a
.
dimSize
[
a
.
order
-
2
];
int
am
=
transposedA
==
X_TRANS
?
a
.
dimSize
[
a
.
order
-
2
]
:
a
.
dimSize
[
a
.
order
-
1
];
int
bn
=
transposedB
==
X_TRANS
?
b
.
dimSize
[
b
.
order
-
1
]
:
b
.
dimSize
[
b
.
order
-
2
];
int
bm
=
transposedB
==
X_TRANS
?
b
.
dimSize
[
b
.
order
-
2
]
:
b
.
dimSize
[
b
.
order
-
1
];
CheckNTErrors
(
am
==
bn
,
"Unmatched tensors in multiplication!"
);
int
order
=
a
.
order
+
b
.
order
-
2
;
int
sub
=
0
;
int
*
dimSize
=
new
int
[
order
];
for
(
int
i
=
2
;
i
<
a
.
order
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSize
RDI
[
a
.
order
+
1
-
i
];
for
(
int
i
=
2
;
i
<
b
.
order
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSize
RDI
[
b
.
order
+
1
-
i
];
for
(
int
i
=
0
;
i
<
a
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSize
[
i
];
for
(
int
i
=
0
;
i
<
b
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSize
[
i
];
dimSize
[
sub
++
]
=
an
;
dimSize
[
sub
++
]
=
bm
;
...
...
@@ -370,20 +371,20 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
CheckNTErrors
(
a
.
dataType
==
b
.
dataType
,
"Input tensors should have the same data type!"
);
CheckNTErrors
(
a
.
order
>=
2
&&
b
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
int
an
=
a
.
dimSize
RDI
[
1
];
int
am
=
a
.
dimSize
RDI
[
0
];
int
bn
=
b
.
dimSize
RDI
[
1
];
int
bm
=
b
.
dimSize
RDI
[
0
];
int
an
=
a
.
dimSize
[
a
.
order
-
2
];
int
am
=
a
.
dimSize
[
a
.
order
-
1
];
int
bn
=
b
.
dimSize
[
b
.
order
-
2
];
int
bm
=
b
.
dimSize
[
b
.
order
-
1
];
CheckNTErrors
(
am
==
bn
,
"Unmatched tensors in multiplication!"
);
int
order
=
a
.
order
+
b
.
order
-
2
;
int
sub
=
0
;
int
*
dimSize
=
new
int
[
order
];
for
(
int
i
=
2
;
i
<
a
.
order
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSize
RDI
[
a
.
order
+
1
-
i
];
for
(
int
i
=
2
;
i
<
b
.
order
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSize
RDI
[
b
.
order
+
1
-
i
];
for
(
int
i
=
0
;
i
<
a
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSize
[
i
];
for
(
int
i
=
0
;
i
<
b
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSize
[
i
];
dimSize
[
sub
++
]
=
an
;
dimSize
[
sub
++
]
=
bm
;
...
...
@@ -416,20 +417,20 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
if
(
!
c
.
isInit
||
!
CheckMMulShape
(
&
a
,
X_NOTRANS
,
&
b
,
X_NOTRANS
,
&
c
))
{
int
an
=
a
.
dimSize
RDI
[
1
];
int
am
=
a
.
dimSize
RDI
[
0
];
int
bn
=
b
.
dimSize
RDI
[
1
];
int
bm
=
b
.
dimSize
RDI
[
0
];
int
an
=
a
.
dimSize
[
a
.
order
-
2
];
int
am
=
a
.
dimSize
[
a
.
order
-
1
];
int
bn
=
b
.
dimSize
[
b
.
order
-
2
];
int
bm
=
b
.
dimSize
[
b
.
order
-
1
];
CheckNTErrors
(
am
==
bn
,
"Unmatched tensors in multiplication!"
);
int
order
=
a
.
order
+
b
.
order
-
2
;
int
sub
=
0
;
int
*
dimSize
=
new
int
[
order
];
for
(
int
i
=
2
;
i
<
a
.
order
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSize
RDI
[
a
.
order
+
1
-
i
];
for
(
int
i
=
2
;
i
<
b
.
order
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSize
RDI
[
b
.
order
+
1
-
i
];
for
(
int
i
=
0
;
i
<
a
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSize
[
i
];
for
(
int
i
=
0
;
i
<
b
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSize
[
i
];
dimSize
[
sub
++
]
=
an
;
dimSize
[
sub
++
]
=
bm
;
...
...
source/tensor/core/arithmetic/MatrixMulBatched.cpp
查看文件 @
f5149a15
...
...
@@ -95,27 +95,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!"
);
CheckNTErrors
(
a
->
devID
>=
0
&&
b
->
devID
>=
0
&&
c
->
devID
>=
0
,
"The tensors must be on GPUs"
);
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSize
RDI
[
0
]
:
a
->
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSize
RDI
[
1
]
:
a
->
dimSizeRDI
[
0
];
int
bn
=
transposedB
==
X_TRANS
?
b
->
dimSize
RDI
[
0
]
:
b
->
dimSizeRDI
[
1
];
int
bm
=
transposedB
==
X_TRANS
?
b
->
dimSize
RDI
[
1
]
:
b
->
dimSizeRDI
[
0
];
int
cn
=
c
->
dimSize
RDI
[
1
];
int
cm
=
c
->
dimSize
RDI
[
0
];
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSize
[
a
->
order
-
1
]
:
a
->
dimSize
[
a
->
order
-
2
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSize
[
a
->
order
-
2
]
:
a
->
dimSize
[
a
->
order
-
1
];
int
bn
=
transposedB
==
X_TRANS
?
b
->
dimSize
[
b
->
order
-
1
]
:
b
->
dimSize
[
b
->
order
-
2
];
int
bm
=
transposedB
==
X_TRANS
?
b
->
dimSize
[
b
->
order
-
2
]
:
b
->
dimSize
[
b
->
order
-
1
];
int
cn
=
c
->
dimSize
[
c
->
order
-
2
];
int
cm
=
c
->
dimSize
[
c
->
order
-
1
];
CheckNTErrors
((
am
==
bn
&&
an
==
cn
&&
bm
==
cm
),
"Unmatched tensors in multiplication!"
);
int
aBlockSize
=
a
->
dimSize
RDI
[
0
]
*
a
->
dimSizeRDI
[
1
];
int
bBlockSize
=
b
->
dimSize
RDI
[
0
]
*
b
->
dimSizeRDI
[
1
];
int
cBlockSize
=
c
->
dimSize
RDI
[
0
]
*
c
->
dimSizeRDI
[
1
];
int
aBlockSize
=
a
->
dimSize
[
a
->
order
-
1
]
*
a
->
dimSize
[
a
->
order
-
2
];
int
bBlockSize
=
b
->
dimSize
[
b
->
order
-
1
]
*
b
->
dimSize
[
b
->
order
-
2
];
int
cBlockSize
=
c
->
dimSize
[
c
->
order
-
1
]
*
c
->
dimSize
[
c
->
order
-
2
];
int
aRealBlockSize
=
aBlockSize
*
a
->
unitSize
;
int
bRealBlockSize
=
bBlockSize
*
b
->
unitSize
;
int
cRealBlockSize
=
cBlockSize
*
c
->
unitSize
;
int
blockNum
=
1
;
for
(
int
i
=
2
;
i
<
a
->
order
;
i
++
)
{
CheckNTErrors
((
a
->
dimSize
RDI
[
i
]
==
c
->
dimSizeRDI
[
i
]),
"Incorrect tensor sizes!"
);
CheckNTErrors
((
b
->
dimSize
RDI
[
i
]
==
c
->
dimSizeRDI
[
i
]),
"Incorrect tensor sizes!"
);
blockNum
*=
a
->
dimSize
RDI
[
i
];
for
(
int
i
=
0
;
i
<
a
->
order
-
2
;
i
++
)
{
CheckNTErrors
((
a
->
dimSize
[
i
]
==
c
->
dimSize
[
i
]),
"Incorrect tensor sizes!"
);
CheckNTErrors
((
b
->
dimSize
[
i
]
==
c
->
dimSize
[
i
]),
"Incorrect tensor sizes!"
);
blockNum
*=
a
->
dimSize
[
i
];
}
int
devIDBackup
=
0
;
...
...
@@ -126,9 +126,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
a
->
data
,
transposedA
,
a
->
dataType
,
aBlockSize
,
b
->
data
,
transposedB
,
b
->
dataType
,
bBlockSize
,
c
->
data
,
c
->
dataType
,
cBlockSize
,
blockNum
,
a
->
dimSize
RDI
[
1
],
a
->
dimSizeRDI
[
0
],
b
->
dimSize
RDI
[
1
],
b
->
dimSizeRDI
[
0
],
c
->
dimSize
RDI
[
1
],
c
->
dimSizeRDI
[
0
],
alpha
,
beta
);
a
->
dimSize
[
a
->
order
-
2
],
a
->
dimSize
[
a
->
order
-
1
],
b
->
dimSize
[
b
->
order
-
2
],
b
->
dimSize
[
b
->
order
-
1
],
c
->
dimSize
[
c
->
order
-
2
],
c
->
dimSize
[
c
->
order
-
1
],
alpha
,
beta
);
BacktoCudaDev
(
a
->
devID
,
devIDBackup
);
#endif
...
...
@@ -164,32 +164,32 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!"
);
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSize
RDI
[
0
]
:
a
->
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSize
RDI
[
1
]
:
a
->
dimSizeRDI
[
0
];
int
bn
=
transposedB
==
X_TRANS
?
b
->
dimSize
RDI
[
0
]
:
b
->
dimSizeRDI
[
1
];
int
bm
=
transposedB
==
X_TRANS
?
b
->
dimSize
RDI
[
1
]
:
b
->
dimSizeRDI
[
0
];
int
cn
=
c
->
dimSize
RDI
[
1
];
int
cm
=
c
->
dimSize
RDI
[
0
];
int
an
=
transposedA
==
X_TRANS
?
a
->
dimSize
[
a
->
order
-
1
]
:
a
->
dimSize
[
a
->
order
-
2
];
int
am
=
transposedA
==
X_TRANS
?
a
->
dimSize
[
a
->
order
-
2
]
:
a
->
dimSize
[
a
->
order
-
1
];
int
bn
=
transposedB
==
X_TRANS
?
b
->
dimSize
[
b
->
order
-
1
]
:
b
->
dimSize
[
b
->
order
-
2
];
int
bm
=
transposedB
==
X_TRANS
?
b
->
dimSize
[
b
->
order
-
2
]
:
b
->
dimSize
[
b
->
order
-
1
];
int
cn
=
c
->
dimSize
[
c
->
order
-
2
];
int
cm
=
c
->
dimSize
[
c
->
order
-
1
];
CheckNTErrors
(
am
==
bn
&&
an
==
cn
&&
bm
==
cm
,
"Unmatched tensors in multiplication!"
);
int
aBlockSize
=
a
->
dimSize
RDI
[
0
]
*
a
->
dimSizeRDI
[
1
];
int
bBlockSize
=
b
->
dimSize
RDI
[
0
]
*
b
->
dimSizeRDI
[
1
];
int
cBlockSize
=
c
->
dimSize
RDI
[
0
]
*
c
->
dimSizeRDI
[
1
];
int
aBlockSize
=
a
->
dimSize
[
a
->
order
-
1
]
*
a
->
dimSize
[
a
->
order
-
2
];
int
bBlockSize
=
b
->
dimSize
[
b
->
order
-
1
]
*
b
->
dimSize
[
b
->
order
-
2
];
int
cBlockSize
=
c
->
dimSize
[
c
->
order
-
1
]
*
c
->
dimSize
[
c
->
order
-
2
];
int
aRealBlockSize
=
aBlockSize
*
a
->
unitSize
;
int
bRealBlockSize
=
bBlockSize
*
b
->
unitSize
;
int
cRealBlockSize
=
cBlockSize
*
c
->
unitSize
;
int
blockNum
=
1
;
for
(
int
i
=
2
;
i
<
a
->
order
;
i
++
)
{
CheckNTErrors
((
a
->
dimSize
RDI
[
i
]
==
c
->
dimSizeRDI
[
i
]),
"Incorrect tensor sizes!"
);
CheckNTErrors
((
b
->
dimSize
RDI
[
i
]
==
c
->
dimSizeRDI
[
i
]),
"Incorrect tensor sizes!"
);
blockNum
*=
a
->
dimSize
RDI
[
i
];
for
(
int
i
=
0
;
i
<
a
->
order
-
2
;
i
++
)
{
CheckNTErrors
((
a
->
dimSize
[
i
]
==
c
->
dimSize
[
i
]),
"Incorrect tensor sizes!"
);
CheckNTErrors
((
b
->
dimSize
[
i
]
==
c
->
dimSize
[
i
]),
"Incorrect tensor sizes!"
);
blockNum
*=
a
->
dimSize
[
i
];
}
int
aDimSize
[
2
]
=
{
-
a
->
dimSize
RDI
[
1
],
a
->
dimSizeRDI
[
0
]};
int
bDimSize
[
2
]
=
{
-
b
->
dimSize
RDI
[
1
],
b
->
dimSizeRDI
[
0
]};
int
cDimSize
[
2
]
=
{
-
c
->
dimSize
RDI
[
1
],
c
->
dimSizeRDI
[
0
]};
int
aDimSize
[
2
]
=
{
-
a
->
dimSize
[
a
->
order
-
2
],
a
->
dimSize
[
a
->
order
-
1
]};
int
bDimSize
[
2
]
=
{
-
b
->
dimSize
[
b
->
order
-
2
],
b
->
dimSize
[
b
->
order
-
1
]};
int
cDimSize
[
2
]
=
{
-
c
->
dimSize
[
c
->
order
-
2
],
c
->
dimSize
[
c
->
order
-
1
]};
XTensor
*
ai
=
NewTensor2D
(
aDimSize
[
0
],
aDimSize
[
1
],
a
->
dataType
,
a
->
devID
,
a
->
mem
);
XTensor
*
bi
=
NewTensor2D
(
bDimSize
[
0
],
bDimSize
[
1
],
b
->
dataType
,
b
->
devID
,
b
->
mem
);
...
...
@@ -292,10 +292,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
CheckNTErrors
(
a
.
order
>=
2
&&
b
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
CheckNTErrors
(
a
.
order
==
b
.
order
,
"Input tensor and output tensor must have same order!"
);
int
an
=
transposedA
==
X_TRANS
?
a
.
dimSize
RDI
[
0
]
:
a
.
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
.
dimSize
RDI
[
1
]
:
a
.
dimSizeRDI
[
0
];
int
bn
=
transposedB
==
X_TRANS
?
b
.
dimSize
RDI
[
0
]
:
b
.
dimSizeRDI
[
1
];
int
bm
=
transposedB
==
X_TRANS
?
b
.
dimSize
RDI
[
1
]
:
b
.
dimSizeRDI
[
0
];
int
an
=
transposedA
==
X_TRANS
?
a
.
dimSize
[
a
.
order
-
1
]
:
a
.
dimSize
[
a
.
order
-
2
];
int
am
=
transposedA
==
X_TRANS
?
a
.
dimSize
[
a
.
order
-
2
]
:
a
.
dimSize
[
a
.
order
-
1
];
int
bn
=
transposedB
==
X_TRANS
?
b
.
dimSize
[
b
.
order
-
1
]
:
b
.
dimSize
[
b
.
order
-
2
];
int
bm
=
transposedB
==
X_TRANS
?
b
.
dimSize
[
b
.
order
-
2
]
:
b
.
dimSize
[
b
.
order
-
1
];
CheckNTErrors
(
am
==
bn
,
"Unmatched tensors in multiplication!"
);
...
...
@@ -350,10 +350,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
CheckNTErrors
(
a
.
order
>=
2
&&
b
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
CheckNTErrors
(
a
.
order
==
b
.
order
,
"Input tensor and output tensor must have same order!"
);
int
an
=
a
.
dimSize
RDI
[
1
];
int
am
=
a
.
dimSize
RDI
[
0
];
int
bn
=
b
.
dimSize
RDI
[
1
];
int
bm
=
b
.
dimSize
RDI
[
0
];
int
an
=
a
.
dimSize
[
a
.
order
-
2
];
int
am
=
a
.
dimSize
[
a
.
order
-
1
];
int
bn
=
b
.
dimSize
[
b
.
order
-
2
];
int
bm
=
b
.
dimSize
[
b
.
order
-
1
];
CheckNTErrors
(
am
==
bn
,
"Unmatched tensors in multiplication!"
);
...
...
source/tensor/core/arithmetic/MulAndShift.cpp
查看文件 @
f5149a15
...
...
@@ -71,20 +71,21 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
CheckNTErrors
(
x
.
dataType
==
w
.
dataType
,
"Input tensors should have the same data type!"
);
CheckNTErrors
(
x
.
order
>=
2
&&
w
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
int
xn
=
x
.
dimSize
RDI
[
1
];
int
xm
=
x
.
dimSize
RDI
[
0
];
int
wn
=
w
.
dimSize
RDI
[
1
];
int
wm
=
w
.
dimSize
RDI
[
0
];
int
xn
=
x
.
dimSize
[
x
.
order
-
2
];
int
xm
=
x
.
dimSize
[
x
.
order
-
1
];
int
wn
=
w
.
dimSize
[
w
.
order
-
2
];
int
wm
=
w
.
dimSize
[
w
.
order
-
1
];
CheckNTErrors
(
xm
==
wn
,
"Unmatched tensors in multiplication!"
);
int
order
=
x
.
order
+
w
.
order
-
2
;
int
sub
=
0
;
int
*
dimSize
=
new
int
[
order
];
for
(
int
i
=
2
;
i
<
x
.
order
;
i
++
)
dimSize
[
sub
++
]
=
x
.
dimSizeRDI
[
x
.
order
+
1
-
i
];
for
(
int
i
=
2
;
i
<
w
.
order
;
i
++
)
dimSize
[
sub
++
]
=
w
.
dimSizeRDI
[
w
.
order
+
1
-
i
];
for
(
int
i
=
0
;
i
<
x
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
x
.
dimSize
[
i
];
for
(
int
i
=
0
;
i
<
w
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
w
.
dimSize
[
i
];
dimSize
[
sub
++
]
=
xn
;
dimSize
[
sub
++
]
=
wm
;
...
...
@@ -148,18 +149,18 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors
(
x
.
dataType
==
w
.
dataType
,
"Input tensors should have the same data type!"
);
CheckNTErrors
(
x
.
order
>=
2
&&
w
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
int
xn
=
transposedA
==
X_TRANS
?
x
.
dimSize
RDI
[
0
]
:
x
.
dimSizeRDI
[
1
];
int
xm
=
transposedA
==
X_TRANS
?
x
.
dimSize
RDI
[
1
]
:
x
.
dimSizeRDI
[
0
];
int
wn
=
transposedB
==
X_TRANS
?
w
.
dimSize
RDI
[
0
]
:
w
.
dimSizeRDI
[
1
];
int
wm
=
transposedB
==
X_TRANS
?
w
.
dimSize
RDI
[
1
]
:
w
.
dimSizeRDI
[
0
];
int
xn
=
transposedA
==
X_TRANS
?
x
.
dimSize
[
x
.
order
-
1
]
:
x
.
dimSize
[
x
.
order
-
2
];
int
xm
=
transposedA
==
X_TRANS
?
x
.
dimSize
[
x
.
order
-
2
]
:
x
.
dimSize
[
x
.
order
-
1
];
int
wn
=
transposedB
==
X_TRANS
?
w
.
dimSize
[
w
.
order
-
1
]
:
w
.
dimSize
[
w
.
order
-
2
];
int
wm
=
transposedB
==
X_TRANS
?
w
.
dimSize
[
w
.
order
-
2
]
:
w
.
dimSize
[
w
.
order
-
1
];
int
order
=
x
.
order
+
w
.
order
-
2
;
int
sub
=
0
;
int
*
dimSize
=
new
int
[
order
];
for
(
int
i
=
2
;
i
<
x
.
order
;
i
++
)
dimSize
[
sub
++
]
=
x
.
dimSize
RDI
[
x
.
order
+
1
-
i
];
for
(
int
i
=
2
;
i
<
w
.
order
;
i
++
)
dimSize
[
sub
++
]
=
w
.
dimSize
RDI
[
w
.
order
+
1
-
i
];
for
(
int
i
=
0
;
i
<
x
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
x
.
dimSize
[
i
];
for
(
int
i
=
0
;
i
<
w
.
order
-
2
;
i
++
)
dimSize
[
sub
++
]
=
w
.
dimSize
[
i
];
dimSize
[
sub
++
]
=
xn
;
dimSize
[
sub
++
]
=
wm
;
...
...
source/tensor/core/arithmetic/Multiply.cpp
查看文件 @
f5149a15
...
...
@@ -49,9 +49,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors!"
);
CheckDev
(
a
->
devID
,
b
->
devID
);
int
leadingDimRDI
=
a
->
order
-
leadingDim
-
1
;
#ifdef USE_CUDA
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
_CudaMultiply
(
a
,
b
,
c
,
alpha
,
leadingDim
);
...
...
@@ -64,18 +61,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
int
blockSizeB
=
1
;
int
blockSizeC
=
1
;
int
blockNum
=
1
;
int
dimensionSizeA
=
a
->
dimSize
RDI
[
leadingDimRDI
];
int
dimensionSizeB
=
b
->
dimSize
RDI
[
leadingDimRDI
];
int
dimensionSizeC
=
c
->
dimSize
RDI
[
leadingDimRDI
];
int
dimensionSizeA
=
a
->
dimSize
[
leadingDim
];
int
dimensionSizeB
=
b
->
dimSize
[
leadingDim
];
int
dimensionSizeC
=
c
->
dimSize
[
leadingDim
];
for
(
int
i
=
0
;
i
<
a
->
order
;
i
++
)
{
if
(
i
!=
leadingDim
RDI
)
{
CheckNTErrors
((
a
->
dimSize
RDI
[
i
]
==
b
->
dimSizeRDI
[
i
]
&&
a
->
dimSize
RDI
[
i
]
==
c
->
dimSizeRDI
[
i
]),
if
(
i
!=
leadingDim
)
{
CheckNTErrors
((
a
->
dimSize
[
i
]
==
b
->
dimSize
[
i
]
&&
a
->
dimSize
[
i
]
==
c
->
dimSize
[
i
]),
"Unmatched tensors!"
);
}
if
(
i
<
leadingDimRDI
)
stride
*=
a
->
dimSize
RDI
[
i
];
if
(
i
>
leadingDim
)
stride
*=
a
->
dimSize
[
i
];
}
blockSizeA
=
stride
*
dimensionSizeA
;
...
...
source/tensor/core/arithmetic/Multiply.cu
查看文件 @
f5149a15
...
...
@@ -122,26 +122,25 @@ where i is the item index
*/
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors(
a->order == b->order && a->order == c->order
, "Unmatched tensors!");
CheckNTErrors(
(a->order == b->order && a->order == c->order)
, "Unmatched tensors!");
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSize
RDI[leadingDimRDI
];
int dimensionSizeB = b->dimSize
RDI[leadingDimRDI
];
int dimensionSizeC = c->dimSize
RDI[leadingDimRDI
];
int dimensionSizeA = a->dimSize
[leadingDim
];
int dimensionSizeB = b->dimSize
[leadingDim
];
int dimensionSizeC = c->dimSize
[leadingDim
];
for (int i = 0; i < a->order; i++) {
if (i != leadingDim
RDI
) {
CheckNTErrors((a->dimSize
RDI[i] == b->dimSizeRDI
[i] &&
a->dimSize
RDI[i] == c->dimSizeRDI
[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize
[i] == b->dimSize
[i] &&
a->dimSize
[i] == c->dimSize
[i]),
"Unmatched tensors!");
}
if (i
< leadingDimRDI
)
stride *= a->dimSize
RDI
[i];
if (i
> leadingDim
)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
...
...
source/tensor/core/arithmetic/SumDim.cpp
查看文件 @
f5149a15
...
...
@@ -70,20 +70,6 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return
;
}
/*int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < a->order; i++)
dims[i] = 1;
dims[n] = a->GetDim(n);
XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
_CopyValues(b, b2);
_SumBroadcast(a, b2, c, beta);
DelTensor(b2);
return;*/
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
){
#ifdef USE_CUDA
_CudaSumDim
(
a
,
b
,
c
,
n
,
beta
);
...
...
source/tensor/core/arithmetic/SumDim.cu
查看文件 @
f5149a15
...
...
@@ -87,17 +87,17 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
int col = colIndex % colNum;
int block = colIndex / colNum;
if
(row >= rowNum || block >= blockNum)
if(row >= rowNum || block >= blockNum)
return;
if
(threadIdx.x == 0)
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if
(betaFired)
if(betaFired)
c[offset] = a[offset] + bv[threadIdx.y] * beta;
else
c[offset] = a[offset] + bv[threadIdx.y];
...
...
source/tensor/core/getandset/OnehotAndIndex.cpp
查看文件 @
f5149a15
...
...
@@ -140,6 +140,47 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
}
/*
convert an index array to onehot tensor
>> index - array of integer indices
>> n - number of items in the index array
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
>> labelSmoothingP - label smoothing parameter, handed on unchanged
                     to the tensor-based _IndexToOnehot
*/
void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP)
{
    /* wrap the raw indices in a 1D integer tensor that uses
       the same device id as the onehot tensor */
    XTensor * cudaIndex = NewTensor1D(n, X_INT, onehot->devID);
    cudaIndex->SetData(index, n);

    _IndexToOnehot(cudaIndex, onehot, size, labelSmoothingP);

    /* cudaIndex points to one tensor object, not to an array of tensors,
       so it is released with delete rather than delete[] */
    delete cudaIndex;
}
/*
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
...
...
source/tensor/core/getandset/OnehotAndIndex.h
查看文件 @
f5149a15
...
...
@@ -36,6 +36,9 @@ XTensor OnehotToIndex(const XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void
_IndexToOnehot
(
const
XTensor
*
index
,
XTensor
*
onehot
,
int
size
,
float
labelSmoothingP
);
/* convert an integer index array (of n items) to onehot tensor */
void
_IndexToOnehot
(
int
*
index
,
int
n
,
XTensor
*
onehot
,
int
size
,
float
labelSmoothingP
);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor
IndexToOnehot
(
const
XTensor
&
index
,
int
num
,
float
labelSmoothingP
);
...
...
source/tensor/core/getandset/Select.cpp
查看文件 @
f5149a15
...
...
@@ -26,6 +26,82 @@
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/*
generate a tensor with selected data in index along the given dimension

c = select(a)

>> a - input tensor
>> c - result tensor
>> index - the selected index; c->dimSize[dim] entries are read from it
>> dim - the dimension along with which we do the job
*/
void _Select(const XTensor * a, XTensor * c, int * index, int dim)
{
    CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
    CheckNTErrors(index != NULL, "empty index!");
    CheckNTErrors(a->order == c->order, "The input and output tensors must in the same order!");
    CheckNTErrors(dim >= 0 && dim < a->order, "The input dimension is out of bounds!");
    CheckNTErrors(a->dataType == c->dataType, "The tensor must be of the same data type!");

    /* number of items covered by one step along dim */
    int stride = 1;
    for (int i = dim + 1; i < a->order; i++)
        stride *= a->dimSize[i];

    /* number of outer blocks, i.e., the product of the dimensions before dim */
    int copyTimes = 1;
    for (int i = 0; i < dim; i++) {
        copyTimes *= a->dimSize[i];
    }

    /* number of slices to select for every outer block */
    int cot = c->dimSize[dim];

    /* every selected index must address an existing slice of a,
       otherwise the copy below would read outside of a->data */
    for (int j = 0; j < cot; j++) {
        CheckNTErrors(index[j] >= 0 && index[j] < a->dimSize[dim],
                      "The selected index is out of bounds!");
    }

    /* all sizes below are in bytes */
    int blockSize = stride * a->unitSize;
    int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
    int stepSizeT = stride * c->dimSize[dim] * a->unitSize;

    char * s = (char*)a->data;
    char * t = (char*)c->data;

    for (int i = 0; i < copyTimes; i++) {
        /* the j-th slice of the target block is the index[j]-th slice of the source block */
        for (int j = 0; j < cot; ++j) {
            XMemCopy(t + j * blockSize, c->devID, s + index[j] * blockSize, a->devID, blockSize);
        }
        s += stepSizeS;
        t += stepSizeT;
    }
}
/*
generate a tensor with selected data in index along the given dimension,
where the index is kept in a tensor

c = select(a)

>> a - input tensor
>> c - result tensor
>> index - the tensor that keeps the selected index
>> dim - the dimension along with which we do the job
*/
void _Select(const XTensor * a, XTensor * c, XTensor * index, int dim)
{
    /* negative device id: the index data is handed to the
       array-based version as it is */
    if (index->devID < 0) {
        _Select(a, c, (int *)index->data, dim);
        return;
    }

    /* otherwise the index is first copied into a temporary buffer
       (device id -1) and the buffer is used as the index array */
    int * hostIndex = new int[index->unitNum];
    XMemCopy(hostIndex, -1, index->data, index->devID, index->unitNum * sizeof(int));

    _Select(a, c, hostIndex, dim);

    delete[] hostIndex;
}
/*
*/
/*XTensor Select(const XTensor &a, int* index, int dim)
{
}*/
/*
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
...
...
@@ -58,13 +134,12 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
}
int
stride
=
1
;
int
dimRDI
=
a
->
order
-
dim
-
1
;
for
(
int
i
=
0
;
i
<
dimRDI
;
i
++
)
stride
*=
a
->
dimSizeRDI
[
i
];
for
(
int
i
=
dim
+
1
;
i
<
a
->
order
;
i
++
)
stride
*=
a
->
dimSize
[
i
];
int
copyTimes
=
1
;
for
(
int
i
=
dimRDI
+
1
;
i
<
a
->
order
;
i
++
)
copyTimes
*=
a
->
dimSize
RDI
[
i
];
for
(
int
i
=
0
;
i
<
dim
;
i
++
)
copyTimes
*=
a
->
dimSize
[
i
];
int
blockSize
=
stride
*
(
high
-
low
)
*
a
->
unitSize
;
int
stepSizeS
=
stride
*
a
->
dimSize
[
dim
]
*
a
->
unitSize
;
...
...
@@ -117,12 +192,10 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
_SelectRange
(
&
a
,
&
c
,
dim
,
low
,
high
);
/* tensor connection */
if
(
a
.
enableGrad
)
{
XLink
::
MakeLink
(
&
a
,
NULL
,
&
c
,
GETANDSET_SELECT
);
XLink
::
AddParamToHeadInt
(
&
c
,
dim
);
XLink
::
AddParamToHeadInt
(
&
c
,
low
);
XLink
::
AddParamToHeadInt
(
&
c
,
high
);
}
/* destroy variables */
delete
[]
dimSize
;
...
...
source/tensor/core/getandset/Select.h
查看文件 @
f5149a15
...
...
@@ -27,7 +27,10 @@
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
void
_Select
(
const
XTensor
*
a
,
XTensor
*
c
,
XTensor
*
indexCPU
);
void
_Select
(
const
XTensor
*
a
,
XTensor
*
c
,
int
*
index
,
int
dim
);
/* generate a tensor with selected data c = select(a) */
void
_Select
(
const
XTensor
*
a
,
XTensor
*
c
,
XTensor
*
index
,
int
dim
);
/*
generate a tensor with selected data c = select(a) (return an XTensor structure)
...
...
source/tensor/core/math/Normalize.cpp
查看文件 @
f5149a15
...
...
@@ -47,26 +47,25 @@ void _Normalize(const XTensor * input, XTensor * output, int dim,
const
XTensor
*
mean
,
const
XTensor
*
var
,
const
XTensor
*
a
,
const
XTensor
*
b
,
DTYPE
epsilon
)
{
int
dimRDI
=
input
->
order
-
dim
-
1
;
CheckNTErrors
((
_IsSameShaped
(
input
,
output
)),
"Unmatched input tensors!"
);
CheckNTErrors
((
_IsSameShaped
(
a
,
b
)),
"Unmatched input tensors"
);
CheckNTErrors
((
_IsSameShaped
(
mean
,
var
)),
"Unmatched input tensors"
);
CheckNTErrors
((
input
&&
output
&&
mean
&&
var
&&
a
&&
b
),
"Empty input tensors!"
);
CheckNTErrors
((
dim
RDI
>=
0
&&
dimRDI
<
input
->
order
),
"Incorrect reduction dimension!"
);
CheckNTErrors
((
dim
>=
0
&&
dim
<
input
->
order
),
"Incorrect reduction dimension!"
);
CheckNTErrors
((
input
->
order
==
mean
->
order
+
1
),
"Incorrect reduction dimension!"
);
int
stride
=
1
;
int
strideNum
=
input
->
dimSize
RDI
[
dimRDI
];
int
strideNum
=
input
->
dimSize
[
dim
];
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
input
->
order
;
i
++
)
{
if
(
i
<
dim
RDI
)
{
CheckNTErrors
((
input
->
dimSize
RDI
[
i
]
==
mean
->
dimSizeRDI
[
i
]),
"Wrong size!"
);
stride
*=
input
->
dimSizeRDI
[
i
];
if
(
i
<
dim
)
{
CheckNTErrors
((
input
->
dimSize
[
i
]
==
mean
->
dimSize
[
i
]),
"Wrong size!"
);
blockNum
*=
input
->
dimSize
[
i
];
}
else
if
(
i
>
dim
RDI
)
{
CheckNTErrors
((
input
->
dimSize
RDI
[
i
]
==
mean
->
dimSizeRDI
[
i
-
1
]),
"Wrong size!"
);
blockNum
*=
input
->
dimSizeRDI
[
i
];
else
if
(
i
>
dim
)
{
CheckNTErrors
((
input
->
dimSize
[
i
]
==
mean
->
dimSize
[
i
-
1
]),
"Wrong size!"
);
stride
*=
input
->
dimSize
[
i
];
}
}
blockSize
=
stride
*
strideNum
;
...
...
source/tensor/core/math/Normalize.cu
查看文件 @
f5149a15
...
...
@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
{
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = input->order - dim - 1;
int stride = 1;
int strideNum = input->dimSize
RDI[dimRDI
];
int strideNum = input->dimSize
[dim
];
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i
< dimRDI
)
stride *= input->dimSize
RDI
[i];
else if (i
> dimRDI
)
blockNum *= input->dimSize
RDI
[i];
if (i
> dim
)
stride *= input->dimSize[i];
else if (i
< dim
)
blockNum *= input->dimSize[i];
}
int cudaGridSize[3];
...
...
source/tensor/core/movement/CopyInGrid.cpp
查看文件 @
f5149a15
...
...
@@ -41,12 +41,11 @@ void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int
{
CheckNTErrors
((
_IsSameShaped
(
s
,
t
)),
"Unmatched tensors!"
);
int
blockDimRDI
=
s
->
order
-
blockDim
-
1
;
int
blockSize
=
1
;
int
blockNum
=
blockNumInGrid
;
int
gridNum
=
1
;
for
(
int
i
=
0
;
i
<
blockDimRDI
;
i
++
)
blockSize
*=
s
->
dimSize
RDI
[
i
];
for
(
int
i
=
blockDim
;
i
<
s
->
order
;
i
++
)
blockSize
*=
s
->
dimSize
[
i
];
CheckNTErrors
((
s
->
unitNum
%
(
blockSize
*
blockNum
)
==
0
),
"Illegal block number!"
);
gridNum
=
s
->
unitNum
/
(
blockSize
*
blockNum
);
...
...
source/tensor/core/movement/CopyIndexed.cpp
查看文件 @
f5149a15
...
...
@@ -53,26 +53,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
CheckNTErrors
(
dim
<
s
->
order
&&
dim
<
t
->
order
,
"A too larget dimension specified!"
);
CheckNTErrors
(
s
->
unitSize
==
t
->
unitSize
,
"Unmatched tensors!"
);
int
dimRDI
=
s
->
order
-
dim
-
1
;
int
blockSizeSrc
=
1
;
int
blockSizeTgt
=
1
;
int
blockNumSrc
=
1
;
int
blockNumTgt
=
1
;
int
leadDimSizeSrc
=
s
->
dimSize
RDI
[
dimRDI
];
int
leadDimSizeTgt
=
t
->
dimSize
RDI
[
dimRDI
];
int
leadDimSizeSrc
=
s
->
dimSize
[
dim
];
int
leadDimSizeTgt
=
t
->
dimSize
[
dim
];
int
indexOffsetNum
=
1
;
for
(
int
i
=
0
;
i
<
dimRDI
;
i
++
)
{
blockSizeSrc
*=
s
->
dimSizeRDI
[
i
];
blockSizeTgt
*=
t
->
dimSizeRDI
[
i
];
for
(
int
i
=
dim
+
1
;
i
<
s
->
order
;
i
++
)
{
blockSizeSrc
*=
s
->
dimSize
[
i
];
}
for
(
int
i
=
dim
+
1
;
i
<
t
->
order
;
i
++
)
{
blockSizeTgt
*=
t
->
dimSize
[
i
];
}
for
(
int
i
=
0
;
i
<=
dim
;
i
++
)
{
blockNumSrc
*=
s
->
dimSize
[
i
];
blockNumTgt
*=
t
->
dimSize
[
i
];
}
for
(
int
i
=
dimRDI
;
i
<
s
->
order
;
i
++
)
blockNumSrc
*=
s
->
dimSizeRDI
[
i
];
for
(
int
i
=
dimRDI
;
i
<
t
->
order
;
i
++
)
blockNumTgt
*=
t
->
dimSizeRDI
[
i
];
CheckNTErrors
(
blockSizeSrc
==
blockSizeTgt
,
"Unmatched tensors!"
);
indexOffsetNum
=
blockNumSrc
/
s
->
dimSize
RDI
[
dimRDI
];
indexOffsetNum
=
blockNumSrc
/
s
->
dimSize
[
dim
];
int
realIndexSize
=
indexOffsetNum
*
indexSize
*
copyNum
;
int
*
realSrcIndex
=
new
int
[
realIndexSize
];
...
...
@@ -219,14 +221,14 @@ make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (
4, 2, 3) and dim = 0
,
we have 4 sub-tensors of size (
2, 3
)
e.g., for a tensor of size (
3, 2, 4) and dim = 2
,
we have 4 sub-tensors of size (
3,2
)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [
0,1
] and copyNum = 2,
we actually copy the source sub-tensors
0, 1, 1 and 2
e.g., for srcIndex = [
1,4
] and copyNum = 2,
we actually copy the source sub-tensors
1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor
CopyIndexed
(
const
XTensor
&
s
,
int
dim
,
...
...
@@ -277,14 +279,14 @@ make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (
4, 2, 3) and dim = 0
,
we have 4 sub-tensors of size (
2, 3
)
e.g., for a tensor of size (
3, 2, 4) and dim = 2
,
we have 4 sub-tensors of size (
3,2
)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [
0,1
] and copyNum = 2,
we actually copy the source sub-tensors
0, 1, 1 and 2
e.g., for srcIndex = [
1,4
] and copyNum = 2,
we actually copy the source sub-tensors
1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor
CopyIndexed
(
const
XTensor
&
s
,
int
dim
,
int
*
srcIndex
,
int
indexSize
,
int
*
tgtIndex
,
int
copyNum
)
...
...
source/tensor/core/movement/Gather.cpp
查看文件 @
f5149a15
...
...
@@ -33,6 +33,51 @@ gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
*/
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
{
    /* target positions are simply 0, 1, ..., indexSize - 1, i.e.,
       the i-th selected source sub-tensor becomes the i-th target sub-tensor */
    int * targetPositions = new int[indexSize];
    int pos = 0;
    while (pos < indexSize) {
        targetPositions[pos] = pos;
        pos++;
    }

    /* the last argument (copyNum) is 1 */
    _CopyIndexed(s, t, dim, srcIndex, indexSize, targetPositions, 1);

    delete[] targetPositions;
}
/*
gather indexed sub-tensors along a given dimension
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor that keeps the index of the source sub-tensors
>> dim - the leading dimension to define "sub-tensors"
         e.g., for a tensor of size (3, 2, 4) and dim = 2,
         we have 4 sub-tensors of size (3, 2)
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
    /* srcIndex is dereferenced below, so it is validated together with s and t */
    CheckNTErrors((s && t && srcIndex), "Invalid tensors!");
    CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
    CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");

#ifdef USE_CUDA
    if (s->devID >= 0 && t->devID >= 0) {
        _CudaGather(s, t, srcIndex, dim);
        return;
    }
#endif

    /* NOTE(review): nothing is gathered when the tensors have a negative
       device id or when USE_CUDA is not defined; t is left untouched in
       that case. A CPU path seems to be missing here - confirm with callers. */
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void
_Gather
(
const
XTensor
*
s
,
XTensor
*
t
,
XTensor
*
srcIndex
)
...
...
@@ -79,10 +124,15 @@ XTensor Gather(XTensor &s, XTensor &index)
CheckNTErrors
(
s
.
order
==
2
,
"The order of the input tensor must be 2!"
);
int
order
=
index
.
order
+
1
;
int
order
=
s
.
order
;
int
*
dimSize
=
new
int
[
order
];
memcpy
(
dimSize
,
index
.
dimSize
,
index
.
order
*
sizeof
(
int
));
dimSize
[
index
.
order
]
=
s
.
GetDim
(
-
1
);
for
(
int
i
=
0
;
i
<
s
.
order
;
i
++
)
{
if
(
i
==
dim
)
dimSize
[
i
]
=
index
.
unitNum
;
else
dimSize
[
i
]
=
s
.
dimSize
[
i
];
}
float
dr
=
(
!
s
.
isSparse
)
?
1.0
F
:
s
.
denseRatio
;
XTensor
t
(
order
,
dimSize
,
s
.
dataType
,
dr
,
s
.
devID
,
s
.
mem
);
...
...
@@ -93,11 +143,22 @@ XTensor Gather(XTensor &s, XTensor &index)
_Gather
(
&
s
,
&
t
,
&
index
);
/* tensor connection */
if
(
s
.
enableGrad
)
{
XLink
::
MakeLink
(
&
s
,
&
index
,
&
t
,
MOVEMENT_GATHER
);
}
if
(
index
.
order
>
1
)
{
int
*
dims
=
new
int
[
index
.
order
+
1
];
memcpy
(
dims
,
index
.
dimSize
,
index
.
order
*
sizeof
(
int
));
dims
[
index
.
order
]
=
t
.
GetDim
(
-
1
);
XTensor
tt
;
tt
=
Reshape
(
t
,
index
.
order
+
1
,
dims
);
delete
[]
dims
;
return
tt
;
}
else
{
return
t
;
}
}
}
//
namespace
nts
(
NiuTrans
.
Tensor
)
\ No newline at end of file
source/tensor/core/movement/Gather.cu
查看文件 @
f5149a15
...
...
@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
/*
gather indexed sub-tensors(cuda version)
every item i of the target is read from the source at the offset given by the index,
i.e., tData[i] = sData[sIndex[i]]
>> sData - the data pointer of the source tensor
>> tData - the data pointer of the target tensor
>> sIndex - the index array; sIndex[i] is the offset in sData of the item copied to tData[i]
>> stride - stride of a data block
>> strideNum - strideNum of a data block
>> blockNum - number of data blocks
              (stride * strideNum * blockNum items are processed at most)
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
{
/* global thread ids along x and y */
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
/* the y id is split into a block id and an offset (< stride) inside the block */
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
/* total number of items; used as the upper bound of i */
int size = stride * strideNum * blockNum;
/* i starts at the idx-th stride of block blockIndex (plus the offset) and stays
   below both the end of that block (shifted by the offset) and size.
   NOTE(review): the step is stride * blockDim.x and does not involve gridDim.x,
   so with more than one thread block along x the same items appear to be
   written by several threads - confirm this is intended. */
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]];
}
}
/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
...
...
@@ -117,6 +147,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
BacktoCudaDev(devID, devIDBackup);
}
/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
>> dim - the leading dimension to define "sub-tensors"
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
    /* the kernel runs on the device of the data. The device id of srcIndex
       must not be used here: it is negative exactly when the index has to
       be copied to the device. */
    int devID = s->devID;
    XMem * mem = s->mem;

    int stride = 1;
    int blockNum = 1;
    int indexSize = srcIndex->unitNum;
    int strideNum = srcIndex->dimSize[dim];
    for (int i = 0; i < dim; i++)
        blockNum *= srcIndex->dimSize[i];
    for (int i = dim + 1; i < srcIndex->order; i++)
        stride *= srcIndex->dimSize[i];

    int devIDBackup;
    ProtectCudaDev(devID, devIDBackup);

    /* the index must be readable by the kernel: if it has a negative device id,
       its data is copied into a temporary buffer on devID */
    bool indexCopied = srcIndex->devID < 0;
    int * sIndex = NULL;
    if (indexCopied) {
        /* mem is NULL in the second branch, so devID is used there instead of mem->devID */
        sIndex = mem != NULL ?
                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);
        /* copy the index data (srcIndex->data), not the tensor object itself */
        XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
    }
    else
        sIndex = (int *)srcIndex->data;

    int cudaGrids[3];
    int cudaBlocks[3];

    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride * blockNum, MAX_INT, cudaGrids, cudaBlocks);

    KernelGather<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);

    /* give back the temporary index buffer */
    if (indexCopied) {
        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
        else
            XMemFree(devID, sIndex);
    }

    BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
source/tensor/core/movement/Gather.cuh
查看文件 @
f5149a15
...
...
@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather indexed sub-tensors(cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex,int dim);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/movement/Gather.h
查看文件 @
f5149a15
...
...
@@ -27,8 +27,14 @@
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */
void
_Gather
(
XTensor
*
s
,
XTensor
*
t
,
int
dim
,
int
*
srcIndex
,
int
indexSize
);
/* gather selected sub-tensors */
void
_Gather
(
const
XTensor
*
s
,
XTensor
*
t
,
XTensor
*
srcIndex
);
/* gather selected sub-tensors according to the dimension */
void
_Gather
(
const
XTensor
*
s
,
XTensor
*
t
,
XTensor
*
srcIndex
,
int
dim
);
/* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor
Gather
(
XTensor
&
s
,
XTensor
&
index
);
...
...
source/tensor/core/reduce/ReduceMax.cpp
查看文件 @
f5149a15
差异被折叠。
点击展开。
source/tensor/core/reduce/ReduceMax.cu
查看文件 @
f5149a15
差异被折叠。
点击展开。
source/tensor/core/reduce/ReduceMax.cuh
查看文件 @
f5149a15
...
...
@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMin(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/reduce/ReduceMax.h
查看文件 @
f5149a15
...
...
@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */
void
_ReduceMax
(
const
XTensor
*
input
,
XTensor
*
output
,
int
dim
);
/* get the min value of the items along a dimension of the tensor. */
void
_ReduceMin
(
const
XTensor
*
input
,
XTensor
*
output
,
int
dim
);
/*
get the max value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor
ReduceMax
(
const
XTensor
&
input
,
int
dim
);
/* get the max value of the items along a dimension of the tensor. */
void
ReduceMax
(
const
XTensor
&
input
,
XTensor
&
output
,
int
dim
);
/*
get the min value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor
ReduceMin
(
const
XTensor
&
input
,
int
dim
);
}
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/reduce/ReduceMean.cpp
查看文件 @
f5149a15
...
...
@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors
((
input
->
order
>
dim
),
"Illegal dimension specified!"
);
int
dimRDI
=
input
->
order
-
dim
-
1
;
int
num
=
input
->
dimSizeRDI
[
dimRDI
];
int
num
=
input
->
dimSize
[
dim
];
_ReduceSum
(
input
,
output
,
dim
);
_ScaleAndShiftMe
(
output
,
(
DTYPE
)
1
/
num
,
0
);
...
...
source/tensor/core/reduce/ReduceSum.cpp
查看文件 @
f5149a15
...
...
@@ -54,15 +54,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors
((
input
->
dataType
==
output
->
dataType
),
"Unmatched data types!"
);
CheckNTErrors
((
shift
==
NULL
||
_IsSameShaped
(
output
,
shift
)),
"Incorrect shift tensor size!"
);
int
dimRDI
=
input
->
order
-
dim
-
1
;
CheckNTErrors
(
dimRDI
>=
0
,
"Wrong dimension!"
);
CheckNTErrors
(
dim
<
input
->
order
,
"Wrong dimension!"
);
for
(
int
i
=
0
;
i
<
input
->
order
;
i
++
){
if
(
i
<
dim
RDI
){
CheckNTErrors
((
input
->
dimSize
RDI
[
i
]
==
output
->
dimSizeRDI
[
i
]),
"Unmatched tensors!"
);
if
(
i
<
dim
){
CheckNTErrors
((
input
->
dimSize
[
i
]
==
output
->
dimSize
[
i
]),
"Unmatched tensors!"
);
}
else
if
(
i
>
dim
RDI
){
CheckNTErrors
((
input
->
dimSize
RDI
[
i
]
==
output
->
dimSizeRDI
[
i
-
1
]),
"Unmatched tensors!"
);
else
if
(
i
>
dim
){
CheckNTErrors
((
input
->
dimSize
[
i
]
==
output
->
dimSize
[
i
-
1
]),
"Unmatched tensors!"
);
}
}
...
...
@@ -75,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors
((
input
->
dataType
==
DEFAULT_DTYPE
),
"TODO!"
);
int
stride
=
1
;
int
strideNum
=
input
->
dimSize
RDI
[
dimRDI
];
int
strideNum
=
input
->
dimSize
[
dim
];
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
input
->
order
;
i
++
)
{
if
(
i
<
dim
RDI
)
stride
*=
input
->
dimSizeRDI
[
i
];
else
if
(
i
>
dim
RDI
)
blockNum
*=
input
->
dimSizeRDI
[
i
];
if
(
i
<
dim
)
blockNum
*=
input
->
dimSize
[
i
];
else
if
(
i
>
dim
)
stride
*=
input
->
dimSize
[
i
];
}
blockSize
=
stride
*
strideNum
;
if
(
input
->
dimSize
RDI
[
0
]
%
(
4
*
32
/
sizeof
(
DTYPE
))
==
0
&&
input
->
dimSizeRDI
[
0
]
>=
32
){
if
(
input
->
dimSize
[
input
->
order
-
1
]
%
(
4
*
32
/
sizeof
(
DTYPE
))
==
0
&&
input
->
dimSize
[
input
->
order
-
1
]
>=
32
){
int
vecBufLength
=
32
/
sizeof
(
DTYPE
);
if
(
dim
RDI
==
0
){
if
(
dim
==
input
->
order
-
1
){
//data is contiguous in dim 0
for
(
int
i
=
0
;
i
<
blockNum
;
i
++
){
// stride = 1
...
...
@@ -123,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
}
else
{
//data is separated
for
(
int
i
=
0
;
i
<
blockNum
;
i
++
){
for
(
int
j
=
0
;
j
<
input
->
dimSize
RDI
[
0
]
/
32
;
j
++
){
for
(
int
j
=
0
;
j
<
input
->
dimSize
[
input
->
order
-
1
]
/
32
;
j
++
){
DTYPE
*
ip
=
(
DTYPE
*
)
input
->
data
+
blockSize
*
i
;
DTYPE
*
op
=
(
DTYPE
*
)
output
->
data
+
stride
*
i
;
DTYPE
*
sp
=
shift
!=
NULL
?
(
DTYPE
*
)
shift
->
data
+
stride
*
i
:
NULL
;
...
...
source/tensor/core/reduce/ReduceSum.cu
查看文件 @
f5149a15
...
...
@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dim
RDI
){
CheckNTErrors(input->dimSize
RDI[i] == output->dimSizeRDI
[i], "Unmatched tensors!");
if(i < dim){
CheckNTErrors(input->dimSize
[i] == output->dimSize
[i], "Unmatched tensors!");
}
else if(i > dim
RDI
){
CheckNTErrors(input->dimSize
RDI[i] == output->dimSizeRDI
[i - 1], "Unmatched tensors!");
else if(i > dim){
CheckNTErrors(input->dimSize
[i] == output->dimSize
[i - 1], "Unmatched tensors!");
}
}
...
...
@@ -709,32 +708,24 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int cudaBlockSize[3];
int iter = 0;
int stride = 1;
int strideNum = input->dimSize
RDI[dimRDI
];
int strideNum = input->dimSize
[dim
];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dim
RDI
)
stride *= input->dimSizeRDI
[i];
else if (i > dim
RDI
)
blockNum *= input->dimSizeRDI
[i];
if (i < dim)
blockNum *= input->dimSize
[i];
else if (i > dim)
stride *= input->dimSize
[i];
}
blockSize = stride * strideNum;
int devID = input->devID;
XMem * mem = input->mem;
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
...
...
@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum,sp, power, isExp);
}
else {
XMem * mem = input->mem;
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
...
...
@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
iter++;
} while (strideNum > 1);
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, bufSize);
else
XMemFree(input->devID, buf);
XMemFree(devID, buf);
}
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
...
...
source/tensor/core/reduce/ReduceVariance.cpp
查看文件 @
f5149a15
...
...
@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
void
_ReduceVariance
(
const
XTensor
*
input
,
XTensor
*
output
,
int
dim
,
const
XTensor
*
mean
)
{
int
dimRDI
=
input
->
order
-
dim
-
1
;
int
num
=
input
->
dimSizeRDI
[
dimRDI
];
int
num
=
input
->
dimSize
[
dim
];
_ReduceSum
(
input
,
output
,
dim
,
mean
,
2.0
F
);
_ScaleAndShiftMe
(
output
,
(
DTYPE
)
1
/
num
,
0
);
}
...
...
source/tensor/core/reduce/VectorBuffer.cpp
查看文件 @
f5149a15
...
...
@@ -20,7 +20,7 @@
*/
#include "VectorBuffer.h"
//#include "math.h"
namespace
nts
{
/* data size for each buffer */
int
VectorBuffer
::
size
()
...
...
@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
return
*
this
;
}
/*
calculate the min of two buffers
each item of this buffer is replaced by the smaller one of
the item itself and the item of a at the same position
>> a - the buffer to compare with
<< return - a copy of this buffer after the update
*/
VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
    for (int i = 0; i != a.size(); i++) {
        this->values[i] = MIN(a[i], this->values[i]);
    }
    return *this;
}
}
/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
source/tensor/core/reduce/VectorBuffer.h
查看文件 @
f5149a15
...
...
@@ -20,7 +20,6 @@
*/
//#include <cstring>
#include <math.h>
#include "../../XGlobal.h"
namespace
nts
{
...
...
@@ -49,5 +48,8 @@ public:
/* calculate the max of two buffers */
VectorBuffer
maxData
(
const
VectorBuffer
&
a
);
/* calculate the min of two buffers */
VectorBuffer
minData
(
const
VectorBuffer
&
a
);
};
}
\ No newline at end of file
source/tensor/core/shape/ConcatenateSolely.cpp
查看文件 @
f5149a15
...
...
@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
CheckNTErrors
(
big
->
order
>
dim
&&
dim
>=
0
,
"Illegal dimension to concatenate!"
);
int
catDimSize
=
0
;
int
dimRDI
=
big
->
order
-
dim
-
1
;
for
(
int
i
=
0
;
i
<
smalls
->
count
;
i
++
)
{
XTensor
*
tensor
=
(
XTensor
*
)
smalls
->
GetItem
(
i
);
CheckNTErrors
((
big
->
order
==
tensor
->
order
),
"Unmatched tensor orders!"
);
for
(
int
j
=
0
;
j
<
big
->
order
;
j
++
)
{
if
(
j
!=
dim
RDI
)
{
CheckNTErrors
((
big
->
dimSize
RDI
[
j
]
==
tensor
->
dimSizeRDI
[
j
]),
"Unmatched tensor sizes!"
);
if
(
j
!=
dim
)
{
CheckNTErrors
((
big
->
dimSize
[
j
]
==
tensor
->
dimSize
[
j
]),
"Unmatched tensor sizes!"
);
}
else
{
catDimSize
+=
tensor
->
dimSize
RDI
[
j
];
catDimSize
+=
tensor
->
dimSize
[
j
];
}
}
}
CheckNTErrors
((
catDimSize
==
big
->
dimSize
RDI
[
dimRDI
]),
"Unmatched tensor sizes!"
);
CheckNTErrors
((
catDimSize
==
big
->
dimSize
[
dim
]),
"Unmatched tensor sizes!"
);
int
stride
=
1
;
for
(
int
i
=
0
;
i
<
dimRDI
;
i
++
)
stride
*=
big
->
dimSizeRDI
[
i
];
int
blockNum
=
1
;
for
(
int
i
=
dimRDI
+
1
;
i
<
big
->
order
;
i
++
)
blockNum
*=
big
->
dimSizeRDI
[
i
];
for
(
int
i
=
0
;
i
<
dim
;
i
++
)
blockNum
*=
big
->
dimSize
[
i
];
for
(
int
i
=
dim
+
1
;
i
<
big
->
order
;
i
++
)
stride
*=
big
->
dimSize
[
i
];
int
offset
=
0
;
...
...
@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
if
(
smalls
->
count
<=
MIN_TENSOR_CAT_NUM
)
{
for
(
int
i
=
0
;
i
<
smalls
->
count
;
i
++
)
{
XTensor
*
tensor
=
(
XTensor
*
)
smalls
->
GetItem
(
i
);
int
sPitch
=
stride
*
tensor
->
dimSize
RDI
[
dimRDI
]
*
tensor
->
unitSize
;
int
tPitch
=
stride
*
big
->
dimSize
RDI
[
dimRDI
]
*
big
->
unitSize
;
int
sPitch
=
stride
*
tensor
->
dimSize
[
dim
]
*
tensor
->
unitSize
;
int
tPitch
=
stride
*
big
->
dimSize
[
dim
]
*
big
->
unitSize
;
int
mSize
=
sPitch
;
int
n
=
blockNum
;
XMemCopy2D
((
char
*
)
big
->
data
+
offset
,
tPitch
,
big
->
devID
,
...
...
@@ -89,7 +88,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
int
*
blockSizes
=
new
int
[
smalls
->
count
];
for
(
int
i
=
0
;
i
<
smalls
->
count
;
i
++
)
{
XTensor
*
tensor
=
(
XTensor
*
)
smalls
->
GetItem
(
i
);
blockSizes
[
i
]
=
stride
*
tensor
->
dimSize
RDI
[
dimRDI
]
*
tensor
->
unitSize
;
blockSizes
[
i
]
=
stride
*
tensor
->
dimSize
[
dim
]
*
tensor
->
unitSize
;
sourceArrays
->
Add
((
char
*
)
tensor
->
data
);
}
...
...
source/tensor/core/shape/IsSameShaped.cpp
查看文件 @
f5149a15
...
...
@@ -39,7 +39,7 @@ bool _IsSameShaped(const XTensor * a, const XTensor * b)
return
false
;
for
(
int
i
=
0
;
i
<
a
->
order
;
i
++
){
if
(
a
->
dimSize
RDI
[
i
]
!=
b
->
dimSizeRDI
[
i
])
if
(
a
->
dimSize
[
i
]
!=
b
->
dimSize
[
i
])
return
false
;
}
...
...
source/tensor/core/shape/Merge.cpp
查看文件 @
f5149a15
...
...
@@ -46,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if
(
leadingDim
<
0
)
leadingDim
=
0
;
int
whereToMergeRDI
=
s
->
order
-
whereToMerge
-
1
;
int
leadingDimRDI
=
s
->
order
-
leadingDim
-
1
;
if
(
leadingDimRDI
<
0
)
leadingDimRDI
=
s
->
order
-
1
;
if
(
leadingDim
>=
s
->
order
)
leadingDim
=
leadingDim
-
s
->
order
;
CheckNTErrors
((
s
!=
NULL
&&
t
!=
NULL
),
"Invalid tensors!"
);
CheckNTErrors
((
s
->
devID
==
t
->
devID
||
(
s
->
devID
<
0
&&
t
->
devID
<
0
)),
...
...
@@ -57,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors
((
s
->
unitNum
==
t
->
unitNum
&&
s
->
unitSize
==
t
->
unitSize
),
"Unmatched tensors!"
);
CheckNTErrors
((
s
->
order
==
t
->
order
+
1
),
"Unmatched tensors!"
);
CheckNTErrors
((
leadingDim
RDI
>
whereToMergeRDI
),
"Invalid leading dimension!"
);
CheckNTErrors
((
leadingDim
<
whereToMerge
),
"Invalid leading dimension!"
);
for
(
int
i
=
0
;
i
<
s
->
order
;
i
++
)
{
if
(
i
==
whereToMergeRDI
)
{
CheckNTErrors
((
t
->
dimSizeRDI
[
i
]
==
s
->
dimSizeRDI
[
i
]
*
s
->
dimSizeRDI
[
leadingDimRDI
]),
if
(
i
==
whereToMerge
)
{
CheckNTErrors
((
t
->
dimSize
[
i
-
1
]
==
s
->
dimSize
[
i
]
*
s
->
dimSize
[
leadingDim
]),
"Unmatched tensor sizes!"
);
}
else
if
(
i
<
leadingDim
RDI
){
CheckNTErrors
((
s
->
dimSize
RDI
[
i
]
==
t
->
dimSizeRDI
[
i
]),
else
if
(
i
<
leadingDim
){
CheckNTErrors
((
s
->
dimSize
[
i
]
==
t
->
dimSize
[
i
]),
"Unmatched tensor sizes!"
);
}
else
if
(
i
>
leadingDim
RDI
)
{
CheckNTErrors
((
s
->
dimSize
RDI
[
i
]
==
t
->
dimSizeRDI
[
i
-
1
]),
else
if
(
i
>
leadingDim
)
{
CheckNTErrors
((
s
->
dimSize
[
i
]
==
t
->
dimSize
[
i
-
1
]),
"Unmatched tensor sizes!"
);
}
}
...
...
@@ -78,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
int
blockNum
=
1
;
int
gridSize
=
1
;
int
gridNum
=
1
;
int
mergedNum
=
s
->
dimSize
RDI
[
leadingDimRDI
];
int
mergedNum
=
s
->
dimSize
[
leadingDim
];
for
(
int
i
=
0
;
i
<
s
->
order
;
i
++
)
{
if
(
i
<=
leadingDimRDI
)
{
if
(
i
<=
whereToMergeRDI
)
blockSize
*=
s
->
dimSize
RDI
[
i
];
if
(
i
>=
leadingDim
)
{
if
(
i
>=
whereToMerge
)
blockSize
*=
s
->
dimSize
[
i
];
else
blockNum
*=
s
->
dimSize
RDI
[
i
];
blockNum
*=
s
->
dimSize
[
i
];
}
}
...
...
@@ -122,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if
(
!
isOnSameDevice
)
dataTMP
=
mem
!=
NULL
?
mem
->
AllocBuf
(
mem
->
devID
,
size
)
:
XMemAlloc
(
mem
->
devID
,
size
);
int
blockNumInMerge
=
s
->
dimSize
RDI
[
leadingDimRDI
];
int
blockNumInMerge
=
s
->
dimSize
[
leadingDim
];
int
splitSizeInGrid
=
gridSize
/
blockNumInMerge
;
int
realBlockSize
=
blockSize
*
t
->
unitSize
;
...
...
@@ -311,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
int
mergedNum
=
smalls
->
count
;
XTensor
*
s0
=
smalls
->
GetItem
(
0
);
int
whereToMergeRDI
=
s0
->
order
-
whereToMerge
-
1
;
for
(
int
i
=
0
;
i
<
s0
->
order
;
i
++
)
{
if
(
i
<=
whereToMergeRDI
)
blockSize
*=
s0
->
dimSize
RDI
[
i
];
if
(
i
>=
whereToMerge
)
blockSize
*=
s0
->
dimSize
[
i
];
else
blockNum
*=
s0
->
dimSize
RDI
[
i
];
blockNum
*=
s0
->
dimSize
[
i
];
}
CheckNTErrors
((
s0
->
unitNum
%
(
blockSize
*
blockNum
)
==
0
),
"Incorrect size!"
);
...
...
source/tensor/core/shape/Merge.h
查看文件 @
f5149a15
...
...
@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge);
/* merge two tensors into a big tensor (return an XTensor structure) */
XTensor
Merge
(
const
XTensor
&
smallA
,
const
XTensor
&
smallB
,
int
whereToMerge
);
void
Merge
(
const
XTensor
&
smallA
,
const
XTensor
&
smallB
,
XTensor
&
t
,
int
whereToMerge
);
}
// namespace nts(NiuTrans.Tensor)
#endif // __MERGE_H__
\ No newline at end of file
source/tensor/core/shape/Split.cpp
查看文件 @
f5149a15
...
...
@@ -31,7 +31,7 @@
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/*
transform a tensor by splitting it, e.g., (N, M) -> (
3, N/3, M
)
transform a tensor by splitting it, e.g., (N, M) -> (
N/3, M, 3
)
>> s - the source tensor
>> t - the target tensor (for return)
...
...
@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
CheckNTErrors
((
s
->
unitNum
==
t
->
unitNum
&&
s
->
unitSize
==
t
->
unitSize
),
"Unmatched tensors!"
);
CheckNTErrors
((
s
->
order
==
t
->
order
-
1
),
"Unmatched tensors!"
);
CheckNTErrors
((
t
->
dimSize
RDI
[
t
->
order
-
1
]
==
splitNum
),
"Incorrect tensor sizes!"
);
CheckNTErrors
((
t
->
dimSize
[
0
]
==
splitNum
),
"Incorrect tensor sizes!"
);
int
whereToSplitRDI
=
s
->
order
-
whereToSplit
-
1
;
for
(
int
i
=
0
;
i
<
s
->
order
;
i
++
)
{
if
(
i
==
whereToSplit
RDI
)
{
CheckNTErrors
((
s
->
dimSize
RDI
[
i
]
==
t
->
dimSizeRDI
[
i
]
*
splitNum
),
if
(
i
==
whereToSplit
)
{
CheckNTErrors
((
s
->
dimSize
[
i
]
==
t
->
dimSize
[
i
+
1
]
*
splitNum
),
"Unmatched tensor sizes!"
);
}
else
{
CheckNTErrors
((
s
->
dimSize
RDI
[
i
]
==
t
->
dimSizeRDI
[
i
]),
CheckNTErrors
((
s
->
dimSize
[
i
]
==
t
->
dimSize
[
i
+
1
]),
"Unmatched tensor sizes!"
);
}
}
/* for the case that we split the last dimension. Actually
(N, M) and (
3, N/3, M
) have the same memory layout */
if
(
s
->
order
-
1
==
whereToSplitRDI
)
{
(N, M) and (
N, M/3, 3
) have the same memory layout */
if
(
0
==
whereToSplit
)
{
XMemCopy
(
t
->
data
,
t
->
devID
,
s
->
data
,
s
->
devID
,
s
->
unitNum
*
s
->
unitSize
);
return
;
}
...
...
@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
s
->
order
;
i
++
)
{
if
(
i
==
whereToSplit
RDI
)
{
blockSize
*=
s
->
dimSize
RDI
[
i
]
/
splitNum
;
if
(
i
==
whereToSplit
)
{
blockSize
*=
s
->
dimSize
[
i
]
/
splitNum
;
blockNum
*=
splitNum
;
}
else
if
(
i
<
whereToSplitRDI
)
blockSize
*=
s
->
dimSize
RDI
[
i
];
else
if
(
i
>
whereToSplit
)
blockSize
*=
s
->
dimSize
[
i
];
else
blockNum
*=
s
->
dimSize
RDI
[
i
];
blockNum
*=
s
->
dimSize
[
i
];
}
CheckNTErrors
((
blockNum
%
splitNum
==
0
),
"Incorrect split number!"
);
...
...
@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
}
/*
transform a tensor by splitting it, e.g., (N, M) -> (
3, N/3, M
) (return an XTensor structure)
transform a tensor by splitting it, e.g., (N, M) -> (
N/3, M, 3
) (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
...
...
@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
CheckNTErrors
((
smalls
->
count
==
splitNum
),
"Unmatched tensors!"
);
CheckNTErrors
((
smalls
->
count
>
0
),
"Wrong input!"
);
int
whereToSplitRDI
=
big
->
order
-
whereToSplit
-
1
;
bool
uniform
=
true
;
for
(
int
i
=
0
;
i
<
smalls
->
count
;
i
++
)
{
...
...
@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
big
->
order
;
i
++
)
{
if
(
i
==
whereToSplit
RDI
)
{
blockSize
*=
big
->
dimSize
RDI
[
i
]
/
splitNum
;
if
(
i
==
whereToSplit
)
{
blockSize
*=
big
->
dimSize
[
i
]
/
splitNum
;
blockNum
*=
splitNum
;
}
else
if
(
i
<
whereToSplitRDI
)
blockSize
*=
big
->
dimSize
RDI
[
i
];
else
if
(
i
>
whereToSplit
)
blockSize
*=
big
->
dimSize
[
i
];
else
blockNum
*=
big
->
dimSize
RDI
[
i
];
blockNum
*=
big
->
dimSize
[
i
];
}
CheckNTErrors
((
blockNum
%
splitNum
==
0
),
"Incorrect split number!"
);
...
...
source/tensor/core/shape/Unsqueeze.cpp
查看文件 @
f5149a15
...
...
@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
CheckNTErrors
((
a
->
order
==
b
->
order
-
1
),
"Unmatched tensors!"
);
CheckNTErrors
((
a
->
unitSize
==
b
->
unitSize
),
"Unmatched tensors!"
);
int
dimRDI
=
b
->
order
-
dim
-
1
;
for
(
int
i
=
0
;
i
<
b
->
order
;
i
++
)
{
if
(
i
<
dim
RDI
)
{
CheckNTErrors
((
a
->
dimSize
RDI
[
i
]
==
b
->
dimSizeRDI
[
i
]),
"Unmatched tensors!"
);
if
(
i
<
dim
)
{
CheckNTErrors
((
a
->
dimSize
[
i
]
==
b
->
dimSize
[
i
]),
"Unmatched tensors!"
);
}
else
if
(
i
>
dim
RDI
)
{
CheckNTErrors
((
a
->
dimSize
RDI
[
i
-
1
]
==
b
->
dimSizeRDI
[
i
]),
"Unmatched tensors!"
);
else
if
(
i
>
dim
)
{
CheckNTErrors
((
a
->
dimSize
[
i
-
1
]
==
b
->
dimSize
[
i
]),
"Unmatched tensors!"
);
}
else
{
CheckNTErrors
((
dSize
==
b
->
dimSize
RDI
[
i
]),
"Unmatched tensors!"
);
CheckNTErrors
((
dSize
==
b
->
dimSize
[
i
]),
"Unmatched tensors!"
);
}
}
...
...
@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int
blockNumA
=
1
;
int
blockNumB
=
1
;
for
(
int
i
=
0
;
i
<
dimRDI
;
i
++
)
blockSize
*=
a
->
dimSize
RDI
[
i
];
for
(
int
i
=
dim
;
i
<
a
->
order
;
i
++
)
blockSize
*=
a
->
dimSize
[
i
];
realBlockSize
=
blockSize
*
a
->
unitSize
;
...
...
source/tensor/core/shape/Unsqueeze.cu
查看文件 @
f5149a15
...
...
@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockSize = 1;
int blockNumA = 1;
int blockNumB = 1;
int dimRDI = b->order - dim - 1;
for (int i = 0; i < dimRDI; i++)
blockSize *= a->dimSizeRDI[i];
for (int i = dim; i < a->order; i++)
blockSize *= a->dimSize[i];
blockNumA = a->unitNum / blockSize;
blockNumB = b->unitNum / blockSize;
...
...
@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (dim
RDI == 0
) {
if (dim
== b->order - 1
) {
GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
...
...
source/tensor/core/sort/Sort.cpp
查看文件 @
f5149a15
...
...
@@ -47,7 +47,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
CheckNTErrors
((
a
->
order
==
index
->
order
),
"Unmatched input tensors!"
);
CheckNTErrors
((
index
->
dataType
==
X_INT
),
"Wrong data type!"
);
int
dimRDI
=
a
->
order
-
dim
-
1
;
/* make the index tensor */
SetAscendingOrder
(
*
index
,
dim
);
...
...
@@ -60,13 +59,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
}
else
{
int
stride
=
1
;
int
strideNum
=
a
->
dimSizeRDI
[
dimRDI
];
for
(
int
i
=
0
;
i
<
dimRDI
;
i
++
)
stride
*=
a
->
dimSizeRDI
[
i
];
int
blockNum
=
1
;
for
(
int
i
=
dimRDI
+
1
;
i
<
a
->
order
;
i
++
)
blockNum
*=
a
->
dimSizeRDI
[
i
];
int
strideNum
=
a
->
dimSize
[
dim
];
for
(
int
i
=
0
;
i
<
dim
;
i
++
)
blockNum
*=
a
->
dimSize
[
i
];
for
(
int
i
=
dim
+
1
;
i
<
a
->
order
;
i
++
)
stride
*=
a
->
dimSize
[
i
];
int
blockSize
=
stride
*
strideNum
;
_CopyValues
(
a
,
b
);
...
...
source/tensor/core/sort/Sort.cu
查看文件 @
f5149a15
...
...
@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = a->order - dim - 1;
if (k < 0 || k > b->dimSizeRDI[dimRDI])
k = b->dimSizeRDI[dimRDI];
if (k < 0 || k > b->dimSize[dim])
k = b->dimSize[dim];
XMem * mem = a->mem;
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
...
...
source/tensor/core/sort/TopK.cpp
查看文件 @
f5149a15
...
...
@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors
(
index
==
NULL
||
a
->
order
==
index
->
order
,
"Unmatched input tensors!"
);
CheckNTErrors
(
index
->
dataType
==
X_INT
,
"Wrong data type!"
);
int
dimRDI
=
a
->
order
-
dim
-
1
;
for
(
int
i
=
0
;
i
<
a
->
order
;
i
++
)
{
if
(
i
==
dim
RDI
)
{
CheckNTErrors
(
b
->
dimSizeRDI
[
i
]
==
k
,
"A too large K"
);
CheckNTErrors
(
index
==
NULL
||
index
->
dimSizeRDI
[
i
]
==
k
,
"Wrong size!"
);
if
(
i
==
dim
)
{
CheckNTErrors
(
(
b
->
dimSize
[
i
]
==
k
)
,
"A too large K"
);
CheckNTErrors
(
(
index
==
NULL
||
index
->
dimSize
[
i
]
==
k
)
,
"Wrong size!"
);
}
else
{
CheckNTErrors
(
b
->
dimSizeRDI
[
i
]
==
a
->
dimSizeRDI
[
i
]
,
"Wrong size!"
);
CheckNTErrors
(
index
==
NULL
||
index
->
dimSizeRDI
[
i
]
==
a
->
dimSizeRDI
[
i
]
,
"Wrong size!"
);
CheckNTErrors
(
(
b
->
dimSize
[
i
]
==
a
->
dimSize
[
i
])
,
"Wrong size!"
);
CheckNTErrors
(
(
index
==
NULL
||
index
->
dimSize
[
i
]
==
a
->
dimSize
[
i
])
,
"Wrong size!"
);
}
}
...
...
@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors
((
a
->
dataType
==
DEFAULT_DTYPE
),
"TODO!"
);
int
stride
=
1
;
int
strideNumA
=
a
->
dimSizeRDI
[
dimRDI
];
int
strideNumB
=
b
->
dimSizeRDI
[
dimRDI
];
for
(
int
i
=
0
;
i
<
dimRDI
;
i
++
)
stride
*=
a
->
dimSizeRDI
[
i
];
int
blockNum
=
1
;
for
(
int
i
=
dimRDI
+
1
;
i
<
a
->
order
;
i
++
)
blockNum
*=
a
->
dimSizeRDI
[
i
];
int
strideNumA
=
a
->
dimSize
[
dim
];
int
strideNumB
=
b
->
dimSize
[
dim
];
for
(
int
i
=
0
;
i
<
dim
;
i
++
)
blockNum
*=
a
->
dimSize
[
i
];
for
(
int
i
=
dim
+
1
;
i
<
a
->
order
;
i
++
)
stride
*=
a
->
dimSize
[
i
];
int
blockSizeA
=
stride
*
strideNumA
;
int
blockSizeB
=
stride
*
strideNumB
;
...
...
source/tensor/core/sort/TopK.cu
查看文件 @
f5149a15
...
...
@@ -812,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
CheckNTErrors((b->dimSize[dim] == k), "A too large K");
int dimRDI = a->order - dim - 1;
int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNumA = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int workerNum = blockNum < 16 ? 64 : 32;
/* adjust the thread num according size of k for fitting the share memory size */
...
...
source/tensor/core/utilities/SetAscendingOrder.cpp
查看文件 @
f5149a15
...
...
@@ -47,7 +47,6 @@ void SetAscendingOrder(XTensor & tensor, int dim)
return
;
}
int
dimRDI
=
tensor
.
order
-
dim
-
1
;
if
(
tensor
.
devID
>=
0
){
#ifdef USE_CUDA
CudaSetAscendingOrder
(
&
tensor
,
dim
);
...
...
@@ -57,13 +56,13 @@ void SetAscendingOrder(XTensor & tensor, int dim)
}
else
{
int
stride
=
1
;
int
strideNum
=
tensor
.
dimSizeRDI
[
dimRDI
];
for
(
int
i
=
0
;
i
<
dimRDI
;
i
++
)
stride
*=
tensor
.
dimSizeRDI
[
i
];
int
blockNum
=
1
;
for
(
int
i
=
dimRDI
+
1
;
i
<
tensor
.
order
;
i
++
)
blockNum
*=
tensor
.
dimSizeRDI
[
i
];
int
strideNum
=
tensor
.
dimSize
[
dim
];
for
(
int
i
=
0
;
i
<
dim
;
i
++
)
blockNum
*=
tensor
.
dimSize
[
i
];
for
(
int
i
=
dim
+
1
;
i
<
tensor
.
order
;
i
++
)
stride
*=
tensor
.
dimSize
[
i
];
for
(
int
k
=
0
;
k
<
blockNum
;
k
++
){
for
(
int
j
=
0
;
j
<
strideNum
;
j
++
){
...
...
source/tensor/core/utilities/SetAscendingOrder.cu
查看文件 @
f5149a15
...
...
@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
{
CheckNTErrors((a->dataType == X_INT), "TODO!");
int dimRDI = a->order - dim - 1;
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for(int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for(int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int gridSize[3];
int blockSize[3];
...
...
source/tensor/function/LogSoftmax.cpp
查看文件 @
f5149a15
...
...
@@ -50,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
return
;
}
int
leadDimRDI
=
x
->
order
-
leadDim
-
1
;
if
(
!
x
->
isSparse
&&
!
y
->
isSparse
&&
x
->
dataType
==
DEFAULT_DTYPE
&&
y
->
dataType
==
DEFAULT_DTYPE
)
{
...
...
@@ -70,13 +69,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
XTensor
*
blockMax
=
NULL
;
XTensor
*
blockSum
=
NULL
;
int
dimensionSize
=
y
->
dimSize
RDI
[
leadDimRDI
];
int
dimensionSize
=
y
->
dimSize
[
leadDim
];
int
stride
=
1
;
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
leadDimRDI
;
i
++
)
stride
*=
y
->
dimSize
RDI
[
i
];
for
(
int
i
=
leadDim
+
1
;
i
<
y
->
order
;
i
++
)
stride
*=
y
->
dimSize
[
i
];
blockSize
=
stride
*
dimensionSize
;
blockNum
=
y
->
unitNum
/
blockSize
;
...
...
@@ -87,7 +86,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
_ReduceSum
(
x
,
sum
,
leadDim
,
max
,
1.0
F
,
true
);
if
(
x
->
devID
>=
0
)
{
if
(
leadDim
RDI
==
0
){
if
(
leadDim
==
x
->
order
-
1
){
blockSize
=
y
->
unitNum
;
blockNum
=
1
;
blockx
=
NewTensor2D
(
blockSize
/
dimensionSize
,
-
dimensionSize
,
x
->
dataType
,
x
->
devID
,
mem
);
...
...
@@ -138,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockMax
->
data
=
mp
;
blockSum
->
data
=
sp
;
#ifdef USE_CUDA
if
(
leadDim
RDI
==
0
)
if
(
leadDim
==
x
->
order
-
1
)
_CudaLogSoftmaxSumMax
(
blockx
,
blocky
,
1
,
blockSum
,
blockMax
);
else
_CudaLogSoftmaxSumMax
(
blockx
,
blocky
,
leadDim
,
blockSum
,
blockMax
);
...
...
@@ -299,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if
(
leadDim
<
0
)
leadDim
=
y
->
order
-
1
;
int
leadDimRDI
=
y
->
order
-
leadDim
-
1
;
#ifdef USE_CUDA
if
(
gold
->
devID
>=
0
)
{
_CudaLogSoftmaxBackward
(
gold
,
y
,
x
,
dedy
,
dedx
,
padding
,
leadDim
,
lossName
);
...
...
@@ -307,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
int
dimensionSize
=
y
->
dimSize
RDI
[
leadDimRDI
];
int
dimensionSize
=
y
->
dimSize
[
leadDim
];
int
stride
=
1
;
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
leadDimRDI
;
i
++
)
stride
*=
y
->
dimSize
RDI
[
i
];
for
(
int
i
=
leadDim
+
1
;
i
<
y
->
order
;
i
++
)
stride
*=
y
->
dimSize
[
i
];
blockSize
=
stride
*
dimensionSize
;
blockNum
=
y
->
unitNum
/
blockSize
;
...
...
@@ -339,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int
key
=
gold
->
GetKeyInSparse
(
i
);
DTYPE
value
=
gold
->
GetInSparse
(
i
);
int
offset
=
key
;
if
(
dedx
->
dimSize
RDI
[
0
]
!=
gm
)
{
if
(
dedx
->
dimSize
[
dedx
->
order
-
1
]
!=
gm
)
{
int
mi
=
key
%
gm
;
int
ni
=
key
/
gm
;
int
key2
=
ni
*
dedx
->
dimSize
RDI
[
0
]
+
mi
;
int
key2
=
ni
*
dedx
->
dimSize
[
dedx
->
order
-
1
]
+
mi
;
offset
=
key2
;
}
if
(
key
>=
0
&&
key
<
size
)
...
...
@@ -396,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int
key
=
gold
->
GetKeyInSparse
(
i
);
DTYPE
value
=
gold
->
GetInSparse
(
i
);
int
offset
=
key
;
if
(
dedx
->
dimSize
RDI
[
0
]
!=
gm
)
{
if
(
dedx
->
dimSize
[
dedx
->
order
-
1
]
!=
gm
)
{
int
mi
=
key
%
gm
;
int
ni
=
key
/
gm
;
int
key2
=
ni
*
dedx
->
dimSize
RDI
[
0
]
+
mi
;
int
key2
=
ni
*
dedx
->
dimSize
[
dedx
->
order
-
1
]
+
mi
;
offset
=
key2
;
}
if
(
key
>=
0
&&
key
<
size
)
...
...
@@ -431,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* for columns with no xs we set dE/ds = 0 */
if
(
gold
!=
NULL
&&
gold
->
isSparse
)
{
CheckNTErrors
((
gold
->
order
==
2
),
"The gold standard tensor must be of order 2!"
);
if
((
gold
->
dimSize
[
1
]
>
1
&&
!
gold
->
isAllValued
[
0
])
||
gold
->
dimSize
[
1
]
!=
dedx
->
dimSize
RDI
[
0
])
{
if
((
gold
->
dimSize
[
1
]
>
1
&&
!
gold
->
isAllValued
[
0
])
||
gold
->
dimSize
[
1
]
!=
dedx
->
dimSize
[
dedx
->
order
-
1
])
{
int
gn
=
gold
->
dimSize
[
0
];
int
gm
=
gold
->
dimSize
[
1
];
int
sm
=
dedx
->
dimSize
RDI
[
0
];
int
sn
=
dedx
->
dimSize
RDI
[
1
];
int
sm
=
dedx
->
dimSize
[
dedx
->
order
-
1
];
int
sn
=
dedx
->
dimSize
[
dedx
->
order
-
2
];
int
*
flags
=
new
int
[
sm
];
memset
(
flags
,
0
,
sizeof
(
int
)
*
sm
);
...
...
source/tensor/function/LogSoftmax.cu
查看文件 @
f5149a15
...
...
@@ -385,13 +385,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
"Tensors used in log softmax are not on the same GPU.");
CheckNTErrors((gold != NULL), "No x gold standard is found!");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i =
0; i < leadDimRDI
; i++)
stride *= y->dimSize
RDI
[i];
for (int i =
leadDim + 1; i < y->order
; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
...
...
source/tensor/function/Loss.cpp
查看文件 @
f5149a15
...
...
@@ -50,18 +50,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
if
(
output
->
devID
<
0
)
{
CheckNTErrors
((
gLen
>=
0
&&
gLen
<=
output
->
unitNum
),
"Illegal input length!"
);
CheckNTErrors
((
_IsSameShaped
(
gold
,
output
)),
"The input tensors must be of the same size!"
);
CheckNTErrors
((
gold
->
dimSize
RDI
[
0
]
==
1
&&
output
->
dimSizeRDI
[
0
]
==
1
),
"TODO!"
);
CheckNTErrors
((
gold
->
dimSize
[
gold
->
order
-
1
]
==
1
&&
output
->
dimSize
[
output
->
order
-
1
]
==
1
),
"TODO!"
);
CheckNTErrors
((
gold
->
order
>
leadDim
&&
leadDim
>=
0
),
"Illegal leading dimension!"
);
CheckNTErrors
((
gold
->
dataType
==
DEFAULT_DTYPE
&&
output
->
dataType
==
DEFAULT_DTYPE
),
"TODO!"
);
int
leadDimRDI
=
output
->
order
-
leadDim
-
1
;
int
dimensionSize
=
output
->
dimSizeRDI
[
leadDimRDI
];
int
dimensionSize
=
output
->
dimSize
[
leadDim
];
int
stride
=
1
;
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
leadDimRDI
;
i
++
)
stride
*=
output
->
dimSize
RDI
[
i
];
for
(
int
i
=
leadDim
+
1
;
i
<
output
->
order
;
i
++
)
stride
*=
output
->
dimSize
[
i
];
blockSize
=
stride
*
dimensionSize
;
blockNum
=
output
->
unitNum
/
blockSize
;
...
...
@@ -207,18 +206,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
{
CheckNTErrors
(
gLen
>=
0
&&
gLen
<=
output
->
unitNum
,
"Illegal input length!"
);
CheckNTErrors
(
_IsSameShaped
(
gold
,
output
),
"The input tensors must be of the same size!"
);
CheckNTErrors
(
gold
->
dimSize
RDI
[
0
]
==
1
&&
output
->
dimSizeRDI
[
0
]
==
1
,
"TODO!"
);
CheckNTErrors
(
gold
->
dimSize
[
gold
->
order
-
1
]
==
1
&&
output
->
dimSize
[
output
->
order
-
1
]
==
1
,
"TODO!"
);
CheckNTErrors
(
gold
->
order
>
leadDim
&&
leadDim
>=
0
,
"Illegal leading dimension!"
);
CheckNTErrors
(
gold
->
dataType
==
DEFAULT_DTYPE
&&
output
->
dataType
==
DEFAULT_DTYPE
,
"TODO!"
);
int
leadDimRDI
=
output
->
order
-
leadDim
-
1
;
int
dimensionSize
=
output
->
dimSizeRDI
[
leadDimRDI
];
int
dimensionSize
=
output
->
dimSize
[
leadDim
];
int
stride
=
1
;
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
leadDimRDI
;
i
++
)
stride
*=
output
->
dimSize
RDI
[
i
];
for
(
int
i
=
leadDim
+
1
;
i
<
output
->
order
;
i
++
)
stride
*=
output
->
dimSize
[
i
];
blockSize
=
stride
*
dimensionSize
;
blockNum
=
output
->
unitNum
/
blockSize
;
...
...
@@ -409,21 +407,20 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
CheckNTErrors
(
t
->
order
>
leadDim
,
"Illegal leading dimension!"
);
CheckNTErrors
(
t
->
dataType
==
DEFAULT_DTYPE
&&
y
->
dataType
==
DEFAULT_DTYPE
,
"TODO!"
);
int
leadDimRDI
=
leadDim
>=
0
?
y
->
order
-
leadDim
-
1
:
-
1
;
if
(
leadDimRDI
<
0
){
leadDimRDI
=
y
->
order
-
1
;
if
(
leadDim
<
0
)
{
leadDim
=
0
;
tBeg
=
0
;
yBeg
=
0
;
tLen
=
y
->
dimSize
RDI
[
leadDimRDI
];
tLen
=
y
->
dimSize
[
leadDim
];
}
int
dimensionSize
=
y
->
dimSize
RDI
[
leadDimRDI
];
int
dimensionSize
=
y
->
dimSize
[
leadDim
];
int
stride
=
1
;
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
leadDimRDI
;
i
++
)
stride
*=
y
->
dimSize
RDI
[
i
];
for
(
int
i
=
leadDim
+
1
;
i
<
y
->
order
;
i
++
)
stride
*=
y
->
dimSize
[
i
];
blockSize
=
stride
*
dimensionSize
;
blockNum
=
y
->
unitNum
/
blockSize
;
...
...
source/tensor/function/Loss.cu
查看文件 @
f5149a15
...
...
@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
{
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSize
RDI[0] == 1 && y->dimSizeRDI[0
] == 1), "TODO!");
CheckNTErrors((gold->dimSize
[gold->order - 1] == 1 && y->dimSize[y->order - 1
] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
...
...
@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
delete diff;
diff = diffNew;
...
...
@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
delete diff;
diff = diffNew;
...
...
@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
delete diff;
diff = diffNew;
...
...
@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
"The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
if(leadDimRDI < 0){
leadDimRDI = y->order - 1;
if (leadDim < 0) {
leadDim = 0;
tBeg = 0;
yBeg = 0;
tLen = y->dimSize
RDI[leadDimRDI
];
tLen = y->dimSize
[leadDim
];
}
int dimensionSize = y->dimSize
RDI[leadDimRDI
];
int dimensionSize = y->dimSize
[leadDim
];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
int size = 1;
for(int i =
0; i < leadDimRDI
; i++)
stride *= y->dimSize
RDI
[i];
for(int i =
leadDim + 1; i < y->order
; i++)
stride *= y->dimSize[i];
size = tLen * stride;
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
...
...
source/tensor/function/Softmax.cpp
查看文件 @
f5149a15
...
...
@@ -41,7 +41,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
if
(
leadDim
<
0
)
leadDim
=
x
->
order
-
1
;
int
leadDimRDI
=
x
->
order
-
leadDim
-
1
;
if
(
!
x
->
isSparse
&&
!
y
->
isSparse
&&
x
->
dataType
==
y
->
dataType
){
int
*
dimSize
=
new
int
[
x
->
order
-
1
];
for
(
int
i
=
0
;
i
<
x
->
order
;
i
++
){
...
...
@@ -71,13 +70,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else
{
CheckNTErrors
((
x
->
dataType
==
DEFAULT_DTYPE
),
"TODO!"
);
int
dimensionSize
=
y
->
dimSize
RDI
[
leadDimRDI
];
int
dimensionSize
=
y
->
dimSize
[
leadDim
];
int
stride
=
1
;
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
leadDimRDI
;
i
++
)
stride
*=
y
->
dimSize
RDI
[
i
];
for
(
int
i
=
leadDim
+
1
;
i
<
y
->
order
;
i
++
)
stride
*=
y
->
dimSize
[
i
];
blockSize
=
stride
*
dimensionSize
;
blockNum
=
y
->
unitNum
/
blockSize
;
...
...
@@ -207,8 +206,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if
(
leadDim
<
0
)
leadDim
=
y
->
order
-
1
;
int
leadDimRDI
=
y
->
order
-
leadDim
-
1
;
#ifdef USE_CUDA
if
(
y
->
devID
>=
0
){
_CudaSoftmaxBackward
(
gold
,
y
,
x
,
dedy
,
dedx
,
padding
,
leadDim
,
lossName
);
...
...
@@ -216,12 +213,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
int
dimensionSize
=
y
->
dimSize
RDI
[
leadDimRDI
];
int
dimensionSize
=
y
->
dimSize
[
leadDim
];
int
stride
=
1
;
int
blockSize
=
1
;
int
blockNum
=
1
;
for
(
int
i
=
0
;
i
<
leadDimRDI
;
i
++
)
stride
*=
y
->
dimSize
RDI
[
i
];
for
(
int
i
=
leadDim
+
1
;
i
<
y
->
order
;
i
++
)
stride
*=
y
->
dimSize
[
i
];
blockSize
=
stride
*
dimensionSize
;
blockNum
=
y
->
unitNum
/
blockSize
;
...
...
source/tensor/function/Softmax.cu
查看文件 @
f5149a15
...
...
@@ -226,14 +226,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
CheckNTErrors((_IsSameShaped(x, y)), "Input tensors must be of the same size!");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i =
0; i < leadDimRDI
; i++)
stride *= y->dimSize
RDI
[i];
for(int i =
leadDim + 1; i < y->order
; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论