Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
T
Tensor.LowPrecision
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
魏冰浩
Tensor.LowPrecision
Commits
7da1bec1
Commit
7da1bec1
authored
Jul 12, 2019
by
单韦乔
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
clip
logsoftmax multipltdim negate scaleandshift
parent
6a3d713a
显示空白字符变更
内嵌
并排
正在显示
34 个修改的文件
包含
1266 行增加
和
1860 行删除
+1266
-1860
source/network/Main.cpp
+11
-288
source/sample/transformer/T2TTrainer.cpp
+3
-1
source/tensor/XTensor.cpp
+0
-14
source/tensor/core/arithmetic/Div.cu
+9
-44
source/tensor/core/arithmetic/Div.cuh
+6
-6
source/tensor/core/arithmetic/MatrixMul.cpp
+2
-57
source/tensor/core/arithmetic/MatrixMul.h
+0
-3
source/tensor/core/arithmetic/MultiplyDim.cu
+1
-1
source/tensor/core/arithmetic/MultiplyDim.cuh
+8
-0
source/tensor/core/arithmetic/Negate.cuh
+2
-5
source/tensor/core/arithmetic/Sum.cuh
+3
-0
source/tensor/core/arithmetic/XTensorBLAS.cu
+24
-145
source/tensor/core/getandset/ConvertDataType.cuh
+16
-0
source/tensor/core/movement/Gather.cu
+11
-26
source/tensor/core/reduce/ReduceMax.cu
+1
-5
source/tensor/core/reduce/ReduceSum.cu
+5
-3
source/tensor/function/HardTanH.cu
+12
-20
source/tensor/function/LogSoftmax.cu
+8
-8
source/tensor/function/LogSoftmax.cuh
+21
-0
source/tensor/test/TClip.cpp
+274
-0
source/tensor/test/TConvertDataType.cpp
+2
-147
source/tensor/test/TDiv.cpp
+0
-126
source/tensor/test/TGather.cpp
+1
-115
source/tensor/test/TGather.h
+0
-1
source/tensor/test/THardTanH.cpp
+0
-86
source/tensor/test/TLogSoftmax.cpp
+205
-8
source/tensor/test/TMatrixMul.cpp
+0
-440
source/tensor/test/TMatrixMul.h
+0
-1
source/tensor/test/TMultiplyDim.cpp
+216
-0
source/tensor/test/TNegate.cpp
+89
-0
source/tensor/test/TReduceMax.cpp
+4
-122
source/tensor/test/TReduceSum.cpp
+1
-131
source/tensor/test/TScaleAndShift.cpp
+274
-0
source/tensor/test/Test.cpp
+57
-57
没有找到文件。
source/network/Main.cpp
查看文件 @
7da1bec1
...
...
@@ -38,16 +38,6 @@ void SumDimTest();
void
SplitBackwardTest
();
void
MemTest
();
void
xcTest
();
void
ConvertDataTypeTest
();
void
ConvertDataTypeBackwardTest
();
void
SumFP16Test
();
void
GatherFP16Test
();
void
HardTanHFP16Test
();
void
ReduceMaxFP16Test
();
void
ReduceSumFP16Test
();
void
LogSoftmaxFP16Test
();
void
ClipFP16Test
();
void
ScaleAndShiftFP16Test
();
using
namespace
nts
;
using
namespace
fnnlm
;
...
...
@@ -66,291 +56,24 @@ int main(int argc, const char ** argv )
//return 0;
//Test();
//return 0;
//ConvertDataTypeTest();
//return 0;
//ConvertDataTypeBackwardTest();
//return 0;
//SumFP16Test();
//return 0;
//GatherFP16Test();
//return 0;
//HardTanHFP16Test();
//return 0;
//ReduceMaxFP16Test();
//return 0;
//ReduceSumFP16Test();
//return 0;
//LogSoftmaxFP16Test();
//return 0;
//ClipFP16Test();
//return 0;
//ScaleAndShiftFP16Test();
//return 0;
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-test"
))
//if (argc > 1 && !strcmp(argv[1], "-test"))
Test
();
else
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-fnnlm"
))
FNNLMMain
(
argc
-
1
,
argv
+
1
);
else
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-t2t"
))
TransformerMain
(
argc
-
1
,
argv
+
1
);
else
{
fprintf
(
stderr
,
"Thanks for using NiuTrans.Network! This is a library for building
\n
"
);
fprintf
(
stderr
,
"neural networks in an easy way.
\n\n
"
);
fprintf
(
stderr
,
"Run this program with
\"
-test
\"
for unit test!
\n
"
);
fprintf
(
stderr
,
"Or run this program with
\"
-fnnlm
\"
for sample FNNLM!
\n
"
);
}
//
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
//
FNNLMMain(argc - 1, argv + 1);
//
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
//
TransformerMain(argc - 1, argv + 1);
//
else{
//
fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
//
fprintf(stderr, "neural networks in an easy way. \n\n");
//
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
//
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
//
}
//_CrtDumpMemoryLeaks();
return
0
;
}
void
ScaleAndShiftFP16Test
()
{
XTensor
a
;
XTensor
intA
;
XTensor
b
;
XTensor
intB
;
InitTensor2D
(
&
a
,
1
,
10
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
10.0
F
,
10.0
F
);
a
.
Dump
(
stderr
,
"a:"
);
intA
=
ConvertDataType
(
a
,
X_INT
);
intB
=
ScaleAndShift
(
intA
,
2
,
0
);
b
=
ConvertDataType
(
intB
,
X_FLOAT
);
b
.
Dump
(
stderr
,
"b:"
);
}
void
ClipFP16Test
()
{
XTensor
a
;
XTensor
intA
;
XTensor
b
;
XTensor
intB
;
InitTensor2D
(
&
a
,
1
,
10
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
10.0
F
,
10.0
F
);
a
.
Dump
(
stderr
,
"a:"
);
intA
=
ConvertDataType
(
a
,
X_INT
);
intB
=
Clip
(
intA
,
-
1
,
1
);
b
=
ConvertDataType
(
intB
,
X_FLOAT
);
b
.
Dump
(
stderr
,
"b:"
);
}
void
LogSoftmaxFP16Test
()
{
XTensor
a
;
XTensor
halfA
;
XTensor
b
;
XTensor
halfB
;
InitTensor3D
(
&
a
,
2
,
2
,
2
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
1.0
F
,
1.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
b
=
LogSoftmax
(
a
,
1
);
halfB
=
LogSoftmax
(
halfA
,
1
);
b
.
Dump
(
stderr
,
"sum:"
);
halfB
.
Dump
(
&
halfB
,
stderr
,
"halfSum:"
);
}
void
ReduceSumFP16Test
()
{
XTensor
a
;
XTensor
sum
;
XTensor
halfA
;
XTensor
halfSum
;
InitTensor2D
(
&
a
,
10
,
10
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
5.0
F
,
5.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
sum
=
ReduceSum
(
a
,
1
);
halfSum
=
ReduceSum
(
halfA
,
1
);
sum
.
Dump
(
stderr
,
"sum:"
);
halfSum
.
Dump
(
&
halfSum
,
stderr
,
"halfSum:"
);
}
void
ReduceMaxFP16Test
()
{
XTensor
a
;
XTensor
max
;
XTensor
halfA
;
XTensor
halfMax
;
InitTensor2D
(
&
a
,
10
,
10
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
5.0
F
,
5.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
max
=
ReduceMax
(
a
,
1
);
halfMax
=
ReduceMax
(
halfA
,
1
);
max
.
Dump
(
stderr
,
"max:"
);
halfMax
.
Dump
(
&
halfMax
,
stderr
,
"halfMax:"
);
}
void
HardTanHFP16Test
()
{
XTensor
a
;
XTensor
b
;
XTensor
halfA
;
XTensor
halfB
;
InitTensor2D
(
&
a
,
5
,
5
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
5
,
5
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
1.0
F
,
4.0
F
);
b
.
SetDataRand
(
-
1.0
F
,
4.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfB
=
ConvertDataType
(
b
,
X_FLOAT16
);
a
.
Dump
(
stderr
,
"a:"
);
b
.
Dump
(
stderr
,
"b:"
);
b
=
HardTanH
(
a
);
halfB
=
HardTanH
(
halfA
);
b
.
Dump
(
stderr
,
"b:"
);
halfB
.
Dump
(
&
halfB
,
stderr
,
"halfB:"
);
}
void
GatherFP16Test
()
{
XTensor
a
;
XTensor
b
;
XTensor
srcIndex
;
XTensor
halfA
;
XTensor
halfB
;
XTensor
c
;
InitTensor1D
(
&
srcIndex
,
2
,
X_INT
,
0
);
int
m
=
0
;
int
n
=
1
;
srcIndex
.
Set1DInt
(
m
,
0
);
srcIndex
.
Set1DInt
(
n
,
1
);
InitTensor2D
(
&
a
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
halfB
,
2
,
2
,
X_FLOAT16
,
0
);
a
.
SetDataRand
(
-
5.0
F
,
5.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
a
.
Dump
(
stderr
,
"a:"
);
_Gather
(
&
a
,
&
b
,
&
srcIndex
);
b
.
Dump
(
stderr
,
"b:"
);
_Gather
(
&
halfA
,
&
halfB
,
&
srcIndex
);
c
=
ConvertDataType
(
halfB
,
X_FLOAT
);
c
.
Dump
(
stderr
,
"c:"
);
}
void
SumFP16Test
()
{
XTensor
a
;
XTensor
b
;
XTensor
halfA
;
XTensor
halfB
;
InitTensor2D
(
&
a
,
5
,
5
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
5
,
5
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
1.0
F
,
4.0
F
);
b
.
SetDataRand
(
-
1.0
F
,
4.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfB
=
ConvertDataType
(
b
,
X_FLOAT16
);
a
.
Dump
(
stderr
,
"a:"
);
b
.
Dump
(
stderr
,
"b:"
);
b
=
Sum
(
a
,
b
,
-
0.4
F
);
halfB
=
Sum
(
halfA
,
halfB
,
-
0.4
F
);
b
.
Dump
(
stderr
,
"b:"
);
halfB
.
Dump
(
&
halfB
,
stderr
,
"halfB:"
);
}
void
ConvertDataTypeTest
()
{
int
rnum
=
0
;
for
(
int
i
=
0
;
i
<=
rnum
;
i
++
)
{
XTensor
a
;
InitTensor2D
(
&
a
,
2
,
2
,
X_FLOAT
,
0
);
XTensor
halfa
;
InitTensor2D
(
&
halfa
,
2
,
2
,
X_FLOAT16
,
0
);
XTensor
a1
;
InitTensor2D
(
&
a1
,
2
,
2
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
10.0
F
,
10.0
F
);
a
.
Dump
(
stderr
,
"a:"
);
halfa
=
ConvertDataType
(
a
,
X_FLOAT16
);
a1
=
ConvertDataType
(
halfa
,
X_FLOAT
);
a1
.
Dump
(
stderr
,
"halfa:"
);
}
}
void
ConvertDataTypeBackwardTest
()
{
int
rnum
=
0
;
for
(
int
i
=
0
;
i
<=
rnum
;
i
++
)
{
XTensor
a
;
InitTensor2D
(
&
a
,
2
,
2
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
2.0
F
,
2.0
F
);
a
.
Dump
(
stderr
,
"a:"
);
XTensor
halfA
;
XTensor
a1
;
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
a1
=
ConvertDataType
(
halfA
,
X_FLOAT
);
a1
.
grad
=
NewTensor
(
&
a1
);
a1
.
grad
->
SetDataRand
(
3.0
F
,
3.0
F
);
a1
.
grad
->
Dump
(
stderr
,
"a1.grad:"
);
XNet
testBackward
;
printf
(
"1"
);
testBackward
.
Backward
(
a1
);
printf
(
"2"
);
halfA
.
grad
->
Dump
(
stderr
,
"halfA.grad:"
);
a
.
grad
->
Dump
(
stderr
,
"a.grad:"
);
}
}
XTensor
*
stack
(
XList
&
list
,
int
leadingDim
)
{
size_t
size
=
list
.
count
;
...
...
source/sample/transformer/T2TTrainer.cpp
查看文件 @
7da1bec1
...
...
@@ -249,6 +249,8 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
lossTensor
=
CrossEntropy
(
output
,
labelOnehot
,
paddingDec
);
//lossTensor = CrossEntropy(output, labelOnehot);
float
prob
=
ReduceSumAll
(
lossTensor
);
printf
(
"%f
\n
"
,
prob
);
exit
(
0
);
DTYPE
lossLocal
=
prob
/
wc
;
bool
doUpdate
=
(
!
IsNAN
(
lossLocal
)
&&
!
IsINF
(
lossLocal
)
&&
lossLocal
<
1e3
F
);
...
...
@@ -293,7 +295,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
break
;
}
if
(
step
%
10
0
==
0
)
{
if
(
step
%
10
==
0
)
{
double
elapsed
=
GetClockSec
()
-
startT
;
XPRINT8
(
0
,
stderr
,
"[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f"
,
elapsed
,
step
,
epoch
,
wordCountTotal
,
wordCountBatch
,
loss
/
wordCount
,
exp
(
loss
/
wordCount
),
exp
(
prob
/
wc
));
...
...
source/tensor/XTensor.cpp
查看文件 @
7da1bec1
...
...
@@ -48,7 +48,6 @@
#include "core/math/ScaleAndShift.h"
#include "core/getandset/SetData.h"
#include "function/Identity.h"
#include "core/getandset/ConvertDataType.h"
#ifdef USE_CUDA
...
...
@@ -1765,22 +1764,9 @@ dump data to a file
*/
void
XTensor
::
Dump
(
const
XTensor
*
tensor
,
FILE
*
file
,
const
char
*
label
,
const
int
n
,
const
int
beg
,
const
int
verbose
)
{
if
(
tensor
->
dataType
==
X_FLOAT
)
{
XTensor
a
(
tensor
->
order
,
tensor
->
dimSize
,
tensor
->
dataType
,
tensor
->
denseRatio
,
tensor
->
devID
,
tensor
->
mem
);
_CopyValues
(
tensor
,
&
a
);
a
.
Dump
(
file
,
label
,
n
,
beg
,
verbose
);
}
else
if
(
tensor
->
dataType
==
X_FLOAT16
)
{
XTensor
a
(
tensor
->
order
,
tensor
->
dimSize
,
X_FLOAT
,
tensor
->
denseRatio
,
tensor
->
devID
,
tensor
->
mem
);
_ConvertDataType
(
tensor
,
&
a
);
a
.
Dump
(
file
,
label
,
n
,
beg
,
verbose
);
}
else
{
ShowNTErrors
(
"TO DO!"
);
}
}
/*
...
...
source/tensor/core/arithmetic/Div.cu
查看文件 @
7da1bec1
...
...
@@ -17,7 +17,6 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-05 float16 added
*/
#include "../../XDevice.h"
...
...
@@ -35,9 +34,8 @@ division of data arrays in a element-wise manner c(i) = a(i)/b(i)
>> c - result data array
>> size - size of c
*/
template <class T>
__global__
void KernelDivElementWise(
T * a, T * b, T
* c, int size)
void KernelDivElementWise(
DTYPE * a, DTYPE * b, DTYPE
* c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
...
...
@@ -53,9 +51,8 @@ division of data arrays in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i)
>> size - size of c
>> alpha - the coefficient
*/
template <class T>
__global__
void KernelDivElementWiseV2(
T * a, T * b, T * c, int size, T
alpha)
void KernelDivElementWiseV2(
DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE
alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
...
...
@@ -78,13 +75,13 @@ where |a_lead| means the size of the leading dimension of a
>> ldSizeC - size of the leading dimension of c
>> blockNum - number of blocks
*/
template<
class T,
int nonZeroAlpha> __global__
void KernelDivElementWiseTensorDynamic(
T * a, T * b, T * c, T
alpha,
template<int nonZeroAlpha> __global__
void KernelDivElementWiseTensorDynamic(
DTYPE * a, DTYPE * b, DTYPE * c, DTYPE
alpha,
int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
{
__shared__
T
* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__
T
* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__
T
* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__
DTYPE
* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__
DTYPE
* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__
DTYPE
* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -172,49 +169,17 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (alpha == 0) {
KernelDivElementWiseTensorDynamic<
DTYPE,
0> << <blocks, threads >> >
KernelDivElementWiseTensorDynamic<0> << <blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
else {
KernelDivElementWiseTensorDynamic<
DTYPE,
1> << <blocks, threads >> >
KernelDivElementWiseTensorDynamic<1> << <blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
}
}
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
int cudaGridSize[3];
int cudaBlockSize[3];
unsigned short temp = FloatToFloat16(alpha);
half alpha1 = *((half *)&temp);
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelDivElementWise << <blocks, threads >> > ((__half *)a->data, (__half *)b->data, (__half *)c->data, c->unitNum);
else
KernelDivElementWiseV2 << <blocks, threads >> > ((__half *)a->data, (__half *)b->data, (__half *)c->data, c->unitNum, alpha1);
}
else {
GDevs.GetCudaThread2D(c->devID, stride * blockNum, dimensionSizeC, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (alpha == 0) {
KernelDivElementWiseTensorDynamic<__half, 0> << <blocks, threads >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data, 0,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
else {
KernelDivElementWiseTensorDynamic<__half, 1> << <blocks, threads >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data, alpha1,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
...
...
source/tensor/core/arithmetic/Div.cuh
查看文件 @
7da1bec1
...
...
@@ -29,16 +29,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* division of two tensors in a element-wise manner c(i) = a(i)/b(i) */
template<class T>
__global__
void KernelDivElementWise(
T * a, T * b, T
* c, int size);
__global__
void KernelDivElementWise(
DTYPE * a, DTYPE * b, DTYPE
* c, int size);
/* division of two tensors in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i) */
template<class T>
__global__
void KernelDivElementWiseV2(
T * a, T * b, T * c, int size, T
alpha);
__global__
void KernelDivElementWiseV2(
DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE
alpha);
/* division of two tensors in a element-wise manner c(i) = a(i)/b(i)+ \alpha*c(i) */
template<
class T,
int nonZeroAlpha>__global__
void KernelDivElementWiseTensorDynamic(
T * a, T * b, T * c, T
alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
template<int nonZeroAlpha>__global__
void KernelDivElementWiseTensorDynamic(
DTYPE * a, DTYPE * b, DTYPE * c, DTYPE
alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
/* element-wise division of two tensors */
void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
...
...
source/tensor/core/arithmetic/MatrixMul.cpp
查看文件 @
7da1bec1
...
...
@@ -54,6 +54,8 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor
*
c
,
DTYPE
alpha
,
DTYPE
beta
,
XPRunner
*
parallelRunner
)
{
CheckNTErrors
(
a
&&
b
&&
c
,
"Empty input tensors!"
);
CheckNTErrors
(
a
->
dataType
==
b
->
dataType
&&
a
->
dataType
==
c
->
dataType
,
"Input tensors should have the same data type!"
);
CheckNTErrors
(
a
->
order
>=
2
&&
b
->
order
>=
2
&&
c
->
order
>=
2
,
"Input tensors must have a order >= 2!"
);
CheckNTErrors
(
c
->
order
==
a
->
order
+
b
->
order
-
2
,
"wrong tensor order"
)
...
...
@@ -300,63 +302,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
return
c
;
}
/*
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether teh matrices in b are transposed
>> dataType - indicates what datatype is needed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor
MatrixMul
(
const
XTensor
&
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
&
b
,
MATRIX_TRANS_TYPE
transposedB
,
TENSOR_DATA_TYPE
dataType
,
DTYPE
alpha
,
XPRunner
*
parallelRunner
)
{
CheckNTErrors
(
a
.
dataType
==
b
.
dataType
,
"Input tensors should have the same data type!"
);
CheckNTErrors
(
a
.
order
>=
2
&&
b
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
int
an
=
transposedA
==
X_TRANS
?
a
.
dimSizeRDI
[
0
]
:
a
.
dimSizeRDI
[
1
];
int
am
=
transposedA
==
X_TRANS
?
a
.
dimSizeRDI
[
1
]
:
a
.
dimSizeRDI
[
0
];
int
bn
=
transposedB
==
X_TRANS
?
b
.
dimSizeRDI
[
0
]
:
b
.
dimSizeRDI
[
1
];
int
bm
=
transposedB
==
X_TRANS
?
b
.
dimSizeRDI
[
1
]
:
b
.
dimSizeRDI
[
0
];
CheckNTErrors
(
am
==
bn
,
"Unmatched tensors in multiplication!"
);
int
order
=
a
.
order
+
b
.
order
-
2
;
int
sub
=
0
;
int
*
dimSize
=
new
int
[
order
];
for
(
int
i
=
2
;
i
<
a
.
order
;
i
++
)
dimSize
[
sub
++
]
=
a
.
dimSizeRDI
[
a
.
order
+
1
-
i
];
for
(
int
i
=
2
;
i
<
b
.
order
;
i
++
)
dimSize
[
sub
++
]
=
b
.
dimSizeRDI
[
b
.
order
+
1
-
i
];
dimSize
[
sub
++
]
=
an
;
dimSize
[
sub
++
]
=
bm
;
float
dr
=
(
!
a
.
isSparse
||
!
b
.
isSparse
)
?
1.0
F
:
MAX
(
a
.
denseRatio
,
b
.
denseRatio
);
XTensor
c
(
order
,
dimSize
,
dataType
,
dr
,
a
.
devID
,
a
.
mem
);
c
.
SetTMPFlag
();
/* call _MatrixMul function */
_MatrixMul
(
&
a
,
transposedA
,
&
b
,
transposedB
,
&
c
,
alpha
,
0
,
parallelRunner
);
/* tensor connections */
XLink
::
MakeLink
(
&
a
,
&
b
,
&
c
,
MATH_MATRIXMUL
);
XLink
::
AddParamToHeadTrans
(
&
c
,
transposedA
);
XLink
::
AddParamToHeadTrans
(
&
c
,
transposedB
);
XLink
::
AddParamToHead
(
&
c
,
alpha
);
/* destroy variables */
delete
[]
dimSize
;
return
c
;
}
void
MatrixMul
(
const
XTensor
&
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
&
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
&
c
,
DTYPE
alpha
,
XPRunner
*
parallelRunner
,
bool
requireLink
)
...
...
source/tensor/core/arithmetic/MatrixMul.h
查看文件 @
7da1bec1
...
...
@@ -59,9 +59,6 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
XTensor
MatrixMul
(
const
XTensor
&
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
&
b
,
MATRIX_TRANS_TYPE
transposedB
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
XTensor
MatrixMul
(
const
XTensor
&
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
&
b
,
MATRIX_TRANS_TYPE
transposedB
,
TENSOR_DATA_TYPE
dataType
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
void
MatrixMul
(
const
XTensor
&
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
&
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
&
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
,
bool
requireLink
=
false
);
...
...
source/tensor/core/arithmetic/MultiplyDim.cu
查看文件 @
7da1bec1
...
...
@@ -169,7 +169,7 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
ShowNTErrors("Something is wrong!");
}
}
if (a->dataType == X_FLOAT16) {
else
if (a->dataType == X_FLOAT16) {
unsigned short temp = FloatToFloat16(alpha);
half alpha1 = *((half *)&temp);
if (stride > 1) {
...
...
source/tensor/core/arithmetic/MultiplyDim.cuh
查看文件 @
7da1bec1
...
...
@@ -28,6 +28,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
template <class T, bool alphaFired>
__global__
void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha);
template <class T, bool alphaFired>
__global__
void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha);
/* tensor summation a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting (cuda version) */
void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = 0);
...
...
source/tensor/core/arithmetic/Negate.cuh
查看文件 @
7da1bec1
...
...
@@ -29,12 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its negtive value (CUDA Kernel) */
template <class T>
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size);
/* set each entry to its negtive value (CUDA Kernel) with float16 data type*/
__global__
void KernelNegate(__half * a, __half * b, int size);
void KernelNegate(T * a, T * b, int size);
/* set each entry to its negtive value */
void _CudaNegate(const XTensor * a, XTensor * b);
...
...
source/tensor/core/arithmetic/Sum.cuh
查看文件 @
7da1bec1
...
...
@@ -32,6 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
template <class T> __global__
void KernelADD(T * a, T * b, T * c, int size, T beta = (T)1.0);
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta);
/* tensor summation c = a + b * \beta (cuda version) */
void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
...
...
source/tensor/core/arithmetic/XTensorBLAS.cu
查看文件 @
7da1bec1
...
...
@@ -17,15 +17,12 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "../../XUtility.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "XTensorBLAS.h"
#include "cuda_fp16.h"
#include "cublas_api.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -70,50 +67,18 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
cublasSgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float*)b, mb, (const float*)a, ma, &beta2, (float*)c, mc);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
unsigned short alpha2 = FloatToFloat16(alpha);
unsigned short beta2 = FloatToFloat16(beta);
__half * alpha3 = (__half*)&alpha2;
__half * beta3 = (__half*)&beta2;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublas
GemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
Hgemm(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, alpha3, (const __half*)b, mb, (const __half*)a, ma, beta3, (__half*)c, mc
);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublas
GemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
Hgemm(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, alpha3, (const __half*)b, mb, (const __half*)a, ma, beta3, (__half*)c, mc
);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublas
GemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
Hgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, alpha3, (const __half*)b, mb, (const __half*)a, ma, beta3, (__half*)c, mc
);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) {
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the dimension of two tensor( lda, ldb ) should be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, (__int8*)&beta2, c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, (__int8*)&beta2, c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, (__int8*)&beta2, c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, (__int8*)&beta2, c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
cublasHgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, alpha3, (const __half*)b, mb, (const __half*)a, ma, beta3, (__half*)c, mc);
}
else {
ShowNTErrors("Unsupported data type!");
...
...
@@ -159,50 +124,18 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
cublasSgemmBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float**)b, mb, (const float**)a, ma, &beta2, (float**)c, mc, count);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
unsigned short alpha2 = FloatToFloat16(alpha);
unsigned short beta2 = FloatToFloat16(beta);
__half * alpha3 = (__half*)&alpha2;
__half * beta3 = (__half*)&beta2;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublas
GemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
HgemmBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, alpha3, (const __half**)b, mb, (const __half**)a, ma, beta3, (__half**)c, mc, count
);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublas
GemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
HgemmBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, alpha3, (const __half**)b, mb, (const __half**)a, ma, beta3, (__half**)c, mc, count
);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublas
GemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
HgemmBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, alpha3, (const __half**)b, mb, (const __half**)a, ma, beta3, (__half**)c, mc, count
);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) {
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the dimension of two tensor( lda, ldb ) should be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, (__int8*)&beta2, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, (__int8*)&beta2, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, (__int8*)&beta2, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, (__int8*)&beta2, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
cublasHgemmBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, alpha3, (const __half**)b, mb, (const __half**)a, ma, beta3, (__half**)c, mc, count);
}
else {
ShowNTErrors("Unsupported data type!");
...
...
@@ -246,72 +179,18 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
cublasSgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float*)b, mb, strideB, (const float*)a, ma, strideA, &beta2, (float*)c, mc, strideC, count);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) {
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the dimension of two tensor( lda, ldb ) should be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, (__int8*)&beta2, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, (__int8*)&beta2, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, (__int8*)&beta2, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, (__int8*)&beta2, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the dimension of two tensor( lda, ldb ) should be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
unsigned short alpha2 = FloatToFloat16(alpha);
unsigned short beta2 = FloatToFloat16(beta);
__half * alpha3 = (__half*)&alpha2;
__half * beta3 = (__half*)&beta2;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublas
GemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, (__int8*)&beta2, c, CUDA_C_32I, mc, strideC, count, CUDA_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
HgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count
);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublas
GemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, (__int8*)&beta2, c, CUDA_C_32I, mc, strideC, count, CUDA_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
HgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count
);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublas
GemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, (__int8*)&beta2, c, CUDA_C_32I, mc, strideC, count, CUDA_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
cublas
HgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count
);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (__int8*)&alpha2, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, (__int8*)&beta2, c, CUDA_C_32I, mc, strideC, count, CUDA_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
}
else {
ShowNTErrors("Unsupported data type!");
...
...
source/tensor/core/getandset/ConvertDataType.cuh
查看文件 @
7da1bec1
...
...
@@ -44,6 +44,22 @@ void KernelFloatToInt(float * inputData, int * outputData, int size);
__global__
void KernelIntToFloat(int * inputData, float * outputData, int size);
/* convert data type from X_FLOAT to X_INT8 (CUDA Kernel) */
__global__
void KernelFloatToInt8(float * inputData, __int8 * outputData, int size);
/* convert data type from X_INT8 to X_FLOAT (CUDA Kernel) */
__global__
void KernelInt8ToFloat(__int8 * inputData, float * outputData, int size);
/* convert data type from X_INT to X_INT8 (CUDA Kernel) */
__global__
void KernelIntToInt8(int * inputData, __int8 * outputData, int size);
/* convert data type from X_INT8 to X_INT (CUDA Kernel) */
__global__
void KernelInt8ToInt(__int8 * inputData, int * outputData, int size);
/* convert data type */
void _CudaConvertDataType(const XTensor * input, XTensor * output);
...
...
source/tensor/core/movement/Gather.cu
查看文件 @
7da1bec1
...
...
@@ -37,12 +37,11 @@ gather indexed sub-tensors(cuda version)
>> indexSize - the size of the srcIndex
>> stride - stride of a data block
*/
template <class T>
__global__
void KernelGather(
T * sData, T
* tData, int * sIndex, int indexSize, int stride)
void KernelGather(
DTYPE * sData, DTYPE
* tData, int * sIndex, int indexSize, int stride)
{
__shared__
T
* sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__
T
* tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__
DTYPE
* sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__
DTYPE
* tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
...
...
@@ -50,18 +49,18 @@ void KernelGather(T * sData, T * tData, int * sIndex, int indexSize, int stride)
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if
(i >= indexSize || offset >= stride)
if(i >= indexSize || offset >= stride)
return;
if
(threadIdx.y == 0)
{
if
(threadIdx.y == 0)
{
sp[threadIdx.x] = sData + sIndex[i] * stride;
tp[threadIdx.x] = tData + i * stride;
}
__syncthreads();
T
* s = sp[threadIdx.x];
T
* t = tp[threadIdx.x];
DTYPE
* s = sp[threadIdx.x];
DTYPE
* t = tp[threadIdx.x];
t[offset] = s[offset];
}
...
...
@@ -75,10 +74,6 @@ gather indexed sub-tensors(cuda version)
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{
CheckNTErrors((s->dataType == DEFAULT_DTYPE && t->dataType == DEFAULT_DTYPE) ||
(s->dataType == X_FLOAT16 && t->dataType == X_FLOAT16),
"The gather function does not support this datatype.");
int devID = s->devID;
XMem * mem = s->mem;
...
...
@@ -96,6 +91,9 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = NULL;
if (srcIndex->devID < 0) {
...
...
@@ -107,20 +105,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
else
sIndex = (int *)srcIndex->data;
if (s->dataType == DEFAULT_DTYPE && t->dataType == DEFAULT_DTYPE) {
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
KernelGather<<<blocks, threads>>>(sData, tData, sIndex, indexSize, stride);
}
else if (s->dataType == X_FLOAT16 && t->dataType == X_FLOAT16) {
half * sData = (half*)s->data;
half * tData = (half*)t->data;
KernelGather<<<blocks, threads>>>(sData, tData, sIndex, indexSize, stride);
}
else {
//TODO!
ShowNTErrors("TODO!");
}
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
if(mem != NULL)
...
...
source/tensor/core/reduce/ReduceMax.cu
查看文件 @
7da1bec1
...
...
@@ -17,7 +17,6 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-03 float16 added
*/
#include "../../XDevice.h"
...
...
@@ -504,9 +503,6 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors((input->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE) ||
(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT16),
"The reduce max function does not support this datatype.");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
...
...
@@ -547,7 +543,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10
&& input->dataType == DEFAULT_DTYPE
) {
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
...
...
source/tensor/core/reduce/ReduceSum.cu
查看文件 @
7da1bec1
...
...
@@ -17,7 +17,6 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-03 float16 added
*/
#include "../../XDevice.h"
...
...
@@ -736,7 +735,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10
&& input->dataType == DEFAULT_DTYPE
) {
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
...
...
@@ -752,7 +751,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096 && input->dataType == DEFAULT_DTYPE){
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
...
...
source/tensor/function/HardTanH.cu
查看文件 @
7da1bec1
...
...
@@ -17,7 +17,6 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-04 float16 added
*/
#include "HardTanH.h"
...
...
@@ -39,18 +38,17 @@ y = 1 if x > 1
>> y - output data array
>> size - size of input/output
*/
template <class T>
__global__
void KernelHardtanhCompute(
T * x, T
* y, int size)
void KernelHardtanhCompute(
DTYPE * x, DTYPE
* y, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
{
T
p = x[i];
if
(p >(T
)1.0)
p = (
T
)1.0;
else if
(p < (T
)-1.0)
p = (
T
)-1.0;
if (i < size){
DTYPE
p = x[i];
if
(p > (DTYPE
)1.0)
p = (
DTYPE
)1.0;
else if
(p < (DTYPE
)-1.0)
p = (
DTYPE
)-1.0;
y[i] = p;
}
}
...
...
@@ -65,11 +63,10 @@ y = 1 if x > 1
*/
void _CudaHardTanH(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
CheckNTErrors((x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) ||
(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16),
"The hard tanh activation function does not support this datatype.");
int gridSize[3], blockSize[3];
...
...
@@ -78,18 +75,13 @@ void _CudaHardTanH(const XTensor * x, XTensor * y)
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((__half *)x->data, (__half *)y->data, x->unitNum);
}
else {
//TODO!
else{
ShowNTErrors("TODO!");
}
BacktoCudaDev(x->devID, devIDBackup);
}
/*
...
...
source/tensor/function/LogSoftmax.cu
查看文件 @
7da1bec1
...
...
@@ -83,7 +83,7 @@ void KernelLogSoftmaxComputeByRow(T * x, T * max, T * sum, T * y, int rowNum, in
int key = i * colNum + j;
if (dataType == X_FLOAT) {
DTYPE r = log((DTYPE)exp(
x[key] - inputMax[threadIdx.x]
) / (DTYPE)inputSum[threadIdx.x]);
DTYPE r = log((DTYPE)exp(
(DTYPE)(x[key] - inputMax[threadIdx.x])
) / (DTYPE)inputSum[threadIdx.x]);
if (isnan(r))
r = LOGPROB_MIN;
...
...
@@ -137,7 +137,7 @@ void KernelLogSoftmaxComputeByCol(T * x, T * max, T * sum, T * y, int rowNum, in
if (i < rowNum && j < colNum) {
int key = i * colNum + j;
if (dataType == X_FLOAT) {
DTYPE r = log((DTYPE)exp(
x[key] - inputMax[threadIdx.y]
) / (DTYPE)inputSum[threadIdx.y]);
DTYPE r = log((DTYPE)exp(
(DTYPE)(x[key] - inputMax[threadIdx.y])
) / (DTYPE)inputSum[threadIdx.y]);
if (isnan(r))
r = LOGPROB_MIN;
...
...
@@ -247,10 +247,10 @@ void KernelExpLoss(T * dedy, T * dedx, T * y, int size, LOSS_FUNCTION_NAME lossN
if (i < size) {
/* dE/dx_j = exp(y_j) */
if (lossName == CROSSENTROPY)
dedx[i] = exp(
y[i]
);
dedx[i] = exp(
((DTYPE)y[i])
);
/* dE/dx_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
dedx[i] = exp(
y[i]
);
dedx[i] = exp(
((DTYPE)y[i])
);
else if (lossName == ONEHOTERROR)
dedx[i] = 0;
else
...
...
@@ -283,13 +283,13 @@ void KernelLogSoftmaxBackwardDEDS(T * dedy, T * dedx, T * gold, T * y, T * x,
DTYPE r = 0;
/* dE/ds_j = exp(y_j) */
if (lossName == CROSSENTROPY)
r = -(DTYPE)gold[i] + (DTYPE)exp(
y[i]
);
r = -(DTYPE)gold[i] + (DTYPE)exp(
((DTYPE)y[i])
);
/* dE/ds_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
r = -(DTYPE)gold[i] + (DTYPE)exp(
y[i]
);
r = -(DTYPE)gold[i] + (DTYPE)exp(
((DTYPE)y[i])
);
else if (lossName == ONEHOTERROR) {
if ((DTYPE)gold[i] == 1.0)
r = -(DTYPE)gold[i] + (DTYPE)exp(
y[i]
);
r = -(DTYPE)gold[i] + (DTYPE)exp(
((DTYPE)y[i])
);
else
r = 0;
}
...
...
@@ -366,7 +366,7 @@ void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T
else if (lossName == ONEHOTERROR) {
int offset = colNum * ni + mi;
if (value == 1.0F)
dedx[offset] += (-value + exp(
y[offset]
));
dedx[offset] += (-value + exp(
((DTYPE)y[offset])
));
//dedx[offset] += -value * 0.005;
}
}
...
...
source/tensor/function/LogSoftmax.cuh
查看文件 @
7da1bec1
...
...
@@ -32,9 +32,30 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version) */
void _CudaLogSoftmax(const XTensor * input, XTensor * output, int leadDim);
template <class T, TENSOR_DATA_TYPE dataType>
__global__
void KernelLogSoftmaxComputeByRow(T * x, T * max, T * sum, T * y, int rowNum, int colNum);
template <class T, TENSOR_DATA_TYPE dataType>
__global__
void KernelLogSoftmaxComputeByCol(T * x, T * max, T * sum, T * y, int rowNum, int colNum);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version) */
void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max);
template <class T>
__global__
void KernelExpLoss(T * dedy, T * dedx, T * y, int size, LOSS_FUNCTION_NAME lossName);
template <class T, TENSOR_DATA_TYPE dataType>
__global__
void KernelLogSoftmaxBackwardDEDS(T * dedy, T * dedx, T * gold, T * y, T * x, int size, LOSS_FUNCTION_NAME lossName);
template <class T>
__global__
void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T * y, T * x,
int rowNum, int colNum, int gNonZeroNum, LOSS_FUNCTION_NAME lossName);
/* de/dx (Cuda version) */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
...
...
source/tensor/test/TClip.cpp
查看文件 @
7da1bec1
...
...
@@ -22,6 +22,7 @@
#include "../XTensor.h"
#include "../core/math/Clip.h"
#include "TClip.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
@@ -116,6 +117,249 @@ bool TestClip1()
#endif // USE_CUDA
}
/*
case 2: float16 test Clip function.
Set every entry to its clip value.
*/
bool
TestClip2
()
{
/* a tensor of size (3, 2) */
int
aOrder
=
2
;
int
*
aDimSize
=
new
int
[
aOrder
];
aDimSize
[
0
]
=
3
;
aDimSize
[
1
]
=
2
;
int
aUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
aOrder
;
i
++
)
aUnitNum
*=
aDimSize
[
i
];
DTYPE
aData
[
3
][
2
]
=
{
{
1.0
F
,
-
2.0
F
},
{
0.0
F
,
4.0
F
},
{
5.0
F
,
-
6.0
F
}
};
DTYPE
answer
[
3
][
2
]
=
{
{
1.0
F
,
-
1.0
F
},
{
0.0
F
,
1.0
F
},
{
1.0
F
,
-
1.0
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
aGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
bGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
aMeGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
bUserGPU
;
/* create float16 tensor */
XTensor
aHalfGPU
;
XTensor
bHalfGPU
;
XTensor
aMeHalfGPU
;
XTensor
bUserHalfGPU
;
/* Initialize variables */
aGPU
->
SetData
(
aData
,
aUnitNum
);
aMeGPU
->
SetData
(
aData
,
aUnitNum
);
/* convert data type from float to float16 */
aHalfGPU
=
ConvertDataType
(
*
aGPU
,
X_FLOAT16
);
aMeHalfGPU
=
ConvertDataType
(
*
aMeGPU
,
X_FLOAT16
);
bHalfGPU
=
ConvertDataType
(
*
bGPU
,
X_FLOAT16
);
/* call clip function */
_Clip
(
&
aHalfGPU
,
&
bHalfGPU
,
-
1.0
,
1.0
);
_ClipMe
(
&
aMeHalfGPU
,
-
1.0
,
1.0
);
bUserHalfGPU
=
Clip
(
aHalfGPU
,
-
1.0
,
1.0
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
bHalfGPU
,
bGPU
);
_ConvertDataType
(
&
aMeHalfGPU
,
aMeGPU
);
bUserGPU
=
ConvertDataType
(
bUserHalfGPU
,
X_FLOAT
);
/* check results */
gpuTest
=
bGPU
->
CheckData
(
answer
,
aUnitNum
,
1e-4
F
)
&&
aMeGPU
->
CheckData
(
answer
,
aUnitNum
,
1e-4
F
)
&&
bUserGPU
.
CheckData
(
answer
,
aUnitNum
,
1e-4
F
);
/* destroy variables */
delete
aGPU
;
delete
bGPU
;
delete
aMeGPU
;
delete
[]
aDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
aDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/*
case 3: int32 test Clip function.
Set every entry to its clip value.
*/
bool
TestClip3
()
{
/* a tensor of size (3, 2) */
int
aOrder
=
2
;
int
*
aDimSize
=
new
int
[
aOrder
];
aDimSize
[
0
]
=
3
;
aDimSize
[
1
]
=
2
;
int
aUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
aOrder
;
i
++
)
aUnitNum
*=
aDimSize
[
i
];
DTYPE
aData
[
3
][
2
]
=
{
{
1.0
F
,
-
2.0
F
},
{
0.0
F
,
4.0
F
},
{
5.0
F
,
-
6.0
F
}
};
DTYPE
answer
[
3
][
2
]
=
{
{
1.0
F
,
-
1.0
F
},
{
0.0
F
,
1.0
F
},
{
1.0
F
,
-
1.0
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
aGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
bGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
aMeGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
bUserGPU
;
/* create int32 tensor */
XTensor
aInt32GPU
;
XTensor
bInt32GPU
;
XTensor
aMeInt32GPU
;
XTensor
bUserInt32GPU
;
/* Initialize variables */
aGPU
->
SetData
(
aData
,
aUnitNum
);
aMeGPU
->
SetData
(
aData
,
aUnitNum
);
/* convert data type from float to int32 */
aInt32GPU
=
ConvertDataType
(
*
aGPU
,
X_INT
);
aMeInt32GPU
=
ConvertDataType
(
*
aMeGPU
,
X_INT
);
bInt32GPU
=
ConvertDataType
(
*
bGPU
,
X_INT
);
/* call clip function */
_Clip
(
&
aInt32GPU
,
&
bInt32GPU
,
-
1.0
,
1.0
);
_ClipMe
(
&
aMeInt32GPU
,
-
1.0
,
1.0
);
bUserInt32GPU
=
Clip
(
aInt32GPU
,
-
1.0
,
1.0
);
/* convert data type from int32 to float */
_ConvertDataType
(
&
bInt32GPU
,
bGPU
);
_ConvertDataType
(
&
aMeInt32GPU
,
aMeGPU
);
bUserGPU
=
ConvertDataType
(
bUserInt32GPU
,
X_FLOAT
);
/* check results */
gpuTest
=
bGPU
->
CheckData
(
answer
,
aUnitNum
,
1e-4
F
)
&&
aMeGPU
->
CheckData
(
answer
,
aUnitNum
,
1e-4
F
)
&&
bUserGPU
.
CheckData
(
answer
,
aUnitNum
,
1e-4
F
);
/* destroy variables */
delete
aGPU
;
delete
bGPU
;
delete
aMeGPU
;
delete
[]
aDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
aDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/*
case 4: int8 test Clip function.
Set every entry to its clip value.
*/
bool
TestClip4
()
{
/* a tensor of size (3, 2) */
int
aOrder
=
2
;
int
*
aDimSize
=
new
int
[
aOrder
];
aDimSize
[
0
]
=
3
;
aDimSize
[
1
]
=
2
;
int
aUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
aOrder
;
i
++
)
aUnitNum
*=
aDimSize
[
i
];
DTYPE
aData
[
3
][
2
]
=
{
{
1.0
F
,
-
2.0
F
},
{
0.0
F
,
4.0
F
},
{
5.0
F
,
-
6.0
F
}
};
DTYPE
answer
[
3
][
2
]
=
{
{
1.0
F
,
-
1.0
F
},
{
0.0
F
,
1.0
F
},
{
1.0
F
,
-
1.0
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
aGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
bGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
aMeGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
bUserGPU
;
/* create int8 tensor */
XTensor
aInt8GPU
;
XTensor
bInt8GPU
;
XTensor
aMeInt8GPU
;
XTensor
bUserInt8GPU
;
/* Initialize variables */
aGPU
->
SetData
(
aData
,
aUnitNum
);
aMeGPU
->
SetData
(
aData
,
aUnitNum
);
/* convert data type from float to int8 */
aInt8GPU
=
ConvertDataType
(
*
aGPU
,
X_INT8
);
aMeInt8GPU
=
ConvertDataType
(
*
aMeGPU
,
X_INT8
);
bInt8GPU
=
ConvertDataType
(
*
bGPU
,
X_INT8
);
/* call clip function */
_Clip
(
&
aInt8GPU
,
&
bInt8GPU
,
-
1.0
,
1.0
);
_ClipMe
(
&
aMeInt8GPU
,
-
1.0
,
1.0
);
bUserInt8GPU
=
Clip
(
aInt8GPU
,
-
1.0
,
1.0
);
/* convert data type from int8 to float */
_ConvertDataType
(
&
bInt8GPU
,
bGPU
);
_ConvertDataType
(
&
aMeInt8GPU
,
aMeGPU
);
bUserGPU
=
ConvertDataType
(
bUserInt8GPU
,
X_FLOAT
);
/* check results */
gpuTest
=
bGPU
->
CheckData
(
answer
,
aUnitNum
,
1e-4
F
)
&&
aMeGPU
->
CheckData
(
answer
,
aUnitNum
,
1e-4
F
)
&&
bUserGPU
.
CheckData
(
answer
,
aUnitNum
,
1e-4
F
);
/* destroy variables */
delete
aGPU
;
delete
bGPU
;
delete
aMeGPU
;
delete
[]
aDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
aDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -137,6 +381,36 @@ bool TestClip()
else
XPRINT
(
0
,
stdout
,
">> case 1 passed!
\n
"
);
/* case 2 test */
caseFlag
=
TestClip2
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 2 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 2 passed!
\n
"
);
/* case 3 test */
caseFlag
=
TestClip3
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 3 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 3 passed!
\n
"
);
/* case 4 test */
caseFlag
=
TestClip4
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 4 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 4 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TConvertDataType.cpp
查看文件 @
7da1bec1
...
...
@@ -17,7 +17,6 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 int8 added
*/
#include "TConvertDataType.h"
...
...
@@ -27,7 +26,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test ConvertDataType function.
In this case, the fl
oa
t32 data type is converted to int32 data type.
In this case, the fl
ao
t32 data type is converted to int32 data type.
*/
bool
TestConvertDataType1
()
...
...
@@ -178,7 +177,7 @@ bool TestConvertDataType2()
/*
case 3: test ConvertDataType function.
In this case, the float
32
data type is converted to float16 data type.
In this case, the float data type is converted to float16 data type.
*/
bool
TestConvertDataType3
()
{
...
...
@@ -291,130 +290,6 @@ bool TestConvertDataType3()
#endif // USE_CUDA
}
/*
case 4: test ConvertDataType function.
In this case, the float32 data type is converted to int8 data type.
*/
bool
TestConvertDataType4
()
{
/* a tensor of size (3, 2) */
int
aOrder
=
2
;
int
*
aDimSize
=
new
int
[
aOrder
];
aDimSize
[
0
]
=
3
;
aDimSize
[
1
]
=
2
;
int
aUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
aOrder
;
i
++
)
aUnitNum
*=
aDimSize
[
i
];
DTYPE
aData
[
3
][
2
]
=
{
{
1.0
F
,
2.0
F
},
{
0.5
F
,
4.0
F
},
{
5.0
F
,
6.0
F
}
};
int
answer
[
3
][
2
]
=
{
{
1
,
2
},
{
0
,
4
},
{
5
,
6
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
aGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
bGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_INT8
,
1.0
F
,
0
);
XTensor
*
cGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
dGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_INT
,
1.0
F
,
0
);
/* Initialize variables */
aGPU
->
SetData
(
aData
,
aUnitNum
);
/* call ConvertDataType function */
_ConvertDataType
(
aGPU
,
bGPU
);
_ConvertDataType
(
bGPU
,
cGPU
);
_ConvertDataType
(
cGPU
,
dGPU
);
/* check results */
gpuTest
=
dGPU
->
CheckData
(
answer
,
aUnitNum
);
/* destroy variables */
delete
aGPU
;
delete
bGPU
;
delete
cGPU
;
delete
dGPU
;
delete
[]
aDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
aDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/*
case 5: test ConvertDataType function.
In this case, the int data type is converted to int8 data type.
*/
bool
TestConvertDataType5
()
{
/* a tensor of size (3, 2) */
int
aOrder
=
2
;
int
*
aDimSize
=
new
int
[
aOrder
];
aDimSize
[
0
]
=
3
;
aDimSize
[
1
]
=
2
;
int
aUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
aOrder
;
i
++
)
aUnitNum
*=
aDimSize
[
i
];
int
aData
[
3
][
2
]
=
{
{
1
,
2
},
{
0
,
4
},
{
5
,
6
}
};
int
answer
[
3
][
2
]
=
{
{
1
,
2
},
{
0
,
4
},
{
5
,
6
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
aGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_INT
,
1.0
F
,
0
);
XTensor
*
bGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_INT8
,
1.0
F
,
0
);
XTensor
*
cGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_INT
,
1.0
F
,
0
);
/* Initialize variables */
aGPU
->
SetData
(
aData
,
aUnitNum
);
/* call ConvertDataType function */
_ConvertDataType
(
aGPU
,
bGPU
);
_ConvertDataType
(
bGPU
,
cGPU
);
/* check results */
gpuTest
=
cGPU
->
CheckData
(
answer
,
aUnitNum
);
/* destroy variables */
delete
aGPU
;
delete
bGPU
;
delete
cGPU
;
delete
[]
aDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
aDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -456,26 +331,6 @@ bool TestConvertDataType()
else
XPRINT
(
0
,
stdout
,
">> case 3 passed!
\n
"
);
/* case 4 test */
caseFlag
=
TestConvertDataType4
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 4 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 4 passed!
\n
"
);
/* case 5 test */
caseFlag
=
TestConvertDataType5
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 5 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 5 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TDiv.cpp
查看文件 @
7da1bec1
...
...
@@ -17,11 +17,9 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TDiv.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
@@ -150,120 +148,6 @@ bool TestDiv1()
#endif // USE_CUDA
}
/*
case 2: float16 element-wise division of two tensors
c(i) = a(i)/b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/
bool
TestDiv2
()
{
/* a source tensor of size (2, 2) */
int
sOrder1
=
2
;
int
*
sDimSize1
=
new
int
[
sOrder1
];
sDimSize1
[
0
]
=
2
;
sDimSize1
[
1
]
=
2
;
int
sUnitNum1
=
1
;
for
(
int
i
=
0
;
i
<
sOrder1
;
i
++
)
sUnitNum1
*=
sDimSize1
[
i
];
/* a source tensor of size (2, 2) */
int
sOrder2
=
2
;
int
*
sDimSize2
=
new
int
[
sOrder2
];
sDimSize2
[
0
]
=
2
;
sDimSize2
[
1
]
=
2
;
int
sUnitNum2
=
1
;
for
(
int
i
=
0
;
i
<
sOrder2
;
i
++
)
sUnitNum2
*=
sDimSize2
[
i
];
/* a target tensor of size (2, 2) */
int
tOrder
=
2
;
int
*
tDimSize
=
new
int
[
tOrder
];
tDimSize
[
0
]
=
2
;
tDimSize
[
1
]
=
2
;
int
tUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
tOrder
;
i
++
)
tUnitNum
*=
tDimSize
[
i
];
DTYPE
sData1
[
2
][
2
]
=
{
{
0.0
F
,
1.0
F
},
{
2.0
F
,
3.0
F
}
};
DTYPE
sData2
[
2
][
2
]
=
{
{
1.0
F
,
1.0
F
},
{
4.0
F
,
9.0
F
}
};
DTYPE
answer
[
2
][
2
]
=
{
{
0.0
F
,
1.0
F
},
{
0.5
F
,
0.3333
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
sGPU1
=
NewTensor
(
sOrder1
,
sDimSize1
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
sGPU2
=
NewTensor
(
sOrder2
,
sDimSize2
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
tGPU
=
NewTensor
(
tOrder
,
tDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
tMeGPU
=
NewTensor
(
tOrder
,
tDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
tUserGPU
;
/* create float16 tensor */
XTensor
sHalfGPU1
;
XTensor
sHalfGPU2
;
XTensor
tHalfGPU
;
XTensor
tMeHalfGPU
;
XTensor
tUserHalfGPU
;
/* Initialize variables */
sGPU1
->
SetData
(
sData1
,
sUnitNum1
);
tMeGPU
->
SetData
(
sData1
,
sUnitNum1
);
sGPU2
->
SetData
(
sData2
,
sUnitNum2
);
tGPU
->
SetZeroAll
();
/* convert data type from float to float16 */
sHalfGPU1
=
ConvertDataType
(
*
sGPU1
,
X_FLOAT16
);
sHalfGPU2
=
ConvertDataType
(
*
sGPU2
,
X_FLOAT16
);
tHalfGPU
=
ConvertDataType
(
*
tGPU
,
X_FLOAT16
);
tMeHalfGPU
=
ConvertDataType
(
*
tMeGPU
,
X_FLOAT16
);
/* call div function */
_Div
(
&
sHalfGPU1
,
&
sHalfGPU2
,
&
tHalfGPU
,
0
,
0
);
_DivMe
(
&
tMeHalfGPU
,
&
sHalfGPU2
,
0
,
0
);
tUserHalfGPU
=
Div
(
sHalfGPU1
,
sHalfGPU2
,
0
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
tHalfGPU
,
tGPU
);
_ConvertDataType
(
&
tMeHalfGPU
,
tMeGPU
);
tUserGPU
=
ConvertDataType
(
tUserHalfGPU
,
X_FLOAT
);
/* check results */
gpuTest
=
tGPU
->
CheckData
(
answer
,
tUnitNum
,
1e-4
F
)
&&
tMeGPU
->
CheckData
(
answer
,
tUnitNum
,
1e-4
F
)
&&
tUserGPU
.
CheckData
(
answer
,
tUnitNum
,
1e-4
F
);
/* destroy variables */
delete
sGPU1
;
delete
sGPU2
;
delete
tGPU
;
delete
tMeGPU
;
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -285,16 +169,6 @@ bool TestDiv()
else
XPRINT
(
0
,
stdout
,
">> case 1 passed!
\n
"
);
/* case 2 test */
caseFlag
=
TestDiv2
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 2 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 2 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TGather.cpp
查看文件 @
7da1bec1
...
...
@@ -364,111 +364,6 @@ bool TestGather3()
#endif // USE_CUDA
}
/*
case 4: float16 gather indexed sub-tensors
In this case, (3, 3) -> (2, 3), dim = 0,
srcIndex = [0, 2]
*/
bool
TestGather4
()
{
/* a input tensor of size (3, 3) */
int
sOrder
=
2
;
int
*
sDimSize
=
new
int
[
sOrder
];
sDimSize
[
0
]
=
3
;
sDimSize
[
1
]
=
3
;
int
sUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
sOrder
;
i
++
)
sUnitNum
*=
sDimSize
[
i
];
/* a output tensor of size (2, 3) */
int
tOrder
=
2
;
int
*
tDimSize
=
new
int
[
tOrder
];
tDimSize
[
0
]
=
2
;
tDimSize
[
1
]
=
3
;
int
tUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
tOrder
;
i
++
)
tUnitNum
*=
tDimSize
[
i
];
/* a index tensor of size (2) */
int
indexOrder
=
1
;
int
*
indexDimSize
=
new
int
[
indexOrder
];
indexDimSize
[
0
]
=
2
;
int
indexUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
indexOrder
;
i
++
)
indexUnitNum
*=
indexDimSize
[
i
];
DTYPE
sData
[
3
][
3
]
=
{
{
0.0
F
,
-
1.0
F
,
2.0
F
},
{
2.0
F
,
1.0
F
,
3.0
F
},
{
1.0
F
,
2.0
F
,
4.0
F
}
};
DTYPE
answer
[
2
][
3
]
=
{
{
0.0
F
,
-
1.0
F
,
2.0
F
},
{
1.0
F
,
2.0
F
,
4.0
F
}
};
int
dim
=
0
;
int
indexSize
=
2
;
int
srcIndex
[
2
]
=
{
0
,
2
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensors */
XTensor
*
sGPU
=
NewTensor
(
sOrder
,
sDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
tGPU
=
NewTensor
(
sOrder
,
tDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
indexGPU
=
NewTensor
(
indexOrder
,
indexDimSize
,
X_INT
,
1.0
F
,
0
);
XTensor
tUserGPU
;
/* create float16 tensors */
XTensor
sHalfGPU
;
XTensor
tHalfGPU
;
XTensor
tUserHalfGPU
;
/* initialize variables */
sGPU
->
SetData
(
sData
,
sUnitNum
);
tGPU
->
SetZeroAll
();
indexGPU
->
SetData
(
srcIndex
,
indexSize
);
/* convert data type from float to float16 */
sHalfGPU
=
ConvertDataType
(
*
sGPU
,
X_FLOAT16
);
tHalfGPU
=
ConvertDataType
(
*
tGPU
,
X_FLOAT16
);
/* call gather function */
_Gather
(
&
sHalfGPU
,
&
tHalfGPU
,
indexGPU
);
tUserHalfGPU
=
Gather
(
sHalfGPU
,
*
indexGPU
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
tHalfGPU
,
tGPU
);
tUserGPU
=
ConvertDataType
(
tUserHalfGPU
,
X_FLOAT
);
/* check results */
gpuTest
=
tGPU
->
CheckData
(
answer
,
tUnitNum
)
&&
tUserGPU
.
CheckData
(
answer
,
tUnitNum
);
/* destroy variables */
delete
sGPU
;
delete
tGPU
;
delete
indexGPU
;
delete
[]
sDimSize
;
delete
[]
tDimSize
;
delete
[]
indexDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
;
delete
[]
sDimSize
;
delete
[]
tDimSize
;
delete
[]
indexDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -498,7 +393,7 @@ bool TestGather()
else
XPRINT
(
0
,
stdout
,
">> case 2 passed!
\n
"
);
/* case
3
test */
/* case
2
test */
caseFlag
=
TestGather3
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
...
...
@@ -507,15 +402,6 @@ bool TestGather()
else
XPRINT
(
0
,
stdout
,
">> case 3 passed!
\n
"
);
/* case 4 test */
caseFlag
=
TestGather4
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 4 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 4 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TGather.h
查看文件 @
7da1bec1
...
...
@@ -23,7 +23,6 @@
#define __TEST_GATHER_H__
#include "../core/movement/Gather.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/test/THardTanH.cpp
查看文件 @
7da1bec1
...
...
@@ -17,12 +17,10 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "../XTensor.h"
#include "THardTanH.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
@@ -224,80 +222,6 @@ bool TestHardTanH2()
#endif // USE_CUDA
}
/*
case 3: float16 test HardTanH function.
y = 1 if x > 1
x if -1 <= x <= 1
-1 if x < -1
*/
bool
TestHardTanH3
()
{
/* a tensor of size (2, 3) */
int
order
=
2
;
int
*
dimSize
=
new
int
[
order
];
dimSize
[
0
]
=
2
;
dimSize
[
1
]
=
3
;
int
unitNum
=
1
;
for
(
int
i
=
0
;
i
<
order
;
i
++
)
unitNum
*=
dimSize
[
i
];
DTYPE
xData
[
2
][
3
]
=
{
{
0.5
F
,
-
1.0
F
,
2.0
F
},
{
3.5
F
,
-
4.5
F
,
1.0
F
}
};
DTYPE
answer
[
2
][
3
]
=
{
{
0.5
F
,
-
1.0
F
,
1.0
F
},
{
1.0
F
,
-
1.0
F
,
1.0
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
xGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
yGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
yUserGPU
;
/* create float16 tensor */
XTensor
xHalfGPU
;
XTensor
yHalfGPU
;
XTensor
yUserHalfGPU
;
/* Initialize variables */
xGPU
->
SetData
(
xData
,
unitNum
);
yGPU
->
SetZeroAll
();
/* convert data type from float to float16 */
xHalfGPU
=
ConvertDataType
(
*
xGPU
,
X_FLOAT16
);
yHalfGPU
=
ConvertDataType
(
*
yGPU
,
X_FLOAT16
);
/* call hardtanh function */
_HardTanH
(
&
xHalfGPU
,
&
yHalfGPU
);
yUserHalfGPU
=
HardTanH
(
xHalfGPU
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
yHalfGPU
,
yGPU
);
yUserGPU
=
ConvertDataType
(
yUserHalfGPU
,
X_FLOAT
);
/* check results */
gpuTest
=
yGPU
->
CheckData
(
answer
,
unitNum
,
1e-4
F
)
&&
yUserGPU
.
CheckData
(
answer
,
unitNum
,
1e-4
F
);
/* destroy variables */
delete
xGPU
;
delete
yGPU
;
delete
[]
dimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
dimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -329,16 +253,6 @@ bool TestHardTanH()
else
XPRINT
(
0
,
stdout
,
">> case 2 passed!
\n
"
);
/* case 3 test */
caseFlag
=
TestHardTanH3
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 3 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 3 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TLogSoftmax.cpp
查看文件 @
7da1bec1
...
...
@@ -17,7 +17,6 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "../XUtility.h"
...
...
@@ -313,11 +312,6 @@ bool TestLogSoftmax3()
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/*
case 4: float16 test LogSoftmax function.
LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
...
...
@@ -370,10 +364,11 @@ bool TestLogSoftmax4()
/* convert data type from float16 to float */
_ConvertDataType
(
&
yHalfGPU
,
yGPU
);
yUserGPU
=
ConvertDataType
(
y
HalfGPU
,
X_FLOAT
);
yUserGPU
=
ConvertDataType
(
yUser
HalfGPU
,
X_FLOAT
);
/* check result */
gpuTest
=
yGPU
->
CheckData
(
answer
,
unitNum
,
1e-1
F
)
&&
yUserGPU
.
CheckData
(
answer
,
unitNum
,
1e-1
F
);
gpuTest
=
yGPU
->
CheckData
(
answer
,
unitNum
,
1e-2
F
)
&&
yUserGPU
.
CheckData
(
answer
,
unitNum
,
1e-2
F
);
/* destroy variables */
delete
xGPU
;
...
...
@@ -389,6 +384,188 @@ bool TestLogSoftmax4()
#endif // USE_CUDA
}
/*
case 5: float16 test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
In this case, LossName=CROSSENTROPY.
*/
bool
TestLogSoftmax5
()
{
/* a tensor of size (1, 3) */
int
order
=
2
;
int
*
dimSize
=
new
int
[
order
];
dimSize
[
0
]
=
1
;
dimSize
[
1
]
=
3
;
int
unitNum
=
1
;
for
(
int
i
=
0
;
i
<
order
;
i
++
)
unitNum
*=
dimSize
[
i
];
DTYPE
xData
[
1
][
3
]
=
{
0.0
F
,
1.0
F
,
2.0
F
};
DTYPE
gData
[
1
][
3
]
=
{
0.5
F
,
0.8
F
,
1.5
F
};
DTYPE
yAnswer
[
1
][
3
]
=
{
-
2.4076
F
,
-
1.4076
F
,
-
0.4076
F
};
DTYPE
dedxAnswer
[
1
][
3
]
=
{
-
0.4100
F
,
-
0.5553
F
,
-
0.8348
F
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensors */
XTensor
*
xGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
yGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
gGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
dedyGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
dedxGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
/* create float16 tensors */
XTensor
xHalfGPU
;
XTensor
yHalfGPU
;
XTensor
gHalfGPU
;
XTensor
dedyHalfGPU
;
XTensor
dedxHalfGPU
;
/* initialize variables */
xGPU
->
SetData
(
xData
,
unitNum
);
gGPU
->
SetData
(
gData
,
unitNum
);
yGPU
->
SetZeroAll
();
dedxGPU
->
SetZeroAll
();
dedyGPU
->
SetZeroAll
();
/* convert data type from float to float16 */
xHalfGPU
=
ConvertDataType
(
*
xGPU
,
X_FLOAT16
);
yHalfGPU
=
ConvertDataType
(
*
yGPU
,
X_FLOAT16
);
gHalfGPU
=
ConvertDataType
(
*
gGPU
,
X_FLOAT16
);
dedyHalfGPU
=
ConvertDataType
(
*
dedyGPU
,
X_FLOAT16
);
dedxHalfGPU
=
ConvertDataType
(
*
dedxGPU
,
X_FLOAT16
);
/* call logsoftmax function */
_LogSoftmax
(
&
xHalfGPU
,
&
yHalfGPU
,
1
);
/* call logsoftmaxbackward function */
_LogSoftmaxBackward
(
&
gHalfGPU
,
&
yHalfGPU
,
&
xHalfGPU
,
&
dedyHalfGPU
,
&
dedxHalfGPU
,
NULL
,
1
,
CROSSENTROPY
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
yHalfGPU
,
yGPU
);
_ConvertDataType
(
&
dedxHalfGPU
,
dedxGPU
);
/* check result */
gpuTest
=
yGPU
->
CheckData
(
yAnswer
,
unitNum
,
1e-2
F
)
&&
dedxGPU
->
CheckData
(
dedxAnswer
,
unitNum
,
1e-2
F
);
/* destroy variables */
delete
xGPU
;
delete
yGPU
;
delete
gGPU
;
delete
dedxGPU
;
delete
dedyGPU
;
delete
[]
dimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
dimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/*
case 6: float16 test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
In this case, LossName=SQUAREDERROR
*/
bool
TestLogSoftmax6
()
{
/* a tensor of size (1, 3) */
int
order
=
2
;
int
*
dimSize
=
new
int
[
order
];
dimSize
[
0
]
=
1
;
dimSize
[
1
]
=
3
;
int
unitNum
=
1
;
for
(
int
i
=
0
;
i
<
order
;
i
++
)
unitNum
*=
dimSize
[
i
];
DTYPE
xData
[
1
][
3
]
=
{
0.0
F
,
1.0
F
,
2.0
F
};
DTYPE
gData
[
1
][
3
]
=
{
0.5
F
,
0.8
F
,
1.5
F
};
DTYPE
yAnswer
[
1
][
3
]
=
{
-
2.4076
F
,
-
1.4076
F
,
-
0.4076
F
};
DTYPE
dedxAnswer
[
1
][
3
]
=
{
-
0.4100
F
,
-
0.5553
F
,
-
0.8348
F
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensors */
XTensor
*
xGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
yGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
gGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
dedyGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
dedxGPU
=
NewTensor
(
order
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
/* create float16 tensors */
XTensor
xHalfGPU
;
XTensor
yHalfGPU
;
XTensor
gHalfGPU
;
XTensor
dedyHalfGPU
;
XTensor
dedxHalfGPU
;
/* initialize variables */
xGPU
->
SetData
(
xData
,
unitNum
);
gGPU
->
SetData
(
gData
,
unitNum
);
yGPU
->
SetZeroAll
();
dedxGPU
->
SetZeroAll
();
dedyGPU
->
SetZeroAll
();
/* convert data type from float to float16 */
xHalfGPU
=
ConvertDataType
(
*
xGPU
,
X_FLOAT16
);
yHalfGPU
=
ConvertDataType
(
*
yGPU
,
X_FLOAT16
);
gHalfGPU
=
ConvertDataType
(
*
gGPU
,
X_FLOAT16
);
dedyHalfGPU
=
ConvertDataType
(
*
dedyGPU
,
X_FLOAT16
);
dedxHalfGPU
=
ConvertDataType
(
*
dedxGPU
,
X_FLOAT16
);
/* call logsoftmax function */
_LogSoftmax
(
&
xHalfGPU
,
&
yHalfGPU
,
1
);
/* call logsoftmaxbackward function */
_LogSoftmaxBackward
(
&
gHalfGPU
,
&
yHalfGPU
,
&
xHalfGPU
,
&
dedyHalfGPU
,
&
dedxHalfGPU
,
NULL
,
1
,
SQUAREDERROR
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
yHalfGPU
,
yGPU
);
_ConvertDataType
(
&
dedxHalfGPU
,
dedxGPU
);
/* check result */
gpuTest
=
yGPU
->
CheckData
(
yAnswer
,
unitNum
,
1e-2
F
)
&&
dedxGPU
->
CheckData
(
dedxAnswer
,
unitNum
,
1e-2
F
);
/* destroy variables */
delete
xGPU
;
delete
yGPU
;
delete
gGPU
;
delete
dedxGPU
;
delete
dedyGPU
;
delete
[]
dimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
dimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for LogSoftmax Function */
bool
TestLogSoftmax
()
...
...
@@ -436,6 +613,26 @@ bool TestLogSoftmax()
else
XPRINT
(
0
,
stdout
,
">> case 4 passed!
\n
"
);
/* case 5 test */
caseFlag
=
TestLogSoftmax5
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 5 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 5 passed!
\n
"
);
/* case 6 test */
caseFlag
=
TestLogSoftmax6
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 6 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 6 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TMatrixMul.cpp
查看文件 @
7da1bec1
...
...
@@ -17,7 +17,6 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-14
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-07 float16/int8 added
*/
#include "TMatrixMul.h"
...
...
@@ -508,409 +507,6 @@ bool TestMatrixMul4()
#endif // USE_CUDA
}
/*
case 5: float16 matrix multiplication.
In this case, float16 a=(2, 3), float16 b=(3, 2) -> float16 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool
TestMatrixMul5
()
{
/* a source tensor of size (2, 3) */
int
sOrder1
=
2
;
int
*
sDimSize1
=
new
int
[
sOrder1
];
sDimSize1
[
0
]
=
2
;
sDimSize1
[
1
]
=
3
;
int
sUnitNum1
=
1
;
for
(
int
i
=
0
;
i
<
sOrder1
;
i
++
)
sUnitNum1
*=
sDimSize1
[
i
];
/* a source tensor of size (3, 2) */
int
sOrder2
=
2
;
int
*
sDimSize2
=
new
int
[
sOrder2
];
sDimSize2
[
0
]
=
3
;
sDimSize2
[
1
]
=
2
;
int
sUnitNum2
=
1
;
for
(
int
i
=
0
;
i
<
sOrder2
;
i
++
)
sUnitNum2
*=
sDimSize2
[
i
];
/* a target tensor of size (2, 2) */
int
tOrder
=
2
;
int
*
tDimSize
=
new
int
[
tOrder
];
tDimSize
[
0
]
=
2
;
tDimSize
[
1
]
=
2
;
int
tUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
tOrder
;
i
++
)
tUnitNum
*=
tDimSize
[
i
];
DTYPE
sData1
[
2
][
3
]
=
{
{
1.0
F
,
2.0
F
,
3.0
F
},
{
-
4.0
F
,
5.0
F
,
6.0
F
}
};
DTYPE
sData2
[
3
][
2
]
=
{
{
0.0
F
,
-
1.0
F
},
{
1.0
F
,
2.0
F
},
{
2.0
F
,
1.0
F
}
};
DTYPE
answer
[
2
][
2
]
=
{
{
8.0
F
,
6.0
F
},
{
17.0
F
,
20.0
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
sGPU1
=
NewTensor
(
sOrder1
,
sDimSize1
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
sGPU2
=
NewTensor
(
sOrder2
,
sDimSize2
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
tGPU
=
NewTensor
(
tOrder
,
tDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
tUserGPU
;
/* create float16 tensors */
XTensor
halfSGPU1
;
XTensor
halfSGPU2
;
XTensor
halfTGPU
;
XTensor
halfTUserGPU
;
/* Initialize variables */
sGPU1
->
SetData
(
sData1
,
sUnitNum1
);
sGPU2
->
SetData
(
sData2
,
sUnitNum2
);
tGPU
->
SetZeroAll
();
/* convert data type from float to float16 */
halfSGPU1
=
ConvertDataType
(
*
sGPU1
,
X_FLOAT16
);
halfSGPU2
=
ConvertDataType
(
*
sGPU2
,
X_FLOAT16
);
halfTGPU
=
ConvertDataType
(
*
tGPU
,
X_FLOAT16
);
/* call MatrixMul function */
_MatrixMul
(
&
halfSGPU1
,
X_NOTRANS
,
&
halfSGPU2
,
X_NOTRANS
,
&
halfTGPU
);
halfTUserGPU
=
MatrixMul
(
halfSGPU1
,
X_NOTRANS
,
halfSGPU2
,
X_NOTRANS
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
halfTGPU
,
tGPU
);
tUserGPU
=
ConvertDataType
(
halfTUserGPU
,
X_FLOAT
);
/* check results */
gpuTest
=
tGPU
->
CheckData
(
answer
,
tUnitNum
)
&&
tUserGPU
.
CheckData
(
answer
,
tUnitNum
);
/* destroy variables */
delete
sGPU1
;
delete
sGPU2
;
delete
tGPU
;
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/*
case 6: float16 matrix multiplication.
In this case, float16 a=(2, 3), float16 b=(3, 2) -> float32 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool
TestMatrixMul6
()
{
/* a source tensor of size (2, 3) */
int
sOrder1
=
2
;
int
*
sDimSize1
=
new
int
[
sOrder1
];
sDimSize1
[
0
]
=
2
;
sDimSize1
[
1
]
=
3
;
int
sUnitNum1
=
1
;
for
(
int
i
=
0
;
i
<
sOrder1
;
i
++
)
sUnitNum1
*=
sDimSize1
[
i
];
/* a source tensor of size (3, 2) */
int
sOrder2
=
2
;
int
*
sDimSize2
=
new
int
[
sOrder2
];
sDimSize2
[
0
]
=
3
;
sDimSize2
[
1
]
=
2
;
int
sUnitNum2
=
1
;
for
(
int
i
=
0
;
i
<
sOrder2
;
i
++
)
sUnitNum2
*=
sDimSize2
[
i
];
/* a target tensor of size (2, 2) */
int
tOrder
=
2
;
int
*
tDimSize
=
new
int
[
tOrder
];
tDimSize
[
0
]
=
2
;
tDimSize
[
1
]
=
2
;
int
tUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
tOrder
;
i
++
)
tUnitNum
*=
tDimSize
[
i
];
DTYPE
sData1
[
2
][
3
]
=
{
{
1.0
F
,
2.0
F
,
3.0
F
},
{
-
4.0
F
,
5.0
F
,
6.0
F
}
};
DTYPE
sData2
[
3
][
2
]
=
{
{
0.0
F
,
-
1.0
F
},
{
1.0
F
,
2.0
F
},
{
2.0
F
,
1.0
F
}
};
DTYPE
answer
[
2
][
2
]
=
{
{
8.0
F
,
6.0
F
},
{
17.0
F
,
20.0
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
sGPU1
=
NewTensor
(
sOrder1
,
sDimSize1
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
sGPU2
=
NewTensor
(
sOrder2
,
sDimSize2
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
tGPU
=
NewTensor
(
tOrder
,
tDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
tUserGPU
;
/* create float16 tensors */
XTensor
halfSGPU1
;
XTensor
halfSGPU2
;
/* Initialize variables */
sGPU1
->
SetData
(
sData1
,
sUnitNum1
);
sGPU2
->
SetData
(
sData2
,
sUnitNum2
);
tGPU
->
SetZeroAll
();
/* convert data type from float to float16 */
halfSGPU1
=
ConvertDataType
(
*
sGPU1
,
X_FLOAT16
);
halfSGPU2
=
ConvertDataType
(
*
sGPU2
,
X_FLOAT16
);
/* call MatrixMul function */
_MatrixMul
(
&
halfSGPU1
,
X_NOTRANS
,
&
halfSGPU2
,
X_NOTRANS
,
tGPU
);
tUserGPU
=
MatrixMul
(
halfSGPU1
,
X_NOTRANS
,
halfSGPU2
,
X_NOTRANS
,
X_FLOAT
);
/* check results */
gpuTest
=
tGPU
->
CheckData
(
answer
,
tUnitNum
)
&&
tUserGPU
.
CheckData
(
answer
,
tUnitNum
);
/* destroy variables */
delete
sGPU1
;
delete
sGPU2
;
delete
tGPU
;
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/*
case 7: int8 matrix multiplication.
In this case, int8 a=(2, 3), int8 b=(3, 2) -> float32 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool
TestMatrixMul7
()
{
/* a source tensor of size (2, 3) */
int
sOrder1
=
2
;
int
*
sDimSize1
=
new
int
[
sOrder1
];
sDimSize1
[
0
]
=
2
;
sDimSize1
[
1
]
=
3
;
int
sUnitNum1
=
1
;
for
(
int
i
=
0
;
i
<
sOrder1
;
i
++
)
sUnitNum1
*=
sDimSize1
[
i
];
/* a source tensor of size (3, 2) */
int
sOrder2
=
2
;
int
*
sDimSize2
=
new
int
[
sOrder2
];
sDimSize2
[
0
]
=
3
;
sDimSize2
[
1
]
=
2
;
int
sUnitNum2
=
1
;
for
(
int
i
=
0
;
i
<
sOrder2
;
i
++
)
sUnitNum2
*=
sDimSize2
[
i
];
/* a target tensor of size (2, 2) */
int
tOrder
=
2
;
int
*
tDimSize
=
new
int
[
tOrder
];
tDimSize
[
0
]
=
2
;
tDimSize
[
1
]
=
2
;
int
tUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
tOrder
;
i
++
)
tUnitNum
*=
tDimSize
[
i
];
DTYPE
sData1
[
2
][
3
]
=
{
{
1
,
2
,
3
},
{
-
4
,
5
,
6
}
};
DTYPE
sData2
[
3
][
2
]
=
{
{
0
,
-
1
},
{
1
,
2
},
{
2
,
1
}
};
DTYPE
answer
[
2
][
2
]
=
{
{
8
,
6
},
{
17
,
20
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
sGPU1
=
NewTensor
(
sOrder1
,
sDimSize1
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
sGPU2
=
NewTensor
(
sOrder2
,
sDimSize2
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
tGPU
=
NewTensor
(
tOrder
,
tDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
tUserGPU
;
/* create int8 tensors */
XTensor
int8SGPU1
;
XTensor
int8SGPU2
;
/* Initialize variables */
sGPU1
->
SetData
(
sData1
,
sUnitNum1
);
sGPU2
->
SetData
(
sData2
,
sUnitNum2
);
tGPU
->
SetZeroAll
();
/* convert data type from float to int8 */
int8SGPU1
=
ConvertDataType
(
*
sGPU1
,
X_INT8
);
int8SGPU2
=
ConvertDataType
(
*
sGPU2
,
X_INT8
);
/* call MatrixMul function */
_MatrixMul
(
&
int8SGPU1
,
X_NOTRANS
,
&
int8SGPU2
,
X_NOTRANS
,
tGPU
);
tUserGPU
=
MatrixMul
(
int8SGPU1
,
X_NOTRANS
,
int8SGPU2
,
X_NOTRANS
,
X_FLOAT
);
/* check results */
gpuTest
=
tGPU
->
CheckData
(
answer
,
tUnitNum
)
&&
tUserGPU
.
CheckData
(
answer
,
tUnitNum
);
/* destroy variables */
delete
sGPU1
;
delete
sGPU2
;
delete
tGPU
;
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/*
case 8: int8 matrix multiplication.
In this case, int8 a=(2, 3), int8 b=(3, 2) -> int32 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool
TestMatrixMul8
()
{
/* a source tensor of size (2, 3) */
int
sOrder1
=
2
;
int
*
sDimSize1
=
new
int
[
sOrder1
];
sDimSize1
[
0
]
=
2
;
sDimSize1
[
1
]
=
3
;
int
sUnitNum1
=
1
;
for
(
int
i
=
0
;
i
<
sOrder1
;
i
++
)
sUnitNum1
*=
sDimSize1
[
i
];
/* a source tensor of size (3, 2) */
int
sOrder2
=
2
;
int
*
sDimSize2
=
new
int
[
sOrder2
];
sDimSize2
[
0
]
=
3
;
sDimSize2
[
1
]
=
2
;
int
sUnitNum2
=
1
;
for
(
int
i
=
0
;
i
<
sOrder2
;
i
++
)
sUnitNum2
*=
sDimSize2
[
i
];
/* a target tensor of size (2, 2) */
int
tOrder
=
2
;
int
*
tDimSize
=
new
int
[
tOrder
];
tDimSize
[
0
]
=
2
;
tDimSize
[
1
]
=
2
;
int
tUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
tOrder
;
i
++
)
tUnitNum
*=
tDimSize
[
i
];
DTYPE
sData1
[
2
][
3
]
=
{
{
1
,
2
,
3
},
{
-
4
,
5
,
6
}
};
DTYPE
sData2
[
3
][
2
]
=
{
{
0
,
-
1
},
{
1
,
2
},
{
2
,
1
}
};
DTYPE
answer
[
2
][
2
]
=
{
{
8
,
6
},
{
17
,
20
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
sGPU1
=
NewTensor
(
sOrder1
,
sDimSize1
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
sGPU2
=
NewTensor
(
sOrder2
,
sDimSize2
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
tGPU
=
NewTensor
(
tOrder
,
tDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
intTGPU
=
NewTensor
(
tOrder
,
tDimSize
,
X_INT
,
1.0
F
,
0
);
XTensor
tUserGPU
;
XTensor
intTUserGPU
;
/* create int8 tensors */
XTensor
int8SGPU1
;
XTensor
int8SGPU2
;
/* Initialize variables */
sGPU1
->
SetData
(
sData1
,
sUnitNum1
);
sGPU2
->
SetData
(
sData2
,
sUnitNum2
);
tGPU
->
SetZeroAll
();
/* convert data type from float to int8 */
int8SGPU1
=
ConvertDataType
(
*
sGPU1
,
X_INT8
);
int8SGPU2
=
ConvertDataType
(
*
sGPU2
,
X_INT8
);
/* call MatrixMul function */
_MatrixMul
(
&
int8SGPU1
,
X_NOTRANS
,
&
int8SGPU2
,
X_NOTRANS
,
intTGPU
);
intTUserGPU
=
MatrixMul
(
int8SGPU1
,
X_NOTRANS
,
int8SGPU2
,
X_NOTRANS
,
X_INT
);
/* convert data type from int to float32 */
_ConvertDataType
(
intTGPU
,
tGPU
);
tUserGPU
=
ConvertDataType
(
intTUserGPU
,
X_FLOAT
);
/* check results */
gpuTest
=
tGPU
->
CheckData
(
answer
,
tUnitNum
)
&&
tUserGPU
.
CheckData
(
answer
,
tUnitNum
);
/* destroy variables */
delete
sGPU1
;
delete
sGPU2
;
delete
tGPU
;
delete
intTGPU
;
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
sDimSize1
;
delete
[]
sDimSize2
;
delete
[]
tDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/* other cases */
/*
...
...
@@ -960,42 +556,6 @@ bool TestMatrixMul()
else
XPRINT
(
0
,
stdout
,
">> case 4 passed!
\n
"
);
/* case 5 test */
caseFlag
=
TestMatrixMul5
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 5 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 5 passed!
\n
"
);
/* case 6 test */
caseFlag
=
TestMatrixMul6
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 6 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 6 passed!
\n
"
);
/* case 7 test */
caseFlag
=
TestMatrixMul7
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 7 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 7 passed!
\n
"
);
/* case 8 test */
caseFlag
=
TestMatrixMul8
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 8 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 8 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TMatrixMul.h
查看文件 @
7da1bec1
...
...
@@ -23,7 +23,6 @@
#define __TEST_MATRIXMUL_H__
#include "../core/arithmetic/MatrixMul.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/test/TMultiplyDim.cpp
查看文件 @
7da1bec1
...
...
@@ -22,6 +22,7 @@
#include "TMultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../XTensor.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/*
...
...
@@ -248,6 +249,203 @@ bool TestMultiplyDim2()
#endif // USE_CUDA
}
/*
case 3: float16 tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
In this case, (2, 4) * (2) = (2, 4), n = 0.
*/
bool
TestMultiplyDim3
()
{
/* a tensor of size (2, 4) */
int
aOrder
=
2
;
int
*
aDimSize
=
new
int
[
aOrder
];
aDimSize
[
0
]
=
2
;
aDimSize
[
1
]
=
4
;
int
aUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
aOrder
;
i
++
)
aUnitNum
*=
aDimSize
[
i
];
/* a tensor of size (2) */
int
bOrder
=
1
;
int
*
bDimSize
=
new
int
[
bOrder
];
bDimSize
[
0
]
=
2
;
int
bUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
bOrder
;
i
++
)
bUnitNum
*=
bDimSize
[
i
];
DTYPE
aData
[
2
][
4
]
=
{
{
0.0
F
,
1.0
F
,
2.0
F
,
3.0
F
},
{
4.0
F
,
5.0
F
,
6.0
F
,
7.0
F
}
};
DTYPE
bData
[
2
]
=
{
1.0
F
,
-
1.0
F
};
DTYPE
answer
[
2
][
4
]
=
{
{
0.0
F
,
1.0
F
,
2.0
F
,
3.0
F
},
{
-
4.0
F
,
-
5.0
F
,
-
6.0
F
,
-
7.0
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
aGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
bGPU
=
NewTensor
(
bOrder
,
bDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
cGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
cMeGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
cUserGPU
;
/* create float16 tensor */
XTensor
aHalfGPU
;
XTensor
bHalfGPU
;
XTensor
cHalfGPU
;
XTensor
cMeHalfGPU
;
XTensor
cUserHalfGPU
;
/* Initialize variables */
aGPU
->
SetData
(
aData
,
aUnitNum
);
cMeGPU
->
SetData
(
aData
,
aUnitNum
);
bGPU
->
SetData
(
bData
,
bUnitNum
);
cGPU
->
SetZeroAll
();
/* convert data type from float to float16 */
aHalfGPU
=
ConvertDataType
(
*
aGPU
,
X_FLOAT16
);
bHalfGPU
=
ConvertDataType
(
*
bGPU
,
X_FLOAT16
);
cHalfGPU
=
ConvertDataType
(
*
cGPU
,
X_FLOAT16
);
cMeHalfGPU
=
ConvertDataType
(
*
cMeGPU
,
X_FLOAT16
);
/* call multiplydim function */
_MultiplyDim
(
&
aHalfGPU
,
&
bHalfGPU
,
&
cHalfGPU
,
0
);
_MultiplyDimMe
(
&
cMeHalfGPU
,
&
bHalfGPU
,
0
);
cUserHalfGPU
=
MultiplyDim
(
aHalfGPU
,
bHalfGPU
,
0
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
cHalfGPU
,
cGPU
);
_ConvertDataType
(
&
cMeHalfGPU
,
cMeGPU
);
cUserGPU
=
ConvertDataType
(
cUserHalfGPU
,
X_FLOAT
);
/* check results */
gpuTest
=
cGPU
->
CheckData
(
answer
,
aUnitNum
)
&&
cMeGPU
->
CheckData
(
answer
,
aUnitNum
)
&&
cUserGPU
.
CheckData
(
answer
,
aUnitNum
);
/* destroy variables */
delete
aGPU
;
delete
bGPU
;
delete
cGPU
;
delete
cMeGPU
;
delete
[]
aDimSize
;
delete
[]
bDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
aDimSize
;
delete
[]
bDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/*
case 4: flaot16 tensor multiplication c = a*b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting.
In this case, (2, 4) * (4) = (2, 4), n = 1.
*/
bool
TestMultiplyDim4
()
{
/* a tensor of size (2, 4) */
int
aOrder
=
2
;
int
*
aDimSize
=
new
int
[
aOrder
];
aDimSize
[
0
]
=
2
;
aDimSize
[
1
]
=
4
;
int
aUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
aOrder
;
i
++
)
aUnitNum
*=
aDimSize
[
i
];
/* a tensor of size (4) */
int
bOrder
=
1
;
int
*
bDimSize
=
new
int
[
bOrder
];
bDimSize
[
0
]
=
4
;
int
bUnitNum
=
1
;
for
(
int
i
=
0
;
i
<
bOrder
;
i
++
)
bUnitNum
*=
bDimSize
[
i
];
DTYPE
aData
[
2
][
4
]
=
{
{
0.0
F
,
1.0
F
,
2.0
F
,
3.0
F
},
{
4.0
F
,
5.0
F
,
6.0
F
,
7.0
F
}
};
DTYPE
bData
[
4
]
=
{
1.0
F
,
-
1.0
F
,
1.0
F
,
-
1.0
F
};
DTYPE
answer
[
2
][
4
]
=
{
{
0.0
F
,
-
1.0
F
,
2.0
F
,
-
3.0
F
},
{
4.0
F
,
-
5.0
F
,
6.0
F
,
-
7.0
F
}
};
/* CPU test */
bool
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
bool
gpuTest
=
true
;
/* create tensor */
XTensor
*
aGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
bGPU
=
NewTensor
(
bOrder
,
bDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
cGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
*
cMeGPU
=
NewTensor
(
aOrder
,
aDimSize
,
X_FLOAT
,
1.0
F
,
0
);
XTensor
cUserGPU
;
/* create float16 tensor */
XTensor
aHalfGPU
;
XTensor
bHalfGPU
;
XTensor
cHalfGPU
;
XTensor
cMeHalfGPU
;
XTensor
cUserHalfGPU
;
/* Initialize variables */
aGPU
->
SetData
(
aData
,
aUnitNum
);
cMeGPU
->
SetData
(
aData
,
aUnitNum
);
bGPU
->
SetData
(
bData
,
bUnitNum
);
cGPU
->
SetZeroAll
();
/* convert data type from float to float16 */
aHalfGPU
=
ConvertDataType
(
*
aGPU
,
X_FLOAT16
);
bHalfGPU
=
ConvertDataType
(
*
bGPU
,
X_FLOAT16
);
cHalfGPU
=
ConvertDataType
(
*
cGPU
,
X_FLOAT16
);
cMeHalfGPU
=
ConvertDataType
(
*
cMeGPU
,
X_FLOAT16
);
/* call multiplydim function */
_MultiplyDim
(
&
aHalfGPU
,
&
bHalfGPU
,
&
cHalfGPU
,
1
);
_MultiplyDimMe
(
&
cMeHalfGPU
,
&
bHalfGPU
,
1
);
cUserHalfGPU
=
MultiplyDim
(
aHalfGPU
,
bHalfGPU
,
1
);
/* convert data type from float16 to float */
_ConvertDataType
(
&
cHalfGPU
,
cGPU
);
_ConvertDataType
(
&
cMeHalfGPU
,
cMeGPU
);
cUserGPU
=
ConvertDataType
(
cUserHalfGPU
,
X_FLOAT
);
/* check results */
gpuTest
=
cGPU
->
CheckData
(
answer
,
aUnitNum
)
&&
cMeGPU
->
CheckData
(
answer
,
aUnitNum
)
&&
cUserGPU
.
CheckData
(
answer
,
aUnitNum
);
/* destroy variables */
delete
aGPU
;
delete
bGPU
;
delete
cGPU
;
delete
cMeGPU
;
delete
[]
aDimSize
;
delete
[]
bDimSize
;
return
cpuTest
&&
gpuTest
;
#else
/* destroy variables */
delete
[]
aDimSize
;
delete
[]
bDimSize
;
return
cpuTest
;
#endif // USE_CUDA
}
/* test for MultiplyDim Function */
bool
TestMultiplyDim
()
{
...
...
@@ -272,6 +470,24 @@ bool TestMultiplyDim()
else
XPRINT
(
0
,
stdout
,
">> case 2 passed!
\n
"
);
/* case 3 test */
caseFlag
=
TestMultiplyDim3
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 3 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 3 passed!
\n
"
);
/* case 4 test */
caseFlag
=
TestMultiplyDim4
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 4 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 4 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TNegate.cpp
查看文件 @
7da1bec1
...
...
@@ -20,6 +20,7 @@
*/
#include "TNegate.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
@@ -191,6 +192,84 @@ bool TestNegate2()
#endif // USE_CUDA
}
/* case 3: float16 negate — set every entry to its minus value (b = -a) */
bool TestNegate3()
{
    /* source tensor of shape (3, 2) */
    const int aOrder = 2;
    int * aDimSize = new int[aOrder];
    aDimSize[0] = 3;
    aDimSize[1] = 2;
    int aUnitNum = aDimSize[0] * aDimSize[1];

    DTYPE aData[3][2] = { { 1.0F, -2.0F},
                          {-3.0F,  4.0F},
                          { 5.0F, -6.0F} };
    DTYPE answer[3][2] = { {-1.0F,  2.0F},
                           { 3.0F, -4.0F},
                           {-5.0F,  6.0F} };

    /* CPU side: nothing is exercised for the float16 path */
    bool cpuTest = true;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* float32 tensors on device 0 */
    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor bUserGPU;

    /* float16 counterparts */
    XTensor halfA;
    XTensor halfB;
    XTensor halfAMe;
    XTensor halfBUser;

    /* load the inputs */
    aGPU->SetData(aData, aUnitNum);
    aMeGPU->SetData(aData, aUnitNum);

    /* float -> float16 */
    halfA = ConvertDataType(*aGPU, X_FLOAT16);
    halfAMe = ConvertDataType(*aMeGPU, X_FLOAT16);
    halfB = ConvertDataType(*bGPU, X_FLOAT16);

    /* exercise all three negate interfaces in half precision */
    _Negate(&halfA, &halfB);
    _NegateMe(&halfAMe);
    halfBUser = Negate(halfA);

    /* float16 -> float for checking */
    _ConvertDataType(&halfB, bGPU);
    _ConvertDataType(&halfAMe, aMeGPU);
    bUserGPU = ConvertDataType(halfBUser, X_FLOAT);

    /* compare against the expected values (loose tolerance for fp16) */
    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

    /* clean up */
    delete aGPU;
    delete bGPU;
    delete aMeGPU;
    delete[] aDimSize;

    return cpuTest && gpuTest;
#else
    /* clean up */
    delete[] aDimSize;

    return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -222,6 +301,16 @@ bool TestNegate()
else
XPRINT
(
0
,
stdout
,
">> case 2 passed!
\n
"
);
/* case 3 test */
caseFlag
=
TestNegate3
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 3 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 3 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TReduceMax.cpp
查看文件 @
7da1bec1
...
...
@@ -17,11 +17,9 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
@@ -88,8 +86,8 @@ bool TestReduceMax1()
tUser2
=
ReduceMax
(
*
s
,
1
);
/* check results */
cpuTest
=
t1
->
CheckData
(
answer1
,
tUnitNum1
)
&&
tUser1
.
CheckData
(
answer1
,
tUnitNum1
)
&&
t2
->
CheckData
(
answer2
,
tUnitNum2
)
&&
tUser2
.
CheckData
(
answer2
,
tUnitNum2
);
cpuTest
=
t1
->
CheckData
(
answer1
,
tUnitNum1
)
&&
tUser1
.
CheckData
(
answer1
,
tUnitNum1
)
&&
t2
->
CheckData
(
answer2
,
tUnitNum2
)
&&
tUser2
.
CheckData
(
answer2
,
tUnitNum2
);
#ifdef USE_CUDA
/* GPU test */
...
...
@@ -114,8 +112,8 @@ bool TestReduceMax1()
tUserGPU2
=
ReduceMax
(
*
sGPU
,
1
);
/* check results */
gpuTest
=
tGPU1
->
CheckData
(
answer1
,
tUnitNum1
)
&&
tUserGPU1
.
CheckData
(
answer1
,
tUnitNum1
)
&&
tGPU2
->
CheckData
(
answer2
,
tUnitNum2
)
&&
tUserGPU2
.
CheckData
(
answer2
,
tUnitNum2
);
gpuTest
=
tGPU1
->
CheckData
(
answer1
,
tUnitNum1
)
&&
tUserGPU1
.
CheckData
(
answer1
,
tUnitNum1
)
&&
tGPU2
->
CheckData
(
answer2
,
tUnitNum2
)
&&
tUserGPU2
.
CheckData
(
answer2
,
tUnitNum2
);
/* destroy variables */
delete
s
;
...
...
@@ -142,113 +140,6 @@ bool TestReduceMax1()
#endif // USE_CUDA
}
/*
case 2: float16 ReduceMax — take the max of the items along a dimension.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
bool TestReduceMax2()
{
    /* input tensor of shape (2, 4) */
    const int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 2;
    sDimSize[1] = 4;
    int sUnitNum = sDimSize[0] * sDimSize[1];

    /* output tensor of shape (4) for dim = 0 */
    const int tOrder1 = 1;
    int * tDimSize1 = new int[tOrder1];
    tDimSize1[0] = 4;
    int tUnitNum1 = tDimSize1[0];

    /* output tensor of shape (2) for dim = 1 */
    const int tOrder2 = 1;
    int * tDimSize2 = new int[tOrder2];
    tDimSize2[0] = 2;
    int tUnitNum2 = tDimSize2[0];

    DTYPE sData[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
                          {4.0F, 1.0F, 6.0F, 7.0F} };
    DTYPE answer1[4] = {4.0F, 5.0F, 6.0F, 7.0F};
    DTYPE answer2[2] = {5.0F, 7.0F};

    /* CPU side: nothing is exercised for the float16 path */
    bool cpuTest = true;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* float32 tensors on device 0 */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
    XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU1;
    XTensor tUserGPU2;

    /* float16 counterparts */
    XTensor halfS;
    XTensor halfT1;
    XTensor halfT2;
    XTensor halfTUser1;
    XTensor halfTUser2;

    /* load the input, zero the outputs */
    sGPU->SetData(sData, sUnitNum);
    tGPU1->SetZeroAll();
    tGPU2->SetZeroAll();

    /* float -> float16 */
    halfS = ConvertDataType(*sGPU, X_FLOAT16);
    halfT1 = ConvertDataType(*tGPU1, X_FLOAT16);
    halfT2 = ConvertDataType(*tGPU2, X_FLOAT16);

    /* reduce along both dimensions through both interfaces */
    _ReduceMax(&halfS, &halfT1, 0);
    _ReduceMax(&halfS, &halfT2, 1);
    halfTUser1 = ReduceMax(halfS, 0);
    halfTUser2 = ReduceMax(halfS, 1);

    /* float16 -> float for checking */
    _ConvertDataType(&halfT1, tGPU1);
    _ConvertDataType(&halfT2, tGPU2);
    tUserGPU1 = ConvertDataType(halfTUser1, X_FLOAT);
    tUserGPU2 = ConvertDataType(halfTUser2, X_FLOAT);

    /* compare against the expected values */
    gpuTest = tGPU1->CheckData(answer1, tUnitNum1) &&
              tUserGPU1.CheckData(answer1, tUnitNum1) &&
              tGPU2->CheckData(answer2, tUnitNum2) &&
              tUserGPU2.CheckData(answer2, tUnitNum2);

    /* clean up */
    delete sGPU;
    delete tGPU1;
    delete tGPU2;
    delete[] sDimSize;
    delete[] tDimSize1;
    delete[] tDimSize2;

    return cpuTest && gpuTest;
#else
    /* clean up */
    delete[] sDimSize;
    delete[] tDimSize1;
    delete[] tDimSize2;

    return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -269,15 +160,6 @@ bool TestReduceMax()
else
XPRINT
(
0
,
stdout
,
">> case 1 passed!
\n
"
);
/* case 2 test */
caseFlag
=
TestReduceMax2
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 2 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 2 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TReduceSum.cpp
查看文件 @
7da1bec1
...
...
@@ -17,12 +17,10 @@
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TReduceSum.h"
#include "../core/getandset/SetData.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
@@ -516,6 +514,7 @@ bool TestReduceSum5()
#endif // USE_CUDA
}
/*
case 6: test ReduceSum function.
Sum the items along a dimension of the tensor.
...
...
@@ -608,126 +607,6 @@ bool TestReduceSum6()
}
/*
case 7: float16 ReduceSum — sum the items along a dimension of the tensor
(with a zero shift tensor on the user-level interface).
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
bool TestReduceSum7()
{
    /* input tensor of shape (2, 4) */
    const int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 2;
    sDimSize[1] = 4;
    int sUnitNum = sDimSize[0] * sDimSize[1];

    /* output tensor of shape (4) for dim = 0 */
    const int tOrder1 = 1;
    int * tDimSize1 = new int[tOrder1];
    tDimSize1[0] = 4;
    int tUnitNum1 = tDimSize1[0];

    /* output tensor of shape (2) for dim = 1 */
    const int tOrder2 = 1;
    int * tDimSize2 = new int[tOrder2];
    tDimSize2[0] = 2;
    int tUnitNum2 = tDimSize2[0];

    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                          {4.0F, 5.0F, 6.0F, 7.0F} };
    DTYPE answer1[4] = {4.0F, 6.0F, 8.0F, 10.0F};
    DTYPE answer2[2] = {6.0F, 22.0F};

    /* CPU side: nothing is exercised for the float16 path */
    bool cpuTest = true;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* float32 tensors on device 0 */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * shiftGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
    XTensor * shiftGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
    XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
    XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU1;
    XTensor tUserGPU2;

    /* float16 counterparts */
    XTensor halfS;
    XTensor halfShift1;
    XTensor halfShift2;
    XTensor halfT1;
    XTensor halfT2;
    XTensor halfTUser1;
    XTensor halfTUser2;

    /* load the input, zero the shifts and the outputs */
    sGPU->SetData(sData, sUnitNum);
    shiftGPU1->SetZeroAll();
    shiftGPU2->SetZeroAll();
    tGPU1->SetZeroAll();
    tGPU2->SetZeroAll();

    /* float -> float16 */
    halfS = ConvertDataType(*sGPU, X_FLOAT16);
    halfShift1 = ConvertDataType(*shiftGPU1, X_FLOAT16);
    halfShift2 = ConvertDataType(*shiftGPU2, X_FLOAT16);
    halfT1 = ConvertDataType(*tGPU1, X_FLOAT16);
    halfT2 = ConvertDataType(*tGPU2, X_FLOAT16);

    /* sum along both dimensions through both interfaces */
    _ReduceSum(&halfS, &halfT1, 0);
    _ReduceSum(&halfS, &halfT2, 1);
    halfTUser1 = ReduceSum(halfS, 0, halfShift1);
    halfTUser2 = ReduceSum(halfS, 1, halfShift2);

    /* float16 -> float for checking */
    _ConvertDataType(&halfT1, tGPU1);
    _ConvertDataType(&halfT2, tGPU2);
    tUserGPU1 = ConvertDataType(halfTUser1, X_FLOAT);
    tUserGPU2 = ConvertDataType(halfTUser2, X_FLOAT);

    /* compare against the expected values */
    gpuTest = tGPU1->CheckData(answer1, tUnitNum1) &&
              tUserGPU1.CheckData(answer1, tUnitNum1) &&
              tGPU2->CheckData(answer2, tUnitNum2) &&
              tUserGPU2.CheckData(answer2, tUnitNum2);

    /* clean up */
    delete sGPU;
    delete shiftGPU1;
    delete shiftGPU2;
    delete tGPU1;
    delete tGPU2;
    delete[] sDimSize;
    delete[] tDimSize1;
    delete[] tDimSize2;

    return cpuTest && gpuTest;
#else
    /* clean up */
    delete[] sDimSize;
    delete[] tDimSize1;
    delete[] tDimSize2;

    return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -793,15 +672,6 @@ bool TestReduceSum()
else
XPRINT
(
0
,
stdout
,
">> case 6 passed!
\n
"
);
/* case 7 test */
caseFlag
=
TestReduceSum7
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 7 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 7 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/TScaleAndShift.cpp
查看文件 @
7da1bec1
...
...
@@ -20,6 +20,7 @@
*/
#include "TScaleAndShift.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
@@ -113,6 +114,252 @@ bool TestScaleAndShift1()
#endif // USE_CUDA
}
/*
case 2: float16 scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift2()
{
    /* input tensor of shape (2, 4) */
    const int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 2;
    sDimSize[1] = 4;
    int sUnitNum = sDimSize[0] * sDimSize[1];

    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                          {4.0F, 5.0F, 6.0F, 7.0F} };
    DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
                           {8.5F, 10.5F, 12.5F, 14.5F} };

    DTYPE scaleFactor = 2.0F;
    DTYPE shiftFactor = 0.5F;

    /* CPU side: nothing is exercised for the float16 path */
    bool cpuTest = true;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* float32 tensors on device 0 */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* float16 counterparts */
    XTensor halfS;
    XTensor halfT;
    XTensor halfTMe;
    XTensor halfTUser;

    /* load the inputs */
    sGPU->SetData(sData, sUnitNum);
    tMeGPU->SetData(sData, sUnitNum);

    /* float -> float16 */
    halfS = ConvertDataType(*sGPU, X_FLOAT16);
    halfTMe = ConvertDataType(*tMeGPU, X_FLOAT16);
    halfT = ConvertDataType(*tGPU, X_FLOAT16);

    /* exercise all three scale-and-shift interfaces in half precision */
    _ScaleAndShift(&halfS, &halfT, scaleFactor, shiftFactor);
    _ScaleAndShiftMe(&halfTMe, scaleFactor, shiftFactor);
    halfTUser = ScaleAndShift(halfS, scaleFactor, shiftFactor);

    /* float16 -> float for checking */
    _ConvertDataType(&halfT, tGPU);
    _ConvertDataType(&halfTMe, tMeGPU);
    tUserGPU = ConvertDataType(halfTUser, X_FLOAT);

    /* compare against the expected values */
    gpuTest = tGPU->CheckData(answer, sUnitNum) &&
              tMeGPU->CheckData(answer, sUnitNum) &&
              tUserGPU.CheckData(answer, sUnitNum);

    /* clean up */
    delete sGPU;
    delete tGPU;
    delete tMeGPU;
    delete[] sDimSize;

    return cpuTest && gpuTest;
#else
    /* clean up */
    delete[] sDimSize;

    return cpuTest;
#endif // USE_CUDA
}
/*
case 3: int32 scale and shift all tensor entries.
p = p * scale + shift
The factors are given as floats (scale = 2.0F, shift = 1.8F); the
expected answers (2 * p + 1) show the shift being truncated when
applied to the int32 tensor.
*/
bool TestScaleAndShift3()
{
    /* a input tensor of size (2, 4) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 2;
    sDimSize[1] = 4;

    int sUnitNum = 1;
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                          {4.0F, 5.0F, 6.0F, 7.0F} };
    DTYPE answer[2][4] = { {1.0F, 3.0F, 5.0F, 7.0F},
                           {9.0F, 11.0F, 13.0F, 15.0F} };

    DTYPE scaleFactor = 2.0F;
    DTYPE shiftFactor = 1.8F;

    /* CPU test */
    bool cpuTest = true;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* create int32 tensors */
    XTensor sInt32GPU;
    XTensor tInt32GPU;
    XTensor tMeInt32GPU;
    XTensor tUserInt32GPU;

    /* initialize variables */
    sGPU->SetData(sData, sUnitNum);
    tMeGPU->SetData(sData, sUnitNum);

    /* convert data type from float to int32.
       fix: tGPU must be dereferenced like the other conversions
       (the old code passed the pointer itself to ConvertDataType,
       unlike every sibling call and the int8 case below) */
    sInt32GPU = ConvertDataType(*sGPU, X_INT);
    tMeInt32GPU = ConvertDataType(*tMeGPU, X_INT);
    tInt32GPU = ConvertDataType(*tGPU, X_INT);

    /* call scaleandshift function through all three interfaces */
    _ScaleAndShift(&sInt32GPU, &tInt32GPU, scaleFactor, shiftFactor);
    _ScaleAndShiftMe(&tMeInt32GPU, scaleFactor, shiftFactor);
    tUserInt32GPU = ScaleAndShift(sInt32GPU, scaleFactor, shiftFactor);

    /* convert data type from int32 to float */
    _ConvertDataType(&tInt32GPU, tGPU);
    _ConvertDataType(&tMeInt32GPU, tMeGPU);
    tUserGPU = ConvertDataType(tUserInt32GPU, X_FLOAT);

    /* check results */
    gpuTest = tGPU->CheckData(answer, sUnitNum) &&
              tMeGPU->CheckData(answer, sUnitNum) &&
              tUserGPU.CheckData(answer, sUnitNum);

    /* destroy variables */
    delete sGPU;
    delete tGPU;
    delete tMeGPU;
    delete[] sDimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete[] sDimSize;

    return cpuTest;
#endif // USE_CUDA
}
/*
case 4: int8 scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift4()
{
    /* input tensor of shape (2, 4) */
    const int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 2;
    sDimSize[1] = 4;
    int sUnitNum = sDimSize[0] * sDimSize[1];

    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                          {4.0F, 5.0F, 6.0F, 7.0F} };
    DTYPE answer[2][4] = { {1.0F, 3.0F, 5.0F, 7.0F},
                           {9.0F, 11.0F, 13.0F, 15.0F} };

    DTYPE scaleFactor = 2.0F;
    DTYPE shiftFactor = 1.8F;

    /* CPU side: nothing is exercised for the int8 path */
    bool cpuTest = true;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* float32 tensors on device 0 */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* int8 counterparts */
    XTensor int8S;
    XTensor int8T;
    XTensor int8TMe;
    XTensor int8TUser;

    /* load the inputs */
    sGPU->SetData(sData, sUnitNum);
    tMeGPU->SetData(sData, sUnitNum);

    /* float -> int8 */
    int8S = ConvertDataType(*sGPU, X_INT8);
    int8TMe = ConvertDataType(*tMeGPU, X_INT8);
    int8T = ConvertDataType(*tGPU, X_INT8);

    /* exercise all three scale-and-shift interfaces on int8 data */
    _ScaleAndShift(&int8S, &int8T, scaleFactor, shiftFactor);
    _ScaleAndShiftMe(&int8TMe, scaleFactor, shiftFactor);
    int8TUser = ScaleAndShift(int8S, scaleFactor, shiftFactor);

    /* int8 -> float for checking */
    _ConvertDataType(&int8T, tGPU);
    _ConvertDataType(&int8TMe, tMeGPU);
    tUserGPU = ConvertDataType(int8TUser, X_FLOAT);

    /* compare against the expected values */
    gpuTest = tGPU->CheckData(answer, sUnitNum) &&
              tMeGPU->CheckData(answer, sUnitNum) &&
              tUserGPU.CheckData(answer, sUnitNum);

    /* clean up */
    delete sGPU;
    delete tGPU;
    delete tMeGPU;
    delete[] sDimSize;

    return cpuTest && gpuTest;
#else
    /* clean up */
    delete[] sDimSize;

    return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
...
@@ -133,6 +380,33 @@ bool TestScaleAndShift()
else
XPRINT
(
0
,
stdout
,
">> case 1 passed!
\n
"
);
/* case 2 test */
caseFlag
=
TestScaleAndShift2
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 2 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 2 passed!
\n
"
);
/* case 3 test */
caseFlag
=
TestScaleAndShift3
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 3 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 3 passed!
\n
"
);
/* case 4 test */
caseFlag
=
TestScaleAndShift4
();
if
(
!
caseFlag
)
{
returnFlag
=
false
;
XPRINT
(
0
,
stdout
,
">> case 4 failed!
\n
"
);
}
else
XPRINT
(
0
,
stdout
,
">> case 4 passed!
\n
"
);
/* other cases test */
/*
TODO!!
...
...
source/tensor/test/Test.cpp
查看文件 @
7da1bec1
...
...
@@ -29,66 +29,66 @@ bool Test()
bool
wrong
=
false
;
XPRINT
(
0
,
stdout
,
"Testing the XTensor utilites ...
\n\n
"
);
//
wrong = !TestAbsolute() || wrong;
//
wrong = !TestAbsolute() || wrong;
//wrong = !TestClip() || wrong;
//
wrong = !TestCompare() || wrong;
//
wrong = !TestConcatenate() || wrong;
//
wrong = !TestConcatenateSolely() || wrong;
//
wrong = !TestCos() || wrong;
//
wrong = !TestConvertDataType() || wrong;
//
wrong = !TestCopyIndexed() || wrong;
//
wrong = !TestCopyValues() || wrong;
//
wrong = !TestDiv() || wrong;
//
wrong = !TestDivDim() || wrong;
//
wrong = !TestExp() || wrong;
//
wrong = !TestGather() || wrong;
//
wrong = !TestLog() || wrong;
//
wrong = !TestMatrixMul() || wrong;
//
wrong = !TestMatrixMul2D() || wrong;
//
wrong = !TestMatrixMul2DParallel() || wrong;
//
wrong = !TestMatrixMulBatched() || wrong;
//
wrong = !TestMerge() || wrong;
//
wrong = !TestMultiply() || wrong;
//
wrong = !TestCompare() || wrong;
//
wrong = !TestConcatenate() || wrong;
//
wrong = !TestConcatenateSolely() || wrong;
//
wrong = !TestCos() || wrong;
//
wrong = !TestConvertDataType() || wrong;
//
wrong = !TestCopyIndexed() || wrong;
//
wrong = !TestCopyValues() || wrong;
//
wrong = !TestDiv() || wrong;
//
wrong = !TestDivDim() || wrong;
//
wrong = !TestExp() || wrong;
//
wrong = !TestGather() || wrong;
//
wrong = !TestLog() || wrong;
//
wrong = !TestMatrixMul() || wrong;
//
wrong = !TestMatrixMul2D() || wrong;
//
wrong = !TestMatrixMul2DParallel() || wrong;
//
wrong = !TestMatrixMulBatched() || wrong;
//
wrong = !TestMerge() || wrong;
//
wrong = !TestMultiply() || wrong;
//wrong = !TestMultiplyDim() || wrong;
//wrong = !TestNegate() || wrong;
//
wrong = !TestNormalize() || wrong;
//
wrong = !TestPower() || wrong;
//
wrong = !TestReduceMax() || wrong;
//
wrong = !TestReduceMean() || wrong;
//
wrong = !TestReduceSum() || wrong;
//
wrong = !TestReduceSumAll() || wrong;
//
wrong = !TestReduceSumSquared() || wrong;
//
wrong = !TestReduceVariance() || wrong;
//
wrong = !TestRound() || wrong;
//
wrong = !TestScaleAndShift() || wrong;
//
wrong = !TestSelect() || wrong;
//
wrong = !TestSetAscendingOrder() || wrong;
//
wrong = !TestSetData() || wrong;
//
wrong = !TestSign() || wrong;
//
wrong = !TestSin() || wrong;
//
wrong = !TestSort() || wrong;
//
wrong = !TestSplit() || wrong;
//
wrong = !TestSpread() || wrong;
//
wrong = !TestSub() || wrong;
wrong
=
!
TestSum
()
||
wrong
;
//
wrong = !TestSumByColumnTV() || wrong;
//
wrong = !TestSumByColumnVT() || wrong;
//
wrong = !TestSumDim() || wrong;
//
wrong = !TestTan() || wrong;
//
wrong = !TestTranspose() || wrong;
//wrong = !TestTopK() || wrong;
//
wrong = !TestUnsqueeze() || wrong;
//
wrong = !TestXMem() || wrong;
//
wrong = !TestCrossEntropy() || wrong;
//wrong = !TestDropout() || wrong;
//
wrong = !TestHardTanH() || wrong;
//
wrong = !TestIdentity() || wrong;
wrong
=
!
TestLogSoftmax
()
||
wrong
;
//
wrong = !TestLoss() || wrong;
//
wrong = !TestRectify() || wrong;
//
wrong = !TestSigmoid() || wrong;
//
wrong = !TestSoftmax() || wrong;
//
wrong = !TestNormalize() || wrong;
//
wrong = !TestPower() || wrong;
//
wrong = !TestReduceMax() || wrong;
//
wrong = !TestReduceMean() || wrong;
//
wrong = !TestReduceSum() || wrong;
//
wrong = !TestReduceSumAll() || wrong;
//
wrong = !TestReduceSumSquared() || wrong;
//
wrong = !TestReduceVariance() || wrong;
//
wrong = !TestRound() || wrong;
wrong
=
!
TestScaleAndShift
()
||
wrong
;
//
wrong = !TestSelect() || wrong;
//
wrong = !TestSetAscendingOrder() || wrong;
//
wrong = !TestSetData() || wrong;
//
wrong = !TestSign() || wrong;
//
wrong = !TestSin() || wrong;
//
wrong = !TestSort() || wrong;
//
wrong = !TestSplit() || wrong;
//
wrong = !TestSpread() || wrong;
//
wrong = !TestSub() || wrong;
//
wrong = !TestSum() || wrong;
//
wrong = !TestSumByColumnTV() || wrong;
//
wrong = !TestSumByColumnVT() || wrong;
//
wrong = !TestSumDim() || wrong;
//
wrong = !TestTan() || wrong;
//
wrong = !TestTranspose() || wrong;
//
//wrong = !TestTopK() || wrong;
//
wrong = !TestUnsqueeze() || wrong;
//
wrong = !TestXMem() || wrong;
//
//
wrong = !TestCrossEntropy() || wrong;
//
//
wrong = !TestDropout() || wrong;
//
wrong = !TestHardTanH() || wrong;
//
wrong = !TestIdentity() || wrong;
//
wrong = !TestLogSoftmax() || wrong;
//
wrong = !TestLoss() || wrong;
//
wrong = !TestRectify() || wrong;
//
wrong = !TestSigmoid() || wrong;
//
wrong = !TestSoftmax() || wrong;
/* other test */
/*
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论