Emmay / NiuTrans.Tensor · Commits

Commit 44bf9fa6, authored Jan 21, 2019 by linye

    Merged in the latest code and fixed some bugs.

parent c9c53870
Showing 63 changed files, with 1812 additions and 1244 deletions.
source/network/Main.cpp  +77 -157
source/network/XBackwardLoss.cpp  +1 -3
source/network/XBackwardMath.cpp  +66 -31
source/network/XBackwardMath.h  +10 -0
source/network/XNet.cpp  +1 -2
source/sample/fnnlm/FNNLM.cpp  +14 -6
source/sample/transformer/T2TAttention.cpp  +144 -16
source/sample/transformer/T2TAttention.h  +2 -1
source/sample/transformer/T2TDecoder.cpp  +34 -5
source/sample/transformer/T2TDecoder.h  +48 -1
source/sample/transformer/T2TEmbedding.cpp  +7 -2
source/sample/transformer/T2TEmbedding.h  +1 -1
source/sample/transformer/T2TEncoder.cpp  +2 -1
source/sample/transformer/T2TModel.cpp  +15 -10
source/sample/transformer/T2TOutput.cpp  +1 -1
source/sample/transformer/T2TTrainer.cpp  +203 -46
source/sample/transformer/T2TTrainer.h  +33 -2
source/sample/transformer/Transformer.cpp  +1 -0
source/tensor/XDevice.cpp  +9 -0
source/tensor/XDevice.h  +6 -0
source/tensor/XMem.cpp  +17 -0
source/tensor/XMem.h  +4 -0
source/tensor/XName.cpp  +4 -0
source/tensor/XName.h  +4 -2
source/tensor/XTensor.cpp  +1 -5
source/tensor/core/arithmetic/Div.cpp  +0 -2
source/tensor/core/arithmetic/DivDim.cu  +179 -54
source/tensor/core/arithmetic/MatrixMul.cpp  +4 -213
source/tensor/core/arithmetic/MatrixMul.h  +0 -9
source/tensor/core/arithmetic/MatrixMul2D.cpp  +1 -13
source/tensor/core/arithmetic/MatrixMul2D.cu  +0 -12
source/tensor/core/arithmetic/MatrixMul2DParallel.cpp  +0 -38
source/tensor/core/arithmetic/MatrixMulBatched.cpp  +0 -139
source/tensor/core/arithmetic/MatrixMulBatched.h  +0 -8
source/tensor/core/arithmetic/Multiply.cpp  +0 -2
source/tensor/core/arithmetic/MultiplyDim.cpp  +146 -6
source/tensor/core/arithmetic/MultiplyDim.cu  +3 -3
source/tensor/core/arithmetic/MultiplyDim.h  +9 -2
source/tensor/core/arithmetic/SubDim.cu  +159 -42
source/tensor/core/arithmetic/Sum.cu  +25 -11
source/tensor/core/arithmetic/SumDim.cpp  +146 -1
source/tensor/core/arithmetic/SumDim.cu  +162 -51
source/tensor/core/arithmetic/SumDim.cuh  +2 -0
source/tensor/core/arithmetic/SumDim.h  +10 -0
source/tensor/core/arithmetic/XTensorBLAS.cu  +15 -79
source/tensor/core/arithmetic/XTensorBLAS.h  +0 -2
source/tensor/core/getandset/SetData.cpp  +33 -0
source/tensor/core/getandset/SetData.cu  +102 -12
source/tensor/core/getandset/SetData.cuh  +4 -0
source/tensor/core/getandset/SetData.h  +4 -0
source/tensor/core/math/Power.cpp  +0 -2
source/tensor/core/movement/Gather.cpp  +0 -0
source/tensor/core/movement/Spread.cpp  +0 -0
source/tensor/core/movement/Spread.cu  +0 -0
source/tensor/core/reduce/ReduceMax.cpp  +0 -1
source/tensor/core/reduce/ReduceMax.cu  +16 -26
source/tensor/core/reduce/ReduceSum.cu  +14 -31
source/tensor/function/Dropout.cpp  +72 -53
source/tensor/function/Dropout.h  +1 -4
source/tensor/function/LogSoftmax.cpp  +0 -122
source/tensor/function/LogSoftmax.cu  +0 -1
source/tensor/function/Softmax.cpp  +0 -1
source/tensor/function/Softmax.cu  +0 -12
source/network/Main.cpp

@@ -66,6 +66,7 @@ void PowerFP16Test();
void ClipFP16Test();
void GatherFP16Test();
void SetDataGPUFP16Test();
void SumIntTest();

using namespace nts;
using namespace fnnlm;
...
@@ -89,8 +90,6 @@ int main( int argc, const char ** argv )
    //return 0;
    //ConvertBackwardTest();
    //return 0;
    //DropoutFP16Test();
    //return 0;
    //UnsqueezeFP16Test();
    //return 0;
    //ReduceMaxFP16Test();
...
@@ -143,11 +142,17 @@ int main( int argc, const char ** argv )
    //InitCPUFP16Test();
    //return 0;

    SetDataGPUFP16Test();
    return 0;

    //MycublasGemmExTest();
    //return 0;

    MycublasGemmExTest();
    return 0;

    //SumIntTest();
    //return 0;

    //DropoutFP16Test();
    //return 0;

    //SetDataGPUFP16Test();
    //return 0;

    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
...
@@ -205,27 +210,69 @@ void MycublasGemmExTest() {
    c1.Dump(stderr, "c1:");
}

void SumIntTest()
{
    XTensor a;
    XTensor b;
    XTensor c;
    XTensor inta;
    XTensor intb;
    XTensor intc;

    InitTensor2D(&a, 2, 2, X_FLOAT, 0);
    InitTensor2D(&b, 2, 2, X_FLOAT, 0);

    a.SetDataRand(-5.0, 5.0);
    b.SetDataRand(-5.0, 5.0);

    a.Dump(stderr, "a:");
    b.Dump(stderr, "b:");

    inta = ConvertDataType(a, X_INT);
    intb = ConvertDataType(b, X_INT);

    inta.Dump(stderr, "inta:");
    intb.Dump(stderr, "intb:");

    intc = Sum(inta, intb);

    intc.Dump(stderr, "intc:");
}

void SetDataGPUFP16Test()
{
    srand(time(NULL));

    XTensor a1;
    InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
    _SetDataRand(&a1, -5.0F, 5.0F);
    a1.Dump(&a1, stderr, "a:\n");

    /*XTensor m;
    InitTensor2D(&m, 2, 2, X_FLOAT, 0);
    m.SetDataRand(0.0, 10.0);*/

    XTensor a;
    InitTensor2D(&a, 2, 2, X_FLOAT16, 0);

    XTensor * m = NewTensor2D(2, 2, X_FLOAT, 0);
    m->SetDataRand(0.0, 10.0);

    _SetDataRand(&a, -5.0F, 5.0F);

    //XTensor a1;
    //InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
    //_CopyValues(&m, &a1);
    //_SetDataRand(&a1, -1.0F, 1.0F);
    //a1.Dump(&a1, stderr, "a:\n");

    a.Dump(&a, stderr, "a:\n");

    /*XTensor a;
    InitTensor2D(&a, 2, 2, X_FLOAT16, 0);*/

    XTensor b;
    InitTensor2D(&b, 2, 2, X_FLOAT, 0);

    XTensor * a = NewTensor2D(2, 2, X_FLOAT16, 0);

    b = ConvertDataType(a, X_FLOAT);

    _ConvertDataType(m, a);
    a->Dump(a, stderr, "a:\n");
    _SetDataRand(a, 0.0F, 1.0F);
    a->Dump(a, stderr, "a:\n");

    //XTensor b;
    //InitTensor2D(&b, 2, 2, X_FLOAT, 0);
    //b = ConvertDataType(a, X_FLOAT);
    //b.Dump(stderr, "b:\n");

    b.Dump(stderr, "b:\n");
}

void ClipFP16Test()
{
...
@@ -447,7 +494,7 @@ void FloatToInt8Test() {
    InitTensor2D(&a, 2, 2, X_FLOAT, 0);
    InitTensor2D(&b, 2, 2, X_INT8, 0);

    a.SetDataRand(-5.0F, 5.0F);
    a.SetDataRand(5.0F, 5.0F);

    a.Dump(stderr, "a:");

    b = ConvertDataType(a, X_INT8);
...
@@ -741,8 +788,8 @@ void MultiplyDimFP16Test()
    halfA = ConvertDataType(a, X_FLOAT16);
    halfB = ConvertDataType(b, X_FLOAT16);

    c1 = MultiplyDim(a1, b1, 1, 0);
    halfC = MultiplyDim(halfA, halfB, 1, 0);
    c1 = MultiplyDim(a1, b1, 1);
    halfC = MultiplyDim(halfA, halfB, 1);

    c = ConvertDataType(halfC, X_FLOAT);
...
@@ -950,26 +997,26 @@ void SubFP16Test()
void DropoutFP16Test()
{
    srand(time(NULL));

    XTensor a;
    XTensor b;
    XTensor b1;
    XTensor halfA;
    XTensor halfB;

    InitTensor2D(&a, 10, 10, X_FLOAT, 0);
    InitTensor2D(&a, 10, 1, X_FLOAT, 0);

    a.SetDataRand(-5.0F, 5.0F);

    /*a.Dump(stderr, "a:");*/
    a.Dump(stderr, "a:");

    halfA = ConvertDataType(a, X_FLOAT16);

    halfB = Dropout(halfA, 0.5);
    b1 = Dropout(a, 0.3);
    b = ConvertDataType(halfB, X_FLOAT);
    halfB = Dropout(halfA, 0.2);
    b1 = Dropout(a, 0.2);

    b.Dump(stderr, "b:");
    //b1.Dump(stderr, "b1:");
    halfB.Dump(&halfB, stderr, "halfB:");
    b1.Dump(&b1, stderr, "b1:");
}

void ConvertBackwardTest()
...
@@ -1070,131 +1117,3 @@ void ConvertTest()
        a1.Dump(stderr, "halfa:");
    }
}
\ No newline at end of file

void MatrixMulFloat16AndFloatTest()
{
    XTensor a;
    XTensor b;
    XTensor c;

    InitTensor2D(&a, 5000, 5000, X_FLOAT, 0);
    InitTensor2D(&b, 5000, 5000, X_FLOAT, 0);
    InitTensor2D(&c, 5000, 5000, X_FLOAT, 0);

    a.SetDataRand(-10.0F, 10.0F);
    b.SetDataRand(-10.0F, 10.0F);

    int recurrentNum = 10000;

    double startT1 = GetClockSec();
    for (int i1 = 0; i1 < recurrentNum; i1++) {
        c = MatrixMul(&a, &b);
    }
    printf("ElapsedFloat32 = %.2f s\n", GetClockSec() - startT1);

    double startT2 = GetClockSec();
    for (int i2 = 0; i2 < recurrentNum; i2++) {
        c = MatrixMulFloat16(&a, &b);
    }
    printf("ElapsedFloat16 = %.2f s\n", GetClockSec() - startT2);
}

void MatrixMul2DFloat16Test()
{
    XTensor a;
    XTensor b;
    XTensor c;
    XTensor a00;
    XTensor b00;
    XTensor c00;
    XTensor c01;
    XTensor halfa;
    XTensor halfb;
    XTensor halfc;

    InitTensor3D(&a, 3, 2, 3, X_FLOAT, 0);
    InitTensor2D(&b, 3, 2, X_FLOAT, 0);
    InitTensor3D(&c, 3, 2, 2, X_FLOAT, 0);
    InitTensor3D(&a00, 3, 2, 3, X_FLOAT, 0);
    InitTensor2D(&b00, 3, 2, X_FLOAT, 0);
    InitTensor3D(&c00, 3, 2, 2, X_FLOAT, 0);
    InitTensor3D(&c01, 3, 2, 2, X_FLOAT, 0);
    InitTensor3D(&halfa, 3, 2, 3, X_FLOAT16, 0);
    InitTensor2D(&halfb, 3, 2, X_FLOAT16, 0);
    InitTensor3D(&halfc, 3, 2, 2, X_FLOAT16, 0);

    DTYPE aData[3][2][3] = { { {0.02121212144F, -1.0234556667F, 2.04354565678F},
                               {2.0234567332F, -1.0213469654F, -3.01568321F} },
                             { {-1.022347899421F, 2.012589653664F, 4.035346643F},
                               {3.01234544634F, 1.0324354635F, 2.0546578332F} },
                             { {-1.0235743446F, 3.0335753334F, 2.0653323234F},
                               {1.03235643232F, -1.023463345542F, 0.0335563322F} } };
    DTYPE bData[3][2] = { {-1.034466323232F, -2.0546676442F},
                          {-3.0224354656F, 4.034467866532F},
                          {5.02354657442F, -6.0324355767443F} };

    a.SetData(aData, 18);
    b.SetData(bData, 6);

    _MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);

    _ConvertDataType(&a, &halfa);
    _ConvertDataType(&b, &halfb);
    _MatrixMul(&halfa, X_NOTRANS, &halfb, X_NOTRANS, &halfc);
    _ConvertDataType(&halfc, &c01);

    _ConvertDataType(&halfa, &a00);
    _ConvertDataType(&halfb, &b00);
    _MatrixMul(&a00, X_NOTRANS, &b00, X_NOTRANS, &c00);

    c.Dump(stderr, "c:");
    c01.Dump(stderr, "c01:");
    c00.Dump(stderr, "c0:");

    XTensor a1;
    XTensor b1;
    XTensor c1;
    XTensor a10;
    XTensor b10;
    XTensor c10;
    XTensor c11;
    XTensor halfa1;
    XTensor halfb1;
    XTensor halfc1;

    InitTensor2D(&a1, 3, 3, X_FLOAT, 0);
    InitTensor2D(&b1, 3, 2, X_FLOAT, 0);
    InitTensor2D(&c1, 3, 2, X_FLOAT, 0);
    InitTensor2D(&a10, 3, 3, X_FLOAT, 0);
    InitTensor2D(&b10, 3, 2, X_FLOAT, 0);
    InitTensor2D(&c10, 3, 2, X_FLOAT, 0);
    InitTensor2D(&c11, 3, 2, X_FLOAT, 0);
    InitTensor2D(&halfa1, 3, 3, X_FLOAT16, 0);
    InitTensor2D(&halfb1, 3, 2, X_FLOAT16, 0);
    InitTensor2D(&halfc1, 3, 2, X_FLOAT16, 0);

    DTYPE a1Data[3][3] = { {0.02121212144F, -1.0234556667F, 2.043541565678F},
                           {-2.0234567332F, 1.0213469657774F, -3.0156837543321F},
                           {1.022347899421F, -2.012589653664F, 4.03534634643F} };
    DTYPE b1Data[3][2] = { {1.034466323232F, -2.0546676442F},
                           {3.0224354656F, -4.034467866532F},
                           {5.02354657442F, 6.0324355767443F} };

    a1.SetData(a1Data, 9);
    b1.SetData(b1Data, 6);

    _MatrixMul(&a1, X_NOTRANS, &b1, X_NOTRANS, &c1);

    _ConvertDataType(&a1, &halfa1);
    _ConvertDataType(&b1, &halfb1);
    _MatrixMul(&halfa1, X_NOTRANS, &halfb1, X_NOTRANS, &halfc1);
    _ConvertDataType(&halfc1, &c11);

    _ConvertDataType(&halfa1, &a10);
    _ConvertDataType(&halfb1, &b10);
    _MatrixMul(&a10, X_NOTRANS, &b10, X_NOTRANS, &c10);

    c1.Dump(stderr, "c1:");
    c11.Dump(stderr, "c11:");
    c10.Dump(stderr, "c10:");
}
source/network/XBackwardLoss.cpp

@@ -87,8 +87,6 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
                        XTensor * dedy, XTensor * padding,
                        LOSS_FUNCTION_NAME lossName)
{
    //return;
    if(gold == NULL){
        if(dedy->dataType == X_FLOAT)
            _SetDataFixedFloat(dedy, 1.0F);
...
@@ -97,7 +95,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
        else if(dedy->dataType == X_INT)
            _SetDataFixedInt(dedy, 1);
        else{
            //ShowNTErrors("TODO");
            ShowNTErrors("TODO");
        }
        return;
    }
...
source/network/XBackwardMath.cpp

@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradMultiply(node, isEfficient);
    else if(operID == MATH_MULTIPLYDIM)
        GradMultiplyDim(node, isEfficient);
    else if(operID == MATH_MULTIPLYBROADCAST)
        GradMultiplyBroadcast(node, isEfficient);
    else if(operID == MATH_NEGATE)
        GradNegate(node, isEfficient);
    else if(operID == MATH_NORMALIZE)
...
@@ -87,6 +89,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradSum(node, isEfficient);
    else if(operID == MATH_SUMDIM)
        GradSumDim(node, isEfficient);
    else if(operID == MATH_SUMBROADCAST)
        GradSumBroadcast(node, isEfficient);
    else if(operID == REDUCE_REDUCEMEAN)
        GradReduceMean(node, isEfficient);
    else if(operID == REDUCE_REDUCESUM)
...
@@ -736,10 +740,6 @@ dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
{
    /* XTensor node1;
    node1 = ConvertDataType(*node, X_FLOAT);
    node1.Dump(stderr, "node:");*/

    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
...
@@ -752,14 +752,6 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
    /* dE/da */
    _MultiplyDim(node->grad, b, a->grad, n, 1.0F);

    //XTensor a1;
    //a1 = ConvertDataType(*a, X_FLOAT);
    //a1.Dump(stderr, "a:");
    //XTensor b1;
    //b1 = ConvertDataType(*b, X_FLOAT);
    //b1.Dump(stderr, "b:");

    /* dE/db */
    int order = a->order;
    int dimSize[MAX_TENSOR_DIM_NUM];
...
@@ -777,19 +769,10 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
       size of b. Then we can reduce the matrix into a row vector. */
    bGradTMP->Reshape(2, reshapedSize);

    /*XTensor bGradTMP1;
    bGradTMP1 = ConvertDataType(*bGradTMP, X_FLOAT);
    bGradTMP1.Dump(stderr, "bGradTMP:");*/

    //if(b->outgo.tailNum > 1){
    XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
    _ReduceSum(bGradTMP, bGradTMP2, 0);

    /* XTensor bGradTMP21;
    bGradTMP21 = ConvertDataType(*bGradTMP2, X_FLOAT);
    bGradTMP21.Dump(stderr, "bGradTMP2:");*/

    _Sum(b->grad, bGradTMP2, b->grad);
    DelTensorBuf(bGradTMP2);
...
@@ -832,21 +815,43 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
        DelTensorBuf(interGrad);
    }

    //printf("\n");
    //XTensor a2;
    //a2 = ConvertDataType(*a, X_FLOAT);
    //a2.Dump(stderr, "a2:");
    //XTensor b2;
    //b2 = ConvertDataType(*b, X_FLOAT);
    //b2.Dump(stderr, "b2:");

    DelTensorBuf(bGradTMP);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for multiplication by broadcasting:
c = a * b
where some dimensions of b are of size 1
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0...n)
where a.reduce(0...n) is the reduction along the dimension
whose size is 1 in b. Note that there might be several reductions.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
    DTYPE beta = income.GetParam(0);

    XNoder::MakeGrad(a);

    _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);

    if(b->isVar || b->income.tailNum > 0){
        ShowNTErrors("TODO");
    }
}

/*
gradient for negate
for
c = -a
...
@@ -1020,7 +1025,6 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
    _Sum(a->grad, node->grad, a->grad, scale);

    node->visitMark = NODE_FINISHED;
}
...
@@ -1285,6 +1289,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
}

/*
gradient for sum by broadcasting:
c = a + b * \beta
where some dimensions of b are of size 1
dE/da = dE/dc
dE/db = dE/dc * a.reduce(0..n) * \beta
where a.reduce(0..n) is the reduction along the dimension
whose size is 1 in b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
    DTYPE beta = income.GetParam(0);

    XNoder::MakeGrad(a);

    _Sum(a->grad, node->grad, a->grad);

    if(b->isVar || b->income.tailNum > 0){
        ShowNTErrors("TODO");
    }
}

/*
gradient for reduceMean
for
c = reduceMean(a, dim)
...
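A note for readers of the two doc comments above (this restatement is mine, not part of the commit): writing S for the set of axes on which b has size 1, the broadcast rules being implemented are

\begin{aligned}
c = a \odot b:&\quad \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c} \odot b, \qquad \frac{\partial E}{\partial b} = \sum_{i \in S}\Big(\frac{\partial E}{\partial c} \odot a\Big)\\
c = a + \beta\, b:&\quad \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c}, \qquad \frac{\partial E}{\partial b} = \beta \sum_{i \in S}\frac{\partial E}{\partial c}
\end{aligned}

Both new functions currently accumulate only the dE/da term and raise ShowNTErrors("TODO") whenever b itself needs a gradient, which matches the dE/db reductions above being left unimplemented here.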
source/network/XBackwardMath.h

@@ -109,6 +109,11 @@ private:
    static
    void GradMultiplyDim(XTensor * node, bool isEfficient);

    /* gradient for multiply one dimension: c = a * b
       where some dimensions of b are of size 1 */
    static
    void GradMultiplyBroadcast(XTensor * node, bool isEfficient);

    /* gradient for negate */
    static
    void GradNegate(XTensor * node, bool isEfficient);
...
@@ -143,6 +148,11 @@ private:
    static
    void GradSumDim(XTensor * node, bool isEfficient);

    /* gradient for sum by broadcasting: c = a + b * \beta
       where some dimensions of b are of size 1 */
    static
    void GradSumBroadcast(XTensor * node, bool isEfficient);

    /* gradient for reduceMean */
    static
    void GradReduceMean(XTensor * node, bool isEfficient);
...
source/network/XNet.cpp

@@ -27,7 +27,6 @@
#include "XBackwardFunc.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"

namespace nts{
...
@@ -266,7 +265,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
        XMathGrad::MakeGrad(node, isEfficent);
    else if(XFuncGrad::IsFunc(node))
        XFuncGrad::MakeGrad(node, isEfficent);
    else if(XDataGrad::IsDataOP(node))
    else if(XDataGrad::IsDataOP(node))
        XDataGrad::MakeGrad(node, isEfficent);
    else if(XShapeGrad::IsShapeOP(node))
        XShapeGrad::MakeGrad(node, isEfficent);
...
source/sample/fnnlm/FNNLM.cpp

@@ -839,6 +839,9 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
        InitModelTensor2D(s, batchSize, model.vSize, model);
        InitModelTensor2D(y, batchSize, model.vSize, model);

        ///* s = h_last * w */
        //_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);

        XTensor h_last1;
        h_last1 = ScaleAndShift(h_last, 100, 0);
...
@@ -852,20 +855,25 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
        int8W = ConvertDataType(w1, X_INT8);

        XTensor s1;
        InitTensor2D(&s1, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);
        InitTensor2D(&s1, batchSize, model.vSize, X_INT, model.devID, model.mem);

        _MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);

        s = ScaleAndShift(s1, 0.0001, 0);

        XTensor b2D;
        InitTensor(&b2D, &s);
        InitTensor2D(&b2D, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);

        _Unsqueeze(&b, &b2D, 0, batchSize);
        _Sum(&s, &b2D, &s);

        b2D = ScaleAndShift(b2D, 10000, 0);

        XTensor b2D1;
        b2D1 = ConvertDataType(b2D, X_INT);

        _Sum(&s1, &b2D1, &s1);

        s = ConvertDataType(s1, X_FLOAT);
        s = ScaleAndShift(s, 0.0001, 0);

        /* y = softmax(s) */
        _LogSoftmax(&s, &y, 1);
    }
}
...
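The hunk above runs the output projection in integer arithmetic by scaling up before quantization and scaling back down afterwards: h_last (and, in the elided lines, presumably the weight) is multiplied by 100, the bias is multiplied by 10000, and the integer result is finally rescaled by 0.0001 = 1/(100*100). A minimal standalone sketch of that fixed-point arithmetic on scalars (plain C++, no NiuTrans API; all names here are illustrative):

#include <cstdint>
#include <cstdio>

// One cell of the int8 "matmul plus bias" path, mirrored on scalars.
// h and w carry a factor of 100 each after quantization, so their product
// carries 100 * 100 = 10000; the bias is pre-scaled by 10000 so it lives in
// the same scale, and dividing by 10000 (multiplying by 0.0001) recovers the
// float-scale result.
int main()
{
    float h = 0.37f, w = -0.82f, b = 0.05f;

    int8_t  h8 = (int8_t)(h * 100.0f);       // quantized hidden value
    int8_t  w8 = (int8_t)(w * 100.0f);       // quantized weight
    int32_t s  = (int32_t)h8 * (int32_t)w8;  // integer product
    s += (int32_t)(b * 10000.0f);            // bias in the same fixed-point scale

    float sFloat = s * 0.0001f;              // back to float scale
    printf("int8 path: %f   float path: %f\n", sFloat, h * w + b);
    return 0;
}

Running it prints the same value (-0.2534) on both paths, up to quantization error, which is the property the FNNLM forward pass relies on.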
source/sample/transformer/T2TAttention.cpp

@@ -53,6 +53,42 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
//void T2TAttention::InitModel(int argc, char ** argv,
//                             bool myIsMasked, int myIgnored,
//                             int myDevID, XMem * myMem)
//{
//    devID = myDevID;
//    mem = myMem;
//    isMasked = myIsMasked;
//    ignored = myIgnored;
//
//    float minmax = 0;
//
//    LoadParamInt(argc, argv, "nhead", &nhead, 8);
//    LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
//    LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
//    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
//    LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
//    LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
//
//    InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
//    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
//    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
//    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
//    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
//
//    float scale = 1.0F;
//    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
//    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
//    float finfouta = (float)sqrt(6.0F * scale / (d + d));
//    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
//
//    wk.SetDataRand(-finfoutk, finfoutk);
//    wq.SetDataRand(-finfoutk, finfoutk);
//    wv.SetDataRand(-finfoutv, finfoutv);
//    wa.SetDataRand(-finfouta, finfouta);
//    wbig.SetDataRand(-finfoutbig, finfoutbig);
//}

void T2TAttention::InitModel(int argc, char ** argv,
                             bool myIsMasked, int myIgnored,
...
@@ -76,20 +112,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
    InitTensor2D(&wq, d, dk, X_FLOAT16, devID, mem);
    InitTensor2D(&wv, d, dv, X_FLOAT16, devID, mem);
    InitTensor2D(&wa, d, d, X_FLOAT16, devID, mem);
    InitTensor2D(&wbig, d, 3 * d, X_FLOAT16, devID, mem);

    float scale = 1.0F;
    float finfoutk = (float)sqrt(6.0F * scale / (d + dk));
    float finfoutv = (float)sqrt(6.0F * scale / (d + dv));
    float finfouta = (float)sqrt(6.0F * scale / (d + d));
    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3 * d));

    wk.SetDataRand(-finfoutk, finfoutk);
    wq.SetDataRand(-finfoutk, finfoutk);
    wv.SetDataRand(-finfoutv, finfoutv);
    wa.SetDataRand(-finfouta, finfouta);
    //_SetDataRand(&wk, -finfoutk, finfoutk);
    //_SetDataRand(&wq, -finfoutk, finfoutk);
    //_SetDataRand(&wv, -finfoutv, finfoutv);
    //_SetDataRand(&wa, -finfouta, finfouta);
    wbig.SetDataRand(-finfoutbig, finfoutbig);
}

/*
...
@@ -103,40 +138,136 @@ make the network
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
//XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
//{
//    XTensor k2;
//    XTensor q2;
//    XTensor v2;
//
//    if (selfatt){
//
//        XTensor con;
//        XList split;
//
//        con = MMul(k, wbig);
//
//        int d1 = con.GetDim(0);
//        int d2 = con.GetDim(1);
//        int d3 = con.GetDim(2) / 3;
//
//        InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
//        InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
//        InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
//
//        split.Add(&q2);
//        split.Add(&k2);
//        split.Add(&v2);
//
//        Split(con, split, 2, 3);
//    }
//
//    else{
//        /* linear transofmration before self-attention */
//        k2 = MMul(k, wk);
//        q2 = MMul(q, wq);
//        v2 = MMul(v, wv);
//    }
//
//    XTensor kheads;
//    XTensor qheads;
//    XTensor vheads;
//
//    /* multi head */
//    kheads = Split(k2, k2.order - 1, nhead);
//    qheads = Split(q2, q2.order - 1, nhead);
//    vheads = Split(v2, v2.order - 1, nhead);
//
//    XTensor att;
//    XTensor dot;
//    XTensor scalar;
//
//    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
//    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
//
//    if(isMasked)
//        dot = dot + mask;
//
//    dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
//
//    scalar = Softmax(dot, -1);
//
//    if(isTraining && dropoutP > 0)
//        scalar = Dropout(scalar, dropoutP);
//
//    att = BMMul(scalar, vheads);
//
//    /* concatenate the heads */
//    return MMul(Merge(att, att.order - 1), wa);
//}

XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
{
    XTensor halfK;
    XTensor halfK2;
    XTensor halfQ2;
    XTensor halfV2;
    XTensor halfK;

    halfK = ConvertDataType(k, X_FLOAT16);

    if (selfatt) {
        XTensor halfCon;
        XList halfSplit;

        halfCon = MMul(halfK, wbig);

        int d1 = halfCon.GetDim(0);
        int d2 = halfCon.GetDim(1);
        int d3 = halfCon.GetDim(2) / 3;

        InitTensor3D(&halfK2, d1, d2, d3, X_FLOAT16, devID, mem);
        InitTensor3D(&halfQ2, d1, d2, d3, X_FLOAT16, devID, mem);
        InitTensor3D(&halfV2, d1, d2, d3, X_FLOAT16, devID, mem);

        halfSplit.Add(&halfQ2);
        halfSplit.Add(&halfK2);
        halfSplit.Add(&halfV2);

        Split(halfCon, halfSplit, 2, 3);
    }
    else {
        XTensor halfQ;
        XTensor halfV;

        halfQ = ConvertDataType(q, X_FLOAT16);
        halfV = ConvertDataType(v, X_FLOAT16);

        /* linear transofmration before self-attention */
        halfK2 = MMul(halfK, wk);
        halfQ2 = MMul(halfK, wq);
        halfV2 = MMul(halfK, wv);
        halfQ2 = MMul(halfQ, wq);
        halfV2 = MMul(halfV, wv);
    }

    XTensor halfKheads;
    XTensor halfQheads;
    XTensor halfVheads;

    /* multi head */
    halfKheads = Split(halfK2, halfK2.order - 1, nhead);
    halfQheads = Split(halfQ2, halfQ2.order - 1, nhead);
    halfVheads = Split(halfV2, halfV2.order - 1, nhead);

    XTensor halfMask;
    XTensor halfAtt;
    XTensor halfDot;
    XTensor halfScalar;
    XTensor halfAtt;

    halfMask = ConvertDataType(mask, X_FLOAT16);

    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
    halfDot = BMMul(halfQheads, X_NOTRANS, halfKheads, X_TRANS);

    //XTensor halfMask(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);

    if (isMasked) {
        XTensor halfMask;
        halfMask = ConvertDataType(mask, X_FLOAT16);
        halfDot = Sum(halfDot, halfMask);
    }
...
@@ -147,13 +278,10 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
    if(isTraining && dropoutP > 0)
        halfScalar = Dropout(halfScalar, dropoutP);

    /*att = BMMul(scalar, vheads);*/
    halfAtt = BMMul(halfScalar, halfVheads);

    /* concatenate the heads */
    return ConvertDataType(MMul(Merge(halfAtt, halfAtt.order - 1), wa), X_FLOAT);
}

}
source/sample/transformer/T2TAttention.h

@@ -60,6 +60,7 @@ public:
    /* transformation after dot-product attention */
    XTensor wa;

    XTensor wbig;

    /* size of transformed Q and K */
    int dk;
...
@@ -95,7 +96,7 @@ public:
                   int myDevID = -1, XMem * myMem = NULL);

    /* make the network */
    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
};

}
...
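With the new selfatt flag, Make() projects the input once with the packed d x 3d weight wbig and splits the result into three d-wide pieces instead of running three separate k/q/v projections. A standalone sketch of that pack-then-split idea on plain buffers (not NiuTrans code; the slice-to-Q/K/V order mirrors the halfSplit.Add order in T2TAttention.cpp but is otherwise my assumption):

#include <vector>
#include <cstddef>

// Split one packed projection of width 3*d into Q, K, V slices of width d,
// row by row. This is only an illustration of what the fused wbig projection
// buys: one matrix product instead of three, then a cheap split.
struct QKV { std::vector<float> q, k, v; };

QKV SplitPacked(const std::vector<float> & packed, std::size_t rows, std::size_t d)
{
    QKV out;
    out.q.reserve(rows * d); out.k.reserve(rows * d); out.v.reserve(rows * d);
    for (std::size_t r = 0; r < rows; ++r) {
        const float * row = packed.data() + r * 3 * d;
        out.q.insert(out.q.end(), row,         row + d);      // first third
        out.k.insert(out.k.end(), row + d,     row + 2 * d);  // second third
        out.v.insert(out.v.end(), row + 2 * d, row + 3 * d);  // last third
    }
    return out;
}

The decoder passes selfatt = true for self-attention and selfatt = false for the encoder-decoder attention (see T2TDecoder.cpp below), since only the self-attention case has k, q and v coming from the same input.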
source/sample/transformer/T2TDecoder.cpp

@@ -21,6 +21,8 @@
#include <math.h>
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../tensor/core/CHeader.h"

namespace transformer
...
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored,
                           int myDevID, XMem * myMem)
{
    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);

    devID = myDevID;
    mem = myMem;
    ignored = myIgnored;

    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");

    /* embedding model */
    embedder.InitModel(argc, argv, devID, mem, false);

    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
    attLayerNorms = new T2TLN[nlayer];
    fnnLayerNorms = new T2TLN[nlayer];
    attentionsEnde = new T2TAttention[nlayer];
    attEndeLayerNorms = new T2TLN[nlayer];

    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
        attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
    for (int i = 0; i < nlayer; i++) {
        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
        fnns[i].InitModel(argc, argv, myDevID, myMem);
        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
    }
}

/*
...
@@ -93,7 +122,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /******************/
        /* self attention */
        att = attentions[i].Make(x, x, x, mask, isTraining);
        att = attentions[i].Make(x, x, x, mask, isTraining, true);

        /* dropout */
        if(isTraining && dropoutP > 0)
...
@@ -107,7 +136,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /*****************************/
        /* encoder-decoder attention */
        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);

        /* dropout */
        if(isTraining && dropoutP > 0)
...
source/sample/transformer/T2TDecoder.h

@@ -27,9 +27,56 @@
namespace transformer
{

class AttDecoder : public AttEncoder
class AttDecoder
{
public:
    /* device id */
    int devID;

    /* memory pool */
    XMem * mem;

    /* layer number */
    int nlayer;

    /* hidden layer size of the FNN layer */
    int hSize;

    /* embedding size */
    int eSize;

    /* vocabulary size */
    int vSize;

    /* dropout probability */
    DTYPE dropoutP;

    /* some positions can be ignored in attention. this is useful in lm where the first position needs
     * special design for the attention model. */
    int ignored;

    /* embedding of word at each position */
    T2TEmbedder embedder;

    /* FNN model of each layer */
    T2TFNN * fnns;

    /* attention model of each layer */
    T2TAttention * attentions;

    /* layer normalization for fnn */
    T2TLN * fnnLayerNorms;

    /* layer normalization for attention */
    T2TLN * attLayerNorms;

    /* input tensor of the encoder */
    XTensor * input;

    /* output tensor of the encoder */
    XTensor * output;

    /* encoder-decoder attention model of each layer */
    T2TAttention * attentionsEnde;
...
source/sample/transformer/T2TEmbedding.cpp

@@ -48,12 +48,18 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
{
    devID = myDevID;
    mem = myMem;

    if(isEnc){
        LoadParamInt(argc, argv, "vsize", &vSize, -1);
    }
    else{
        LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    }
    //LoadParamInt(argc, argv, "vsize", &vSize, -1);
    LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
...
@@ -110,7 +116,6 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
    delete[] data;
}

/*
make the network
*/
...
source/sample/transformer/T2TEmbedding.h

@@ -71,7 +71,7 @@ public:
    ~T2TEmbedder();

    /* initialize the model */
    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);

    /* make positional embeddings */
    void MakePosEmbedding(int eSize, int d, int length);
...
source/sample/transformer/T2TEncoder.cpp

@@ -116,7 +116,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
        XTensor res;

        /* self attention */
        att = attentions[i].Make(x, x, x, mask, isTraining);
        att = attentions[i].Make(x, x, x, mask, isTraining, true);

        /* dropout */
        if(isTraining && dropoutP > 0)
...
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}

}
source/sample/transformer/T2TModel.cpp

@@ -236,10 +236,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
        XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);

        _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
        _Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
        _Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
        _ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
        _Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
        //_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
        //_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
        _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
        _Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);

        DelTensorBuf(maskEncDecTMPDec);
        DelTensorBuf(maskEncDecTMPEnc);
...
@@ -274,7 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
        _Sum(&maskEnc, padding3, &maskEnc);

        encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
        //encoding.Dump(stderr, "encoding",10);

        decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
        //decoding.Dump(stderr, "decoding", 10);

        outputLayer->Make(decoding, output);

        delete[] dims;
...
@@ -298,9 +301,10 @@ void T2TModel::GetParams(XList &list)
        list.Add(&encoder->fnns[i].b1);
        list.Add(&encoder->fnns[i].w2);
        list.Add(&encoder->fnns[i].b2);
        list.Add(&encoder->attentions[i].wk);
        list.Add(&encoder->attentions[i].wq);
        list.Add(&encoder->attentions[i].wv);
        //list.Add(&encoder->attentions[i].wk);
        //list.Add(&encoder->attentions[i].wq);
        //list.Add(&encoder->attentions[i].wv);
        list.Add(&encoder->attentions[i].wbig);
        list.Add(&encoder->attentions[i].wa);
        list.Add(&encoder->fnnLayerNorms[i].w);
        list.Add(&encoder->fnnLayerNorms[i].b);
...
@@ -322,9 +326,10 @@ void T2TModel::GetParams(XList &list)
        list.Add(&decoder->attentionsEnde[i].wa);
        list.Add(&decoder->attEndeLayerNorms[i].w);
        list.Add(&decoder->attEndeLayerNorms[i].b);
        list.Add(&decoder->attentions[i].wk);
        list.Add(&decoder->attentions[i].wq);
        list.Add(&decoder->attentions[i].wv);
        //list.Add(&decoder->attentions[i].wk);
        //list.Add(&decoder->attentions[i].wq);
        //list.Add(&decoder->attentions[i].wv);
        list.Add(&decoder->attentions[i].wbig);
        list.Add(&decoder->attentions[i].wa);
        list.Add(&decoder->fnnLayerNorms[i].w);
        list.Add(&decoder->fnnLayerNorms[i].b);
...
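On the mask arithmetic used above (my reading, not text from the commit): if p in {0, 1} is the padding indicator built from the unsqueezed encoder/decoder paddings, and _ScaleAndShiftMe(x, scale, shift) computes scale * x + shift, then

mask = 10^{9}\, p - 10^{9} = -10^{9}\,(1 - p)

so positions covered by real tokens (p = 1) contribute 0 to the attention logits, while padded positions (p = 0) contribute -10^9, which drives their softmax weight to effectively zero once the mask is added to the dot products inside the attention.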
source/sample/transformer/T2TOutput.cpp

@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    float minmax = 0;

    LoadParamInt(argc, argv, "vsize", &vSize, -1);
    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
...
source/sample/transformer/T2TTrainer.cpp

@@ -41,12 +41,15 @@ T2TTrainer::T2TTrainer()
    seqLen2 = NULL;
    nseqBuf = 0;
    nextSeq = -1;
    nextBatch = -1;

    argNum = 0;
    argArray = NULL;

    buf = NULL;
    buf2 = NULL;
    bufBatch = NULL;
    bufSize = 0;
    bufBatchSize = 0;
    seqOffset = NULL;
}
...
@@ -55,6 +58,7 @@ T2TTrainer::~T2TTrainer()
{
    delete[] buf;
    delete[] buf2;
    delete[] bufBatch;
    delete[] seqLen;
    delete[] seqLen2;
    delete[] seqOffset;
...
@@ -116,10 +120,12 @@ void T2TTrainer::Init(int argc, char ** argv)
    LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
    LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
    LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
    LoadParamBool(argc, argv, "smallfootprint", &isSmallFootprint, false);
    LoadParamBool(argc, argv, "debug", &isDebugged, false);
    LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false);

    buf = new int[bufSize];
    buf2 = new int[bufSize];
    bufBatch = new BatchNode[bufSize];
    seqLen = new int[bufSize];
    seqLen2 = new int[bufSize];
    seqOffset = new int[bufSize];
...
@@ -165,13 +171,16 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
    XMem * mem = model->mem;
    XNet net;

    if(isSmallFootprint)
        net.SetGradEfficientFlag();
    if(isDebugged)
        net.SetGradEfficientFlag(false);

    PrepareModel(model);

    double startT = GetClockSec();

    FILE * fileen = fopen("enc.txt", "w");
    FILE * filede = fopen("dec.txt", "w");

    for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
        if(isShuffled)
...
@@ -205,6 +214,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
            CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");

            //batchEnc.Dump(stderr, "enc",1);
            //batchDec.Dump(stderr, "dec",1);
            //paddingDec.Dump(stderr, "paddec");

            /* output probabilities */
            XTensor output;
...
@@ -222,17 +235,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
                LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);

            /* make paddings for the output */
            if(output.GetDim(0) > 1)
            if(output.GetDim(0) > 0)
                PadOutput(&output, &gold, &paddingDec);

            /* get probabilities */
            float prob = GetProb(&output, &gold, NULL);
            //printf("%f\n", prob);
            //float prob = 0;

            DTYPE lossLocal = -prob / wc;
            bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);

            XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
            //doUpdate = false;

            if (doUpdate) {
                /* recale the output for normalized loss */
...
@@ -292,6 +306,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
            MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
    }

    fclose(fileen);
    fclose(filede);

    double elapsed = GetClockSec() - startT;

    epoch = MIN(epoch, nepoch);
...
@@ -434,11 +451,13 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
    sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);

    model->Dump(fn);

    if(validFN != NULL){
    if (model->isLM) {
        if (validFN != NULL) {
            T2TTrainer trainer;
            trainer.Init(argNum, argArray);
            trainer.Test(validFN, fn2, model);
        }
    }

    delete[] fn;
    delete[] fn2;
...
@@ -473,7 +492,8 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
    int wordCount = 0;
    while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){
        int len = (int)strlen(line);

        if (line[0] == 'b')
            break;

        while(line[len - 1] == '\r' || line[len - 1] == '\n'){
            line[len - 1] = 0;
            len--;
...
@@ -544,9 +564,14 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
            node.offset = i;
            node.p = buf + offset;
            node.size = 0;
            for(int j = 0; j < step; j++)
            int max = 0;
            for (int j = 0; j < step; j++){
                node.size += seqLen[i + j];
            node.value = seqLen[i];
                max = MAX(max, seqLen[i + j]);
            }
            //node.value = seqLen[i+1];
            //node.value = MAX(seqLen[i+1],seqLen[i]);
            node.value = max;
            count++;
            offset += node.size;
        }
...
@@ -768,6 +793,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
    return sc;
}

int CompareBatchNode(const void * a, const void * b)
{
    return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}

/*
load a batch of sequences (for MT)
>> file - the handle to the data file
...
@@ -797,10 +828,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
                            int devID, XMem * mem,
                            bool isTraining)
{
    if(nextSeq < 0 || nextSeq >= nseqBuf)
    //if (nextSeq < 0 || nextSeq >= nseqBuf)
    //    LoadBuf(file, isSorted, 2);

    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
        LoadBuf(file, isSorted, 2);

    int seq = MAX(nextSeq, 0);
        int seq = 0;
        bufBatchSize = 0;
        nextBatch = 0;

        /* we segment the buffer into batches */
        while (seq < nseqBuf) {
            int wcEnc = 0;
            int wcDec = 0;
            int wnEnc = 0;
...
@@ -809,38 +849,99 @@ int T2TTrainer::LoadBatchMT(FILE * file,
            int maxDec = 0;
            int sc = 0;

            CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");

            while(seq + sc < nseqBuf){
            while (seq + sc < nseqBuf) {
                /* source-side sequence */
                wnEnc = seqLen[seq + sc];

                /* target-side sequence */
                wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;

                int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
                int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;

                if (sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
                    break;

                wcEnc += wnEnc;
                sc += 1;

                if(maxEnc < wnEnc)
                    maxEnc = wnEnc;

                /* target-side sequence */
                int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
                wnDec = len;
                wcDec += wnDec;
                sc += 1;

                if(maxDec < wnDec)
                    maxDec = wnDec;
            }

            BatchNode & batch = bufBatch[bufBatchSize];
            batch.beg = seq;
            batch.end = seq + sc;
            batch.maxEnc = maxEnc;
            batch.maxDec = maxDec;
            batch.key = rand();

            bufBatchSize++;
            seq = seq + sc;
        }

        if (isRandomBatch)
            qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
    }

    /*int seq = MAX(nextSeq, 0);
    int wcEnc = 0;
    int wcDec = 0;
    int wnEnc = 0;
    int wnDec = 0;
    int maxEnc = 0;
    int maxDec = 0;
    int sc = 0;

    CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");

    int tcEnc = isBigBatch ? wcEnc : maxEnc * sc / 2;
    int tcDec = isBigBatch ? wcDec : maxDec * sc / 2;
    if(sc >= sBatch * 2 && (tcEnc >= wBatch || tcDec >= wBatch))

    while(seq + sc < nseqBuf){
        wnEnc = seqLen[seq + sc];
        wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
        int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
        int tcDec = isBigBatch ? (wcDec + wnDec): MAX(maxDec, wnDec) * (sc + 2) / 2;
        if(sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
            break;
        wcEnc += wnEnc;
        sc += 1;
        if(maxEnc < wnEnc)
            maxEnc = wnEnc;
        wcDec += wnDec;
        sc += 1;
        if(maxDec < wnDec)
            maxDec = wnDec;
    }
    nextSeq = seq + sc;
    if(sc <= 0)
        return 0;*/

    if (bufBatchSize <= 0)
        return 0;

    BatchNode & batch = bufBatch[nextBatch++];
    int seq = batch.beg;
    int sc = batch.end - batch.beg;
    int maxEnc = batch.maxEnc;
    int maxDec = batch.maxDec;

    CheckNTErrors(sc % 2 == 0, "The input samples must be paired");

    int sCount = sc / 2;
    int seqSize = 0;

    int dimsDec[3] = {sCount, maxDec, vsDec};
...
@@ -859,13 +960,14 @@ int T2TTrainer::LoadBatchMT(FILE * file,
    int wCountEnc = 0;
    int wCountDec = 0;
    int wCountPad = 0;
    int wGold = 0;
    wCount = 0;

    int * batchEncValues = new int[batchEnc->unitNum];
    int * batchDecValues = new int[batchDec->unitNum];
    //MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
    //MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
    MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
    MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];

    memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
...
@@ -899,7 +1001,10 @@ int T2TTrainer::LoadBatchMT(FILE * file,
            int num = buf[seqOffset[s] + w];
            batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
            //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
            if (w < len - 1){
                paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
                wCount++;
            }
            if (w > 0)
                goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
...
@@ -909,7 +1014,7 @@ int T2TTrainer::LoadBatchMT(FILE * file,
            else
                goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
            }
            wCount++;
            //wCount++;
            wCountDec++;
            if(seqs != NULL)
                seqs[seqSize++] = buf[seqOffset[s] + w];
...
@@ -922,19 +1027,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
    }

    batchDec->SetData(batchDecValues, batchDec->unitNum);
    //paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);

    XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
    _ConvertDataType(batchDec, tmp2);
    _NotEqual(tmp2, paddingDec, 0);
    DelTensorBuf(tmp2);
    //XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
    //_ConvertDataType(batchDec, tmp2);
    //_NotEqual(tmp2, paddingDec, 0);
    //DelTensorBuf(tmp2);

    gold->SetDataBatched(goldOffsets, 1.0F, wGold);

    delete[] batchEncValues;
    delete[] batchDecValues;
    //delete[] paddingEncOffsets;
    //delete[] paddingDecOffsets;
    delete[] paddingDecOffsets;
    delete[] goldOffsets;

    return sc;
...
@@ -1015,20 +1120,20 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
    model->GetParams(ws);

    for(int i = 0; i < ws.count; i++){
    for (int i = 0; i < ws.count; i++) {
        XTensor * para = (XTensor*)ws.Get(i);
        XTensor * paraGrad = para->grad;

        if (para->dataType == X_FLOAT)
        {
        if (paraGrad == NULL)
            continue;

        CheckNTErrors(para != NULL, "NULL parameter tensor!");
        CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");

        if(useAdam){
        if (para->dataType == X_FLOAT) {
            if (useAdam) {
                adamBeta1T *= adamBeta1;
                adamBeta2T *= adamBeta2;
                DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
...
@@ -1070,19 +1175,13 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
            /* clear gradient */
            paraGrad->SetZeroAll();
        }
        else if (para->dataType == X_FLOAT16) {
        else if (para->dataType == X_FLOAT16) {
            if (useAdam) {
                XTensor paraGrad1(paraGrad->order, paraGrad->dimSize, X_FLOAT, paraGrad->denseRatio, paraGrad->devID, paraGrad->mem);
                _ConvertDataType(paraGrad, &paraGrad1);
            if (paraGrad == NULL)
                continue;

            CheckNTErrors(para != NULL, "NULL parameter tensor!");
            CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");

            if (useAdam) {
                adamBeta1T *= adamBeta1;
                adamBeta2T *= adamBeta2;
                DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
...
@@ -1136,9 +1235,67 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
        }
    }
}

//void T2TTrainer::Update(T2TModel * model, const float lr)
//{
//    XList ws(100);
//
//    model->GetParams(ws);
//
//    for(int i = 0; i < ws.count; i++){
//        XTensor * para = (XTensor*)ws.Get(i);
//        XTensor * paraGrad = para->grad;
//
//        if (paraGrad == NULL)
//            continue;
//
//        CheckNTErrors(para != NULL, "NULL parameter tensor!");
//        CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
//
//        if(useAdam){
//            adamBeta1T *= adamBeta1;
//            adamBeta2T *= adamBeta2;
//            DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
//            DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T);
//
//            if (para->dataType == X_FLOAT16) {
//                d = d * 5e4;
//            }
//
//            /* m = beta_1 * m + (1-beta_1) * grad */
//            XTensor * m = (XTensor*)moments.Get(i);
//            _ScaleAndShiftMe(m, adamBeta1, 0);
//            _Sum(m, paraGrad, m, (1.0F - adamBeta1));
//
//            /* v = beta_2 * v + (1-beta_2) * grad * grad*/
//            XTensor * v = (XTensor*)moments2nd.Get(i);
//            _Multiply(paraGrad, paraGrad, v, adamBeta2/(1.0F - adamBeta2));
//            _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
//
//            /* v2 = m / (sqrt(v) + delta) */
//            XTensor * v2 = NewTensorBuf(v, v->devID, v->mem);
//            _Power(v, v2, 0.5F);
//            _ScaleAndShiftMe(v2, 1.0F, d);
//            _Div(m, v2, v2);
//
//            /* the delta rule */
//
//            _Sum(para, v2, para, -e);
//
//            DelTensorBuf(v2);
//
//        }
//        else{
//            /* the delta rule */
//            _Sum(para, paraGrad, para, -lr);
//        }
//
//        /* clear gradient */
//        paraGrad->SetZeroAll();
//    }
//}

/*
prepare model for training
>> model - the model for training
...
source/sample/transformer/T2TTrainer.h

@@ -33,6 +33,25 @@ using namespace nts;
namespace transformer
{

/* node to keep batch information */
struct BatchNode
{
    /* begining position */
    int beg;

    /* end position */
    int end;

    /* maximum word number on the encoder side */
    int maxEnc;

    /* maximum word number on the decoder side */
    int maxDec;

    /* a key for sorting */
    int key;
};

/* trainer of the T2T model */
class T2TTrainer
{
...
@@ -49,9 +68,15 @@ public:
    /* another buffer */
    int * buf2;

    /* batch buf */
    BatchNode * bufBatch;

    /* buffer size */
    int bufSize;

    /* size of batch buffer */
    int bufBatchSize;

    /* length of each sequence */
    int * seqLen;
...
@@ -67,6 +92,9 @@ public:
    /* offset for next sequence in the buffer */
    int nextSeq;

    /* offset for next batch */
    int nextBatch;

    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;
...
@@ -142,8 +170,11 @@ public:
    /* counterpart of "isSmallBatch" */
    bool isBigBatch;

    /* indicates whether we use small memory footprint for backward process */
    bool isSmallFootprint;

    /* randomize batches */
    bool isRandomBatch;

    /* indicates whether we intend to debug the net */
    bool isDebugged;

public:
    /* constructor */
...
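A standalone sketch of how the new bufBatch/nextBatch machinery randomizes batch order: each segmented batch gets a random key, and a single qsort on that key shuffles the visit order. The comparator mirrors CompareBatchNode in T2TTrainer.cpp above; this little program is illustrative only and not part of the commit.

#include <cstdlib>
#include <cstdio>

/* same fields as the BatchNode added in this header */
struct BatchNode
{
    int beg;     /* begining position */
    int end;     /* end position */
    int maxEnc;  /* maximum word number on the encoder side */
    int maxDec;  /* maximum word number on the decoder side */
    int key;     /* a key for sorting */
};

int CompareBatchNode(const void * a, const void * b)
{
    /* descending by key, exactly as in the commit */
    return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}

int main()
{
    BatchNode buf[4];
    for (int i = 0; i < 4; i++) {
        buf[i].beg = i * 2;  buf[i].end = i * 2 + 2;
        buf[i].maxEnc = 10;  buf[i].maxDec = 12;
        buf[i].key = rand();                 /* random key => random visit order */
    }
    qsort(buf, 4, sizeof(BatchNode), CompareBatchNode);
    for (int i = 0; i < 4; i++)
        printf("batch [%d, %d) key=%d\n", buf[i].beg, buf[i].end, buf[i].key);
    return 0;
}

In the trainer itself the shuffle only happens when the new -randbatch option is set (see isRandomBatch in T2TTrainer.cpp/Init).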
source/sample/transformer/Transformer.cpp

@@ -60,6 +60,7 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "output", outputFN, "");

    srand((unsigned int)time(NULL));

    T2TTrainer trainer;
    trainer.Init(argc, args);
...
source/tensor/XDevice.cpp

@@ -24,6 +24,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "XDevice.h"
#include "XGlobal.h"
#include "XThread.h"
...
@@ -59,6 +60,7 @@ XDevice::~XDevice()
    cublasDestroy(cublasHandle);
    if(stream != NULL)
        delete stream;
    curandDestroyGenerator(gen);
#endif
}
...
@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID)
    Clear();

    devID = myDevID;
    seed = rand();

    /* CPU information */
    if(devID < 0){
...
@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID)
    cudaDeviceProp prop;

    cudaSetDevice(myDevID);

    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, seed);

    if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
        XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
        exit(1);
...
@@ -270,6 +277,8 @@ XDevManager::~XDevManager()
/* initialize it and get the CPU and GPU information */
void XDevManager::Init()
{
    srand((unsigned int)time(NULL));

    Clear();

    /* CPUs (we actually do not care about how many CPUs are using) */
...
source/tensor/XDevice.h

@@ -100,6 +100,9 @@ public:
    /* default stream for the device */
    XStream * stream;

    /* seed for random number generation */
    int seed;

#ifdef USE_CUDA
    /* mutex for handle (GPU cublas) */
    MUTEX_HANDLE cublasMutex;
...
@@ -109,6 +112,9 @@ public:
    /* specify if the handle is initialized */
    bool isHandleReady;

    /* generater of random numbers */
    curandGenerator_t gen;

#endif
...
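The seed/gen members added here pair with the curandCreateGenerator and curandSetPseudoRandomGeneratorSeed calls in XDevice.cpp above. A standalone sketch of that per-device cuRAND setup, stripped of the XDevice plumbing and error handling (curandGenerateUniform is a standard cuRAND host call, not something this commit adds):

#include <cstdio>
#include <cuda_runtime.h>
#include <curand.h>

int main()
{
    const int n = 8;
    float * d = NULL;
    cudaMalloc(&d, n * sizeof(float));               // device buffer to fill

    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);  // as in XDevice::Init
    curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);        // the commit uses seed = rand()
    curandGenerateUniform(gen, d, n);                        // uniform numbers in (0, 1]

    float h[n];
    cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
        printf("%f\n", h[i]);

    curandDestroyGenerator(gen);                             // as in ~XDevice()
    cudaFree(d);
    return 0;
}

Keeping one seeded generator per device is what lets the new GPU-side _SetDataRand paths (SetData.cu in the file list) draw random values without round-tripping through the host.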
source/tensor/XMem.cpp

@@ -1461,6 +1461,23 @@ void XMem::CreateBLASHandle()
#endif
}

/* show profile of the memory pool */
void XMem::ShowMemUsage(FILE * file)
{
    MTYPE used = 0;
    MTYPE total = 0;

    for(int i = 0; i < blockNum; i++){
        if(blocks[i].mem != NULL){
            used += blocks[i].used;
            total += blocks[i].size;
        }
    }

    fprintf(file, "mem:%.1fMB used:%.1fMB usage:%.3f\n",
            (DTYPE)used/MILLION, (DTYPE)total/MILLION, (DTYPE)used/total);
}

#ifdef USE_CUDA

/* get the handle of cublas */
...
source/tensor/XMem.h

@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#define __XMEM_H__

#include <stdio.h>
#include <stdlib.h>

#ifdef CUDA_BLAS
...
@@ -402,6 +403,9 @@ public:
    /* create a new cublas handle */
    void CreateBLASHandle();

    /* show profile of the memory pool */
    void ShowMemUsage(FILE * file);

#ifdef USE_CUDA
    /* get the handle of cublas */
    cublasHandle_t * GetCublasHandle();
...
source/tensor/XName.cpp

@@ -67,6 +67,8 @@ const char * GetOPName(int type)
        return "M_MULTIPLY";
    else if (type == MATH_MULTIPLYDIM)
        return "M_MULTIPLYDIM";
    else if (type == MATH_MULTIPLYBROADCAST)
        return "M_MULTIPLYBROADCAST";
    else if (type == MATH_NEGATE)
        return "M_NEGATE";
    else if (type == MATH_NORMALIZE)
...
@@ -85,6 +87,8 @@ const char * GetOPName(int type)
        return "M_SUM";
    else if (type == MATH_SUMDIM)
        return "M_SUMDIM";
    else if (type == MATH_SUMBROADCAST)
        return "M_SUMBROADCAST";
    else if (type == REDUCE_REDUCEMAX)
        return "R_REDUCEMAX";
    else if (type == REDUCE_REDUCEMEAN)
...
source/tensor/XName.h

@@ -52,7 +52,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_MULTIPLYBROADCAST MATH_MULTIPLYDIM + 1
#define MATH_NEGATE MATH_MULTIPLYBROADCAST + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
...
@@ -61,8 +62,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define MATH_SUMBROADCAST MATH_SUMDIM + 1
#define REDUCE MATH_SUMDIM + 1
#define REDUCE MATH_SUMBROADCAST + 1
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
...
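The operator IDs here are allocated by chaining "+ 1" macros, so inserting MATH_MULTIPLYBROADCAST (or MATH_SUMBROADCAST) forces the next macro in the chain to be rebased, exactly as the two hunks above do. A tiny standalone illustration of that pattern (names and the base value are made up, not library constants):

// Chained-ID pattern from XName.h, reduced to three ops (illustrative only).
#define OP_BASE              10000
#define OP_MULTIPLYDIM       OP_BASE + 1
// Newly inserted op: it takes the next slot...
#define OP_MULTIPLYBROADCAST OP_MULTIPLYDIM + 1
// ...so the op that used to follow OP_MULTIPLYDIM must now chain off the new one.
#define OP_NEGATE            OP_MULTIPLYBROADCAST + 1

static_assert(OP_MULTIPLYBROADCAST == OP_MULTIPLYDIM + 1, "IDs stay consecutive");
static_assert(OP_NEGATE == OP_MULTIPLYDIM + 2, "inserting an op shifts everything after it");

Forgetting the rebase would give two operators the same numeric ID, which is why the REDUCE base in the second hunk also has to be moved onto MATH_SUMBROADCAST.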
source/tensor/XTensor.cpp

@@ -60,6 +60,7 @@
#include "core/utilities/FlushToMem.cuh"
#include "core/utilities/SetAscendingOrder.cuh"
#endif

/* the nts (NiuTrans.Tensor) namespace */
...
@@ -690,9 +691,6 @@ set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
>> upper - upper value of the range
*/
void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
{
    // TODO: cuda code!!!!!!!
...
@@ -703,7 +701,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
    // srand((unsigned)time(0));
    DTYPE variance = upper - lower;
    void * d = NULL;

    if(dataType == X_FLOAT){
        d = new float[unitNum];
        for(int i = 0; i < unitNum; i++){
...
@@ -1700,7 +1697,6 @@ void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, cons
    {
        ShowNTErrors("TO DO!");
    }
}

/*
...
source/tensor/core/arithmetic/Div.cpp

@@ -46,8 +46,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
                  "Unmatched tensors in multiplication!");
    CheckNTErrors((a->order == b->order && a->order == c->order),
                  "Unmatched tensors!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                  "Unmatched tensors in addition!");

#ifdef USE_CUDA
    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
...
source/tensor/core/arithmetic/DivDim.cu
查看文件 @
44bf9fa6
...
...
@@ -22,6 +22,8 @@
#include "DivDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -38,43 +40,89 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
__global__
void KernelDivWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE alpha, bool alphaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;

    if (alphaFired)
        c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
    else
        c[offset] = a[offset] / bv[threadIdx.x];
}

__global__
void KernelDivWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half alpha, bool alphaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;

    if (alphaFired)
        c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
    else
        c[offset] = a[offset] / bv[threadIdx.x];
}
//template <class T, bool alphaFired>
//__global__
//void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum,DTYPE alpha)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if(col >= colNum || row >= rowNum)
// return;
//
// if(threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
// __syncthreads();
//
////#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//
// //int offset = colNum * row + col;
// T alpha1;
// if (sizeof(T) - sizeof(half) == 0) {
// alpha1 = __float2half(alpha);
// }
// else {
// alpha1 = (DTYPE)alpha;
// }
//
// int offset = colNum * row + col;
//
// if (alphaFired)
// c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha1;
// else
// c[offset] = a[offset] / bv[threadIdx.x];
//
////#endif
//
//}
/*
tensor division of a tensor and a column vector
c = a / b + alpha * c
...
...
@@ -88,11 +136,11 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrics
>> alpha - the scaling factor
*/
__global__
void KernelDivWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE alpha, bool alphaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -100,33 +148,95 @@ void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

//#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    int offset = block * blockSize + row * colNum + col;

    if (alphaFired)
        c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
    else
        c[offset] = a[offset] / bv[threadIdx.y];
//#endif
}

__global__
void KernelDivWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half alpha, bool alphaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;

    if (alphaFired)
        c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
    else
        c[offset] = a[offset] / bv[threadIdx.y];
}
//
//
//template <class T, bool alphaFired>
//__global__
//void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE alpha)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if(row >= rowNum || block >= blockNum)
// return;
//
// if(threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
// __syncthreads();
//
////#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//
// int offset = block * blockSize + row * colNum + col;
//
// T alpha1;
// if (sizeof(T) -sizeof(half) == 0) {
// alpha1 = __float2half(alpha);
// }
// else {
// alpha1 = alpha;
// }
//
// if (alphaFired)
// c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha1;
// else
// c[offset] = a[offset] / bv[threadIdx.y];
//
////#endif
//}
/*
tensor division
...
...
@@ -171,24 +281,24 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (alpha == (DTYPE)0.0F)
                KernelDivWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, alpha, false);
            else
                KernelDivWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, alpha, true);
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (alpha == (DTYPE)0.0F)
                KernelDivWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, alpha, false);
            else
                KernelDivWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, alpha, true);
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
@@ -197,25 +307,40 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
    else if (a->dataType == X_FLOAT16) {
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (alpha == (DTYPE)0.0F) {
                unsigned short temp = FloatToFloat16(alpha);
                half alpha1 = *((half *)&temp);
                KernelDivWithColHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockSize, stride, blockSize * stride, blockNum, alpha1, false);
            }
            else {
                unsigned short temp = FloatToFloat16(alpha);
                half alpha1 = *((half *)&temp);
                KernelDivWithColHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockSize, stride, blockSize * stride, blockNum, alpha1, true);
            }
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (alpha == (DTYPE)0.0F) {
                unsigned short temp = FloatToFloat16(alpha);
                half alpha1 = *((half *)&temp);
                KernelDivWithRowHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockNum, blockSize, alpha1, false);
            }
            else {
                unsigned short temp = FloatToFloat16(alpha);
                half alpha1 = *((half *)&temp);
                KernelDivWithRowHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockNum, blockSize, alpha1, true);
            }
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
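All of the FP16 branches above turn the FP32 scaling factor into a device half before the launch, by going through FloatToFloat16 and reinterpreting the bit pattern. A minimal sketch of that pattern in isolation; the FloatToFloat16 declaration is assumed to match the one in XDataType.h, which these .cu files now include:

    #include <cuda_fp16.h>

    /* Assumed declaration from the library (XDataType.h): returns the IEEE-754
       binary16 bit pattern of f as an unsigned short. */
    unsigned short FloatToFloat16(float f);

    /* Turn an FP32 scalar into a half value that can be passed to a kernel. */
    half ScalarToHalf(float alpha)
    {
        unsigned short temp = FloatToFloat16(alpha); /* 16-bit pattern of alpha */
        half alpha1 = *((half *)&temp);              /* reinterpret the bits as half */
        return alpha1;
    }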
source/tensor/core/arithmetic/MatrixMul.cpp
...
...
@@ -22,12 +22,10 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
#include "MatrixMulBatched.h"
#include "timer.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -276,7 +274,8 @@ matrix multiplication with no transposition c = a * b * alpha
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
...
...
@@ -317,212 +316,4 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * pa
    return c;
}
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
The result C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                         DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order + b.order - 2;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 2; i < a.order; i++)
        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
    for (int i = 2; i < b.order; i++)
        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    //XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //_ConvertDataType(&a, halfA);
    //_ConvertDataType(&b, halfB);
    //_MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
    //_ConvertDataType(halfC, &c);
    //DelTensorBuf(halfC);
    //DelTensorBuf(halfB);
    //DelTensorBuf(halfA);

    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    /*timer_c timerConvert1;
    timerConvert1.m_start_timer();*/
    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);
    /*timerConvert1.m_end_timer();
    printf("time convert1 %f ms\n", timerConvert1.m_get_time_diff_msec());
    timer_c timerMatrixMul;
    timerMatrixMul.m_start_timer();*/
    _MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
    /*timerMatrixMul.m_end_timer();
    printf("time matrixmul %f ms\n", timerMatrixMul.m_get_time_diff_msec());
    timer_c timerConvert2;
    timerConvert2.m_start_timer();*/
    _ConvertDataType(halfC, &c);
    /*timerConvert2.m_end_timer();
    printf("time convert2 %f ms\n\n", timerConvert2.m_get_time_diff_msec());*/

    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
/*
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

    int an = a.dimSizeRDI[1];
    int am = a.dimSizeRDI[0];
    int bn = b.dimSizeRDI[1];
    int bm = b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order + b.order - 2;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 2; i < a.order; i++)
        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
    for (int i = 2; i < b.order; i++)
        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);
    _MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
    _ConvertDataType(halfC, &c);

    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    //XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //timer_c timerConvert1;
    //timerConvert1.m_start_timer();
    //_ConvertDataType(&a, halfA);
    //_ConvertDataType(&b, halfB);
    //timerConvert1.m_end_timer();
    //printf("time convert1 %f ms\n", timerConvert1.m_get_time_diff_msec());
    //timer_c timerMatrixMul;
    //timerMatrixMul.m_start_timer();
    //_MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
    //timerMatrixMul.m_end_timer();
    //printf("time matrixmul %f ms\n", timerMatrixMul.m_get_time_diff_msec());
    //timer_c timerConvert2;
    //timerConvert2.m_start_timer();
    //_ConvertDataType(halfC, &c);
    //timerConvert2.m_end_timer();
    //printf("time convert2 %f ms\n\n", timerConvert2.m_get_time_diff_msec());
    //DelTensorBuf(halfC);
    //DelTensorBuf(halfB);
    //DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
}
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
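The MatrixMulFloat16 overloads removed from this file follow a convert-multiply-convert pattern: the FP32 inputs are copied into temporary X_FLOAT16 buffers, multiplied there, and the product is converted back to FP32. A hedged usage sketch of the public calls declared in MatrixMul.h; the include path and the helper name are assumptions for illustration:

    #include "MatrixMul.h"

    using namespace nts;

    /* Sketch only: a and b are assumed to be dense X_FLOAT tensors whose
       trailing two dimensions are compatible for multiplication. */
    XTensor MulBothWays(const XTensor &a, const XTensor &b)
    {
        XTensor cHalf = MatrixMulFloat16(a, b); /* FP16 compute, FP32 in and out */
        XTensor cFull = MatrixMul(a, b);        /* plain FP32 path for comparison */
        /* a transposed form is also declared:
           MatrixMulFloat16(a, X_TRANS, b, X_NOTRANS, alpha, parallelRunner) */
        return cHalf;
    }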
source/tensor/core/arithmetic/MatrixMul.h
...
...
@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MMul MatrixMul
#define MMul16 MatrixMulFloat16
/*
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
...
...
@@ -64,13 +63,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
XTensor MatrixMul(const XTensor &a, const XTensor &b,
                  DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b,
                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
}
// namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
\ No newline at end of file
source/tensor/core/arithmetic/MatrixMul2D.cpp
...
...
@@ -82,19 +82,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
            else
                _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
/*if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}*/
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
}
/* a dense matrix multiply a sparse matrix */
    else if (!a->isSparse && b->isSparse) {
...
...
source/tensor/core/arithmetic/MatrixMul2D.cu
...
...
@@ -156,18 +156,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
//if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
// _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
// b->data, transposedB, a->dataType, c->data, c->dataType,
// a->dimSize[0], a->dimSize[1],
// b->dimSize[0], b->dimSize[1],
// c->dimSize[0], c->dimSize[1],
// alpha, beta);
//}
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1],
...
...
source/tensor/core/arithmetic/MatrixMul2DParallel.cpp
...
...
@@ -63,44 +63,6 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* trans(a) * b */
    else if (transposedA == X_TRANS && transposedB == X_NOTRANS) {
        int num = an;
/*if (a->dataType == X_FLOAT16) {
for (int i = 0; i < cn; i++) {
X_FLOAT16 *p3 = (X_FLOAT16*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
X_FLOAT16 r = 0;
X_FLOAT16 * p1 = (X_FLOAT16*)a->data + 0 * am + i;
X_FLOAT16 * p2 = (X_FLOAT16*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}
else {
for (int i = 0; i < cn; i++) {
DTYPE * p3 = (DTYPE*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
DTYPE r = 0;
DTYPE * p1 = (DTYPE*)a->data + 0 * am + i;
DTYPE * p2 = (DTYPE*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}*/
        for (int i = 0; i < cn; i++) {
            DTYPE * p3 = (DTYPE*)c->data + i * cm;
            for (int j = 0; j < cm; j++) {
...
...
source/tensor/core/arithmetic/MatrixMulBatched.cpp
...
...
@@ -22,7 +22,6 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMulBatched.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"
...
...
@@ -388,142 +387,4 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
    return c;
}
/*
matrix multiplication of the two tensors (do it on site)
c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                                DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 0; i < a.order - 2; i++)
        dimSize[sub++] = a.dimSize[i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    ///*call _MatrixMulBatched function */
    //_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);

    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);
    _MatrixMulBatched(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
    _ConvertDataType(halfC, &c);

    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
/*
matrix multiplication of the two tensors (do it on site)
c = a * b * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = ai * bi * alpha + cm * beta
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

    int an = a.dimSizeRDI[1];
    int am = a.dimSizeRDI[0];
    int bn = b.dimSizeRDI[1];
    int bm = b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 0; i < a.order - 2; i++)
        dimSize[sub++] = a.dimSize[i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    ///*call _MatrixMulBatched function */
    //_MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);

    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);
    _MatrixMulBatched(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
    _ConvertDataType(halfC, &c);

    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
}
// namespace nts(NiuTrans.Tensor)
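For reference, the batched variants treat all leading dimensions as a batch index and multiply only the trailing two dimensions, slice by slice. A standalone CPU sketch of that contract (an illustrative reference, not the library's implementation):

    /* ci = ai * bi for each 2D slice i; a is batch x n x k, b is batch x k x m,
       c is batch x n x m, all stored row-major in flat arrays. */
    void BatchedMatMulRef(const float * a, const float * b, float * c,
                          int batch, int n, int k, int m)
    {
        for (int s = 0; s < batch; s++) {
            const float * as = a + s * n * k;
            const float * bs = b + s * k * m;
            float * cs = c + s * n * m;
            for (int i = 0; i < n; i++)
                for (int j = 0; j < m; j++) {
                    float r = 0.0F;
                    for (int p = 0; p < k; p++)
                        r += as[i * k + p] * bs[p * m + j];
                    cs[i * m + j] = r;
                }
        }
    }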
source/tensor/core/arithmetic/MatrixMulBatched.h
...
...
@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define BMMul MatrixMulBatched
#define BMMul16 MatrixMulBatchedFloat16
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
...
...
@@ -85,12 +84,6 @@ ci = ai * bi * alpha + cm * beta
XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                                DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b,
                                DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
}
// namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMULBATCHED_H__
\ No newline at end of file
source/tensor/core/arithmetic/Multiply.cpp
...
...
@@ -46,8 +46,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors in multiplication!"
);
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
"Unmatched tensors!"
);
CheckNTErrors
(
a
->
dataType
==
b
->
dataType
&&
a
->
dataType
==
c
->
dataType
,
"Unmatched tensors in addition!"
);
#ifdef USE_CUDA
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
...
...
source/tensor/core/arithmetic/MultiplyDim.cpp
...
...
@@ -22,9 +22,10 @@
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -136,28 +137,167 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
c = a * b
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
*/
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
{
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _Multiply function */
    _MultiplyDim(&a, &b, &c, n, alpha);
    _MultiplyDim(&a, &b, &c, n, 0);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
    XLink::AddParamToHeadInt(&c, n);
    XLink::AddParamToHead(&c, alpha);
    XLink::AddParamToHead(&c, 0);

    return c;
}
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
    CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
    CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
    CheckNTErrors(a->order > 0, "TODO!");

    int order = a->order;
    int count = 0;
    void * source = 0;
    void * target = 0;

    for (int i = 0; i < order; i++) {
        if (a->GetDim(i) == b->GetDim(i))
            continue;

        if (b->GetDim(i) == 1) {
            int fitSize = a->GetDim(i);
            int j = i + 1;

            /* we define a range over dimensions. It is to be unsqueezed */
            for (; j < order; j++) {
                if (a->GetDim(j) == b->GetDim(j))
                    break;
                fitSize *= a->GetDim(j);
            }

            int dimsS[MAX_TENSOR_DIM_NUM];
            int dimsT[MAX_TENSOR_DIM_NUM];

            for (int k = 0; k < i; k++) {
                dimsS[k] = a->GetDim(k);
                dimsT[k] = a->GetDim(k);
            }

            dimsT[i] = fitSize;

            bool isLast = true;
            for (int k = j; k < order; k++) {
                dimsS[i + k - j + 0] = b->GetDim(k);
                dimsT[i + k - j + 1] = b->GetDim(k);
                if (a->GetDim(k) != b->GetDim(k)) {
                    if (b->GetDim(k) == 1)
                        isLast = false;
                    else {
                        ShowNTErrors("Wrong dimension size!")
                    }
                }
            }

            dimsS[0] = -dimsS[0];
            dimsT[0] = -dimsT[0];

            XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
            XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);

            if (count == 0)
                source = b->data;
            else {
                source = target;
            }

            target = t->mem != NULL ?
                     t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize) :
                     XMemAlloc(t->devID, t->unitNum * t->unitSize);

            s->data = source;
            t->data = target;

            _Unsqueeze(s, t, i, fitSize);

            /* free the memory space of the one before the last allocation */
            if (count > 0) {
                int size = s->unitNum * s->unitSize;
                if (t->mem != NULL)
                    t->mem->ReleaseBuf(t->devID, size);
                else
                    XMemFree(t->devID, source);
            }

            /* we do multiplication here */
            if (isLast) {
                CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
                _Multiply(a, t, c, beta);

                if (t->mem != NULL)
                    t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
                else
                    XMemFree(t->devID, target);

                target = NULL;
            }

            s->data = NULL;
            t->data = NULL;
            DelTensor(s);
            DelTensor(t);

            i = j;
            count++;
        }
    }

    if (count == 0)
        _Multiply(a, b, c, beta);

    CheckNTErrors(target == NULL, "Something is wrong!");
}
/*
tensor broadcast multiplication
c = a * b
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
<< return - the resulting tensor c
*/
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
{
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _SumBroadcast function */
    _MultiplyBroadcast(&a, &b, &c, 0);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
    XLink::AddParamToHead(&c, 0);

    return c;
}
...
...
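The broadcasting helpers above (_MultiplyBroadcast here, _SumBroadcast in SumDim.cpp) work by finding each maximal run of dimensions where b has size 1, unsqueezing b over that run to a's size, and then calling the plain element-wise operation. A small standalone sketch of the shape logic only, with made-up example shapes:

    #include <cstdio>

    int main()
    {
        /* illustrative shapes: a is (2, 4, 8, 3), b is (2, 1, 1, 3) */
        int aDims[4] = {2, 4, 8, 3};
        int bDims[4] = {2, 1, 1, 3};
        for (int i = 0; i < 4; i++) {
            if (aDims[i] == bDims[i] || bDims[i] != 1)
                continue;
            int fitSize = aDims[i];
            int j = i + 1;
            while (j < 4 && aDims[j] != bDims[j]) /* extend the run of size-1 dims */
                fitSize *= aDims[j++];
            /* prints: unsqueeze b over dims [1, 3), fitSize = 32 */
            printf("unsqueeze b over dims [%d, %d), fitSize = %d\n", i, j, fitSize);
            i = j;
        }
        return 0;
    }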
source/tensor/core/arithmetic/MultiplyDim.cu
...
...
@@ -217,8 +217,6 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
}
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
...
...
@@ -243,7 +241,9 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
}
#endif
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
...
...
source/tensor/core/arithmetic/MultiplyDim.h
...
...
@@ -34,9 +34,16 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);

/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = 0.0);
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);

/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);

/* tensor broadcast multiplication c = a * b where some of dimensions of b can be of size 1.
   we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
}
// namespace nts(NiuTrans.Tensor)
...
...
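A hedged usage sketch of the two entry points declared in this header, assuming a, b1 and b2 are existing X_FLOAT tensors created elsewhere; only calls declared above are used, and the include path is an assumption:

    #include "MultiplyDim.h"

    using namespace nts;

    /* Sketch only: b1 is assumed to match the size of dimension 1 of a,
       and b2 to have a's order with some dimensions of size 1. */
    void MultiplyExamples(const XTensor &a, const XTensor &b1, const XTensor &b2)
    {
        XTensor c1 = MultiplyDim(a, b1, 1);    /* scale a along dimension 1 by b1 */
        XTensor c2 = MultiplyBroadcast(a, b2); /* broadcast b2 over its size-1 dims */
    }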
source/tensor/core/arithmetic/SubDim.cu
...
...
@@ -22,6 +22,8 @@
#include "SubDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -38,11 +40,10 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
__global__
void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE beta, bool betaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -54,21 +55,66 @@ __global__
__syncthreads();
    int offset = colNum * row + col;
    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.x] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.x];
}

__global__
void KernelSubWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half beta, bool betaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;
    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.x] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.x];
}
//template <class T, bool betaFired>
//__global__
//void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if (col >= colNum || row >= rowNum)
// return;
//
// if (threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 =__float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.x];
//}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
...
...
@@ -82,11 +128,11 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrics
>> beta - the scaling factor
*/
__global__
void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta, bool betaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -102,23 +148,79 @@ __global__
__syncthreads();
    int offset = block * blockSize + row * colNum + col;

    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.y];
}

__global__
void KernelSubWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;

    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
// void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if (row >= rowNum || block >= blockNum)
// return;
//
// if (threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.y];
//}
/*
tensor subtraction (cuda version)
...
...
@@ -163,24 +265,24 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F)
                KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, beta, false);
            else
                KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, beta, true);
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F)
                KernelSubWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, beta, false);
            else
                KernelSubWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, beta, true);
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
@@ -190,25 +292,40 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
    else if (a->dataType == X_FLOAT16) {
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F) {
                unsigned short temp = FloatToFloat16(beta);
                half beta1 = *((half *)&temp);
                KernelSubWithColHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockSize, stride, blockSize * stride, blockNum, beta1, false);
            }
            else {
                unsigned short temp = FloatToFloat16(beta);
                half beta1 = *((half *)&temp);
                KernelSubWithColHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockSize, stride, blockSize * stride, blockNum, beta1, true);
            }
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F) {
                unsigned short temp = FloatToFloat16(beta);
                half beta1 = *((half *)&temp);
                KernelSubWithRowHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockNum, blockSize, beta1, false);
            }
            else {
                unsigned short temp = FloatToFloat16(beta);
                half beta1 = *((half *)&temp);
                KernelSubWithRowHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockNum, blockSize, beta1, true);
            }
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
source/tensor/core/arithmetic/Sum.cu
...
...
@@ -60,6 +60,16 @@ void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
#endif
}
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
/*
tensor summation c = a + b * \beta (cuda version)
...
...
@@ -101,7 +111,7 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
if ((c == a && handle != NULL) && *handle != 0) {
#ifdef DOUBELPRICSION
        cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)->data, 1, (DTYPE*)a->data, 1);
        cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#else
cublasSaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#endif
...
...
@@ -121,16 +131,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
{
cublasHandle_t * handle = NULL;
if ((a->mem != NULL) && (b->mem != NULL)) {
cublasHandle_t * handleA = a->mem->GetCublasHandle();
cublasHandle_t * handleB = b->mem->GetCublasHandle();
handle = *handleA != 0 ? handleA : handleB;
}
else {
handle = GDevs.GetCudaHandle(a->devID);
}
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
...
...
@@ -141,6 +141,20 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDInt << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta);
}
else {
// TODO!!
...
...
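With KernelADDInt in place, _CudaSum also covers X_INT tensors; note that beta is truncated to int inside the kernel. A standalone host-side sketch of the same element-wise contract (an illustrative reference, not the library code):

    /* Host-side reference of the contract implemented by KernelADDInt above:
       c[i] = a[i] + b[i] * (int)beta. Note that beta is truncated, not rounded. */
    void SumIntRef(const int * a, const int * b, int * c, int size, float beta)
    {
        int ib = (int)beta;
        for (int i = 0; i < size; i++)
            c[i] = a[i] + b[i] * ib;
    }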
source/tensor/core/arithmetic/SumDim.cpp
...
...
@@ -17,12 +17,16 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/
#include "Sum.h"
#include "SumDim.h"
#include "SumDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -152,7 +156,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _Sum function */
    /* call _SumDim function */
    _SumDim(&a, &b, &c, n, beta);
/* tensor connections */
...
...
@@ -163,4 +167,145 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
    return c;
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
    CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
    CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
    CheckNTErrors(a->order > 0, "TODO!");

    int order = a->order;
    int count = 0;
    void * source = 0;
    void * target = 0;

    for (int i = 0; i < order; i++) {
        if (a->GetDim(i) == b->GetDim(i))
            continue;

        if (b->GetDim(i) == 1) {
            int fitSize = a->GetDim(i);
            int j = i + 1;

            /* we define a range over dimensions. It is to be unsqueezed */
            for (; j < order; j++) {
                if (a->GetDim(j) == b->GetDim(j))
                    break;
                fitSize *= a->GetDim(j);
            }

            int dimsS[MAX_TENSOR_DIM_NUM];
            int dimsT[MAX_TENSOR_DIM_NUM];

            for (int k = 0; k < i; k++) {
                dimsS[k] = a->GetDim(k);
                dimsT[k] = a->GetDim(k);
            }

            dimsT[i] = fitSize;

            bool isLast = true;
            for (int k = j; k < order; k++) {
                dimsS[i + k - j + 0] = b->GetDim(k);
                dimsT[i + k - j + 1] = b->GetDim(k);
                if (a->GetDim(k) != b->GetDim(k)) {
                    if (b->GetDim(k) == 1)
                        isLast = false;
                    else {
                        ShowNTErrors("Wrong dimension size!")
                    }
                }
            }

            dimsS[0] = -dimsS[0];
            dimsT[0] = -dimsT[0];

            XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
            XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);

            if (count == 0)
                source = b->data;
            else {
                source = target;
            }

            target = t->mem != NULL ?
                     t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize) :
                     XMemAlloc(t->devID, t->unitNum * t->unitSize);

            s->data = source;
            t->data = target;

            _Unsqueeze(s, t, i, fitSize);

            /* free the memory space of the one before the last allocation */
            if (count > 0) {
                int size = s->unitNum * s->unitSize;
                if (t->mem != NULL)
                    t->mem->ReleaseBuf(t->devID, size);
                else
                    XMemFree(t->devID, source);
            }

            /* we do summation here */
            if (isLast) {
                CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
                _Sum(a, t, c, beta);

                if (t->mem != NULL)
                    t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
                else
                    XMemFree(t->devID, target);

                target = NULL;
            }

            s->data = NULL;
            t->data = NULL;
            DelTensor(s);
            DelTensor(t);

            i = j;
            count++;
        }
    }

    if (count == 0)
        _Sum(a, b, c, beta);

    CheckNTErrors(target == NULL, "Something is wrong!");
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
we return c here
>> a - a tensor
>> b - another tensor that would be broadcasted
>> beta - the scaling factor
<< return - the resulting tensor c
*/
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
{
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _SumBroadcast function */
    _SumBroadcast(&a, &b, &c, beta);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
    XLink::AddParamToHead(&c, beta);

    return c;
}
}
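A hedged usage sketch of the broadcasting summation added above, assuming a and b are existing tensors of the same order where the broadcast dimensions of b have size 1; only the functions defined in this file are called, and the include path is an assumption:

    #include "SumDim.h"

    using namespace nts;

    /* Sketch only: b's broadcast dimensions are assumed to be of size 1,
       exactly as _SumBroadcast requires. */
    XTensor AddWithBroadcast(const XTensor &a, const XTensor &b)
    {
        /* c = a + b * beta, with b expanded over its size-1 dimensions */
        return SumBroadcast(a, b, (DTYPE)1.0);
    }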
source/tensor/core/arithmetic/SumDim.cu
...
...
@@ -22,6 +22,8 @@
#include "SumDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -38,38 +40,82 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
__global__
void KernelAddWithRow(DTYPE *a, DTYPE *b, DTYPE *c, int rowNum, int colNum, DTYPE beta, bool betaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;
    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.x] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.x];
}

__global__
void KernelAddWithRowHalf(half *a, half *b, half *c, int rowNum, int colNum, half beta, bool betaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;
    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.x] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.x];
}
//
//template <class T, bool betaFired>
//__global__
//void KernelAddWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if(col >= colNum || row >= rowNum)
// return;
//
// if(threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if(betaFired)
// c[offset] = a[offset] + bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] + bv[threadIdx.x];
//}
/*
tensor summation of a tensor and a column vector
c = a + b * \beta
...
...
@@ -83,11 +129,9 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrics
>> beta - the scaling factor
*/
__global__
void KernelAddWithCol(DTYPE *a, DTYPE *b, DTYPE *c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta, bool betaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -95,30 +139,85 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;
    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.y];
}

__global__
void KernelAddWithColHalf(half *a, half *b, half *c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;
    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
//void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if(row >= rowNum || block >= blockNum)
// return;
//
// if(threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if(betaFired)
// c[offset] = a[offset] + bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] + bv[threadIdx.y];
//}
/*
tensor summation (cuda version)
...
...
@@ -162,25 +261,25 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
    if (a->dataType == DEFAULT_DTYPE) {
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F)
                KernelAddWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, beta, false);
            else
                KernelAddWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, beta, true);
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F)
                KernelAddWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, beta, false);
            else
                KernelAddWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, beta, true);
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
@@ -190,25 +289,37 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
else if (a->dataType==X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelAddWithCol<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelAddWithCol<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
blockSize, stride, blockSize * stride, blockNum, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
blockSize, stride, blockSize * stride, blockNum, beta1,true);
}
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelAddWithRow<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta);
else
KernelAddWithRow<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
blockNum, blockSize, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta);
blockNum, blockSize, beta1, true);
}
}
else {
ShowNTErrors("Something is wrong!");
...
...
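As an aside on the kernels above: KernelAddWithCol / KernelAddWithColHalf add b[row] * beta to every column of each row inside each block, using the flat offset block * blockSize + row * colNum + col. A minimal CPU reference of that arithmetic, written only for illustration (plain C++, names and shapes are not part of the commit):

#include <cstdio>
#include <vector>

/* CPU reference for the column-broadcast add:
   c[k][i][j] = a[k][i][j] + b[i] * beta, offset = k * blockSize + i * colNum + j */
void AddWithColRef(const std::vector<float> &a, const std::vector<float> &b,
                   std::vector<float> &c, int rowNum, int colNum, int blockNum, float beta)
{
    int blockSize = rowNum * colNum;
    for (int k = 0; k < blockNum; k++)
        for (int i = 0; i < rowNum; i++)
            for (int j = 0; j < colNum; j++) {
                int offset = k * blockSize + i * colNum + j;
                c[offset] = a[offset] + b[i] * beta;
            }
}

int main()
{
    int rowNum = 2, colNum = 3, blockNum = 1;
    std::vector<float> a(6, 1.0F);
    std::vector<float> b = { 10.0F, 20.0F };
    std::vector<float> c(6);
    AddWithColRef(a, b, c, rowNum, colNum, blockNum, 0.5F);
    for (float v : c)
        printf("%g ", v);        /* prints: 6 6 6 11 11 11 */
    printf("\n");
    return 0;
}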
source/tensor/core/arithmetic/SumDim.cuh (View file @ 44bf9fa6)
...
...
@@ -17,6 +17,8 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/
#ifndef __SUMDIM_CUH__
...
...
source/tensor/core/arithmetic/SumDim.h (View file @ 44bf9fa6)
...
...
@@ -18,6 +18,9 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* It reached to 39 centigrade around 3:00 pm in Shenyang
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
* Four of my master students graduated. Good luck to them for their future work!
*/
#ifndef __SUMDIM_H__
...
...
@@ -39,6 +42,13 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1.
we return the resulting tensor here */
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMDIM_H__
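A small usage sketch for the interfaces declared above (illustrative only; it assumes InitTensor2D is available alongside the InitTensor1D used elsewhere in this commit, and that SetData copies a host array as Dropout.cpp below does):

#include "SumDim.h"

using namespace nts;

/* hedged sketch: add a length-3 bias b to every row of a 2 x 3 tensor a,
   i.e. c[i][j] = a[i][j] + b[j] * 0.5, by broadcasting along dimension 1 */
void SumDimExample()
{
    XTensor a, b;
    InitTensor2D(&a, 2, 3, X_FLOAT);    /* assumed helper, analogous to InitTensor1D */
    InitTensor1D(&b, 3, X_FLOAT);

    float aData[2][3] = { {1, 2, 3}, {4, 5, 6} };
    float bData[3] = { 10, 20, 30 };
    a.SetData(aData, 6);
    b.SetData(bData, 3);

    XTensor c = SumDim(a, b, 1, 0.5F);
    c.Dump(stderr, "c:");               /* expected rows: 6 12 18 and 9 15 21 */
}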
source/tensor/core/arithmetic/XTensorBLAS.cu (View file @ 44bf9fa6)
...
...
@@ -23,89 +23,12 @@
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "XTensorBLAS.h"
#include <stdint.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
#include <stdint.h>
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
//typedef char __int8;
half uint16_as_fp16(uint16_t a)
{
half res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
half f;
uint16_t i;
} cvt;
cvt.i = a;
res = cvt.f;
#endif /* __cplusplus */
return res;
}
uint32_t fp32_as_uint32(float a)
{
uint32_t res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
float f;
uint32_t i;
} cvt;
cvt.f = a;
res = cvt.i;
#endif /* __cplusplus */
return res;
}
/* host version of device function __float2half_rn() */
half float2half_rn(float a)
{
uint32_t ia = fp32_as_uint32(a);
uint16_t ir;
ir = (ia >> 16) & 0x8000;
if ((ia & 0x7f800000) == 0x7f800000) {
if ((ia & 0x7fffffff) == 0x7f800000) {
ir |= 0x7c00; /* infinity */
}
else {
ir = 0x7fff; /* canonical NaN */
}
}
else if ((ia & 0x7f800000) >= 0x33000000) {
int shift = (int)((ia >> 23) & 0xff) - 127;
if (shift > 15) {
ir |= 0x7c00; /* infinity */
}
else {
ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
if (shift < -14) { /* denormal */
ir |= ia >> (-1 - shift);
ia = ia << (32 - (-1 - shift));
}
else { /* normal */
ir |= ia >> (24 - 11);
ia = ia << (32 - (24 - 11));
ir = ir + ((14 + shift) << 10);
}
/* IEEE-754 round to nearest of even */
if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
ir++;
}
}
}
return uint16_as_fp16(ir);
}
/*
matrix multiplication via cuda version BLAS
*/
...
...
@@ -171,11 +94,24 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
//ShowNTErrors("TO DO!");
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the leading dimensions of the two tensors (lda, ldb) be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
}
else {
ShowNTErrors("Unsupported data type!");
...
...
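One practical note on the INT8 branch above: cublasGemmEx with CUDA_R_8I inputs only accepts leading dimensions that are multiples of 4, which is exactly what the mb/ma check guards. A hedged caller-side sketch of that constraint (hypothetical helpers, not part of the commit):

#include <cstdio>

/* returns true if an int8 GEMM with these leading dimensions can be dispatched
   to cublasGemmEx; otherwise the caller should pad or fall back to the float path */
static bool Int8GemmShapeOK(int lda, int ldb)
{
    return (lda % 4 == 0) && (ldb % 4 == 0);
}

/* round a leading dimension up to the next multiple of 4, e.g. for padded int8 buffers */
static int PadTo4(int ld)
{
    return (ld + 3) / 4 * 4;
}

int main()
{
    printf("%d %d\n", Int8GemmShapeOK(64, 128) ? 1 : 0, PadTo4(10));   /* prints: 1 12 */
    return 0;
}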
source/tensor/core/arithmetic/XTensorBLAS.h (View file @ 44bf9fa6)
...
...
@@ -26,8 +26,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)

half float2half_rn(float a);

/* matrix multiplication (BLAS) */
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                   XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
...
...
source/tensor/core/getandset/SetData.cpp (View file @ 44bf9fa6)
...
...
@@ -432,6 +432,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
    //CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");

    if (tensor->devID < 0) {
        _SetDataRand(tensor, lower, upper);

        DTYPE * data = (DTYPE*)tensor->data;
        for (int i = 0; i < tensor->unitNum; i++) {
            if (data[i] >= p)
                data[i] = value;
            else
                data[i] = 0;
        }
    }
    else {
#ifdef USE_CUDA
        _CudaSetDataRandP(tensor, lower, upper, p, value);
#else
        ShowNTErrors("Please recompile the code by specifying USE_CUDA");
#endif // USE_CUDA
    }
}
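In this commit _SetDataRandP is used to build an inverted-dropout mask in a single call (see Dropout.cpp below). A minimal sketch of that pattern, assuming only the function above:

/* hedged sketch: dropped items become 0, kept items carry the 1/(1-p) scale */
void MakeDropoutMask(XTensor * mask, DTYPE dropProb)
{
    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
    _SetDataRandP(mask, 0, 1.0F, dropProb, scaleFactor);
}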
/*
generate data items with a normal distribution with specified mean and standard deviation
>> tensor - the tensor that keeps the data
>> mean - mean or expectation of the distribution
...
...
source/tensor/core/getandset/SetData.cu (View file @ 44bf9fa6)
...
...
@@ -28,6 +28,7 @@
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../getandset/ConvertDataType.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -197,6 +198,7 @@ set data array with a uniform distribution in [low, high]
__global__
void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half lowerHalf = __float2half(lower);
half varianceHalf = __float2half(variance);
...
...
@@ -204,6 +206,47 @@ void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
if (i < size) {
d[i] = d[i] * varianceHalf + lowerHalf;
}
#endif
}
/*
set data items to a pre-defined value if its value >= p, set it to 0 otherwise
>> d - pointer to the data array
>> size - size of the array
>> p - the threshold
>> value - the value we intend to assign to the item
*/
__global__
void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= p)
d[i] = value;
else
d[i] = 0;
}
}
__global__
void KernelSetDataPCutHalf(half * d, int size, DTYPE p, DTYPE value)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half halfP = __float2half(p);
half halfValue = __float2half(value);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= halfP)
d[i] = halfValue;
else
d[i] = 0;
}
#endif
}
/*
...
...
@@ -473,30 +516,77 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t gen;
curandCreateGenerator (&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
if (tensor->dataType == X_FLOAT){
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
}
else{
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor1.data, tensor1.unitNum);
}
//curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
//curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
DTYPE variance = upper - lower;
if (variance != 1.0F || lower != 0) {
if (tensor->dataType == X_FLOAT) {
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_DOUBLE) {
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_FLOAT16) {
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID,tensor->mem);
curandGenerateUniform(gen, (float *)tensor1.data, tensor1.unitNum);
curandDestroyGenerator(gen);
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
else {
ShowNTErrors("TODO!");
}
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
_CudaSetDataRand(tensor, lower, upper);
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
if (tensor->dataType == X_FLOAT) {
KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
}
else if (tensor->dataType == X_FLOAT16) {
KernelSetDataPCutHalf << <blocks, threads >> >((__half*)tensor->data, tensor->unitNum, p, value);
}
else {
ShowNTErrors("TODO!")
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
...
...
source/tensor/core/getandset/SetData.cuh (View file @ 44bf9fa6)
...
...
@@ -49,6 +49,10 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
...
...
source/tensor/core/getandset/SetData.h (View file @ 44bf9fa6)
...
...
@@ -57,6 +57,10 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);

/* generate data items with a uniform distribution in [lower, upper] and set
   the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);

/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
...
...
source/tensor/core/math/Power.cpp (View file @ 44bf9fa6)
...
...
@@ -35,8 +35,6 @@ get the power(a, p)
*/
void _Power(const XTensor * a, XTensor * b, DTYPE p)
{
    CheckNTErrors(a->dataType == b->dataType, "Unmatched tensors in addition!");

#ifdef USE_CUDA
    /* run it on GPUs */
    if (a->devID >= 0) {
...
...
source/tensor/core/movement/Gather.cpp (View file @ 44bf9fa6)
source/tensor/core/movement/Spread.cpp (View file @ 44bf9fa6)
source/tensor/core/movement/Spread.cu (View file @ 44bf9fa6)
source/tensor/core/reduce/ReduceMax.cpp (View file @ 44bf9fa6)
...
...
@@ -23,7 +23,6 @@
#include "../../XName.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/reduce/ReduceMax.cu (View file @ 44bf9fa6)
...
...
@@ -24,7 +24,6 @@
#include "../../XUtility.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
...
...
@@ -544,22 +543,22 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (input->dataType == DEFAULT_DTYPE) {
if (stride == 1 && blockNum >= 10) {
if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128) {
KernelReduceMaxOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
}
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceMaxOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
}
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
...
...
@@ -581,7 +580,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -590,7 +589,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -599,7 +598,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -608,7 +607,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -617,18 +616,10 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
iter++;
} while (strideNum > 1);
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
else if (input->dataType == X_FLOAT16) {
do {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * iData = NULL;
...
...
@@ -652,7 +643,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -660,7 +651,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -668,7 +659,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -676,7 +667,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -684,8 +675,10 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
...
...
@@ -693,9 +686,6 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
} while (strideNum > 1);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(input->devID, devIDBackup);
...
...
source/tensor/core/reduce/ReduceSum.cu (View file @ 44bf9fa6)
...
...
@@ -735,8 +735,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (input->dataType == DEFAULT_DTYPE) {
if (stride == 1 && blockNum >= 10) {
if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
...
...
@@ -752,7 +751,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
else if (stride != 1 && stride * blockNum > 4096 && input->dataType == DEFAULT_DTYPE){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
...
...
@@ -763,6 +762,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
...
...
@@ -783,7 +783,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 128) {
...
...
@@ -793,7 +793,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
...
...
@@ -803,7 +803,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
...
...
@@ -813,7 +813,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else {
...
...
@@ -823,23 +823,11 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
sp = NULL;
power = (DTYPE)1.0;
isExp = false;
iter++;
} while (strideNum > 1);
}
}
else if (input->dataType == X_FLOAT16) {
do {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
...
...
@@ -866,7 +854,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 128) {
...
...
@@ -875,7 +863,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 256) {
...
...
@@ -884,7 +872,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 512) {
...
...
@@ -893,7 +881,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else {
...
...
@@ -902,9 +890,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
...
...
@@ -915,13 +904,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
iter++;
} while (strideNum > 1);
}
else {
ShowNTErrors("TODO!");
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL)
...
...
source/tensor/function/Dropout.cpp (View file @ 44bf9fa6)
...
...
@@ -27,6 +27,7 @@
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/CHeader.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -40,7 +41,7 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data.
the same inference procedure as that on the test data with no use of dropout.
>> x - input tensor
>> y - output tensor
...
...
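The 1/(1-p) factor mentioned in the comment above is the usual inverted-dropout argument; sketched in LaTeX with p the drop probability:

% each element survives with probability 1-p and is scaled by 1/(1-p)
\mathbb{E}[y_i] = (1-p)\cdot\frac{x_i}{1-p} + p\cdot 0 = x_i

So the expected activation already matches the plain inference pass, and no extra factor is needed on the test data.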
@@ -139,25 +140,35 @@ the same inference procedure as that with no use of dropout on the test data.
>> x - input tensor
>> dropProb - probability to set an element to zero
>> leadingDim - the dimension along which we generate the random numbers and perform broadcasting
>> leadingDim2 - another dimension along which we generate the random numbers and perform broadcasting
<< return - tensor after dropout
*/
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim2)
{
    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");

    int n = leadingDim < 0 ? x.order - 1 : leadingDim;
    XTensor mask;
    DTYPE * maskArray = NULL;
    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
    CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");

    if (leadingDim < 0 && leadingDim2 < 0){
        XTensor mask;
        InitTensor(&mask, &x);
        _SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
        return Multiply(x, mask);
    }
    else if (leadingDim2 < 0){
        int n = leadingDim;
        DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
        CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");

        /* generate a mask tensor with probability p */
        int unitNum = x.dimSize[n];
        DTYPE * maskArray = new DTYPE[unitNum];
        maskArray = new DTYPE[unitNum];

        //srand((unsigned int)time(NULL));
        for (int i = 0; i < unitNum; i++) {
        for (int i = 0; i < unitNum; i++)
            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
        }

        XTensor mask;
        InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
...
...
@@ -167,57 +178,66 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
        if (x.dataType == X_FLOAT) {
            return MultiplyDim(x, mask, n, 0);
            return MultiplyDim(x, mask, n);
        }
        else
        else if (x.dataType == X_FLOAT16) {
            XTensor mask1;
            mask1 = ConvertDataType(mask, X_FLOAT16);
            XTensor mask1(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
            //mask1 = ConvertDataType(mask, X_FLOAT16);
            _ConvertDataType(&mask, &mask1);
            return MultiplyDim(x, mask1, n, 0);
            return MultiplyDim(x, mask1, n);
        }
        }
        else {
            ShowNTErrors("TODO!");
        }
    }
    else {
        int n = leadingDim;
        int m = leadingDim2;
        CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
        CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");

        /* generate a mask tensor with probability p */
        int unitNum = x.dimSize[n] * x.dimSize[m];
        maskArray = new DTYPE[unitNum];
//XTensor DropoutFloat16(const XTensor &x, DTYPE dropProb, int leadingDim)
//{
// CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
//
// int n = leadingDim < 0 ? x.order - 1 : leadingDim;
//
// CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
//
// DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
//
// /* generate a mask tensor with probability p */
// int unitNum = x.dimSize[n];
// DTYPE * maskArray = new DTYPE[unitNum];
//
// //srand((unsigned int)time(NULL));
// for (int i = 0; i < unitNum; i++) {
// maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
// }
//
// XTensor mask;
// InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
// mask.SetData(maskArray, unitNum);
//
// delete[] maskArray;
//
// XTensor halfMask;
// halfMask = ConvertDataType(mask, X_FLOAT16);
// XTensor halfX;
// halfX = ConvertDataType(x, X_FLOAT16);
// XTensor result;
// XTensor halfResult;
//
// halfResult = MultiplyDim(halfX, halfMask, n, 0);
//
// result = ConvertDataType(halfResult, X_FLOAT);
// return result;
// /*return MultiplyDim(x, mask1, n, 0);*/
//}
//srand((unsigned int)time(NULL));
        for (int i = 0; i < unitNum; i++)
            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);

        int dims[MAX_TENSOR_DIM_NUM];
        for (int i = 0; i < x.order; i++)
            dims[i] = 1;
        dims[n] = x.GetDim(n);
        dims[m] = x.GetDim(m);

        InitTensor(&mask, x.order, dims, X_FLOAT, x.denseRatio, x.devID, x.mem);
        mask.SetData(maskArray, unitNum);

        delete[] maskArray;

        if (x.dataType == X_FLOAT) {
            return MultiplyBroadcast(x, mask);
        }
        else if (x.dataType == X_FLOAT16) {
            XTensor mask1(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
            //mask1 = ConvertDataType(mask, X_FLOAT16);
            _ConvertDataType(&mask, &mask1);
            return MultiplyBroadcast(x, mask1);
        }
        else {
            ShowNTErrors("TODO!");
        }
    }
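A short usage sketch for the extended interface reconstructed above (illustrative dimension indices; with both leading dimensions given, the mask varies only along those two dimensions and is broadcast over the rest):

/* hedged example: dropout on a [batch, length, hidden] tensor */
XTensor DropoutExamples(const XTensor &x)
{
    /* element-wise dropout: one random draw per element, via _SetDataRandP */
    XTensor y1 = Dropout(x, 0.1F);

    /* dropout broadcast along a single dimension (here dimension 2) */
    XTensor y2 = Dropout(x, 0.1F, 2);

    /* dropout broadcast along two dimensions (here dimensions 1 and 2) */
    XTensor y3 = Dropout(x, 0.1F, 1, 2);

    return y3;
}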
/*
dropout function without broadcast
...
...
@@ -235,7 +255,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
    int unitNum = x.unitNum;
    DTYPE * maskArray = new DTYPE[unitNum];

    srand((unsigned int)time(NULL));
    for (int i = 0; i < unitNum; i++)
        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
...
...
source/tensor/function/Dropout.h (View file @ 44bf9fa6)
...
...
@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
inline DTYPE RandomBernoulli(DTYPE dropProb, DTYPE value)
{
    return (DTYPE)rand() / (DTYPE)RAND_MAX >= dropProb ? (DTYPE)value : 0;
}
/* dropout function */
...
...
@@ -42,9 +41,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
                      unsigned int seed, DTYPE dropProb, int leadingDim = -1);

/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1, int leadingDim2 = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
...
...
source/tensor/function/LogSoftmax.cpp (View file @ 44bf9fa6)
...
...
@@ -27,9 +27,6 @@
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/movement/CopyValues.h"
#include "../../tensor/core/getandset/ConvertDataType.h"
using namespace nts;

namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -182,125 +179,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
delete[] dimSize;
// if (!x->isSparse && !y->isSparse &&
// x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
// {
// int * dimSize = new int[x->order - 1];
// for (int i = 0; i < x->order; i++) {
// if (i < leadDim)
// dimSize[i] = -x->dimSize[i];
// else if (i > leadDim)
// dimSize[i - 1] = -x->dimSize[i];
// }
//
// XMem * mem = x->mem;
// XTensor * max = NULL;
// XTensor * sum = NULL;
// XTensor * blockx = NULL;
// XTensor * blocky = NULL;
// XTensor * blockMax = NULL;
// XTensor * blockSum = NULL;
//
// int dimensionSize = y->dimSizeRDI[leadDimRDI];
// int stride = 1;
// int blockSize = 1;
// int blockNum = 1;
//
// for (int i = 0; i < leadDimRDI; i++)
// stride *= y->dimSizeRDI[i];
// blockSize = stride * dimensionSize;
// blockNum = y->unitNum / blockSize;
//
// max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
// sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
//
// _ReduceMax(x, max, leadDim);
// _ReduceSum(x, sum, leadDim, max, 1.0F, true);
//
// if (x->devID >= 0) {
// if (leadDimRDI == 0) {
// blockSize = y->unitNum;
// blockNum = 1;
// blockx = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// }
// else {
// blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// }
// }
//
// for (int k = 0; k < blockNum; k++) {
// int m = stride;
// int n = dimensionSize;
//
// DTYPE * ip = (DTYPE*)x->data + k * blockSize;
// DTYPE * op = (DTYPE*)y->data + k * blockSize;
// DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
// DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
//
// if (x->devID < 0) {
// for (int j = 0; j < m; j++) {
// DTYPE sumValue = sp[j];
// if (sumValue == 0) {
// for (int i = 0; i < n; i++)
// op[i * m + j] = 0;
// }
// else {
// for (int i = 0; i < n; i++) {
// DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
// if (IsNAN(r))
// r = LOGPROB_MIN;
// if (IsINF(r))
// r = LOGPROB_MIN;
//
// op[i * m + j] = MAX(r, LOGPROB_MIN);
// }
// }
// }
// }
// else {
// blockx->data = ip;
// blocky->data = op;
// blockMax->data = mp;
// blockSum->data = sp;
//#ifdef USE_CUDA
// if (leadDimRDI == 0)
// _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
// else
// _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
//#else
// ShowNTErrors("Please specify USE_CUDA and recompile the code!");
//#endif
// blockx->data = NULL;
// blocky->data = NULL;
// blockMax->data = NULL;
// blockSum->data = NULL;
// }
// }
//
// DelTensorBuf(max);
// DelTensorBuf(sum);
//
// if (x->devID >= 0) {
// delete blockx;
// delete blocky;
// delete blockMax;
// delete blockSum;
// }
//
// delete[] dimSize;
// }
// else
// ShowNTErrors("TODO!");
}
/*
...
...
source/tensor/function/LogSoftmax.cu (View file @ 44bf9fa6)
...
...
@@ -26,7 +26,6 @@
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
#include "device_launch_parameters.h"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/function/Softmax.cpp (View file @ 44bf9fa6)
...
...
@@ -26,7 +26,6 @@
#include "../XUtility.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/function/Softmax.cu (View file @ 44bf9fa6)
...
...
@@ -29,7 +29,6 @@
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
#include "../XUtility.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -86,8 +85,6 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
/* synchronize to make sure the values of max and sum are loaded */
__syncthreads();
//printf("1: %d %d %d %d\n", i, strideSizeTotal, j, strideNum);
if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
...
...
@@ -145,8 +142,6 @@ void KernelSoftmaxComputeTensorHalf(__half * x, __half * max, __half * sum, __ha
/* synchronize to make sure the values of max and sum are loaded */
__syncthreads();
//printf("2: %d %d %d %d\n",i ,stride * blockNum ,j ,strideNum);
if(i < stride * blockNum && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
...
...
@@ -256,7 +251,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
if (leadDim != 0 || dimensionSize <= 10) {
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
...
...
@@ -269,8 +263,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
}
else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
//printf("%d\n\n",dimensionSize);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelSoftmaxComputeTensorHalf <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
stride, dimensionSize, blockNum);
...
...
@@ -279,10 +271,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
ShowNTErrors("TODO!");
}
/*XTensor y1;
y1 = ConvertDataType(*y, X_FLOAT);
y1.Dump(stderr, "y1:");*/
BacktoCudaDev(x->devID, devIDBackup);
}
...
...