Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
Emmay
NiuTrans.Tensor
Commits
44bf9fa6
Commit
44bf9fa6
authored
Jan 21, 2019
by
linye
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Merge了最新版本代码,修复了一些Bug
parent
c9c53870
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
63 个修改的文件
包含
1283 行增加
和
1085 行删除
+1283
-1085
source/network/Main.cpp
+78
-158
source/network/XBackwardLoss.cpp
+1
-3
source/network/XBackwardMath.cpp
+66
-31
source/network/XBackwardMath.h
+10
-0
source/network/XNet.cpp
+1
-2
source/sample/fnnlm/FNNLM.cpp
+18
-10
source/sample/transformer/T2TAttention.cpp
+146
-18
source/sample/transformer/T2TAttention.h
+3
-2
source/sample/transformer/T2TDecoder.cpp
+34
-5
source/sample/transformer/T2TDecoder.h
+49
-3
source/sample/transformer/T2TEmbedding.cpp
+8
-3
source/sample/transformer/T2TEmbedding.h
+1
-1
source/sample/transformer/T2TEncoder.cpp
+3
-2
source/sample/transformer/T2TModel.cpp
+15
-10
source/sample/transformer/T2TOutput.cpp
+1
-1
source/sample/transformer/T2TTrainer.cpp
+0
-0
source/sample/transformer/T2TTrainer.h
+33
-2
source/sample/transformer/Transformer.cpp
+3
-2
source/tensor/XDevice.cpp
+9
-0
source/tensor/XDevice.h
+6
-0
source/tensor/XMem.cpp
+17
-0
source/tensor/XMem.h
+4
-0
source/tensor/XName.cpp
+4
-0
source/tensor/XName.h
+4
-2
source/tensor/XTensor.cpp
+2
-6
source/tensor/core/arithmetic/Div.cpp
+0
-2
source/tensor/core/arithmetic/DivDim.cu
+0
-0
source/tensor/core/arithmetic/MatrixMul.cpp
+4
-213
source/tensor/core/arithmetic/MatrixMul.h
+0
-9
source/tensor/core/arithmetic/MatrixMul2D.cpp
+1
-13
source/tensor/core/arithmetic/MatrixMul2D.cu
+0
-12
source/tensor/core/arithmetic/MatrixMul2DParallel.cpp
+0
-38
source/tensor/core/arithmetic/MatrixMulBatched.cpp
+0
-139
source/tensor/core/arithmetic/MatrixMulBatched.h
+0
-8
source/tensor/core/arithmetic/Multiply.cpp
+0
-2
source/tensor/core/arithmetic/MultiplyDim.cpp
+146
-6
source/tensor/core/arithmetic/MultiplyDim.cu
+3
-3
source/tensor/core/arithmetic/MultiplyDim.h
+9
-2
source/tensor/core/arithmetic/SubDim.cu
+163
-46
source/tensor/core/arithmetic/Sum.cu
+25
-11
source/tensor/core/arithmetic/SumDim.cpp
+146
-1
source/tensor/core/arithmetic/SumDim.cu
+0
-0
source/tensor/core/arithmetic/SumDim.cuh
+2
-0
source/tensor/core/arithmetic/SumDim.h
+10
-0
source/tensor/core/arithmetic/XTensorBLAS.cu
+15
-79
source/tensor/core/arithmetic/XTensorBLAS.h
+0
-2
source/tensor/core/getandset/SetData.cpp
+33
-0
source/tensor/core/getandset/SetData.cu
+109
-19
source/tensor/core/getandset/SetData.cuh
+4
-0
source/tensor/core/getandset/SetData.h
+4
-0
source/tensor/core/math/Power.cpp
+0
-2
source/tensor/core/movement/Gather.cpp
+1
-2
source/tensor/core/movement/Spread.cpp
+1
-2
source/tensor/core/movement/Spread.cu
+1
-2
source/tensor/core/reduce/ReduceMax.cpp
+0
-1
source/tensor/core/reduce/ReduceMax.cu
+0
-0
source/tensor/core/reduce/ReduceSum.cu
+0
-0
source/tensor/function/Dropout.cpp
+88
-69
source/tensor/function/Dropout.h
+2
-5
source/tensor/function/LogSoftmax.cpp
+0
-122
source/tensor/function/LogSoftmax.cu
+0
-1
source/tensor/function/Softmax.cpp
+0
-1
source/tensor/function/Softmax.cu
+0
-12
没有找到文件。
source/network/Main.cpp
查看文件 @
44bf9fa6
...
@@ -66,6 +66,7 @@ void PowerFP16Test();
...
@@ -66,6 +66,7 @@ void PowerFP16Test();
void
ClipFP16Test
();
void
ClipFP16Test
();
void
GatherFP16Test
();
void
GatherFP16Test
();
void
SetDataGPUFP16Test
();
void
SetDataGPUFP16Test
();
void
SumIntTest
();
using
namespace
nts
;
using
namespace
nts
;
using
namespace
fnnlm
;
using
namespace
fnnlm
;
...
@@ -89,8 +90,6 @@ int main( int argc, const char ** argv )
...
@@ -89,8 +90,6 @@ int main( int argc, const char ** argv )
//return 0;
//return 0;
//ConvertBackwardTest();
//ConvertBackwardTest();
//return 0;
//return 0;
//DropoutFP16Test();
//return 0;
//UnsqueezeFP16Test();
//UnsqueezeFP16Test();
//return 0;
//return 0;
//ReduceMaxFP16Test();
//ReduceMaxFP16Test();
...
@@ -143,11 +142,17 @@ int main( int argc, const char ** argv )
...
@@ -143,11 +142,17 @@ int main( int argc, const char ** argv )
//InitCPUFP16Test();
//InitCPUFP16Test();
//return 0;
//return 0;
SetDataGPUFP16
Test
();
//MycublasGemmEx
Test();
return
0
;
//
return 0;
MycublasGemmExTest
();
//SumIntTest();
return
0
;
//return 0;
//DropoutFP16Test();
//return 0;
//SetDataGPUFP16Test();
//return 0;
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-fnnlm"
))
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-fnnlm"
))
...
@@ -205,27 +210,69 @@ void MycublasGemmExTest() {
...
@@ -205,27 +210,69 @@ void MycublasGemmExTest() {
c1
.
Dump
(
stderr
,
"c1:"
);
c1
.
Dump
(
stderr
,
"c1:"
);
}
}
void
SumIntTest
()
{
XTensor
a
;
XTensor
b
;
XTensor
c
;
XTensor
inta
;
XTensor
intb
;
XTensor
intc
;
InitTensor2D
(
&
a
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
2
,
2
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
5.0
,
5.0
);
b
.
SetDataRand
(
-
5.0
,
5.0
);
a
.
Dump
(
stderr
,
"a:"
);
b
.
Dump
(
stderr
,
"b:"
);
inta
=
ConvertDataType
(
a
,
X_INT
);
intb
=
ConvertDataType
(
b
,
X_INT
);
inta
.
Dump
(
stderr
,
"inta:"
);
intb
.
Dump
(
stderr
,
"intb:"
);
intc
=
Sum
(
inta
,
intb
);
intc
.
Dump
(
stderr
,
"intc:"
);
}
void
SetDataGPUFP16Test
()
{
void
SetDataGPUFP16Test
()
{
srand
(
time
(
NULL
));
srand
(
time
(
NULL
));
XTensor
a1
;
/*XTensor m;
InitTensor2D
(
&
a1
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor2D(&m, 2, 2, X_FLOAT, 0);
_SetDataRand
(
&
a1
,
-
5.0
F
,
5.0
F
);
m.SetDataRand(0.0, 10.0);*/
a1
.
Dump
(
&
a1
,
stderr
,
"a:
\n
"
);
XTensor
a
;
XTensor
*
m
=
NewTensor2D
(
2
,
2
,
X_FLOAT
,
0
)
;
InitTensor2D
(
&
a
,
2
,
2
,
X_FLOAT16
,
0
);
m
->
SetDataRand
(
0.0
,
10.
0
);
_SetDataRand
(
&
a
,
-
5.0
F
,
5.0
F
);
//XTensor a1;
//InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
//_CopyValues(&m, &a1);
//_SetDataRand(&a1, -1.0F, 1.0F);
//a1.Dump(&a1, stderr, "a:\n");
a
.
Dump
(
&
a
,
stderr
,
"a:
\n
"
);
/*XTensor a;
InitTensor2D(&a, 2, 2, X_FLOAT16, 0);*/
XTensor
b
;
XTensor
*
a
=
NewTensor2D
(
2
,
2
,
X_FLOAT16
,
0
);
InitTensor2D
(
&
b
,
2
,
2
,
X_FLOAT
,
0
);
b
=
ConvertDataType
(
a
,
X_FLOAT
);
_ConvertDataType
(
m
,
a
);
a
->
Dump
(
a
,
stderr
,
"a:
\n
"
);
_SetDataRand
(
a
,
0.0
F
,
1.0
F
);
a
->
Dump
(
a
,
stderr
,
"a:
\n
"
);
//XTensor b;
//InitTensor2D(&b, 2, 2, X_FLOAT, 0);
//b = ConvertDataType(a, X_FLOAT);
//b.Dump(stderr, "b:\n");
b
.
Dump
(
stderr
,
"b:
\n
"
);
}
}
void
ClipFP16Test
()
{
void
ClipFP16Test
()
{
...
@@ -447,7 +494,7 @@ void FloatToInt8Test() {
...
@@ -447,7 +494,7 @@ void FloatToInt8Test() {
InitTensor2D
(
&
a
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
a
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
2
,
2
,
X_INT8
,
0
);
InitTensor2D
(
&
b
,
2
,
2
,
X_INT8
,
0
);
a
.
SetDataRand
(
-
5.0
F
,
5.0
F
);
a
.
SetDataRand
(
5.0
F
,
5.0
F
);
a
.
Dump
(
stderr
,
"a:"
);
a
.
Dump
(
stderr
,
"a:"
);
b
=
ConvertDataType
(
a
,
X_INT8
);
b
=
ConvertDataType
(
a
,
X_INT8
);
...
@@ -741,8 +788,8 @@ void MultiplyDimFP16Test()
...
@@ -741,8 +788,8 @@ void MultiplyDimFP16Test()
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfB
=
ConvertDataType
(
b
,
X_FLOAT16
);
halfB
=
ConvertDataType
(
b
,
X_FLOAT16
);
c1
=
MultiplyDim
(
a1
,
b1
,
1
,
0
);
c1
=
MultiplyDim
(
a1
,
b1
,
1
);
halfC
=
MultiplyDim
(
halfA
,
halfB
,
1
,
0
);
halfC
=
MultiplyDim
(
halfA
,
halfB
,
1
);
c
=
ConvertDataType
(
halfC
,
X_FLOAT
);
c
=
ConvertDataType
(
halfC
,
X_FLOAT
);
...
@@ -950,26 +997,26 @@ void SubFP16Test()
...
@@ -950,26 +997,26 @@ void SubFP16Test()
void
DropoutFP16Test
()
void
DropoutFP16Test
()
{
{
srand
(
time
(
NULL
));
XTensor
a
;
XTensor
a
;
XTensor
b
;
XTensor
b
;
XTensor
b1
;
XTensor
b1
;
XTensor
halfA
;
XTensor
halfA
;
XTensor
halfB
;
XTensor
halfB
;
InitTensor2D
(
&
a
,
10
,
1
0
,
X_FLOAT
,
0
);
InitTensor2D
(
&
a
,
10
,
1
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
5.0
F
,
5.0
F
);
a
.
SetDataRand
(
-
5.0
F
,
5.0
F
);
/*a.Dump(stderr, "a:");*/
a
.
Dump
(
stderr
,
"a:"
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfB
=
Dropout
(
halfA
,
0.5
);
halfB
=
Dropout
(
halfA
,
0.2
);
b1
=
Dropout
(
a
,
0.3
);
b1
=
Dropout
(
a
,
0.2
);
b
=
ConvertDataType
(
halfB
,
X_FLOAT
);
b
.
Dump
(
stderr
,
"b
:"
);
halfB
.
Dump
(
&
halfB
,
stderr
,
"halfB
:"
);
//b1.Dump(
stderr, "b1:");
b1
.
Dump
(
&
b1
,
stderr
,
"b1:"
);
}
}
void
ConvertBackwardTest
()
void
ConvertBackwardTest
()
...
@@ -1069,132 +1116,4 @@ void ConvertTest()
...
@@ -1069,132 +1116,4 @@ void ConvertTest()
a1
.
Dump
(
stderr
,
"halfa:"
);
a1
.
Dump
(
stderr
,
"halfa:"
);
}
}
}
}
\ No newline at end of file
void
MatrixMulFloat16AndFloatTest
()
{
XTensor
a
;
XTensor
b
;
XTensor
c
;
InitTensor2D
(
&
a
,
5000
,
5000
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
5000
,
5000
,
X_FLOAT
,
0
);
InitTensor2D
(
&
c
,
5000
,
5000
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
10.0
F
,
10.0
F
);
b
.
SetDataRand
(
-
10.0
F
,
10.0
F
);
int
recurrentNum
=
10000
;
double
startT1
=
GetClockSec
();
for
(
int
i1
=
0
;
i1
<
recurrentNum
;
i1
++
)
{
c
=
MatrixMul
(
&
a
,
&
b
);
}
printf
(
"ElapsedFloat32 = %.2f s
\n
"
,
GetClockSec
()
-
startT1
);
double
startT2
=
GetClockSec
();
for
(
int
i2
=
0
;
i2
<
recurrentNum
;
i2
++
)
{
c
=
MatrixMulFloat16
(
&
a
,
&
b
);
}
printf
(
"ElapsedFloat16 = %.2f s
\n
"
,
GetClockSec
()
-
startT2
);
}
void
MatrixMul2DFloat16Test
()
{
XTensor
a
;
XTensor
b
;
XTensor
c
;
XTensor
a00
;
XTensor
b00
;
XTensor
c00
;
XTensor
c01
;
XTensor
halfa
;
XTensor
halfb
;
XTensor
halfc
;
InitTensor3D
(
&
a
,
3
,
2
,
3
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor3D
(
&
c
,
3
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor3D
(
&
a00
,
3
,
2
,
3
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b00
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor3D
(
&
c00
,
3
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor3D
(
&
c01
,
3
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor3D
(
&
halfa
,
3
,
2
,
3
,
X_FLOAT16
,
0
);
InitTensor2D
(
&
halfb
,
3
,
2
,
X_FLOAT16
,
0
);
InitTensor3D
(
&
halfc
,
3
,
2
,
2
,
X_FLOAT16
,
0
);
DTYPE
aData
[
3
][
2
][
3
]
=
{
{
{
0.02121212144
F
,
-
1.0234556667
F
,
2.04354565678
F
},
{
2.0234567332
F
,
-
1.0213469654
F
,
-
3.01568321
F
}
},
{
{
-
1.022347899421
F
,
2.012589653664
F
,
4.035346643
F
},
{
3.01234544634
F
,
1.0324354635
F
,
2.0546578332
F
}
},
{
{
-
1.0235743446
F
,
3.0335753334
F
,
2.0653323234
F
},
{
1.03235643232
F
,
-
1.023463345542
F
,
0.0335563322
F
}
}
};
DTYPE
bData
[
3
][
2
]
=
{
{
-
1.034466323232
F
,
-
2.0546676442
F
},
{
-
3.0224354656
F
,
4.034467866532
F
},
{
5.02354657442
F
,
-
6.0324355767443
F
}
};
a
.
SetData
(
aData
,
18
);
b
.
SetData
(
bData
,
6
);
_MatrixMul
(
&
a
,
X_NOTRANS
,
&
b
,
X_NOTRANS
,
&
c
);
_ConvertDataType
(
&
a
,
&
halfa
);
_ConvertDataType
(
&
b
,
&
halfb
);
_MatrixMul
(
&
halfa
,
X_NOTRANS
,
&
halfb
,
X_NOTRANS
,
&
halfc
);
_ConvertDataType
(
&
halfc
,
&
c01
);
_ConvertDataType
(
&
halfa
,
&
a00
);
_ConvertDataType
(
&
halfb
,
&
b00
);
_MatrixMul
(
&
a00
,
X_NOTRANS
,
&
b00
,
X_NOTRANS
,
&
c00
);
c
.
Dump
(
stderr
,
"c:"
);
c01
.
Dump
(
stderr
,
"c01:"
);
c00
.
Dump
(
stderr
,
"c0:"
);
XTensor
a1
;
XTensor
b1
;
XTensor
c1
;
XTensor
a10
;
XTensor
b10
;
XTensor
c10
;
XTensor
c11
;
XTensor
halfa1
;
XTensor
halfb1
;
XTensor
halfc1
;
InitTensor2D
(
&
a1
,
3
,
3
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b1
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
c1
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
a10
,
3
,
3
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b10
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
c10
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
c11
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
halfa1
,
3
,
3
,
X_FLOAT16
,
0
);
InitTensor2D
(
&
halfb1
,
3
,
2
,
X_FLOAT16
,
0
);
InitTensor2D
(
&
halfc1
,
3
,
2
,
X_FLOAT16
,
0
);
DTYPE
a1Data
[
3
][
3
]
=
{
{
0.02121212144
F
,
-
1.0234556667
F
,
2.043541565678
F
},
{
-
2.0234567332
F
,
1.0213469657774
F
,
-
3.0156837543321
F
}
,
{
1.022347899421
F
,
-
2.012589653664
F
,
4.03534634643
F
}};
DTYPE
b1Data
[
3
][
2
]
=
{
{
1.034466323232
F
,
-
2.0546676442
F
},
{
3.0224354656
F
,
-
4.034467866532
F
},
{
5.02354657442
F
,
6.0324355767443
F
}
};
a1
.
SetData
(
a1Data
,
9
);
b1
.
SetData
(
b1Data
,
6
);
_MatrixMul
(
&
a1
,
X_NOTRANS
,
&
b1
,
X_NOTRANS
,
&
c1
);
_ConvertDataType
(
&
a1
,
&
halfa1
);
_ConvertDataType
(
&
b1
,
&
halfb1
);
_MatrixMul
(
&
halfa1
,
X_NOTRANS
,
&
halfb1
,
X_NOTRANS
,
&
halfc1
);
_ConvertDataType
(
&
halfc1
,
&
c11
);
_ConvertDataType
(
&
halfa1
,
&
a10
);
_ConvertDataType
(
&
halfb1
,
&
b10
);
_MatrixMul
(
&
a10
,
X_NOTRANS
,
&
b10
,
X_NOTRANS
,
&
c10
);
c1
.
Dump
(
stderr
,
"c1:"
);
c11
.
Dump
(
stderr
,
"c11:"
);
c10
.
Dump
(
stderr
,
"c10:"
);
}
source/network/XBackwardLoss.cpp
查看文件 @
44bf9fa6
...
@@ -87,8 +87,6 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
...
@@ -87,8 +87,6 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor
*
dedy
,
XTensor
*
padding
,
XTensor
*
dedy
,
XTensor
*
padding
,
LOSS_FUNCTION_NAME
lossName
)
LOSS_FUNCTION_NAME
lossName
)
{
{
//return;
if
(
gold
==
NULL
){
if
(
gold
==
NULL
){
if
(
dedy
->
dataType
==
X_FLOAT
)
if
(
dedy
->
dataType
==
X_FLOAT
)
_SetDataFixedFloat
(
dedy
,
1.0
F
);
_SetDataFixedFloat
(
dedy
,
1.0
F
);
...
@@ -97,7 +95,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
...
@@ -97,7 +95,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
else
if
(
dedy
->
dataType
==
X_INT
)
else
if
(
dedy
->
dataType
==
X_INT
)
_SetDataFixedInt
(
dedy
,
1
);
_SetDataFixedInt
(
dedy
,
1
);
else
{
else
{
//
ShowNTErrors("TODO");
ShowNTErrors
(
"TODO"
);
}
}
return
;
return
;
}
}
...
...
source/network/XBackwardMath.cpp
查看文件 @
44bf9fa6
...
@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
...
@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradMultiply
(
node
,
isEfficient
);
GradMultiply
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_MULTIPLYDIM
)
else
if
(
operID
==
MATH_MULTIPLYDIM
)
GradMultiplyDim
(
node
,
isEfficient
);
GradMultiplyDim
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_MULTIPLYBROADCAST
)
GradMultiplyBroadcast
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_NEGATE
)
else
if
(
operID
==
MATH_NEGATE
)
GradNegate
(
node
,
isEfficient
);
GradNegate
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_NORMALIZE
)
else
if
(
operID
==
MATH_NORMALIZE
)
...
@@ -87,6 +89,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
...
@@ -87,6 +89,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradSum
(
node
,
isEfficient
);
GradSum
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_SUMDIM
)
else
if
(
operID
==
MATH_SUMDIM
)
GradSumDim
(
node
,
isEfficient
);
GradSumDim
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_SUMBROADCAST
)
GradSumBroadcast
(
node
,
isEfficient
);
else
if
(
operID
==
REDUCE_REDUCEMEAN
)
else
if
(
operID
==
REDUCE_REDUCEMEAN
)
GradReduceMean
(
node
,
isEfficient
);
GradReduceMean
(
node
,
isEfficient
);
else
if
(
operID
==
REDUCE_REDUCESUM
)
else
if
(
operID
==
REDUCE_REDUCESUM
)
...
@@ -736,10 +740,6 @@ dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
...
@@ -736,10 +740,6 @@ dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
*/
void
XMathGrad
::
GradMultiplyDim
(
XTensor
*
node
,
bool
isEfficient
)
void
XMathGrad
::
GradMultiplyDim
(
XTensor
*
node
,
bool
isEfficient
)
{
{
/* XTensor node1;
node1 = ConvertDataType(*node, X_FLOAT);
node1.Dump(stderr, "node:");*/
XLink
&
income
=
node
->
income
;
XLink
&
income
=
node
->
income
;
CheckNTErrors
(
income
.
tailNum
==
2
,
"Wrong input tensor number for MULTIPLYDIM!"
);
CheckNTErrors
(
income
.
tailNum
==
2
,
"Wrong input tensor number for MULTIPLYDIM!"
);
...
@@ -751,14 +751,6 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
...
@@ -751,14 +751,6 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
/* dE/da */
/* dE/da */
_MultiplyDim
(
node
->
grad
,
b
,
a
->
grad
,
n
,
1.0
F
);
_MultiplyDim
(
node
->
grad
,
b
,
a
->
grad
,
n
,
1.0
F
);
//XTensor a1;
//a1 = ConvertDataType(*a, X_FLOAT);
//a1.Dump(stderr, "a:");
//XTensor b1;
//b1 = ConvertDataType(*b, X_FLOAT);
//b1.Dump(stderr, "b:");
/* dE/db */
/* dE/db */
int
order
=
a
->
order
;
int
order
=
a
->
order
;
...
@@ -777,19 +769,10 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
...
@@ -777,19 +769,10 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP
->
Reshape
(
2
,
reshapedSize
);
bGradTMP
->
Reshape
(
2
,
reshapedSize
);
/*XTensor bGradTMP1;
bGradTMP1 = ConvertDataType(*bGradTMP, X_FLOAT);
bGradTMP1.Dump(stderr, "bGradTMP:");*/
//if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor
*
bGradTMP2
=
NewTensorBuf
(
b
->
grad
,
b
->
devID
,
b
->
mem
);
XTensor
*
bGradTMP2
=
NewTensorBuf
(
b
->
grad
,
b
->
devID
,
b
->
mem
);
_ReduceSum
(
bGradTMP
,
bGradTMP2
,
0
);
_ReduceSum
(
bGradTMP
,
bGradTMP2
,
0
);
/* XTensor bGradTMP21;
bGradTMP21 = ConvertDataType(*bGradTMP2, X_FLOAT);
bGradTMP21.Dump(stderr, "bGradTMP2:");*/
_Sum
(
b
->
grad
,
bGradTMP2
,
b
->
grad
);
_Sum
(
b
->
grad
,
bGradTMP2
,
b
->
grad
);
DelTensorBuf
(
bGradTMP2
);
DelTensorBuf
(
bGradTMP2
);
...
@@ -832,21 +815,43 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
...
@@ -832,21 +815,43 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
DelTensorBuf
(
interGrad
);
DelTensorBuf
(
interGrad
);
}
}
//printf("\n");
//XTensor a2;
//a2 = ConvertDataType(*a, X_FLOAT);
//a2.Dump(stderr, "a2:");
//XTensor b2;
//b2 = ConvertDataType(*b, X_FLOAT);
//b2.Dump(stderr, "b2:");
DelTensorBuf
(
bGradTMP
);
DelTensorBuf
(
bGradTMP
);
node
->
visitMark
=
NODE_FINISHED
;
node
->
visitMark
=
NODE_FINISHED
;
}
}
/*
/*
gradient for multiplication by broadcasting:
c = a * b
where some dimensions of b are of size 1
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0...n)
where a.reduce(0...n) is the reduction along the dimension
whose size is 1 in b. Note that there might be several reductions.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void
XMathGrad
::
GradMultiplyBroadcast
(
XTensor
*
node
,
bool
isEfficient
)
{
XLink
&
income
=
node
->
income
;
CheckNTErrors
(
income
.
tailNum
==
2
,
"Wrong input tensor number for MULTIPLYBROADCAST!"
);
XTensor
*
a
=
income
.
tails
[
0
];
XTensor
*
b
=
income
.
tails
[
1
];
DTYPE
beta
=
income
.
GetParam
(
0
);
XNoder
::
MakeGrad
(
a
);
_MultiplyBroadcast
(
node
->
grad
,
b
,
a
->
grad
,
1.0
F
);
if
(
b
->
isVar
||
b
->
income
.
tailNum
>
0
){
ShowNTErrors
(
"TODO"
);
}
}
/*
gradient for negate
gradient for negate
for
for
c = -a
c = -a
...
@@ -1020,7 +1025,6 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
...
@@ -1020,7 +1025,6 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
_Sum
(
a
->
grad
,
node
->
grad
,
a
->
grad
,
scale
);
_Sum
(
a
->
grad
,
node
->
grad
,
a
->
grad
,
scale
);
node
->
visitMark
=
NODE_FINISHED
;
node
->
visitMark
=
NODE_FINISHED
;
}
}
...
@@ -1284,6 +1288,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
...
@@ -1284,6 +1288,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
node
->
visitMark
=
NODE_FINISHED
;
node
->
visitMark
=
NODE_FINISHED
;
}
}
/*
gradient for sum by broadcasting:
c = a + b * \beta
where some dimensions of b are of size 1
dE/da = dE/dc
dE/db = dE/dc * a.reduce(0..n) * \beta
where a.reduce(0..n) is the reduction along the dimension
whose size is 1 in b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void
XMathGrad
::
GradSumBroadcast
(
XTensor
*
node
,
bool
isEfficient
)
{
XLink
&
income
=
node
->
income
;
CheckNTErrors
(
income
.
tailNum
==
2
,
"Wrong input tensor number for SUMBROADCAST!"
);
XTensor
*
a
=
income
.
tails
[
0
];
XTensor
*
b
=
income
.
tails
[
1
];
DTYPE
beta
=
income
.
GetParam
(
0
);
XNoder
::
MakeGrad
(
a
);
_Sum
(
a
->
grad
,
node
->
grad
,
a
->
grad
);
if
(
b
->
isVar
||
b
->
income
.
tailNum
>
0
){
ShowNTErrors
(
"TODO"
);
}
}
/*
/*
gradient for reduceMean
gradient for reduceMean
for
for
...
...
source/network/XBackwardMath.h
查看文件 @
44bf9fa6
...
@@ -109,6 +109,11 @@ private:
...
@@ -109,6 +109,11 @@ private:
static
static
void
GradMultiplyDim
(
XTensor
*
node
,
bool
isEfficient
);
void
GradMultiplyDim
(
XTensor
*
node
,
bool
isEfficient
);
/* gradient for multiply one dimension: c = a * b
where some dimensions of b are of size 1 */
static
void
GradMultiplyBroadcast
(
XTensor
*
node
,
bool
isEfficient
);
/* gradient for negate */
/* gradient for negate */
static
static
void
GradNegate
(
XTensor
*
node
,
bool
isEfficient
);
void
GradNegate
(
XTensor
*
node
,
bool
isEfficient
);
...
@@ -143,6 +148,11 @@ private:
...
@@ -143,6 +148,11 @@ private:
static
static
void
GradSumDim
(
XTensor
*
node
,
bool
isEfficient
);
void
GradSumDim
(
XTensor
*
node
,
bool
isEfficient
);
/* gradient for sum by broadcasting: c = a + b * \beta
where some dimensions of b are of size 1 */
static
void
GradSumBroadcast
(
XTensor
*
node
,
bool
isEfficient
);
/* gradient for reduceMean */
/* gradient for reduceMean */
static
static
void
GradReduceMean
(
XTensor
*
node
,
bool
isEfficient
);
void
GradReduceMean
(
XTensor
*
node
,
bool
isEfficient
);
...
...
source/network/XNet.cpp
查看文件 @
44bf9fa6
...
@@ -27,7 +27,6 @@
...
@@ -27,7 +27,6 @@
#include "XBackwardFunc.h"
#include "XBackwardFunc.h"
#include "XBackwardShape.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"
namespace
nts
{
namespace
nts
{
...
@@ -266,7 +265,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
...
@@ -266,7 +265,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
XMathGrad
::
MakeGrad
(
node
,
isEfficent
);
XMathGrad
::
MakeGrad
(
node
,
isEfficent
);
else
if
(
XFuncGrad
::
IsFunc
(
node
))
else
if
(
XFuncGrad
::
IsFunc
(
node
))
XFuncGrad
::
MakeGrad
(
node
,
isEfficent
);
XFuncGrad
::
MakeGrad
(
node
,
isEfficent
);
else
if
(
XDataGrad
::
IsDataOP
(
node
))
else
if
(
XDataGrad
::
IsDataOP
(
node
))
XDataGrad
::
MakeGrad
(
node
,
isEfficent
);
XDataGrad
::
MakeGrad
(
node
,
isEfficent
);
else
if
(
XShapeGrad
::
IsShapeOP
(
node
))
else
if
(
XShapeGrad
::
IsShapeOP
(
node
))
XShapeGrad
::
MakeGrad
(
node
,
isEfficent
);
XShapeGrad
::
MakeGrad
(
node
,
isEfficent
);
...
...
source/sample/fnnlm/FNNLM.cpp
查看文件 @
44bf9fa6
...
@@ -839,6 +839,9 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
...
@@ -839,6 +839,9 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D
(
s
,
batchSize
,
model
.
vSize
,
model
);
InitModelTensor2D
(
s
,
batchSize
,
model
.
vSize
,
model
);
InitModelTensor2D
(
y
,
batchSize
,
model
.
vSize
,
model
);
InitModelTensor2D
(
y
,
batchSize
,
model
.
vSize
,
model
);
///* s = h_last * w */
//_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor
h_last1
;
XTensor
h_last1
;
h_last1
=
ScaleAndShift
(
h_last
,
100
,
0
);
h_last1
=
ScaleAndShift
(
h_last
,
100
,
0
);
...
@@ -850,22 +853,27 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
...
@@ -850,22 +853,27 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
int8H_last
=
ConvertDataType
(
h_last1
,
X_INT8
);
int8H_last
=
ConvertDataType
(
h_last1
,
X_INT8
);
int8W
=
ConvertDataType
(
w1
,
X_INT8
);
int8W
=
ConvertDataType
(
w1
,
X_INT8
);
XTensor
s1
;
InitTensor2D
(
&
s1
,
batchSize
,
model
.
vSize
,
X_FLOAT
,
model
.
devID
,
model
.
mem
);
_MatrixMul2D
(
&
int8H_last
,
X_NOTRANS
,
&
int8W
,
X_NOTRANS
,
&
s1
);
s
=
ScaleAndShift
(
s1
,
0.0001
,
0
);
XTensor
s1
;
InitTensor2D
(
&
s1
,
batchSize
,
model
.
vSize
,
X_INT
,
model
.
devID
,
model
.
mem
);
_MatrixMul2D
(
&
int8H_last
,
X_NOTRANS
,
&
int8W
,
X_NOTRANS
,
&
s1
);
XTensor
b2D
;
XTensor
b2D
;
InitTensor
(
&
b2D
,
&
s
);
InitTensor
2D
(
&
b2D
,
batchSize
,
model
.
vSize
,
X_FLOAT
,
model
.
devID
,
model
.
mem
);
_Unsqueeze
(
&
b
,
&
b2D
,
0
,
batchSize
);
_Unsqueeze
(
&
b
,
&
b2D
,
0
,
batchSize
);
_Sum
(
&
s
,
&
b2D
,
&
s
);
b2D
=
ScaleAndShift
(
b2D
,
10000
,
0
);
XTensor
b2D1
;
b2D1
=
ConvertDataType
(
b2D
,
X_INT
);
_Sum
(
&
s1
,
&
b2D1
,
&
s1
);
s
=
ConvertDataType
(
s1
,
X_FLOAT
);
s
=
ScaleAndShift
(
s
,
0.0001
,
0
);
/* y = softmax(s) */
/* y = softmax(s) */
_LogSoftmax
(
&
s
,
&
y
,
1
);
_LogSoftmax
(
&
s
,
&
y
,
1
);
}
}
}
}
...
@@ -1203,12 +1211,12 @@ void Test(const char * test, const char * result, FNNModel &model)
...
@@ -1203,12 +1211,12 @@ void Test(const char * test, const char * result, FNNModel &model)
fprintf
(
ofile
,
"%d "
,
ngrams
[
0
].
words
[
i
]);
fprintf
(
ofile
,
"%d "
,
ngrams
[
0
].
words
[
i
]);
for
(
int
i
=
0
;
i
<
ngramNum
;
i
++
)
for
(
int
i
=
0
;
i
<
ngramNum
;
i
++
)
fprintf
(
ofile
,
"%d "
,
ngrams
[
i
].
words
[
model
.
n
-
1
]);
fprintf
(
ofile
,
"%d "
,
ngrams
[
i
].
words
[
model
.
n
-
1
]);
fprintf
(
ofile
,
"||| "
);
fprintf
(
ofile
,
"||| "
);
for
(
int
i
=
0
;
i
<
model
.
n
-
1
;
i
++
)
for
(
int
i
=
0
;
i
<
model
.
n
-
1
;
i
++
)
fprintf
(
ofile
,
"<s> "
);
fprintf
(
ofile
,
"<s> "
);
for
(
int
i
=
0
;
i
<
ngramNum
;
i
++
)
for
(
int
i
=
0
;
i
<
ngramNum
;
i
++
)
fprintf
(
ofile
,
"%f "
,
probs
.
Get1D
(
i
));
fprintf
(
ofile
,
"%f "
,
probs
.
Get1D
(
i
));
fprintf
(
ofile
,
"||| %f
\n
"
,
prob
);
fprintf
(
ofile
,
"||| %f
\n
"
,
prob
);
loss
+=
-
prob
;
loss
+=
-
prob
;
wordCount
+=
ngramNum
;
wordCount
+=
ngramNum
;
...
...
source/sample/transformer/T2TAttention.cpp
查看文件 @
44bf9fa6
...
@@ -53,6 +53,42 @@ initialize the model
...
@@ -53,6 +53,42 @@ initialize the model
>> myDevID - device id
>> myDevID - device id
>> myMem - the memory pool
>> myMem - the memory pool
*/
*/
//void T2TAttention::InitModel(int argc, char ** argv,
// bool myIsMasked, int myIgnored,
// int myDevID, XMem * myMem)
//{
// devID = myDevID;
// mem = myMem;
// isMasked = myIsMasked;
// ignored = myIgnored;
//
// float minmax = 0;
//
// LoadParamInt(argc, argv, "nhead", &nhead, 8);
// LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
// LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
// LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
//
// InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
// InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
// InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
//
// float scale = 1.0F;
// float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
// float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
// float finfouta = (float)sqrt(6.0F * scale / (d + d));
// float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
//
// wk.SetDataRand(-finfoutk, finfoutk);
// wq.SetDataRand(-finfoutk, finfoutk);
// wv.SetDataRand(-finfoutv, finfoutv);
// wa.SetDataRand(-finfouta, finfouta);
// wbig.SetDataRand(-finfoutbig, finfoutbig);
//}
void
T2TAttention
::
InitModel
(
int
argc
,
char
**
argv
,
void
T2TAttention
::
InitModel
(
int
argc
,
char
**
argv
,
bool
myIsMasked
,
int
myIgnored
,
bool
myIsMasked
,
int
myIgnored
,
...
@@ -76,20 +112,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
...
@@ -76,20 +112,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
InitTensor2D
(
&
wq
,
d
,
dk
,
X_FLOAT16
,
devID
,
mem
);
InitTensor2D
(
&
wq
,
d
,
dk
,
X_FLOAT16
,
devID
,
mem
);
InitTensor2D
(
&
wv
,
d
,
dv
,
X_FLOAT16
,
devID
,
mem
);
InitTensor2D
(
&
wv
,
d
,
dv
,
X_FLOAT16
,
devID
,
mem
);
InitTensor2D
(
&
wa
,
d
,
d
,
X_FLOAT16
,
devID
,
mem
);
InitTensor2D
(
&
wa
,
d
,
d
,
X_FLOAT16
,
devID
,
mem
);
InitTensor2D
(
&
wbig
,
d
,
3
*
d
,
X_FLOAT16
,
devID
,
mem
);
float
scale
=
1.0
F
;
float
scale
=
1.0
F
;
float
finfoutk
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
dk
));
float
finfoutk
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
dk
));
float
finfoutv
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
dv
));
float
finfoutv
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
dv
));
float
finfouta
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
d
));
float
finfouta
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
d
));
float
finfoutbig
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
3
*
d
));
wk
.
SetDataRand
(
-
finfoutk
,
finfoutk
);
wk
.
SetDataRand
(
-
finfoutk
,
finfoutk
);
wq
.
SetDataRand
(
-
finfoutk
,
finfoutk
);
wq
.
SetDataRand
(
-
finfoutk
,
finfoutk
);
wv
.
SetDataRand
(
-
finfoutv
,
finfoutv
);
wv
.
SetDataRand
(
-
finfoutv
,
finfoutv
);
wa
.
SetDataRand
(
-
finfouta
,
finfouta
);
wa
.
SetDataRand
(
-
finfouta
,
finfouta
);
//_SetDataRand(&wk, -finfoutk, finfoutk);
wbig
.
SetDataRand
(
-
finfoutbig
,
finfoutbig
);
//_SetDataRand(&wq, -finfoutk, finfoutk);
//_SetDataRand(&wv, -finfoutv, finfoutv);
//_SetDataRand(&wa, -finfouta, finfouta);
}
}
/*
/*
...
@@ -103,42 +138,138 @@ make the network
...
@@ -103,42 +138,138 @@ make the network
>> isTraining - indicates whether the model is used for training
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
<< return - multi-attention result
*/
*/
//XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
//{
// XTensor k2;
// XTensor q2;
// XTensor v2;
//
// if (selfatt){
//
// XTensor con;
// XList split;
//
// con = MMul(k, wbig);
//
// int d1 = con.GetDim(0);
// int d2 = con.GetDim(1);
// int d3 = con.GetDim(2) / 3;
//
// InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
//
// split.Add(&q2);
// split.Add(&k2);
// split.Add(&v2);
//
// Split(con, split, 2, 3);
// }
//
// else{
// /* linear transofmration before self-attention */
// k2 = MMul(k, wk);
// q2 = MMul(q, wq);
// v2 = MMul(v, wv);
// }
//
// XTensor kheads;
// XTensor qheads;
// XTensor vheads;
//
// /* multi head */
// kheads = Split(k2, k2.order - 1, nhead);
// qheads = Split(q2, q2.order - 1, nhead);
// vheads = Split(v2, v2.order - 1, nhead);
//
// XTensor att;
// XTensor dot;
// XTensor scalar;
//
// /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
// dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
//
// if(isMasked)
// dot = dot + mask;
//
// dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
//
// scalar = Softmax(dot, -1);
//
// if(isTraining && dropoutP > 0)
// scalar = Dropout(scalar, dropoutP);
//
// att = BMMul(scalar, vheads);
//
// /* concatenate the heads */
// return MMul(Merge(att, att.order - 1), wa);
//}
XTensor
T2TAttention
::
Make
(
XTensor
&
k
,
XTensor
&
q
,
XTensor
&
v
,
XTensor
&
mask
,
bool
isTraining
)
XTensor
T2TAttention
::
Make
(
XTensor
&
k
,
XTensor
&
q
,
XTensor
&
v
,
XTensor
&
mask
,
bool
isTraining
,
bool
selfatt
)
{
{
XTensor
halfK
;
XTensor
halfK2
;
XTensor
halfK2
;
XTensor
halfQ2
;
XTensor
halfQ2
;
XTensor
halfV2
;
XTensor
halfV2
;
XTensor
halfK
;
halfK
=
ConvertDataType
(
k
,
X_FLOAT16
);
halfK
=
ConvertDataType
(
k
,
X_FLOAT16
);
halfK2
=
MMul
(
halfK
,
wk
);
if
(
selfatt
)
{
halfQ2
=
MMul
(
halfK
,
wq
);
halfV2
=
MMul
(
halfK
,
wv
);
XTensor
halfCon
;
XList
halfSplit
;
halfCon
=
MMul
(
halfK
,
wbig
);
int
d1
=
halfCon
.
GetDim
(
0
);
int
d2
=
halfCon
.
GetDim
(
1
);
int
d3
=
halfCon
.
GetDim
(
2
)
/
3
;
InitTensor3D
(
&
halfK2
,
d1
,
d2
,
d3
,
X_FLOAT16
,
devID
,
mem
);
InitTensor3D
(
&
halfQ2
,
d1
,
d2
,
d3
,
X_FLOAT16
,
devID
,
mem
);
InitTensor3D
(
&
halfV2
,
d1
,
d2
,
d3
,
X_FLOAT16
,
devID
,
mem
);
halfSplit
.
Add
(
&
halfQ2
);
halfSplit
.
Add
(
&
halfK2
);
halfSplit
.
Add
(
&
halfV2
);
Split
(
halfCon
,
halfSplit
,
2
,
3
);
}
else
{
XTensor
halfQ
;
XTensor
halfV
;
halfQ
=
ConvertDataType
(
q
,
X_FLOAT16
);
halfV
=
ConvertDataType
(
v
,
X_FLOAT16
);
/* linear transofmration before self-attention */
halfK2
=
MMul
(
halfK
,
wk
);
halfQ2
=
MMul
(
halfQ
,
wq
);
halfV2
=
MMul
(
halfV
,
wv
);
}
XTensor
halfKheads
;
XTensor
halfKheads
;
XTensor
halfQheads
;
XTensor
halfQheads
;
XTensor
halfVheads
;
XTensor
halfVheads
;
/* multi head */
halfKheads
=
Split
(
halfK2
,
halfK2
.
order
-
1
,
nhead
);
halfKheads
=
Split
(
halfK2
,
halfK2
.
order
-
1
,
nhead
);
halfQheads
=
Split
(
halfQ2
,
halfQ2
.
order
-
1
,
nhead
);
halfQheads
=
Split
(
halfQ2
,
halfQ2
.
order
-
1
,
nhead
);
halfVheads
=
Split
(
halfV2
,
halfV2
.
order
-
1
,
nhead
);
halfVheads
=
Split
(
halfV2
,
halfV2
.
order
-
1
,
nhead
);
XTensor
half
Mask
;
XTensor
half
Att
;
XTensor
halfDot
;
XTensor
halfDot
;
XTensor
halfScalar
;
XTensor
halfScalar
;
XTensor
halfAtt
;
halfMask
=
ConvertDataType
(
mask
,
X_FLOAT16
);
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
halfDot
=
BMMul
(
halfQheads
,
X_NOTRANS
,
halfKheads
,
X_TRANS
);
halfDot
=
BMMul
(
halfQheads
,
X_NOTRANS
,
halfKheads
,
X_TRANS
);
//XTensor halfMask(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
if
(
isMasked
)
{
if
(
isMasked
)
{
XTensor
halfMask
;
halfMask
=
ConvertDataType
(
mask
,
X_FLOAT16
);
halfDot
=
Sum
(
halfDot
,
halfMask
);
halfDot
=
Sum
(
halfDot
,
halfMask
);
}
}
halfDot
=
Linear
(
halfDot
,
1.0
F
/
(
float
)
sqrt
((
float
)
dk
/
nhead
));
halfDot
=
Linear
(
halfDot
,
1.0
F
/
(
float
)
sqrt
((
float
)
dk
/
nhead
));
...
@@ -147,13 +278,10 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
...
@@ -147,13 +278,10 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
halfScalar
=
Dropout
(
halfScalar
,
dropoutP
);
halfScalar
=
Dropout
(
halfScalar
,
dropoutP
);
/*att = BMMul(scalar, vheads);*/
halfAtt
=
BMMul
(
halfScalar
,
halfVheads
);
halfAtt
=
BMMul
(
halfScalar
,
halfVheads
);
/* concatenate the heads */
/* concatenate the heads */
return
ConvertDataType
(
MMul
(
Merge
(
halfAtt
,
halfAtt
.
order
-
1
),
wa
),
X_FLOAT
);
return
ConvertDataType
(
MMul
(
Merge
(
halfAtt
,
halfAtt
.
order
-
1
),
wa
),
X_FLOAT
);
}
}
}
}
source/sample/transformer/T2TAttention.h
查看文件 @
44bf9fa6
...
@@ -59,7 +59,8 @@ public:
...
@@ -59,7 +59,8 @@ public:
/* transformation after dot-product attention */
/* transformation after dot-product attention */
XTensor
wa
;
XTensor
wa
;
XTensor
wbig
;
/* size of transformed Q and K */
/* size of transformed Q and K */
int
dk
;
int
dk
;
...
@@ -95,7 +96,7 @@ public:
...
@@ -95,7 +96,7 @@ public:
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
);
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
);
/* make the network */
/* make the network */
XTensor
Make
(
XTensor
&
k
,
XTensor
&
q
,
XTensor
&
v
,
XTensor
&
mask
,
bool
isTraining
);
XTensor
Make
(
XTensor
&
k
,
XTensor
&
q
,
XTensor
&
v
,
XTensor
&
mask
,
bool
isTraining
,
bool
selfatt
);
};
};
}
}
...
...
source/sample/transformer/T2TDecoder.cpp
查看文件 @
44bf9fa6
...
@@ -21,6 +21,8 @@
...
@@ -21,6 +21,8 @@
#include <math.h>
#include <math.h>
#include "T2TDecoder.h"
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/core/CHeader.h"
namespace
transformer
namespace
transformer
...
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
...
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
bool
myIsMasked
,
int
myIgnored
,
bool
myIsMasked
,
int
myIgnored
,
int
myDevID
,
XMem
*
myMem
)
int
myDevID
,
XMem
*
myMem
)
{
{
AttEncoder
::
InitModel
(
argc
,
argv
,
myIsMasked
,
myIgnored
,
myDevID
,
myMem
);
//
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
devID
=
myDevID
;
mem
=
myMem
;
ignored
=
myIgnored
;
LoadParamInt
(
argc
,
argv
,
"nlayer"
,
&
nlayer
,
6
);
LoadParamInt
(
argc
,
argv
,
"hsize"
,
&
hSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"esize"
,
&
eSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"vsizetgt"
,
&
vSize
,
-
1
);
LoadParamFloat
(
argc
,
argv
,
"dropout"
,
&
dropoutP
,
0
);
CheckNTErrors
(
nlayer
>=
1
,
"We have one encoding layer at least!"
);
CheckNTErrors
(
vSize
>
1
,
"set vocabulary size by
\"
-vsize
\"
"
);
/* embedding model */
embedder
.
InitModel
(
argc
,
argv
,
devID
,
mem
,
false
);
attentions
=
new
T2TAttention
[
nlayer
];
fnns
=
new
T2TFNN
[
nlayer
];
attLayerNorms
=
new
T2TLN
[
nlayer
];
fnnLayerNorms
=
new
T2TLN
[
nlayer
];
attentionsEnde
=
new
T2TAttention
[
nlayer
];
attentionsEnde
=
new
T2TAttention
[
nlayer
];
attEndeLayerNorms
=
new
T2TLN
[
nlayer
];
attEndeLayerNorms
=
new
T2TLN
[
nlayer
];
/* initialize the stacked layers */
/* initialize the stacked layers */
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
){
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
)
{
attentionsEnde
[
i
].
InitModel
(
argc
,
argv
,
myIsMasked
,
myIgnored
,
myDevID
,
myMem
);
attentions
[
i
].
InitModel
(
argc
,
argv
,
myIsMasked
,
myIgnored
,
myDevID
,
myMem
);
fnns
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
attLayerNorms
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
fnnLayerNorms
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
attentionsEnde
[
i
].
InitModel
(
argc
,
argv
,
true
,
myIgnored
,
myDevID
,
myMem
);
attEndeLayerNorms
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
attEndeLayerNorms
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
}
}
}
}
/*
/*
...
@@ -93,7 +122,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
...
@@ -93,7 +122,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/******************/
/******************/
/* self attention */
/* self attention */
att
=
attentions
[
i
].
Make
(
x
,
x
,
x
,
mask
,
isTraining
);
att
=
attentions
[
i
].
Make
(
x
,
x
,
x
,
mask
,
isTraining
,
true
);
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
...
@@ -107,7 +136,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
...
@@ -107,7 +136,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/*****************************/
/*****************************/
/* encoder-decoder attention */
/* encoder-decoder attention */
ende
=
attentionsEnde
[
i
].
Make
(
outputEnc
,
x
,
outputEnc
,
maskEncDec
,
isTraining
);
ende
=
attentionsEnde
[
i
].
Make
(
outputEnc
,
x
,
outputEnc
,
maskEncDec
,
isTraining
,
false
);
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
...
...
source/sample/transformer/T2TDecoder.h
查看文件 @
44bf9fa6
...
@@ -27,9 +27,56 @@
...
@@ -27,9 +27,56 @@
namespace
transformer
namespace
transformer
{
{
class
AttDecoder
:
public
AttEncoder
class
AttDecoder
{
{
public
:
public
:
/* device id */
int
devID
;
/* memory pool */
XMem
*
mem
;
/* layer number */
int
nlayer
;
/* hidden layer size of the FNN layer */
int
hSize
;
/* embedding size */
int
eSize
;
/* vocabulary size */
int
vSize
;
/* dropout probability */
DTYPE
dropoutP
;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
* special design for the attention model. */
int
ignored
;
/* embedding of word at each position */
T2TEmbedder
embedder
;
/* FNN model of each layer */
T2TFNN
*
fnns
;
/* attention model of each layer */
T2TAttention
*
attentions
;
/* layer normalization for fnn */
T2TLN
*
fnnLayerNorms
;
/* layer normalization for attention */
T2TLN
*
attLayerNorms
;
/* input tensor of the encoder */
XTensor
*
input
;
/* output tensor of the encoder */
XTensor
*
output
;
/* encoder-decoder attention model of each layer */
/* encoder-decoder attention model of each layer */
T2TAttention
*
attentionsEnde
;
T2TAttention
*
attentionsEnde
;
...
@@ -53,4 +100,4 @@ public:
...
@@ -53,4 +100,4 @@ public:
}
}
#endif
#endif
\ No newline at end of file
source/sample/transformer/T2TEmbedding.cpp
查看文件 @
44bf9fa6
...
@@ -48,12 +48,18 @@ initialize the model
...
@@ -48,12 +48,18 @@ initialize the model
>> myDevID - device id
>> myDevID - device id
>> myMem - the memory pool
>> myMem - the memory pool
*/
*/
void
T2TEmbedder
::
InitModel
(
int
argc
,
char
**
argv
,
int
myDevID
,
XMem
*
myMem
)
void
T2TEmbedder
::
InitModel
(
int
argc
,
char
**
argv
,
int
myDevID
,
XMem
*
myMem
,
bool
isEnc
)
{
{
devID
=
myDevID
;
devID
=
myDevID
;
mem
=
myMem
;
mem
=
myMem
;
LoadParamInt
(
argc
,
argv
,
"vsize"
,
&
vSize
,
-
1
);
if
(
isEnc
){
LoadParamInt
(
argc
,
argv
,
"vsize"
,
&
vSize
,
-
1
);
}
else
{
LoadParamInt
(
argc
,
argv
,
"vsizetgt"
,
&
vSize
,
-
1
);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt
(
argc
,
argv
,
"maxlen"
,
&
maxLength
,
512
);
LoadParamInt
(
argc
,
argv
,
"maxlen"
,
&
maxLength
,
512
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
eSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
eSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
d
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
d
,
DEFAULT_EMBEDDING_SIZE
);
...
@@ -110,7 +116,6 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
...
@@ -110,7 +116,6 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
delete
[]
data
;
delete
[]
data
;
}
}
/*
/*
make the network
make the network
*/
*/
...
...
source/sample/transformer/T2TEmbedding.h
查看文件 @
44bf9fa6
...
@@ -71,7 +71,7 @@ public:
...
@@ -71,7 +71,7 @@ public:
~
T2TEmbedder
();
~
T2TEmbedder
();
/* initialize the model */
/* initialize the model */
void
InitModel
(
int
argc
,
char
**
argv
,
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
);
void
InitModel
(
int
argc
,
char
**
argv
,
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
,
bool
isEnc
=
true
);
/* make positional embeddings */
/* make positional embeddings */
void
MakePosEmbedding
(
int
eSize
,
int
d
,
int
length
);
void
MakePosEmbedding
(
int
eSize
,
int
d
,
int
length
);
...
...
source/sample/transformer/T2TEncoder.cpp
查看文件 @
44bf9fa6
...
@@ -116,8 +116,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
...
@@ -116,8 +116,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
XTensor
res
;
XTensor
res
;
/* self attention */
/* self attention */
att
=
attentions
[
i
].
Make
(
x
,
x
,
x
,
mask
,
isTraining
);
att
=
attentions
[
i
].
Make
(
x
,
x
,
x
,
mask
,
isTraining
,
true
);
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
att
=
Dropout
(
att
,
dropoutP
);
att
=
Dropout
(
att
,
dropoutP
);
...
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
...
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}
}
}
}
source/sample/transformer/T2TModel.cpp
查看文件 @
44bf9fa6
...
@@ -236,10 +236,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
...
@@ -236,10 +236,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor
*
maskEncDecTMPDec
=
NewTensorBuf
(
maskEncDecTMPEnc
,
paddingEnc
.
devID
,
paddingEnc
.
mem
);
XTensor
*
maskEncDecTMPDec
=
NewTensorBuf
(
maskEncDecTMPEnc
,
paddingEnc
.
devID
,
paddingEnc
.
mem
);
_Unsqueeze
(
&
paddingEnc
,
maskEncDecTMPEnc
,
paddingEnc
.
order
-
1
,
paddingDec
.
GetDim
(
-
1
));
_Unsqueeze
(
&
paddingEnc
,
maskEncDecTMPEnc
,
paddingEnc
.
order
-
1
,
paddingDec
.
GetDim
(
-
1
));
_Unsqueeze
(
&
paddingDec
,
maskEncDecTMPDec
,
paddingEnc
.
order
,
paddingEnc
.
GetDim
(
-
1
));
//
_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
_Multiply
(
maskEncDecTMPDec
,
maskEncDecTMPEnc
,
maskEncDecTMPDec
);
//
_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe
(
maskEncDecTMP
De
c
,
1e9
F
,
-
1e9
F
);
_ScaleAndShiftMe
(
maskEncDecTMP
En
c
,
1e9
F
,
-
1e9
F
);
_Unsqueeze
(
maskEncDecTMP
De
c
,
&
maskEncDec
,
0
,
dims
[
0
]);
_Unsqueeze
(
maskEncDecTMP
En
c
,
&
maskEncDec
,
0
,
dims
[
0
]);
DelTensorBuf
(
maskEncDecTMPDec
);
DelTensorBuf
(
maskEncDecTMPDec
);
DelTensorBuf
(
maskEncDecTMPEnc
);
DelTensorBuf
(
maskEncDecTMPEnc
);
...
@@ -274,7 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
...
@@ -274,7 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum
(
&
maskEnc
,
padding3
,
&
maskEnc
);
_Sum
(
&
maskEnc
,
padding3
,
&
maskEnc
);
encoding
=
MakeEncoder
(
inputEnc
,
maskEnc
,
isTraining
);
encoding
=
MakeEncoder
(
inputEnc
,
maskEnc
,
isTraining
);
//encoding.Dump(stderr, "encoding",10);
decoding
=
MakeDecoder
(
inputDec
,
encoding
,
maskDec
,
maskEncDec
,
isTraining
);
decoding
=
MakeDecoder
(
inputDec
,
encoding
,
maskDec
,
maskEncDec
,
isTraining
);
//decoding.Dump(stderr, "decoding", 10);
outputLayer
->
Make
(
decoding
,
output
);
outputLayer
->
Make
(
decoding
,
output
);
delete
[]
dims
;
delete
[]
dims
;
...
@@ -298,9 +301,10 @@ void T2TModel::GetParams(XList &list)
...
@@ -298,9 +301,10 @@ void T2TModel::GetParams(XList &list)
list
.
Add
(
&
encoder
->
fnns
[
i
].
b1
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
b1
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
w2
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
w2
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
b2
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
b2
);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wk
);
//list.Add(&encoder->attentions[i].wk);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wq
);
//list.Add(&encoder->attentions[i].wq);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wv
);
//list.Add(&encoder->attentions[i].wv);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wbig
);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wa
);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wa
);
list
.
Add
(
&
encoder
->
fnnLayerNorms
[
i
].
w
);
list
.
Add
(
&
encoder
->
fnnLayerNorms
[
i
].
w
);
list
.
Add
(
&
encoder
->
fnnLayerNorms
[
i
].
b
);
list
.
Add
(
&
encoder
->
fnnLayerNorms
[
i
].
b
);
...
@@ -322,9 +326,10 @@ void T2TModel::GetParams(XList &list)
...
@@ -322,9 +326,10 @@ void T2TModel::GetParams(XList &list)
list
.
Add
(
&
decoder
->
attentionsEnde
[
i
].
wa
);
list
.
Add
(
&
decoder
->
attentionsEnde
[
i
].
wa
);
list
.
Add
(
&
decoder
->
attEndeLayerNorms
[
i
].
w
);
list
.
Add
(
&
decoder
->
attEndeLayerNorms
[
i
].
w
);
list
.
Add
(
&
decoder
->
attEndeLayerNorms
[
i
].
b
);
list
.
Add
(
&
decoder
->
attEndeLayerNorms
[
i
].
b
);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wk
);
//list.Add(&decoder->attentions[i].wk);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wq
);
//list.Add(&decoder->attentions[i].wq);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wv
);
//list.Add(&decoder->attentions[i].wv);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wbig
);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wa
);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wa
);
list
.
Add
(
&
decoder
->
fnnLayerNorms
[
i
].
w
);
list
.
Add
(
&
decoder
->
fnnLayerNorms
[
i
].
w
);
list
.
Add
(
&
decoder
->
fnnLayerNorms
[
i
].
b
);
list
.
Add
(
&
decoder
->
fnnLayerNorms
[
i
].
b
);
...
...
source/sample/transformer/T2TOutput.cpp
查看文件 @
44bf9fa6
...
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
...
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
float
minmax
=
0
;
float
minmax
=
0
;
LoadParamInt
(
argc
,
argv
,
"vsize"
,
&
vSize
,
-
1
);
LoadParamInt
(
argc
,
argv
,
"vsize
tgt
"
,
&
vSize
,
-
1
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
inSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
inSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
hSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
hSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamFloat
(
argc
,
argv
,
"outputminmax"
,
&
minmax
,
0.08
F
);
LoadParamFloat
(
argc
,
argv
,
"outputminmax"
,
&
minmax
,
0.08
F
);
...
...
source/sample/transformer/T2TTrainer.cpp
查看文件 @
44bf9fa6
差异被折叠。
点击展开。
source/sample/transformer/T2TTrainer.h
查看文件 @
44bf9fa6
...
@@ -33,6 +33,25 @@ using namespace nts;
...
@@ -33,6 +33,25 @@ using namespace nts;
namespace
transformer
namespace
transformer
{
{
/* node to keep batch information */
struct
BatchNode
{
/* begining position */
int
beg
;
/* end position */
int
end
;
/* maximum word number on the encoder side */
int
maxEnc
;
/* maximum word number on the decoder side */
int
maxDec
;
/* a key for sorting */
int
key
;
};
/* trainer of the T2T model */
/* trainer of the T2T model */
class
T2TTrainer
class
T2TTrainer
{
{
...
@@ -49,9 +68,15 @@ public:
...
@@ -49,9 +68,15 @@ public:
/* another buffer */
/* another buffer */
int
*
buf2
;
int
*
buf2
;
/* batch buf */
BatchNode
*
bufBatch
;
/* buffer size */
/* buffer size */
int
bufSize
;
int
bufSize
;
/* size of batch buffer */
int
bufBatchSize
;
/* length of each sequence */
/* length of each sequence */
int
*
seqLen
;
int
*
seqLen
;
...
@@ -66,6 +91,9 @@ public:
...
@@ -66,6 +91,9 @@ public:
/* offset for next sequence in the buffer */
/* offset for next sequence in the buffer */
int
nextSeq
;
int
nextSeq
;
/* offset for next batch */
int
nextBatch
;
/* indicates whether the sequence is sorted by length */
/* indicates whether the sequence is sorted by length */
bool
isLenSorted
;
bool
isLenSorted
;
...
@@ -142,8 +170,11 @@ public:
...
@@ -142,8 +170,11 @@ public:
/* counterpart of "isSmallBatch" */
/* counterpart of "isSmallBatch" */
bool
isBigBatch
;
bool
isBigBatch
;
/* indicates whether we use small memory footprint for backward process */
/* randomize batches */
bool
isSmallFootprint
;
bool
isRandomBatch
;
/* indicates whether we intend to debug the net */
bool
isDebugged
;
public
:
public
:
/* constructor */
/* constructor */
...
...
source/sample/transformer/Transformer.cpp
查看文件 @
44bf9fa6
...
@@ -60,12 +60,13 @@ int TransformerMain(int argc, const char ** argv)
...
@@ -60,12 +60,13 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString
(
argc
,
args
,
"output"
,
outputFN
,
""
);
LoadParamString
(
argc
,
args
,
"output"
,
outputFN
,
""
);
srand
((
unsigned
int
)
time
(
NULL
));
srand
((
unsigned
int
)
time
(
NULL
));
T2TTrainer
trainer
;
T2TTrainer
trainer
;
trainer
.
Init
(
argc
,
args
);
trainer
.
Init
(
argc
,
args
);
T2TModel
model
;
T2TModel
model
;
model
.
InitModel
(
argc
,
args
);
model
.
InitModel
(
argc
,
args
);
/* learn model parameters */
/* learn model parameters */
if
(
strcmp
(
trainFN
,
""
))
if
(
strcmp
(
trainFN
,
""
))
trainer
.
Train
(
trainFN
,
testFN
,
strcmp
(
modelFN
,
""
)
?
modelFN
:
"checkpoint.model"
,
&
model
);
trainer
.
Train
(
trainFN
,
testFN
,
strcmp
(
modelFN
,
""
)
?
modelFN
:
"checkpoint.model"
,
&
model
);
...
...
source/tensor/XDevice.cpp
查看文件 @
44bf9fa6
...
@@ -24,6 +24,7 @@
...
@@ -24,6 +24,7 @@
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#include <string.h>
#include <string.h>
#include <time.h>
#include "XDevice.h"
#include "XDevice.h"
#include "XGlobal.h"
#include "XGlobal.h"
#include "XThread.h"
#include "XThread.h"
...
@@ -59,6 +60,7 @@ XDevice::~XDevice()
...
@@ -59,6 +60,7 @@ XDevice::~XDevice()
cublasDestroy
(
cublasHandle
);
cublasDestroy
(
cublasHandle
);
if
(
stream
!=
NULL
)
if
(
stream
!=
NULL
)
delete
stream
;
delete
stream
;
curandDestroyGenerator
(
gen
);
#endif
#endif
}
}
...
@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID)
...
@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID)
Clear
();
Clear
();
devID
=
myDevID
;
devID
=
myDevID
;
seed
=
rand
();
/* CPU information */
/* CPU information */
if
(
devID
<
0
){
if
(
devID
<
0
){
...
@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID)
...
@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID)
cudaDeviceProp
prop
;
cudaDeviceProp
prop
;
cudaSetDevice
(
myDevID
);
cudaSetDevice
(
myDevID
);
curandCreateGenerator
(
&
gen
,
CURAND_RNG_PSEUDO_DEFAULT
);
curandSetPseudoRandomGeneratorSeed
(
gen
,
seed
);
if
(
cudaGetDeviceProperties
(
&
prop
,
devID
)
!=
cudaSuccess
){
if
(
cudaGetDeviceProperties
(
&
prop
,
devID
)
!=
cudaSuccess
){
XPRINT1
(
0
,
stderr
,
"cannot get GPU(%d) information."
,
devID
);
XPRINT1
(
0
,
stderr
,
"cannot get GPU(%d) information."
,
devID
);
exit
(
1
);
exit
(
1
);
...
@@ -270,6 +277,8 @@ XDevManager::~XDevManager()
...
@@ -270,6 +277,8 @@ XDevManager::~XDevManager()
/* initialize it and get the CPU and GPU information */
/* initialize it and get the CPU and GPU information */
void
XDevManager
::
Init
()
void
XDevManager
::
Init
()
{
{
srand
((
unsigned
int
)
time
(
NULL
));
Clear
();
Clear
();
/* CPUs (we actually do not care about how many CPUs are using) */
/* CPUs (we actually do not care about how many CPUs are using) */
...
...
source/tensor/XDevice.h
查看文件 @
44bf9fa6
...
@@ -99,6 +99,9 @@ public:
...
@@ -99,6 +99,9 @@ public:
/* default stream for the device */
/* default stream for the device */
XStream
*
stream
;
XStream
*
stream
;
/* seed for random number generation */
int
seed
;
#ifdef USE_CUDA
#ifdef USE_CUDA
/* mutex for handle (GPU cublas) */
/* mutex for handle (GPU cublas) */
...
@@ -109,6 +112,9 @@ public:
...
@@ -109,6 +112,9 @@ public:
/* specify if the handle is initialized */
/* specify if the handle is initialized */
bool
isHandleReady
;
bool
isHandleReady
;
/* generater of random numbers */
curandGenerator_t
gen
;
#endif
#endif
...
...
source/tensor/XMem.cpp
查看文件 @
44bf9fa6
...
@@ -1461,6 +1461,23 @@ void XMem::CreateBLASHandle()
...
@@ -1461,6 +1461,23 @@ void XMem::CreateBLASHandle()
#endif
#endif
}
}
/* show profile of the memory pool */
void
XMem
::
ShowMemUsage
(
FILE
*
file
)
{
MTYPE
used
=
0
;
MTYPE
total
=
0
;
for
(
int
i
=
0
;
i
<
blockNum
;
i
++
){
if
(
blocks
[
i
].
mem
!=
NULL
){
used
+=
blocks
[
i
].
used
;
total
+=
blocks
[
i
].
size
;
}
}
fprintf
(
file
,
"mem:%.1fMB used:%.1fMB usage:%.3f
\n
"
,
(
DTYPE
)
used
/
MILLION
,
(
DTYPE
)
total
/
MILLION
,
(
DTYPE
)
used
/
total
);
}
#ifdef USE_CUDA
#ifdef USE_CUDA
/* get the handle of cublas */
/* get the handle of cublas */
...
...
source/tensor/XMem.h
查看文件 @
44bf9fa6
...
@@ -24,6 +24,7 @@
...
@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#ifndef __XMEM_H__
#define __XMEM_H__
#define __XMEM_H__
#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#ifdef CUDA_BLAS
#ifdef CUDA_BLAS
...
@@ -402,6 +403,9 @@ public:
...
@@ -402,6 +403,9 @@ public:
/* create a new cublas handle */
/* create a new cublas handle */
void
CreateBLASHandle
();
void
CreateBLASHandle
();
/* show profile of the memory pool */
void
ShowMemUsage
(
FILE
*
file
);
#ifdef USE_CUDA
#ifdef USE_CUDA
/* get the handle of cublas */
/* get the handle of cublas */
cublasHandle_t
*
GetCublasHandle
();
cublasHandle_t
*
GetCublasHandle
();
...
...
source/tensor/XName.cpp
查看文件 @
44bf9fa6
...
@@ -67,6 +67,8 @@ const char * GetOPName(int type)
...
@@ -67,6 +67,8 @@ const char * GetOPName(int type)
return
"M_MULTIPLY"
;
return
"M_MULTIPLY"
;
else
if
(
type
==
MATH_MULTIPLYDIM
)
else
if
(
type
==
MATH_MULTIPLYDIM
)
return
"M_MULTIPLYDIM"
;
return
"M_MULTIPLYDIM"
;
else
if
(
type
==
MATH_MULTIPLYBROADCAST
)
return
"M_MULTIPLYBROADCAST"
;
else
if
(
type
==
MATH_NEGATE
)
else
if
(
type
==
MATH_NEGATE
)
return
"M_NEGATE"
;
return
"M_NEGATE"
;
else
if
(
type
==
MATH_NORMALIZE
)
else
if
(
type
==
MATH_NORMALIZE
)
...
@@ -85,6 +87,8 @@ const char * GetOPName(int type)
...
@@ -85,6 +87,8 @@ const char * GetOPName(int type)
return
"M_SUM"
;
return
"M_SUM"
;
else
if
(
type
==
MATH_SUMDIM
)
else
if
(
type
==
MATH_SUMDIM
)
return
"M_SUMDIM"
;
return
"M_SUMDIM"
;
else
if
(
type
==
MATH_SUMBROADCAST
)
return
"M_SUMBROADCAST"
;
else
if
(
type
==
REDUCE_REDUCEMAX
)
else
if
(
type
==
REDUCE_REDUCEMAX
)
return
"R_REDUCEMAX"
;
return
"R_REDUCEMAX"
;
else
if
(
type
==
REDUCE_REDUCEMEAN
)
else
if
(
type
==
REDUCE_REDUCEMEAN
)
...
...
source/tensor/XName.h
查看文件 @
44bf9fa6
...
@@ -52,7 +52,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
...
@@ -52,7 +52,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_MULTIPLYBROADCAST MATH_MULTIPLYDIM + 1
#define MATH_NEGATE MATH_MULTIPLYBROADCAST + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
...
@@ -61,8 +62,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
...
@@ -61,8 +62,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define MATH_SUMBROADCAST MATH_SUMDIM + 1
#define REDUCE MATH_SUM
DIM
+ 1
#define REDUCE MATH_SUM
BROADCAST
+ 1
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
...
...
source/tensor/XTensor.cpp
查看文件 @
44bf9fa6
...
@@ -60,6 +60,7 @@
...
@@ -60,6 +60,7 @@
#include "core/utilities/FlushToMem.cuh"
#include "core/utilities/FlushToMem.cuh"
#include "core/utilities/SetAscendingOrder.cuh"
#include "core/utilities/SetAscendingOrder.cuh"
#endif
#endif
/* the nts (NiuTrans.Tensor) namespace */
/* the nts (NiuTrans.Tensor) namespace */
...
@@ -690,9 +691,6 @@ set the tensor items by a uniform distribution in range [lower, upper]
...
@@ -690,9 +691,6 @@ set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
>> lower - lower value of the range
>> upper - upper value of the range
>> upper - upper value of the range
*/
*/
void
XTensor
::
SetDataRand
(
DTYPE
lower
,
DTYPE
upper
)
void
XTensor
::
SetDataRand
(
DTYPE
lower
,
DTYPE
upper
)
{
{
// TODO: cuda code!!!!!!!
// TODO: cuda code!!!!!!!
...
@@ -703,7 +701,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
...
@@ -703,7 +701,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
// srand((unsigned)time(0));
// srand((unsigned)time(0));
DTYPE
variance
=
upper
-
lower
;
DTYPE
variance
=
upper
-
lower
;
void
*
d
=
NULL
;
void
*
d
=
NULL
;
if
(
dataType
==
X_FLOAT
)
{
if
(
dataType
==
X_FLOAT
)
{
d
=
new
float
[
unitNum
];
d
=
new
float
[
unitNum
];
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
{
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
{
...
@@ -715,7 +712,7 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
...
@@ -715,7 +712,7 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
d
=
new
double
[
unitNum
];
d
=
new
double
[
unitNum
];
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
{
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
{
*
((
double
*
)
d
+
i
)
=
lower
+
variance
*
rand
()
/
RAND_MAX
;
*
((
double
*
)
d
+
i
)
=
lower
+
variance
*
rand
()
/
RAND_MAX
;
}
}
}
}
else
if
(
dataType
==
X_FLOAT16
)
{
else
if
(
dataType
==
X_FLOAT16
)
{
unsigned
short
random
;
unsigned
short
random
;
...
@@ -1700,7 +1697,6 @@ void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, cons
...
@@ -1700,7 +1697,6 @@ void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, cons
{
{
ShowNTErrors
(
"TO DO!"
);
ShowNTErrors
(
"TO DO!"
);
}
}
}
}
/*
/*
...
...
source/tensor/core/arithmetic/Div.cpp
查看文件 @
44bf9fa6
...
@@ -46,8 +46,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
...
@@ -46,8 +46,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
"Unmatched tensors in multiplication!"
);
"Unmatched tensors in multiplication!"
);
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
"Unmatched tensors!"
);
"Unmatched tensors!"
);
CheckNTErrors
(
a
->
dataType
==
b
->
dataType
&&
a
->
dataType
==
c
->
dataType
,
"Unmatched tensors in addition!"
);
#ifdef USE_CUDA
#ifdef USE_CUDA
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
...
...
source/tensor/core/arithmetic/DivDim.cu
查看文件 @
44bf9fa6
差异被折叠。
点击展开。
source/tensor/core/arithmetic/MatrixMul.cpp
查看文件 @
44bf9fa6
...
@@ -22,12 +22,10 @@
...
@@ -22,12 +22,10 @@
#include "../../XTensor.h"
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMul.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
#include "XTensorBLAS.h"
#include "MatrixMulBatched.h"
#include "MatrixMulBatched.h"
#include "timer.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
@@ -276,7 +274,8 @@ matrix multiplication with no transposition c = a * b * alpha
...
@@ -276,7 +274,8 @@ matrix multiplication with no transposition c = a * b * alpha
>> parallelRunner - parallel processing module
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
<< return - the result of matrix multiplication
*/
*/
XTensor
MatrixMul
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
alpha
,
XPRunner
*
parallelRunner
)
XTensor
MatrixMul
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
alpha
,
XPRunner
*
parallelRunner
)
{
{
CheckNTErrors
(
a
.
dataType
==
b
.
dataType
,
"Input tensors should have the same data type!"
);
CheckNTErrors
(
a
.
dataType
==
b
.
dataType
,
"Input tensors should have the same data type!"
);
CheckNTErrors
(
a
.
order
>=
2
&&
b
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
CheckNTErrors
(
a
.
order
>=
2
&&
b
.
order
>=
2
,
"Input tensors must have a order >= 2!"
);
...
@@ -317,212 +316,4 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * pa
...
@@ -317,212 +316,4 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * pa
return
c
;
return
c
;
}
}
/*
}
//
namespace
nts
(
NiuTrans
.
Tensor
)
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
\ No newline at end of file
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimentsions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
The result C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether teh matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
                         const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                         DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

    /* row/column counts of the 2-D sub-matrices, honoring the transposition flags
       (dimSizeRDI stores dimensions in reverse order) */
    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    /* shape of c: the leading dimensions of a, then those of b, then an x bm */
    int order = a.order + b.order - 2;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 2; i < a.order; i++)
        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
    for (int i = 2; i < b.order; i++)
        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    /* run the multiplication in half precision: convert the inputs to
       X_FLOAT16 buffers, multiply, then convert the result back */
    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);

    _MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);

    _ConvertDataType(halfC, &c);

    /* release buffers in reverse order of allocation */
    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
/*
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b,
                         DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

    /* row/column counts of the 2-D sub-matrices (no transposition here;
       dimSizeRDI stores dimensions in reverse order) */
    int an = a.dimSizeRDI[1];
    int am = a.dimSizeRDI[0];
    int bn = b.dimSizeRDI[1];
    int bm = b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    /* shape of c: the leading dimensions of a, then those of b, then an x bm */
    int order = a.order + b.order - 2;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 2; i < a.order; i++)
        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
    for (int i = 2; i < b.order; i++)
        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    /* run the multiplication in half precision: convert the inputs to
       X_FLOAT16 buffers, multiply, then convert the result back */
    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);

    _MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);

    _ConvertDataType(halfC, &c);

    /* release buffers in reverse order of allocation */
    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
}
\ No newline at end of file
source/tensor/core/arithmetic/MatrixMul.h
查看文件 @
44bf9fa6
...
@@ -27,7 +27,6 @@
...
@@ -27,7 +27,6 @@
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
#define MMul MatrixMul
#define MMul MatrixMul
#define MMul16 MatrixMulFloat16
/*
/*
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
...
@@ -64,13 +63,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
...
@@ -64,13 +63,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
XTensor
MatrixMul
(
const
XTensor
&
a
,
const
XTensor
&
b
,
XTensor
MatrixMul
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
XTensor
MatrixMulFloat16
(
const
XTensor
&
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
&
b
,
MATRIX_TRANS_TYPE
transposedB
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor
MatrixMulFloat16
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
}
// namespace nts(NiuTrans.Tensor)
}
// namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
#endif // __MATRIXMUL_H__
\ No newline at end of file
source/tensor/core/arithmetic/MatrixMul2D.cpp
查看文件 @
44bf9fa6
...
@@ -82,19 +82,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
...
@@ -82,19 +82,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
_MatrixMULCPU
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
);
_MatrixMULCPU
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
);
else
else
_MatrixMul2DParallel
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
,
parallelRunner
);
_MatrixMul2DParallel
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
,
parallelRunner
);
/*if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}*/
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
}
}
/* a dense matrix multiply a sparse matrix */
/* a dense matrix multiply a sparse matrix */
else
if
(
!
a
->
isSparse
&&
b
->
isSparse
)
{
else
if
(
!
a
->
isSparse
&&
b
->
isSparse
)
{
...
...
source/tensor/core/arithmetic/MatrixMul2D.cu
查看文件 @
44bf9fa6
...
@@ -156,18 +156,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
...
@@ -156,18 +156,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL)
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
cublasSetStream(*handle, stream->stream);
//if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
// _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
// b->data, transposedB, a->dataType, c->data, c->dataType,
// a->dimSize[0], a->dimSize[1],
// b->dimSize[0], b->dimSize[1],
// c->dimSize[0], c->dimSize[1],
// alpha, beta);
//}
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1],
a->dimSize[0], a->dimSize[1],
...
...
source/tensor/core/arithmetic/MatrixMul2DParallel.cpp
查看文件 @
44bf9fa6
...
@@ -63,44 +63,6 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
...
@@ -63,44 +63,6 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* trans(a) * b */
/* trans(a) * b */
else
if
(
transposedA
==
X_TRANS
&&
transposedB
==
X_NOTRANS
)
{
else
if
(
transposedA
==
X_TRANS
&&
transposedB
==
X_NOTRANS
)
{
int
num
=
an
;
int
num
=
an
;
/*if (a->dataType == X_FLOAT16) {
for (int i = 0; i < cn; i++) {
X_FLOAT16 *p3 = (X_FLOAT16*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
X_FLOAT16 r = 0;
X_FLOAT16 * p1 = (X_FLOAT16*)a->data + 0 * am + i;
X_FLOAT16 * p2 = (X_FLOAT16*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}
else {
for (int i = 0; i < cn; i++) {
DTYPE * p3 = (DTYPE*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
DTYPE r = 0;
DTYPE * p1 = (DTYPE*)a->data + 0 * am + i;
DTYPE * p2 = (DTYPE*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}*/
for
(
int
i
=
0
;
i
<
cn
;
i
++
)
{
for
(
int
i
=
0
;
i
<
cn
;
i
++
)
{
DTYPE
*
p3
=
(
DTYPE
*
)
c
->
data
+
i
*
cm
;
DTYPE
*
p3
=
(
DTYPE
*
)
c
->
data
+
i
*
cm
;
for
(
int
j
=
0
;
j
<
cm
;
j
++
)
{
for
(
int
j
=
0
;
j
<
cm
;
j
++
)
{
...
...
source/tensor/core/arithmetic/MatrixMulBatched.cpp
查看文件 @
44bf9fa6
...
@@ -22,7 +22,6 @@
...
@@ -22,7 +22,6 @@
#include "../../XTensor.h"
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMulBatched.h"
#include "MatrixMulBatched.h"
#include "XTensorBLAS.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.h"
...
@@ -388,142 +387,4 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
...
@@ -388,142 +387,4 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
return
c
;
return
c
;
}
}
/*
matrix multiplication of the two tensors (do it on site)
c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
                                const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                                DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

    /* row/column counts of the 2-D sub-matrices, honoring the transposition flags
       (dimSizeRDI stores dimensions in reverse order) */
    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    /* shape of c: the batch dimensions of a, then an x bm */
    int order = a.order;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 0; i < a.order - 2; i++)
        dimSize[sub++] = a.dimSize[i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    /* run the batched multiplication in half precision: convert the inputs to
       X_FLOAT16 buffers, multiply, then convert the result back */
    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);

    _MatrixMulBatched(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);

    _ConvertDataType(halfC, &c);

    /* release buffers in reverse order of allocation */
    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
/*
matrix multiplication of the two tensors (do it on site)
c = a * b * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = ai * bi * alpha + cm * beta
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b,
                                DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

    /* row/column counts of the 2-D sub-matrices (no transposition here;
       dimSizeRDI stores dimensions in reverse order) */
    int an = a.dimSizeRDI[1];
    int am = a.dimSizeRDI[0];
    int bn = b.dimSizeRDI[1];
    int bm = b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    /* shape of c: the batch dimensions of a, then an x bm */
    int order = a.order;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 0; i < a.order - 2; i++)
        dimSize[sub++] = a.dimSize[i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    /* run the batched multiplication in half precision: convert the inputs to
       X_FLOAT16 buffers, multiply, then convert the result back */
    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);

    _MatrixMulBatched(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);

    _ConvertDataType(halfC, &c);

    /* release buffers in reverse order of allocation */
    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
}
// namespace nts(NiuTrans.Tensor)
}
// namespace nts(NiuTrans.Tensor)
source/tensor/core/arithmetic/MatrixMulBatched.h
查看文件 @
44bf9fa6
...
@@ -27,7 +27,6 @@
...
@@ -27,7 +27,6 @@
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
#define BMMul MatrixMulBatched
#define BMMul MatrixMulBatched
#define BMMul16 MatrixMulBatchedFloat16
/*
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
...
@@ -85,12 +84,6 @@ ci = ai * bi * alpha + cm * beta
...
@@ -85,12 +84,6 @@ ci = ai * bi * alpha + cm * beta
XTensor
MatrixMulBatched
(
const
XTensor
&
a
,
const
XTensor
&
b
,
XTensor
MatrixMulBatched
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
XTensor
MatrixMulBatchedFloat16
(
const
XTensor
&
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
&
b
,
MATRIX_TRANS_TYPE
transposedB
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
XTensor
MatrixMulBatchedFloat16
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
XPRunner
*
parallelRunner
=
NULL
);
}
// namespace nts(NiuTrans.Tensor)
}
// namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMULBATCHED_H__
#endif // __MATRIXMULBATCHED_H__
\ No newline at end of file
source/tensor/core/arithmetic/Multiply.cpp
查看文件 @
44bf9fa6
...
@@ -46,8 +46,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
...
@@ -46,8 +46,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors in multiplication!"
);
"Unmatched tensors in multiplication!"
);
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
"Unmatched tensors!"
);
"Unmatched tensors!"
);
CheckNTErrors
(
a
->
dataType
==
b
->
dataType
&&
a
->
dataType
==
c
->
dataType
,
"Unmatched tensors in addition!"
);
#ifdef USE_CUDA
#ifdef USE_CUDA
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
...
...
source/tensor/core/arithmetic/MultiplyDim.cpp
查看文件 @
44bf9fa6
...
@@ -22,9 +22,10 @@
...
@@ -22,9 +22,10 @@
#include "Multiply.h"
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "MultiplyDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../movement/CopyValues.h"
#include "../getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
@@ -136,29 +137,168 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
...
@@ -136,29 +137,168 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
tensor multiplication (return an XTensor structure and make tensor connections)
tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
make a new tensor to keep the result and return it
c = a * b
+ \alpha * c
c = a * b
where the size of b is equal to the n-th dimension of a,
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
<< return - the result tensor by tensor multiplication
*/
*/
XTensor
MultiplyDim
(
const
XTensor
&
a
,
const
XTensor
&
b
,
int
n
,
DTYPE
alpha
)
XTensor
MultiplyDim
(
const
XTensor
&
a
,
const
XTensor
&
b
,
int
n
)
{
{
XTensor
c
(
&
a
);
XTensor
c
(
&
a
);
c
.
SetTMPFlag
();
c
.
SetTMPFlag
();
/* call _Multiply function */
/* call _Multiply function */
_MultiplyDim
(
&
a
,
&
b
,
&
c
,
n
,
alpha
);
_MultiplyDim
(
&
a
,
&
b
,
&
c
,
n
,
0
);
/* tensor connections */
/* tensor connections */
XLink
::
MakeLink
(
&
a
,
&
b
,
&
c
,
MATH_MULTIPLYDIM
);
XLink
::
MakeLink
(
&
a
,
&
b
,
&
c
,
MATH_MULTIPLYDIM
);
XLink
::
AddParamToHeadInt
(
&
c
,
n
);
XLink
::
AddParamToHeadInt
(
&
c
,
n
);
XLink
::
AddParamToHead
(
&
c
,
alpha
);
XLink
::
AddParamToHead
(
&
c
,
0
);
return
c
;
}
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void
_MultiplyBroadcast
(
const
XTensor
*
a
,
const
XTensor
*
b
,
XTensor
*
c
,
DTYPE
beta
)
{
CheckNTErrors
(
a
->
order
==
b
->
order
,
"Wrong tensor orders!"
);
CheckNTErrors
(
a
->
order
==
c
->
order
,
"Wrong tensor orders!"
);
CheckNTErrors
(
a
->
order
>
0
,
"TODO!"
);
int
order
=
a
->
order
;
int
count
=
0
;
void
*
source
=
0
;
void
*
target
=
0
;
for
(
int
i
=
0
;
i
<
order
;
i
++
){
if
(
a
->
GetDim
(
i
)
==
b
->
GetDim
(
i
))
continue
;
if
(
b
->
GetDim
(
i
)
==
1
){
int
fitSize
=
a
->
GetDim
(
i
);
int
j
=
i
+
1
;
/* we define a range over dimensions. It is to be unsqueezed */
for
(;
j
<
order
;
j
++
){
if
(
a
->
GetDim
(
j
)
==
b
->
GetDim
(
j
))
break
;
fitSize
*=
a
->
GetDim
(
j
);
}
int
dimsS
[
MAX_TENSOR_DIM_NUM
];
int
dimsT
[
MAX_TENSOR_DIM_NUM
];
for
(
int
k
=
0
;
k
<
i
;
k
++
){
dimsS
[
k
]
=
a
->
GetDim
(
k
);
dimsT
[
k
]
=
a
->
GetDim
(
k
);
}
dimsT
[
i
]
=
fitSize
;
bool
isLast
=
true
;
for
(
int
k
=
j
;
k
<
order
;
k
++
){
dimsS
[
i
+
k
-
j
+
0
]
=
b
->
GetDim
(
k
);
dimsT
[
i
+
k
-
j
+
1
]
=
b
->
GetDim
(
k
);
if
(
a
->
GetDim
(
k
)
!=
b
->
GetDim
(
k
)){
if
(
b
->
GetDim
(
k
)
==
1
)
isLast
=
false
;
else
{
ShowNTErrors
(
"Wrong dimension size!"
)
}
}
}
dimsS
[
0
]
=
-
dimsS
[
0
];
dimsT
[
0
]
=
-
dimsT
[
0
];
XTensor
*
s
=
NewTensor
(
order
-
(
j
-
i
),
dimsS
,
a
->
dataType
,
a
->
denseRatio
,
a
->
devID
,
a
->
mem
);
XTensor
*
t
=
NewTensor
(
order
-
(
j
-
i
)
+
1
,
dimsT
,
b
->
dataType
,
b
->
denseRatio
,
b
->
devID
,
b
->
mem
);
if
(
count
==
0
)
source
=
b
->
data
;
else
{
source
=
target
;
}
target
=
t
->
mem
!=
NULL
?
t
->
mem
->
AllocBuf
(
t
->
devID
,
t
->
unitNum
*
t
->
unitSize
)
:
XMemAlloc
(
t
->
devID
,
t
->
unitNum
*
t
->
unitSize
);
s
->
data
=
source
;
t
->
data
=
target
;
_Unsqueeze
(
s
,
t
,
i
,
fitSize
);
/* free the memory space of the one before the last allocation */
if
(
count
>
0
){
int
size
=
s
->
unitNum
*
s
->
unitSize
;
if
(
t
->
mem
!=
NULL
)
t
->
mem
->
ReleaseBuf
(
t
->
devID
,
size
);
else
XMemFree
(
t
->
devID
,
source
);
}
/* we do multiplication here */
if
(
isLast
){
CheckNTErrors
(
t
->
unitNum
==
c
->
unitNum
,
"Wrong tensor size!"
);
_Multiply
(
a
,
t
,
c
,
beta
);
if
(
t
->
mem
!=
NULL
)
t
->
mem
->
ReleaseBuf
(
t
->
devID
,
t
->
unitNum
*
t
->
unitSize
);
else
XMemFree
(
t
->
devID
,
target
);
target
=
NULL
;
}
s
->
data
=
NULL
;
t
->
data
=
NULL
;
DelTensor
(
s
);
DelTensor
(
t
);
i
=
j
;
count
++
;
}
}
if
(
count
==
0
)
_Multiply
(
a
,
b
,
c
,
beta
);
CheckNTErrors
(
target
==
NULL
,
"Something is wrong!"
);
}
/*
tensor broadcast multiplication
c = a * b
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
<< return - the resulting tensor c
*/
/*
tensor broadcast multiplication
c = a * b
where some of dimensions of b can be of size 1

>> a - a tensor
>> b - another tensor that would be broadcasted
<< return - the resulting tensor c
*/
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
{
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _MultiplyBroadcast function (beta = 0, i.e. c is overwritten) */
    _MultiplyBroadcast(&a, &b, &c, 0);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
    XLink::AddParamToHead(&c, 0);

    return c;
}
}
...
...
source/tensor/core/arithmetic/MultiplyDim.cu
查看文件 @
44bf9fa6
...
@@ -217,8 +217,6 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
...
@@ -217,8 +217,6 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
}
}
}
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
else if (a->dataType == X_FLOAT16) {
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
if (stride > 1) {
...
@@ -243,7 +241,9 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
...
@@ -243,7 +241,9 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
}
}
}
#endif
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
BacktoCudaDev(a->devID, devIDBackup);
}
}
...
...
source/tensor/core/arithmetic/MultiplyDim.h
查看文件 @
44bf9fa6
...
@@ -34,9 +34,16 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
...
@@ -34,9 +34,16 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void
_MultiplyDimMe
(
XTensor
*
a
,
const
XTensor
*
b
,
int
n
,
DTYPE
alpha
=
0
.
0
);
void
_MultiplyDimMe
(
XTensor
*
a
,
const
XTensor
*
b
,
int
n
,
DTYPE
alpha
=
0
.
0
);
/* tensor multiplication c = a * b
+ \alpha * c
where the size of b is equal to the n-th dimension of a,
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor
MultiplyDim
(
const
XTensor
&
a
,
const
XTensor
&
b
,
int
n
,
DTYPE
alpha
=
0
.
0
);
XTensor
MultiplyDim
(
const
XTensor
&
a
,
const
XTensor
&
b
,
int
n
);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void
_MultiplyBroadcast
(
const
XTensor
*
a
,
const
XTensor
*
b
,
XTensor
*
c
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
/* tensor broadcast multiplication c = a * b where some of dimensions of b can be of size 1.
we return the resulting tensor here */
XTensor
MultiplyBroadcast
(
const
XTensor
&
a
,
const
XTensor
&
b
);
}
// namespace nts(NiuTrans.Tensor)
}
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/arithmetic/SubDim.cu
查看文件 @
44bf9fa6
差异被折叠。
点击展开。
source/tensor/core/arithmetic/Sum.cu
查看文件 @
44bf9fa6
...
@@ -60,6 +60,16 @@ void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
...
@@ -60,6 +60,16 @@ void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
#endif
#endif
}
}
/* element-wise integer addition on the device: c[i] = a[i] + b[i] * beta,
   with beta truncated to int; one thread handles one element */
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
/* global thread index; threads beyond `size` do nothing */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
/*
/*
tensor summation c = a + b * \beta (cuda version)
tensor summation c = a + b * \beta (cuda version)
...
@@ -101,7 +111,7 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
...
@@ -101,7 +111,7 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
if ((c == a && handle != NULL) && *handle != 0) {
if ((c == a && handle != NULL) && *handle != 0) {
#ifdef DOUBELPRICSION
#ifdef DOUBELPRICSION
cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)->data, 1, (DTYPE*)a->data, 1);
cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)
b
->data, 1, (DTYPE*)a->data, 1);
#else
#else
cublasSaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
cublasSaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#endif
#endif
...
@@ -121,16 +131,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
...
@@ -121,16 +131,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
b->dataType == X_FLOAT16 &&
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
c->dataType == X_FLOAT16)
{
{
cublasHandle_t * handle = NULL;
if ((a->mem != NULL) && (b->mem != NULL)) {
cublasHandle_t * handleA = a->mem->GetCublasHandle();
cublasHandle_t * handleB = b->mem->GetCublasHandle();
handle = *handleA != 0 ? handleA : handleB;
}
else {
handle = GDevs.GetCudaHandle(a->devID);
}
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
...
@@ -141,6 +141,20 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
...
@@ -141,6 +141,20 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
}
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDInt << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta);
}
else {
else {
// TODO!!
// TODO!!
...
...
source/tensor/core/arithmetic/SumDim.cpp
查看文件 @
44bf9fa6
...
@@ -17,12 +17,16 @@
...
@@ -17,12 +17,16 @@
/*
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/
*/
#include "Sum.h"
#include "Sum.h"
#include "SumDim.h"
#include "SumDim.h"
#include "SumDim.cuh"
#include "SumDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../movement/CopyValues.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
@@ -152,7 +156,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
...
@@ -152,7 +156,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
XTensor
c
(
&
a
);
XTensor
c
(
&
a
);
c
.
SetTMPFlag
();
c
.
SetTMPFlag
();
/* call _Sum function */
/* call _Sum
Dim
function */
_SumDim
(
&
a
,
&
b
,
&
c
,
n
,
beta
);
_SumDim
(
&
a
,
&
b
,
&
c
,
n
,
beta
);
/* tensor connections */
/* tensor connections */
...
@@ -162,5 +166,146 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
...
@@ -162,5 +166,146 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return
c
;
return
c
;
}
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void
_SumBroadcast
(
const
XTensor
*
a
,
const
XTensor
*
b
,
XTensor
*
c
,
DTYPE
beta
)
{
CheckNTErrors
(
a
->
order
==
b
->
order
,
"Wrong tensor orders!"
);
CheckNTErrors
(
a
->
order
==
c
->
order
,
"Wrong tensor orders!"
);
CheckNTErrors
(
a
->
order
>
0
,
"TODO!"
);
int
order
=
a
->
order
;
int
count
=
0
;
void
*
source
=
0
;
void
*
target
=
0
;
for
(
int
i
=
0
;
i
<
order
;
i
++
){
if
(
a
->
GetDim
(
i
)
==
b
->
GetDim
(
i
))
continue
;
if
(
b
->
GetDim
(
i
)
==
1
){
int
fitSize
=
a
->
GetDim
(
i
);
int
j
=
i
+
1
;
/* we define a range over dimensions. It is to be unsqueezed */
for
(;
j
<
order
;
j
++
){
if
(
a
->
GetDim
(
j
)
==
b
->
GetDim
(
j
))
break
;
fitSize
*=
a
->
GetDim
(
j
);
}
int
dimsS
[
MAX_TENSOR_DIM_NUM
];
int
dimsT
[
MAX_TENSOR_DIM_NUM
];
for
(
int
k
=
0
;
k
<
i
;
k
++
){
dimsS
[
k
]
=
a
->
GetDim
(
k
);
dimsT
[
k
]
=
a
->
GetDim
(
k
);
}
dimsT
[
i
]
=
fitSize
;
bool
isLast
=
true
;
for
(
int
k
=
j
;
k
<
order
;
k
++
){
dimsS
[
i
+
k
-
j
+
0
]
=
b
->
GetDim
(
k
);
dimsT
[
i
+
k
-
j
+
1
]
=
b
->
GetDim
(
k
);
if
(
a
->
GetDim
(
k
)
!=
b
->
GetDim
(
k
)){
if
(
b
->
GetDim
(
k
)
==
1
)
isLast
=
false
;
else
{
ShowNTErrors
(
"Wrong dimension size!"
)
}
}
}
dimsS
[
0
]
=
-
dimsS
[
0
];
dimsT
[
0
]
=
-
dimsT
[
0
];
XTensor
*
s
=
NewTensor
(
order
-
(
j
-
i
),
dimsS
,
a
->
dataType
,
a
->
denseRatio
,
a
->
devID
,
a
->
mem
);
XTensor
*
t
=
NewTensor
(
order
-
(
j
-
i
)
+
1
,
dimsT
,
b
->
dataType
,
b
->
denseRatio
,
b
->
devID
,
b
->
mem
);
if
(
count
==
0
)
source
=
b
->
data
;
else
{
source
=
target
;
}
target
=
t
->
mem
!=
NULL
?
t
->
mem
->
AllocBuf
(
t
->
devID
,
t
->
unitNum
*
t
->
unitSize
)
:
XMemAlloc
(
t
->
devID
,
t
->
unitNum
*
t
->
unitSize
);
s
->
data
=
source
;
t
->
data
=
target
;
_Unsqueeze
(
s
,
t
,
i
,
fitSize
);
/* free the memory space of the one before the last allocation */
if
(
count
>
0
){
int
size
=
s
->
unitNum
*
s
->
unitSize
;
if
(
t
->
mem
!=
NULL
)
t
->
mem
->
ReleaseBuf
(
t
->
devID
,
size
);
else
XMemFree
(
t
->
devID
,
source
);
}
/* we do summation here */
if
(
isLast
){
CheckNTErrors
(
t
->
unitNum
==
c
->
unitNum
,
"Wrong tensor size!"
);
_Sum
(
a
,
t
,
c
,
beta
);
if
(
t
->
mem
!=
NULL
)
t
->
mem
->
ReleaseBuf
(
t
->
devID
,
t
->
unitNum
*
t
->
unitSize
);
else
XMemFree
(
t
->
devID
,
target
);
target
=
NULL
;
}
s
->
data
=
NULL
;
t
->
data
=
NULL
;
DelTensor
(
s
);
DelTensor
(
t
);
i
=
j
;
count
++
;
}
}
if
(
count
==
0
)
_Sum
(
a
,
b
,
c
,
beta
);
CheckNTErrors
(
target
==
NULL
,
"Something is wrong!"
);
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
we return c here
>> a - a tensor
>> b - another tensor that would be broadcasted
>> beta - the scaling factor
<< return - the resulting tensor c
*/
XTensor
SumBroadcast
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
beta
)
{
XTensor
c
(
&
a
);
c
.
SetTMPFlag
();
/* call _SumBroadcast function */
_SumBroadcast
(
&
a
,
&
b
,
&
c
,
beta
);
/* tensor connections */
XLink
::
MakeLink
(
&
a
,
&
b
,
&
c
,
MATH_SUMBROADCAST
);
XLink
::
AddParamToHead
(
&
c
,
beta
);
return
c
;
}
}
}
source/tensor/core/arithmetic/SumDim.cu
查看文件 @
44bf9fa6
差异被折叠。
点击展开。
source/tensor/core/arithmetic/SumDim.cuh
查看文件 @
44bf9fa6
...
@@ -17,6 +17,8 @@
...
@@ -17,6 +17,8 @@
/*
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/
*/
#ifndef __SUMDIM_CUH__
#ifndef __SUMDIM_CUH__
...
...
source/tensor/core/arithmetic/SumDim.h
查看文件 @
44bf9fa6
...
@@ -18,6 +18,9 @@
...
@@ -18,6 +18,9 @@
/*
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* It reached to 39 centigrade around 3:00 pm in Shenyang
* It reached to 39 centigrade around 3:00 pm in Shenyang
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
* Four of my master students graduated. Good luck to them for their future work!
*/
*/
#ifndef __SUMDIM_H__
#ifndef __SUMDIM_H__
...
@@ -38,6 +41,13 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
...
@@ -38,6 +41,13 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor
SumDim
(
const
XTensor
&
a
,
const
XTensor
&
b
,
int
n
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
XTensor
SumDim
(
const
XTensor
&
a
,
const
XTensor
&
b
,
int
n
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void
_SumBroadcast
(
const
XTensor
*
a
,
const
XTensor
*
b
,
XTensor
*
c
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1.
we return the resulting tensor here */
XTensor
SumBroadcast
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
}
// namespace nts(NiuTrans.Tensor)
}
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/arithmetic/XTensorBLAS.cu
查看文件 @
44bf9fa6
...
@@ -23,89 +23,12 @@
...
@@ -23,89 +23,12 @@
#include "../../XDevice.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../../XTensor.h"
#include "XTensorBLAS.h"
#include "XTensorBLAS.h"
#include <stdint.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
#ifdef USE_CUDA
#include <stdint.h>
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
//typedef char __int8;
half uint16_as_fp16(uint16_t a)
{
half res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
half f;
uint16_t i;
} cvt;
cvt.i = a;
res = cvt.f;
#endif /* __cplusplus */
return res;
}
uint32_t fp32_as_uint32(float a)
{
uint32_t res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
float f;
uint32_t i;
} cvt;
cvt.f = a;
res = cvt.i;
#endif /* __cplusplus */
return res;
}
/* host version of device function __float2half_rn() */
half float2half_rn(float a)
{
uint32_t ia = fp32_as_uint32(a);
uint16_t ir;
ir = (ia >> 16) & 0x8000;
if ((ia & 0x7f800000) == 0x7f800000) {
if ((ia & 0x7fffffff) == 0x7f800000) {
ir |= 0x7c00; /* infinity */
}
else {
ir = 0x7fff; /* canonical NaN */
}
}
else if ((ia & 0x7f800000) >= 0x33000000) {
int shift = (int)((ia >> 23) & 0xff) - 127;
if (shift > 15) {
ir |= 0x7c00; /* infinity */
}
else {
ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
if (shift < -14) { /* denormal */
ir |= ia >> (-1 - shift);
ia = ia << (32 - (-1 - shift));
}
else { /* normal */
ir |= ia >> (24 - 11);
ia = ia << (32 - (24 - 11));
ir = ir + ((14 + shift) << 10);
}
/* IEEE-754 round to nearest of even */
if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
ir++;
}
}
}
return uint16_as_fp16(ir);
}
/*
/*
matrix multiplication via cuda version BLAS
matrix multiplication via cuda version BLAS
*/
*/
...
@@ -171,11 +94,24 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
...
@@ -171,11 +94,24 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
//ShowNTErrors("TO DO!");
int alpha2 = (int)alpha;
int alpha2 = (int)alpha;
int beta2 = (int)beta;
int beta2 = (int)beta;
/*
CUDA requires that the dimension of two tensor( lda, ldb ) should be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
}
}
else {
else {
ShowNTErrors("Unsupported data type!");
ShowNTErrors("Unsupported data type!");
...
...
source/tensor/core/arithmetic/XTensorBLAS.h
查看文件 @
44bf9fa6
...
@@ -26,8 +26,6 @@
...
@@ -26,8 +26,6 @@
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
half
float2half_rn
(
float
a
);
/* matrix multiplication (BLAS) */
/* matrix multiplication (BLAS) */
void
_MatrixMULCPU
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
void
_MatrixMULCPU
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
*
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
);
XTensor
*
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
);
...
...
source/tensor/core/getandset/SetData.cpp
查看文件 @
44bf9fa6
...
@@ -430,6 +430,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
...
@@ -430,6 +430,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
//delete t2;
//delete t2;
}
}
}
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void
_SetDataRandP
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
,
DTYPE
p
,
DTYPE
value
)
{
//CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
if
(
tensor
->
devID
<
0
)
{
_SetDataRand
(
tensor
,
lower
,
upper
);
DTYPE
*
data
=
(
DTYPE
*
)
tensor
->
data
;
for
(
int
i
=
0
;
i
<
tensor
->
unitNum
;
i
++
)
{
if
(
data
[
i
]
>=
p
)
data
[
i
]
=
value
;
else
data
[
i
]
=
0
;
}
}
else
{
#ifdef USE_CUDA
_CudaSetDataRandP
(
tensor
,
lower
,
upper
,
p
,
value
);
#else
ShowNTErrors
(
"Please recompile the code by specifying USE_CUDA"
);
#endif // USE_CUDA
}
}
/*
/*
generate data items with a normal distribution with specified mean and standard deviation
generate data items with a normal distribution with specified mean and standard deviation
...
...
source/tensor/core/getandset/SetData.cu
查看文件 @
44bf9fa6
...
@@ -28,6 +28,7 @@
...
@@ -28,6 +28,7 @@
#include "../../XDevice.h"
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../../XUtility.h"
#include "../getandset/ConvertDataType.h"
#include "../getandset/ConvertDataType.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
namespace nts { // namespace nts(NiuTrans.Tensor)
...
@@ -197,6 +198,7 @@ set data array with a uniform distribution in [low, high]
...
@@ -197,6 +198,7 @@ set data array with a uniform distribution in [low, high]
__global__
__global__
void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
{
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half lowerHalf = __float2half(lower);
half lowerHalf = __float2half(lower);
half varianceHalf = __float2half(variance);
half varianceHalf = __float2half(variance);
...
@@ -204,6 +206,47 @@ void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
...
@@ -204,6 +206,47 @@ void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
if (i < size) {
if (i < size) {
d[i] = d[i] * varianceHalf + lowerHalf;
d[i] = d[i] * varianceHalf + lowerHalf;
}
}
#endif
}
/*
set data items to a pre-defined value if its value >= p, set it to 0 otherwise
>> d - pointer to the data array
>> size - size of the array
>> lower - low value of the range
>> variance - the variance of the range
*/
__global__
void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= p)
d[i] = value;
else
d[i] = 0;
}
}
__global__
void KernelSetDataPCutHalf(half * d, int size, DTYPE p, DTYPE value)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half halfP = __float2half(p);
half halfValue = __float2half(value);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= halfP)
d[i] = halfValue;
else
d[i] = 0;
}
#endif
}
}
/*
/*
...
@@ -473,34 +516,81 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
...
@@ -473,34 +516,81 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup;
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t gen;
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
curandCreateGenerator (&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
DTYPE variance = upper - lower;
if (tensor->dataType == X_FLOAT) {
if (tensor->dataType == X_FLOAT){
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance);
}
}
else{
else if (tensor->dataType == X_DOUBLE) {
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandGenerateUniform(gen, (float*)tensor1.data, tensor1.unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
}
}
//curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
//curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
else if (tensor->dataType == X_FLOAT16) {
DTYPE variance = upper - lower;
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID,tensor->mem);
curandGenerateUniform(gen, (float *)tensor1.data, tensor1.unitNum);
if (variance != 1.0F || lower != 0) {
curandDestroyGenerator(gen);
if (tensor->dataType == X_FLOAT) {
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
}
else if (tensor->dataType == X_DOUBLE) {
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
else {
ShowNTErrors("TODO!");
}
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
}
BacktoCudaDev(tensor->devID, devIDBackup);
BacktoCudaDev(tensor->devID, devIDBackup);
}
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
_CudaSetDataRand(tensor, lower, upper);
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
if (tensor->dataType == X_FLOAT) {
KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
}
else if (tensor->dataType == X_FLOAT16) {
KernelSetDataPCutHalf << <blocks, threads >> >((__half*)tensor->data, tensor->unitNum, p, value);
}
else {
ShowNTErrors("TODO!")
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
/*
set the data with an array of offsets (kernel version)
set the data with an array of offsets (kernel version)
>> data - pointer to the data array
>> data - pointer to the data array
...
...
source/tensor/core/getandset/SetData.cuh
查看文件 @
44bf9fa6
...
@@ -49,6 +49,10 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
...
@@ -49,6 +49,10 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
...
...
source/tensor/core/getandset/SetData.h
查看文件 @
44bf9fa6
...
@@ -57,6 +57,10 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
...
@@ -57,6 +57,10 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
/* generate data items with a uniform distribution in [lower, upper] */
void
_SetDataRand
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
);
void
_SetDataRand
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void
_SetDataRandP
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
,
DTYPE
p
,
DTYPE
value
);
/* generate data items with a normal distribution with specified mean and standard deviation */
/* generate data items with a normal distribution with specified mean and standard deviation */
void
_SetDataRandN
(
XTensor
*
tensor
,
DTYPE
mean
=
0
.
0
F
,
DTYPE
standardDeviation
=
1
.
0
F
);
void
_SetDataRandN
(
XTensor
*
tensor
,
DTYPE
mean
=
0
.
0
F
,
DTYPE
standardDeviation
=
1
.
0
F
);
...
...
source/tensor/core/math/Power.cpp
查看文件 @
44bf9fa6
...
@@ -35,8 +35,6 @@ get the power(a, p)
...
@@ -35,8 +35,6 @@ get the power(a, p)
*/
*/
void
_Power
(
const
XTensor
*
a
,
XTensor
*
b
,
DTYPE
p
)
void
_Power
(
const
XTensor
*
a
,
XTensor
*
b
,
DTYPE
p
)
{
{
CheckNTErrors
(
a
->
dataType
==
b
->
dataType
,
"Unmatched tensors in addition!"
);
#ifdef USE_CUDA
#ifdef USE_CUDA
/* run it on GPUs */
/* run it on GPUs */
if
(
a
->
devID
>=
0
)
{
if
(
a
->
devID
>=
0
)
{
...
...
source/tensor/core/movement/Gather.cpp
查看文件 @
44bf9fa6
...
@@ -138,4 +138,4 @@ XTensor Gather(XTensor &s, XTensor &index)
...
@@ -138,4 +138,4 @@ XTensor Gather(XTensor &s, XTensor &index)
}
}
}
}
}
//
namespace
nts
(
NiuTrans
.
Tensor
)
}
// namespace nts(NiuTrans.Tensor)
\ No newline at end of file
source/tensor/core/movement/Spread.cpp
查看文件 @
44bf9fa6
...
@@ -270,4 +270,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
...
@@ -270,4 +270,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
}
}
}
//
namespace
nts
(
NiuTrans
.
Tensor
)
}
// namespace nts(NiuTrans.Tensor)
\ No newline at end of file
source/tensor/core/movement/Spread.cu
查看文件 @
44bf9fa6
...
@@ -416,4 +416,4 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
...
@@ -416,4 +416,4 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
#endif // USE_CUDA
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
source/tensor/core/reduce/ReduceMax.cpp
查看文件 @
44bf9fa6
...
@@ -23,7 +23,6 @@
...
@@ -23,7 +23,6 @@
#include "../../XName.h"
#include "../../XName.h"
#include "ReduceMax.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
#include "ReduceMax.cuh"
#include "../getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/reduce/ReduceMax.cu
查看文件 @
44bf9fa6
差异被折叠。
点击展开。
source/tensor/core/reduce/ReduceSum.cu
查看文件 @
44bf9fa6
差异被折叠。
点击展开。
source/tensor/function/Dropout.cpp
查看文件 @
44bf9fa6
...
@@ -27,6 +27,7 @@
...
@@ -27,6 +27,7 @@
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/CHeader.h"
#include "../core/CHeader.h"
#include "../core/getandset/SetData.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor
namespace
nts
{
// namespace nts(NiuTrans.Tensor
...
@@ -40,7 +41,7 @@ for more details.
...
@@ -40,7 +41,7 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that
with no use of dropout on the test data
.
the same inference procedure as that
on the test data withno nb use of dropout
.
>> x - input tensor
>> x - input tensor
>> y - output tensor
>> y - output tensor
...
@@ -123,8 +124,8 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
...
@@ -123,8 +124,8 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
else
else
ShowNTErrors
(
"TODO!"
);
ShowNTErrors
(
"TODO!"
);
}
}
/*
/*
dropout function (we make tensor connections here)
dropout function (we make tensor connections here)
It randomly zeroes some of the elements of the input tensor
It randomly zeroes some of the elements of the input tensor
with probability p via a Bernoulli distribution.
with probability p via a Bernoulli distribution.
...
@@ -135,89 +136,108 @@ for more details.
...
@@ -135,89 +136,108 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data.
the same inference procedure as that with no use of dropout on the test data.
>> x - input tensor
>> x - input tensor
>> dropProb - probability to set an element to zero
>> dropProb - probability to set an element to zero
>> leadingDim - the dimension which we generate the random numbers and perform broadcasting
>> leadingDim - the dimension which we generate the random numbers and perform broadcasting
>> leadingDim2 - another dimension which we generate the random numbers and perform broadcasting
<< return - tensor after dropout
*/
*/
XTensor
Dropout
(
const
XTensor
&
x
,
DTYPE
dropProb
,
int
leadingDim
)
XTensor
Dropout
(
const
XTensor
&
x
,
DTYPE
dropProb
,
int
leadingDim
,
int
leadingDim2
)
{
{
CheckNTErrors
(
dropProb
>=
0.0
&&
dropProb
<=
1.0
,
"The probability must be 0-1!"
);
CheckNTErrors
(
dropProb
>=
0.0
&&
dropProb
<=
1.0
,
"The probability must be 0-1!"
);
int
n
=
leadingDim
<
0
?
x
.
order
-
1
:
leadingDim
;
XTensor
mask
;
DTYPE
*
maskArray
=
NULL
;
DTYPE
scaleFactor
=
(
DTYPE
)
1.0
/
((
DTYPE
)
1.0
-
dropProb
);
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
if
(
leadingDim
<
0
&&
leadingDim2
<
0
){
XTensor
mask
;
InitTensor
(
&
mask
,
&
x
);
_SetDataRandP
(
&
mask
,
0
,
1.0
F
,
dropProb
,
scaleFactor
);
return
Multiply
(
x
,
mask
);
}
else
if
(
leadingDim2
<
0
){
int
n
=
leadingDim
;
DTYPE
scaleFactor
=
(
DTYPE
)
1.0
/
((
DTYPE
)
1.0
-
dropProb
);
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
/* generate a mask tensor with probability p */
int
unitNum
=
x
.
dimSize
[
n
];
maskArray
=
new
DTYPE
[
unitNum
];
//srand((unsigned int)time(NULL));
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
maskArray
[
i
]
=
RandomBernoulli
(
dropProb
,
scaleFactor
);
/* generate a mask tensor with probability p */
XTensor
mask
;
int
unitNum
=
x
.
dimSize
[
n
]
;
InitTensor1D
(
&
mask
,
unitNum
,
X_FLOAT
,
x
.
devID
,
x
.
mem
)
;
DTYPE
*
maskArray
=
new
DTYPE
[
unitNum
]
;
mask
.
SetData
(
maskArray
,
unitNum
)
;
//srand((unsigned int)time(NULL));
delete
[]
maskArray
;
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
{
maskArray
[
i
]
=
RandomBernoulli
(
dropProb
,
scaleFactor
);
}
XTensor
mask
;
if
(
x
.
dataType
==
X_FLOAT
)
InitTensor1D
(
&
mask
,
unitNum
,
X_FLOAT
,
x
.
devID
,
x
.
mem
);
{
mask
.
SetData
(
maskArray
,
unitNum
);
return
MultiplyDim
(
x
,
mask
,
n
);
}
else
if
(
x
.
dataType
==
X_FLOAT16
)
{
XTensor
mask1
(
mask
.
order
,
mask
.
dimSize
,
X_FLOAT16
,
mask
.
denseRatio
,
mask
.
devID
,
mask
.
mem
);
//mask1 = ConvertDataType(mask, X_FLOAT16);
_ConvertDataType
(
&
mask
,
&
mask1
);
return
MultiplyDim
(
x
,
mask1
,
n
);
}
else
{
ShowNTErrors
(
"TODO!"
);
}
}
else
{
int
n
=
leadingDim
;
int
m
=
leadingDim2
;
delete
[]
maskArray
;
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
CheckNTErrors
(
m
>=
0
&&
m
<
x
.
order
,
"Wrong leadingDim!"
);
/* generate a mask tensor with probability p */
int
unitNum
=
x
.
dimSize
[
n
]
*
x
.
dimSize
[
m
];
maskArray
=
new
DTYPE
[
unitNum
];
if
(
x
.
dataType
==
X_FLOAT
)
//srand((unsigned int)time(NULL));
{
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
return
MultiplyDim
(
x
,
mask
,
n
,
0
);
maskArray
[
i
]
=
RandomBernoulli
(
dropProb
,
scaleFactor
);
}
else
{
XTensor
mask1
;
mask1
=
ConvertDataType
(
mask
,
X_FLOAT16
);
return
MultiplyDim
(
x
,
mask1
,
n
,
0
);
int
dims
[
MAX_TENSOR_DIM_NUM
];
}
}
for
(
int
i
=
0
;
i
<
x
.
order
;
i
++
)
dims
[
i
]
=
1
;
dims
[
n
]
=
x
.
GetDim
(
n
);
dims
[
m
]
=
x
.
GetDim
(
m
);
InitTensor
(
&
mask
,
x
.
order
,
dims
,
X_FLOAT
,
x
.
denseRatio
,
x
.
devID
,
x
.
mem
);
mask
.
SetData
(
maskArray
,
unitNum
);
delete
[]
maskArray
;
if
(
x
.
dataType
==
X_FLOAT
)
{
return
MultiplyBroadcast
(
x
,
mask
);
}
else
if
(
x
.
dataType
==
X_FLOAT16
)
{
XTensor
mask1
(
mask
.
order
,
mask
.
dimSize
,
X_FLOAT16
,
mask
.
denseRatio
,
mask
.
devID
,
mask
.
mem
);
//mask1 = ConvertDataType(mask, X_FLOAT16);
_ConvertDataType
(
&
mask
,
&
mask1
);
return
MultiplyBroadcast
(
x
,
mask1
);
}
else
{
ShowNTErrors
(
"TODO!"
);
}
}
//XTensor DropoutFloat16(const XTensor &x, DTYPE dropProb, int leadingDim)
//{
}
// CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
//
// int n = leadingDim < 0 ? x.order - 1 : leadingDim;
//
// CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
//
// DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
//
// /* generate a mask tensor with probability p */
// int unitNum = x.dimSize[n];
// DTYPE * maskArray = new DTYPE[unitNum];
//
// //srand((unsigned int)time(NULL));
// for (int i = 0; i < unitNum; i++) {
// maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
// }
//
// XTensor mask;
// InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
// mask.SetData(maskArray, unitNum);
//
// delete[] maskArray;
//
// XTensor halfMask;
// halfMask = ConvertDataType(mask, X_FLOAT16);
// XTensor halfX;
// halfX = ConvertDataType(x, X_FLOAT16);
// XTensor result;
// XTensor halfResult;
//
// halfResult = MultiplyDim(halfX, halfMask, n, 0);
//
// result = ConvertDataType(halfResult, X_FLOAT);
// return result;
// /*return MultiplyDim(x, mask1, n, 0);*/
//}
/*
/*
dropout function without broadcast
dropout function without broadcast
...
@@ -235,7 +255,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
...
@@ -235,7 +255,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
int
unitNum
=
x
.
unitNum
;
int
unitNum
=
x
.
unitNum
;
DTYPE
*
maskArray
=
new
DTYPE
[
unitNum
];
DTYPE
*
maskArray
=
new
DTYPE
[
unitNum
];
srand
((
unsigned
int
)
time
(
NULL
));
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
maskArray
[
i
]
=
RandomBernoulli
(
dropProb
,
scaleFactor
);
maskArray
[
i
]
=
RandomBernoulli
(
dropProb
,
scaleFactor
);
...
...
source/tensor/function/Dropout.h
查看文件 @
44bf9fa6
...
@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
inline
DTYPE
RandomBernoulli
(
DTYPE
dropProb
,
DTYPE
value
)
inline
DTYPE
RandomBernoulli
(
DTYPE
dropProb
,
DTYPE
value
)
{
{
return
(
DTYPE
)
rand
()
/
(
DTYPE
)
RAND_MAX
>=
dropProb
?
(
DTYPE
)
value
:
0
;
return
(
DTYPE
)
rand
()
/
(
DTYPE
)
RAND_MAX
>=
dropProb
?
(
DTYPE
)
value
:
0
;
}
}
/* dropout function */
/* dropout function */
...
@@ -40,11 +39,9 @@ void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb,
...
@@ -40,11 +39,9 @@ void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb,
void
_DropoutBackward
(
const
XTensor
*
y
,
const
XTensor
*
x
,
void
_DropoutBackward
(
const
XTensor
*
y
,
const
XTensor
*
x
,
const
XTensor
*
dedy
,
XTensor
*
dedx
,
const
XTensor
*
dedy
,
XTensor
*
dedx
,
unsigned
int
seed
,
DTYPE
dropProb
,
int
leadingDim
=
-
1
);
unsigned
int
seed
,
DTYPE
dropProb
,
int
leadingDim
=
-
1
);
/* dropout function */
XTensor
Dropout
(
const
XTensor
&
x
,
DTYPE
dropProb
,
int
leadingDim
=
-
1
);
/* dropout function */
XTensor
Dropout
(
const
XTensor
&
x
,
DTYPE
dropProb
,
int
leadingDim
=
-
1
,
int
leadingDim2
=
-
1
);
/* dropout function without broadcast */
/* dropout function without broadcast */
XTensor
DropoutWithoutBroadcast
(
const
XTensor
&
x
,
DTYPE
dropProb
);
XTensor
DropoutWithoutBroadcast
(
const
XTensor
&
x
,
DTYPE
dropProb
);
...
...
source/tensor/function/LogSoftmax.cpp
查看文件 @
44bf9fa6
...
@@ -27,9 +27,6 @@
...
@@ -27,9 +27,6 @@
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/movement/CopyValues.h"
#include "../core/movement/CopyValues.h"
#include "../../tensor/core/getandset/ConvertDataType.h"
using
namespace
nts
;
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
@@ -182,125 +179,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
...
@@ -182,125 +179,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
delete
[]
dimSize
;
delete
[]
dimSize
;
// if (!x->isSparse && !y->isSparse &&
// x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
// {
// int * dimSize = new int[x->order - 1];
// for (int i = 0; i < x->order; i++) {
// if (i < leadDim)
// dimSize[i] = -x->dimSize[i];
// else if (i > leadDim)
// dimSize[i - 1] = -x->dimSize[i];
// }
//
// XMem * mem = x->mem;
// XTensor * max = NULL;
// XTensor * sum = NULL;
// XTensor * blockx = NULL;
// XTensor * blocky = NULL;
// XTensor * blockMax = NULL;
// XTensor * blockSum = NULL;
//
// int dimensionSize = y->dimSizeRDI[leadDimRDI];
// int stride = 1;
// int blockSize = 1;
// int blockNum = 1;
//
// for (int i = 0; i < leadDimRDI; i++)
// stride *= y->dimSizeRDI[i];
// blockSize = stride * dimensionSize;
// blockNum = y->unitNum / blockSize;
//
// max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
// sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
//
// _ReduceMax(x, max, leadDim);
// _ReduceSum(x, sum, leadDim, max, 1.0F, true);
//
// if (x->devID >= 0) {
// if (leadDimRDI == 0) {
// blockSize = y->unitNum;
// blockNum = 1;
// blockx = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// }
// else {
// blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// }
// }
//
// for (int k = 0; k < blockNum; k++) {
// int m = stride;
// int n = dimensionSize;
//
// DTYPE * ip = (DTYPE*)x->data + k * blockSize;
// DTYPE * op = (DTYPE*)y->data + k * blockSize;
// DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
// DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
//
// if (x->devID < 0) {
// for (int j = 0; j < m; j++) {
// DTYPE sumValue = sp[j];
// if (sumValue == 0) {
// for (int i = 0; i < n; i++)
// op[i * m + j] = 0;
// }
// else {
// for (int i = 0; i < n; i++) {
// DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
// if (IsNAN(r))
// r = LOGPROB_MIN;
// if (IsINF(r))
// r = LOGPROB_MIN;
//
// op[i * m + j] = MAX(r, LOGPROB_MIN);
// }
// }
// }
// }
// else {
// blockx->data = ip;
// blocky->data = op;
// blockMax->data = mp;
// blockSum->data = sp;
//#ifdef USE_CUDA
// if (leadDimRDI == 0)
// _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
// else
// _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
//#else
// ShowNTErrors("Please specify USE_CUDA and recompile the code!");
//#endif
// blockx->data = NULL;
// blocky->data = NULL;
// blockMax->data = NULL;
// blockSum->data = NULL;
// }
// }
//
// DelTensorBuf(max);
// DelTensorBuf(sum);
//
// if (x->devID >= 0) {
// delete blockx;
// delete blocky;
// delete blockMax;
// delete blockSum;
// }
//
// delete[] dimSize;
// }
// else
// ShowNTErrors("TODO!");
}
}
/*
/*
...
...
source/tensor/function/LogSoftmax.cu
查看文件 @
44bf9fa6
...
@@ -26,7 +26,6 @@
...
@@ -26,7 +26,6 @@
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
#include "../XDevice.h"
#include "device_launch_parameters.h"
#include "cuda_fp16.h"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/function/Softmax.cpp
查看文件 @
44bf9fa6
...
@@ -26,7 +26,6 @@
...
@@ -26,7 +26,6 @@
#include "../XUtility.h"
#include "../XUtility.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/function/Softmax.cu
查看文件 @
44bf9fa6
...
@@ -29,7 +29,6 @@
...
@@ -29,7 +29,6 @@
#include "../core/arithmetic/Sum.h"
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
#include "../XDevice.h"
#include "../XUtility.h"
#include "../XUtility.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
namespace nts { // namespace nts(NiuTrans.Tensor)
...
@@ -86,8 +85,6 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
...
@@ -86,8 +85,6 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
/* synchronize to make sure the values of max and sum are loaded */
/* synchronize to make sure the values of max and sum are loaded */
__syncthreads();
__syncthreads();
//printf("1: %d %d %d %d\n", i, strideSizeTotal, j, strideNum);
if(i < strideSizeTotal && j < strideNum){
if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
...
@@ -145,8 +142,6 @@ void KernelSoftmaxComputeTensorHalf(__half * x, __half * max, __half * sum, __ha
...
@@ -145,8 +142,6 @@ void KernelSoftmaxComputeTensorHalf(__half * x, __half * max, __half * sum, __ha
/* synchronize to make sure the values of max and sum are loaded */
/* synchronize to make sure the values of max and sum are loaded */
__syncthreads();
__syncthreads();
//printf("2: %d %d %d %d\n",i ,stride * blockNum ,j ,strideNum);
if(i < stride * blockNum && j < strideNum){
if(i < stride * blockNum && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
...
@@ -256,7 +251,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
...
@@ -256,7 +251,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
if (leadDim != 0 || dimensionSize <= 10) {
if (leadDim != 0 || dimensionSize <= 10) {
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
...
@@ -269,8 +263,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
...
@@ -269,8 +263,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
}
}
else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
//printf("%d\n\n",dimensionSize);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelSoftmaxComputeTensorHalf <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
KernelSoftmaxComputeTensorHalf <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
stride, dimensionSize, blockNum);
stride, dimensionSize, blockNum);
...
@@ -279,10 +271,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
...
@@ -279,10 +271,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
ShowNTErrors("TODO!");
ShowNTErrors("TODO!");
}
}
/*XTensor y1;
y1 = ConvertDataType(*y, X_FLOAT);
y1.Dump(stderr, "y1:");*/
BacktoCudaDev(x->devID, devIDBackup);
BacktoCudaDev(x->devID, devIDBackup);
}
}
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论