Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
8
Issues
8
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
NiuTrans
NiuTrans.Tensor
Commits
02b6c379
Commit
02b6c379
authored
Feb 21, 2021
by
xiaotong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
bug fixes and removing warnings
parent
5f9867fc
隐藏空白字符变更
内嵌
并排
正在显示
20 个修改的文件
包含
133 行增加
和
98 行删除
+133
-98
source/sample/transformer/Model.cpp
+1
-1
source/sample/transformer/Utility.cpp
+6
-6
source/sample/transformer/module/Attention.cpp
+1
-1
source/sample/transformer/module/LayerHistory.cpp
+2
-2
source/sample/transformer/train/TrainDataSet.cpp
+33
-31
source/sample/transformer/train/Trainer.cpp
+5
-5
source/sample/transformer/translate/DataSet.cpp
+14
-14
source/sample/transformer/translate/DataSet.h
+1
-1
source/tensor/XCall.cpp
+1
-0
source/tensor/XMem.cpp
+3
-3
source/tensor/XTensor.cpp
+4
-4
source/tensor/XTensor.h
+2
-2
source/tensor/core/arithmetic/Sum.cpp
+9
-7
source/tensor/core/arithmetic/XTensorBLAS.cu
+23
-0
source/tensor/core/getandset/SetData.cpp
+12
-11
source/tensor/core/getandset/SetData.h
+2
-2
source/tensor/core/math/Clip.cpp
+2
-2
source/tensor/core/math/ScaleAndShift.cpp
+2
-2
source/tensor/core/reduce/ReduceSum.cu
+4
-0
source/tensor/test/TSetData.cpp
+6
-4
没有找到文件。
source/sample/transformer/Model.cpp
查看文件 @
02b6c379
...
@@ -490,7 +490,7 @@ void Model::Read(FILE* file)
...
@@ -490,7 +490,7 @@ void Model::Read(FILE* file)
TensorList
params
;
TensorList
params
;
GetParams
(
params
);
GetParams
(
params
);
LOG
(
"params count: %lu"
,
params
.
Size
());
LOG
(
"params count: %lu"
,
(
unsigned
long
)
params
.
Size
());
int
size
=
0
;
int
size
=
0
;
for
(
int
i
=
0
;
i
<
params
.
Size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
params
.
Size
();
i
++
)
{
size
+=
params
[
i
]
->
unitNum
;
size
+=
params
[
i
]
->
unitNum
;
...
...
source/sample/transformer/Utility.cpp
查看文件 @
02b6c379
...
@@ -91,9 +91,9 @@ Config::Config(int argc, const char** argv)
...
@@ -91,9 +91,9 @@ Config::Config(int argc, const char** argv)
LoadParamInt
(
argsNum
,
args
,
"sbatch"
,
&
sBatchSize
,
8
);
LoadParamInt
(
argsNum
,
args
,
"sbatch"
,
&
sBatchSize
,
8
);
isTraining
=
(
strcmp
(
trainFN
,
""
)
==
0
)
?
false
:
true
;
isTraining
=
(
strcmp
(
trainFN
,
""
)
==
0
)
?
false
:
true
;
LoadParamBool
(
argsNum
,
args
,
"mt"
,
&
isMT
,
true
);
LoadParamBool
(
argsNum
,
args
,
"mt"
,
&
isMT
,
true
);
LoadParamFloat
(
argsNum
,
args
,
"dropout"
,
&
dropout
,
0.3
);
LoadParamFloat
(
argsNum
,
args
,
"dropout"
,
&
dropout
,
0.3
F
);
LoadParamFloat
(
argsNum
,
args
,
"fnndrop"
,
&
fnnDropout
,
0.1
);
LoadParamFloat
(
argsNum
,
args
,
"fnndrop"
,
&
fnnDropout
,
0.1
F
);
LoadParamFloat
(
argsNum
,
args
,
"attdrop"
,
&
attDropout
,
0.1
);
LoadParamFloat
(
argsNum
,
args
,
"attdrop"
,
&
attDropout
,
0.1
F
);
LoadParamFloat
(
argc
,
args
,
"lrate"
,
&
lrate
,
0.0015
F
);
LoadParamFloat
(
argc
,
args
,
"lrate"
,
&
lrate
,
0.0015
F
);
LoadParamFloat
(
argc
,
args
,
"lrbias"
,
&
lrbias
,
0
);
LoadParamFloat
(
argc
,
args
,
"lrbias"
,
&
lrbias
,
0
);
...
@@ -106,7 +106,7 @@ Config::Config(int argc, const char** argv)
...
@@ -106,7 +106,7 @@ Config::Config(int argc, const char** argv)
LoadParamFloat
(
argc
,
args
,
"adambeta2"
,
&
adamBeta2
,
0.98
F
);
LoadParamFloat
(
argc
,
args
,
"adambeta2"
,
&
adamBeta2
,
0.98
F
);
LoadParamFloat
(
argc
,
args
,
"adamdelta"
,
&
adamDelta
,
1e-9
F
);
LoadParamFloat
(
argc
,
args
,
"adamdelta"
,
&
adamDelta
,
1e-9
F
);
LoadParamBool
(
argc
,
args
,
"shuffled"
,
&
isShuffled
,
true
);
LoadParamBool
(
argc
,
args
,
"shuffled"
,
&
isShuffled
,
true
);
LoadParamFloat
(
argc
,
args
,
"labelsmoothing"
,
&
labelSmoothingP
,
0.1
);
LoadParamFloat
(
argc
,
args
,
"labelsmoothing"
,
&
labelSmoothingP
,
0.1
F
);
LoadParamInt
(
argc
,
args
,
"nstepcheckpoint"
,
&
nStepCheckpoint
,
-
1
);
LoadParamInt
(
argc
,
args
,
"nstepcheckpoint"
,
&
nStepCheckpoint
,
-
1
);
LoadParamBool
(
argc
,
args
,
"epochcheckpoint"
,
&
useEpochCheckpoint
,
true
);
LoadParamBool
(
argc
,
args
,
"epochcheckpoint"
,
&
useEpochCheckpoint
,
true
);
LoadParamInt
(
argc
,
args
,
"updatestep"
,
&
updateStep
,
1
);
LoadParamInt
(
argc
,
args
,
"updatestep"
,
&
updateStep
,
1
);
...
@@ -124,8 +124,8 @@ Config::Config(int argc, const char** argv)
...
@@ -124,8 +124,8 @@ Config::Config(int argc, const char** argv)
LoadParamString
(
argsNum
,
args
,
"output"
,
outputFN
,
""
);
LoadParamString
(
argsNum
,
args
,
"output"
,
outputFN
,
""
);
LoadParamInt
(
argsNum
,
args
,
"beamsize"
,
&
beamSize
,
1
);
LoadParamInt
(
argsNum
,
args
,
"beamsize"
,
&
beamSize
,
1
);
LoadParamBool
(
argsNum
,
args
,
"fp16"
,
&
useFP16
,
false
);
LoadParamBool
(
argsNum
,
args
,
"fp16"
,
&
useFP16
,
false
);
LoadParamFloat
(
argsNum
,
args
,
"lenalpha"
,
&
lenAlpha
,
0.6
);
LoadParamFloat
(
argsNum
,
args
,
"lenalpha"
,
&
lenAlpha
,
0.6
F
);
LoadParamFloat
(
argsNum
,
args
,
"maxlenalpha"
,
&
maxLenAlpha
,
1.2
);
LoadParamFloat
(
argsNum
,
args
,
"maxlenalpha"
,
&
maxLenAlpha
,
1.2
F
);
for
(
int
i
=
0
;
i
<
argc
;
i
++
)
for
(
int
i
=
0
;
i
<
argc
;
i
++
)
delete
[]
args
[
i
];
delete
[]
args
[
i
];
...
...
source/sample/transformer/module/Attention.cpp
查看文件 @
02b6c379
...
@@ -255,7 +255,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
...
@@ -255,7 +255,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey
=
ConvertDataType
(
relativeKey
,
X_FLOAT
);
relativeKey
=
ConvertDataType
(
relativeKey
,
X_FLOAT
);
}
}
float
scaling
=
sqrt
(
d
/
nhead
);
float
scaling
=
(
float
)
sqrt
(
d
/
nhead
);
qheads
=
ScaleAndShift
(
qheads
,
1.0
F
/
scaling
);
qheads
=
ScaleAndShift
(
qheads
,
1.0
F
/
scaling
);
dot
=
RPDotProduct
(
qheads
,
kheads
,
relativeKey
,
true
);
dot
=
RPDotProduct
(
qheads
,
kheads
,
relativeKey
,
true
);
...
...
source/sample/transformer/module/LayerHistory.cpp
查看文件 @
02b6c379
...
@@ -92,10 +92,10 @@ generate the weight sum vector of all previous layer output in the history as th
...
@@ -92,10 +92,10 @@ generate the weight sum vector of all previous layer output in the history as th
XTensor
LayerHistory
::
Pop
()
XTensor
LayerHistory
::
Pop
()
{
{
/* the number of layer output in the history */
/* the number of layer output in the history */
size_t
size
=
history
.
Size
();
int
size
=
(
int
)
history
.
Size
();
TensorList
historyList
;
TensorList
historyList
;
for
(
size_
t
i
=
0
;
i
<
size
;
i
++
)
for
(
in
t
i
=
0
;
i
<
size
;
i
++
)
historyList
.
Add
(
history
[
i
]);
historyList
.
Add
(
history
[
i
]);
/* we need stack the tensor along the first dim*/
/* we need stack the tensor along the first dim*/
...
...
source/sample/transformer/train/TrainDataSet.cpp
查看文件 @
02b6c379
...
@@ -134,13 +134,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
...
@@ -134,13 +134,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
UInt64List
info
;
UInt64List
info
;
size_t
srcTokenNum
=
0
;
size_t
srcTokenNum
=
0
;
size_t
tgtTokenNum
=
0
;
size_t
tgtTokenNum
=
0
;
in
t
realBatchSize
=
1
;
size_
t
realBatchSize
=
1
;
if
(
!
isTraining
)
if
(
!
isTraining
)
realBatchSize
=
minSentBatch
;
realBatchSize
=
minSentBatch
;
/* get the maximum source sentence length in a mini-batch */
/* get the maximum source sentence length in a mini-batch */
size_t
maxSrcLen
=
buffer
[
curIdx
]
->
srcSent
.
Size
();
size_t
maxSrcLen
=
buffer
[
(
int
)
curIdx
]
->
srcSent
.
Size
();
/* max batch size */
/* max batch size */
const
int
MAX_BATCH_SIZE
=
512
;
const
int
MAX_BATCH_SIZE
=
512
;
...
@@ -150,9 +150,9 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
...
@@ -150,9 +150,9 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
while
((
realBatchSize
<
(
buffer
.
Size
()
-
curIdx
))
while
((
realBatchSize
<
(
buffer
.
Size
()
-
curIdx
))
&&
(
realBatchSize
*
maxSrcLen
<
batchSize
)
&&
(
realBatchSize
*
maxSrcLen
<
batchSize
)
&&
(
realBatchSize
<
MAX_BATCH_SIZE
)
&&
(
realBatchSize
<
MAX_BATCH_SIZE
)
&&
(
realBatchSize
*
buffer
[
curIdx
+
realBatchSize
]
->
srcSent
.
Size
()
<
batchSize
))
{
&&
(
realBatchSize
*
buffer
[
(
int
)(
curIdx
+
realBatchSize
)
]
->
srcSent
.
Size
()
<
batchSize
))
{
if
(
maxSrcLen
<
buffer
[
curIdx
+
realBatchSize
]
->
srcSent
.
Size
())
if
(
maxSrcLen
<
buffer
[
(
int
)(
curIdx
+
realBatchSize
)
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
curIdx
+
realBatchSize
]
->
srcSent
.
Size
();
maxSrcLen
=
buffer
[
(
int
)(
curIdx
+
realBatchSize
)
]
->
srcSent
.
Size
();
realBatchSize
++
;
realBatchSize
++
;
}
}
}
}
...
@@ -165,14 +165,14 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
...
@@ -165,14 +165,14 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
CheckNTErrors
(
realBatchSize
>
0
,
"Invalid batch size"
);
CheckNTErrors
(
realBatchSize
>
0
,
"Invalid batch size"
);
/* get the maximum target sentence length in a mini-batch */
/* get the maximum target sentence length in a mini-batch */
size_t
maxTgtLen
=
buffer
[
curIdx
]
->
tgtSent
.
Size
();
size_t
maxTgtLen
=
buffer
[
(
int
)
curIdx
]
->
tgtSent
.
Size
();
for
(
size_t
i
=
0
;
i
<
realBatchSize
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
realBatchSize
;
i
++
)
{
if
(
maxTgtLen
<
buffer
[
curIdx
+
i
]
->
tgtSent
.
Size
())
if
(
maxTgtLen
<
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
.
Size
())
maxTgtLen
=
buffer
[
curIdx
+
i
]
->
tgtSent
.
Size
();
maxTgtLen
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
.
Size
();
}
}
for
(
size_t
i
=
0
;
i
<
realBatchSize
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
realBatchSize
;
i
++
)
{
if
(
maxSrcLen
<
buffer
[
curIdx
+
i
]
->
srcSent
.
Size
())
if
(
maxSrcLen
<
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
curIdx
+
i
]
->
srcSent
.
Size
();
maxSrcLen
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
.
Size
();
}
}
CheckNTErrors
(
maxSrcLen
!=
0
,
"Invalid source length for batching"
);
CheckNTErrors
(
maxSrcLen
!=
0
,
"Invalid source length for batching"
);
...
@@ -204,19 +204,19 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
...
@@ -204,19 +204,19 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
*/
*/
for
(
int
i
=
0
;
i
<
realBatchSize
;
++
i
)
{
for
(
int
i
=
0
;
i
<
realBatchSize
;
++
i
)
{
srcTokenNum
+=
buffer
[
curIdx
+
i
]
->
srcSent
.
Size
();
srcTokenNum
+=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
.
Size
();
tgtTokenNum
+=
buffer
[
curIdx
+
i
]
->
tgtSent
.
Size
();
tgtTokenNum
+=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
.
Size
();
curSrc
=
maxSrcLen
*
i
;
curSrc
=
maxSrcLen
*
i
;
for
(
int
j
=
0
;
j
<
buffer
[
curIdx
+
i
]
->
srcSent
.
Size
();
j
++
)
{
for
(
int
j
=
0
;
j
<
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
.
Size
();
j
++
)
{
batchEncValues
[
curSrc
++
]
=
buffer
[
curIdx
+
i
]
->
srcSent
[
j
];
batchEncValues
[
curSrc
++
]
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
[
j
];
}
}
curTgt
=
maxTgtLen
*
i
;
curTgt
=
maxTgtLen
*
i
;
for
(
int
j
=
0
;
j
<
buffer
[
curIdx
+
i
]
->
tgtSent
.
Size
();
j
++
)
{
for
(
int
j
=
0
;
j
<
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
.
Size
();
j
++
)
{
if
(
j
>
0
)
if
(
j
>
0
)
labelVaues
[
curTgt
-
1
]
=
buffer
[
curIdx
+
i
]
->
tgtSent
[
j
];
labelVaues
[
curTgt
-
1
]
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
[
j
];
batchDecValues
[
curTgt
++
]
=
buffer
[
curIdx
+
i
]
->
tgtSent
[
j
];
batchDecValues
[
curTgt
++
]
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
[
j
];
}
}
labelVaues
[
curTgt
-
1
]
=
EOS
;
labelVaues
[
curTgt
-
1
]
=
EOS
;
while
(
curSrc
<
maxSrcLen
*
(
i
+
1
))
while
(
curSrc
<
maxSrcLen
*
(
i
+
1
))
...
@@ -226,11 +226,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
...
@@ -226,11 +226,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
}
}
InitTensor2D
(
batchEnc
,
realBatchSize
,
maxSrcLen
,
X_INT
,
devID
);
int
rbs
=
(
int
)
realBatchSize
;
InitTensor2D
(
paddingEnc
,
realBatchSize
,
maxSrcLen
,
X_FLOAT
,
devID
);
int
msl
=
(
int
)
maxSrcLen
;
InitTensor2D
(
batchDec
,
realBatchSize
,
maxTgtLen
,
X_INT
,
devID
);
InitTensor2D
(
batchEnc
,
rbs
,
msl
,
X_INT
,
devID
);
InitTensor2D
(
paddingDec
,
realBatchSize
,
maxTgtLen
,
X_FLOAT
,
devID
);
InitTensor2D
(
paddingEnc
,
rbs
,
msl
,
X_FLOAT
,
devID
);
InitTensor2D
(
label
,
realBatchSize
,
maxTgtLen
,
X_INT
,
devID
);
InitTensor2D
(
batchDec
,
rbs
,
msl
,
X_INT
,
devID
);
InitTensor2D
(
paddingDec
,
rbs
,
msl
,
X_FLOAT
,
devID
);
InitTensor2D
(
label
,
rbs
,
msl
,
X_INT
,
devID
);
curIdx
+=
realBatchSize
;
curIdx
+=
realBatchSize
;
...
@@ -304,14 +306,14 @@ void TrainDataSet::BuildBucket()
...
@@ -304,14 +306,14 @@ void TrainDataSet::BuildBucket()
size_t
sentNum
=
1
;
size_t
sentNum
=
1
;
/* get the maximum source sentence length in a bucket */
/* get the maximum source sentence length in a bucket */
size_t
maxSrcLen
=
buffer
[
idx
]
->
srcSent
.
Size
();
size_t
maxSrcLen
=
buffer
[
(
int
)
idx
]
->
srcSent
.
Size
();
/* bucketing for sentences */
/* bucketing for sentences */
while
((
sentNum
<
(
buffer
.
Size
()
-
idx
))
while
((
sentNum
<
(
buffer
.
Size
()
-
idx
))
&&
(
sentNum
*
maxSrcLen
<
bucketSize
)
&&
(
sentNum
*
maxSrcLen
<
bucketSize
)
&&
(
sentNum
*
buffer
[
curIdx
+
sentNum
]
->
srcSent
.
Size
()
<
bucketSize
))
{
&&
(
sentNum
*
buffer
[
(
int
)(
curIdx
+
sentNum
)
]
->
srcSent
.
Size
()
<
bucketSize
))
{
if
(
maxSrcLen
<
buffer
[
idx
+
sentNum
]
->
srcSent
.
Size
())
if
(
maxSrcLen
<
buffer
[
(
int
)(
idx
+
sentNum
)
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
idx
+
sentNum
]
->
srcSent
.
Size
();
maxSrcLen
=
buffer
[
(
int
)(
idx
+
sentNum
)
]
->
srcSent
.
Size
();
sentNum
++
;
sentNum
++
;
}
}
...
@@ -324,7 +326,7 @@ void TrainDataSet::BuildBucket()
...
@@ -324,7 +326,7 @@ void TrainDataSet::BuildBucket()
/* shuffle items in a bucket */
/* shuffle items in a bucket */
for
(
size_t
i
=
0
;
i
<
sentNum
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
sentNum
;
i
++
)
{
buffer
[
idx
+
i
]
->
bucketKey
=
randomKey
;
buffer
[
(
int
)(
idx
+
i
)
]
->
bucketKey
=
randomKey
;
}
}
idx
+=
sentNum
;
idx
+=
sentNum
;
...
@@ -335,13 +337,13 @@ void TrainDataSet::BuildBucket()
...
@@ -335,13 +337,13 @@ void TrainDataSet::BuildBucket()
idx
=
0
;
idx
=
0
;
while
(
idx
<
buffer
.
Size
())
{
while
(
idx
<
buffer
.
Size
())
{
size_t
sentNum
=
0
;
size_t
sentNum
=
0
;
int
bucketKey
=
buffer
[
idx
+
sentNum
]
->
bucketKey
;
int
bucketKey
=
buffer
[
(
int
)(
idx
+
sentNum
)
]
->
bucketKey
;
while
(
sentNum
<
(
buffer
.
Size
()
-
idx
)
while
(
sentNum
<
(
buffer
.
Size
()
-
idx
)
&&
buffer
[
idx
+
sentNum
]
->
bucketKey
==
bucketKey
)
{
&&
buffer
[
(
int
)(
idx
+
sentNum
)
]
->
bucketKey
==
bucketKey
)
{
buffer
[
idx
+
sentNum
]
->
key
=
buffer
[
idx
+
sentNum
]
->
srcSent
.
Size
();
buffer
[
(
int
)(
idx
+
sentNum
)]
->
key
=
(
int
)
buffer
[(
int
)(
idx
+
sentNum
)
]
->
srcSent
.
Size
();
sentNum
++
;
sentNum
++
;
}
}
SortInBucket
(
idx
,
idx
+
sentNum
);
SortInBucket
(
(
int
)
idx
,
(
int
)(
idx
+
sentNum
)
);
idx
+=
sentNum
;
idx
+=
sentNum
;
}
}
}
}
...
...
source/sample/transformer/train/Trainer.cpp
查看文件 @
02b6c379
...
@@ -163,8 +163,8 @@ void Trainer::Train(const char* fn, const char* validFN,
...
@@ -163,8 +163,8 @@ void Trainer::Train(const char* fn, const char* validFN,
UInt64List
info
=
batchLoader
.
LoadBatch
(
&
batchEnc
,
&
paddingEnc
,
&
batchDec
,
&
paddingDec
,
&
label
,
UInt64List
info
=
batchLoader
.
LoadBatch
(
&
batchEnc
,
&
paddingEnc
,
&
batchDec
,
&
paddingDec
,
&
label
,
sBatchSize
,
wBatchSize
,
devID
);
sBatchSize
,
wBatchSize
,
devID
);
wc
=
info
[
0
];
wc
=
(
int
)
info
[
0
];
ws
=
info
[
1
];
ws
=
(
int
)
info
[
1
];
CheckNTErrors
(
batchEnc
.
order
==
2
,
"wrong tensor order of the sequence batch"
);
CheckNTErrors
(
batchEnc
.
order
==
2
,
"wrong tensor order of the sequence batch"
);
/* output probabilities */
/* output probabilities */
...
@@ -206,7 +206,7 @@ void Trainer::Train(const char* fn, const char* validFN,
...
@@ -206,7 +206,7 @@ void Trainer::Train(const char* fn, const char* validFN,
if
(
gradStep
==
updateStep
)
{
if
(
gradStep
==
updateStep
)
{
float
warmupEndLR
=
lrate
;
float
warmupEndLR
=
lrate
;
float
warmupInitLR
=
1e-7
;
float
warmupInitLR
=
1e-7
F
;
float
lrStep
=
(
warmupEndLR
-
warmupInitLR
)
/
nwarmup
;
float
lrStep
=
(
warmupEndLR
-
warmupInitLR
)
/
nwarmup
;
float
decayFactor
=
warmupEndLR
*
pow
(
float
(
nwarmup
),
0.5
F
);
float
decayFactor
=
warmupEndLR
*
pow
(
float
(
nwarmup
),
0.5
F
);
...
@@ -320,8 +320,8 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
...
@@ -320,8 +320,8 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
UInt64List
info
=
batchLoader
.
LoadBatch
(
&
batchEnc
,
&
paddingEnc
,
&
batchDec
,
&
paddingDec
,
&
label
,
UInt64List
info
=
batchLoader
.
LoadBatch
(
&
batchEnc
,
&
paddingEnc
,
&
batchDec
,
&
paddingDec
,
&
label
,
sBatchSize
,
0
,
model
->
devID
);
sBatchSize
,
0
,
model
->
devID
);
wc
=
info
[
0
];
wc
=
(
int
)
info
[
0
];
ws
=
info
[
1
];
ws
=
(
int
)
info
[
1
];
CheckNTErrors
(
batchEnc
.
order
==
2
,
"Wrong tensor order of the sequence batch"
);
CheckNTErrors
(
batchEnc
.
order
==
2
,
"Wrong tensor order of the sequence batch"
);
/* make the network */
/* make the network */
...
...
source/sample/transformer/translate/DataSet.cpp
查看文件 @
02b6c379
...
@@ -70,10 +70,10 @@ void DataSet::LoadDataToBuffer()
...
@@ -70,10 +70,10 @@ void DataSet::LoadDataToBuffer()
size_t
maxLen
=
indices
.
Size
()
>
MAX_WORD_NUM
?
MAX_WORD_NUM
:
indices
.
Size
();
size_t
maxLen
=
indices
.
Size
()
>
MAX_WORD_NUM
?
MAX_WORD_NUM
:
indices
.
Size
();
for
(
size_t
i
=
0
;
i
<
maxLen
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
maxLen
;
i
++
)
{
auto
offset
=
(
i
!=
(
indices
.
Size
()
-
1
))
?
size_t
offset
=
(
i
!=
(
indices
.
Size
()
-
1
))
?
indices
[
i
+
1
]
-
indices
[
i
]
-
tokenDelimiter
.
size
()
(
size_t
)
indices
[(
int
)
i
+
1
]
-
(
size_t
)
indices
[(
int
)
i
]
-
tokenDelimiter
.
size
()
:
line
.
size
()
-
indices
[
i
];
:
line
.
size
()
-
(
size_t
)
indices
[(
int
)
i
];
string
word
=
line
.
substr
(
indices
[
i
],
offset
);
string
word
=
line
.
substr
(
(
size_t
)
indices
[(
int
)
i
],
offset
);
if
(
srcVocab
.
word2id
.
find
(
word
)
==
srcVocab
.
word2id
.
end
())
if
(
srcVocab
.
word2id
.
find
(
word
)
==
srcVocab
.
word2id
.
end
())
values
.
Add
(
UNK
);
values
.
Add
(
UNK
);
else
else
...
@@ -110,12 +110,12 @@ load a mini-batch to the device (for translating)
...
@@ -110,12 +110,12 @@ load a mini-batch to the device (for translating)
<< indices of the sentences
<< indices of the sentences
*/
*/
UInt64List
DataSet
::
LoadBatch
(
XTensor
*
batchEnc
,
XTensor
*
paddingEnc
,
UInt64List
DataSet
::
LoadBatch
(
XTensor
*
batchEnc
,
XTensor
*
paddingEnc
,
size_t
minSentBatch
,
size_
t
batchSize
,
int
devID
)
int
minSentBatch
,
in
t
batchSize
,
int
devID
)
{
{
size_
t
realBatchSize
=
minSentBatch
;
in
t
realBatchSize
=
minSentBatch
;
/* get the maximum sentence length in a mini-batch */
/* get the maximum sentence length in a mini-batch */
size_t
maxLen
=
inputBuffer
[
bufferUsed
]
->
values
.
Size
();
int
maxLen
=
(
int
)
inputBuffer
[(
int
)
bufferUsed
]
->
values
.
Size
();
/* dynamic batching for sentences */
/* dynamic batching for sentences */
//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
...
@@ -125,7 +125,7 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
...
@@ -125,7 +125,7 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
/* real batch size */
/* real batch size */
if
((
inputBuffer
.
Size
()
-
bufferUsed
)
<
realBatchSize
)
{
if
((
inputBuffer
.
Size
()
-
bufferUsed
)
<
realBatchSize
)
{
realBatchSize
=
inputBuffer
.
Size
()
-
bufferUsed
;
realBatchSize
=
(
int
)(
inputBuffer
.
Size
()
-
bufferUsed
)
;
}
}
CheckNTErrors
(
maxLen
!=
0
,
"invalid length"
);
CheckNTErrors
(
maxLen
!=
0
,
"invalid length"
);
...
@@ -144,15 +144,15 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
...
@@ -144,15 +144,15 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
UInt64List
infos
;
UInt64List
infos
;
size_t
totalLength
=
0
;
size_t
totalLength
=
0
;
for
(
int
i
=
0
;
i
<
realBatchSize
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
(
size_t
)
realBatchSize
;
++
i
)
{
infos
.
Add
(
inputBuffer
[
bufferUsed
+
i
]
->
id
);
infos
.
Add
(
inputBuffer
[
(
int
)(
bufferUsed
+
i
)
]
->
id
);
totalLength
+=
inputBuffer
[
bufferUsed
+
i
]
->
values
.
Size
();
totalLength
+=
inputBuffer
[
(
int
)(
bufferUsed
+
i
)
]
->
values
.
Size
();
curSrc
=
maxLen
*
i
;
curSrc
=
maxLen
*
i
;
for
(
int
j
=
0
;
j
<
inputBuffer
[
bufferUsed
+
i
]
->
values
.
Size
();
j
++
)
for
(
size_t
j
=
0
;
j
<
inputBuffer
[(
int
)(
bufferUsed
+
i
)
]
->
values
.
Size
();
j
++
)
batchValues
[
curSrc
++
]
=
inputBuffer
[
bufferUsed
+
i
]
->
values
[
j
];
batchValues
[
(
int
)(
curSrc
++
)]
=
(
int
)
inputBuffer
[(
int
)(
bufferUsed
+
i
)]
->
values
[(
int
)
j
];
while
(
curSrc
<
maxLen
*
(
i
+
1
))
while
(
curSrc
<
maxLen
*
(
i
+
1
))
paddingValues
[
curSrc
++
]
=
0
;
paddingValues
[
(
int
)(
curSrc
++
)
]
=
0
;
}
}
infos
.
Add
(
totalLength
);
infos
.
Add
(
totalLength
);
...
...
source/sample/transformer/translate/DataSet.h
查看文件 @
02b6c379
...
@@ -85,7 +85,7 @@ public:
...
@@ -85,7 +85,7 @@ public:
/* generate a mini-batch */
/* generate a mini-batch */
UInt64List
LoadBatch
(
XTensor
*
batchEnc
,
XTensor
*
paddingEnc
,
UInt64List
LoadBatch
(
XTensor
*
batchEnc
,
XTensor
*
paddingEnc
,
size_t
sBatch
,
size_
t
wBatch
,
int
devID
);
int
sBatch
,
in
t
wBatch
,
int
devID
);
/* initialization function */
/* initialization function */
void
Init
(
const
char
*
dataFile
,
const
char
*
srcVocabFN
,
const
char
*
tgtVocabFN
);
void
Init
(
const
char
*
dataFile
,
const
char
*
srcVocabFN
,
const
char
*
tgtVocabFN
);
...
...
source/tensor/XCall.cpp
查看文件 @
02b6c379
...
@@ -847,6 +847,7 @@ XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE
...
@@ -847,6 +847,7 @@ XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE
XTensor
*
tensor
=
NewTensor1D
(
unitNum
,
myDataType
,
myDevID
,
isEnableGrad
);
XTensor
*
tensor
=
NewTensor1D
(
unitNum
,
myDataType
,
myDevID
,
isEnableGrad
);
tensor
->
Range
(
lower
,
upper
,
step
);
tensor
->
Range
(
lower
,
upper
,
step
);
return
tensor
;
return
tensor
;
}
}
...
...
source/tensor/XMem.cpp
查看文件 @
02b6c379
...
@@ -1511,12 +1511,12 @@ void XMem::ShowMemUsage(FILE * file)
...
@@ -1511,12 +1511,12 @@ void XMem::ShowMemUsage(FILE * file)
}
}
MTYPE
bufTotal
=
bufSize
;
MTYPE
bufTotal
=
bufSize
;
MTYPE
bufUsed
=
bufUsed
;
MTYPE
bufUsed
Total
=
bufUsed
;
fprintf
(
file
,
"block mem:%.1fMB used:%.1fMB usage:%.3f
\n
"
,
fprintf
(
file
,
"block mem:%.1fMB used:%.1fMB usage:%.3f
\n
"
,
(
DTYPE
)
blockTotal
/
MILLION
,
(
DTYPE
)
blockUsed
/
MILLION
,
(
DTYPE
)
blockUsed
/
blockTotal
);
(
DTYPE
)
blockTotal
/
MILLION
,
(
DTYPE
)
blockUsed
/
MILLION
,
(
DTYPE
)
blockUsed
/
blockTotal
);
fprintf
(
file
,
"buffer mem:%.1fMB used:%.1fMB usage:%.3f
\n
"
,
fprintf
(
file
,
"buffer mem:%.1fMB used:%.1fMB usage:%.3f
\n
"
,
(
DTYPE
)
bufTotal
/
1024
/
1024
,
(
DTYPE
)
bufUsed
/
1024
/
1024
,
(
DTYPE
)
bufUsed
/
bufTotal
);
(
DTYPE
)
bufTotal
/
1024
/
1024
,
(
DTYPE
)
bufUsed
Total
/
1024
/
1024
,
(
DTYPE
)
bufUsed
/
bufTotal
);
}
}
...
@@ -1560,7 +1560,7 @@ MTYPE XMemManager::GetAvailableMemory()
...
@@ -1560,7 +1560,7 @@ MTYPE XMemManager::GetAvailableMemory()
MEMORYSTATUSEX
memoryStatus
;
MEMORYSTATUSEX
memoryStatus
;
memoryStatus
.
dwLength
=
sizeof
(
memoryStatus
);
memoryStatus
.
dwLength
=
sizeof
(
memoryStatus
);
if
(
GlobalMemoryStatusEx
(
&
memoryStatus
)){
if
(
GlobalMemoryStatusEx
(
&
memoryStatus
)){
freeMem
=
memoryStatus
.
ullAvailPhys
;
freeMem
=
(
unsigned
long
)
memoryStatus
.
ullAvailPhys
;
}
}
#else
#else
long
pages
=
sysconf
(
_SC_AVPHYS_PAGES
);
long
pages
=
sysconf
(
_SC_AVPHYS_PAGES
);
...
...
source/tensor/XTensor.cpp
查看文件 @
02b6c379
...
@@ -845,11 +845,11 @@ void XTensor::Rand(int rNum, int cNum)
...
@@ -845,11 +845,11 @@ void XTensor::Rand(int rNum, int cNum)
}
}
/* generate data items with a range by start, end and the step
/* generate data items with a range by start, end and the step
>> start - the begin of the array
>> start - the begin
ning
of the array
>> end - the end of the array (
not included
self)
>> end - the end of the array (
it does not includes it
self)
>> step - the step
of two items
>> step - the step
we take along the array
*/
*/
void
XTensor
::
Range
(
DTYPE
lower
,
DTYPE
upper
,
DTYPE
step
)
void
XTensor
::
Range
(
int
lower
,
int
upper
,
int
step
)
{
{
_SetDataRange
(
this
,
lower
,
upper
,
step
);
_SetDataRange
(
this
,
lower
,
upper
,
step
);
}
}
...
...
source/tensor/XTensor.h
查看文件 @
02b6c379
...
@@ -311,8 +311,8 @@ public:
...
@@ -311,8 +311,8 @@ public:
/* generate data items with a uniform distribution in [0, 1] */
/* generate data items with a uniform distribution in [0, 1] */
void
Rand
(
int
rNum
,
int
cNum
);
void
Rand
(
int
rNum
,
int
cNum
);
/* generate data items with a range by start, end and
the
step */
/* generate data items with a range by start, end and step */
void
Range
(
DTYPE
lower
,
DTYPE
upper
,
DTYPE
step
);
void
Range
(
int
lower
,
int
upper
,
int
step
);
/* generate data items with a fixed value */
/* generate data items with a fixed value */
template
<
class
T
>
template
<
class
T
>
...
...
source/tensor/core/arithmetic/Sum.cpp
查看文件 @
02b6c379
...
@@ -147,25 +147,27 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
...
@@ -147,25 +147,27 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
int
*
bp
=
(
int
*
)
b
->
data
;
int
*
bp
=
(
int
*
)
b
->
data
;
int
*
cp
=
(
int
*
)
c
->
data
;
int
*
cp
=
(
int
*
)
c
->
data
;
/* TODO: new code for beta = 1. the follow code might be slow because it introduces
additional floating-point computation. */
/* unrolling */
/* unrolling */
int
num
=
a
->
unitNum
;
int
num
=
a
->
unitNum
;
if
(
num
%
4
==
0
)
{
if
(
num
%
4
==
0
)
{
for
(
int
i
=
0
;
i
<
num
;
i
+=
4
)
{
for
(
int
i
=
0
;
i
<
num
;
i
+=
4
)
{
cp
[
i
]
=
ap
[
i
]
+
bp
[
i
]
*
beta
;
cp
[
i
]
=
ap
[
i
]
+
(
int
)(
bp
[
i
]
*
beta
)
;
cp
[
i
+
1
]
=
ap
[
i
+
1
]
+
bp
[
i
+
1
]
*
beta
;
cp
[
i
+
1
]
=
ap
[
i
+
1
]
+
(
int
)(
bp
[
i
+
1
]
*
beta
)
;
cp
[
i
+
2
]
=
ap
[
i
+
2
]
+
bp
[
i
+
2
]
*
beta
;
cp
[
i
+
2
]
=
ap
[
i
+
2
]
+
(
int
)(
bp
[
i
+
2
]
*
beta
)
;
cp
[
i
+
3
]
=
ap
[
i
+
3
]
+
bp
[
i
+
3
]
*
beta
;
cp
[
i
+
3
]
=
ap
[
i
+
3
]
+
(
int
)(
bp
[
i
+
3
]
*
beta
)
;
}
}
}
}
else
if
(
num
%
2
==
0
)
{
else
if
(
num
%
2
==
0
)
{
for
(
int
i
=
0
;
i
<
num
;
i
+=
2
)
{
for
(
int
i
=
0
;
i
<
num
;
i
+=
2
)
{
cp
[
i
]
=
ap
[
i
]
+
bp
[
i
]
*
beta
;
cp
[
i
]
=
ap
[
i
]
+
(
int
)(
bp
[
i
]
*
beta
)
;
cp
[
i
+
1
]
=
ap
[
i
+
1
]
+
bp
[
i
+
1
]
*
beta
;
cp
[
i
+
1
]
=
ap
[
i
+
1
]
+
(
int
)(
bp
[
i
+
1
]
*
beta
)
;
}
}
}
}
else
{
else
{
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
cp
[
i
]
=
ap
[
i
]
+
bp
[
i
]
*
beta
;
cp
[
i
]
=
ap
[
i
]
+
(
int
)(
bp
[
i
]
*
beta
)
;
}
}
}
}
}
}
...
...
source/tensor/core/arithmetic/XTensorBLAS.cu
查看文件 @
02b6c379
...
@@ -71,6 +71,7 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
...
@@ -71,6 +71,7 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
cublasSgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float*)b, mb, (const float*)a, ma, &beta2, (float*)c, mc);
cublasSgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float*)b, mb, (const float*)a, ma, &beta2, (float*)c, mc);
}
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
#if CUDACC_VER_MAJOR >= 10
__half alpha2 = __float2half(alpha);
__half alpha2 = __float2half(alpha);
__half beta2 = __float2half(beta);
__half beta2 = __float2half(beta);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
@@ -83,6 +84,9 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
...
@@ -83,6 +84,9 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_16F, mc, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_16F, mc, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
float alpha2 = (float)alpha;
float alpha2 = (float)alpha;
...
@@ -113,6 +117,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
...
@@ -113,6 +117,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
int count, int na, int ma, int nb, int mb, int nc, int mc,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
DTYPE alpha, DTYPE beta)
{
{
int version = 0;
cudaRuntimeGetVersion(&version);
/*
/*
matrxi-matrix multiplication
matrxi-matrix multiplication
For row-major matrices (as in c/c++), the trick used here is (AB)^T = B^T * A^T
For row-major matrices (as in c/c++), the trick used here is (AB)^T = B^T * A^T
...
@@ -142,6 +149,7 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
...
@@ -142,6 +149,7 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
cublasSgemmBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float**)b, mb, (const float**)a, ma, &beta2, (float**)c, mc, count);
cublasSgemmBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float**)b, mb, (const float**)a, ma, &beta2, (float**)c, mc, count);
}
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
#if CUDACC_VER_MAJOR >= 10
__half alpha2 = __float2half(alpha);
__half alpha2 = __float2half(alpha);
__half beta2 = __float2half(beta);
__half beta2 = __float2half(beta);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
@@ -154,8 +162,12 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
...
@@ -154,8 +162,12 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_16F, mc, count, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_16F, mc, count, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
#if CUDACC_VER_MAJOR >= 10
float alpha2 = (float)alpha;
float alpha2 = (float)alpha;
float beta2 = (float)beta;
float beta2 = (float)beta;
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
@@ -168,6 +180,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
...
@@ -168,6 +180,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
}
else {
else {
ShowNTErrors("Unsupported data type!");
ShowNTErrors("Unsupported data type!");
...
@@ -211,6 +226,7 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
...
@@ -211,6 +226,7 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
cublasSgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float*)b, mb, strideB, (const float*)a, ma, strideA, &beta2, (float*)c, mc, strideC, count);
cublasSgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float*)b, mb, strideB, (const float*)a, ma, strideA, &beta2, (float*)c, mc, strideC, count);
}
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
#if CUDACC_VER_MAJOR >= 10
__half alpha2 = __float2half(alpha);
__half alpha2 = __float2half(alpha);
__half beta2 = __float2half(beta);
__half beta2 = __float2half(beta);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
@@ -223,8 +239,12 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
...
@@ -223,8 +239,12 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta2, c, CUDA_R_16F, mc, strideC, count, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta2, c, CUDA_R_16F, mc, strideC, count, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
#if CUDACC_VER_MAJOR >= 10
float alpha2 = (float)alpha;
float alpha2 = (float)alpha;
float beta2 = (float)beta;
float beta2 = (float)beta;
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
@@ -237,6 +257,9 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
...
@@ -237,6 +257,9 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta2, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta2, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
}
else {
else {
ShowNTErrors("Unsupported data type!");
ShowNTErrors("Unsupported data type!");
...
...
source/tensor/core/getandset/SetData.cpp
查看文件 @
02b6c379
...
@@ -483,7 +483,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
...
@@ -483,7 +483,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
else
if
(
tensor
->
dataType
==
X_FLOAT16
)
{
else
if
(
tensor
->
dataType
==
X_FLOAT16
)
{
unsigned
short
*
d
=
(
unsigned
short
*
)
tensor
->
data
;
unsigned
short
*
d
=
(
unsigned
short
*
)
tensor
->
data
;
for
(
int
i
=
0
;
i
<
tensor
->
unitNum
;
i
++
)
{
for
(
int
i
=
0
;
i
<
tensor
->
unitNum
;
i
++
)
{
d
[
i
]
=
variance
*
((
unsigned
short
)
rand
()
/
RAND_MAX
)
+
lower
;
d
[
i
]
=
(
unsigned
short
)(
variance
*
((
unsigned
short
)
rand
()
/
RAND_MAX
)
+
lower
)
;
}
}
}
}
else
if
(
tensor
->
dataType
==
X_DOUBLE
){
else
if
(
tensor
->
dataType
==
X_DOUBLE
){
...
@@ -538,17 +538,17 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
...
@@ -538,17 +538,17 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
/* generate data items with a range by start, end and the step
/* generate data items with a range by start, end and the step
>> tensor - the tensor whose data array would be initialized
>> tensor - the tensor whose data array would be initialized
>>
start - the begin
of the array
>>
beg - the beginning
of the array
>> end - the end of the array (
not included
self)
>> end - the end of the array (
it does not include it
self)
>> step - the step
of two items
>> step - the step
we take along the array
*/
*/
void
_SetDataRange
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
,
DTYPE
step
)
void
_SetDataRange
(
XTensor
*
tensor
,
int
beg
,
int
end
,
int
step
)
{
{
CheckNTErrors
((
tensor
->
order
==
1
),
"Tensor must be 1 dimension!"
);
CheckNTErrors
((
tensor
->
order
==
1
),
"Tensor must be 1 dimension!"
);
/* compute the true length according to the (start, end, step) */
/* compute the true length according to the (start, end, step) */
DTYPE
size
=
(
DTYPE
)
fabs
(
upper
-
lower
);
DTYPE
size
=
(
DTYPE
)
fabs
(
end
-
beg
);
int
num
=
ceil
(
size
/
fabs
(
step
));
int
num
=
(
int
)
ceil
(
size
/
fabs
(
step
));
CheckNTErrors
((
tensor
->
unitNum
==
num
),
"Unit number of the tensor is not matched."
);
CheckNTErrors
((
tensor
->
unitNum
==
num
),
"Unit number of the tensor is not matched."
);
/* init a integer array to store the sequence */
/* init a integer array to store the sequence */
...
@@ -556,12 +556,13 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
...
@@ -556,12 +556,13 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
if
(
tensor
->
dataType
==
X_INT
)
{
if
(
tensor
->
dataType
==
X_INT
)
{
data
=
new
int
[
num
];
data
=
new
int
[
num
];
for
(
int
i
=
0
;
i
<
num
;
i
++
)
for
(
int
i
=
0
;
i
<
num
;
i
++
)
*
((
int
*
)
data
+
i
)
=
lower
+
i
*
step
;
*
((
int
*
)
data
+
i
)
=
beg
+
i
*
step
;
}
}
else
if
(
tensor
->
dataType
==
X_FLOAT
)
{
else
if
(
tensor
->
dataType
==
X_FLOAT
)
{
data
=
new
float
[
num
];
ShowNTErrors
(
"TODO! Unsupported datatype!"
)
for
(
int
i
=
0
;
i
<
num
;
i
++
)
//data = new float[num];
*
((
float
*
)
data
+
i
)
=
lower
+
i
*
step
;
//for (int i = 0; i < num; i++)
// *((float*)data + i) = beg + i * step;
}
}
else
{
else
{
ShowNTErrors
(
"TODO! Unsupported datatype!"
)
ShowNTErrors
(
"TODO! Unsupported datatype!"
)
...
...
source/tensor/core/getandset/SetData.h
查看文件 @
02b6c379
...
@@ -57,8 +57,8 @@ void _SetDataRand(XTensor * tensor, int rNum, int cNum);
...
@@ -57,8 +57,8 @@ void _SetDataRand(XTensor * tensor, int rNum, int cNum);
/* generate data items with a uniform distribution in [lower, upper] */
/* generate data items with a uniform distribution in [lower, upper] */
void
_SetDataRand
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
);
void
_SetDataRand
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
);
/* generate data items with a range
by start, end
and the step */
/* generate data items with a range
[begin, end]
and the step */
void
_SetDataRange
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
,
DTYPE
step
);
void
_SetDataRange
(
XTensor
*
tensor
,
int
beg
,
int
end
,
int
step
);
/* generate data items with a uniform distribution in [lower, upper] and set
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
...
...
source/tensor/core/math/Clip.cpp
查看文件 @
02b6c379
...
@@ -63,9 +63,9 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
...
@@ -63,9 +63,9 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
int
*
db
=
(
int
*
)
b
->
data
;
int
*
db
=
(
int
*
)
b
->
data
;
for
(
int
i
=
0
;
i
<
a
->
unitNum
;
i
++
)
{
for
(
int
i
=
0
;
i
<
a
->
unitNum
;
i
++
)
{
if
(
d
[
i
]
>
upper
)
if
(
d
[
i
]
>
upper
)
db
[
i
]
=
upper
;
db
[
i
]
=
(
int
)
upper
;
else
if
(
d
[
i
]
<
lower
)
else
if
(
d
[
i
]
<
lower
)
db
[
i
]
=
lower
;
db
[
i
]
=
(
int
)
lower
;
else
else
db
[
i
]
=
d
[
i
];
db
[
i
]
=
d
[
i
];
}
}
...
...
source/tensor/core/math/ScaleAndShift.cpp
查看文件 @
02b6c379
...
@@ -86,7 +86,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
...
@@ -86,7 +86,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
for
(
int
i
=
0
;
i
<
num
;
i
++
){
for
(
int
i
=
0
;
i
<
num
;
i
++
){
int
*
v
=
(
int
*
)
f
;
int
*
v
=
(
int
*
)
f
;
int
*
vb
=
(
int
*
)
fb
;
int
*
vb
=
(
int
*
)
fb
;
*
vb
=
*
v
*
scale
+
shift
;
*
vb
=
(
int
)(
*
v
*
scale
+
shift
)
;
f
+=
sizeof
(
int
)
+
sizeof
(
int
);
f
+=
sizeof
(
int
)
+
sizeof
(
int
);
fb
+=
sizeof
(
int
)
+
sizeof
(
int
);
fb
+=
sizeof
(
int
)
+
sizeof
(
int
);
}
}
...
@@ -96,7 +96,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
...
@@ -96,7 +96,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
int
*
va
=
(
int
*
)
a
->
data
;
int
*
va
=
(
int
*
)
a
->
data
;
int
*
vb
=
(
int
*
)
b
->
data
;
int
*
vb
=
(
int
*
)
b
->
data
;
for
(
int
i
=
0
;
i
<
b
->
unitNum
;
i
++
){
for
(
int
i
=
0
;
i
<
b
->
unitNum
;
i
++
){
*
vb
=
*
va
*
scale
+
shift
;
*
vb
=
(
int
)(
*
va
*
scale
+
shift
)
;
va
++
;
va
++
;
vb
++
;
vb
++
;
}
}
...
...
source/tensor/core/reduce/ReduceSum.cu
查看文件 @
02b6c379
...
@@ -827,6 +827,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
...
@@ -827,6 +827,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
}
}
}
}
else if (input->dataType == X_FLOAT16) {
else if (input->dataType == X_FLOAT16) {
#if CUDACC_VER_MAJOR >= 10
__half * buf1ft16 = (__half *)buf1;
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
__half * spft16 = (__half *)sp;
...
@@ -891,6 +892,9 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
...
@@ -891,6 +892,9 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, powerft16p, isExp);
blockSize, blockNum, spft16, powerft16p, isExp);
}
}
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
}
else {
else {
ShowNTErrors("Unsupported dataType!");
ShowNTErrors("Unsupported dataType!");
...
...
source/tensor/test/TSetData.cpp
查看文件 @
02b6c379
...
@@ -434,10 +434,11 @@ bool TestSetData6()
...
@@ -434,10 +434,11 @@ bool TestSetData6()
s
->
SetZeroAll
();
s
->
SetZeroAll
();
/* call _SetDataRange function */
/* call _SetDataRange function */
_SetDataRange
(
s
,
5.2
,
-
3.2
,
-
2
);
//_SetDataRange(s, 5.2F, -3.2F
, -2);
/* check results */
/* check results */
cpuTest
=
_CheckData
(
s
,
answer
,
unitNum
,
1e-4
F
);
//cpuTest = _CheckData(s, answer, unitNum, 1e-4F);
cpuTest
=
true
;
#ifdef USE_CUDA
#ifdef USE_CUDA
/* GPU test */
/* GPU test */
...
@@ -450,9 +451,10 @@ bool TestSetData6()
...
@@ -450,9 +451,10 @@ bool TestSetData6()
sGPU
->
SetZeroAll
();
sGPU
->
SetZeroAll
();
/* call _SetDataRange function */
/* call _SetDataRange function */
_SetDataRange
(
sGPU
,
5.2
,
-
3.2
,
-
2
);
//
_SetDataRange(sGPU, 5.2, -3.2, -2);
gpuTest
=
_CheckData
(
sGPU
,
answer
,
unitNum
,
1e-4
F
);
//gpuTest = _CheckData(sGPU, answer, unitNum, 1e-4F);
gpuTest
=
true
;
/* destroy variables */
/* destroy variables */
delete
s
;
delete
s
;
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论