Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
8
Issues
8
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
NiuTrans
NiuTrans.Tensor
Commits
02b6c379
Commit
02b6c379
authored
Feb 21, 2021
by
xiaotong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
bug fixes and removing warnings
parent
5f9867fc
隐藏空白字符变更
内嵌
并排
正在显示
20 个修改的文件
包含
133 行增加
和
98 行删除
+133
-98
source/sample/transformer/Model.cpp
+1
-1
source/sample/transformer/Utility.cpp
+6
-6
source/sample/transformer/module/Attention.cpp
+1
-1
source/sample/transformer/module/LayerHistory.cpp
+2
-2
source/sample/transformer/train/TrainDataSet.cpp
+33
-31
source/sample/transformer/train/Trainer.cpp
+5
-5
source/sample/transformer/translate/DataSet.cpp
+14
-14
source/sample/transformer/translate/DataSet.h
+1
-1
source/tensor/XCall.cpp
+1
-0
source/tensor/XMem.cpp
+3
-3
source/tensor/XTensor.cpp
+4
-4
source/tensor/XTensor.h
+2
-2
source/tensor/core/arithmetic/Sum.cpp
+9
-7
source/tensor/core/arithmetic/XTensorBLAS.cu
+23
-0
source/tensor/core/getandset/SetData.cpp
+12
-11
source/tensor/core/getandset/SetData.h
+2
-2
source/tensor/core/math/Clip.cpp
+2
-2
source/tensor/core/math/ScaleAndShift.cpp
+2
-2
source/tensor/core/reduce/ReduceSum.cu
+4
-0
source/tensor/test/TSetData.cpp
+6
-4
没有找到文件。
source/sample/transformer/Model.cpp
查看文件 @
02b6c379
...
...
@@ -490,7 +490,7 @@ void Model::Read(FILE* file)
TensorList
params
;
GetParams
(
params
);
LOG
(
"params count: %lu"
,
params
.
Size
());
LOG
(
"params count: %lu"
,
(
unsigned
long
)
params
.
Size
());
int
size
=
0
;
for
(
int
i
=
0
;
i
<
params
.
Size
();
i
++
)
{
size
+=
params
[
i
]
->
unitNum
;
...
...
source/sample/transformer/Utility.cpp
查看文件 @
02b6c379
...
...
@@ -91,9 +91,9 @@ Config::Config(int argc, const char** argv)
LoadParamInt
(
argsNum
,
args
,
"sbatch"
,
&
sBatchSize
,
8
);
isTraining
=
(
strcmp
(
trainFN
,
""
)
==
0
)
?
false
:
true
;
LoadParamBool
(
argsNum
,
args
,
"mt"
,
&
isMT
,
true
);
LoadParamFloat
(
argsNum
,
args
,
"dropout"
,
&
dropout
,
0.3
);
LoadParamFloat
(
argsNum
,
args
,
"fnndrop"
,
&
fnnDropout
,
0.1
);
LoadParamFloat
(
argsNum
,
args
,
"attdrop"
,
&
attDropout
,
0.1
);
LoadParamFloat
(
argsNum
,
args
,
"dropout"
,
&
dropout
,
0.3
F
);
LoadParamFloat
(
argsNum
,
args
,
"fnndrop"
,
&
fnnDropout
,
0.1
F
);
LoadParamFloat
(
argsNum
,
args
,
"attdrop"
,
&
attDropout
,
0.1
F
);
LoadParamFloat
(
argc
,
args
,
"lrate"
,
&
lrate
,
0.0015
F
);
LoadParamFloat
(
argc
,
args
,
"lrbias"
,
&
lrbias
,
0
);
...
...
@@ -106,7 +106,7 @@ Config::Config(int argc, const char** argv)
LoadParamFloat
(
argc
,
args
,
"adambeta2"
,
&
adamBeta2
,
0.98
F
);
LoadParamFloat
(
argc
,
args
,
"adamdelta"
,
&
adamDelta
,
1e-9
F
);
LoadParamBool
(
argc
,
args
,
"shuffled"
,
&
isShuffled
,
true
);
LoadParamFloat
(
argc
,
args
,
"labelsmoothing"
,
&
labelSmoothingP
,
0.1
);
LoadParamFloat
(
argc
,
args
,
"labelsmoothing"
,
&
labelSmoothingP
,
0.1
F
);
LoadParamInt
(
argc
,
args
,
"nstepcheckpoint"
,
&
nStepCheckpoint
,
-
1
);
LoadParamBool
(
argc
,
args
,
"epochcheckpoint"
,
&
useEpochCheckpoint
,
true
);
LoadParamInt
(
argc
,
args
,
"updatestep"
,
&
updateStep
,
1
);
...
...
@@ -124,8 +124,8 @@ Config::Config(int argc, const char** argv)
LoadParamString
(
argsNum
,
args
,
"output"
,
outputFN
,
""
);
LoadParamInt
(
argsNum
,
args
,
"beamsize"
,
&
beamSize
,
1
);
LoadParamBool
(
argsNum
,
args
,
"fp16"
,
&
useFP16
,
false
);
LoadParamFloat
(
argsNum
,
args
,
"lenalpha"
,
&
lenAlpha
,
0.6
);
LoadParamFloat
(
argsNum
,
args
,
"maxlenalpha"
,
&
maxLenAlpha
,
1.2
);
LoadParamFloat
(
argsNum
,
args
,
"lenalpha"
,
&
lenAlpha
,
0.6
F
);
LoadParamFloat
(
argsNum
,
args
,
"maxlenalpha"
,
&
maxLenAlpha
,
1.2
F
);
for
(
int
i
=
0
;
i
<
argc
;
i
++
)
delete
[]
args
[
i
];
...
...
source/sample/transformer/module/Attention.cpp
查看文件 @
02b6c379
...
...
@@ -255,7 +255,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey
=
ConvertDataType
(
relativeKey
,
X_FLOAT
);
}
float
scaling
=
sqrt
(
d
/
nhead
);
float
scaling
=
(
float
)
sqrt
(
d
/
nhead
);
qheads
=
ScaleAndShift
(
qheads
,
1.0
F
/
scaling
);
dot
=
RPDotProduct
(
qheads
,
kheads
,
relativeKey
,
true
);
...
...
source/sample/transformer/module/LayerHistory.cpp
查看文件 @
02b6c379
...
...
@@ -92,10 +92,10 @@ generate the weight sum vector of all previous layer output in the history as th
XTensor
LayerHistory
::
Pop
()
{
/* the number of layer output in the history */
size_t
size
=
history
.
Size
();
int
size
=
(
int
)
history
.
Size
();
TensorList
historyList
;
for
(
size_t
i
=
0
;
i
<
size
;
i
++
)
for
(
int
i
=
0
;
i
<
size
;
i
++
)
historyList
.
Add
(
history
[
i
]);
/* we need to stack the tensors along the first dim */
...
...
source/sample/transformer/train/TrainDataSet.cpp
查看文件 @
02b6c379
...
...
@@ -134,13 +134,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
UInt64List
info
;
size_t
srcTokenNum
=
0
;
size_t
tgtTokenNum
=
0
;
int
realBatchSize
=
1
;
size_t
realBatchSize
=
1
;
if
(
!
isTraining
)
realBatchSize
=
minSentBatch
;
/* get the maximum source sentence length in a mini-batch */
size_t
maxSrcLen
=
buffer
[
curIdx
]
->
srcSent
.
Size
();
size_t
maxSrcLen
=
buffer
[
(
int
)
curIdx
]
->
srcSent
.
Size
();
/* max batch size */
const
int
MAX_BATCH_SIZE
=
512
;
...
...
@@ -150,9 +150,9 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
while
((
realBatchSize
<
(
buffer
.
Size
()
-
curIdx
))
&&
(
realBatchSize
*
maxSrcLen
<
batchSize
)
&&
(
realBatchSize
<
MAX_BATCH_SIZE
)
&&
(
realBatchSize
*
buffer
[
curIdx
+
realBatchSize
]
->
srcSent
.
Size
()
<
batchSize
))
{
if
(
maxSrcLen
<
buffer
[
curIdx
+
realBatchSize
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
curIdx
+
realBatchSize
]
->
srcSent
.
Size
();
&&
(
realBatchSize
*
buffer
[
(
int
)(
curIdx
+
realBatchSize
)
]
->
srcSent
.
Size
()
<
batchSize
))
{
if
(
maxSrcLen
<
buffer
[
(
int
)(
curIdx
+
realBatchSize
)
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
(
int
)(
curIdx
+
realBatchSize
)
]
->
srcSent
.
Size
();
realBatchSize
++
;
}
}
...
...
@@ -165,14 +165,14 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
CheckNTErrors
(
realBatchSize
>
0
,
"Invalid batch size"
);
/* get the maximum target sentence length in a mini-batch */
size_t
maxTgtLen
=
buffer
[
curIdx
]
->
tgtSent
.
Size
();
size_t
maxTgtLen
=
buffer
[
(
int
)
curIdx
]
->
tgtSent
.
Size
();
for
(
size_t
i
=
0
;
i
<
realBatchSize
;
i
++
)
{
if
(
maxTgtLen
<
buffer
[
curIdx
+
i
]
->
tgtSent
.
Size
())
maxTgtLen
=
buffer
[
curIdx
+
i
]
->
tgtSent
.
Size
();
if
(
maxTgtLen
<
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
.
Size
())
maxTgtLen
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
.
Size
();
}
for
(
size_t
i
=
0
;
i
<
realBatchSize
;
i
++
)
{
if
(
maxSrcLen
<
buffer
[
curIdx
+
i
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
curIdx
+
i
]
->
srcSent
.
Size
();
if
(
maxSrcLen
<
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
.
Size
();
}
CheckNTErrors
(
maxSrcLen
!=
0
,
"Invalid source length for batching"
);
...
...
@@ -204,19 +204,19 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
*/
for
(
int
i
=
0
;
i
<
realBatchSize
;
++
i
)
{
srcTokenNum
+=
buffer
[
curIdx
+
i
]
->
srcSent
.
Size
();
tgtTokenNum
+=
buffer
[
curIdx
+
i
]
->
tgtSent
.
Size
();
srcTokenNum
+=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
.
Size
();
tgtTokenNum
+=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
.
Size
();
curSrc
=
maxSrcLen
*
i
;
for
(
int
j
=
0
;
j
<
buffer
[
curIdx
+
i
]
->
srcSent
.
Size
();
j
++
)
{
batchEncValues
[
curSrc
++
]
=
buffer
[
curIdx
+
i
]
->
srcSent
[
j
];
for
(
int
j
=
0
;
j
<
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
.
Size
();
j
++
)
{
batchEncValues
[
curSrc
++
]
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
srcSent
[
j
];
}
curTgt
=
maxTgtLen
*
i
;
for
(
int
j
=
0
;
j
<
buffer
[
curIdx
+
i
]
->
tgtSent
.
Size
();
j
++
)
{
for
(
int
j
=
0
;
j
<
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
.
Size
();
j
++
)
{
if
(
j
>
0
)
labelVaues
[
curTgt
-
1
]
=
buffer
[
curIdx
+
i
]
->
tgtSent
[
j
];
batchDecValues
[
curTgt
++
]
=
buffer
[
curIdx
+
i
]
->
tgtSent
[
j
];
labelVaues
[
curTgt
-
1
]
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
[
j
];
batchDecValues
[
curTgt
++
]
=
buffer
[
(
int
)(
curIdx
+
i
)
]
->
tgtSent
[
j
];
}
labelVaues
[
curTgt
-
1
]
=
EOS
;
while
(
curSrc
<
maxSrcLen
*
(
i
+
1
))
...
...
@@ -226,11 +226,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
}
InitTensor2D
(
batchEnc
,
realBatchSize
,
maxSrcLen
,
X_INT
,
devID
);
InitTensor2D
(
paddingEnc
,
realBatchSize
,
maxSrcLen
,
X_FLOAT
,
devID
);
InitTensor2D
(
batchDec
,
realBatchSize
,
maxTgtLen
,
X_INT
,
devID
);
InitTensor2D
(
paddingDec
,
realBatchSize
,
maxTgtLen
,
X_FLOAT
,
devID
);
InitTensor2D
(
label
,
realBatchSize
,
maxTgtLen
,
X_INT
,
devID
);
int
rbs
=
(
int
)
realBatchSize
;
int
msl
=
(
int
)
maxSrcLen
;
InitTensor2D
(
batchEnc
,
rbs
,
msl
,
X_INT
,
devID
);
InitTensor2D
(
paddingEnc
,
rbs
,
msl
,
X_FLOAT
,
devID
);
InitTensor2D
(
batchDec
,
rbs
,
msl
,
X_INT
,
devID
);
InitTensor2D
(
paddingDec
,
rbs
,
msl
,
X_FLOAT
,
devID
);
InitTensor2D
(
label
,
rbs
,
msl
,
X_INT
,
devID
);
curIdx
+=
realBatchSize
;
...
...
@@ -304,14 +306,14 @@ void TrainDataSet::BuildBucket()
size_t
sentNum
=
1
;
/* get the maximum source sentence length in a bucket */
size_t
maxSrcLen
=
buffer
[
idx
]
->
srcSent
.
Size
();
size_t
maxSrcLen
=
buffer
[
(
int
)
idx
]
->
srcSent
.
Size
();
/* bucketing for sentences */
while
((
sentNum
<
(
buffer
.
Size
()
-
idx
))
&&
(
sentNum
*
maxSrcLen
<
bucketSize
)
&&
(
sentNum
*
buffer
[
curIdx
+
sentNum
]
->
srcSent
.
Size
()
<
bucketSize
))
{
if
(
maxSrcLen
<
buffer
[
idx
+
sentNum
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
idx
+
sentNum
]
->
srcSent
.
Size
();
&&
(
sentNum
*
buffer
[
(
int
)(
curIdx
+
sentNum
)
]
->
srcSent
.
Size
()
<
bucketSize
))
{
if
(
maxSrcLen
<
buffer
[
(
int
)(
idx
+
sentNum
)
]
->
srcSent
.
Size
())
maxSrcLen
=
buffer
[
(
int
)(
idx
+
sentNum
)
]
->
srcSent
.
Size
();
sentNum
++
;
}
...
...
@@ -324,7 +326,7 @@ void TrainDataSet::BuildBucket()
/* shuffle items in a bucket */
for
(
size_t
i
=
0
;
i
<
sentNum
;
i
++
)
{
buffer
[
idx
+
i
]
->
bucketKey
=
randomKey
;
buffer
[
(
int
)(
idx
+
i
)
]
->
bucketKey
=
randomKey
;
}
idx
+=
sentNum
;
...
...
@@ -335,13 +337,13 @@ void TrainDataSet::BuildBucket()
idx
=
0
;
while
(
idx
<
buffer
.
Size
())
{
size_t
sentNum
=
0
;
int
bucketKey
=
buffer
[
idx
+
sentNum
]
->
bucketKey
;
int
bucketKey
=
buffer
[
(
int
)(
idx
+
sentNum
)
]
->
bucketKey
;
while
(
sentNum
<
(
buffer
.
Size
()
-
idx
)
&&
buffer
[
idx
+
sentNum
]
->
bucketKey
==
bucketKey
)
{
buffer
[
idx
+
sentNum
]
->
key
=
buffer
[
idx
+
sentNum
]
->
srcSent
.
Size
();
&&
buffer
[
(
int
)(
idx
+
sentNum
)
]
->
bucketKey
==
bucketKey
)
{
buffer
[
(
int
)(
idx
+
sentNum
)]
->
key
=
(
int
)
buffer
[(
int
)(
idx
+
sentNum
)
]
->
srcSent
.
Size
();
sentNum
++
;
}
SortInBucket
(
idx
,
idx
+
sentNum
);
SortInBucket
(
(
int
)
idx
,
(
int
)(
idx
+
sentNum
)
);
idx
+=
sentNum
;
}
}
...
...
source/sample/transformer/train/Trainer.cpp
查看文件 @
02b6c379
...
...
@@ -163,8 +163,8 @@ void Trainer::Train(const char* fn, const char* validFN,
UInt64List
info
=
batchLoader
.
LoadBatch
(
&
batchEnc
,
&
paddingEnc
,
&
batchDec
,
&
paddingDec
,
&
label
,
sBatchSize
,
wBatchSize
,
devID
);
wc
=
info
[
0
];
ws
=
info
[
1
];
wc
=
(
int
)
info
[
0
];
ws
=
(
int
)
info
[
1
];
CheckNTErrors
(
batchEnc
.
order
==
2
,
"wrong tensor order of the sequence batch"
);
/* output probabilities */
...
...
@@ -206,7 +206,7 @@ void Trainer::Train(const char* fn, const char* validFN,
if
(
gradStep
==
updateStep
)
{
float
warmupEndLR
=
lrate
;
float
warmupInitLR
=
1e-7
;
float
warmupInitLR
=
1e-7
F
;
float
lrStep
=
(
warmupEndLR
-
warmupInitLR
)
/
nwarmup
;
float
decayFactor
=
warmupEndLR
*
pow
(
float
(
nwarmup
),
0.5
F
);
...
...
@@ -320,8 +320,8 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
UInt64List
info
=
batchLoader
.
LoadBatch
(
&
batchEnc
,
&
paddingEnc
,
&
batchDec
,
&
paddingDec
,
&
label
,
sBatchSize
,
0
,
model
->
devID
);
wc
=
info
[
0
];
ws
=
info
[
1
];
wc
=
(
int
)
info
[
0
];
ws
=
(
int
)
info
[
1
];
CheckNTErrors
(
batchEnc
.
order
==
2
,
"Wrong tensor order of the sequence batch"
);
/* make the network */
...
...
source/sample/transformer/translate/DataSet.cpp
查看文件 @
02b6c379
...
...
@@ -70,10 +70,10 @@ void DataSet::LoadDataToBuffer()
size_t
maxLen
=
indices
.
Size
()
>
MAX_WORD_NUM
?
MAX_WORD_NUM
:
indices
.
Size
();
for
(
size_t
i
=
0
;
i
<
maxLen
;
i
++
)
{
auto
offset
=
(
i
!=
(
indices
.
Size
()
-
1
))
?
indices
[
i
+
1
]
-
indices
[
i
]
-
tokenDelimiter
.
size
()
:
line
.
size
()
-
indices
[
i
];
string
word
=
line
.
substr
(
indices
[
i
],
offset
);
size_t
offset
=
(
i
!=
(
indices
.
Size
()
-
1
))
?
(
size_t
)
indices
[(
int
)
i
+
1
]
-
(
size_t
)
indices
[(
int
)
i
]
-
tokenDelimiter
.
size
()
:
line
.
size
()
-
(
size_t
)
indices
[(
int
)
i
];
string
word
=
line
.
substr
(
(
size_t
)
indices
[(
int
)
i
],
offset
);
if
(
srcVocab
.
word2id
.
find
(
word
)
==
srcVocab
.
word2id
.
end
())
values
.
Add
(
UNK
);
else
...
...
@@ -110,12 +110,12 @@ load a mini-batch to the device (for translating)
<< indices of the sentences
*/
UInt64List
DataSet
::
LoadBatch
(
XTensor
*
batchEnc
,
XTensor
*
paddingEnc
,
size_t
minSentBatch
,
size_t
batchSize
,
int
devID
)
int
minSentBatch
,
int
batchSize
,
int
devID
)
{
size_t
realBatchSize
=
minSentBatch
;
int
realBatchSize
=
minSentBatch
;
/* get the maximum sentence length in a mini-batch */
size_t
maxLen
=
inputBuffer
[
bufferUsed
]
->
values
.
Size
();
int
maxLen
=
(
int
)
inputBuffer
[(
int
)
bufferUsed
]
->
values
.
Size
();
/* dynamic batching for sentences */
//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
...
...
@@ -125,7 +125,7 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
/* real batch size */
if
((
inputBuffer
.
Size
()
-
bufferUsed
)
<
realBatchSize
)
{
realBatchSize
=
inputBuffer
.
Size
()
-
bufferUsed
;
realBatchSize
=
(
int
)(
inputBuffer
.
Size
()
-
bufferUsed
)
;
}
CheckNTErrors
(
maxLen
!=
0
,
"invalid length"
);
...
...
@@ -144,15 +144,15 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
UInt64List
infos
;
size_t
totalLength
=
0
;
for
(
int
i
=
0
;
i
<
realBatchSize
;
++
i
)
{
infos
.
Add
(
inputBuffer
[
bufferUsed
+
i
]
->
id
);
totalLength
+=
inputBuffer
[
bufferUsed
+
i
]
->
values
.
Size
();
for
(
size_t
i
=
0
;
i
<
(
size_t
)
realBatchSize
;
++
i
)
{
infos
.
Add
(
inputBuffer
[
(
int
)(
bufferUsed
+
i
)
]
->
id
);
totalLength
+=
inputBuffer
[
(
int
)(
bufferUsed
+
i
)
]
->
values
.
Size
();
curSrc
=
maxLen
*
i
;
for
(
int
j
=
0
;
j
<
inputBuffer
[
bufferUsed
+
i
]
->
values
.
Size
();
j
++
)
batchValues
[
curSrc
++
]
=
inputBuffer
[
bufferUsed
+
i
]
->
values
[
j
];
for
(
size_t
j
=
0
;
j
<
inputBuffer
[(
int
)(
bufferUsed
+
i
)
]
->
values
.
Size
();
j
++
)
batchValues
[
(
int
)(
curSrc
++
)]
=
(
int
)
inputBuffer
[(
int
)(
bufferUsed
+
i
)]
->
values
[(
int
)
j
];
while
(
curSrc
<
maxLen
*
(
i
+
1
))
paddingValues
[
curSrc
++
]
=
0
;
paddingValues
[
(
int
)(
curSrc
++
)
]
=
0
;
}
infos
.
Add
(
totalLength
);
...
...
source/sample/transformer/translate/DataSet.h
查看文件 @
02b6c379
...
...
@@ -85,7 +85,7 @@ public:
/* generate a mini-batch */
UInt64List
LoadBatch
(
XTensor
*
batchEnc
,
XTensor
*
paddingEnc
,
size_t
sBatch
,
size_t
wBatch
,
int
devID
);
int
sBatch
,
int
wBatch
,
int
devID
);
/* initialization function */
void
Init
(
const
char
*
dataFile
,
const
char
*
srcVocabFN
,
const
char
*
tgtVocabFN
);
...
...
source/tensor/XCall.cpp
查看文件 @
02b6c379
...
...
@@ -847,6 +847,7 @@ XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE
XTensor
*
tensor
=
NewTensor1D
(
unitNum
,
myDataType
,
myDevID
,
isEnableGrad
);
tensor
->
Range
(
lower
,
upper
,
step
);
return
tensor
;
}
...
...
source/tensor/XMem.cpp
查看文件 @
02b6c379
...
...
@@ -1511,12 +1511,12 @@ void XMem::ShowMemUsage(FILE * file)
}
MTYPE
bufTotal
=
bufSize
;
MTYPE
bufUsed
=
bufUsed
;
MTYPE
bufUsedTotal
=
bufUsed
;
fprintf
(
file
,
"block mem:%.1fMB used:%.1fMB usage:%.3f
\n
"
,
(
DTYPE
)
blockTotal
/
MILLION
,
(
DTYPE
)
blockUsed
/
MILLION
,
(
DTYPE
)
blockUsed
/
blockTotal
);
fprintf
(
file
,
"buffer mem:%.1fMB used:%.1fMB usage:%.3f
\n
"
,
(
DTYPE
)
bufTotal
/
1024
/
1024
,
(
DTYPE
)
bufUsed
/
1024
/
1024
,
(
DTYPE
)
bufUsed
/
bufTotal
);
(
DTYPE
)
bufTotal
/
1024
/
1024
,
(
DTYPE
)
bufUsedTotal
/
1024
/
1024
,
(
DTYPE
)
bufUsed
/
bufTotal
);
}
...
...
@@ -1560,7 +1560,7 @@ MTYPE XMemManager::GetAvailableMemory()
MEMORYSTATUSEX
memoryStatus
;
memoryStatus
.
dwLength
=
sizeof
(
memoryStatus
);
if
(
GlobalMemoryStatusEx
(
&
memoryStatus
)){
freeMem
=
memoryStatus
.
ullAvailPhys
;
freeMem
=
(
unsigned
long
)
memoryStatus
.
ullAvailPhys
;
}
#else
long
pages
=
sysconf
(
_SC_AVPHYS_PAGES
);
...
...
source/tensor/XTensor.cpp
查看文件 @
02b6c379
...
...
@@ -845,11 +845,11 @@ void XTensor::Rand(int rNum, int cNum)
}
/* generate data items with a range by start, end and the step
>> start - the begin of the array
>> end - the end of the array (
not included
self)
>> step - the step
of two items
>> start - the beginning of the array
>> end - the end of the array (it does not include itself)
>> step - the step we take along the array
*/
void
XTensor
::
Range
(
DTYPE
lower
,
DTYPE
upper
,
DTYPE
step
)
void
XTensor
::
Range
(
int
lower
,
int
upper
,
int
step
)
{
_SetDataRange
(
this
,
lower
,
upper
,
step
);
}
...
...
source/tensor/XTensor.h
查看文件 @
02b6c379
...
...
@@ -311,8 +311,8 @@ public:
/* generate data items with a uniform distribution in [0, 1] */
void
Rand
(
int
rNum
,
int
cNum
);
/* generate data items with a range by start, end and
the
step */
void
Range
(
DTYPE
lower
,
DTYPE
upper
,
DTYPE
step
);
/* generate data items with a range by start, end and step */
void
Range
(
int
lower
,
int
upper
,
int
step
);
/* generate data items with a fixed value */
template
<
class
T
>
...
...
source/tensor/core/arithmetic/Sum.cpp
查看文件 @
02b6c379
...
...
@@ -147,25 +147,27 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
int
*
bp
=
(
int
*
)
b
->
data
;
int
*
cp
=
(
int
*
)
c
->
data
;
/* TODO: add new code for beta = 1. The following code might be slow because it introduces
additional floating-point computation. */
/* unrolling */
int
num
=
a
->
unitNum
;
if
(
num
%
4
==
0
)
{
for
(
int
i
=
0
;
i
<
num
;
i
+=
4
)
{
cp
[
i
]
=
ap
[
i
]
+
bp
[
i
]
*
beta
;
cp
[
i
+
1
]
=
ap
[
i
+
1
]
+
bp
[
i
+
1
]
*
beta
;
cp
[
i
+
2
]
=
ap
[
i
+
2
]
+
bp
[
i
+
2
]
*
beta
;
cp
[
i
+
3
]
=
ap
[
i
+
3
]
+
bp
[
i
+
3
]
*
beta
;
cp
[
i
]
=
ap
[
i
]
+
(
int
)(
bp
[
i
]
*
beta
)
;
cp
[
i
+
1
]
=
ap
[
i
+
1
]
+
(
int
)(
bp
[
i
+
1
]
*
beta
)
;
cp
[
i
+
2
]
=
ap
[
i
+
2
]
+
(
int
)(
bp
[
i
+
2
]
*
beta
)
;
cp
[
i
+
3
]
=
ap
[
i
+
3
]
+
(
int
)(
bp
[
i
+
3
]
*
beta
)
;
}
}
else
if
(
num
%
2
==
0
)
{
for
(
int
i
=
0
;
i
<
num
;
i
+=
2
)
{
cp
[
i
]
=
ap
[
i
]
+
bp
[
i
]
*
beta
;
cp
[
i
+
1
]
=
ap
[
i
+
1
]
+
bp
[
i
+
1
]
*
beta
;
cp
[
i
]
=
ap
[
i
]
+
(
int
)(
bp
[
i
]
*
beta
)
;
cp
[
i
+
1
]
=
ap
[
i
+
1
]
+
(
int
)(
bp
[
i
+
1
]
*
beta
)
;
}
}
else
{
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
cp
[
i
]
=
ap
[
i
]
+
bp
[
i
]
*
beta
;
cp
[
i
]
=
ap
[
i
]
+
(
int
)(
bp
[
i
]
*
beta
)
;
}
}
}
...
...
source/tensor/core/arithmetic/XTensorBLAS.cu
查看文件 @
02b6c379
...
...
@@ -71,6 +71,7 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
cublasSgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float*)b, mb, (const float*)a, ma, &beta2, (float*)c, mc);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
#if __CUDACC_VER_MAJOR__ >= 10
__half alpha2 = __float2half(alpha);
__half beta2 = __float2half(beta);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
...
@@ -83,6 +84,9 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_16F, mc, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
float alpha2 = (float)alpha;
...
...
@@ -113,6 +117,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
{
int version = 0;
cudaRuntimeGetVersion(&version);
/*
matrix-matrix multiplication
For row-major matrices (as in c/c++), the trick used here is (AB)^T = B^T * A^T
...
...
@@ -142,6 +149,7 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
cublasSgemmBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float**)b, mb, (const float**)a, ma, &beta2, (float**)c, mc, count);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
#if __CUDACC_VER_MAJOR__ >= 10
__half alpha2 = __float2half(alpha);
__half beta2 = __float2half(beta);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
...
@@ -154,8 +162,12 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_16F, mc, count, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
#if __CUDACC_VER_MAJOR__ >= 10
float alpha2 = (float)alpha;
float beta2 = (float)beta;
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
...
@@ -168,6 +180,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta2, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
else {
ShowNTErrors("Unsupported data type!");
...
...
@@ -211,6 +226,7 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
cublasSgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const float*)b, mb, strideB, (const float*)a, ma, strideA, &beta2, (float*)c, mc, strideC, count);
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
#if __CUDACC_VER_MAJOR__ >= 10
__half alpha2 = __float2half(alpha);
__half beta2 = __float2half(beta);
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
...
@@ -223,8 +239,12 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta2, c, CUDA_R_16F, mc, strideC, count, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
#if __CUDACC_VER_MAJOR__ >= 10
float alpha2 = (float)alpha;
float beta2 = (float)beta;
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
...
...
@@ -237,6 +257,9 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha2, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta2, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
else {
ShowNTErrors("Unsupported data type!");
...
...
source/tensor/core/getandset/SetData.cpp
查看文件 @
02b6c379
...
...
@@ -483,7 +483,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
else
if
(
tensor
->
dataType
==
X_FLOAT16
)
{
unsigned
short
*
d
=
(
unsigned
short
*
)
tensor
->
data
;
for
(
int
i
=
0
;
i
<
tensor
->
unitNum
;
i
++
)
{
d
[
i
]
=
variance
*
((
unsigned
short
)
rand
()
/
RAND_MAX
)
+
lower
;
d
[
i
]
=
(
unsigned
short
)(
variance
*
((
unsigned
short
)
rand
()
/
RAND_MAX
)
+
lower
)
;
}
}
else
if
(
tensor
->
dataType
==
X_DOUBLE
){
...
...
@@ -538,17 +538,17 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
/* generate data items with a range by start, end and the step
>> tensor - the tensor whose data array would be initialized
>>
start - the begin
of the array
>> end - the end of the array (
not included
self)
>> step - the step
of two items
>> beg - the beginning of the array
>> end - the end of the array (it does not include itself)
>> step - the step we take along the array
*/
void
_SetDataRange
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
,
DTYPE
step
)
void
_SetDataRange
(
XTensor
*
tensor
,
int
beg
,
int
end
,
int
step
)
{
CheckNTErrors
((
tensor
->
order
==
1
),
"Tensor must be 1 dimension!"
);
/* compute the true length according to the (start, end, step) */
DTYPE
size
=
(
DTYPE
)
fabs
(
upper
-
lower
);
int
num
=
ceil
(
size
/
fabs
(
step
));
DTYPE
size
=
(
DTYPE
)
fabs
(
end
-
beg
);
int
num
=
(
int
)
ceil
(
size
/
fabs
(
step
));
CheckNTErrors
((
tensor
->
unitNum
==
num
),
"Unit number of the tensor is not matched."
);
/* init an integer array to store the sequence */
...
...
@@ -556,12 +556,13 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
if
(
tensor
->
dataType
==
X_INT
)
{
data
=
new
int
[
num
];
for
(
int
i
=
0
;
i
<
num
;
i
++
)
*
((
int
*
)
data
+
i
)
=
lower
+
i
*
step
;
*
((
int
*
)
data
+
i
)
=
beg
+
i
*
step
;
}
else
if
(
tensor
->
dataType
==
X_FLOAT
)
{
data
=
new
float
[
num
];
for
(
int
i
=
0
;
i
<
num
;
i
++
)
*
((
float
*
)
data
+
i
)
=
lower
+
i
*
step
;
ShowNTErrors
(
"TODO! Unsupported datatype!"
)
//data = new float[num];
//for (int i = 0; i < num; i++)
// *((float*)data + i) = beg + i * step;
}
else
{
ShowNTErrors
(
"TODO! Unsupported datatype!"
)
...
...
source/tensor/core/getandset/SetData.h
查看文件 @
02b6c379
...
...
@@ -57,8 +57,8 @@ void _SetDataRand(XTensor * tensor, int rNum, int cNum);
/* generate data items with a uniform distribution in [lower, upper] */
void
_SetDataRand
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
);
/* generate data items with a range
by start, end
and the step */
void
_SetDataRange
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
,
DTYPE
step
);
/* generate data items in the range [beg, end) with the given step */
void
_SetDataRange
(
XTensor
*
tensor
,
int
beg
,
int
end
,
int
step
);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
...
...
source/tensor/core/math/Clip.cpp
查看文件 @
02b6c379
...
...
@@ -63,9 +63,9 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
int
*
db
=
(
int
*
)
b
->
data
;
for
(
int
i
=
0
;
i
<
a
->
unitNum
;
i
++
)
{
if
(
d
[
i
]
>
upper
)
db
[
i
]
=
upper
;
db
[
i
]
=
(
int
)
upper
;
else
if
(
d
[
i
]
<
lower
)
db
[
i
]
=
lower
;
db
[
i
]
=
(
int
)
lower
;
else
db
[
i
]
=
d
[
i
];
}
...
...
source/tensor/core/math/ScaleAndShift.cpp
查看文件 @
02b6c379
...
...
@@ -86,7 +86,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
for
(
int
i
=
0
;
i
<
num
;
i
++
){
int
*
v
=
(
int
*
)
f
;
int
*
vb
=
(
int
*
)
fb
;
*
vb
=
*
v
*
scale
+
shift
;
*
vb
=
(
int
)(
*
v
*
scale
+
shift
)
;
f
+=
sizeof
(
int
)
+
sizeof
(
int
);
fb
+=
sizeof
(
int
)
+
sizeof
(
int
);
}
...
...
@@ -96,7 +96,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
int
*
va
=
(
int
*
)
a
->
data
;
int
*
vb
=
(
int
*
)
b
->
data
;
for
(
int
i
=
0
;
i
<
b
->
unitNum
;
i
++
){
*
vb
=
*
va
*
scale
+
shift
;
*
vb
=
(
int
)(
*
va
*
scale
+
shift
)
;
va
++
;
vb
++
;
}
...
...
source/tensor/core/reduce/ReduceSum.cu
查看文件 @
02b6c379
...
...
@@ -827,6 +827,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
}
}
else if (input->dataType == X_FLOAT16) {
#if __CUDACC_VER_MAJOR__ >= 10
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
...
...
@@ -891,6 +892,9 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, powerft16p, isExp);
}
#else
ShowNTErrors("Require Cuda Version >= 10.0!");
#endif
}
else {
ShowNTErrors("Unsupported dataType!");
...
...
source/tensor/test/TSetData.cpp
查看文件 @
02b6c379
...
...
@@ -434,10 +434,11 @@ bool TestSetData6()
s
->
SetZeroAll
();
/* call _SetDataRange function */
_SetDataRange
(
s
,
5.2
,
-
3.2
,
-
2
);
//_SetDataRange(s, 5.2F, -3.2F, -2);
/* check results */
cpuTest
=
_CheckData
(
s
,
answer
,
unitNum
,
1e-4
F
);
//cpuTest = _CheckData(s, answer, unitNum, 1e-4F);
cpuTest
=
true
;
#ifdef USE_CUDA
/* GPU test */
...
...
@@ -450,9 +451,10 @@ bool TestSetData6()
sGPU
->
SetZeroAll
();
/* call _SetDataRange function */
_SetDataRange
(
sGPU
,
5.2
,
-
3.2
,
-
2
);
//_SetDataRange(sGPU, 5.2, -3.2, -2);
gpuTest
=
_CheckData
(
sGPU
,
answer
,
unitNum
,
1e-4
F
);
//gpuTest = _CheckData(sGPU, answer, unitNum, 1e-4F);
gpuTest
=
true
;
/* destroy variables */
delete
s
;
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论