NiuTrans.Tensor · Commit 03a9836e
Authored Nov 13, 2018 by xuchen

1. add some base functions 2. better implementation for t2t

Parent: 52c0e35a
Showing 50 changed files, with 2333 additions and 906 deletions (+2333 −906).
source/network/XBackwardFunc.cpp  +2 −2
source/network/XBackwardLoss.cpp  +8 −5
source/network/XBackwardLoss.h  +2 −2
source/network/XBackwardShape.cpp  +0 −2
source/network/XNet.cpp  +66 −12
source/network/XNet.h  +8 −1
source/sample/fnnlm/FNNLM.cpp  +3 −1
source/sample/transformer/T2TDecoder.cpp  +141 −0
source/sample/transformer/T2TDecoder.h  +20 −6
source/sample/transformer/T2TEmbedding.cpp  +13 −9
source/sample/transformer/T2TEncoder.cpp  +4 −0
source/sample/transformer/T2TLayerNormal.cpp  +1 −4
source/sample/transformer/T2TModel.cpp  +167 −52
source/sample/transformer/T2TModel.h  +9 −3
source/sample/transformer/T2TOutput.cpp  +5 −1
source/sample/transformer/T2TTrainer.cpp  +346 −140
source/sample/transformer/T2TTrainer.h  +40 −5
source/sample/transformer/Transformer.cpp  +67 −11
source/tensor/XDevice.cpp  +1 −1
source/tensor/XMem.cpp  +4 −3
source/tensor/XTensor.cpp  +79 −3
source/tensor/XTensor.h  +12 −0
source/tensor/XUtility.cpp  +15 −0
source/tensor/XUtility.h  +1 −0
source/tensor/core/getandset/SetData.cpp  +4 −4
source/tensor/core/movement/Gather.cpp  +49 −0
source/tensor/core/movement/Gather.h  +5 −0
source/tensor/core/reduce/ReduceSum.cpp  +2 −2
source/tensor/core/reduce/ReduceSum.cu  +77 −66
source/tensor/core/reduce/ReduceSumAll.cpp  +10 −9
source/tensor/core/reduce/ReduceSumAll.h  +2 −2
source/tensor/function/CrossEntropy.cpp  +248 −228
source/tensor/function/CrossEntropy.cu  +70 −273
source/tensor/function/CrossEntropy.cuh  +1 −1
source/tensor/function/CrossEntropy.h  +3 −3
source/tensor/function/LogSoftmax.cpp  +3 −3
source/tensor/function/LogSoftmax.cu  +27 −3
source/tensor/function/LogSoftmax.cuh  +2 −2
source/tensor/function/LogSoftmax.h  +2 −2
source/tensor/function/Loss.cpp  +3 −2
source/tensor/function/Softmax.cpp  +3 −3
source/tensor/function/Softmax.cu  +22 −1
source/tensor/function/Softmax.cuh  +2 −2
source/tensor/function/Softmax.h  +2 −2
source/tensor/test/TDropout.cpp  +4 −4
source/tensor/test/TLogSoftmax.cpp  +4 −4
source/tensor/test/TPower.cpp  +19 −7
source/tensor/test/TReduceSum.cpp  +514 −17
source/tensor/test/TSoftmax.cpp  +2 −2
source/tensor/test/TSumDim.cpp  +239 −1
source/network/XBackwardFunc.cpp
@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
     else if(operID == FUNC_LOGSOFTMAX){
         int leadDim = income.GetParamInt(0);
         CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
     }
     else if(operID == FUNC_RECTIFY)
         _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
     else if(operID == FUNC_SOFTMAX){
         int leadDim = income.GetParamInt(0);
         CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
     }
     else{
         ShowNTErrors("Wrong activation function type!");
source/network/XBackwardLoss.cpp
@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
-                        XTensor * dedy, XTensor * dedx,
+                        XTensor * dedy, XTensor * dedx, XTensor * padding,
                         int funcID, void * params,
                         LOSS_FUNCTION_NAME lossName)
 {
@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
     }
     else if(funcID == FUNC_LOGSOFTMAX){
         int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
     }
     else if(funcID == FUNC_RECTIFY){
         _RectifyBackward(gold, y, x, dedy, dedx, lossName);
@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
         _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
     }
     else if(funcID == FUNC_SOFTMAX){
         int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
     }
     else{
         ShowNTErrors("wrong function found when call the backward process!");
@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y,
-                        XTensor * dedy,
+                        XTensor * dedy, XTensor * padding,
                         LOSS_FUNCTION_NAME lossName)
 {
-    _LossBackward(dedy, gold, y, lossName);
+    //_LossBackward(dedy, gold, y, lossName);
+    if(lossName == CROSSENTROPY)
+        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
 }
\ No newline at end of file
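The new second Compute overload makes the padding effect concrete: with CROSSENTROPY, the gradient dE/dy = -gold/y is taken only where the padding weight allows, so padded positions contribute nothing to the update. A minimal standalone sketch of that masking idea, on plain arrays rather than XTensor; the helper name CrossEntropyBackwardMasked is hypothetical, not part of NiuTrans.Tensor:

    #include <cstddef>

    /* dE/dy for cross entropy E = -sum_i gold_i * log(y_i), weighted per row by
       padding: padded rows (weight 0) yield a zero gradient and thus no update */
    void CrossEntropyBackwardMasked(float * dedy, const float * y, const float * gold,
                                    const float * padding, size_t rows, size_t cols)
    {
        for(size_t r = 0; r < rows; r++){
            float w = (padding != NULL) ? padding[r] : 1.0F;  /* 0 marks a padded row */
            for(size_t c = 0; c < cols; c++){
                size_t i = r * cols + c;
                dedy[i] = -w * gold[i] / y[i];
            }
        }
    }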
source/network/XBackwardLoss.h
@@ -36,13 +36,13 @@ class XLossGrad
 public:
     /* compute dE/dx for a given function y = f(x) */
     void Compute(XTensor * gold, XTensor * y, XTensor * x,
-                 XTensor * dedy, XTensor * dedx,
+                 XTensor * dedy, XTensor * dedx, XTensor * padding,
                  int funcID, void * params,
                  LOSS_FUNCTION_NAME lossName);

     /* compute dE/dy for variable y and error(loss) function E */
     void Compute(XTensor * gold, XTensor * y,
-                 XTensor * dedy,
+                 XTensor * dedy, XTensor * padding,
                  LOSS_FUNCTION_NAME lossName);
 };
source/network/XBackwardShape.cpp
@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
     DelTensorBuf(b);

     node->visitMark = NODE_FINISHED;
-
-    delete b;
 }

 /*
source/network/XNet.cpp
@@ -55,7 +55,7 @@ void XNetClearAll()
 XNet::XNet()
 {
     nodes.Clear();
-    isGradEfficient = true;
+    isGradEfficient = false;
 }

 /* de-constructor */
@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
     XList golds(1);
     golds.Add(&gold);

-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
+}
+
+/*
+backward propagation to obtain gradient wrt. the loss/error function
+>> root - root node (output) of the network
+>> gold - gold standard for the output
+>> padding - specify a target value that is ignored and does not contribute to the loss computation
+>> loss - name of loss function
+*/
+void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
+{
+    XList roots(1);
+    roots.Add(&root);
+
+    XList golds(1);
+    golds.Add(&gold);
+
+    XList paddings(1);
+    paddings.Add(&padding);
+
+    Backward(roots, golds, paddings, loss);
 }

 /*
@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
     XList golds(1);
     golds.Add(NULL);

-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
 }

 /*
@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes
 >> roots - a list of root nodes (output) of the network
 >> golds - a list of gold standard for the output
+>> paddings - specify a target value that is ignored
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
 {
     Traverse(roots);
@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
     for(int i = 0; i < roots.count; i++){
         XTensor * root = (XTensor*)roots.Get(i);
         XTensor * gold = (XTensor*)golds.Get(i);
+        XTensor * padding = (XTensor*)paddings.Get(i);
         XLink &income = root->income;
         int funcID = income.typeID;
         void * params = income.params;
@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
         Note that we do not need to obtain dE/dy here because it is no use in the
         following process of back-propagation */
         if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
-            XTensor * x = income.tails[0];
-            XNoder::MakeGrad(x);
-            lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
-            root->visitMark = NODE_FINISHED;
+            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
+                XTensor * x = income.tails[0];
+                XNoder::MakeGrad(x);
+                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
+                root->visitMark = NODE_FINISHED;
+            }
+            else{
+                XNoder::MakeGrad(root);
+                lossGrad.Compute(gold, root, root->grad, padding, loss);
+            }
         }
         /* we compute dE/dy (y is the output) if no predefined activation function is used */
         else{
             XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, loss);
+            lossGrad.Compute(gold, root, root->grad, NULL, loss);
         }
     }
@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
 /*
 backward propagation to obtain gradient
 with a number of root nodes
->> root - a list of root nodes (output) of the network
+>> roots - a list of root nodes (output) of the network
 >> loss - name of loss function
 */
 void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
 {
     XList golds(roots.count);
-    for(int i = 0; i < roots.count; i++)
-        golds.Add(NULL);
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++) {
+        golds.Add(NULL);
+        paddings.Add(NULL);
+    }

-    Backward(roots, golds, loss);
+    Backward(roots, golds, paddings, loss);
+}
+
+/*
+backward propagation to obtain gradient
+with a number of root nodes
+>> roots - a list of root nodes (output) of the network
+>> golds - a list of gold standard for the output
+>> loss - name of loss function
+*/
+void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+{
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++)
+        paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
 }

 /*
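With these overloads in place, a caller that has a padding tensor hands it straight to Backward, while the old forms keep working and supply NULL paddings internally. A usage sketch (tensor construction elided; the variables are placeholders):

    XNet net;
    XTensor output;   /* network output, the root node */
    XTensor gold;     /* gold-standard answers */
    XTensor padding;  /* 1 for real tokens, 0 for padded positions */

    /* padding-aware overload added by this commit */
    net.Backward(output, gold, padding, CROSSENTROPY);

    /* old two-tensor form; equivalent to passing NULL paddings */
    net.Backward(output, gold, CROSSENTROPY);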
source/network/XNet.h
@@ -62,17 +62,24 @@ struct XNet
     /* backward propagation to obtain gradient wrt. the loss/error function */
     void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);

+    /* backward propagation to obtain gradient wrt. the loss/error function */
+    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
+
     /* backward propagation to obtain gradient */
     void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);

     /* backward propagation to obtain gradient wrt. the loss/error function
        with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);

     /* backward propagation to obtain gradient
        with a number of root nodes */
     void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);

+    /* backward propagation to obtain gradient
+       with a number of root nodes */
+    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+
     /* backward computation for a given node */
     void BackwardNode(XTensor * node, bool isEfficent = false);
source/sample/fnnlm/FNNLM.cpp
@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
         if(isEnd)
             break;
+
+        Test(testFN, outputFN, model);
     }

     double elapsed = GetClockSec() - startT;
@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
     /* for y = softmax(s), we get dE/ds
        where E is the error function (define by loss) */
-    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
+    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);

     /* for s = x * w, we get
        dE/w_{i,j} = dE/ds_j * ds/dw_{i,j} */
source/sample/transformer/T2TDecoder.cpp
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
*/
#include <math.h>
#include "T2TDecoder.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{

/* constructor */
AttDecoder::AttDecoder()
{
    attentionsEnde = NULL;
    attEndeLayerNorms = NULL;
}

/* de-constructor */
AttDecoder::~AttDecoder()
{
    delete[] attentionsEnde;
    delete[] attEndeLayerNorms;
}

/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored,
                           int myDevID, XMem * myMem)
{
    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);

    attentionsEnde = new T2TAttention[nlayer];
    attEndeLayerNorms = new T2TLN[nlayer];

    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
        attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem);
        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
    }
}

/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
    XTensor x;

    x = embedder.Make(inputDec);

    /* dropout */
    if(isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);

    for(int i = 0; i < nlayer; i++){
        XTensor att;
        XTensor ende;
        XTensor ln;
        XTensor fnn;
        XTensor res;
        XTensor nothing;

        /******************/
        /* self attention */
        att = attentions[i].Make(x, x, x, mask, isTraining);

        /* dropout */
        if(isTraining && dropoutP > 0)
            att = Dropout(att, dropoutP);

        /* residual connection */
        res = Sum(att, x);

        /* layer normalization */
        x = attLayerNorms[i].Make(res);

        /*****************************/
        /* encoder-decoder attention */
        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining);

        /* dropout */
        if(isTraining && dropoutP > 0)
            ende = Dropout(ende, dropoutP);

        /* residual connection */
        res = Sum(ende, x);

        /* layer normalization */
        x = attEndeLayerNorms[i].Make(res);

        /*******/
        /* fnn */
        fnn = fnns[i].Make(x, isTraining);

        /* dropout */
        if(isTraining && dropoutP > 0)
            fnn = Dropout(fnn, dropoutP);

        /* residual connection */
        res = Sum(fnn, x);

        /* layer normalization */
        x = fnnLayerNorms[i].Make(res);
    }

    return x;
}

}
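Each decoder layer thus chains self-attention, encoder-decoder attention, and the FNN, each followed by dropout, a residual connection, and layer normalization. A hedged usage sketch of the new class (the argument values are placeholders, not defaults from this commit):

    AttDecoder decoder;
    decoder.InitModel(argc, argv, /*myIsMasked=*/true, /*myIgnored=*/0,
                      /*myDevID=*/0, /*myMem=*/NULL);

    /* inputDec: target-side input; outputEnc: encoder output; maskDec: causal mask */
    XTensor decoding = decoder.Make(inputDec, outputEnc, maskDec, /*isTraining=*/true);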
source/sample/transformer/T2TDecoder.h
@@ -22,19 +22,33 @@
 #ifndef __T2TDECODER_H__
 #define __T2TDECODER_H__

+#include "T2TEncoder.h"
+
 namespace transformer
 {

-class T2TDecoder
+class AttDecoder : public AttEncoder
 {
-};
-
-class AttDecoder : T2TDecoder
-{
 public:
+    /* encoder-decoder attention model of each layer */
+    T2TAttention * attentionsEnde;
+
+    /* layer normalization for encoder-decoder attention */
+    T2TLN * attEndeLayerNorms;
+
+public:
+    /* constructor */
+    AttDecoder();
+
+    /* deconstructor */
+    ~AttDecoder();
+
     /* initialize the model */
-    void InitModel(int argc, char ** argv);
+    void InitModel(int argc, char ** argv,
+                   bool myIsMasked, int myIgnored,
+                   int myDevID = -1, XMem * myMem = NULL);
+
+    /* make the decoding network */
+    XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
 };

 }
source/sample/transformer/T2TEmbedding.cpp
@@ -61,16 +61,17 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);

     DTYPE v = 1.0F/(float)sqrt((float)eSize);
-    w.SetDataRand(-v, v);
+    w.SetDataRandn(0, v);

     /* create the positional embedding matrix */
     MakePosEmbedding(eSize, d, maxLength);
 }

 /*
-make positional embeddings (of size eSize * length
-eSize - embedding size
-length - length of the sequenc
+make positional embeddings (of size eSize * length)
+>> eSize - embedding size
+>> d - dimension size of the hidden layers
+>> length - length of the sequence
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
@@ -114,15 +115,15 @@ make the network
 */
 XTensor T2TEmbedder::Make(XTensor &input)
 {
-    CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
+    //CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
     CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-    CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
+    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
     CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
     CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

     int dims[MAX_TENSOR_DIM_NUM];
     memcpy(dims, input.dimSize, input.order * sizeof(int));
-    dims[input.order - 1] = eSize;
+    dims[input.order] = eSize;

     XTensor wordEmbedding;
     XTensor posEmbedding;
@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
     /* we make positional embeddings first */
     //if(!match){
     if(true){
-        InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
+        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);

         XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);

         _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
     }

     /* then we make word embeddings */
-    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    //wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    wordEmbedding = Gather(w, input);
+    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

     /* we sum over the two embeddings */
     return wordEmbedding + posEmbedding;
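The switch from MMul(input, w) to Gather(w, input) replaces a one-hot matrix product with a direct row lookup: for an integer word id, gathering row input[i] of w yields the same embedding that multiplying a one-hot vector by w would, without ever materializing the vSize-wide one-hot tensor. A minimal sketch of the equivalence on plain arrays (not the XTensor API):

    #include <cstring>

    /* out[i] = w[ids[i]] : row lookup, equivalent to onehot(ids) * w */
    void GatherRows(float * out, const float * w, const int * ids, int n, int eSize)
    {
        for(int i = 0; i < n; i++)
            memcpy(out + (size_t)i * eSize, w + (size_t)ids[i] * eSize,
                   sizeof(float) * eSize);
    }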
source/sample/transformer/T2TEncoder.cpp
@@ -31,6 +31,10 @@ namespace transformer
 /* constructor */
 AttEncoder::AttEncoder()
 {
+    attentions = NULL;
+    fnns = NULL;
+    attLayerNorms = NULL;
+    fnnLayerNorms = NULL;
 }

 /* de-constructor */
source/sample/transformer/T2TLayerNormal.cpp
@@ -59,10 +59,7 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     InitTensor1D(&w, d, X_FLOAT, devID, mem);
     InitTensor1D(&b, d, X_FLOAT, devID, mem);

-    float scale = 1.0F;
-    float finfout = (float)sqrt(6.0F * scale / d);
-    w.SetDataRand(-finfout, finfout);
+    w.SetDataRand(1.0F, 1.0F);

     b.SetZeroAll();
 }
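The new initialization (w set to all ones via SetDataRand(1.0F, 1.0F), b zeroed) makes the layer start out as a pure standardization. Layer normalization computes

    y = w \odot \frac{x - \mu}{\sigma} + b

so with w = 1 and b = 0 the learned scale and shift are initially the identity, and only the mean/variance normalization acts at the start of training.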
source/sample/transformer/T2TModel.cpp
@@ -57,8 +57,8 @@ void T2TModel::InitModel(int argc, char ** argv)
     LoadParamInt(argc, argv, "dev", &devID, -1);
     LoadParamBool(argc, argv, "mem", &useMem, useMem);
     LoadParamInt(argc, argv, "memsize", &memSize, 1024);
-    LoadParamBool(argc, argv, "lm", &isLM, true);
     LoadParamBool(argc, argv, "mt", &isMT, false);
+    LoadParamBool(argc, argv, "lm", &isLM, !isMT);
     LoadParamInt(argc, argv, "nhead", &nhead, 8);
     LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
@@ -71,6 +71,9 @@ void T2TModel::InitModel(int argc, char ** argv)
     encoder.InitModel(argc, argv, isLM, 0, devID, mem);
     outputLayer.InitModel(argc, argv, devID, mem);

+    if(isMT)
+        decoder.InitModel(argc, argv, true, 0, devID, mem);
+
     XList params(10);
     GetParams(params);
@@ -87,74 +90,161 @@ make the encoding network
 >> isTraining - indicates whether we are training the model
 << return - encoding result
 */
-XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool isTraining)
+XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
 {
     return encoder.Make(input, mask, isTraining);
 }

+/*
+make the decoding network
+>> inputDec - input tensor of the decoder
+>> outputEnc - output tensor of the encoder
+>> mask - the mask for positions that are/not involved in computation
+>> isTraining - indicates whether we are training the model
+<< return - decoding result
+*/
+XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
+{
+    return decoder.Make(inputDec, outputEnc, mask, isTraining);
+}
+
 /*
-make the entire network (with the output softmax layer)
+make the network for language modeling (with the output softmax layer)
 >> input - input tensor
 >> output - output tensor (distribution)
 >> padding - padding of the sequences
 >> isTraining - indicates whether the model is for training
 */
-void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
+void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
 {
     XTensor encoding;

-    if(isLM){
-        /* generate mask to see "previous" words only */
-        int len = input.GetDim(input.order - 2);
-        int * dims = new int[input.order + 1];
-        for(int i = 0; i < input.order; i++)
-            dims[i + 1] = input.GetDim(i);
-        dims[0] = nhead;
-        dims[input.order] = len;
-        XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
-
-        /* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
-        this matrix can be used to prevent the attention to current or following words in
-        a given sequence. */
-        _SetDataLowTri(&mask, 1e9F, 0);
-        _ScaleAndShiftMe(&mask, 1.0F, -1e9F);
-
-        int * dimsPadding = new int[padding.order + 2];
-        for(int i = 0; i < padding.order - 1; i++)
-            dimsPadding[i] = padding.GetDim(i);
-        dimsPadding[padding.order - 1] = padding.GetDim(-1);
-        dimsPadding[padding.order] = padding.GetDim(-1);
-
-        XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
-                                          padding.denseRatio, padding.devID, padding.mem);
-        for(int i = 0; i < padding2->order; i++)
-            dimsPadding[i + 1] = padding2->GetDim(i);
-        dimsPadding[0] = nhead;
-
-        XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
-                                          padding.denseRatio, padding.devID, padding.mem);
-
-        /* mask of the padding */
-        _Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
-        _Unsqueeze(padding2, padding3, 0, nhead);
-        _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-        _Sum(&mask, padding3, &mask);
-
-        encoding = MakeEncoding(input, mask, isTraining);
-        outputLayer.Make(encoding, output);
-
-        delete[] dims;
-        delete[] dimsPadding;
-
-        DelTensorBuf(padding3);
-        DelTensorBuf(padding2);
-    }
-    else{
-        ShowNTErrors("TODO!");
-    }
+    /* generate mask to see "previous" words only */
+    int len = input.GetDim(input.order - 1);
+    int * dims = new int[input.order + 2];
+    for(int i = 0; i < input.order; i++)
+        dims[i + 1] = input.GetDim(i);
+    dims[0] = nhead;
+    dims[input.order + 1] = len;
+    XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
+
+    /* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
+       this matrix can be used to prevent the attention to current or following words in
+       a given sequence. */
+    _SetDataLowTri(&mask, 1e9F, 0);
+    _ScaleAndShiftMe(&mask, 1.0F, -1e9F);
+
+    int * dimsPadding = new int[padding.order + 2];
+    for(int i = 0; i < padding.order - 1; i++)
+        dimsPadding[i] = padding.GetDim(i);
+    dimsPadding[padding.order - 1] = padding.GetDim(-1);
+    dimsPadding[padding.order] = padding.GetDim(-1);
+
+    XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
+                                      padding.denseRatio, padding.devID, padding.mem);
+    for(int i = 0; i < padding2->order; i++)
+        dimsPadding[i + 1] = padding2->GetDim(i);
+    dimsPadding[0] = nhead;
+
+    //XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
+    //                                  padding.denseRatio, padding.devID, padding.mem);
+    //
+    ///* mask of the padding */
+    //_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
+    //_Unsqueeze(padding2, padding3, 0, nhead);
+    //
+    //_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+    //
+    ////_Sum(&mask, padding3, &mask);
+
+    encoding = MakeEncoder(input, mask, isTraining);
+    outputLayer.Make(encoding, output);
+
+    delete[] dims;
+    delete[] dimsPadding;
+
+    //DelTensorBuf(padding3);
+    DelTensorBuf(padding2);
+}
+
+/*
+make the network for machine translation (with the output softmax layer)
+>> inputEnc - input tensor of the encoder
+>> inputDec - input tensor of the decoder
+>> output - output tensor (distribution)
+>> paddingEnc - padding of the sequences (on the encoder side)
+>> isTraining - indicates whether the model is for training
+*/
+void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining)
+{
+    XTensor encoding;
+    XTensor decoding;
+    XTensor maskEnc;
+    XTensor maskDec;
+
+    /* generate mask to see "previous" words on the decoder side */
+    int len = inputDec.GetDim(inputDec.order - 2);
+    int * dims = new int[inputDec.order + 1];
+    for(int i = 0; i < inputDec.order; i++)
+        dims[i + 1] = inputDec.GetDim(i);
+    dims[0] = nhead;
+    dims[inputDec.order] = len;
+    InitTensor(&maskDec, inputDec.order + 1, dims, X_FLOAT, 1.0F, inputDec.devID, inputDec.mem);
+
+    /* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
+       this matrix can be used to prevent the attention to current or following words in
+       a given sequence. */
+    _SetDataLowTri(&maskDec, 1e9F, 0);
+    _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
+
+    /* padding on the source side */
+    int * dimsPadding = new int[paddingEnc.order + 2];
+    for(int i = 0; i < paddingEnc.order - 1; i++)
+        dimsPadding[i] = paddingEnc.GetDim(i);
+    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
+    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
+
+    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
+                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+    for(int i = 0; i < padding2->order; i++)
+        dimsPadding[i + 1] = padding2->GetDim(i);
+    dimsPadding[0] = nhead;
+
+    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
+                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+
+    /* mask of the padding */
+    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
+    _Unsqueeze(padding2, padding3, 0, nhead);
+    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+
+    InitTensor(&maskEnc, padding3);
+    maskEnc.SetZeroAll();
+
+    /* generate the mask on the source language side (for padding) */
+    _Sum(&maskEnc, padding3, &maskEnc);
+
+    encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
+    decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
+    outputLayer.Make(decoding, output);
+
+    delete[] dims;
+    delete[] dimsPadding;
+
+    DelTensorBuf(padding3);
+    DelTensorBuf(padding2);
 }
@@ -180,8 +270,33 @@ void T2TModel::GetParams(XList &list)
         list.Add(&encoder.attLayerNorms[i].w);
         list.Add(&encoder.attLayerNorms[i].b);
     }

     list.Add(&encoder.embedder.w);
+
+    if(isMT){
+        for(int i = 0; i < decoder.nlayer; i++){
+            list.Add(&decoder.fnns[i].w1);
+            list.Add(&decoder.fnns[i].b1);
+            list.Add(&decoder.fnns[i].w2);
+            list.Add(&decoder.fnns[i].b2);
+            list.Add(&decoder.attentionsEnde[i].wk);
+            list.Add(&decoder.attentionsEnde[i].wq);
+            list.Add(&decoder.attentionsEnde[i].wv);
+            list.Add(&decoder.attentionsEnde[i].wa);
+            list.Add(&decoder.attEndeLayerNorms[i].w);
+            list.Add(&decoder.attEndeLayerNorms[i].b);
+            list.Add(&decoder.attentions[i].wk);
+            list.Add(&decoder.attentions[i].wq);
+            list.Add(&decoder.attentions[i].wv);
+            list.Add(&decoder.attentions[i].wa);
+            list.Add(&decoder.fnnLayerNorms[i].w);
+            list.Add(&decoder.fnnLayerNorms[i].b);
+            list.Add(&decoder.attLayerNorms[i].w);
+            list.Add(&decoder.attLayerNorms[i].b);
+        }
+        list.Add(&decoder.embedder.w);
+    }
 }
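The two-step causal mask construction deserves a note: _SetDataLowTri(&mask, 1e9F, 0) writes 1e9 on and below the diagonal and 0 above it, and _ScaleAndShiftMe(&mask, 1.0F, -1e9F) then subtracts 1e9 everywhere, leaving 0 at visible positions and -1e9 at future ones, which softmax turns into (near) zero attention weight. A standalone sketch of the resulting values for one len x len slice (plain arrays, not the XTensor kernels):

    /* mask[i][j] = 0 if j <= i (position j is visible from i), -1e9 otherwise */
    void MakeCausalMask(float * mask, int len)
    {
        for(int i = 0; i < len; i++)
            for(int j = 0; j < len; j++)
                mask[i * len + j] = (j <= i) ? 0.0F : -1e9F;
    }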
source/sample/transformer/T2TModel.h
@@ -69,10 +69,16 @@ public:
     void InitModel(int argc, char ** argv);

     /* make the encoding network */
-    XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
+    XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);

-    /* make the entire network (with the output softmax layer) */
-    void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+    /* make the decoding network */
+    XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining);
+
+    /* make the network for language modeling (with the output softmax layer) */
+    void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+
+    /* make the network for machine translation (with the output softmax layer) */
+    void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining);

     /* get parameter matrices */
     void GetParams(XList &list);
source/sample/transformer/T2TOutput.cpp
@@ -66,6 +66,9 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     float scale = 1.0F;
     float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
     w.SetDataRand(-finfout, finfout);
+
+    DTYPE v = 1.0F/(float)sqrt((float)hSize);
+    w.SetDataRandn(0, v);
 }

 /*
@@ -90,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
 {
     XTensor &x = input;

-    output = LogSoftmax(MMul(x, w), -1);
+    //output = LogSoftmax(MMul(x, w), -1);
+    output = Softmax(MMul(x, w), -1);
 }

 }
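Switching the output layer from LogSoftmax to Softmax matches the new loss path: the trainer now computes cross entropy and its gradient (via _CrossEntropyBackward) directly from a probability distribution rather than from log-probabilities through _LogSoftmaxBackward. In exact arithmetic the two losses agree, since for y = softmax(s)

    -\sum_i g_i \log y_i \;=\; -\sum_i g_i \,\mathrm{logsoftmax}(s)_i ,

only the point where the logarithm is taken moves.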
source/sample/transformer/T2TTrainer.cpp
@@ -101,6 +101,7 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamInt(argc, argv, "d", &d, 512);
     LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
     LoadParamInt(argc, argv, "vsize", &vSize, 1);
+    LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
     LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
     LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
     LoadParamBool(argc, argv, "adam", &useAdam, false);
@@ -113,6 +114,7 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
     LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
     LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
+    LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, false);

     buf = new int[bufSize];
     buf2 = new int[bufSize];
@@ -122,6 +124,9 @@ void T2TTrainer::Init(int argc, char ** argv)
     adamBeta1T = 1.0F;
     adamBeta2T = 1.0F;
+
+    validStep = 0;
+    curEpoch = 0;
 }

 int tc = 0;
@@ -133,9 +138,10 @@ train the model
 >> modelFN - where we keep the model
 >> model - model to train
 */
-void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
+bool T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
 {
-    int epoch = 0;
+    curEpoch += 1;
     int step = 0;
     int wc = 0;
     int wordCount = 0;
@@ -147,7 +153,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     int nCheckpoint = 0;
     int nSkipped = 0;
     int gradStep = 0;
-    int validStep = 0;
+    //int validStep = 0;

     char * trainFN = new char[(int)strlen(fn) + 10];
     strcpy(trainFN, fn);
@@ -157,18 +163,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     sprintf(trainFN, "%s.random", fn);
 #endif

+    PrepareModel(model);
+
     int devID = model->devID;
     XMem * mem = model->mem;
     XNet net;

-    PrepareModel(model);
-
     double startT = GetClockSec();

-    for(epoch = 1; epoch <= nepoch; epoch++){
+    //for(epoch = 1; epoch <= nepoch; epoch++){
 #ifndef WIN32
     if(isShuffled)
         Shuffle(fn, trainFN);
 #endif

     FILE * file = fopen(trainFN, "rb");
@@ -177,11 +183,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     wordCount = 0;
     loss = 0;

-    /* batch of input sequences */
-    XTensor batch;
+    /* batch of sequences (on the encoder and decoder sides) */
+    XTensor batchEnc;
+    XTensor batchDec;

     /* padding */
-    XTensor padding;
+    XTensor paddingEnc;
+    XTensor paddingDec;

     /* gold standard */
     XTensor gold;
@@ -189,26 +197,40 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     /* label smoothed gold standard (if needed) */
     XTensor goldSmoothed;

-    while(LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)){
-        CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
+    while(LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, NULL,
+                    vSize, vSizeTgt, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem, true))
+    {
+        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
+        //CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");

         /* output probabilities */
         XTensor output;

         /* make the network */
-        model->Make(batch, output, padding, true);
+        if(model->isLM)
+            model->MakeLM(batchEnc, output, paddingEnc, true);
+        else if(model->isMT)
+            model->MakeMT(batchEnc, batchDec, output, paddingEnc, true);
+        else{
+            ShowNTErrors("Illegal model type!");
+        }

         /* back-propagation for obtaining gradients */
         if(labelSmoothingP > 0)
             LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);

         /* make paddings for the output */
-        if(output.GetDim(0) > 1)
-            PadOutput(&output, &gold, &padding);
+        //if (output.GetDim(0) > 1)
+        //    PadOutput(&output, &gold, &paddingDec);

         //output.Dump(tmpFILE, "output: ");
         //fflush(tmpFILE);

         /* get probabilities */
         float prob = GetProb(&output, &gold, NULL);

         DTYPE lossLocal = -prob / wc;
         bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
@@ -217,18 +239,11 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
         if (doUpdate) {
             /* rescale the output for normalized loss */
-            RescaleOutput(&output, &g, &padding);
+            //RescaleOutput(&output, &g, &paddingDec);

             /* back-propagation */
-            net.Backward(output, g, CROSSENTROPY);
+            net.Backward(output, g, paddingDec, CROSSENTROPY);

-            /*for(int i = 0; i < net.nodes.count; i++){
-                XTensor * node = (XTensor*)net.nodes.Get(i);
-                XLink::ShowNode(stderr, node);
-            }
-            exit(0);*/

             gradStep += 1;
             loss += -prob;
             wordCount += wc;
@@ -255,10 +270,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
             break;
         }

-        if (step % 1 == 0) {
+        if (step % 100 == 0) {
             double elapsed = GetClockSec() - startT;
             XPRINT8(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
-                    lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
+                    lr, elapsed, step, curEpoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
             if (!doUpdate)
                 XPRINT(0, stderr, " (no update)");
             XPRINT(0, stderr, "\n");
@@ -274,20 +289,20 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     fclose(file);

     if (isEnd)
-        break;
+        return false;
+    return true;

-    if(useEpochCheckpoint)
-        MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
-    }
+    //if(useEpochCheckpoint)
+    //    MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
+    //}

-    double elapsed = GetClockSec() - startT;
-    epoch = MIN(epoch, nepoch);
-    XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
-            lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
-    XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
-            elapsed, step, nSkipped, epoch);
+    //double elapsed = GetClockSec() - startT;
+    //epoch = MIN(epoch, nepoch);
+    //XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
+    //        lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
+    //XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
+    //        elapsed, step, nSkipped, epoch);

     delete[] trainFN;
 }
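Train now returns a bool (false once the data stream signals the end) and tracks the epoch in the member curEpoch instead of a local loop variable, so the epoch loop moves out to the caller. A hedged sketch of the resulting driver loop; the surrounding setup is assumed and not part of this diff:

    T2TTrainer trainer;
    trainer.Init(argc, argv);

    /* one Train() call per epoch; stop early when Train() reports the end */
    for(int epoch = 1; epoch <= nepoch; epoch++){
        if(!trainer.Train(trainFN, validFN, modelFN, &model))
            break;
    }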
@@ -322,10 +337,12 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     wordCount = 0;

     /* batch of input sequences */
-    XTensor batch;
+    XTensor batchEnc;
+    XTensor batchDec;

     /* padding */
-    XTensor padding;
+    XTensor paddingEnc;
+    XTensor paddingDec;

     /* gold standard */
     XTensor gold;
@@ -335,18 +352,28 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     ClearBuf();

-    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, false, wc, devID, mem)){
-        CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
+    while(LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, seqs,
+                    vSize, vSizeTgt, 1, 1, false, wc, devID, mem, false))
+    {
+        //CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
+        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");

         /* output probabilities */
         XTensor output;

         /* make the network */
-        model->Make(batch, output, padding, false);
+        if(model->isLM)
+            model->MakeLM(batchEnc, output, paddingEnc, false);
+        else if(model->isMT)
+            model->MakeMT(batchEnc, batchDec, output, paddingEnc, false);
+        else{
+            ShowNTErrors("Illegal model type!");
+        }

-        int bSize = batch.GetDim(0);
-        int length = batch.GetDim(1);
+        int bSize = output.GetDim(0);
+        int length = output.GetDim(1);

         /* prediction probabilities */
         XTensor probs;
@@ -391,7 +418,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     delete[] seqs;

     double elapsed = GetClockSec() - startT;
     XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
             elapsed, wordCountTotal, exp(loss/wordCount));
 }
@@ -511,6 +538,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
     /* sort the sequences by length */
     if (isSorted) {
+        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
         SampleNode * nodes = new SampleNode[seqCount];
         int count = 0;
         int offset = 0;
@@ -526,19 +554,18 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
         offset += node.size;
     }

-    qsort(nodes, seqCount, sizeof(SampleNode), CompareSampleNode);
+    qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);

     count = 0;
     offset = 0;
-    for(int i = 0; i < seqCount; i++){
+    for(int i = 0; i < seqCount; i += step){
         SampleNode &node = nodes[count];
+        //fprintf(stderr, "%d %d %d\n", node.size, node.id, node.value);
         memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
         for(int j = 0; j < step; j++){
-            seqLen2[count + j] = seqLen[node.id + j];
-            seqOffset[count + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
+            seqLen2[i + j] = seqLen[node.id + j];
+            seqOffset[i + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
         }
-        count += step;
+        count += 1;
         offset += node.size;
     }
@@ -546,6 +573,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
     buf = buf2;
     buf2 = tmp;
+
     tmp = seqLen;
     seqLen = seqLen2;
     seqLen2 = tmp;
@@ -562,32 +590,79 @@ void T2TTrainer::ClearBuf()
...
@@ -562,32 +590,79 @@ void T2TTrainer::ClearBuf()
nextSeq
=
-
1
;
nextSeq
=
-
1
;
}
}
 /*
 load a batch of sequences
 >> file - the handle to the data file
 >> isLM - indicates whether the data is used for training lms
->> batch - the batch of the input sequences
->> padding - padding of the input sequences
->> output - the batch of the output sequences
+>> batchEnc - the batch of the input sequences
+>> paddingEnc - padding of the input sequences
+>> batchDec - the batch of the output sequences
+>> paddingDec - padding of the output sequences
+>> gold - gold standard
 >> seqs - keep the sequences in an array
->> step - the step we go over when move to the next sequence
->> vs - vocabulary size
+>> vsEnc - size of the encoder vocabulary
+>> vsDec - size of the decoder vocabulary
 >> sBatch - batch size of sequences
 >> wBatch - batch size of words
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
 >> mem - memory pool
+>> isTraining - indicates whether we are training the model
 */
 int T2TTrainer::LoadBatch(FILE * file, bool isLM,
-                          XTensor * batch, XTensor * padding, XTensor * output,
-                          int * seqs, int step, int vs, int sBatch, int wBatch,
-                          bool isSorted, int &wCount, int devID, XMem * mem)
+                          XTensor * batchEnc, XTensor * paddingEnc,
+                          XTensor * batchDec, XTensor * paddingDec,
+                          XTensor * gold, int * seqs,
+                          int vsEnc, int vsDec, int sBatch, int wBatch,
+                          bool isSorted, int &wCount, int devID, XMem * mem,
+                          bool isTraining)
+{
+    if(isLM){
+        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, seqs,
+                           vsEnc, sBatch, wBatch, isSorted, wCount, devID, mem, isTraining);
+    }
+    else{
+        return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, seqs,
+                           vsEnc, vsDec, sBatch, wBatch, isSorted, wCount, devID, mem, isTraining);
+    }
+}
+
+/*
+load a batch of sequences (for LM)
+>> file - the handle to the data file
+>> isLM - indicates whether the data is used for training lms
+>> batchEnc - the batch of the input sequences
+>> paddingEnc - padding of the input sequences
+>> batchDec - the batch of the output sequences
+>> paddingDec - padding of the output sequences
+>> gold - gold standard
+>> seqs - keep the sequences in an array
+>> vs - vocabulary size
+>> sBatch - batch size of sequences
+>> wBatch - batch size of words
+>> isSorted - indicates whether the sequences are sorted by length
+>> wCount - word count
+>> devID - device id
+>> mem - memory pool
+>> isTraining - indicates whether we are training the model
+*/
+int T2TTrainer::LoadBatchLM(FILE * file,
+                            XTensor * batchEnc, XTensor * paddingEnc,
+                            XTensor * batchDec, XTensor * paddingDec,
+                            XTensor * gold, int * seqs,
+                            int vs, int sBatch, int wBatch,
+                            bool isSorted, int &wCount, int devID, XMem * mem,
+                            bool isTraining)
 {
     if(nextSeq < 0 || nextSeq >= nseqBuf)
-        LoadBuf(file, isSorted, step);
+        LoadBuf(file, isSorted, 1);

     int seq = MAX(nextSeq, 0);
     int wc = 0;
...
@@ -604,7 +679,8 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
         if(max < wn)
             max = wn;
-        if(sc >= sBatch && wc >= wBatch)
+        int tc = isSmallBatch ? max * sc : wc;
+        if(sc >= sBatch && tc >= wBatch)
             break;
     }
...
@@ -614,74 +690,205 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
     if(sc <= 0)
         return 0;

-    if(isLM){
-        int dims[MAX_TENSOR_DIM_NUM];
-        dims[0] = sc;
-        dims[1] = max;
-        dims[2] = vs;
-
-        InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
-        InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
-        InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
-
-        if(batch->grad == NULL)
-            XNoder::MakeGrad(batch);
-        else
-            InitTensor(batch->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
-
-        if(padding->grad == NULL)
-            XNoder::MakeGrad(padding);
-        else
-            InitTensor2D(padding->grad, sc, max, X_FLOAT, devID, mem);
-
-        if(output->grad == NULL)
-            XNoder::MakeGrad(output);
-        else
-            InitTensor(output->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
-
-        batch->SetZeroAll();
-        padding->SetZeroAll();
-        output->SetZeroAll();
-
-        batch->grad->SetZeroAll();
-        padding->grad->SetZeroAll();
-        output->grad->SetZeroAll();
-
-        int seqSize = 0;
-
-        //fprintf(tf, "batch %d(%d)\n", tc++, sc);
-
-        /* this might be slow on GPUs :( */
-        for(int s = seq; s < seq + sc; s++){
-            int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
-            CheckNTErrors(len <= max, "Something is wrong!");
-            for(int w = 0; w < len; w++){
-                batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
-                padding->Set2D(1.0F, s - seq, w);
-                if(w > 0)
-                    output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
-                if(w == len - 1){
-                    if(isDoubledEnd)
-                        output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
-                    else
-                        output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
-                }
-                wCount++;
-                /*fprintf(tf, "%d", buf[seqOffset[s] + w]);
-                if(w < seqLen[s] - 1)
-                    fprintf(tf, " ");
-                else
-                    fprintf(tf, "\n");*/
-                if(seqs != NULL)
-                    seqs[seqSize++] = buf[seqOffset[s] + w];
-            }
-            if(seqs != NULL){
-                for(int w = len; w < max; w++)
-                    seqs[seqSize++] = -1;
-            }
-        }
+    int dims[MAX_TENSOR_DIM_NUM];
+    dims[0] = sc;
+    dims[1] = max;
+    dims[2] = vs;
+
+    InitTensor(batchEnc, 2, dims, X_INT, 1.0F, -1);
+    //InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
+    InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
+
+    batchEnc->SetZeroAll();
+    paddingEnc->SetZeroAll();
+    gold->SetZeroAll();
+    paddingDec->SetZeroAll();
+
+    if(isTraining){
+        //XNoder::MakeGrad(batchEnc);
+        XNoder::MakeGrad(paddingEnc);
+        XNoder::MakeGrad(gold);
+        XNoder::MakeGrad(paddingDec);
+
+        //batchEnc->grad->SetZeroAll();
+        paddingEnc->grad->SetZeroAll();
+        gold->grad->SetZeroAll();
+        paddingDec->grad->SetZeroAll();
+    }
+
+    int seqSize = 0;
+
+    //fprintf(tf, "batch %d(%d)\n", tc++, sc);
+
+    /* this might be slow on GPUs :( */
+    for(int s = seq; s < seq + sc; s++){
+        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
+        CheckNTErrors(len <= max, "Something is wrong!");
+        for(int w = 0; w < len; w++){
+            batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
+            //batchEnc->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+            paddingEnc->Set2D(1.0F, s - seq, w);
+            paddingDec->Set2D(1.0F, s - seq, w);
+            if(w > 0)
+                gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
+            if(w == len - 1){
+                if(isDoubledEnd)
+                    gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+                else
+                    gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
+            }
+            wCount++;
+            /*fprintf(tf, "%d", buf[seqOffset[s] + w]);
+            if(w < seqLen[s] - 1)
+                fprintf(tf, " ");
+            else
+                fprintf(tf, "\n");*/
+            if(seqs != NULL)
+                seqs[seqSize++] = buf[seqOffset[s] + w];
+        }
+        if(seqs != NULL){
+            for(int w = len; w < max; w++)
+                seqs[seqSize++] = -1;
+        }
+    }

     fflush(tf);

     return sc;
 }
+/*
+load a batch of sequences (for MT)
+>> file - the handle to the data file
+>> batchEnc - the batch of the input sequences
+>> paddingEnc - padding of the input sequences
+>> batchDec - the batch of the output sequences
+>> paddingDec - padding of the output sequences
+>> gold - gold standard
+>> seqs - keep the sequences in an array
+>> vsEnc - size of the encoder vocabulary
+>> vsDec - size of the decoder vocabulary
+>> sBatch - batch size of sequences
+>> wBatch - batch size of words
+>> isSorted - indicates whether the sequences are sorted by length
+>> wCount - word count
+>> devID - device id
+>> mem - memory pool
+>> isTraining - indicates whether we are training the model
+*/
+int T2TTrainer::LoadBatchMT(FILE * file,
+                            XTensor * batchEnc, XTensor * paddingEnc,
+                            XTensor * batchDec, XTensor * paddingDec,
+                            XTensor * gold, int * seqs,
+                            int vsEnc, int vsDec, int sBatch, int wBatch,
+                            bool isSorted, int &wCount, int devID, XMem * mem,
+                            bool isTraining)
+{
+    if(nextSeq < 0 || nextSeq >= nseqBuf)
+        LoadBuf(file, isSorted, 2);
+
+    int seq = MAX(nextSeq, 0);
+
+    int wcEnc = 0;
+    int wcDec = 0;
+    int wnEnc = 0;
+    int wnDec = 0;
+    int maxEnc = 0;
+    int maxDec = 0;
+    int sc = 0;
+
+    CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");
+
+    while(seq + sc < nseqBuf){
+        /* source-side sequence */
+        wnEnc = seqLen[seq + sc];
+        wcEnc += wnEnc;
+        sc += 1;
+        if(maxEnc < wnEnc)
+            maxEnc = wnEnc;
+
+        /* target-side sequence */
+        int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
+        wnDec = len;
+        wcDec += wnDec;
+        sc += 1;
+        if(maxDec < wnDec)
+            maxDec = wnDec;
+
+        int tc = isSmallBatch ? maxEnc * sc / 2 : wcEnc;
+        if(sc >= sBatch * 2 && tc >= wBatch)
+            break;
+    }
+
+    nextSeq = seq + sc;
+
+    if(sc <= 0)
+        return 0;
+
+    int sCount = sc/2;
+    int seqSize = 0;
+
+    int dimsEnc[3] = {sCount, maxEnc, vsEnc};
+    int dimsDec[3] = {sCount, maxDec, vsDec};
+
+    InitTensor(batchEnc, 3, dimsEnc, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
+    InitTensor(batchDec, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
+    InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
+
+    batchEnc->SetZeroAll();
+    paddingEnc->SetZeroAll();
+    batchDec->SetZeroAll();
+    paddingDec->SetZeroAll();
+    gold->SetZeroAll();
+
+    wCount = 0;
+
+    /* batch of the source-side sequences */
+    for(int s = seq; s < seq + sc; s += 2){
+        int len = seqLen[s];
+        int sent = (s - seq)/2;
+        for(int w = 0; w < len; w++){
+            batchEnc->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
+            paddingEnc->Set2D(1.0F, sent, w);
+            wCount++;
+        }
+    }
+
+    /* batch of the target-side sequences */
+    for(int s = seq + 1; s < seq + sc; s += 2){
+        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
+        CheckNTErrors(len <= maxDec, "Something is wrong!");
+        int sent = (s - seq - 1)/2;
+        for(int w = 0; w < len; w++){
+            paddingDec->Set2D(1.0F, sent, w);
+            batchDec->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
+            if(w > 0)
+                gold->Set3D(1.0F, sent, w - 1, buf[seqOffset[s] + w]);
+            if(w == len - 1){
+                if(isDoubledEnd)
+                    gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
+                else
+                    gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w + 1]);
+            }
+            wCount++;
+            if(seqs != NULL)
+                seqs[seqSize++] = buf[seqOffset[s] + w];
+        }
+        if(seqs != NULL){
+            for(int w = len; w < maxDec; w++)
+                seqs[seqSize++] = -1;
+        }
+    }
+
+    fflush(tf);

     return sc;
 }
...
@@ -715,8 +922,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
     XTensor probs;
     InitTensor(&probs, output);

+    XTensor logOutput;
+    InitTensor(&logOutput, output);
+    _Log(output, &logOutput);
+
     /* probs[i,j] = output[i,j] * gold[i,j] */
-    _Multiply(output, gold, &probs);
+    _Multiply(&logOutput, gold, &probs);

     /* probability of each word */
     XTensor wprobs;
...
@@ -730,7 +941,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
         _CopyValues(&wprobs, wordProbs);

     /* reshape the tensor to fit it into the reduce procedure
        TODO: XTensor supports scalars */
     dims[0] = 1;
     dims[1] = probs.unitNum;
     probs.Reshape(2, dims);
...
@@ -885,18 +1096,13 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
 {
     CheckNTErrors(output->order == 3, "Wrong dimension number!");
     CheckNTErrors(gold->order == 3, "Wrong dimension number!");

-    int num = padding->GetDim(0);
-    XTensor * factor = NewTensorBuf(1, &num, padding->dataType, 1.0F, padding->devID, padding->mem);
-    _ReduceSum(padding, factor, padding->order - 1);
+    DTYPE count = _ReduceSumAll(padding);

     _ExpMe(output);
-    _DivDim(output, factor, output, 0);
+    _ScaleAndShiftMe(output, 1/count);
     _LogMe(output);

-    _DivDim(gold, factor, gold, 0);
-    DelTensorBuf(factor);
+    _ScaleAndShiftMe(gold, 1/count);
 }

 /*
...
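The gold tensor built by LoadBatchLM encodes a simple label-shifting rule: position w predicts token w + 1, and the last position either repeats the ending symbol (when isDoubledEnd is set) or takes the next buffered token. Below is a standalone sketch in plain C++ (MakeGold is a hypothetical helper, not part of the trainer) of just that rule:

// a minimal sketch of the gold-label shift in LoadBatchLM
#include <vector>
#include <cstdio>

std::vector<int> MakeGold(const std::vector<int> &seq, bool isDoubledEnd)
{
    int len = isDoubledEnd ? (int)seq.size() : (int)seq.size() - 1;
    std::vector<int> gold(len, -1);
    for (int w = 0; w < len; w++) {
        if (w > 0)
            gold[w - 1] = seq[w];               /* like gold->Set3D(1.0F, ..., w - 1, seq[w]) */
        if (w == len - 1)
            gold[w] = isDoubledEnd ? seq[w]     /* double the ending symbol */
                                   : seq[w + 1];
    }
    return gold;
}

int main()
{
    std::vector<int> seq = {7, 8, 9, 2};        /* 2 stands in for </s> */
    std::vector<int> gold = MakeGold(seq, false);
    for (int t : gold)
        printf("%d ", t);                       /* prints: 8 9 2 */
    printf("\n");
    return 0;
}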
source/sample/transformer/T2TTrainer.h
View file @ 03a9836e
...
@@ -79,6 +79,9 @@ public:
     /* vocabulary size of the source side */
     int vSize;

+    /* vocabulary size of the target side */
+    int vSizeTgt;
+
     /* learning rate */
     float lrate;
...
@@ -100,6 +103,10 @@ public:
     /* indicates whether we use adam */
     bool useAdam;

+    int validStep;
+    int curEpoch;
+
     /* hyper parameters of adam*/
     float adamBeta1;
     float adamBeta2;
...
@@ -128,8 +135,13 @@ public:
     /* number of batches on which we do model update */
     int updateStep;

-    /* indicates whether we double the </s> symble for the output of lms */
+    /* indicates whether we double the </s> symbol for the output of lms */
     bool isDoubledEnd;

+    /* indicates whether we use batchsize = max * sc rather than batchsize = word-number,
+       where max is the maximum length and sc is the sentence number */
+    bool isSmallBatch;
+
 public:

     /* constructor */
...
@@ -142,7 +154,7 @@ public:
     void Init(int argc, char ** argv);

     /* train the model */
-    void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
+    bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);

     /* test the model */
     void Test(const char * fn, const char * ofn, T2TModel * model);
...
@@ -158,11 +170,34 @@ public:
     /* load a batch of sequences */
     int LoadBatch(FILE * file, bool isLM,
-                  XTensor * batch, XTensor * padding, XTensor * output,
-                  int * seqs, int step, int vs, int sBatch, int wBatch,
-                  bool isSorted, int &wCount, int devID, XMem * mem);
+                  XTensor * batchEnc, XTensor * paddingEnc,
+                  XTensor * batchDec, XTensor * paddingDec,
+                  XTensor * gold, int * seqs,
+                  int vsEnc, int vsDec, int sBatch, int wBatch,
+                  bool isSorted, int &wCount, int devID, XMem * mem,
+                  bool isTraining);
+
+    /* load a batch of sequences (for language modeling) */
+    int LoadBatchLM(FILE * file,
+                    XTensor * batchEnc, XTensor * paddingEnc,
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold, int * seqs,
+                    int vs, int sBatch, int wBatch,
+                    bool isSorted, int &wCount, int devID, XMem * mem,
+                    bool isTraining);
+
+    /* load a batch of sequences (for machine translation) */
+    int LoadBatchMT(FILE * file,
+                    XTensor * batchEnc, XTensor * paddingEnc,
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold, int * seqs,
+                    int vsEnc, int vsDec, int sBatch, int wBatch,
+                    bool isSorted, int &wCount, int devID, XMem * mem,
+                    bool isTraining);

     /* shuffle the data file */
     void Shuffle(const char * srcFile, const char * tgtFile);
...
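The new isSmallBatch flag switches the batch-full test in LoadBatch between the padded token count (max * sc) and the raw word count (wc). A small standalone sketch (BatchIsFull is an assumed helper, not in the header) of that comparison:

// sketch of the tc = isSmallBatch ? max * sc : wc batch test
#include <cstdio>

bool BatchIsFull(bool isSmallBatch, int max, int sc, int wc, int sBatch, int wBatch)
{
    int tc = isSmallBatch ? max * sc : wc;   /* mirrors the trainer's test */
    return sc >= sBatch && tc >= wBatch;
}

int main()
{
    /* 10 sentences collected, longest one 50 tokens, 320 real words */
    printf("%d\n", BatchIsFull(true,  50, 10, 320, 8, 400));   /* 1: 50 * 10 >= 400 */
    printf("%d\n", BatchIsFull(false, 50, 10, 320, 8, 400));   /* 0: 320 < 400 */
    return 0;
}

Counting padded positions (max * sc) tracks actual memory cost of the batch tensors more closely than the word count does, since padding occupies cells too.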
source/sample/transformer/Transformer.cpp
View file @ 03a9836e
...
@@ -25,6 +25,8 @@
 #include "T2TUtility.h"
 #include "T2TTrainer.h"
 #include "../../tensor/XDevice.h"
+#include "../../tensor/XUtility.h"
+#include "../../tensor/XGlobal.h"

 namespace transformer
 {
...
@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
     LoadParamString(argc, args, "test", testFN, "");
     LoadParamString(argc, args, "output", outputFN, "");

-    T2TTrainer trainer;
-    trainer.Init(argc, args);
-
-    T2TModel model;
-    model.InitModel(argc, args);
-
     /* learn model parameters */
     if(strcmp(trainFN, "")){
-        trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
+        double startT = GetClockSec();
+
+        T2TTrainer trainer;
+        trainer.Init(argc, args);
+
+        char * fn = new char[MAX_LINE_LENGTH];
+        char * fn1 = new char[MAX_LINE_LENGTH];
+        char * fn2 = new char[MAX_LINE_LENGTH];
+
+        modelFN = strcmp(modelFN, "") ? modelFN : (char*)"checkpoint.model";
+
+        int epoch;
+        bool isTrain;
+        for(epoch = 1; epoch <= trainer.nepoch; epoch++){
+            sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
+            sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
+            sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
+
+            if(epoch == 1){
+                T2TModel model;
+                model.InitModel(argc, args);
+                isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
+                model.Dump(fn1);
+            }
+            else{
+                T2TModel model;
+                model.InitModel(argc, args);
+                model.Read(fn);
+                isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
+                model.Dump(fn1);
+            }
+
+            if(trainer.useEpochCheckpoint && strcmp(testFN, "")){
+                T2TTrainer tester;
+                tester.Init(argc, args);
+                T2TModel model;
+                model.InitModel(argc, args);
+                model.Read(fn1);
+                tester.Test(testFN, fn2, &model);
+            }
+
+            if(!isTrain)
+                break;
+        }
+
+        double elapsed = GetClockSec() - startT;
+        epoch = MIN(epoch, trainer.nepoch);
+        XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);
+
+        delete[] fn;
+        delete[] fn1;
+        delete[] fn2;
     }

+    /* don't dump the final model */
     /* save the final model */
-    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-        model.Dump(modelFN);
+    //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+    //    model.Dump(modelFN);
+
+    T2TModel model;
+    model.InitModel(argc, args);

     /* load the model if neccessary */
     if(strcmp(modelFN, ""))
...
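The per-epoch checkpoint names above come from the "%s.%s.%03d" pattern: each epoch reads the previous dump and writes a new one. A standalone sketch of just the naming scheme:

// sketch of the epoch checkpoint filename pattern used in TransformerMain
#include <cstdio>

int main()
{
    char fn[1024];
    const char * modelFN = "checkpoint.model";
    for (int epoch = 1; epoch <= 3; epoch++) {
        snprintf(fn, sizeof(fn), "%s.%s.%03d", modelFN, "epoch", epoch - 1);
        printf("reads  %s\n", fn);   /* e.g. checkpoint.model.epoch.000 */
        snprintf(fn, sizeof(fn), "%s.%s.%03d", modelFN, "epoch", epoch);
        printf("writes %s\n", fn);   /* e.g. checkpoint.model.epoch.001 */
    }
    return 0;
}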
source/tensor/XDevice.cpp
View file @ 03a9836e
...
@@ -446,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
     CheckNTErrors((!(b & (b - 1))), "Block size (x-axis) must be in 2^x");
     CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] &&
                    gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");

     blockSize[0] = bXSize;
     blockSize[1] = bYSize;
...
source/tensor/XMem.cpp
View file @ 03a9836e
...
@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
     if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
         cublasDestroy(cublasHandle);
     if(myIsForComputation)
-        CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
+        CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
                       "Cannot create the cublas handle.");

     SetDevice(devIDBackup);
 #endif
...
@@ -1392,8 +1393,8 @@ void XMem::CreateBLASHandle()
                      "Cannot destroy the cublas handle.");
     }

-    CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
+    CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
                   "Cannot create the cublas handle.");
 #endif
 }
...
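The cast to curandStatus here only works because both cuBLAS and cuRAND use 0 for success. A minimal standalone sketch (not part of the commit) of the cast-free alternative, comparing the cuBLAS return code against its own success constant:

// checking cublasCreate against CUBLAS_STATUS_SUCCESS directly
#include <cublas_v2.h>
#include <cstdio>

int main()
{
    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);   /* returns cublasStatus_t, not curandStatus */
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "Cannot create the cublas handle.\n");
        return 1;
    }
    cublasDestroy(handle);
    return 0;
}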
source/tensor/XTensor.cpp
View file @ 03a9836e
...
@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)
 /*
 set the value of a cell
->> value - value to assign to the cell
+>> value - value we tend to set
 >> index - index of the cell for each dimension
+>> size - size of the index
 */
 bool XTensor::Set(DTYPE value, int index[], int size)
 {
...
@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)
 /*
 set the value of a cell in a 1d tensor
->> value - value to assign to the cell
+>> value - value we tend to set
 >> i - item offset
+<< return - succeeded or not
 */
 bool XTensor::Set1D(DTYPE value, int i)
 {
...
@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
     return SetToDevice(devID, GetCell(dims, 3), value);
 }

+/*
+set the integer value of a cell
+>> value - value we tend to set
+>> index - index of the cell for each dimension
+>> size - size of the index
+<< return - succeeded or not
+*/
+bool XTensor::SetInt(int value, int index[], int size)
+{
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    return SetToDeviceInt(devID, GetCell(index, size), value);
+}
+
+/*
+set the integer value of a cell in a 1d tensor
+>> value - value we tend to set
+>> i - item offset
+<< return - succeeded or not
+*/
+bool XTensor::Set1DInt(int value, int i)
+{
+    CheckNTErrors((order == 1), "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int dims[1] = {i};
+
+    return SetToDeviceInt(devID, GetCell(dims, 1), value);
+}
+
+/*
+set the integer value of a cell in a 2d tensor in default type
+>> value - value we tend to set
+>> ni - row index
+>> mi - column index
+<< return - succeeded or not
+*/
+bool XTensor::Set2DInt(int value, int ni, int mi)
+{
+    CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int dims[2] = {ni, mi};
+
+    return SetToDeviceInt(devID, GetCell(dims, 2), value);
+}
+
+/*
+set the integer value of a cell in a 3d tensor in default type
+>> value - value we tend to set
+>> d0 - index of demension 0
+>> d1 - index of demension 1
+>> d2 - index of demension 2
+<< return - succeeded or not
+*/
+bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
+{
+    CheckNTErrors(order == 3, "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
+    CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
+    CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int dims[3] = {d0, d1, d2};
+
+    return SetToDeviceInt(devID, GetCell(dims, 3), value);
+}
+
 /*
 increase the value of a cell in a 2d tensor
 >> value - value we tend to set
...
@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
     XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);

+    if(tensor->unitNum * tensor->unitSize == 176657664){
+        tensor->Dump(stderr, "", 200);
+    }
+
     if(myMem != NULL)
         tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
     else
...
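The new Set*Int helpers resolve a multi-dimensional index to a single cell address before writing, as GetCell does. A standalone sketch (Offset3D is a hypothetical stand-in for that address arithmetic, assuming row-major layout):

// how a (d0, d1, d2) index maps to a flat offset behind Set3DInt
#include <cassert>
#include <vector>
#include <cstdio>

int Offset3D(const int dimSize[3], int d0, int d1, int d2)
{
    /* row-major: offset = (d0 * dimSize[1] + d1) * dimSize[2] + d2 */
    return (d0 * dimSize[1] + d1) * dimSize[2] + d2;
}

int main()
{
    int dims[3] = {2, 3, 4};
    std::vector<int> data(2 * 3 * 4, 0);

    data[Offset3D(dims, 1, 2, 3)] = 7;   /* analogous to Set3DInt(7, 1, 2, 3) */
    assert(data[23] == 7);               /* last cell of a 2x3x4 tensor */
    printf("ok\n");
    return 0;
}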
source/tensor/XTensor.h
View file @ 03a9836e
...
@@ -326,6 +326,18 @@ public:
     /* set the value of a cell in a 3d tensor */
     bool Set3D(DTYPE value, int d0, int d1, int d2);

+    /* set the integer value of a cell */
+    bool SetInt(int value, int index[], int size = -1);
+
+    /* set the integer value of a cell in a 1d tensor */
+    bool Set1DInt(int value, int i);
+
+    /* set the integer value of a cell in a 2d tensor */
+    bool Set2DInt(int value, int ni, int mi);
+
+    /* set the integer value of a cell in a 3d tensor */
+    bool Set3DInt(int value, int d0, int d1, int d2);
+
     /* increase the value of a cell in a 2d */
     bool Add2D(DTYPE value, int ni, int mi);
...
source/tensor/XUtility.cpp
View file @ 03a9836e
...
@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
     return true;
 }

+/* assign an integer number to a variable that is kept on a specified device */
+bool SetToDeviceInt(int devID, void * p, int value)
+{
+    if(p == NULL)
+        return false;
+
+    if(devID < 0)
+        *(int*)p = value;
+    else{
+        XMemCopy(p, devID, &value, -1, sizeof(int));
+    }
+
+    return true;
+}
+
 /* get the next number with power of 2 */
 unsigned int GetNextPower2(unsigned int n)
 {
...
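SetToDeviceInt branches on devID: a negative id means host memory, where a plain store suffices; otherwise the value must be copied host-to-device. A standalone sketch of the host branch only (SetIntOnHost is a hypothetical helper mirroring it):

// the CPU path of SetToDeviceInt, in isolation
#include <cstdio>

bool SetIntOnHost(void * p, int value)
{
    if (p == NULL)
        return false;
    *(int*)p = value;   /* host memory: a direct store is enough */
    return true;
}

int main()
{
    int cell = 0;
    SetIntOnHost(&cell, 42);
    printf("%d\n", cell);   /* prints 42 */
    return 0;
}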
source/tensor/XUtility.h
View file @ 03a9836e
...
@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
 extern DTYPE ToCPU(int devID, void * value);
 extern int ToCPUInt(int devID, void * value);
 extern bool SetToDevice(int devID, void * p, DTYPE value);
+extern bool SetToDeviceInt(int devID, void * p, int value);
 extern unsigned int GetNextPower2(unsigned int n);
 extern void XSleep(int sleepTime);
 extern double GetClock();
...
source/tensor/core/getandset/SetData.cpp
View file @ 03a9836e
...
@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
         fanOut = numOutputFmaps * receptiveFieldSize;
     }

-    DTYPE std = gain * (float)sqrt(2.0 / (fanIn + fanOut));
-    DTYPE a = (DTYPE)sqrt(3.0) * std;
-    _SetDataRand(tensor, -a, a);
+    DTYPE finfout = gain * (float)sqrt(6.0F / (fanIn + fanOut));
+    tensor->SetDataRand(-finfout, finfout);
+    //_SetDataRand(tensor, -finfout, finfout);
 }

 /*
...
@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
     if(tensor == NULL)
         return;

-    /* GPU code */
+    /* CPU code */
     if(tensor->devID < 0){
         DTYPE variance = upper - lower;
...
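The new one-line Xavier bound is algebraically the same as the old two-step one: sqrt(3) * gain * sqrt(2/(fanIn+fanOut)) equals gain * sqrt(6/(fanIn+fanOut)), so only the code path changes, not the initialization. A quick standalone check (not from the commit):

// verifying the old and new Xavier uniform bounds coincide
#include <cmath>
#include <cassert>
#include <cstdio>

int main()
{
    float gain = 1.0F;
    int fanIn = 512, fanOut = 2048;

    float stdv = gain * (float)sqrt(2.0 / (fanIn + fanOut));
    float aOld = (float)sqrt(3.0) * stdv;                       /* old route */
    float aNew = gain * (float)sqrt(6.0F / (fanIn + fanOut));   /* new route */

    assert(fabs(aOld - aNew) < 1e-6F);
    printf("bound = %f\n", aNew);
    return 0;
}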
source/tensor/core/movement/Gather.cpp
View file @ 03a9836e
...
@@ -21,6 +21,8 @@
 #include "Gather.h"
 #include "CopyIndexed.h"
+#include "../../XUtility.h"
+#include "../shape/Reshape.h"

 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
     return result;
 }

+/*
+gather indexed sub-tensors (return a XTensor structure)
+make a new tensor to keep the result and return it
+>> s - the source tensor(2D)
+>> index - the index tensor
+<< return - the result of copying indexed sub-tensors
+*/
+XTensor Gather(const XTensor &s, const XTensor &index)
+{
+    int indexSize = index.unitNum;
+
+    CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
+
+    int * srcIndex = new int[index.unitNum];
+
+    if(index.dataType == X_INT){
+        XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * index.unitSize);
+    }
+    else if(index.dataType == X_FLOAT || index.dataType == X_DOUBLE){
+        DTYPE * tmp = new DTYPE[indexSize];
+        XMemCopy(tmp, -1, index.data, index.devID, indexSize * index.unitSize);
+        for(int i = 0; i < indexSize; i++)
+            srcIndex[i] = (int)tmp[i];
+        delete[] tmp;
+    }
+
+    XTensor tensor;
+    tensor = Gather(s, 0, srcIndex, indexSize);
+    delete[] srcIndex;
+
+    if(index.order > 1){
+        int * dims = new int[index.order + 1];
+        memcpy(dims, index.dimSize, index.order * sizeof(int));
+        dims[index.order] = tensor.GetDim(-1);
+
+        XTensor t;
+        t = Reshape(tensor, index.order + 1, dims);
+        delete[] dims;
+
+        return t;
+    }
+    else{
+        return tensor;
+    }
+}
+
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
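Semantically, the new Gather(s, index) picks whole rows of a 2D source by index and then views the result with the index tensor's shape plus one trailing dimension, which is what an embedding lookup needs. A standalone sketch in plain C++ (GatherRows is a stand-in, not the library API):

// gather rows of a (rows x cols) row-major matrix by index
#include <vector>
#include <cstdio>

std::vector<float> GatherRows(const std::vector<float> &src, int cols,
                              const std::vector<int> &index)
{
    std::vector<float> out;
    out.reserve(index.size() * cols);
    for (int id : index)
        for (int c = 0; c < cols; c++)
            out.push_back(src[id * cols + c]);
    return out;
}

int main()
{
    /* a 3 x 2 source matrix */
    std::vector<float> s = {0, 1,  10, 11,  20, 21};
    std::vector<int> index = {2, 0, 2};             /* index shape (3,) */

    std::vector<float> t = GatherRows(s, 2, index);
    /* result shape is (3, 2): index shape + trailing feature dimension */
    for (size_t i = 0; i < t.size(); i++)
        printf("%g%c", t[i], (i % 2 == 1) ? '\n' : ' ');
    return 0;
}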
source/tensor/core/movement/Gather.h
View file @ 03a9836e
...
@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
 make a new tensor to keep the result and return it */
 XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);

+/* gather selected sub-tensors (return a XTensor structure)
+   make a new tensor to keep the result and return it */
+XTensor Gather(const XTensor &s, const XTensor &index);
+
 } // namespace nts(NiuTrans.Tensor)

 #endif // __GATHER_H__
\ No newline at end of file
source/tensor/core/reduce/ReduceSum.cpp
View file @ 03a9836e
...
@@ -16,8 +16,8 @@
 */

 /*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

 #include <math.h>
 #include "ReduceSum.h"
...
source/tensor/core/reduce/ReduceSum.cu
View file @ 03a9836e
...
@@ -105,15 +105,15 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
     __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
     __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

-    int idx = threadIdx.x * blockDim.y + threadIdx.y;
-    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
-    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
+    int idx = threadIdx.y * blockDim.x + threadIdx.x;
+    unsigned int i = blockIdx.y*blockDim.y + threadIdx.y;
+    unsigned int j = blockIdx.x*blockDim.x + threadIdx.x;

     if(i >= stride * blockNum)
         return;

-    if(threadIdx.y == 0)
-        bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
+    if(threadIdx.x == 0)
+        bias[threadIdx.y] = shift != NULL ? shift[i] : 0;

     __syncthreads();
...
@@ -121,7 +121,7 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
     int iOffset = i % stride;
     bool isValid = (i < stride * blockNum && j < strideNum);

-    DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.x] : 0;
+    DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.y] : 0;

     if(power != (DTYPE)1.0){
         if(power == (DTYPE)2.0)
...
@@ -136,21 +136,20 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
             value = exp(value);

     /* load data into the shared mem */
-    iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
+    iData[threadIdx.y * blockDim.x + threadIdx.x] = value;

     __syncthreads();

     /* do reduction in shared mem */
-    for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){
-        if (threadIdx.y < s)
+    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
+        if (threadIdx.x < s)
             iData[idx] += iData[idx + s];
         __syncthreads();
     }

     /* write result for this block to the output array */
-    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
-        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
+    if (threadIdx.x == 0 && blockIdx.x < reducedStrideNum)
+        output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = iData[threadIdx.y * blockDim.x];
 }
...
@@ -282,15 +281,15 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
     __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
     __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

-    unsigned int tid = threadIdx.y;
-    unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int tid = threadIdx.x;
+    unsigned int j = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
+    unsigned int i = blockIdx.y * blockDim.y + threadIdx.y;

     if(i >= stride * blockNum)
         return;

-    if (threadIdx.y == 0)
-        bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
+    if (threadIdx.x == 0)
+        bias[threadIdx.y] = shift != NULL ? shift[i] : 0;

     __syncthreads();
...
@@ -299,17 +298,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
     int iOffset = i % stride;
     bool isValid = j < strideNum;
-    bool isValid2 = j + blockDim.y < strideNum;
+    bool isValid2 = j + blockDim.x < strideNum;

-    DTYPE * data = iData + threadIdx.x * blockDim.y;
+    DTYPE * data = iData + threadIdx.y * blockDim.x;
     DTYPE * inputData = input + k * blockSize;
-    DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.x]: 0;
-    DTYPE value2 = isValid2 ? inputData[(j + blockDim.y) * stride + iOffset] - bias[threadIdx.x]: 0;
+    DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.y]: 0;
+    DTYPE value2 = isValid2 ? inputData[(j + blockDim.x) * stride + iOffset] - bias[threadIdx.y]: 0;

     if(power != (DTYPE)1.0){
         if(power == (DTYPE)2.0){
             value = value * value;
-            value2 = value2 *value2;
+            value2 = value2 * value2;
         }
         else if(power == (DTYPE)0.5){
             value = sqrt(value);
...
@@ -329,17 +328,25 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
     }

     value = value + value2;

     __syncthreads();

     value = shflDownReduceSum(value);
-    if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
+    if ((tid & 0x1f) == 0)
+        data[tid / 32] = value;

     __syncthreads();

     if (tid < 32){
-        if (tid < blockDim.y / 32)
+        if (tid < blockDim.x / 32)
             value = data[tid];
-        else value = 0;
-        value = shflDownReduceSum(value);
-        if (tid == 0 && blockIdx.y < reducedStrideNum)
-            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
+        else
+            value = 0;
+        value = shflDownReduceSum(value);
+        if (tid == 0 && blockIdx.x < reducedStrideNum) {
+            output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = value;
+        }
     }
 }
...
@@ -480,7 +487,7 @@ void KernelReduceSumFast(__half * input, __half * output,
 if data storage is discontinuous, use this way to reduce
 */
 __global__
 void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
                                          int blockNum, DTYPE * shift, DTYPE power, bool isExp)
 {
     __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
...
@@ -568,7 +575,8 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output,
     if (tid < 32){
         if (tid < blockDim.y / 32)
             threadSum = data[tid];
-        else threadSum = 0;
+        else
+            threadSum = 0;
         threadSum = shflDownReduceSum(threadSum);
         if (tid == 0 && blockIdx.y < reducedStrideNum)
             output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
...
@@ -640,29 +648,28 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
 /*
 this situation we use block.x * grid.x deal one vector for continuous read
 */
-inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
+void discontinuousStorageNoShareMemThreadAllocation(dim3* grid, dim3* block, int stride, int blockNum)
 {
-    block.x = 512;
-    block.y = 1;
+    block->x = 512;
+    block->y = 1;
     if ((stride * blockNum) % 512 == 0)
-        grid.x = (stride * blockNum) / 512;
+        grid->x = (stride * blockNum) / 512;
     else
-        grid.x = (stride * blockNum) / 512 + 1;
-    grid.y = 1;
+        grid->x = (stride * blockNum) / 512 + 1;
+    grid->y = 1;
 }

 /*
 adjust threads.x number then we can use warp optimization
 */
-inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
+void adjustThreadForUseWarpOptimization(dim3* blocks, dim3* threads)
 {
-    if (threads.x > 1){
-        blocks.x *= threads.x;
-        threads.x = 1;
+    if (threads->y > 1){
+        blocks->y *= threads->y;
+        threads->y = 1;
     }
-    if (threads.y < 32)
-        threads.y = 32;
+    if (threads->x < 32)
+        threads->x = 32;
 }
...
@@ -724,7 +731,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     DTYPE * buf1 = buf;
     DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
     DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;

     int devIDBackup;
     ProtectCudaDev(input->devID, devIDBackup);
...
@@ -733,19 +740,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
             dim3 blocks;
             continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
             if (blocks.y >= 128)
-                KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
+                KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
+                                                       strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
             else {
-                if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
-                else blockNum = blockNum / 4;
-                KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
+                if (blockNum % 4 != 0)
+                    blockNum = (int)(blockNum / 4) + 1;
+                else
+                    blockNum = blockNum / 4;
+                KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data,
+                                                                 strideNum, blockNum, sp, power, isExp);
             }
         }
         else if (stride != 1 && stride * blockNum > 4096){
             //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
             //unsigned int* goutput = (unsigned int *)input->data;
-            //convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
+            //convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
             dim3 grid, block;
-            discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
+            discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
             KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
                                                                    strideNum, blockNum,sp, power, isExp);
         }
...
@@ -769,50 +780,50 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
         /* unroll the reduction procedure. The code is messy but it is faster. */
         if (strideNum <= 32) {
             GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-            dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+            dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
             if (cudaGridSize[0] == 1)
                 oData = (DTYPE*)output->data;
-            KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
+            KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
                                                    blockSize, blockNum, sp, power, isExp);
         }
         else if (strideNum < 128) {
             GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-            dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+            dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
             if (cudaGridSize[0] == 1)
                 oData = (DTYPE*)output->data;
             CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
-            adjustThreadForUseWarpOptimization(blocks, threads);
-            KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
+            adjustThreadForUseWarpOptimization(&blocks, &threads);
+            KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
                                                            blockSize, blockNum, sp, power, isExp);
         }
         else if (strideNum < 256) {
             GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-            dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+            dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
             if (cudaGridSize[0] == 1)
                 oData = (DTYPE*)output->data;
             CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
-            adjustThreadForUseWarpOptimization(blocks, threads);
-            KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
+            adjustThreadForUseWarpOptimization(&blocks, &threads);
+            KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
                                                             blockSize, blockNum, sp, power, isExp);
         }
         else if (strideNum < 512) {
             GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-            dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+            dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
             if (cudaGridSize[0] == 1)
                 oData = (DTYPE*)output->data;
             CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
-            adjustThreadForUseWarpOptimization(blocks, threads);
-            KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
+            adjustThreadForUseWarpOptimization(&blocks, &threads);
+            KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
                                                             blockSize, blockNum, sp, power, isExp);
         }
         else {
             GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-            dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+            dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
             if (cudaGridSize[0] == 1)
                 oData = (DTYPE*)output->data;
             CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
-            adjustThreadForUseWarpOptimization(blocks, threads);
-            KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
+            adjustThreadForUseWarpOptimization(&blocks, &threads);
+            KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
                                                             blockSize, blockNum, sp, power, isExp);
         }
     }
...
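These hunks move the reduction axis from threadIdx.y to threadIdx.x so consecutive threads read consecutive elements and warp shuffles apply along the reduced axis. A minimal standalone CUDA sketch (not the library kernel) of the underlying warp-shuffle pattern, where lane 0 ends up with the warp's sum:

// warp-level sum via __shfl_down_sync, reducing along threadIdx.x
#include <cstdio>
#include <cuda_runtime.h>

__inline__ __device__ float WarpReduceSum(float val)
{
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

__global__ void SumKernel(const float * in, float * out)
{
    float v = in[threadIdx.x];
    v = WarpReduceSum(v);
    if (threadIdx.x == 0)   /* lane 0 holds the warp's sum */
        *out = v;
}

int main()
{
    float h[32], *dIn, *dOut, sum;
    for (int i = 0; i < 32; i++) h[i] = 1.0f;
    cudaMalloc(&dIn, 32 * sizeof(float));
    cudaMalloc(&dOut, sizeof(float));
    cudaMemcpy(dIn, h, 32 * sizeof(float), cudaMemcpyHostToDevice);
    SumKernel<<<1, 32>>>(dIn, dOut);
    cudaMemcpy(&sum, dOut, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sum = %g\n", sum);   /* expect 32 */
    cudaFree(dIn); cudaFree(dOut);
    return 0;
}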
source/tensor/core/reduce/ReduceSumAll.cpp
View file @ 03a9836e
...
@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
 >> source - the input tensor
 << return - the total summation
 */
-DTYPE _ReduceSumAll(XTensor * source)
+DTYPE _ReduceSumAll(const XTensor * source)
 {
     int order = source->order;
     DTYPE summation;

     XTensor * big = NewTensor(source);
     _CopyValues(source, big);
-    for(int i = 0; i < order; i++) {
-        if(i == 0)
-            big->Reshape(1, big->unitNum);
+    for(int i = order - 1; i >= 0; i--) {
+        if(i == order - 1)
+            big->Reshape(big->unitNum, 1);
+
+        int leadingDim = big->order - 1;
         int * dimSize;
-        dimSize = getDimSize(big, 0);
+        dimSize = getDimSize(big, leadingDim);
         XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType,
                                      source->denseRatio, source->devID, source->mem);
-        _ReduceSum(big, little, 0);
+        _ReduceSum(big, little, leadingDim);

         delete big;
         delete dimSize;
...
@@ -81,7 +82,7 @@ sum all the items of the tensor
 >> source - the input tensor
 << return - the total summation
 */
-DTYPE ReduceSumAll(XTensor & source)
+DTYPE ReduceSumAll(const XTensor & source)
 {
     return _ReduceSumAll(&source);
 }
...
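After this change, _ReduceSumAll folds the tensor by reducing its trailing dimension once per axis until a scalar remains, instead of repeatedly reducing dimension 0. A standalone sketch in plain C++ (ReduceLastDim is a stand-in for one reduction step over the last axis):

// summing everything by reducing the trailing axis repeatedly
#include <vector>
#include <cstdio>

/* reduce the last axis of a (rows x cols) row-major buffer */
std::vector<float> ReduceLastDim(const std::vector<float> &in, int rows, int cols)
{
    std::vector<float> out(rows, 0.0f);
    for (int r = 0; r < rows; r++)
        for (int c = 0; c < cols; c++)
            out[r] += in[r * cols + c];
    return out;
}

int main()
{
    std::vector<float> data = {1, 2, 3, 4, 5, 6};              /* a 2 x 3 tensor */

    std::vector<float> rowSums = ReduceLastDim(data, 2, 3);    /* -> {6, 15}      */
    std::vector<float> total   = ReduceLastDim(rowSums, 1, 2); /* -> {21}         */

    printf("sum = %g\n", total[0]);                            /* expect 21 */
    return 0;
}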
source/tensor/core/reduce/ReduceSumAll.h
View file @ 03a9836e
...
@@ -28,10 +28,10 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* sum all the items of the tensor */
-DTYPE _ReduceSumAll(XTensor * source);
+DTYPE _ReduceSumAll(const XTensor * source);

 /* sum all the items of the tensor */
-DTYPE ReduceSumAll(XTensor & source);
+DTYPE ReduceSumAll(const XTensor & source);

 } // namespace nts(NiuTrans.Tensor)
...
source/tensor/function/CrossEntropy.cpp
查看文件 @
03a9836e
...
@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
...
@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
const
XTensor
*
padding
,
int
leadingDim
)
const
XTensor
*
padding
,
int
leadingDim
)
{
{
int
n
=
leadingDim
<
0
?
output
->
order
-
1
:
leadingDim
;
int
n
=
leadingDim
<
0
?
output
->
order
-
1
:
leadingDim
;
CheckNTErrors
(
n
>=
0
&&
n
<
output
->
order
,
"Wrong leadingDim!"
);
int
unitNum
=
output
->
dimSize
[
n
];
int
unitNum
=
output
->
dimSize
[
n
];
CheckNTErrors
(
n
>=
0
&&
n
<
output
->
order
,
"Wrong leadingDim!"
);
CheckNTErrors
(
XTensor
::
IsSameShaped
(
output
,
gold
),
CheckNTErrors
(
XTensor
::
IsSameShaped
(
output
,
gold
),
"The output tensor and gold tensor must be of the same size!"
);
"The output tensor and gold tensor must be of the same size!"
);
CheckNTErrors
(
weight
==
NULL
||
weight
->
unitNum
==
unitNum
,
"Wrong weight tensor!"
);
CheckNTErrors
(
weight
==
NULL
||
weight
->
unitNum
==
unitNum
,
"Wrong weight tensor!"
);
CheckNTErrors
(
padding
==
NULL
||
XTensor
::
IsSameShaped
(
padding
,
loss
),
"The loss tensor and padding tensor must be same shape!"
);
CheckNTErrors
(
padding
==
NULL
||
XTensor
::
IsSameShaped
(
padding
,
loss
),
"The loss tensor and padding tensor must be same shape!"
);
CheckNTErrors
(
loss
->
order
==
output
->
order
-
1
,
"Wrong loss dimension!"
);
CheckNTErrors
(
loss
->
order
==
output
->
order
-
1
,
"Wrong loss dimension!"
);
CheckNTErrors
(
gold
->
dataType
==
DEFAULT_DTYPE
&&
output
->
dataType
==
DEFAULT_DTYPE
,
"TODO!"
);
CheckNTErrors
(
gold
->
dataType
==
DEFAULT_DTYPE
&&
output
->
dataType
==
DEFAULT_DTYPE
,
"TODO!"
);
XTensor
*
logBuf
=
NewTensorBuf
(
output
,
output
->
devID
,
output
->
mem
);
XTensor
*
interBuf1
=
NewTensorBuf
(
output
,
output
->
devID
,
output
->
mem
);
XTensor
*
mulBuf
=
NewTensorBuf
(
output
,
output
->
devID
,
output
->
mem
);
XTensor
*
interBuf2
=
NewTensorBuf
(
output
,
output
->
devID
,
output
->
mem
);
/* l = log(output) */
_Log
(
output
,
logBuf
);
if
(
weight
!=
NULL
){
XTensor
*
weightBuf
=
NewTensorBuf
(
output
,
output
->
devID
,
output
->
mem
);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */
_MultiplyDim
(
gold
,
weight
,
weightBuf
,
n
,
0
);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply
(
weightBuf
,
logBuf
,
mulBuf
,
0
);
DelTensorBuf
(
weightBuf
);
}
else
{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply
(
gold
,
logBuf
,
mulBuf
,
0
);
}
/* negate result n = negate(mul) */
_NegateMe
(
mulBuf
);
_ReduceSum
(
mulBuf
,
loss
,
n
);
_Log
(
output
,
interBuf1
);
_Multiply
(
gold
,
interBuf1
,
interBuf2
);
if
(
weight
!=
NULL
)
_MultiplyDimMe
(
interBuf2
,
weight
,
n
);
_NegateMe
(
interBuf2
);
_ReduceSum
(
interBuf2
,
loss
,
n
);
DelTensorBuf
(
mulBuf
);
if
(
padding
!=
NULL
)
DelTensorBuf
(
logBuf
);
_MultiplyMe
(
loss
,
padding
);
DelTensorBuf
(
interBuf2
);
DelTensorBuf
(
interBuf1
);
}
}
/*
...
@@ -109,19 +96,12 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        XTensor * loss, const XTensor * weight,
                        const XTensor * padding, int leadingDim)
{
-#ifdef USE_CUDA
-    if(output->devID >= 0) {
-        _CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
-        return;
-    }
-#endif
-
    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
    int leadingDimSize = output->GetDim(n);

    CheckNTErrors(n >= 0 && n < output->order,
-                  "Wrong leadingDim!");
+                  "Wrong leading dimension!");
    CheckNTErrors(XTensor::IsSameShaped(output, gold),
                  "The output tensor and gold tensor must be of the same size!");
    CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
...
@@ -133,6 +113,22 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
                  "TODO!");

+    for(int i = 0; i < order; i++){
+        if(i < n){
+            CheckNTErrors((output->GetDim(i) == loss->GetDim(i)), "Unmatched tensors!");
+        }
+        else if(i > n){
+            CheckNTErrors((output->GetDim(i) == loss->GetDim(i - 1)), "Unmatched tensors!");
+        }
+    }
+
+#ifdef USE_CUDA
+    if(output->devID >= 0) {
+        _CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
+        return;
+    }
+#endif
+
    int blockNum = 1;
    int blockSize = 1;
    int stride = 1;
...
@@ -148,31 +144,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    DTYPE * lossData = (DTYPE*)loss->data;
    DTYPE tmpLoss;
+    int lossPos;
+    int goldPos;

    if(weight == NULL) {
        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                tmpLoss = 0;
-                for(int j = 0; j < blockSize; j++)
-                    tmpLoss += -(*(goldData + beg + j)) *
-                                (DTYPE)log(*(outputData + beg + j));
-                *(lossData + i) = tmpLoss;
+                for(int j = 0; j < stride; j++) {
+                    tmpLoss = 0;
+                    lossPos = i * stride + j;
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        tmpLoss += -(*(goldData + goldPos)) *
+                                    (DTYPE)log(*(outputData + goldPos));
+                    }
+                    *(lossData + lossPos) = tmpLoss;
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                if(*(paddingData + i) == 0)
-                    *(lossData + i) = 0;
-                else{
-                    tmpLoss = 0;
-                    for(int j = 0; j < blockSize; j++)
-                        tmpLoss += -(*(goldData + beg + j)) *
-                                    (DTYPE)log(*(outputData + beg + j));
-                    *(lossData + i) = tmpLoss;
-                }
+                for(int j = 0; j < stride; j++) {
+                    lossPos = i * stride + j;
+                    if(*(paddingData + lossPos) == 0)
+                        *(lossData + lossPos) = 0;
+                    else{
+                        tmpLoss = 0;
+                        for(int k = 0; k < leadingDimSize; k++) {
+                            goldPos = i * blockSize + j + k * stride;
+                            tmpLoss += -(*(goldData + goldPos)) *
+                                        (DTYPE)log(*(outputData + goldPos));
+                        }
+                        *(lossData + lossPos) = tmpLoss;
+                    }
+                }
            }
        }
    }
...
@@ -181,30 +186,36 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
        DTYPE * weightData = (DTYPE*)weight->data;

        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                tmpLoss = 0;
-                for(int j = 0; j < blockSize; j++)
-                    tmpLoss += -(*(goldData + beg + j)) *
-                                (DTYPE)log(*(outputData + beg + j)) *
-                                (*(weightData + j));
-                *(lossData + i) = tmpLoss;
+                for(int j = 0; j < stride; j++) {
+                    tmpLoss = 0;
+                    lossPos = i * stride + j;
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        tmpLoss += -(*(goldData + goldPos)) *
+                                    (DTYPE)log(*(outputData + goldPos)) *
+                                    (*(weightData + k));
+                    }
+                    *(lossData + lossPos) = tmpLoss;
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                if(*(paddingData + i) == 0)
-                    *(lossData + i) = 0;
-                else{
-                    tmpLoss = 0;
-                    for(int j = 0; j < blockSize; j++)
-                        tmpLoss += -(*(goldData + beg + j)) *
-                                    (DTYPE)log(*(outputData + beg + j)) *
-                                    (*(weightData + j));
-                    *(lossData + i) = tmpLoss;
-                }
+                for(int j = 0; j < stride; j++) {
+                    lossPos = i * stride + j;
+                    if(*(paddingData + lossPos) == 0)
+                        *(lossData + lossPos) = 0;
+                    else{
+                        tmpLoss = 0;
+                        for(int k = 0; k < leadingDimSize; k++) {
+                            goldPos = i * blockSize + j + k * stride;
+                            tmpLoss += -(*(goldData + goldPos)) *
+                                        (DTYPE)log(*(outputData + goldPos)) *
+                                        (*(weightData + k));
+                        }
+                        *(lossData + lossPos) = tmpLoss;
+                    }
+                }
            }
        }
    }
...
@@ -212,26 +223,6 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
}

-/*
-get the dimSize after reduce operation
->> tensor - a tensor to be reduced
->> n - the reduce dimension
-<< return - the pointer of dimSize
-*/
-int * reduceDimSize(const XTensor * tensor, int n)
-{
-    int order = tensor->order;
-    int * dimSize = new int[order - 1];
-    for(int i = 0; i < order; i++) {
-        if(i < n)
-            dimSize[i] = tensor->dimSize[i];
-        else if(i > n)
-            dimSize[i - 1] = tensor->dimSize[i];
-    }
-
-    return dimSize;
-}
-
/*
compute the cross entropy loss
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
...
@@ -247,73 +238,45 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
                    LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
                    const XTensor * padding, int leadingDim)
{
-    DTYPE loss = 0;
+    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
-    CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
    int unitNum = output->dimSize[n];
+    CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
    CheckNTErrors(XTensor::IsSameShaped(output, gold),
                  "The output tensor and gold tensor must be of the same size!");
    CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
    CheckNTErrors(padding == NULL || padding->order == output->order - 1,
                  "The loss tensor and padding tensor must be same shape!");
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
                  "TODO!");

-    XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
-    XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
-
-    /* l = log(output) */
-    _Log(output, logBuf);
-
-    if(weight != NULL){
-        XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
-
-        /* multiply gold with weight by broadcast wg = mulDim(g * w) */
-        _MultiplyDim(gold, weight, weightBuf, n, 0);
-
-        /* multiply weighted gold with log(output) wgl = mul(wg, l) */
-        _Multiply(weightBuf, logBuf, mulBuf, 0);
-
-        DelTensorBuf(weightBuf);
-    }
-    else{
-        /* multiply gold with log(output) gl = mul(g, l) */
-        _Multiply(gold, logBuf, mulBuf, 0);
-    }
-
-    /* negate multiply result n = negate(mul) */
-    _NegateMe(mulBuf);
-
-    int * dimSize;
-    dimSize = reduceDimSize(output, n);
-    XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType,
-                                    output->denseRatio, output->devID, output->mem);
-
-    /* reduce sum all classes */
-    _ReduceSum(mulBuf, lossInter, n);
-
-    DelTensorBuf(mulBuf);
-    DelTensorBuf(logBuf);
-
-    if(padding != NULL) {
-        XTensor * temp = NewTensor(lossInter);
-        _Multiply(lossInter, padding, temp);
-        loss = _ReduceSumAll(temp);
-        delete temp;
-    }
-    else
-        loss = _ReduceSumAll(lossInter);
+    int * dimSize = new int[order - 1];
+    for(int i = 0; i < order; i++) {
+        if(i < n)
+            dimSize[i] = output->dimSize[i];
+        else if(i > n)
+            dimSize[i - 1] = output->dimSize[i];
+    }
+
+    XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
+                                     output->devID, output->mem);
+
+    _CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
+
+    DTYPE loss;
+    /* compute the total loss */
+    loss = _ReduceSumAll(lossBuf);

    if(reduceWay == REDUCE_MEAN) {
        int nonZeroNum;
        if(padding == NULL) {
-            nonZeroNum = lossInter->unitNum;
+            nonZeroNum = lossBuf->unitNum;
        }
        else {
-            XTensor * tmp = NewTensor(padding);
+            XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
            _IsNonZero(padding, tmp);
            nonZeroNum = (int)_ReduceSumAll(tmp);
-            delete tmp;
+            DelTensorBuf(tmp);
        }
        loss = loss / (DTYPE)nonZeroNum;
...
@@ -326,7 +289,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
    }

    delete[] dimSize;
-    delete lossInter;
+    DelTensorBuf(lossBuf);

    return loss;
}
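A hypothetical call site for the reworked scalar interface, written in the style of the test files later in this commit; the shapes and values below are made up purely for illustration:

/* Illustrative usage only (not from the commit): two (4, 10) tensors,
   summed loss over all rows, no class weights, no padding mask. */
int dimSize[2] = {4, 10};
XTensor * output = NewTensor(2, dimSize);
XTensor * gold = NewTensor(2, dimSize);

/* ... fill output with per-row probability distributions and gold with
   one-hot rows before calling ... */

DTYPE loss = _CrossEntropy(output, gold, REDUCE_SUM, NULL, NULL, -1);

delete output;
delete gold;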
...
@@ -349,11 +312,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
                        const XTensor * padding, int leadingDim)
{
-#ifdef USE_CUDA
-    if(output->devID >= 0) {
-        return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
-    }
-#endif
+    DTYPE loss = 0;

    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
...
@@ -370,6 +329,23 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
                  "TODO!");

+    if(padding != NULL) {
+        for(int i = 0; i < order; i++){
+            if(i < n){
+                CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
+            }
+            else if(i > n){
+                CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
+            }
+        }
+    }
+
+#ifdef USE_CUDA
+    if(output->devID >= 0) {
+        return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
+    }
+#endif
+
    int blockNum = 1;
    int blockSize = 1;
    int stride = 1;
...
@@ -383,63 +359,78 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    DTYPE * outputData = (DTYPE*)output->data;
    DTYPE * goldData = (DTYPE*)gold->data;
-    DTYPE loss = 0;
+    int paddingPos;
+    int goldPos;
    int nonZeroNum = 0;

    if(weight == NULL) {
        if(padding == NULL) {
-            nonZeroNum = blockNum;
+            nonZeroNum = blockNum * stride;

            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                for(int j = 0; j < blockSize; j++)
-                    loss += -(*(goldData + beg + j)) *
-                             (DTYPE)log(*(outputData + beg + j));
+                for(int j = 0; j < stride; j++) {
+                    paddingPos = i * stride + j;
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        loss += -(*(goldData + goldPos)) *
+                                 (DTYPE)log(*(outputData + goldPos));
+                    }
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                if(*(paddingData + i) == 0)
-                    continue;
-                else {
-                    nonZeroNum += 1;
-                    int beg = i * blockSize;
-                    for(int j = 0; j < blockSize; j++)
-                        loss += -(*(goldData + beg + j)) *
-                                 (DTYPE)log(*(outputData + beg + j));
-                }
+                for(int j = 0; j < stride; j++) {
+                    paddingPos = i * stride + j;
+                    if(*(paddingData + paddingPos) == 0)
+                        continue;
+                    else {
+                        nonZeroNum += 1;
+                        for(int k = 0; k < leadingDimSize; k++) {
+                            goldPos = i * blockSize + j + k * stride;
+                            loss += -(*(goldData + goldPos)) *
+                                     (DTYPE)log(*(outputData + goldPos));
+                        }
+                    }
+                }
            }
        }
    }
    else {
        DTYPE * weightData = (DTYPE*)weight->data;
        if(padding == NULL) {
-            nonZeroNum = blockNum;
+            nonZeroNum = blockNum * stride;

            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                for(int j = 0; j < blockSize; j++)
-                    loss += -(*(goldData + beg + j)) *
-                             (DTYPE)log(*(outputData + beg + j)) *
-                             (*(weightData + j));
+                for(int j = 0; j < stride; j++) {
+                    paddingPos = i * stride + j;
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        loss += -(*(goldData + goldPos)) *
+                                 (DTYPE)log(*(outputData + goldPos)) *
+                                 (*(weightData + k));
+                    }
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                if(*(paddingData + i) == 0)
-                    continue;
-                else {
-                    nonZeroNum += 1;
-                    int beg = i * blockSize;
-                    for(int j = 0; j < blockSize; j++)
-                        loss += -(*(goldData + beg + j)) *
-                                 (DTYPE)log(*(outputData + beg + j)) *
-                                 (*(weightData + j));
-                }
+                for(int j = 0; j < stride; j++) {
+                    paddingPos = i * stride + j;
+                    if(*(paddingData + paddingPos) == 0)
+                        continue;
+                    else {
+                        nonZeroNum += 1;
+                        for(int k = 0; k < leadingDimSize; k++) {
+                            goldPos = i * blockSize + j + k * stride;
+                            loss += -(*(goldData + goldPos)) *
+                                     (DTYPE)log(*(outputData + goldPos)) *
+                                     (*(weightData + j));
+                        }
+                    }
+                }
            }
        }
    }
...
@@ -471,17 +462,10 @@ with respect to gold standard, and y this the model output
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
-void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
-                           const XTensor * weight, const XTensor * padding,
-                           int leadingDim)
+void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
+                           const XTensor * gold, const XTensor * weight,
+                           XTensor * padding, int leadingDim)
{
-#ifdef USE_CUDA
-    if(output->devID >= 0) {
-        _CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
-        return;
-    }
-#endif
-
    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
    int leadingDimSize = output->GetDim(n);
...
@@ -497,7 +481,26 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
                  "Wrong padding tensor!");
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
                  "TODO!");

+    if(padding != NULL) {
+        for(int i = 0; i < order; i++){
+            if(i < n){
+                CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
+            }
+            else if(i > n){
+                CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
+            }
+        }
+    }
+
+#ifdef USE_CUDA
+    if(output->devID >= 0) {
+        _CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
+        return;
+    }
+#endif
+
    int blockNum = 1;
    int blockSize = 1;
    int stride = 1;
...
@@ -512,25 +515,35 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
    DTYPE * outputData = (DTYPE*)output->data;
    DTYPE * goldData = (DTYPE*)gold->data;
+    int paddingPos;
+    int goldPos;

    if(weight == NULL) {
        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                for(int j = 0; j < blockSize; j++)
-                    *(dedyData + beg + j) = -(*(goldData + beg + j)) /
-                                             (*(outputData + beg + j));
+                for(int j = 0; j < stride; j++) {
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        *(dedyData + goldPos) = -(*(goldData + goldPos)) /
+                                                 (*(outputData + goldPos));
+                    }
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                if(*(paddingData + i) == 0)
-                    memset(dedyData + beg, 0, blockSize * unitSize);
-                else
-                    for(int j = 0; j < blockSize; j++)
-                        *(dedyData + beg + j) = -(*(goldData + beg + j)) /
-                                                 (*(outputData + beg + j));
+                for(int j = 0; j < stride; j++) {
+                    paddingPos = i * stride + j;
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        if(*(paddingData + paddingPos) == 0)
+                            *(dedyData + goldPos) = 0;
+                        else
+                            *(dedyData + goldPos) = -(*(goldData + goldPos)) /
+                                                     (*(outputData + goldPos));
+                    }
+                }
            }
        }
    }
...
@@ -538,39 +551,45 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
        DTYPE * weightData = (DTYPE*)weight->data;
        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                for(int j = 0; j < blockSize; j++)
-                    *(dedyData + beg + j) = -(*(weightData + j)) *
-                                             (*(goldData + beg + j)) /
-                                             (*(outputData + beg + j));
+                for(int j = 0; j < stride; j++) {
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        *(dedyData + goldPos) = -(*(weightData + k)) *
+                                                 (*(goldData + goldPos)) /
+                                                 (*(outputData + goldPos));
+                    }
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                if(*(paddingData + i) == 0)
-                    memset(dedyData + beg, 0, blockSize * unitSize);
-                else
-                    for(int j = 0; j < blockSize; j++)
-                        *(dedyData + beg + j) = -(*(weightData + j)) *
-                                                 (*(goldData + beg + j)) /
-                                                 (*(outputData + beg + j));
+                for(int j = 0; j < stride; j++) {
+                    paddingPos = i * stride + j;
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        if(*(paddingData + paddingPos) == 0)
+                            *(dedyData + goldPos) = 0;
+                        else
+                            *(dedyData + goldPos) = -(*(weightData + k)) *
+                                                     (*(goldData + goldPos)) /
+                                                     (*(outputData + goldPos));
+                    }
+                }
            }
        }
    }

-    if(padding != NULL) {
-        XTensor * tmp = NewTensor(padding);
-        _IsNonZero(padding, tmp);
-        int nonZeroNum = (int)_ReduceSumAll(tmp);
-        _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
-        delete tmp;
-    }
-    else {
-        _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
-    }
+    //if(padding != NULL) {
+    //    XTensor * tmp = NewTensor(padding);
+    //    _IsNonZero(padding, tmp);
+    //    int nonZeroNum = (int)_ReduceSumAll(tmp);
+    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
+    //    delete tmp;
+    //}
+    //else {
+    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
+    //}
}

} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
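The backward pass above fills dedy_i = -gold_i / output_i, which is the derivative of -gold_i * log(output_i) with respect to output_i. A tiny self-contained finite-difference check of that identity (illustrative code, not part of the library or the commit):

/* Finite-difference check of d/dy [-g * log(y)] = -g / y
   (standalone sketch only). */
#include <cassert>
#include <cmath>

int main()
{
    double y = 0.25, g = 1.0, eps = 1e-6;
    double analytic = -g / y;
    double numeric = (-g * std::log(y + eps) + g * std::log(y)) / eps;
    assert(std::fabs(analytic - numeric) < 1e-3);
    return 0;
}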
source/tensor/function/CrossEntropy.cu View file @ 03a9836e
...
@@ -26,80 +26,20 @@
#include "../XDevice.h"
#include "CrossEntropy.cuh"
#include "CrossEntropy.h"
-#include "../core/reduce/ReduceSumAll.h"
+#include "../core/arithmetic/Div.h"
+#include "../core/arithmetic/Multiply.h"
+#include "../core/arithmetic/MultiplyDim.h"
+#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
+#include "../core/reduce/ReduceSum.h"
+#include "../core/reduce/ReduceSumAll.h"
+#include "../core/shape/Transpose.h"
+#include "../core/shape/Unsqueeze.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)

-/*
-compute the cross entropy loss (cuda kernel)
->> outputData - the data pointer of output tensor
->> goldData - the data pointer of gold tensor
->> lossData - the data pointer of loss tensor
->> weightData - the data pointer of weight tensor
->> paddingData - the data pointer of padding tensor
->> blockNum - the number of data blocks
->> stride - the size of a data block
-*/
-__global__
-void KernelCrossEntropy(DTYPE * outputData, DTYPE * goldData,
-                        DTYPE * lossData, DTYPE * weightData,
-                        DTYPE * paddingData, int blockNum, int blockSize)
-{
-    /* block id */
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if(i >= blockNum)
-        return;
-
-    int beg = i * blockSize;
-    DTYPE tmpLoss = 0;
-    if(weightData == NULL) {
-        if(paddingData == NULL) {
-            tmpLoss = 0;
-            for(int j = 0; j < blockSize; j++)
-                tmpLoss += -(*(goldData + beg + j)) *
-                            (DTYPE)log(*(outputData + beg + j));
-            *(lossData + i) = tmpLoss;
-        }
-        else {
-            if(*(paddingData + i) == 0)
-                *(lossData + i) = tmpLoss;
-            else{
-                for(int j = 0; j < blockSize; j++)
-                    tmpLoss += -(*(goldData + beg + j)) *
-                                (DTYPE)log(*(outputData + beg + j));
-                *(lossData + i) = tmpLoss;
-            }
-        }
-    }
-    else {
-        if(paddingData == NULL) {
-            for(int j = 0; j < blockSize; j++)
-                tmpLoss += -(*(goldData + beg + j)) *
-                            (DTYPE)log(*(outputData + beg + j)) *
-                            (*(weightData + j));
-            *(lossData + i) = tmpLoss;
-        }
-        else {
-            if(*(paddingData + i) == 0)
-                *(lossData + i) = tmpLoss;
-            else{
-                tmpLoss = 0;
-                for(int j = 0; j < blockSize; j++)
-                    tmpLoss += -(*(goldData + beg + j)) *
-                                (DTYPE)log(*(outputData + beg + j)) *
-                                (*(weightData + j));
-                *(lossData + i) = tmpLoss;
-            }
-        }
-    }
-}
-
/*
compute the cross entropy loss (cuda version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
...
@@ -112,79 +52,27 @@ where gold and output are distributions
>> leadingDim - the leading dimension for the output
*/
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
                           XTensor * loss, const XTensor * weight,
                           const XTensor * padding, int leadingDim)
{
-    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
-    int leadingDimSize = output->GetDim(n);
-
-    CheckNTErrors(n >= 0 && n < output->order,
-                  "Wrong leadingDim!");
-    CheckNTErrors(XTensor::IsSameShaped(output, gold),
-                  "The output tensor and gold tensor must be of the same size!");
-    CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
-                  "Wrong weight tensor!");
-    CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
-                  "The loss tensor and padding tensor must be same shape!");
-    CheckNTErrors(loss->order == output->order - 1,
-                  "Wrong loss dimension!");
-    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
-                  "TODO!");
-
-    int blockNum = 1;
-    int blockSize = 1;
-    int stride = 1;
-
-    for(int i = n + 1; i < order; i++)
-        stride *= output->GetDim(i);
-    blockSize = stride * leadingDimSize;
-    blockNum = output->unitNum / blockSize;
-
-    int cudaGrids[3];
-    int cudaBlocks[3];
-
-    //GDevs.GetCudaThread2D(output->devID, blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
-    GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
-
-    dim3 blocks(cudaGrids[0], cudaGrids[1]);
-    dim3 threads(cudaBlocks[0], cudaBlocks[1]);
-
-    int devIDBackup;
-    ProtectCudaDev(output->devID, devIDBackup);
-
-    DTYPE * outputData = (DTYPE*)output->data;
-    DTYPE * goldData = (DTYPE*)gold->data;
-    DTYPE * lossData = (DTYPE*)loss->data;
-
-    if(weight == NULL) {
-        if(padding == NULL)
-            KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                              (outputData, goldData, lossData,
-                               NULL, NULL,
-                               blockNum, blockSize);
-        else
-            KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                              (outputData, goldData, lossData,
-                               NULL, (DTYPE*)padding->data,
-                               blockNum, blockSize);
-    }
-    else {
-        if(padding == NULL)
-            KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                              (outputData, goldData, lossData,
-                               (DTYPE*)weight->data, NULL,
-                               blockNum, blockSize);
-        else
-            KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                              (outputData, goldData, lossData,
-                               (DTYPE*)weight->data, (DTYPE*)padding->data,
-                               blockNum, blockSize);
-    }
-
-    BacktoCudaDev(output->devID, devIDBackup);
+    XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
+    XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
+
+    _Log(output, interBuf1);
+    _Multiply(gold, interBuf1, interBuf2);
+
+    if(weight != NULL)
+        _MultiplyDimMe(interBuf2, weight, n);
+    _NegateMe(interBuf2);
+    _ReduceSum(interBuf2, loss, n);
+
+    if(padding != NULL)
+        _MultiplyMe(loss, padding);
+
+    DelTensorBuf(interBuf2);
+    DelTensorBuf(interBuf1);
}

/*
...
@@ -230,87 +118,38 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
        dimSize[i - 1] = output->dimSize[i];
    }

-    XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
+    XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
+                                     output->devID, output->mem);

-    _CudaCrossEntropyFast(output, gold, lossInter, weight, padding, leadingDim);
+    _CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);

-    loss = _ReduceSumAll(lossInter);
+    loss = _ReduceSumAll(lossBuf);

    if(reduceWay == REDUCE_MEAN) {
        int nonZeroNum;
        if(padding == NULL) {
-            nonZeroNum = lossInter->unitNum;
+            nonZeroNum = lossBuf->unitNum;
        }
        else {
-            XTensor * tmp = NewTensor(padding);
+            XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
            _IsNonZero(padding, tmp);
            nonZeroNum = (int)_ReduceSumAll(tmp);
-            delete tmp;
+            DelTensorBuf(tmp);
        }
        loss = loss / (DTYPE)nonZeroNum;
    }
-    else if(reduceWay == REDUCE_SUM) {
-        /* don't need to do anything */
-    }
-
-    return loss;
-}
-
-/*
-backward computation of cross entropy function (kernel version)
->> dedyData - the data pointer of dedy tensor
->> outputData - the data pointer of output tensor
->> goldData - the data pointer of gold tensor
->> weightData - the data pointer of weight tensor
->> paddingData - the data pointer of padding tensor
->> blockNum - the number of data blocks
->> blockSize - the size of a data block
-*/
-__global__
-void KernelCrossEntropyBackward(DTYPE * dedyData, DTYPE * outputData, DTYPE * goldData,
-                                DTYPE * weightData, DTYPE * paddingData,
-                                int blockNum, int blockSize)
-{
-    /* block id */
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if(i >= blockNum)
-        return;
-
-    int beg = i * blockSize;
-    if(weightData == NULL) {
-        if(paddingData == NULL) {
-            for(int j = 0; j < blockSize; j++)
-                *(dedyData + beg + j) = -(*(goldData + beg + j)) /
-                                         (*(outputData + beg + j));
-        }
-        else {
-            if(*(paddingData + i) == 0)
-                memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
-            else
-                for(int j = 0; j < blockSize; j++)
-                    *(dedyData + beg + j) = -(*(goldData + beg + j)) /
-                                             (*(outputData + beg + j));
-        }
-    }
-    else {
-        if(paddingData == NULL) {
-            for(int j = 0; j < blockSize; j++)
-                *(dedyData + beg + j) = -(*(weightData + j)) *
-                                         (*(goldData + beg + j)) /
-                                         (*(outputData + beg + j));
-        }
-        else {
-            if(*(paddingData + i) == 0)
-                memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
-            else
-                for(int j = 0; j < blockSize; j++) {
-                    *(dedyData + beg + j) = -(*(weightData + j)) *
-                                             (*(goldData + beg + j)) /
-                                             (*(outputData + beg + j));
-                }
-        }
-    }
+    else {
+        ShowNTErrors("TODO");
+    }
+
+    delete[] dimSize;
+    DelTensorBuf(lossBuf);
+
+    return loss;
}

/*
...
@@ -330,85 +169,43 @@ with respect to gold standard, and y this the model output
*/
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
                               const XTensor * gold, const XTensor * weight,
-                               const XTensor * padding, int leadingDim)
+                               XTensor * padding, int leadingDim)
{
-    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
-    int leadingDimSize = output->GetDim(n);
-
-    CheckNTErrors(n >= 0 && n < output->order,
-                  "Wrong leading dimension!");
-    CheckNTErrors(XTensor::IsSameShaped(dedy, output, gold),
-                  "The output tensor and gold tensor must be of the same size!");
-    CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
-                  "Wrong weight tensor!");
-    CheckNTErrors(padding == NULL || padding->order == output->order - 1,
-                  "Wrong padding tensor!");
-    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
-                  "TODO!");
-
-    int blockNum = 1;
-    int blockSize = 1;
-    int stride = 1;
-
-    for(int i = n + 1; i < order; i++)
-        stride *= output->GetDim(i);
-    blockSize = stride * leadingDimSize;
-    blockNum = output->unitNum / blockSize;
-
-    int cudaGrids[3];
-    int cudaBlocks[3];
-
-    GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
-
-    dim3 blocks(cudaGrids[0], cudaGrids[1]);
-    dim3 threads(cudaBlocks[0], cudaBlocks[1]);
-
-    int devIDBackup;
-    ProtectCudaDev(output->devID, devIDBackup);
-
-    DTYPE * dedyData = (DTYPE*)dedy->data;
-    DTYPE * outputData = (DTYPE*)output->data;
-    DTYPE * goldData = (DTYPE*)gold->data;
-
-    if(weight == NULL) {
-        if(padding == NULL)
-            KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                      (dedyData, outputData, goldData,
-                                       NULL, NULL,
-                                       blockNum, blockSize);
-        else
-            KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                      (dedyData, outputData, goldData,
-                                       NULL, (DTYPE*)padding->data,
-                                       blockNum, blockSize);
-    }
-    else {
-        if(padding == NULL)
-            KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                      (dedyData, outputData, goldData,
-                                       (DTYPE*)weight->data, NULL,
-                                       blockNum, blockSize);
-        else
-            KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                      (dedyData, outputData, goldData,
-                                       (DTYPE*)weight->data, (DTYPE*)padding->data,
-                                       blockNum, blockSize);
-    }
+    _Div(gold, output, dedy);
+    _NegateMe(dedy);
+
+    if(weight != NULL)
+        _MultiplyDimMe(dedy, weight, n);

    if(padding != NULL) {
-        XTensor * tmp = NewTensor(padding);
-        _IsNonZero(padding, tmp);
-        int nonZeroNum = (int)_ReduceSumAll(tmp);
-        _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
-        delete tmp;
-    }
-    else {
-        _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
-    }
-
-    BacktoCudaDev(output->devID, devIDBackup);
+        int paddingOrder = padding->order;
+        int * paddingDims = new int[paddingOrder];
+        memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+        padding->Reshape(padding->unitNum);
+
+        int order = dedy->order;
+        int * dims = new int[order];
+        memcpy(dims, dedy->dimSize, dedy->order * sizeof(int));
+        dedy->Reshape(dedy->unitNum/dedy->GetDim(n), dedy->GetDim(n));
+        _MultiplyDimMe(dedy, padding, 0);
+
+        padding->Reshape(paddingOrder, paddingDims);
+        dedy->Reshape(order, dims);
+
+        delete[] paddingDims;
+        delete[] dims;
+    }
+
+    //if(padding != NULL) {
+    //    XTensor * tmp = NewTensor(padding);
+    //    _IsNonZero(padding, tmp);
+    //    int nonZeroNum = (int)_ReduceSumAll(tmp);
+    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
+    //    delete tmp;
+    //}
+    //else {
+    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
+    //}
}
...
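The padding branch above, and the same pattern added to LogSoftmax.cu and Softmax.cu below, flattens the gradient to shape (unitNum / dimN, dimN) so that a per-position padding vector can be broadcast with _MultiplyDimMe along dimension 0. In plain-array terms the effect is the following; this is a hypothetical standalone sketch, not the library kernel:

/* What Reshape + _MultiplyDimMe(dedy, padding, 0) amounts to: view the
   gradient as (rows, classNum) and zero out rows whose padding flag is 0.
   All names here are illustrative. */
void maskRowsByPadding(float * dedy, const float * padding,
                       int rows, int classNum)
{
    for(int i = 0; i < rows; i++)
        for(int k = 0; k < classNum; k++)
            dedy[i * classNum + k] *= padding[i];  /* padding[i] is 0 or 1 */
}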
source/tensor/function/CrossEntropy.cuh View file @ 03a9836e
...
@@ -40,7 +40,7 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
/* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
                               const XTensor * gold, const XTensor * weight = NULL,
-                               const XTensor * padding = NULL, int leadingDim = -1);
+                               XTensor * padding = NULL, int leadingDim = -1);

} // namespace nts(NiuTrans.Tensor)
...
source/tensor/function/CrossEntropy.h View file @ 03a9836e
...
@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        const XTensor * padding = NULL, int leadingDim = -1);

/* backward computation of cross entropy function */
-void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
-                           const XTensor * weight = NULL, const XTensor * padding = NULL,
-                           int leadingDim = -1);
+void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
+                           const XTensor * gold, const XTensor * weight = NULL,
+                           XTensor * padding = NULL, int leadingDim = -1);

} // namespace nts(NiuTrans.Tensor)
...
source/tensor/function/LogSoftmax.cpp View file @ 03a9836e
...
@@ -279,8 +279,8 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                         XTensor * dedy, XTensor * dedx,
-                         int leadDim,
+                         XTensor * padding, int leadDim,
                         LOSS_FUNCTION_NAME lossName)
{
    CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
...
@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
    if(gold->devID >= 0) {
-        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
#endif
...
source/tensor/function/LogSoftmax.cu View file @ 03a9836e
...
@@ -22,6 +22,7 @@
#include "LogSoftmax.h"
#include "LogSoftmax.cuh"
#include "Loss.cuh"
+#include "../core/arithmetic/MultiplyDim.h"
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
...
@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
>> lossName - name of the loss function
*/
__global__
-void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size, LOSS_FUNCTION_NAME lossName)
+void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
+                                  int size, LOSS_FUNCTION_NAME lossName)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
...
@@ -371,10 +373,12 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                             XTensor * dedy, XTensor * dedx,
-                             int leadDim,
+                             XTensor * padding, int leadDim,
                             LOSS_FUNCTION_NAME lossName)
{
+    leadDim = leadDim < 0 ? y->order - 1 : leadDim;
+
    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
                  "Tensors used in log softmax are not on the same GPU.");
...
@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                                           dimensionSize * stride, lossName);
            }
        }
+
+        if(padding != NULL) {
+            int n = leadDim;
+
+            int paddingOrder = padding->order;
+            int * paddingDims = new int[paddingOrder];
+            memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+            padding->Reshape(padding->unitNum);
+
+            int order = dedx->order;
+            int * dims = new int[order];
+            memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+            dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
+            _MultiplyDimMe(dedx, padding, 0);
+
+            padding->Reshape(paddingOrder, paddingDims);
+            dedx->Reshape(order, dims);
+
+            delete[] paddingDims;
+            delete[] dims;
+        }
    }
    else {
        ShowNTErrors("TODO!");
...
source/tensor/function/LogSoftmax.cuh View file @ 03a9836e
...
@@ -37,8 +37,8 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
/* de/dx (Cuda version) */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                             XTensor * dedy, XTensor * dedx,
-                             int leadDim,
+                             XTensor * padding, int leadDim,
                             LOSS_FUNCTION_NAME lossName);

#endif // USE_CUDA
...
source/tensor/function/LogSoftmax.h View file @ 03a9836e
...
@@ -38,8 +38,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                         XTensor * dedy, XTensor * dedx,
-                         int leadDim,
+                         XTensor * padding, int leadDim,
                         LOSS_FUNCTION_NAME lossName);

} // namespace nts(NiuTrans.Tensor)
...
source/tensor/function/Loss.cpp View file @ 03a9836e
...
@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
        for(int i = 0; i < blockNum; i++) {
            for(int j = 0; j < stride; j++) {
                for(int k = 0; k < tLen; k++) {
-                    *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = -(DTYPE)*(tp + i * stride * dimensionSize
-                                                                                      + j + stride * (tBeg + k)) / (DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
+                    *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) =
+                       -(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) /
+                        (DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
                }
            }
        }
...
source/tensor/function/Softmax.cpp View file @ 03a9836e
...
@@ -174,8 +174,8 @@ See more details in LogSoftmaxBackward(...)
>> leadDim - leading dimension (along which we perform reduction)
*/
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                      XTensor * dedy, XTensor * dedx,
-                      int leadDim,
+                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName)
{
    CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
...
@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
#ifdef USE_CUDA
    if(y->devID >= 0){
-        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
#endif
...
source/tensor/function/Softmax.cu View file @ 03a9836e
...
@@ -24,6 +24,7 @@
#include "Loss.cuh"
#include "../core/reduce/ReduceSum.h"
#include "../core/arithmetic/Multiply.h"
+#include "../core/arithmetic/MultiplyDim.h"
#include "../core/shape/Unsqueeze.h"
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
...
@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
*/
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                          XTensor * dedy, XTensor * dedx,
-                          int leadDim,
+                          XTensor * padding, int leadDim,
                          LOSS_FUNCTION_NAME lossName)
{
+    int n = leadDim < 0 ? y->order - 1 : leadDim;
+
    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID), "Matrices used in log softmax are not on the same GPU.");
    CheckNTErrors((y->order >= 1), "Empty tensor!");
...
@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
        if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
            _Sum(y, gold, dedx, -1.0F);
+
+            if(padding != NULL) {
+                int paddingOrder = padding->order;
+                int * paddingDims = new int[paddingOrder];
+                memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+                padding->Reshape(padding->unitNum);
+
+                int order = dedx->order;
+                int * dims = new int[order];
+                memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+                dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
+                _MultiplyDimMe(dedx, padding, 0);
+
+                padding->Reshape(paddingOrder, paddingDims);
+                dedx->Reshape(order, dims);
+
+                delete[] paddingDims;
+                delete[] dims;
+            }
        }
        else if(lossName == ONEHOTERROR){
            ShowNTErrors("TODO!");
...
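The _Sum(y, gold, dedx, -1.0F) call above relies on the standard identity that for softmax followed by cross entropy, dE/dx_i = y_i - gold_i. A self-contained numeric check of that identity, illustrative only and outside the library:

/* Verify dE/dx = y - gold for softmax + cross entropy by finite
   differences (standalone sketch). */
#include <cmath>
#include <cstdio>

static double ce(const double * x, const double * gold, int n)
{
    /* E = -sum_i gold_i * log softmax(x)_i = -sum_i gold_i * (x_i - log z) */
    double z = 0, e = 0;
    for(int i = 0; i < n; i++) z += std::exp(x[i]);
    for(int i = 0; i < n; i++) e += -gold[i] * (x[i] - std::log(z));
    return e;
}

int main()
{
    const int n = 3;
    double x[n] = {0.2, -0.1, 0.5}, gold[n] = {0.0, 1.0, 0.0};
    double z = 0;
    for(int i = 0; i < n; i++) z += std::exp(x[i]);
    for(int i = 0; i < n; i++) {
        double y = std::exp(x[i]) / z;
        double xp[n] = {x[0], x[1], x[2]};
        xp[i] += 1e-6;
        double numeric = (ce(xp, gold, n) - ce(x, gold, n)) / 1e-6;
        std::printf("i=%d  analytic=%.6f  numeric=%.6f\n", i, y - gold[i], numeric);
    }
    return 0;
}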
source/tensor/function/Softmax.cuh View file @ 03a9836e
...
@@ -37,8 +37,8 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
/* de/dx (Cuda version) */
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                          XTensor * dedy, XTensor * dedx,
-                          int leadDim,
+                          XTensor * padding, int leadDim,
                          LOSS_FUNCTION_NAME lossName);

#endif // USE_CUDA
...
source/tensor/function/Softmax.h View file @ 03a9836e
...
@@ -35,8 +35,8 @@ XTensor Softmax(const XTensor &x, int leadDim);
/* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                      XTensor * dedy, XTensor * dedx,
-                      int leadDim,
+                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName);

} // namespace nts(NiuTrans.Tensor)
...
source/tensor/test/TDropout.cpp View file @ 03a9836e
...
@@ -169,8 +169,8 @@ bool TestDropout2()
    _DropoutBackward(y, x, dedy, dedx, 1, dropProb);

    /* check result */
-    y->Dump(stderr, "y");
-    dedx->Dump(stderr, "dedy");
+    //y->Dump(stderr, "y");
+    //dedx->Dump(stderr, "dedy");

#ifdef USE_CUDA
    /* GPU test */
...
@@ -193,8 +193,8 @@ bool TestDropout2()
    _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);

    /* check result */
-    yGPU->Dump(stderr, "yGPU");
-    dedxGPU->Dump(stderr, "dedyGPU");
+    //yGPU->Dump(stderr, "yGPU");
+    //dedxGPU->Dump(stderr, "dedyGPU");

    /* destroy variables */
    delete x;
...
source/tensor/test/TLogSoftmax.cpp View file @ 03a9836e
...
@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(x, y, 1);

    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);

    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...
@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(xGPU, yGPU, 1);

    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);

    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
              dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
...
@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(x, y, 1);

    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);

    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...
@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(xGPU, yGPU, 1);

    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);

    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
...
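With the signature change above, existing call sites only need a NULL in the new padding slot, exactly as these updated tests do. A batch with masked target positions would pass the mask instead; a hypothetical call, where padding stands for a 0/1 tensor with one flag per target position:

/* Hypothetical call with a padding mask (illustrative only): */
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, 1, CROSSENTROPY);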
source/tensor/test/TPower.cpp View file @ 03a9836e
...
@@ -66,7 +66,9 @@ bool TestPower1()
    bUser = Power(*a, 2.0F);

    /* check results */
-    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMe->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUser.CheckData(answer, aUnitNum, 1e-4F);

#ifdef USE_CUDA
    /* GPU test */
...
@@ -88,7 +90,9 @@ bool TestPower1()
    bUserGPU = Power(*aGPU, 2.0F);

    /* check results */
-    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

    /* destroy variables */
    delete a;
...
@@ -153,7 +157,9 @@ bool TestPower2()
    bUser = Power(*a, 1.0F);

    /* check results */
-    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMe->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUser.CheckData(answer, aUnitNum, 1e-4F);

#ifdef USE_CUDA
    /* GPU test */
...
@@ -175,7 +181,9 @@ bool TestPower2()
    bUserGPU = Power(*aGPU, 1.0F);

    /* check results */
-    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

    /* destroy variables */
    delete a;
...
@@ -214,7 +222,7 @@ bool TestPower3()
    for(int i = 0; i < aOrder; i++)
        aUnitNum *= aDimSize[i];

-    DTYPE aData[3][2] = { {0.0F, 1.0F},
+    DTYPE aData[3][2] = { {1.0F, 1.0F},
                          {2.0F, 3.0F},
                          {4.0F, 5.0F} };
    DTYPE answer[3][2] = { {1.0F, 1.0F},
...
@@ -240,7 +248,9 @@ bool TestPower3()
    bUser = Power(*a, 0.0F);

    /* check results */
-    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMe->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUser.CheckData(answer, aUnitNum, 1e-4F);

#ifdef USE_CUDA
    /* GPU test */
...
@@ -262,7 +272,9 @@ bool TestPower3()
    bUserGPU = Power(*aGPU, 0.0F);

    /* check results */
-    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

    /* destroy variables */
    delete a;
...
source/tensor/test/TReduceSum.cpp
查看文件 @
03a9836e
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* All rights reserved.
*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* See the License for the specific language governing permissions and
* limitations under the License.
* limitations under the License.
*/
*/
/*
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/
*/
#include "TReduceSum.h"
#include "TReduceSum.h"
#include "../core/getandset/SetData.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
@@ -155,6 +156,457 @@ bool TestReduceSum1()
...
@@ -155,6 +156,457 @@ bool TestReduceSum1()
#endif // USE_CUDA
#endif // USE_CUDA
}
}
/*
case 2: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B >= 128
(50, 1000000) -> (50), dim = 1
*/
bool TestReduceSum2()
{
    /* a tensor of size (50, 1000000) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 50;
    sDimSize[1] = 1000000;

    int sUnitNum = 1;
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

    /* a tensor of size (50) */
    int tOrder = 1;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 50;

    int tUnitNum = 1;
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * s = NewTensor(sOrder, sDimSize);
    XTensor * t = NewTensor(tOrder, tDimSize);
    XTensor * answer = NewTensor(tOrder, tDimSize);
    XTensor tUser;

    /* initialize variables */
    _SetDataFixedFloat(s, 1.0F);
    _SetDataFixedFloat(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
    tUser = ReduceSum(*s, 1);

    /* check results */
    cpuTest = t->CheckData(answer->data, tUnitNum) &&
              tUser.CheckData(answer->data, tUnitNum);

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* initialize variables */
    _SetDataFixedFloat(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
    tUserGPU = ReduceSum(*sGPU, 1);

    /* check results */
    gpuTest = tGPU->CheckData(answer->data, tUnitNum) &&
              tUserGPU.CheckData(answer->data, tUnitNum);

    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete sGPU;
    delete tGPU;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest;
#endif // USE_CUDA
}
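[Editor's note, not part of the diff] Across these case comments, A, B and C appear to denote the element counts before, at, and after the reduced axis: for (50, 1000000) -> (50) with dim = 1, A = 50, B = 1000000, C = 1. These look like the shape classes the CUDA reduction kernel dispatches on, though the diff does not say so explicitly. A minimal sketch of the same semantics on a toy shape, using only calls that already appear in this file; the (2, 3) shape and the sketch* names are illustrative, not taken from the test suite:

    /* sketch: summing a (2, 3) tensor of ones along dim = 1
       yields a (2) tensor whose entries equal 3.0F */
    int sketchSDim[2] = {2, 3};
    int sketchTDim[1] = {2};
    XTensor * sketchS = NewTensor(2, sketchSDim);
    XTensor * sketchT = NewTensor(1, sketchTDim);
    _SetDataFixedFloat(sketchS, 1.0F);
    _ReduceSum(sketchS, sketchT, 1);   /* sketchT[i] = sum over j of sketchS[i][j] */
    delete sketchS;
    delete sketchT;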
/*
case 3: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B < 128
(1000000, 50) -> (1000000), dim = 1
*/
bool TestReduceSum3()
{
    /* a tensor of size (1000000, 50) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 1000000;
    sDimSize[1] = 50;

    int sUnitNum = 1;
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

    /* a tensor of size (1000000) */
    int tOrder = 1;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 1000000;

    int tUnitNum = 1;
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * s = NewTensor(sOrder, sDimSize);
    XTensor * t = NewTensor(tOrder, tDimSize);
    XTensor * answer = NewTensor(tOrder, tDimSize);
    XTensor tUser;

    /* initialize variables */
    _SetDataFixedFloat(s, 1.0F);
    _SetDataFixedFloat(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
    tUser = ReduceSum(*s, 1);

    /* check results */
    cpuTest = t->CheckData(answer->data, tUnitNum) &&
              tUser.CheckData(answer->data, tUnitNum);

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* initialize variables */
    _SetDataFixedFloat(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
    tUserGPU = ReduceSum(*sGPU, 1);

    /* check results */
    gpuTest = tGPU->CheckData(answer->data, tUnitNum) &&
              tUserGPU.CheckData(answer->data, tUnitNum);

    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete sGPU;
    delete tGPU;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest;
#endif // USE_CUDA
}
/*
case 4: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A < 10, B is free
(5, 1000000) -> (5), dim = 1
*/
bool TestReduceSum4()
{
    /* a tensor of size (5, 1000000) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 5;
    sDimSize[1] = 1000000;

    int sUnitNum = 1;
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

    /* a tensor of size (5) */
    int tOrder = 1;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 5;

    int tUnitNum = 1;
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * s = NewTensor(sOrder, sDimSize);
    XTensor * t = NewTensor(tOrder, tDimSize);
    XTensor * answer = NewTensor(tOrder, tDimSize);
    XTensor tUser;

    /* initialize variables */
    _SetDataFixedFloat(s, 1.0F);
    _SetDataFixedFloat(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
    tUser = ReduceSum(*s, 1);

    /* check results */
    cpuTest = t->CheckData(answer->data, tUnitNum) &&
              tUser.CheckData(answer->data, tUnitNum);

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* initialize variables */
    _SetDataFixedFloat(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
    tUserGPU = ReduceSum(*sGPU, 1);

    /* check results */
    gpuTest = tGPU->CheckData(answer->data, tUnitNum) &&
              tUserGPU.CheckData(answer->data, tUnitNum);

    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete sGPU;
    delete tGPU;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest;
#endif // USE_CUDA
}
/*
case 5: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C > 4096
(500, 1000, 500) -> (500, 500), dim = 1
*/
bool TestReduceSum5()
{
    /* a tensor of size (500, 1000, 500) */
    int sOrder = 3;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 500;
    sDimSize[1] = 1000;
    sDimSize[2] = 500;

    int sUnitNum = 1;
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

    /* a tensor of size (500, 500) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 500;
    tDimSize[1] = 500;

    int tUnitNum = 1;
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * s = NewTensor(sOrder, sDimSize);
    XTensor * t = NewTensor(tOrder, tDimSize);
    XTensor * answer = NewTensor(tOrder, tDimSize);
    XTensor tUser;

    /* initialize variables */
    _SetDataFixedFloat(s, 1.0F);
    _SetDataFixedFloat(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
    tUser = ReduceSum(*s, 1);

    /* check results */
    cpuTest = t->CheckData(answer->data, tUnitNum) &&
              tUser.CheckData(answer->data, tUnitNum);

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* initialize variables */
    _SetDataFixedFloat(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
    tUserGPU = ReduceSum(*sGPU, 1);

    /* check results */
    gpuTest = tGPU->CheckData(answer->data, tUnitNum) &&
              tUserGPU.CheckData(answer->data, tUnitNum);

    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete sGPU;
    delete tGPU;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest;
#endif // USE_CUDA
}
/*
case 6: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C <= 4096
(50, 10000, 50) -> (50, 50), dim = 1
*/
bool TestReduceSum6()
{
    /* a tensor of size (50, 10000, 50) */
    int sOrder = 3;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 50;
    sDimSize[1] = 10000;
    sDimSize[2] = 50;

    int sUnitNum = 1;
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

    /* a tensor of size (50, 50) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 50;
    tDimSize[1] = 50;

    int tUnitNum = 1;
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * s = NewTensor(sOrder, sDimSize);
    XTensor * t = NewTensor(tOrder, tDimSize);
    XTensor * answer = NewTensor(tOrder, tDimSize);
    XTensor tUser;

    /* initialize variables */
    _SetDataFixedFloat(s, 1.0F);
    _SetDataFixedFloat(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
    tUser = ReduceSum(*s, 1);

    /* check results */
    cpuTest = t->CheckData(answer->data, tUnitNum) &&
              tUser.CheckData(answer->data, tUnitNum);

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* initialize variables */
    _SetDataFixedFloat(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
    tUserGPU = ReduceSum(*sGPU, 1);

    /* check results */
    gpuTest = tGPU->CheckData(answer->data, tUnitNum) &&
              tUserGPU.CheckData(answer->data, tUnitNum);

    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete sGPU;
    delete tGPU;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete s;
    delete t;
    delete answer;
    delete[] sDimSize;
    delete[] tDimSize;

    return cpuTest;
#endif // USE_CUDA
}
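[Editor's note, not part of the diff] A toy version of the C != 1 pattern exercised by cases 5 and 6, again with illustrative shapes and names rather than ones from the test suite: reducing the middle axis of a 3-D tensor keeps the outer and inner axes, so a (2, 3, 4) tensor of ones collapses to a (2, 4) tensor of 3s.

    /* sketch: dim = 1 of a (2, 3, 4) tensor is summed away,
       leaving shape (2, 4) with every entry 3.0F */
    int mDim[3] = {2, 3, 4};
    int rDim[2] = {2, 4};
    XTensor * m = NewTensor(3, mDim);
    XTensor * r = NewTensor(2, rDim);
    _SetDataFixedFloat(m, 1.0F);
    _ReduceSum(m, r, 1);   /* r[i][k] = sum over j of m[i][j][k] */
    delete m;
    delete r;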
/* other cases */
/*
TODO!!
...
@@ -175,6 +627,51 @@ bool TestReduceSum()
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");

    /* case 2 test */
    caseFlag = TestReduceSum2();
    if (!caseFlag) {
        returnFlag = false;
        XPRINT(0, stdout, ">> case 2 failed!\n");
    }
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");

    /* cases 3 and 5 are left disabled below, presumably because of their cost:
       case 3 allocates a (1000000, 50) float tensor (~200 MB) and
       case 5 a (500, 1000, 500) one (~500 MB) */
    ///* case 3 test */
    //caseFlag = TestReduceSum3();
    //if (!caseFlag) {
    //    returnFlag = false;
    //    XPRINT(0, stdout, ">> case 3 failed!\n");
    //}
    //else
    //    XPRINT(0, stdout, ">> case 3 passed!\n");

    /* case 4 test */
    caseFlag = TestReduceSum4();
    if (!caseFlag) {
        returnFlag = false;
        XPRINT(0, stdout, ">> case 4 failed!\n");
    }
    else
        XPRINT(0, stdout, ">> case 4 passed!\n");

    ///* case 5 test */
    //caseFlag = TestReduceSum5();
    //if (!caseFlag) {
    //    returnFlag = false;
    //    XPRINT(0, stdout, ">> case 5 failed!\n");
    //}
    //else
    //    XPRINT(0, stdout, ">> case 5 passed!\n");

    /* case 6 test */
    caseFlag = TestReduceSum6();
    if (!caseFlag) {
        returnFlag = false;
        XPRINT(0, stdout, ">> case 6 failed!\n");
    }
    else
        XPRINT(0, stdout, ">> case 6 passed!\n");

    /* other cases test */
    /*
    TODO!!
    ...
source/tensor/test/TSoftmax.cpp
View file @ 03a9836e
...
@@ -146,7 +146,7 @@ bool TestSoftmax2()
     _Softmax(x, y, 1);

     /* call SoftmaxBackward function */
-    _SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);

     /* check result */
     cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...
@@ -174,7 +174,7 @@ bool TestSoftmax2()
     _Softmax(xGPU, yGPU, 1);

     /* call SoftmaxBackward function */
-    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);

     /* check result */
     gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
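[Editor's note, not part of the diff] A reading of this signature change, which the diff itself does not state: the argument slot inserted before the leading-dimension parameter lines up with the padding tensor this commit threads through CrossEntropy, LogSoftmax and Softmax, so NULL here should simply preserve the old unpadded behaviour. A hedged sketch of what a padded call would then look like; `padding` is a hypothetical tensor, not something this test defines:

    /* assumption: a non-NULL tensor in the new slot marks padded positions
       to be excluded from the backward computation; NULL = no padding */
    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, padding, 1, CROSSENTROPY);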
...
source/tensor/test/TSumDim.cpp
View file @ 03a9836e
...
@@ -20,8 +20,9 @@
 */

 #include "TSumDim.h"
-#include "../core/arithmetic/SumDim.h"
 #include "../XTensor.h"
+#include "../core/arithmetic/SumDim.h"
+#include "../core/getandset/SetData.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
...
@@ -251,6 +252,225 @@ bool TestSumDim2()
#endif // USE_CUDA
}
/*
case 3: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
*/
bool TestSumDim3()
{
    /* a tensor of size (20, 40, 4000) */
    int aOrder = 3;
    int * aDimSize = new int[aOrder];
    aDimSize[0] = 20;
    aDimSize[1] = 40;
    aDimSize[2] = 4000;

    int aUnitNum = 1;
    for (int i = 0; i < aOrder; i++)
        aUnitNum *= aDimSize[i];

    /* a tensor of size (40) */
    int bOrder = 1;
    int * bDimSize = new int[bOrder];
    bDimSize[0] = 40;

    int bUnitNum = 1;
    for (int i = 0; i < bOrder; i++)
        bUnitNum *= bDimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * a = NewTensor(aOrder, aDimSize);
    XTensor * b = NewTensor(bOrder, bDimSize);
    XTensor * c = NewTensor(aOrder, aDimSize);
    XTensor * cMe = NewTensor(aOrder, aDimSize);
    XTensor * answer = NewTensor(aOrder, aDimSize);
    XTensor cUser;

    /* initialize variables */
    a->SetZeroAll();
    cMe->SetZeroAll();
    _SetDataFixedFloat(b, 1.0F);
    _SetDataFixedFloat(answer, 1.0F);

    /* call SumDim function */
    _SumDim(a, b, c, 1);
    _SumDim(cMe, b, 1);
    cUser = SumDim(*a, *b, 1);

    /* check results */
    cpuTest = c->CheckData(answer->data, aUnitNum) &&
              cMe->CheckData(answer->data, aUnitNum) &&
              cUser.CheckData(answer->data, aUnitNum);

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor cUserGPU;

    /* initialize variables */
    aGPU->SetZeroAll();
    cMeGPU->SetZeroAll();
    _SetDataFixedFloat(bGPU, 1.0F);

    /* call sum function */
    _SumDim(aGPU, bGPU, cGPU, 1);
    _SumDim(cMeGPU, bGPU, 1);
    cUserGPU = SumDim(*aGPU, *bGPU, 1);

    /* check results */
    gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
              cMeGPU->CheckData(answer->data, aUnitNum) &&
              cUserGPU.CheckData(answer->data, aUnitNum);

    /* destroy variables */
    delete a;
    delete b;
    delete c;
    delete cMe;
    delete answer;
    delete aGPU;
    delete bGPU;
    delete cGPU;
    delete cMeGPU;
    delete[] aDimSize;
    delete[] bDimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete a;
    delete b;
    delete c;
    delete cMe;
    delete answer;
    delete[] aDimSize;
    delete[] bDimSize;

    return cpuTest;
#endif // USE_CUDA
}
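[Editor's note, not part of the diff] The broadcasting these cases exercise, on a toy shape with illustrative names: adding a length-3 vector along dim = 1 of a (2, 3) zero tensor leaves every row equal to the vector. The calls above leave beta at its default of 1.0, and so does this sketch, which uses only functions already present in this file:

    /* sketch: c[i][j] = a[i][j] + b[j] (beta left at its default of 1.0) */
    int aSketchDim[2] = {2, 3};
    int bSketchDim[1] = {3};
    XTensor * aSketch = NewTensor(2, aSketchDim);
    XTensor * bSketch = NewTensor(1, bSketchDim);
    XTensor * cSketch = NewTensor(2, aSketchDim);
    aSketch->SetZeroAll();
    _SetDataFixedFloat(bSketch, 1.0F);
    _SumDim(aSketch, bSketch, cSketch, 1);   /* every entry of cSketch is 1.0F */
    delete aSketch;
    delete bSketch;
    delete cSketch;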
/*
case 4: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(1000000, 50) + (50) = (1000000, 50), dim = 1.
*/
bool TestSumDim4()
{
    /* a tensor of size (1000000, 50) */
    int aOrder = 2;
    int * aDimSize = new int[aOrder];
    aDimSize[0] = 1000000;
    aDimSize[1] = 50;

    int aUnitNum = 1;
    for (int i = 0; i < aOrder; i++)
        aUnitNum *= aDimSize[i];

    /* a tensor of size (50) */
    int bOrder = 1;
    int * bDimSize = new int[bOrder];
    bDimSize[0] = 50;

    int bUnitNum = 1;
    for (int i = 0; i < bOrder; i++)
        bUnitNum *= bDimSize[i];

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * a = NewTensor(aOrder, aDimSize);
    XTensor * b = NewTensor(bOrder, bDimSize);
    XTensor * c = NewTensor(aOrder, aDimSize);
    XTensor * cMe = NewTensor(aOrder, aDimSize);
    XTensor * answer = NewTensor(aOrder, aDimSize);
    XTensor cUser;

    /* initialize variables */
    a->SetZeroAll();
    cMe->SetZeroAll();
    _SetDataFixedFloat(b, 1.0F);
    _SetDataFixedFloat(answer, 1.0F);

    /* call SumDim function */
    _SumDim(a, b, c, 1);
    _SumDim(cMe, b, 1);
    cUser = SumDim(*a, *b, 1);

    /* check results */
    cpuTest = c->CheckData(answer->data, aUnitNum) &&
              cMe->CheckData(answer->data, aUnitNum) &&
              cUser.CheckData(answer->data, aUnitNum);

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor cUserGPU;

    /* initialize variables */
    aGPU->SetZeroAll();
    cMeGPU->SetZeroAll();
    _SetDataFixedFloat(bGPU, 1.0F);

    /* call sum function */
    _SumDim(aGPU, bGPU, cGPU, 1);
    _SumDim(cMeGPU, bGPU, 1);
    cUserGPU = SumDim(*aGPU, *bGPU, 1);

    /* check results */
    gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
              cMeGPU->CheckData(answer->data, aUnitNum) &&
              cUserGPU.CheckData(answer->data, aUnitNum);

    /* destroy variables */
    delete a;
    delete b;
    delete c;
    delete cMe;
    delete answer;
    delete aGPU;
    delete bGPU;
    delete cGPU;
    delete cMeGPU;
    delete[] aDimSize;
    delete[] bDimSize;

    return cpuTest && gpuTest;
#else
    /* destroy variables */
    delete a;
    delete b;
    delete c;
    delete cMe;
    delete answer;
    delete[] aDimSize;
    delete[] bDimSize;

    return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...
@@ -279,6 +499,24 @@ bool TestSumDim()
    }
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");

    /* case 3 test */
    caseFlag = TestSumDim3();
    if (!caseFlag) {
        returnFlag = false;
        XPRINT(0, stdout, ">> case 3 failed!\n");
    }
    else
        XPRINT(0, stdout, ">> case 3 passed!\n");

    ///* case 4 test */
    //caseFlag = TestSumDim4();
    //if (!caseFlag) {
    //    returnFlag = false;
    //    XPRINT(0, stdout, ">> case 4 failed!\n");
    //}
    //else
    //    XPRINT(0, stdout, ">> case 4 passed!\n");

    /* other cases test */
    /*
    ...