NiuTrans.Tensor / Commits / 102db468

Commit 102db468 authored Sep 17, 2018 by xuchen
better code for dropout function (by broadcasting)
parent 4336f2f9
Showing 18 changed files with 271 additions and 192 deletions

source/sample/transformer/T2TAttention.cpp   +0   -9
source/sample/transformer/T2TAttention.h     +3   -0
source/sample/transformer/T2TEncoder.cpp     +21  -10
source/sample/transformer/T2TEncoder.h       +6   -3
source/sample/transformer/T2TFNN.cpp         +1   -1
source/sample/transformer/T2TModel.cpp       +6   -4
source/sample/transformer/T2TModel.h         +2   -2
source/sample/transformer/T2TTrainer.cpp     +26  -15
source/sample/transformer/T2TTrainer.h       +1   -1
source/tensor/XMem.cpp                       +63  -2
source/tensor/XTensor.cpp                    +12  -0
source/tensor/core/arithmetic/Multiply.cpp   +2   -2
source/tensor/core/arithmetic/Multiply.cu    +3   -5
source/tensor/function/Dropout.cpp           +92  -114
source/tensor/function/Dropout.h             +10  -4
source/tensor/function/FHeader.h             +1   -0
source/tensor/test/TDropout.cpp              +20  -19
source/tensor/test/TXMem.cpp                 +2   -1
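The change behind the commit message is how the dropout mask is built: instead of sampling a Bernoulli value for every element of the input tensor, the new code samples a mask only along one leading dimension and broadcasts it over the remaining dimensions, with the kept entries pre-scaled by 1/(1-p). A minimal standalone C++ sketch of that idea, using plain arrays rather than the NiuTrans.Tensor API (the function name here is made up for illustration):

```cpp
#include <cstdlib>
#include <vector>

/* inverted dropout with a broadcast mask: one Bernoulli sample per column
   (the leading dimension), shared by every row; kept entries are pre-scaled
   by 1/(1-p), so inference can simply skip dropout */
void DropoutByBroadcast(std::vector<float> &x, int rows, int cols, float p)
{
    float scale = 1.0f / (1.0f - p);

    std::vector<float> mask(cols);
    for (int j = 0; j < cols; j++)
        mask[j] = ((float)rand() / RAND_MAX >= p) ? scale : 0.0f;

    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            x[i * cols + j] *= mask[j];    /* broadcast the mask over rows */
}
```

Sampling cols values instead of rows * cols values is what the Dropout.cpp rewrite below does with a 1D mask tensor and _MultiplyDim, replacing the old element-wise _Multiply plus _ScaleAndShift path.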
source/sample/transformer/T2TAttention.cpp

@@ -125,17 +125,8 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
    dot = Linear(dot, 1.0F/(float)sqrt((float)dk));

-   //if(llnum == 1)
-   //  dot.Dump(tf, "dot:");
-
    scalar = Softmax(dot, -1);

-   //if(llnum == 1)
-   //  scalar.Dump(tf, "scalar:");
-
-   //if(ignored > 0)
-   //  _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
-
    att = BMMul(scalar, vheads);

    /* concatenate the heads */
...
...
source/sample/transformer/T2TAttention.h

@@ -73,6 +73,9 @@ public:
       special design for the attention model. */
    int ignored;

+   /* indicates whether the model is used for training */
+   bool isTraining;
+
public:

    /* constructor */
    T2TAttention();
...
...
source/sample/transformer/T2TEncoder.cpp

@@ -63,6 +63,7 @@ void AttEncoder::InitModel(int argc, const char ** argv,
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+   LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
...
@@ -89,29 +90,34 @@ make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
>> skipInputRes - indicates whether we skip the residual connection of the first layer
+>> isTraining - indicates whether the model is for training
<< return - the output tensor of the encoder
*/
-XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
+XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
{
    XTensor x;

    x = embedder.Make(input);

+   /* dropout */
+   if(isTraining && dropoutP > 0)
+       x = Dropout(x, dropoutP);
+
    for(int i = 0; i < nlayer; i++){
        XTensor att;
        XTensor ln;
        XTensor fnn;
        XTensor res;

        llnum = -1;

        /* we skip the residual connection for the first layer if
           the encoder is used in language modeling. */
        if(skipInputRes && i == 0){
            /* self attention */
            att = attentions[i].Make(x, x, x, mask);

-           /* TODO: dropout */
+           /* dropout */
+           if(isTraining && dropoutP > 0)
+               att = Dropout(att, dropoutP);

            /* layer normalization */
            x = attLayerNorms[i].Make(att);
...
@@ -121,27 +127,32 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
            /* self attention */
            att = attentions[i].Make(x, x, x, mask);

+           /* dropout */
+           if(isTraining && dropoutP > 0)
+               att = Dropout(att, dropoutP);
+
            /* residual connection */
            res = Sum(att, x);

-           /* TODO: dropout */
-
            /* layer normalization */
            x = attLayerNorms[i].Make(res);

            llnum = -1;
        }

        /* fnn */
        fnn = fnns[i].Make(x);

+       /* dropout */
+       if(isTraining && dropoutP > 0)
+           fnn = Dropout(fnn, dropoutP);
+
        /* residual connection */
        res = Sum(fnn, x);

-       /* TODO: dropout */
-
        /* layer normalization */
        x = fnnLayerNorms[i].Make(res);

+       if(isTraining && dropoutP > 0)
+           x = Dropout(x, dropoutP);
    }

    return x;
...
...
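As a quick sanity check of the 1/(1-p) scaling behind the Dropout calls above (the concrete probability 0.2 is only an example, not a value from the commit): each unit survives with probability 1-p and the survivors are multiplied by 1/(1-p), so the expected activation is unchanged and the isTraining guard can simply skip dropout at test time.

```cpp
#include <cstdio>

int main()
{
    float p = 0.2f;                    /* example dropout probability */
    float keep = 1.0f - p;             /* survival probability: 0.8   */
    float scale = 1.0f / (1.0f - p);   /* applied to survivors: 1.25  */

    /* expected value of a mask entry; prints 1.000 */
    printf("E[mask] = %.3f\n", keep * scale);
    return 0;
}
```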
source/sample/transformer/T2TEncoder.h

@@ -40,7 +40,7 @@ class T2TEncoder
{
public:
-   virtual XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes) = 0;
+   virtual XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining) = 0;
};

/*
...
@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder
{
public:
-   XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
+   XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
};
...
@@ -77,6 +77,9 @@ public:
    /* vocabulary size */
    int vSize;

+   /* dropout probability */
+   DTYPE dropoutP;
+
    /* some positions can be ignored in attention. this is useful in lm where the first position needs
       special design for the attention model. */
    int ignored;
...
@@ -115,7 +118,7 @@ public:
                  int myDevID = -1, XMem * myMem = NULL);

    /* make the encoding network */
-   XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
+   XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
};
...
...
source/sample/transformer/T2TFNN.cpp

@@ -58,7 +58,7 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
-   LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
+   LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE * 4);
    LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);

    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
...
...
source/sample/transformer/T2TModel.cpp

@@ -77,11 +77,12 @@ make the encoding network
>> input - input tensor
>> mask - the mask for positions that are/not involved in computation
>> skipInputRes - indicates whether we skip the residual connection of the first layer
+>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
-XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
+XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
{
-   return encoder.Make(input, mask, skipInputRes);
+   return encoder.Make(input, mask, skipInputRes, isTraining);
}

/*
...
@@ -89,8 +90,9 @@ make the entire network (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
+>> isTraining - indicates whether the model is for training
*/
-void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
+void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
{
    XTensor encoding;
...
@@ -134,7 +136,7 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
    //_Sum(&mask, padding3, &mask);

-   encoding = MakeEncoding(input, mask, true);
+   encoding = MakeEncoding(input, mask, true, isTraining);
    outputLayer.Make(encoding, output);

    delete[] dims;
...
...
source/sample/transformer/T2TModel.h

@@ -69,10 +69,10 @@ public:
    void InitModel(int argc, const char ** argv);

    /* make the encoding network */
-   XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
+   XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);

    /* make the entire network (with the output softmax layer) */
-   void Make(XTensor &input, XTensor &output, XTensor &padding);
+   void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);

    /* get parameter matrics */
    void GetParams(XList &list);
...
...
source/sample/transformer/T2TTrainer.cpp

@@ -149,7 +149,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
            XTensor output;

            /* make the network */
-           model->Make(batch, output, padding);
+           model->Make(batch, output, padding, true);

            /* make paddings for the output */
            if(output.GetDim(0) > 1)
...
@@ -166,16 +166,6 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
            /* get probabilities */
            float prob = GetProb(&output, &gold, NULL);

-           MTYPE totalUsed = 0;
-           MTYPE totalSize = 0;
-           for(int i = 0; i <= mem->curBlockID; i++){
-               totalSize += mem->blocks[i].size;
-               totalUsed += mem->blocks[i].used;
-           }
-           //fprintf(stderr, "%d(%ld,%ld,%f)\n", mem->curBlockID, totalUsed, totalSize, (float)totalUsed/totalSize);
-
            loss += -prob;
            wordCount += wc;
...
@@ -209,6 +199,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
    fclose(tf);

+   epoch = MIN(epoch, nepoch);
+
    XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
            lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
    XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
...
@@ -271,7 +263,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
            XTensor output;

            /* make the network */
-           model->Make(batch, output, padding);
+           model->Make(batch, output, padding, false);

            int bSize = batch.GetDim(0);
            int length = batch.GetDim(1);
...
@@ -333,11 +325,19 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
char line[MAX_SEQUENCE_LENGTH];

+struct SampleNode
+{
+   int id;
+   int size;
+};
+
/*
load data to buffer
>> file - where to load data
+>> isSorted - indicates whether the samples are sorted by length
+>> step - the number of sequences we go over when move to the next sample
*/
-int T2TTrainer::LoadBuf(FILE * file)
+int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
{
    int lineCount = 0;
    int seqCount = 0;
...
@@ -403,6 +403,17 @@ int T2TTrainer::LoadBuf(FILE * file)
    nseqBuf = seqCount;
    nextSeq = 0;

+   if(isSorted){
+       SampleNode * nodes = new SampleNode[seqCount];
+       int count = 0;
+       for(int i = 0; i < seqCount; i += step){
+           nodes[count].id = count;
+           nodes[count].size = seqLen[i];
+           count++;
+       }
+
+       delete[] nodes;
+   }
+
    return lineCount;
}
...
@@ -430,7 +441,7 @@ load a batch of sequences
>> devID - device id
>> mem - memory pool
*/
int T2TTrainer::LoadBatch(FILE * file, bool isLM,
                          XTensor * batch, XTensor * padding, XTensor * output,
                          int * seqs,
                          int step, int vs, int sBatch, int wBatch,
...
@@ -438,7 +449,7 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
                          int devID, XMem * mem)
{
    if(nextSeq < 0 || nextSeq >= nseqBuf)
-       LoadBuf(file);
+       LoadBuf(file, isSorted);

    int seq = MAX(nextSeq, 0);
    int wc = 0;
...
...
source/sample/transformer/T2TTrainer.h

@@ -118,7 +118,7 @@ public:
    void Test(const char * fn, const char * ofn, T2TModel * model);

    /* load data to buffer */
-   int LoadBuf(FILE * file);
+   int LoadBuf(FILE * file, bool isSorted, int step);

    /* clear data buffer */
    void ClearBuf();
...
...
source/tensor/XMem.cpp

@@ -747,6 +747,64 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
        CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
    }

/*if(testxmemid == 30){
recordp = result;
}
if(curBlockID >= 25){
MHeader * head = blocks[25].head;
while(head != NULL){
fprintf(stderr, "head: %ld %ld\n", head->indexNode->pReal, head->indexNode->size);
head = head->next;
}
}
if(testxmemid == 32){
int nnn = 0;
}
if(recordp != NULL){
MTYPE size = mySize;
if(size <= minSizeIndex[0])
size = minSizeIndex[0];
MPieceNode * entry = NULL;
MPieceNode * node = NULL;
MPieceNode * hit = NULL;
MPieceNode * last = NULL;
entry = memIndex + indexEntryNum + FindIndexEntry(size);
last = entry;
node = entry->next;
while(node != NULL){
CheckNTErrors(node->pre == last, "Something is wrong!");
CheckNTErrors(last->next == node, "Something is wrong!");
CheckNTErrors(node->head.state == 2, "Something is wrong!");
last = node;
if(node->size == 0){
MPieceNode * next = node->next;
RemoveFreeIndexNode(node, entry);
node = next;
ShowNTErrors("Something is wrong!");
}
else{
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
if(node->pReal == recordp){
hit = node;
break;
}
node = node->next;
}
}
if(hit == NULL){
int nnn = 0;
}
}*/
    return result;
}
...
...
@@ -918,6 +976,8 @@ void XMem::ReleaseStandard(int myDevID, void * p, MTYPE size)
    hit->head.state = 1;
    RemoveAllocIndexNode(hit);
+
+   hit->size = (char*)hit->p + hit->head.size - (char*)GetPitchedAddress((char*)hit->p, MY_PITCH);
    AddFreeIndexNode(hit);

    blocks[hit->head.blockID].used -= hit->head.size;
...
...
@@ -981,8 +1041,9 @@ void XMem::RebuildIndex()
            /* make a new index node */
            MPieceNode * newNode = memIndex2 + nodeNumUsed2++;

            newNode->p = p;
-           newNode->size = (char*)p + head->size -
-                           (head->state == 1 ? (char*)GetPitchedAddress((char*)p, MY_PITCH) : (char*)head->indexNode->pReal);
+           newNode->size = node->size;
+           //newNode->size = (char*)p + head->size -
+           //                ( head->state == 1 ? (char*)GetPitchedAddress((char*)p, MY_PITCH) : (char*)head->indexNode->pReal);
            newNode->pre = NULL;
            newNode->next = NULL;
...
...
source/tensor/XTensor.cpp

@@ -552,11 +552,17 @@ void XTensor::SetZeroAll(XStream * stream)
        if(devID >= 0){
#ifdef USE_CUDA
            int size = sizeof(int) + (sizeof(int)+sizeof(DTYPE)) * unitNumNonZero;

+           int devIDBackup = 0;
+           cudaGetDevice(&devIDBackup);
+           cudaSetDevice(devID);
+
            if(stream == NULL)
                cudaMemset(data, 0, size);
            else
                cudaMemsetAsync(data, 0, size, stream->stream);
+
+           cudaSetDevice(devIDBackup);
#endif
        }
        else
...
@@ -567,10 +573,16 @@ void XTensor::SetZeroAll(XStream * stream)
    else{
        if(devID >= 0){
#ifdef USE_CUDA
+           int devIDBackup = 0;
+           cudaGetDevice(&devIDBackup);
+           cudaSetDevice(devID);
+
            if(stream == NULL)
                cudaMemset(data, 0, unitNum * unitSize);
            else
                cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
+
+           cudaSetDevice(devIDBackup);
#endif
        }
        else
...
...
source/tensor/core/arithmetic/Multiply.cpp

@@ -66,8 +66,8 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
    for(int i = 0; i < a->order; i++){
        if(i != leadingDimRDI){
            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
                          "Unmatched tensors!");
        }
        if(i < leadingDimRDI)
            stride *= a->dimSizeRDI[i];
...
...
source/tensor/core/arithmetic/Multiply.cu

@@ -77,7 +77,7 @@ where |a_lead| means the size of the leading dimension of a
*/
template<int nonZeroAlpha> __global__
void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha,
                                       int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
{
    __shared__ DTYPE* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ DTYPE* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
...
@@ -171,14 +171,12 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
    if (alpha == 0) {
        KernelMulElementWiseTensorDynamic<0> << <blocks, threads >> >
                                            ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
-                                            stride, dimensionSizeA, dimensionSizeB, dimensionSizeC,
-                                            blockNum);
+                                            stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
    }
    else {
        KernelMulElementWiseTensorDynamic<1> << <blocks, threads >> >
                                            ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
-                                            stride, dimensionSizeA, dimensionSizeB, dimensionSizeC,
-                                            blockNum);
+                                            stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
    }
}
}
...
...
source/tensor/function/Dropout.cpp

@@ -25,120 +25,59 @@
 #include "Dropout.h"
 #include "Dropout.cuh"
 #include "../core/arithmetic/Multiply.h"
+#include "../core/arithmetic/MultiplyDim.h"
 #include "../core/math/ScaleAndShift.h"

 namespace nts{ // namespace nts(NiuTrans.Tensor

-/*
-generate a random bernoulli number
-*/
-DTYPE RandomBernoulli(DTYPE prob)
-{
-    return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)1.0 : (DTYPE)0.0;
-}
-
 /*
 dropout function
+It randomly zeroes some of the elements of the input tensor
+with probability p via a Bernoulli distribution.
-During training, randomly zeroes some of the elements of the input tensor
-with probability p using samples from a Bernoulli distribution.
-The elements to zero are randomized on every forward call.
-
-This has proven to be an effective technique for regularization and
-preventing the co-adaptation of neurons as described in the paper
-"Improving neural networks by preventing co-adaptation of feature detectors".
+See "Improving neural networks by preventing co-adaptation of feature detectors"
+for more details.
-
-Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
-This means that during evaluation the module simply computes an identity function.
+
+Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
+to mark the tensor with probability p in the inference phase. Instead we perform
+the same inference procedure as that with no use of dropout on the test data.

 >> x - input tensor
 >> y - output tensor
->> prob - probability to set an element zero
 >> seed - random seed
+>> dropProb - probability to set an element to zero
+>> leadingDim - the dimension which we generate the random numbers and perform broadcasting
 */
-void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob)
+void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb, int leadingDim)
 {
-    CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");
-
-    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
-
-    /* generate a mask tensor again with special probability */
-    srand(seed);
-    int unitNum = x->unitNum;
-    DTYPE * maskArray = new DTYPE[unitNum];
-    for(int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(prob);
+    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");

-    XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
-    maskTensor->SetData(maskArray, unitNum);
+    int n = leadingDim < 0 ? x->order - 1 : leadingDim;

-#ifdef USE_CUDA
-    if(x->devID >= 0 || y->devID >= 0){
-        _CudaDropout(x, y, maskTensor, scaleFactor);
-        DelTensorBuf(maskTensor);
-        delete[] maskArray;
-        return;
-    }
-#endif
+    CheckNTErrors(n >= 0 && n < x->order, "Wrong leadingDim!");

-    XTensor * inter = NewTensorBuf(x, x->devID, x->mem);
-    _Multiply(x, maskTensor, inter);
-    _ScaleAndShift(inter, y, scaleFactor, 0);
-
-    DelTensorBuf(inter);
-    DelTensorBuf(maskTensor);
-    delete[] maskArray;
-}
-
-/*
-dropout function (return a XTensor structure)
-make a new tensor to keep the result and return it
-
-During training, randomly zeroes some of the elements of the input tensor
-with probability p using samples from a Bernoulli distribution.
-The elements to zero are randomized on every forward call.
-
-This has proven to be an effective technique for regularization and
-preventing the co-adaptation of neurons as described in the paper
-"Improving neural networks by preventing co-adaptation of feature detectors".
-
-Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
-This means that during evaluation the module simply computes an identity function.
-
->> x - input tensor
->> y - output tensor
->> prob - probability to set an element zero
-*/
-XTensor Dropout(const XTensor &x, DTYPE prob)
-{
-    XTensor y(&x);
-    y.SetTMP();
-
-    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
+    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);

     /* generate a mask tensor again with special probability */
-    srand((unsigned int)time(NULL));
-    int unitNum = x.unitNum;
+    int unitNum = x->dimSize[n];
     DTYPE * maskArray = new DTYPE[unitNum];
-    for(int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(prob);
-
-    XTensor maskTensor(&x);
-    maskTensor.SetData(maskArray, unitNum);
-
-    XTensor inter;
-    inter = Multiply(x, maskTensor);
-    y = ScaleAndShift(inter, scaleFactor, 0);
+
+    srand(seed);
+    for(int i = 0; i < unitNum; i++)
+        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);

-    delete[] maskArray;
+    XTensor * mask = NewTensor1D(unitNum, x->dataType, x->devID, x->mem);
+    mask->SetData(maskArray, unitNum);

-    ///* tensor connection */
-    //XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
-    //XLink::AddParamToHead(&y, prob);
+    /* call Multiply function for mask */
+    _MultiplyDim(x, mask, y, n, 0);

-    return y;
+    delete mask;
+    delete[] maskArray;
 }

 /*
-backward computation of dropout function
+backward computation of the dropout function

 dE/dx = dE/dy * dy/dx
...
...
@@ -146,48 +85,86 @@ dE/dx = dE/dy * dy/dx
 >> x - input of the dropout function
 >> dedy - dE/dy
 >> dedx - dE/dx
->> prob - probability to set an element zero
 >> seed - random seed
+>> dropProb - probability to set an element to zero
+>> leadingDim - the dimension which we generate the random numbers and perform broadcasting
 */
 void _DropoutBackward(const XTensor * y, const XTensor * x,
                       const XTensor * dedy, XTensor * dedx,
-                      unsigned int seed, DTYPE prob)
+                      unsigned int seed, DTYPE dropProb, int leadingDim)
 {
+    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
+
+    int n = leadingDim < 0 ? x->order - 1 : leadingDim;
+
+    CheckNTErrors(n >= 0 && n < x->order, "Wrong leadingDim!");
+
     if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
     {
-        int unitNum = y->unitNum;
-        DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - prob);
+        DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - dropProb);

         /* generate a mask tensor again with special probability */
-        srand(seed);
+        int unitNum = x->dimSize[n];
         DTYPE * maskArray = new DTYPE[unitNum];
+
+        srand(seed);
         for(int i = 0; i < unitNum; i++)
-            maskArray[i] = RandomBernoulli(prob);
+            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);

-        XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
-        maskTensor->SetData(maskArray, unitNum);
-
-#ifdef USE_CUDA
-        if(x->devID >= 0 || y->devID >= 0){
-            _CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
-            DelTensorBuf(maskTensor);
-            delete[] maskArray;
-            return;
-        }
-#endif
-
-        DTYPE * dedyp = (DTYPE*)dedy->data;
-        DTYPE * dedxp = (DTYPE*)dedx->data;
+        XTensor * mask = NewTensor1D(unitNum, x->dataType, x->devID, x->mem);
+        mask->SetData(maskArray, unitNum);

-        /* dE/dx = dE/dy * dy/dx */
-        for(int i = 0; i < unitNum; i++)
-            dedxp[i] = dedyp[i] * maskArray[i] * scaleFactor;
+        /* call MultiplyDim function for mask */
+        _MultiplyDim(dedy, mask, dedx, n, 0);

-        DelTensorBuf(maskTensor);
+        delete mask;
         delete[] maskArray;
     }
     else
         ShowNTErrors("TODO!");
 }

+/*
+dropout function (we make tensor connections here)
+It randomly zeroes some of the elements of the input tensor
+with probability p via a Bernoulli distribution.
+
+See "Improving neural networks by preventing co-adaptation of feature detectors"
+for more details.
+
+Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
+to mark the tensor with probability p in the inference phase. Instead we perform
+the same inference procedure as that with no use of dropout on the test data.
+
+>> x - input tensor
+>> dropProb - probability to set an element to zero
+>> leadingDim - the dimension which we generate the random numbers and perform broadcasting
+*/
+XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
+{
+    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
+
+    int n = leadingDim < 0 ? x.order - 1 : leadingDim;
+
+    CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
+
+    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
+
+    /* generate a mask tensor with probability p */
+    int unitNum = x.dimSize[n];
+    DTYPE * maskArray = new DTYPE[unitNum];
+
+    srand((unsigned int)time(NULL));
+    for(int i = 0; i < unitNum; i++)
+        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
+
+    XTensor mask;
+    InitTensor1D(&mask, unitNum, x.dataType, x.devID, x.mem);
+    mask.SetData(maskArray, unitNum);
+
+    delete[] maskArray;
+
+    return MultiplyDim(x, mask, n, 0);
+}
+
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
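Reading the backward hunk above: the forward pass computes y = x * mask with mask entries equal to either 0 or 1/(1-p), so dE/dx = dE/dy * mask, which is why _DropoutBackward regenerates the same mask from the seed and applies the same broadcast multiplication to dedy. A plain-array sketch of that relation (a hypothetical helper, not the library API; it assumes the mask already contains the 1/(1-p) scaling, as in this commit):

```cpp
#include <vector>

/* dE/dx = dE/dy * mask, with the per-column mask broadcast over rows */
void DropoutBackwardByBroadcast(const std::vector<float> &dedy,
                                std::vector<float> &dedx,
                                const std::vector<float> &mask,
                                int rows, int cols)
{
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            dedx[i * cols + j] = dedy[i * cols + j] * mask[j];
}
```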
source/tensor/function/Dropout.h

@@ -27,16 +27,22 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

+/* generate a random bernoulli number */
+inline DTYPE RandomBernoulli(DTYPE dropProb, DTYPE value)
+{
+    return (DTYPE)rand()/(DTYPE)RAND_MAX >= dropProb ? (DTYPE)value : 0;
+}
+
 /* dropout function */
-void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
+void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb, int leadingDim = -1);

 /* de/dx */
 void _DropoutBackward(const XTensor * y, const XTensor * x,
                       const XTensor * dedy, XTensor * dedx,
-                      unsigned int seed, DTYPE prob = 0.5);
+                      unsigned int seed, DTYPE dropProb, int leadingDim = -1);

 /* dropout function */
-XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
+XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);

 } // namespace nts(NiuTrans.Tensor)
...
...
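A hypothetical usage sketch of the declarations above. InitTensor2D, SetDataRand, and the XTensor constructor from a pointer are written the way they appear elsewhere in this commit, so treat the exact setup calls as an assumption rather than documented API:

```cpp
#include "FHeader.h"   /* pulls in Dropout.h and XTensor.h */

using namespace nts;

void DropoutUsageSketch()
{
    XTensor x;
    InitTensor2D(&x, 40, 50, X_FLOAT, -1, NULL);
    x.SetDataRand(0, 1);

    /* high-level call: the mask is sampled along the last dimension
       (leadingDim = -1 by default) and broadcast over the others */
    XTensor y;
    y = Dropout(x, 0.2F);

    /* low-level call with an explicit random seed and leading dimension */
    XTensor z(&x);
    _Dropout(&x, &z, 20, 0.2F, -1);
}
```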
source/tensor/function/FHeader.h

@@ -26,6 +26,7 @@
 #include "../XTensor.h"

+#include "Dropout.h"
 #include "HardTanH.h"
 #include "Identity.h"
 #include "LogSoftmax.h"
...
...
source/tensor/test/TDropout.cpp

@@ -31,10 +31,11 @@ case 1: test Dropout function.
bool TestDropout1()
{
    /* a input tensor of size (4, 5) */
-   int order = 2;
+   int order = 3;
    int * dimSize = new int[order];
    dimSize[0] = 40;
    dimSize[1] = 50;
+   dimSize[2] = 60;

    int unitNum = 1;
    for(int i = 0; i < order; i++)
...
@@ -49,14 +50,14 @@ bool TestDropout1()
    XTensor yUser;

    /* initialize variables */
-   x->SetDataRand(0, 1);
+   _SetDataFixedFloat(x, 1.0F);
    y->SetZeroAll();

    /* call Dropout function */
-   float prob = 0.2F;
+   float drop_prob = 0.2F;
    int seed = 20;
-   _Dropout(x, y, seed, prob);
-   yUser = Dropout(*x);
+   _Dropout(x, y, seed, drop_prob);
+   yUser = Dropout(*x, drop_prob);

    /* check result */
    int zeroNum1 = 0;
...
@@ -73,9 +74,9 @@ bool TestDropout1()
    }
    printf("CPU Test:\n");
    printf("In tensor y, there are %d units.\n", unitNum);
-   printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
+   printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, drop_prob);
    printf("In tensor yUser, there are %d units.\n", unitNum);
-   printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
+   printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, drop_prob);

#ifdef USE_CUDA
    /* GPU test */
...
@@ -87,12 +88,12 @@ bool TestDropout1()
    XTensor yUserGPU;

    /* initialize variables */
-   xGPU->SetDataRand(0, 1);
+   _SetDataFixedFloat(xGPU, 1.0F);
    yGPU->SetZeroAll();

    /* call Dropout function */
-   _Dropout(xGPU, yGPU, seed, prob);
-   yUserGPU = Dropout(*xGPU);
+   _Dropout(xGPU, yGPU, seed, drop_prob);
+   yUserGPU = Dropout(*xGPU, drop_prob);

    /* check result */
    zeroNum1 = 0;
...
@@ -109,9 +110,9 @@ bool TestDropout1()
    }
    printf("CPU Test:\n");
    printf("In tensor y, there are %d units.\n", unitNum);
-   printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
+   printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, drop_prob);
    printf("In tensor yUser, there are %d units.\n", unitNum);
-   printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
+   printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, drop_prob);

    /* destroy variables */
    delete x;
...
@@ -159,13 +160,13 @@ bool TestDropout2()
    _SetDataFixedFloat(x, 1.0F);
    y->SetZeroAll();
    dedx->SetZeroAll();
-   _SetDataFixedFloat(dedy, 1.0F);
+   _SetDataFixedFloat(dedy, 1.5F);

    /* call Dropout function */
-   float prob = 0.5F;
+   float drop_prob = 0.5F;
    int seed = 1;
-   _Dropout(x, y, seed, prob);
-   _DropoutBackward(y, x, dedy, dedx, 1, prob);
+   _Dropout(x, y, seed, drop_prob);
+   _DropoutBackward(y, x, dedy, dedx, 1, drop_prob);

    /* check result */
    y->Dump(stderr, "y");
...
@@ -185,11 +186,11 @@ bool TestDropout2()
    _SetDataFixedFloat(xGPU, 1.0F);
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
-   _SetDataFixedFloat(dedyGPU, 1.0F);
+   _SetDataFixedFloat(dedyGPU, 1.5F);

    /* call Dropout function */
-   _Dropout(xGPU, yGPU, seed, prob);
-   _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, prob);
+   _Dropout(xGPU, yGPU, seed, drop_prob);
+   _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, drop_prob);

    /* check result */
    yGPU->Dump(stderr, "yGPU");
...
...
source/tensor/test/TXMem.cpp

@@ -65,9 +65,10 @@ bool TestXMemCase1()
    for (int i = 0; i < testNum * scalar; i++) {
        testxmemid++;
        //fprintf(stderr, "%d %d\n", testxmemid, ok);

        int j = rand() % caseNum;
+       //fprintf(stderr, "%d %d %d\n", testxmemid, j, ok);

        if (p[j] == NULL) {
            p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
            for (int k = 0; k < size[j]; k++)
...
...