Commit 0b43acf6 by 姜雨帆

update

parent 896e5231
@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
     GradMultiply(node, isEfficient);
 else if(operID == MATH_MULTIPLYDIM)
     GradMultiplyDim(node, isEfficient);
+else if (operID == MATH_MULTIPLYBROADCAST)
+    GradMultiplyBroadcast(node, isEfficient);
 else if(operID == MATH_NEGATE)
     GradNegate(node, isEfficient);
 else if(operID == MATH_NORMALIZE)
...
@@ -75,16 +75,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
     InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
     InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
     InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
+    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
     float scale = 1.0F;
     float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
     float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
     float finfouta = (float)sqrt(6.0F * scale / (d + d));
+    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
     wk.SetDataRand(-finfoutk, finfoutk);
     wq.SetDataRand(-finfoutk, finfoutk);
     wv.SetDataRand(-finfoutv, finfoutv);
     wa.SetDataRand(-finfouta, finfouta);
+    wbig.SetDataRand(-finfoutbig, finfoutbig);
 }
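
The finfout* bounds follow Xavier (Glorot) uniform initialization: a weight of shape (fanIn, fanOut) is drawn from U(-b, b) with b = sqrt(6 / (fanIn + fanOut)), so the new wbig of shape (d, 3*d) gets b = sqrt(6 / (4*d)). A minimal sketch of the rule; the helper name is ours, for illustration only:

    #include <cmath>

    /* Xavier/Glorot uniform bound, matching the finfout* values above
       with scale = 1.0F; illustrative helper, not part of the library */
    float XavierBound(int fanIn, int fanOut, float scale = 1.0F)
    {
        return std::sqrt(6.0F * scale / (fanIn + fanOut));
    }

    /* e.g. for wbig of shape (d, 3*d):
       float b = XavierBound(d, 3 * d);
       wbig.SetDataRand(-b, b);                                       */
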
 /*
@@ -98,16 +101,40 @@ make the network
 >> isTraining - indicates whether the model is used for training
 << return - multi-attention result
 */
-XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
+XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
 {
     XTensor k2;
     XTensor q2;
     XTensor v2;
+    if (selfatt){
+        XTensor con;
+        XList split;
+        con = MMul(k, wbig);
+        int d1 = con.GetDim(0);
+        int d2 = con.GetDim(1);
+        int d3 = con.GetDim(2) / 3;
+        InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
+        InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
+        InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
+        split.Add(&q2);
+        split.Add(&k2);
+        split.Add(&v2);
+        Split(con, split, 2, 3);
+    }
+    else{
         /* linear transformation before self-attention */
         k2 = MMul(k, wk);
         q2 = MMul(q, wq);
         v2 = MMul(v, wv);
+    }
     XTensor kheads;
     XTensor qheads;
...
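
The self-attention branch fuses the three projections: a single matmul with wbig computes Q, K and V together, and Split then slices dimension 2 into three equal parts. One large GEMM generally keeps the GPU busier than three small ones, which is presumably the motivation; the fused path is only valid when k, q and v are the same tensor, so the encoder-decoder attention keeps the separate wk/wq/wv path (selfatt == false). A condensed restatement of the new path, assuming x = k = q = v of shape (batch, length, d) and q2/k2/v2 initialized as above:

    XTensor con = MMul(x, wbig);   /* (batch, length, 3*d) */
    XList split;
    split.Add(&q2);                /* note the order: q, k, v */
    split.Add(&k2);
    split.Add(&v2);
    Split(con, split, 2, 3);       /* slice dim 2 into 3 parts of size d */
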
@@ -60,6 +60,7 @@ public:
     /* transformation after dot-product attention */
     XTensor wa;
+    /* joint transformation of Q, K and V for self-attention */
+    XTensor wbig;
     /* size of transformed Q and K */
     int dk;
@@ -95,7 +96,7 @@ public:
                    int myDevID = -1, XMem * myMem = NULL);
     /* make the network */
-    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
+    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
 };
 }
...
@@ -21,6 +21,8 @@
 #include <math.h>
 #include "T2TDecoder.h"
+#include "T2TUtility.h"
+#include "T2TLayerNormal.h"
 #include "../../tensor/core/CHeader.h"
 namespace transformer
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
                            bool myIsMasked, int myIgnored,
                            int myDevID, XMem * myMem)
 {
-    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    devID = myDevID;
+    mem = myMem;
+    ignored = myIgnored;
+    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
+    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
+    CheckNTErrors(nlayer >= 1, "We have one decoding layer at least!");
+    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
+    /* embedding model */
+    embedder.InitModel(argc, argv, devID, mem, false);
+    attentions = new T2TAttention[nlayer];
+    fnns = new T2TFNN[nlayer];
+    attLayerNorms = new T2TLN[nlayer];
+    fnnLayerNorms = new T2TLN[nlayer];
     attentionsEnde = new T2TAttention[nlayer];
     attEndeLayerNorms = new T2TLN[nlayer];
     /* initialize the stacked layers */
-    for(int i = 0; i < nlayer; i++){
-        attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    for (int i = 0; i < nlayer; i++) {
+        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+        fnns[i].InitModel(argc, argv, myDevID, myMem);
+        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
         attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
     }
 }
 /*
@@ -82,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
     /* dropout */
     if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP, 2);
+        x = Dropout(x, dropoutP);
     for(int i = 0; i < nlayer; i++){
         XTensor att;
@@ -93,11 +122,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
     /******************/
     /* self attention */
-    att = attentions[i].Make(x, x, x, mask, isTraining);
+    att = attentions[i].Make(x, x, x, mask, isTraining, true);
     /* dropout */
     if(isTraining && dropoutP > 0)
-        att = Dropout(att, dropoutP, 2);
+        att = Dropout(att, dropoutP);
     /* residual connection */
     res = Sum(att, x);
@@ -107,11 +136,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
     /*****************************/
     /* encoder-decoder attention */
-    ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
+    ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);
     /* dropout */
     if(isTraining && dropoutP > 0)
-        ende = Dropout(ende, dropoutP, 2);
+        ende = Dropout(ende, dropoutP);
     /* residual connection */
     res = Sum(ende, x);
@@ -125,7 +154,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
     /* dropout */
     if(isTraining && dropoutP > 0)
-        fnn = Dropout(fnn, dropoutP, 2);
+        fnn = Dropout(fnn, dropoutP);
     /* residual connection */
     res = Sum(fnn, x);
...
@@ -27,9 +27,56 @@
 namespace transformer
 {
-class AttDecoder : public AttEncoder
+class AttDecoder
 {
 public:
+    /* device id */
+    int devID;
+    /* memory pool */
+    XMem * mem;
+    /* layer number */
+    int nlayer;
+    /* hidden layer size of the FNN layer */
+    int hSize;
+    /* embedding size */
+    int eSize;
+    /* vocabulary size */
+    int vSize;
+    /* dropout probability */
+    DTYPE dropoutP;
+    /* some positions can be ignored in attention. this is useful in LM where the first position needs
+     * special design for the attention model. */
+    int ignored;
+    /* embedding of word at each position */
+    T2TEmbedder embedder;
+    /* FNN model of each layer */
+    T2TFNN * fnns;
+    /* attention model of each layer */
+    T2TAttention * attentions;
+    /* layer normalization for fnn */
+    T2TLN * fnnLayerNorms;
+    /* layer normalization for attention */
+    T2TLN * attLayerNorms;
+    /* input tensor of the decoder */
+    XTensor * input;
+    /* output tensor of the decoder */
+    XTensor * output;
     /* encoder-decoder attention model of each layer */
     T2TAttention * attentionsEnde;
...
@@ -48,12 +48,18 @@ initialize the model
 >> myDevID - device id
 >> myMem - the memory pool
 */
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
 {
     devID = myDevID;
     mem = myMem;
+    if(isEnc){
         LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    }
+    else{
+        LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+    }
+    //LoadParamInt(argc, argv, "vsize", &vSize, -1);
     LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
     LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
...
@@ -71,7 +71,7 @@ public:
     ~T2TEmbedder();
     /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
     /* make positional embeddings */
     void MakePosEmbedding(int eSize, int d, int length);
...
@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
     /* dropout */
     if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP, 2);
+        x = Dropout(x, dropoutP);
     for(int i = 0; i < nlayer; i++){
         XTensor att;
@@ -116,11 +116,11 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
     XTensor res;
     /* self attention */
-    att = attentions[i].Make(x, x, x, mask, isTraining);
+    att = attentions[i].Make(x, x, x, mask, isTraining, true);
     /* dropout */
     if(isTraining && dropoutP > 0)
-        att = Dropout(att, dropoutP, 2);
+        att = Dropout(att, dropoutP);
     /* residual connection */
     res = Sum(att, x);
@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
     /* dropout */
     if(isTraining && dropoutP > 0)
-        fnn = Dropout(fnn, dropoutP, 2);
+        fnn = Dropout(fnn, dropoutP);
     /* residual connection */
     res = Sum(fnn, x);
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
 }
 }
@@ -274,9 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
     _Sum(&maskEnc, padding3, &maskEnc);
     encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
+    //encoding.Dump(stderr, "encoding",10);
     decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
+    //decoding.Dump(stderr, "decoding", 10);
     outputLayer->Make(decoding, output);
     delete[] dims;
@@ -300,9 +301,10 @@ void T2TModel::GetParams(XList &list)
     list.Add(&encoder->fnns[i].b1);
     list.Add(&encoder->fnns[i].w2);
     list.Add(&encoder->fnns[i].b2);
-    list.Add(&encoder->attentions[i].wk);
-    list.Add(&encoder->attentions[i].wq);
-    list.Add(&encoder->attentions[i].wv);
+    //list.Add(&encoder->attentions[i].wk);
+    //list.Add(&encoder->attentions[i].wq);
+    //list.Add(&encoder->attentions[i].wv);
+    list.Add(&encoder->attentions[i].wbig);
     list.Add(&encoder->attentions[i].wa);
     list.Add(&encoder->fnnLayerNorms[i].w);
     list.Add(&encoder->fnnLayerNorms[i].b);
@@ -324,9 +326,10 @@ void T2TModel::GetParams(XList &list)
     list.Add(&decoder->attentionsEnde[i].wa);
     list.Add(&decoder->attEndeLayerNorms[i].w);
     list.Add(&decoder->attEndeLayerNorms[i].b);
-    list.Add(&decoder->attentions[i].wk);
-    list.Add(&decoder->attentions[i].wq);
-    list.Add(&decoder->attentions[i].wv);
+    //list.Add(&decoder->attentions[i].wk);
+    //list.Add(&decoder->attentions[i].wq);
+    //list.Add(&decoder->attentions[i].wv);
+    list.Add(&decoder->attentions[i].wbig);
     list.Add(&decoder->attentions[i].wa);
     list.Add(&decoder->fnnLayerNorms[i].w);
     list.Add(&decoder->fnnLayerNorms[i].b);
...
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     float minmax = 0;
-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
     LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
...
@@ -41,12 +41,15 @@ T2TTrainer::T2TTrainer()
     seqLen2 = NULL;
     nseqBuf = 0;
     nextSeq = -1;
+    nextBatch = -1;
     argNum = 0;
     argArray = NULL;
     buf = NULL;
     buf2 = NULL;
+    bufBatch = NULL;
     bufSize = 0;
+    bufBatchSize = 0;
     seqOffset = NULL;
 }
@@ -55,6 +58,7 @@ T2TTrainer::~T2TTrainer()
 {
     delete[] buf;
     delete[] buf2;
+    delete[] bufBatch;
     delete[] seqLen;
     delete[] seqLen2;
     delete[] seqOffset;
@@ -117,9 +121,11 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
     LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
     LoadParamBool(argc, argv, "debug", &isDebugged, false);
+    LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false);
     buf = new int[bufSize];
     buf2 = new int[bufSize];
+    bufBatch = new BatchNode[bufSize];
     seqLen = new int[bufSize];
     seqLen2 = new int[bufSize];
     seqOffset = new int[bufSize];
@@ -172,6 +178,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     double startT = GetClockSec();
+    FILE * fileen = fopen("enc.txt", "w");
+    FILE * filede = fopen("dec.txt", "w");
     for(epoch = 1; epoch <= nepoch; epoch++){
 #ifndef WIN32
         if(isShuffled)
@@ -205,6 +214,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
+    //batchEnc.Dump(stderr, "enc",1);
+    //batchDec.Dump(stderr, "dec",1);
+    //paddingDec.Dump(stderr, "paddec");
     /* output probabilities */
     XTensor output;
@@ -222,17 +235,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
     /* make paddings for the output */
-    if (output.GetDim(0) > 1)
+    if (output.GetDim(0) > 0)
         PadOutput(&output, &gold, &paddingDec);
     /* get probabilities */
     float prob = GetProb(&output, &gold, NULL);
+    //printf("%f\n", prob);
+    //float prob = 0;
     DTYPE lossLocal = -prob / wc;
     bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
     XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
+    //doUpdate = false;
     if (doUpdate) {
         /* rescale the output for normalized loss */
@@ -292,6 +306,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
         MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
     }
+    fclose(fileen);
+    fclose(filede);
     double elapsed = GetClockSec() - startT;
     epoch = MIN(epoch, nepoch);
@@ -434,11 +451,11 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
     sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
     model->Dump(fn);
-    if(validFN != NULL){
-        T2TTrainer trainer;
-        trainer.Init(argNum, argArray);
-        trainer.Test(validFN, fn2, model);
-    }
+    //if(validFN != NULL){
+    //    T2TTrainer trainer;
+    //    trainer.Init(argNum, argArray);
+    //    trainer.Test(validFN, fn2, model);
+    //}
     delete[] fn;
     delete[] fn2;
@@ -473,7 +490,8 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
     int wordCount = 0;
     while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){
         int len = (int)strlen(line);
+        if(line[0]=='b')
+            break;
         while(line[len - 1] == '\r' || line[len - 1] == '\n'){
             line[len - 1] = 0;
             len--;
@@ -544,9 +562,14 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
     node.offset = i;
     node.p = buf + offset;
     node.size = 0;
-    for(int j = 0; j < step; j++)
+    int max = 0;
+    for(int j = 0; j < step; j++){
         node.size += seqLen[i + j];
-    node.value = seqLen[i];
+        max = MAX(max, seqLen[i + j]);
+    }
+    //node.value = seqLen[i+1]+seqLen[i];
+    //node.value = MAX(seqLen[i+1],seqLen[i]);
+    node.value = max;
     count++;
     offset += node.size;
 }
@@ -768,6 +791,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
     return sc;
 }
+int CompareBatchNode(const void * a, const void * b)
+{
+    return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
+}
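
CompareBatchNode sorts in descending order of key (b minus a). Since every batch later receives key = rand(), sorting by key amounts to shuffling whole batches while each batch stays a contiguous run of the (typically length-sorted) buffer, so padding efficiency is preserved and only the batch order is randomized. A small standalone illustration of the mechanism, not the commit's own code:

    #include <cstdlib>

    /* shuffle batches by sorting on fresh random keys; BatchNode and
       CompareBatchNode are the ones introduced in this commit */
    void ShuffleBatches(BatchNode * bufBatch, int bufBatchSize)
    {
        for (int i = 0; i < bufBatchSize; i++)
            bufBatch[i].key = rand();
        qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
    }
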
 /*
 load a batch of sequences (for MT)
 >> file - the handle to the data file
@@ -797,10 +826,70 @@ int T2TTrainer::LoadBatchMT(FILE * file,
                             int devID, XMem * mem,
                             bool isTraining)
 {
-    if(nextSeq < 0 || nextSeq >= nseqBuf)
+    //if (nextSeq < 0 || nextSeq >= nseqBuf)
+    //    LoadBuf(file, isSorted, 2);
+    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
         LoadBuf(file, isSorted, 2);
-    int seq = MAX(nextSeq, 0);
+        int seq = 0;
+        bufBatchSize = 0;
+        nextBatch = 0;
+        /* we segment the buffer into batches */
+        while (seq < nseqBuf) {
+            int wcEnc = 0;
+            int wcDec = 0;
+            int wnEnc = 0;
+            int wnDec = 0;
+            int maxEnc = 0;
+            int maxDec = 0;
+            int sc = 0;
+            while (seq + sc < nseqBuf) {
+                /* source-side sequence */
+                wnEnc = seqLen[seq + sc];
+                /* target-side sequence */
+                wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
+                int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
+                int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;
+                if(sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
+                    break;
+                wcEnc += wnEnc;
+                sc += 1;
+                if(maxEnc < wnEnc)
+                    maxEnc = wnEnc;
+                wcDec += wnDec;
+                sc += 1;
+                if(maxDec < wnDec)
+                    maxDec = wnDec;
+            }
+            BatchNode & batch = bufBatch[bufBatchSize];
+            batch.beg = seq;
+            batch.end = seq + sc;
+            batch.maxEnc = maxEnc;
+            batch.maxDec = maxDec;
+            batch.key = rand();
+            bufBatchSize++;
+            seq = seq + sc;
+        }
+        if(isRandomBatch)
+            qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
+    }
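
The segmentation loop walks the buffer in source/target pairs (sc grows by 2 per pair) and closes a batch once it holds more than sBatch * 2 sequences and the padded-token estimate exceeds the word budget wBatch. The estimate MAX(max, wn) * (sc + 2) / 2 is the longest sequence seen so far times the number of pairs after adding the current one, i.e. roughly the size of the padded batch tensor. A small worked restatement of that arithmetic, with hypothetical numbers:

    /* padded-token estimate used in the loop above (illustration only) */
    int PaddedTokenEstimate(int maxLenSoFar, int curLen, int sc)
    {
        int maxLen = maxLenSoFar > curLen ? maxLenSoFar : curLen;
        return maxLen * (sc + 2) / 2;   /* ~ longest sequence x pair count */
    }
    /* e.g. with wBatch = 4096 and source lengths near 32, the batch
       closes after roughly 4096 / 32 = 128 sentence pairs */
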
+    /*int seq = MAX(nextSeq, 0);
     int wcEnc = 0;
     int wcDec = 0;
     int wnEnc = 0;
@@ -813,10 +902,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     while(seq + sc < nseqBuf){
-        /* source-side sequence */
         wnEnc = seqLen[seq + sc];
-        /* target-side sequence */
         wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
         int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
@@ -841,8 +928,18 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     nextSeq = seq + sc;
     if(sc <= 0)
+        return 0;*/
+    if(bufBatchSize <= 0)
         return 0;
+    BatchNode & batch = bufBatch[nextBatch++];
+    int seq = batch.beg;
+    int sc = batch.end - batch.beg;
+    int maxEnc = batch.maxEnc;
+    int maxDec = batch.maxDec;
+    CheckNTErrors(sc % 2 == 0, "The input samples must be paired");
     int sCount = sc/2;
     int seqSize = 0;
     int dimsDec[3] = {sCount, maxDec, vsDec};
@@ -861,13 +958,14 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     int wCountEnc = 0;
     int wCountDec = 0;
+    int wCountPad = 0;
     int wGold = 0;
     wCount = 0;
     int * batchEncValues = new int[batchEnc->unitNum];
     int * batchDecValues = new int[batchDec->unitNum];
     //MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
-    //MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
+    MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
     MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
     memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
@@ -901,7 +999,10 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     int num = buf[seqOffset[s] + w];
     batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
     //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
+    if (w < len-1){
+        paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
+        wCount++;
+    }
     if (w > 0)
         goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
@@ -911,7 +1012,7 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     else
         goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
     }
-    wCount++;
+    //wCount++;
     wCountDec++;
     if(seqs != NULL)
         seqs[seqSize++] = buf[seqOffset[s] + w];
@@ -924,19 +1025,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
 }
     batchDec->SetData(batchDecValues, batchDec->unitNum);
-    //paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
-    XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
-    _ConvertDataType(batchDec, tmp2);
-    _NotEqual(tmp2, paddingDec, 0);
-    DelTensorBuf(tmp2);
+    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
+    //XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
+    //_ConvertDataType(batchDec, tmp2);
+    //_NotEqual(tmp2, paddingDec, 0);
+    //DelTensorBuf(tmp2);
     gold->SetDataBatched(goldOffsets, 1.0F, wGold);
     delete[] batchEncValues;
     delete[] batchDecValues;
     //delete[] paddingEncOffsets;
-    //delete[] paddingDecOffsets;
+    delete[] paddingDecOffsets;
     delete[] goldOffsets;
     return sc;
...
@@ -33,6 +33,25 @@ using namespace nts;
 namespace transformer
 {
+/* node to keep batch information */
+struct BatchNode
+{
+    /* beginning position */
+    int beg;
+    /* end position */
+    int end;
+    /* maximum word number on the encoder side */
+    int maxEnc;
+    /* maximum word number on the decoder side */
+    int maxDec;
+    /* a key for sorting */
+    int key;
+};
 /* trainer of the T2T model */
 class T2TTrainer
 {
@@ -49,9 +68,15 @@ public:
     /* another buffer */
     int * buf2;
+    /* batch buffer */
+    BatchNode * bufBatch;
     /* buffer size */
     int bufSize;
+    /* size of the batch buffer */
+    int bufBatchSize;
     /* length of each sequence */
     int * seqLen;
@@ -67,6 +92,9 @@ public:
     /* offset for next sequence in the buffer */
     int nextSeq;
+    /* offset of the next batch in the batch buffer */
+    int nextBatch;
     /* indicates whether the sequence is sorted by length */
     bool isLenSorted;
@@ -142,6 +170,9 @@ public:
     /* counterpart of "isSmallBatch" */
     bool isBigBatch;
+    /* randomize the order of batches */
+    bool isRandomBatch;
     /* indicates whether we intend to debug the net */
     bool isDebugged;
...
@@ -59,23 +59,28 @@ int TransformerMain(int argc, const char ** argv)
     LoadParamString(argc, args, "test", testFN, "");
     LoadParamString(argc, args, "output", outputFN, "");
+    srand((unsigned int)time(NULL));
     T2TTrainer trainer;
     trainer.Init(argc, args);
     T2TModel model;
     model.InitModel(argc, args);
+    //if(strcmp(modelFN, ""))
+    //    model.Read(modelFN);
     /* learn model parameters */
     if(strcmp(trainFN, ""))
         trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
     /* save the final model */
-    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-        model.Dump(modelFN);
+    //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+    //    model.Dump(modelFN);
     /* load the model if necessary */
-    if(strcmp(modelFN, ""))
-        model.Read(modelFN);
+    //if(strcmp(modelFN, ""))
+    //    model.Read(modelFN);
     T2TTrainer tester;
     tester.Init(argc, args);
...
@@ -60,6 +60,7 @@ XDevice::~XDevice()
     cublasDestroy(cublasHandle);
     if(stream != NULL)
         delete stream;
+    curandDestroyGenerator(gen);
 #endif
 }
@@ -82,6 +83,10 @@ void XDevice::Init(int myDevID)
     cudaDeviceProp prop;
     cudaSetDevice(myDevID);
+    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
+    curandSetPseudoRandomGeneratorSeed(gen, seed);
     if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
         XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
         exit(1);
...
@@ -112,6 +112,9 @@ public:
     /* specify if the handle is initialized */
     bool isHandleReady;
+    /* generator of random numbers */
+    curandGenerator_t gen;
 #endif
...
@@ -1614,11 +1614,17 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
     else if (dataType == X_INT) {
         int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
         for(int i = beg; i < end; i++){
+            if((i%(dimSize[1]) == 0)&&(i!=0)) {
+                fprintf(file, " \n");
+            }
             int f = ((int*)d)[i];
             if(i == beg)
                 fprintf(file, "%d", f);
             else
                 fprintf(file, " %d", f);
+            //if((i%(dimSize[1]-1) == 0)&&(i!=0)) {
+            //    fprintf(file, " \n");
+            //}
         }
     }
     else
...
@@ -387,7 +387,7 @@ generate data items with a uniform distribution in [lower, upper]
 >> lower - lower value of the range
 >> upper - upper value of the range
 */
-void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
+void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
 {
     CheckNTErrors(upper > lower, "the high value must be greater than low value!");
@@ -432,6 +432,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
 }
 /*
+generate data items with a uniform distribution in [lower, upper] and set
+the item to a pre-defined value if the item >= p, set the item to 0 otherwise
+>> tensor - the tensor whose data array would be initialized
+>> lower - lower value of the range
+>> upper - upper value of the range
+>> p - the threshold
+>> value - the value we intend to assign to the item
+*/
+void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
+{
+    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
+    if (tensor->devID < 0) {
+        _SetDataRand(tensor, lower, upper);
+        DTYPE * data = (DTYPE*)tensor->data;
+        for (int i = 0; i < tensor->unitNum; i++) {
+            if (data[i] >= p)
+                data[i] = value;
+            else
+                data[i] = 0;
+        }
+    }
+    else {
+#ifdef USE_CUDA
+        _CudaSetDataRandP(tensor, lower, upper, p, value);
+#else
+        ShowNTErrors("Please recompile the code by specifying USE_CUDA");
+#endif // USE_CUDA
+    }
+}
+/*
 generate data items with a normal distribution with specified mean and standard deviation
 >> tensor - the tensor that keeps the data
 >> mean - mean or expectation of the distribution
...
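
_SetDataRandP draws each element from U(lower, upper) and then thresholds it: elements >= p become value, the rest become 0. With lower = 0, upper = 1, p = dropProb and value = 1/(1 - dropProb), every element is kept with probability 1 - dropProb, which is exactly how the reworked Dropout below uses it. A usage sketch under those assumptions:

    /* build an inverted-dropout mask in one pass (sketch);
       E[mask] = (1 - p) * 1/(1 - p) = 1, so no rescaling is needed at test time */
    XTensor mask;
    InitTensor(&mask, &x);                               /* same shape as x */
    DTYPE scale = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
    _SetDataRandP(&mask, 0.0F, 1.0F, dropProb, scale);
    XTensor y = Multiply(x, mask);
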
@@ -186,6 +186,26 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
 }
 /*
+set data items to a pre-defined value if its value >= p, set it to 0 otherwise
+>> d - pointer to the data array
+>> size - size of the array
+>> p - the threshold
+>> value - the value we intend to assign to the item
+*/
+__global__
+void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < size) {
+        if (d[i] >= p)
+            d[i] = value;
+        else
+            d[i] = 0;
+    }
+}
+/*
 set data items along with a given dimension (and keep the remaining items unchanged) - kernel version
 >> tensor - the tensor whose data array would be initialized
 >> beg - the beginning position
@@ -437,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper]
 >> lower - lower value of the range
 >> upper - upper value of the range
 */
-void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
+void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
 {
     CheckNTErrors(upper > lower, "the high value must be greater than low value!");
@@ -452,17 +472,46 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
     int devIDBackup;
     ProtectCudaDev(tensor->devID, devIDBackup);
-    curandGenerator_t gen;
-    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-    curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
+    curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
     curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
-    curandDestroyGenerator(gen);
     DTYPE variance = upper - lower;
+    if(variance != 1.0F || lower != 0){
         if (tensor->dataType == X_FLOAT)
             KernelSetDataRandFloat <<<blocks, threads>>>((float*) tensor->data, tensor->unitNum, lower, variance);
         else if (tensor->dataType == X_DOUBLE)
             KernelSetDataRandDouble <<<blocks, threads>>>((double*)tensor->data, tensor->unitNum, lower, variance);
+    }
+    BacktoCudaDev(tensor->devID, devIDBackup);
+}
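
Two fixes in one here: the per-call curand generator (created, seeded with time(NULL) and destroyed on every _CudaSetDataRand call) is replaced by the per-device generator now created once in XDevice::Init, so repeated calls within the same second no longer risk drawing the same random stream and the setup cost disappears; and the affine rescale kernels are skipped when the requested range is already [0, 1), since curandGenerateUniform produces that range directly.
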
+/*
+generate data items with a uniform distribution in [lower, upper] and set
+the item to a pre-defined value if the item >= p, set the item to 0 otherwise
+>> tensor - the tensor whose data array would be initialized
+>> lower - lower value of the range
+>> upper - upper value of the range
+>> p - the threshold
+>> value - the value we intend to assign to the item
+*/
+void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
+{
+    _CudaSetDataRand(tensor, lower, upper);
+    int gridSize[3];
+    int blockSize[3];
+    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);
+    int devIDBackup;
+    ProtectCudaDev(tensor->devID, devIDBackup);
+    KernelSetDataPCut <<<blocks, threads>>>((float*)tensor->data, tensor->unitNum, p, value);
     BacktoCudaDev(tensor->devID, devIDBackup);
 }
...
@@ -47,7 +47,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
 void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
 /* generate data items with a uniform distribution in [lower, upper] */
-void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
+void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
+/* generate data items with a uniform distribution in [lower, upper] and set
+   the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
+void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
 /* set the data with an array of offsets */
 void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
...
@@ -55,7 +55,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
 void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
 /* generate data items with a uniform distribution in [lower, upper] */
-void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
+void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
+/* generate data items with a uniform distribution in [lower, upper] and set
+   the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
+void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
 /* generate data items with a normal distribution with specified mean and standard deviation */
 void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
...
@@ -26,6 +26,7 @@
 #include "../core/arithmetic/Multiply.h"
 #include "../core/arithmetic/MultiplyDim.h"
 #include "../core/math/ScaleAndShift.h"
+#include "../core/getandset/SetData.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor
@@ -147,17 +148,21 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
     XTensor mask;
     DTYPE * maskArray = NULL;
+    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
     if(leadingDim < 0 && leadingDim2 < 0){
-        ShowNTErrors("TODO");
+        XTensor mask;
+        InitTensor(&mask, &x);
+        _SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
+        return Multiply(x, mask);
     }
     else if(leadingDim2 < 0){
         int n = leadingDim;
         CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
-        DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
         /* generate a mask tensor with probability p */
         int unitNum = x.dimSize[n];
         maskArray = new DTYPE[unitNum];
@@ -181,8 +186,6 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
     CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
     CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
-    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
     /* generate a mask tensor with probability p */
     int unitNum = x.dimSize[n] * x.dimSize[m];
     maskArray = new DTYPE[unitNum];
...
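
The new leadingDim < 0 path drops elements i.i.d. over the whole tensor instead of failing with "TODO". Each unit survives with probability 1 - dropProb and is scaled by 1/(1 - dropProb), so the expected activation is unchanged: E[mask * x] = (1 - p) * x / (1 - p) = x. A standalone sanity check of that scaling, plain C++ independent of the library:

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const float p = 0.3F, scale = 1.0F / (1.0F - p);
        double sum = 0.0;
        const int n = 1000000;
        for (int i = 0; i < n; i++) {
            float u = (float)rand() / RAND_MAX;   /* u ~ U(0,1) */
            sum += (u >= p) ? scale : 0.0F;       /* one mask element */
        }
        printf("mean mask = %.4f (expected 1.0)\n", sum / n);
        return 0;
    }
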