Commit 0b43acf6 by 姜雨帆

update

parent 896e5231
......@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradMultiply(node, isEfficient);
else if(operID == MATH_MULTIPLYDIM)
GradMultiplyDim(node, isEfficient);
else if (operID == MATH_MULTIPLYBROADCAST)
GradMultiplyBroadcast(node, isEfficient);
else if(operID == MATH_NEGATE)
GradNegate(node, isEfficient);
else if(operID == MATH_NORMALIZE)
......
......@@ -75,16 +75,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-finfoutv, finfoutv);
wa.SetDataRand(-finfouta, finfouta);
wbig.SetDataRand(-finfoutbig, finfoutbig);
}
/*
......@@ -98,16 +101,40 @@ make the network
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
{
XTensor k2;
XTensor q2;
XTensor v2;
/* linear transofmration before self-attention */
k2 = MMul(k, wk);
q2 = MMul(q, wq);
v2 = MMul(v, wv);
if (selfatt){
XTensor con;
XList split;
con = MMul(k, wbig);
int d1 = con.GetDim(0);
int d2 = con.GetDim(1);
int d3 = con.GetDim(2) / 3;
InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
split.Add(&q2);
split.Add(&k2);
split.Add(&v2);
Split(con, split, 2, 3);
}
else{
/* linear transofmration before self-attention */
k2 = MMul(k, wk);
q2 = MMul(q, wq);
v2 = MMul(v, wv);
}
XTensor kheads;
XTensor qheads;
......
......@@ -59,7 +59,8 @@ public:
/* transformation after dot-product attention */
XTensor wa;
XTensor wbig;
/* size of transformed Q and K */
int dk;
......@@ -95,7 +96,7 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
};
}
......
......@@ -21,6 +21,8 @@
#include <math.h>
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, mem, false);
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
......@@ -82,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP, 2);
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
......@@ -93,11 +122,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/******************/
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
att = attentions[i].Make(x, x, x, mask, isTraining, true);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP, 2);
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
......@@ -107,11 +136,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP, 2);
ende = Dropout(ende, dropoutP);
/* residual connection */
res = Sum(ende, x);
......@@ -125,7 +154,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP, 2);
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
......
......@@ -27,9 +27,56 @@
namespace transformer
{
class AttDecoder : public AttEncoder
class AttDecoder
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
/* hidden layer size of the FNN layer */
int hSize;
/* embedding size */
int eSize;
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
* special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
/* layer normalization for attention */
T2TLN * attLayerNorms;
/* input tensor of the encoder */
XTensor * input;
/* output tensor of the encoder */
XTensor * output;
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;
......@@ -53,4 +100,4 @@ public:
}
#endif
\ No newline at end of file
#endif
......@@ -48,12 +48,18 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
{
devID = myDevID;
mem = myMem;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
}
else{
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
......
......@@ -71,7 +71,7 @@ public:
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length);
......
......@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP, 2);
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
......@@ -116,11 +116,11 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
XTensor res;
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
att = attentions[i].Make(x, x, x, mask, isTraining, true);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP, 2);
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
......@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP, 2);
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
......@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}
}
......@@ -274,9 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
//encoding.Dump(stderr, "encoding",10);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
//decoding.Dump(stderr, "decoding", 10);
outputLayer->Make(decoding, output);
delete[] dims;
......@@ -300,9 +301,10 @@ void T2TModel::GetParams(XList &list)
list.Add(&encoder->fnns[i].b1);
list.Add(&encoder->fnns[i].w2);
list.Add(&encoder->fnns[i].b2);
list.Add(&encoder->attentions[i].wk);
list.Add(&encoder->attentions[i].wq);
list.Add(&encoder->attentions[i].wv);
//list.Add(&encoder->attentions[i].wk);
//list.Add(&encoder->attentions[i].wq);
//list.Add(&encoder->attentions[i].wv);
list.Add(&encoder->attentions[i].wbig);
list.Add(&encoder->attentions[i].wa);
list.Add(&encoder->fnnLayerNorms[i].w);
list.Add(&encoder->fnnLayerNorms[i].b);
......@@ -324,9 +326,10 @@ void T2TModel::GetParams(XList &list)
list.Add(&decoder->attentionsEnde[i].wa);
list.Add(&decoder->attEndeLayerNorms[i].w);
list.Add(&decoder->attEndeLayerNorms[i].b);
list.Add(&decoder->attentions[i].wk);
list.Add(&decoder->attentions[i].wq);
list.Add(&decoder->attentions[i].wv);
//list.Add(&decoder->attentions[i].wk);
//list.Add(&decoder->attentions[i].wq);
//list.Add(&decoder->attentions[i].wv);
list.Add(&decoder->attentions[i].wbig);
list.Add(&decoder->attentions[i].wa);
list.Add(&decoder->fnnLayerNorms[i].w);
list.Add(&decoder->fnnLayerNorms[i].b);
......
......@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
float minmax = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
......
......@@ -41,12 +41,15 @@ T2TTrainer::T2TTrainer()
seqLen2 = NULL;
nseqBuf = 0;
nextSeq = -1;
nextBatch = -1;
argNum = 0;
argArray = NULL;
buf = NULL;
buf2 = NULL;
bufBatch = NULL;
bufSize = 0;
bufBatchSize = 0;
seqOffset = NULL;
}
......@@ -55,6 +58,7 @@ T2TTrainer::~T2TTrainer()
{
delete[] buf;
delete[] buf2;
delete[] bufBatch;
delete[] seqLen;
delete[] seqLen2;
delete[] seqOffset;
......@@ -117,9 +121,11 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, argv, "debug", &isDebugged, false);
LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false);
buf = new int[bufSize];
buf2 = new int[bufSize];
bufBatch = new BatchNode[bufSize];
seqLen = new int[bufSize];
seqLen2 = new int[bufSize];
seqOffset = new int[bufSize];
......@@ -171,6 +177,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
PrepareModel(model);
double startT = GetClockSec();
FILE * fileen = fopen("enc.txt", "w");
FILE * filede = fopen("dec.txt", "w");
for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
......@@ -205,6 +214,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
//batchEnc.Dump(stderr, "enc",1);
//batchDec.Dump(stderr, "dec",1);
//paddingDec.Dump(stderr, "paddec");
/* output probabilities */
XTensor output;
......@@ -222,17 +235,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
/* make paddings for the output */
if (output.GetDim(0) > 1)
if (output.GetDim(0) > 0)
PadOutput(&output, &gold, &paddingDec);
/* get probabilities */
float prob = GetProb(&output, &gold, NULL);
//printf("%f\n", prob);
//float prob = 0;
DTYPE lossLocal = -prob / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
//doUpdate = false;
if (doUpdate) {
/* recale the output for normalized loss */
......@@ -292,6 +306,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
}
fclose(fileen);
fclose(filede);
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, nepoch);
......@@ -434,11 +451,11 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
model->Dump(fn);
if(validFN != NULL){
T2TTrainer trainer;
trainer.Init(argNum, argArray);
trainer.Test(validFN, fn2, model);
}
//if(validFN != NULL){
//T2TTrainer trainer;
//trainer.Init(argNum, argArray);
//trainer.Test(validFN, fn2, model);
//}
delete[] fn;
delete[] fn2;
......@@ -473,7 +490,8 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
int wordCount = 0;
while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){
int len = (int)strlen(line);
if(line[0]=='b')
break;
while(line[len - 1] == '\r' || line[len - 1] == '\n'){
line[len - 1] = 0;
len--;
......@@ -544,9 +562,14 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
node.offset = i;
node.p = buf + offset;
node.size = 0;
for(int j = 0; j < step; j++)
int max = 0;
for(int j = 0; j < step; j++){
node.size += seqLen[i + j];
node.value = seqLen[i];
max = MAX(max, seqLen[i + j]);
}
//node.value = seqLen[i+1]+seqLen[i];
//node.value = MAX(seqLen[i+1],seqLen[i]);
node.value = max;
count++;
offset += node.size;
}
......@@ -768,6 +791,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
return sc;
}
int CompareBatchNode(const void * a, const void * b)
{
return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
......@@ -797,10 +826,70 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int devID, XMem * mem,
bool isTraining)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
//if (nextSeq < 0 || nextSeq >= nseqBuf)
// LoadBuf(file, isSorted, 2);
if (nextBatch < 0 || nextBatch >= bufBatchSize) {
LoadBuf(file, isSorted, 2);
int seq = MAX(nextSeq, 0);
int seq = 0;
bufBatchSize = 0;
nextBatch = 0;
/* we segment the buffer into batches */
while (seq < nseqBuf) {
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
while (seq + sc < nseqBuf) {
/* source-side sequence */
wnEnc = seqLen[seq + sc];
/* target-side sequence */
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
int tcDec = isBigBatch ? (wcDec + wnDec): MAX(maxDec, wnDec) * (sc + 2) / 2;
if(sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
break;
wcEnc += wnEnc;
sc += 1;
if(maxEnc < wnEnc)
maxEnc = wnEnc;
wcDec += wnDec;
sc += 1;
if(maxDec < wnDec)
maxDec = wnDec;
}
BatchNode & batch = bufBatch[bufBatchSize];
batch.beg = seq;
batch.end = seq + sc;
batch.maxEnc = maxEnc;
batch.maxDec = maxDec;
batch.key = rand();
bufBatchSize++;
seq = seq + sc;
}
if(isRandomBatch)
qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
}
/*int seq = MAX(nextSeq, 0);
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
......@@ -813,10 +902,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
while(seq + sc < nseqBuf){
/* source-side sequence */
wnEnc = seqLen[seq + sc];
/* target-side sequence */
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
......@@ -841,8 +928,18 @@ int T2TTrainer::LoadBatchMT(FILE * file,
nextSeq = seq + sc;
if(sc <= 0)
return 0;*/
if(bufBatchSize <= 0)
return 0;
BatchNode & batch = bufBatch[nextBatch++];
int seq = batch.beg;
int sc = batch.end - batch.beg;
int maxEnc = batch.maxEnc;
int maxDec = batch.maxDec;
CheckNTErrors(sc % 2 == 0, "The input samples must be paired");
int sCount = sc/2;
int seqSize = 0;
int dimsDec[3] = {sCount, maxDec, vsDec};
......@@ -861,13 +958,14 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int wCountEnc = 0;
int wCountDec = 0;
int wCountPad = 0;
int wGold = 0;
wCount = 0;
int * batchEncValues = new int[batchEnc->unitNum];
int * batchDecValues = new int[batchDec->unitNum];
//MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
//MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
......@@ -901,7 +999,10 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w < len-1){
paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
wCount++;
}
if (w > 0)
goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
......@@ -911,7 +1012,7 @@ int T2TTrainer::LoadBatchMT(FILE * file,
else
goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
//wCount++;
wCountDec++;
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
......@@ -924,19 +1025,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
}
batchDec->SetData(batchDecValues, batchDec->unitNum);
//paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchDec, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);
//XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
//_ConvertDataType(batchDec, tmp2);
//_NotEqual(tmp2, paddingDec, 0);
//DelTensorBuf(tmp2);
gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncValues;
delete[] batchDecValues;
//delete[] paddingEncOffsets;
//delete[] paddingDecOffsets;
delete[] paddingDecOffsets;
delete[] goldOffsets;
return sc;
......
......@@ -33,6 +33,25 @@ using namespace nts;
namespace transformer
{
/* node to keep the information of one pre-segmented batch of sequences */
struct BatchNode
{
/* beginning position of the batch (index of its first sequence in the buffer) */
int beg;
/* end position of the batch (one past its last sequence in the buffer) */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key used when sorting batches (e.g., a random value for batch shuffling) */
int key;
};
/* trainer of the T2T model */
class T2TTrainer
{
......@@ -49,9 +68,15 @@ public:
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
......@@ -66,6 +91,9 @@ public:
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
......@@ -142,6 +170,9 @@ public:
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* indicates whether we intend to debug the net */
bool isDebugged;
......
......@@ -59,23 +59,28 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer;
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
//if(strcmp(modelFN, ""))
//model.Read(modelFN);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
//model.Dump(modelFN);
/* load the model if neccessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
//if(strcmp(modelFN, ""))
//model.Read(modelFN);
T2TTrainer tester;
tester.Init(argc, args);
......
......@@ -60,6 +60,7 @@ XDevice::~XDevice()
cublasDestroy(cublasHandle);
if(stream != NULL)
delete stream;
curandDestroyGenerator(gen);
#endif
}
......@@ -82,6 +83,10 @@ void XDevice::Init(int myDevID)
cudaDeviceProp prop;
cudaSetDevice(myDevID);
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, seed);
if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
exit(1);
......
......@@ -112,6 +112,9 @@ public:
/* specify if the handle is initialized */
bool isHandleReady;
/* generater of random numbers */
curandGenerator_t gen;
#endif
......
......@@ -1614,11 +1614,17 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
if((i%(dimSize[1]) == 0)&&(i!=0)) {
fprintf(file, " \n");
}
int f = ((int*)d)[i];
if(i == beg)
fprintf(file, "%d", f);
else
fprintf(file, " %d", f);
//if((i%(dimSize[1]-1) == 0)&&(i!=0)) {
//fprintf(file, " \n");
//}
}
}
else
......
......@@ -387,7 +387,7 @@ generate data items with a uniform distribution in [lower, upper]
>> lower - lower value of the range
>> upper - upper value of the range
*/
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
{
CheckNTErrors(upper > lower, "the high value must be greater than low value!");
......@@ -430,6 +430,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
//delete t2;
}
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
if (tensor->devID < 0) {
_SetDataRand(tensor, lower, upper);
DTYPE * data = (DTYPE*)tensor->data;
for (int i = 0; i < tensor->unitNum; i++) {
if (data[i] >= p)
data[i] = value;
else
data[i] = 0;
}
}
else {
#ifdef USE_CUDA
_CudaSetDataRandP(tensor, lower, upper, p, value);
#else
ShowNTErrors("Please recompile the code by specifying USE_CUDA");
#endif // USE_CUDA
}
}
/*
generate data items with a normal distribution with specified mean and standard deviation
......
......@@ -185,6 +185,26 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
}
}
/*
set data items to a pre-defined value if the item >= p, set it to 0 otherwise
(one thread per data item)
>> d - pointer to the data array
>> size - size of the array
>> p - the threshold
>> value - the value assigned to items that reach the threshold
*/
__global__
void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= p)
d[i] = value;
else
d[i] = 0;
}
}
/*
set data items along with a given dimension (and keep the remaining items unchanged) - kernel version
>> tensor - the tensor whose data array would be initialized
......@@ -437,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper]
>> lower - lower value of the range
>> upper - upper value of the range
*/
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
{
CheckNTErrors(upper > lower, "the high value must be greater than low value!");
......@@ -452,17 +472,46 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t gen;
curandCreateGenerator (&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen , (float*)tensor->data , tensor->unitNum);
curandDestroyGenerator(gen);
DTYPE variance = upper - lower;
if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
if(variance != 1.0F || lower != 0){
if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
generate data items with a uniform distribution in [lower, upper], then
binarize them on the GPU: an item is set to a pre-defined value if it is >= p,
and set to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value assigned to items that reach the threshold
*/
void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
    /* step 1: fill the tensor with uniform random numbers in [lower, upper] */
    _CudaSetDataRand(tensor, lower, upper);

    /* step 2: work out a 1D launch configuration covering every item */
    int cudaGrids[3];
    int cudaBlocks[3];
    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, cudaGrids, cudaBlocks);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

    /* step 3: threshold the random values (>= p -> value, otherwise -> 0) */
    KernelSetDataPCut<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0])>>>
                     ((float*)tensor->data, tensor->unitNum, p, value);

    BacktoCudaDev(tensor->devID, devIDBackup);
}
......
......@@ -47,7 +47,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
......
......@@ -55,7 +55,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
......
......@@ -26,6 +26,7 @@
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/getandset/SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor
......@@ -147,17 +148,21 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
XTensor mask;
DTYPE * maskArray = NULL;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
if(leadingDim < 0 && leadingDim2 < 0){
ShowNTErrors("TODO");
XTensor mask;
InitTensor(&mask, &x);
_SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
return Multiply(x, mask);
}
else if(leadingDim2 < 0){
int n = leadingDim;
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n];
maskArray = new DTYPE[unitNum];
......@@ -180,8 +185,6 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n] * x.dimSize[m];
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论