Commit d3a0b984 by xuchen

1. remove old interface of XNet

2. fix the bug of onehotandindex
3. modify the data initilization of transformer
4. add the DumpFormat (need reimplement)
parent a0aa3d49
......@@ -55,7 +55,7 @@ void XNetClearAll()
isGradEfficient = false;
isGradEfficient = true;
/* de-constructor */
......@@ -77,104 +77,20 @@ backward propagation to obtain gradient
>> root - root node (output) of the network
>> loss - name of loss function
void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
void XNet::Backward(XTensor &root)
TensorList roots(1);
TensorList golds(1);
TensorList paddings(1);
Backward(roots, golds, paddings, loss);
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> loss - name of loss function
void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
TensorList roots(1);
TensorList golds(1);
TensorList paddings(1);
Backward(roots, golds, paddings, loss);
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> padding - specify a target value that is ignored and does not contribute to the gradient computation
>> loss - name of loss function
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
TensorList roots(1);
TensorList golds(1);
TensorList paddings(1);
Backward(roots, golds, paddings, loss);
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
TensorList golds(roots.count);
TensorList paddings(roots.count);
for (int i = 0; i < roots.count; i++) {
Backward(roots, golds, paddings, loss);
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standard for the output
>> loss - name of loss function
void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
TensorList paddings(roots.count);
for (int i = 0; i < roots.count; i++)
Backward(roots, golds, paddings, loss);
backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standard for the output
>> paddings - specify a target value that is ignored
>> loss - name of loss function
void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
void XNet::Backward(TensorList &roots)
......@@ -187,39 +103,6 @@ void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings,
node->visitMark = NODE_UNFINISHED;
//XLossGrad lossGrad;
/* we start with the gradient with respect to the loss for output layers */
/*for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;*/
/* we compute dE/dx if the output is generated by an activation function y = f(x).
Note that we do not need to obtain dE/dy here because it is no use in the
folloing process of back-propagation */
/*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
XTensor * x = income.tails[0];
lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
root->visitMark = NODE_FINISHED;
else {
lossGrad.Compute(gold, root, root->grad, padding, loss);
/* we compuate dE/dy (y is the output) if no predefined activation function is used */
lossGrad.Compute(gold, root, root->grad, NULL, loss);
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i);
......@@ -271,11 +154,6 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
ShowNTErrors("Wrong node type!");
//FILE *f = fopen("debug", "a");
//node->Dump(f, "node", 10);
//if (node->grad != NULL)
// node->grad->Dump(f, "node->grad", 10);
node->visitMark = NODE_FINISHED;
......@@ -465,7 +343,6 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
search for a node in a top-down manner by its name
>> top - the top most node
......@@ -61,25 +61,11 @@ struct XNet
void Clear();
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(XTensor &root);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......@@ -68,8 +68,8 @@ void Read(const char * fn, FNNModel &model);
void Test(const char * test, const char * result, FNNModel &model);
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum);
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem);
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem);
int itemNum, int devID);
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID);
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net);
......@@ -229,11 +229,6 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
fprintf(stderr, " -dev=%d\n", model.devID);
for(int i = 0; i < argc; i++){
if (!strcmp(argv[i], "-mem"))
model.mem = new XMem(model.devID, FREE_ON_THE_FLY, 256 * MILLION, 512, 256 * MILLION);
/* check model settings */
......@@ -262,11 +257,6 @@ void Copy(FNNModel &tgt, FNNModel &src)
tgt.vSize = src.vSize;
tgt.devID = src.devID;
tgt.useMemPool = src.useMemPool;
if(src.mem != NULL){
tgt.mem = new XMem(src.mem->devID, src.mem->mode,
src.mem->maxBlockSize, src.mem->blockNum,
......@@ -351,9 +341,8 @@ void Init(FNNModel &model)
/* create the output layer parameter matrix and bias term */
int iSize = model.hDepth == 0 ? (model.n - 1) * model.eSize : model.hSize;
InitModelTensor2D(model.outputW, iSize, model.vSize, model);
InitModelTensor1D(model.outputB, model.vSize, model);
/* then, we initialize model parameters using a uniform distribution in range
......@@ -460,10 +449,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID);
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID);
/* prepare an empty network for building the fnn */
......@@ -490,7 +479,11 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
Clear(model, true);
/* forward + backward process */
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
lossTensor = CrossEntropy(output, gold);
/* automatic differentiation */
......@@ -719,10 +712,9 @@ The indexed cell is set to 1, and 0 otherwise.
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
int itemNum, int devID)
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
......@@ -741,9 +733,8 @@ make a tensor that encodes a batch of words
>> n - indicate which word is encode for each ngram
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem)
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID)
int * rows = new int[ngramNum];
int * cols = new int[ngramNum];
......@@ -753,7 +744,7 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
cols[i] = ngrams[i].words[n];
InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID, mem);
InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID);
delete[] rows;
delete[] cols;
......@@ -1162,10 +1153,10 @@ void Test(const char * test, const char * result, FNNModel &model)
/* make the input tensor for position i */
for (int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID);
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID);
if (!autoDiff) {
/* prepare an empty network for building the fnn */
......@@ -51,14 +51,12 @@ initialize the model
>> myIgnored - number of position ignored in attention (from the begining)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
>> myMem - the memory pool
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
int myDevID)
devID = myDevID;
mem = myMem;
isMasked = myIsMasked;
ignored = myIgnored;
......@@ -71,23 +69,18 @@ void T2TAttention::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
InitTensor2DV2(&wk, d, dk, X_FLOAT, devID);
InitTensor2DV2(&wq, d, dk, X_FLOAT, devID);
InitTensor2DV2(&wv, d, dv, X_FLOAT, devID);
InitTensor2DV2(&wa, d, d, X_FLOAT, devID);
InitTensor2DV2(&wbig, d, 3 * d, X_FLOAT, devID);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-finfoutv, finfoutv);
wa.SetDataRand(-finfouta, finfouta);
wbig.SetDataRand(-finfoutbig, finfoutbig);
_SetDataFanInOut(&wk, scale);
_SetDataFanInOut(&wq, scale);
_SetDataFanInOut(&wv, scale);
_SetDataFanInOut(&wa, scale);
_SetDataFanInOut(&wbig, scale);
......@@ -135,9 +128,9 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
int d2 = kqv2.GetDim(1);
int d3 = kqv2.GetDim(2) / 3;
InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3DV2(&k2, d1, d2, d3, X_FLOAT, devID);
InitTensor3DV2(&q2, d1, d2, d3, X_FLOAT, devID);
InitTensor3DV2(&v2, d1, d2, d3, X_FLOAT, devID);
......@@ -42,9 +42,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* head number */
int nhead;
......@@ -94,7 +91,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
......@@ -280,7 +280,6 @@ load a batch of sequences
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
int T2TBatchLoader::LoadBatch(FILE * file, bool isLM,
......@@ -290,18 +289,17 @@ int T2TBatchLoader::LoadBatch(FILE * file, bool isLM,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining)
int devID, bool isTraining)
return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, sBatch, wBatch,
isSorted, wCount, devID, mem, isTraining);
isSorted, wCount, devID, isTraining);
return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, vsDec, sBatch, wBatch,
isSorted, ws, wCount, devID, mem, isTraining);
isSorted, ws, wCount, devID, isTraining);
......@@ -322,7 +320,6 @@ load a batch of sequences (for LM)
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
int T2TBatchLoader::LoadBatchLM(FILE * file,
......@@ -332,8 +329,7 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
int * seqs,
int vSize, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining)
int devID, bool isTraining)
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 1);
......@@ -369,11 +365,11 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
dims[1] = max;
dims[2] = vSize;
InitTensor2D(batchEnc, sc, max, X_INT, devID, mem);
InitTensor2D(label, sc, max, X_INT, devID, mem);
InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
InitTensor2DV2(batchEnc, sc, max, X_INT, devID);
InitTensor2DV2(label, sc, max, X_INT, devID);
InitTensorV2(gold, 3, dims, X_FLOAT, devID);
InitTensor2DV2(paddingEnc, sc, max, X_FLOAT, devID);
InitTensor2DV2(paddingDec, sc, max, X_FLOAT, devID);
......@@ -437,12 +433,12 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
/*XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
/*XTensor * tmp = NewTensorBufV2(paddingEnc, devID);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
XTensor * tmp2 = NewTensorBufV2(paddingDec, devID);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
......@@ -481,7 +477,6 @@ load a batch of sequences (for MT)
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
int T2TBatchLoader::LoadBatchMT(FILE * file,
......@@ -491,8 +486,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
int * seqs,
int vSizeEnc, int vSizeDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining)
int devID, bool isTraining)
if (nextBatch < 0 || nextBatch >= bufBatchSize) {
LoadBuf(file, isSorted, 2);
......@@ -569,12 +563,12 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
int sCount = sc/2;
int seqSize = 0;
InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID, mem);
InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
InitTensor2D(batchDec, sCount, maxDec, X_INT, devID, mem);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
InitTensor2D(label, sCount, maxDec, X_INT, devID, mem);
//InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
InitTensor2DV2(batchEnc, sCount, maxEnc, X_INT, devID);
InitTensor2DV2(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
InitTensor2DV2(batchDec, sCount, maxDec, X_INT, devID);
InitTensor2DV2(paddingDec, sCount, maxDec, X_FLOAT, devID);
InitTensor2DV2(label, sCount, maxDec, X_INT, devID);
//InitTensorV2(gold, 3, dimsDec, X_FLOAT, devID);
......@@ -613,7 +607,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
ws = wCountEnc;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
//XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
//XTensor * tmp = NewTensorBufV2(paddingEnc, devID);
//_ConvertDataType(batchEnc, tmp);
//tmp->Dump(stderr, "tmp:");
//_NotEqual(tmp, paddingEnc, 0);
......@@ -662,7 +656,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
label->SetData(labelValues, label->unitNum);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
//XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
//XTensor * tmp2 = NewTensorBufV2(paddingDec, devID);
//_ConvertDataType(batchDec, tmp2);
//_NotEqual(tmp2, paddingDec, 0);
......@@ -131,8 +131,7 @@ public:
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
......@@ -141,8 +140,7 @@ public:
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
......@@ -151,8 +149,7 @@ public:
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
int devID, bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
......@@ -57,16 +57,14 @@ initialize the model
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
int myDevID)
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
......@@ -79,7 +77,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, mem, false);
embedder.InitModel(argc, argv, devID, false);
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
......@@ -90,12 +88,12 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
fnnLayerNorms[i].InitModel(argc, argv, myDevID);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
......@@ -37,9 +37,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
......@@ -95,7 +92,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
......@@ -31,7 +31,6 @@ namespace transformer
devID = -1;
mem = NULL;
vSize = -1;
maxLength = -1;
......@@ -46,12 +45,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
devID = myDevID;
mem = myMem;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
......@@ -64,7 +61,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, b
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
......@@ -81,7 +78,7 @@ make positional embeddings (of size eSize * length)
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem);
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
......@@ -145,9 +142,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* we make positional embeddings first */
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
InitTensorV2(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBufV2(2, dims + 1, X_FLOAT, devID);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
_Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);
......@@ -41,9 +41,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */
int vSize;
......@@ -71,7 +68,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length);
......@@ -52,15 +52,12 @@ initialize the model
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
>> myDevID - device id*/
void AttEncoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
int myDevID)
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
......@@ -73,7 +70,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, mem);
embedder.InitModel(argc, argv, devID);
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
......@@ -82,10 +79,10 @@ void AttEncoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
fnnLayerNorms[i].InitModel(argc, argv, myDevID);
......@@ -65,9 +65,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
......@@ -118,7 +115,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
......@@ -47,12 +47,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
devID = myDevID;
mem = myMem;
float minmax = 0;
......@@ -62,19 +60,17 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
float scale = 1.0F;
float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
_SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale);
w1.SetDataRand(-finfout1, finfout1);
w2.SetDataRand(-finfout2, finfout2);
......@@ -36,9 +36,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* size of input vector */
int inSize;
......@@ -72,7 +69,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
......@@ -32,7 +32,6 @@ namespace transformer
devID = -1;
mem = NULL;
d = 0;
......@@ -46,18 +45,16 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TLN::InitModel(int argc, char ** argv, int myDevID)
devID = myDevID;
mem = myMem;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F);
......@@ -37,9 +37,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* the transformation matrix w */
XTensor w;
......@@ -57,7 +54,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
......@@ -32,7 +32,6 @@ namespace transformer
devID = -1;
mem = NULL;
isLM = false;
isMT = false;
nhead = 1;
......@@ -48,10 +47,6 @@ T2TModel::~T2TModel()
delete encoder;
delete decoder;
delete outputLayer;
/* we delete "mem" at the end because other members are using it and we must
remove the memory space before all tensors are destroyed. */
delete mem;
......@@ -61,29 +56,16 @@ initialize the model
void T2TModel::InitModel(int argc, char ** argv)
bool useMem = false;
int memSize = 0;
bool isMemFreeOTF = false;
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamInt(argc, argv, "memsize", &memSize, 1024);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamBool(argc, argv, "lm", &isLM, !isMT);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
delete mem;
mem = new XMem(devID, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
encoder->InitModel(argc, argv, true, 0, devID, mem);
outputLayer->InitModel(argc, argv, devID, mem);
encoder->InitModel(argc, argv, true, 0, devID);
outputLayer->InitModel(argc, argv, devID);
decoder->InitModel(argc, argv, true, 0, devID, mem);
decoder->InitModel(argc, argv, true, 0, devID);
TensorList params(10);
......@@ -149,7 +131,8 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order + 1] = len;
XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
XTensor mask;
InitTensorV2(&mask, input.order + 2, dims, X_FLOAT, padding.devID);
/* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
......@@ -163,15 +146,15 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
XTensor * padding2 = NewTensorBufV2(padding.order + 1, dimsPadding, padding.dataType,
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
//XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
// padding.denseRatio, padding.devID, padding.mem);
//XTensor * padding3 = NewTensorBufV2(padding.order + 2, dimsPadding, padding.dataType,
// padding.devID);
///* mask of the padding */
//_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
......@@ -241,7 +224,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
......@@ -251,11 +234,11 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
XTensor * maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
......@@ -271,15 +254,15 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
......@@ -287,7 +270,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
InitTensorV2(&maskEnc, padding3);
/* generate the mask on the source language side (for padding) */
......@@ -315,15 +298,15 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
......@@ -331,7 +314,7 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
InitTensorV2(&maskEnc, padding3);
/* generate the mask on the source language side (for padding) */
......@@ -361,7 +344,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* An upper triangular matrix where the cells of the upper triangular are set to -1e-9.
This matrix can be used to block the attention to current or following words in
......@@ -376,11 +359,11 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
XTensor * maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
......@@ -40,9 +40,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* the encoder */
AttEncoder * encoder;
......@@ -31,7 +31,6 @@ namespace transformer
devID = -1;
mem = NULL;
vSize = -1;
inSize = -1;
hSize = -1;
......@@ -47,12 +46,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
devID = myDevID;
mem = myMem;
float minmax = 0;
......@@ -61,7 +58,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
InitTensor2DV2(&w, hSize, vSize, X_FLOAT, devID);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
......@@ -38,9 +38,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */
int vSize;
......@@ -61,7 +58,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
......@@ -105,9 +105,9 @@ void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensor(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
InitTensor(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
InitTensor(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID, input->mem);
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, input->devID);
......@@ -170,7 +170,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID, inputEnc->mem);
InitTensorV2(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
_SetDataFixedInt(&first, startSymbol);
/* add a new word into the input sequence of the decoder side */
......@@ -179,7 +179,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
inputDec = GeneratePaths(s);
inputDec.SetDevice(inputEnc->devID, inputEnc->mem);
inputDec = Concatenate(first, inputDec, inputDec.order - 1);
......@@ -194,7 +194,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID, paddingEnc->mem);
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
......@@ -213,14 +213,14 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
int stride = decoding.GetDim(decoding.order - 2);
InitTensor1D(&selectSrc, 1, X_INT);
InitTensor1D(&selectTgt, 1, X_INT);
InitTensor1DV2(&selectSrc, 1, X_INT);
InitTensor1DV2(&selectTgt, 1, X_INT);
selectSrc.SetInt(stride - 1, 0);
selectTgt.SetInt(0, 0);
selectSrc.SetDevice(decoding.devID, decoding.mem);
selectTgt.SetDevice(decoding.devID, decoding.mem);
/* the decoder output of the last position */
decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
......@@ -257,7 +257,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
XTensor path;
InitTensor2D(&path, state->stateNum, distance, X_INT);
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
for(int i = 0; i < state->stateNum; i++){
......@@ -141,10 +141,6 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
/* push complete hypotheses into the heap */
/* stop searching when all hypotheses are completed */
/* fill the heap with imcomplete hypotheses if neccesary */
......@@ -196,8 +192,8 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
for(int i = 0; i < order; i++)
dims[i] = prob.GetDim(i);
InitTensor(&score, &prob);
InitTensor(&probPath, &prob);
InitTensorV2(&score, &prob);
InitTensorV2(&probPath, &prob);
prob.Reshape(prob.unitNum/outputSize, outputSize);
score.Reshape(score.unitNum/outputSize, outputSize);
......@@ -208,8 +204,8 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
_SumDim(&prob, &probPathPrev, &probPath, 0);
InitTensor(&len, &lenPrev);
InitTensor(&lp, &lenPrev);
InitTensorV2(&len, &lenPrev);
InitTensorV2(&lp, &lenPrev);
_ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
......@@ -229,9 +225,9 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
_SumDim(&score, &firstMask, &score, 0);
prev->endMark.order, prev->endMark.dimSize, X_FLOAT, 1.0F,
prev->endMark.devID, prev->endMark.mem);
prev->endMark.order, prev->endMark.dimSize, X_FLOAT,
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
......@@ -266,15 +262,15 @@ void T2TSearch::Generate(T2TStateBundle * beam)
XTensor &prob = beam->prob;
int order = score.order;
CheckNTErrors(order >= 3, "The tensor must be of order 2 or larger.");
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
for (int i = 0; i < order; i++) {
dims[i] = score.GetDim(i);
dimsBeam[i] = score.GetDim(i);
dimsTopK[i] = score.GetDim(i);
CheckNTErrors(order >= 3, "The tensor must be of order 2 or larger.");
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
int sizeVocab = score.GetDim(-1);
int stride = score.GetDim(-1);
......@@ -283,12 +279,11 @@ void T2TSearch::Generate(T2TStateBundle * beam)
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
InitTensor(&scoreTopK, order, dimsTopK, score.dataType,
1.0F, score.devID, score.mem);
InitTensor(&index, order, dimsTopK, X_INT,
1.0F, score.devID, score.mem);
InitTensor(&preID, order, dimsTopK, X_INT,
1.0F, -1);
InitTensorV2(&scoreTopK, order, dimsTopK, score.dataType,
InitTensorV2(&index, order, dimsTopK, X_INT,
InitTensorV2(&preID, order, dimsTopK, X_INT, -1);
score.Reshape(order, dimsBeam);
......@@ -312,7 +307,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
score.Reshape(order, dims);
/* we keep the top-k scores */
InitTensor(&score, &scoreTopK);
InitTensorV2(&score, &scoreTopK);
CopyValues(scoreTopK, score);
/* CPU data (TODO: remove GPU->CPU data copy!!!) */
......@@ -328,9 +323,9 @@ void T2TSearch::Generate(T2TStateBundle * beam)
/* sequence probability of top-k candidates */
XTensor probPathTopK;
InitTensor(&probPathTopK, &scoreTopK);
InitTensorV2(&probPathTopK, &scoreTopK);
XTensor probTopK;
InitTensor(&probTopK, &scoreTopK);
InitTensorV2(&probTopK, &scoreTopK);
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.GetDim(i);
......@@ -386,7 +381,7 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensor(&endMark, &predictionRef);
InitTensorV2(&endMark, &predictionRef);
/* we copy the data to CPU because the frequent access to GPU is slow
and we can speed-up the process by doing the job on CPU. */
......@@ -507,7 +502,7 @@ void T2TSearch::Dump(XTensor * output)
int dims[3] = {batchSize, beamSize, maxLength};
int * words = new int[maxLength];
InitTensor(output, 3, dims, X_INT);
InitTensorV2(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1);
/* heap for an input sentence in the batch */
......@@ -579,23 +574,6 @@ void T2TSearch::SetEnd(const int * tokens, const int tokenNum)
check whether all hypotheses are completed
>> beam - the beam that keeps the searching states
bool T2TSearch::IsAllCompleted(T2TStateBundle * beam)
T2TState * states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState & state = states[i];
return false;
return true;
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
......@@ -609,7 +587,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
for (int i = 0; i < order - 1; i++)
dims[i] = prob.GetDim(i);
InitTensor(&mask, order - 1, dims, X_FLOAT);
InitTensorV2(&mask, order - 1, dims, X_FLOAT);
for (int i = 0; i < mask.unitNum; i++) {
......@@ -617,7 +595,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
mask.Set(-1e9, i);
mask.SetDevice(prob.devID, prob.mem);
return mask;
......@@ -102,9 +102,6 @@ public:
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle * beam);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
......@@ -75,7 +75,6 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
......@@ -106,7 +105,7 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, mem, false))
1, 1, false, ws, wc, devID, false))
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
CheckNTErrors(!model->isLM, "Only MT model is supported!");
......@@ -129,7 +128,7 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
......@@ -141,8 +140,8 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
double elapsed = GetClockSec() - startT;
XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
elapsed,wordCountTotal, sentCount, exp(loss/wordCount));
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed,wordCountTotal, exp(loss/wordCount));
......@@ -75,9 +75,6 @@ void T2TTrainer::Init(int argc, char ** argv)
strcpy(argArray[i], argv[i]);
bool useMem = false;
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamFloat(argc, argv, "lrate", &lrate, 1.0F);
LoadParamFloat(argc, argv, "lrbias", &lrbias, 0);
LoadParamInt(argc, argv, "sbatch", &sBatchSize, 1);
......@@ -142,7 +139,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
......@@ -184,7 +180,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, mem, true))
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
......@@ -321,7 +317,6 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
......@@ -351,7 +346,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, mem, false))
1, 1, false, ws, wc, devID, false))
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
......@@ -372,7 +367,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
/* prediction probabilities */
XTensor probs;
InitTensor1D(&probs, bSize * length);
InitTensor1DV2(&probs, bSize * length);
XTensor labelOnehot;
......@@ -463,7 +458,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
/* probability of each word */
XTensor wprobs;
InitTensor1D(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID, output->mem);
InitTensor1DV2(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID);
int dims[2] = {output->unitNum/output->GetDim(-1), output->GetDim(-1)};
probs.Reshape(2, dims);
......@@ -480,7 +475,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
/* probability for the batch */
XTensor result;
InitTensor1D(&result, 1, X_FLOAT, output->devID, output->mem);
InitTensor1DV2(&result, 1, X_FLOAT, output->devID);
_ReduceSum(&probs, &result, 1);
return result.Get1D(0);
......@@ -527,7 +522,7 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
XTensor * v2 = NewTensorBuf(v, v->devID, v->mem);
XTensor * v2 = NewTensorBufV2(v, v->devID);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
_Div(m, v2, v2);
......@@ -598,7 +593,7 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
output->Reshape(output->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
XTensor * padding2 = NewTensorBufV2(1, &padding->unitNum, X_FLOAT, padding->devID);
_CopyValues(padding, padding2);
_MultiplyDim(output, padding2, output, 0);
......@@ -652,7 +647,7 @@ void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
DTYPE q = 1.0F - p;
DTYPE gift = p / n;
InitTensor(smoothed, gold);
InitTensorV2(smoothed, gold);
_CopyValues(gold, smoothed);
if(p == 0)
......@@ -528,8 +528,6 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
CheckNTErrors(hit, "No proper node found in parent.income edge!");
strcpy(newOne->name, oldOne->name);
......@@ -737,11 +735,6 @@ void XLink::ShowNode(FILE * file, XTensor * node)
fprintf(file, "shape[%d] ", node->order);
for (int i = 0; i < node->order; i++)
fprintf(file, "%d ", node->GetDim(i));
fprintf(stderr, "\n");
......@@ -53,6 +53,7 @@ XMem::XMem()
strcpy(name, "xmem");
signature = 0;
mergeFreeOTF = true;
isInitialized = false;
......@@ -169,6 +170,7 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
isInitialized = true;
/* free memory */
......@@ -1579,11 +1581,6 @@ void XMemManager::Initialize()
/* CPUs (we actually do not care about how many CPUs are using) */
nCPUMem = 1;
MTYPE freeMem = GetAvailableMemory();
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
/* GPUs */
nGPUMem = 0;
......@@ -1592,23 +1589,16 @@ void XMemManager::Initialize()
XPRINT(0, stderr, "cannot get GPU information.");
for (int i = 0; i < nGPUMem; i++) {
MTYPE freeMem = GetAvailableGPUMemory(i);
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
/* free it */
void XMemManager::Free()
for (int i = 0; i < MAX_CPU_NUM; i++)
for (int i = 0; i < MAX_CPU_MEM_NUM; i++)
for (int i = 0; i < MAX_GPU_NUM; i++)
for (int i = 0; i < MAX_GPU_MEM_NUM; i++)
......@@ -1616,14 +1606,35 @@ void XMemManager::Free()
XMem * XMemManager::GetMem(const int devID)
XMem * mem = NULL;
if (devID < 0)
if (devID < 0){
MTYPE freeMem = GetAvailableMemory();
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
CPUMems[0].Initialize(-1, FREE_ON_THE_FLY,
mem = CPUMems;
if (devID < nGPUMem)
if (devID < nGPUMem){
MTYPE freeMem = GetAvailableGPUMemory(devID);
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
GPUMems[devID].Initialize(devID, FREE_ON_THE_FLY,
mem = GPUMems + devID;
XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
return mem;
......@@ -60,10 +60,10 @@ typedef long long INT_64;
#define BUF_PITCH 256
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 256 * 1024 * 1024
#define MAX_CPU_NUM 16
#define MAX_GPU_NUM 16
#define MAX_CPU_MEM_NUM 16
#define MAX_GPU_MEM_NUM 16
mode of runnig a memory pool
......@@ -213,6 +213,9 @@ public:
MTYPE curUsedPin;
MTYPE bufUsedPin;
/* indicates whether the memory pool is initialized */
bool isInitialized;
#ifdef USE_CUDA
/* handle used for cublas */
cublasHandle_t cublasHandle;
......@@ -429,15 +432,15 @@ a class for the management of memory
class XMemManager
/* cpu memory pool information */
/* number of cpu memory pools */
int nCPUMem;
/* gpu memory pool information */
/* number of gpu memory pools */
int nGPUMem;
......@@ -526,7 +526,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
isInGlobalMem = false;
else {
myMem = GMems.GetMem(myDevId);
......@@ -923,11 +923,9 @@ set the tensor items by a normal distribution
void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
// TODO: cuda code!!!!!!!
if (data == NULL)
// srand((unsigned)time(0));
void * d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
......@@ -1826,7 +1824,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, "%s ", label);
fprintf(file, "id=%d ", id);
fprintf(file, "order=%d dimsize=", order);
for (int i = 0; i < order; i++) {
fprintf(file, "%d", dimSize[i]);
......@@ -1879,7 +1876,149 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, "[%d]%e ", key, value);
fprintf(file, "\n\n");
fprintf(file, "\n");
if (isNewData) {
#ifdef USE_CUDA
if (devID >= 0)
dataHost = NULL;
void * RecursionData(XTensor * s, int dim, int * index, void * d, FILE * file)
if (dim == s->order - 2) {
/* print index */
printf("Index: ");
for (int i = 0; i < s->order-2; i++)
printf("[%d]", index[i]);
int dimSize1 = s->dimSize[dim];
int dimSize2 = s->dimSize[dim+1];
printf(" %d * %d\n", dimSize1, dimSize2);
/* print 2D data */
if (s->dataType == X_FLOAT) {
float * data = (float*)d;
for (int i = 0; i < dimSize1; i++) {
for (int j = 0; j < dimSize2; j++)
fprintf(file, "%e ", *data++);
fprintf(file, "\n");
d = (float*)d + dimSize1 *dimSize2;
else if (s->dataType == X_INT) {
int * data = (int*)d;
for (int i = 0; i < dimSize1; i++) {
for (int j = 0; j < dimSize2; j++)
fprintf(file, "%d ", *data++);
fprintf(file, "\n");
d = (int*)d + dimSize1 *dimSize2;
return d;
/* recursion for deeper dimsion */
int levelSize = s->dimSize[dim];
for (int k = 0; k < levelSize; k++) {
index[dim] = k;
d = RecursionData(s, dim+1, index, d, file);
return d;
dump data to a file
>> file - where to domp the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
void XTensor::DumpFormat(FILE * file, const char * label, const int n, const int beg, const int verbose)
if (verbose > verboseLevel)
void * d = data;
bool isNewData = false;
#ifdef USE_CUDA
if (devID >= 0) {
d = dataHost;
isNewData = true;
if (d == NULL) {
if (isSparse) {
int num = 0;
for (int i = 0; i < order; i++)
num *= dimSizeRDI[i];
num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
d = new char[size];
memset(d, 0, size);
else {
d = new char[unitNum * unitSize];
memset(d, 0, unitNum * unitSize);
isNewData = true;
if (label != NULL)
fprintf(file, "%s ", label);
fprintf(file, "id=%d ", id);
fprintf(file, "order=%d dimsize=", order);
for (int i = 0; i < order; i++) {
fprintf(file, "%d", dimSize[i]);
if (i < order - 1)
fprintf(file, ",");
fprintf(file, "order=-1 dimsize=-1");
fprintf(file, " dtype=%s dense=%f\n", GetDataTypeName(dataType), denseRatio);
fprintf(file, "NULL");
if (order == 1) {
for (int i = 0; i < unitNum; i++) {
if (dataType == X_FLOAT)
fprintf(file, "%e ", ((float*)d)[i]);
else if (dataType == X_INT)
fprintf(file, "%d ", ((int*)d)[i]);
/* print multi-dimensional tensor */
else {
int * index = new int[order];
RecursionData(this, 0, index, d, file);
delete[] index;
fprintf(file, "\n");
if (isNewData) {
......@@ -2184,6 +2323,11 @@ void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const int myDevID)
if (tensor->mem == NULL) {
XMem * myMem = GMems.GetMem(myDevID);
tensor->mem = myMem;
tensor->devID = myMem->devID;
if(tensor->mem != NULL){
tensor->Resize(myOrder, myDimSize, myDataType, 1.0F);
......@@ -2488,9 +2632,8 @@ void InitTensorOnCPU(XTensor * tensor, const XTensor * reference)
tensor->enableGrad = reference->enableGrad;
InitTensor(tensor, reference->order, reference->dimSize,
reference->dataType, reference->denseRatio,
InitTensorV2(tensor, reference->order, reference->dimSize,
reference->dataType, -1);
/* generate a XTensor with no initialization */
......@@ -2565,9 +2708,6 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
if(myMem != NULL)
tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
......@@ -2593,7 +2733,7 @@ XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
dims[0] = -abs(dims[0]);
XTensor * tensor = NewTensor(myOrder, dims, myDataType, 1.0F, devID);
XTensor * tensor = NewTensorV2(myOrder, dims, myDataType, devID);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
......@@ -427,6 +427,9 @@ public:
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
void DumpFormat(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
......@@ -125,7 +125,7 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
int stride = 1;
int blockSizeA = 1;
......@@ -33,6 +33,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
/* tensor multiplication a = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
void MultiplyDimMe(XTensor & a, const XTensor & b, int n, DTYPE alpha = 0.0);
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
......@@ -40,7 +41,7 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......@@ -50,7 +51,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c);
} // namespace nts(NiuTrans.Tensor)
......@@ -183,9 +183,8 @@ i.e., a is subtracted with b by broadcasting
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - if add operation to network
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -194,7 +193,7 @@ void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, b
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
......@@ -134,12 +134,12 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
//_SetDataFixedFloat(onehot, lowconfidence);
_SetDataFixedFloat(onehot, lowconfidence);
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
......@@ -110,13 +110,11 @@ void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int
DTYPE * od = onehotData + i * stride;
int id = indexData[i];
//od[id] = confidence;
if (offset == id)
od[offset] = confidence;
od[offset] = lowconfidence;
// od[offset] = lowconfidence;
......@@ -126,7 +124,8 @@ convert index tensor to onehot tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float confidence, float lowconfidence)
int devID = onehot->devID;
......@@ -30,6 +30,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
set an integer data array with a fixed value p (in int)
>> d - pointer to the data array
......@@ -740,4 +742,5 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
BacktoCudaDev(tensor->devID, devIDBackup);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......@@ -41,19 +41,19 @@ float shflDownReduceMax(float input)
".reg .f32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
" p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
" p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
" p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
" p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
" p, %1, r0; "
"@p mov.f32 %1,r0;"
"mov.f32 %0,%1;"
......@@ -73,19 +73,19 @@ int shflDownReduceMax(int input)
".reg .s32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
" p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
" p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
" p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
" p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
" p, %1, r0; "
"@p mov.s32 %1,r0;"
"mov.s32 %0,%1;"
......@@ -37,15 +37,15 @@ float shflDownReduceSum(float input)
asm volatile(
".reg .f32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"add.f32 %0, r0, %1;"
: "=f"(output) : "f"(input));
......@@ -62,15 +62,15 @@ int shflDownReduceSum(int input)
asm volatile(
".reg .s32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"add.s32 %0, r0, %1;"
: "=r"(output) : "r"(input));
......@@ -36,6 +36,12 @@ keep the result in the input tensor a and return nothing
void _SortMe(XTensor * a, XTensor * index, int dim);
sort the data along a given dimension (do it on site)
keep the result in the input tensor a and return nothing
void SortMe(XTensor & a, XTensor & index, int dim);
sort the data along a given dimension (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -171,7 +171,7 @@ float broadcast(float input)
float output;
"shfl.idx.b32 %0,%1,0x0,0x1f;"
"shfl.sync.idx.b32 %0,%1,0x0,0x1f,0xffffffff;"
:"=f"(output) : "f"(input)
......@@ -35,7 +35,7 @@ bool Test()
wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong;
wrong = !TestConvertDataType() || wrong;
//wrong = !TestConvertDataType() || wrong;
wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong;
Markdown 格式
您添加了 0 到此讨论。请谨慎行事。
注册 或者 后发表评论