Commit 5f933fc6 by xuchen

cumulative update

parent 78954fad
......@@ -35,6 +35,8 @@
void BackwardTest();
void TransposeTest();
void SumDimTest();
void SplitBackwardTest();
void MemTest();
using namespace nts;
using namespace fnnlm;
......@@ -42,6 +44,10 @@ using namespace transformer;
int main( int argc, const char ** argv )
//return 0;
//return 0;
//return 0;
......@@ -89,7 +95,7 @@ void BackwardTest()
c = DivDim(a, b, 0);
c.Dump(stderr, "c:");
XLink::ShowNetwork(stderr, &c);
//XLink::ShowNetwork(stderr, &c);
......@@ -209,3 +215,67 @@ void SumDimTest()
delete[] data;
void SplitBackwardTest()
int * dimSize = new int[2];
dimSize[0] = 2;
dimSize[1] = 4;
XTensor t1;
InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
XTensor t2;
InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
XTensor tensor;
//_SetDataFixedFloat(&t1, 1.0F);
//_SetDataFixedFloat(&t2, 2.0F);
tensor = t1 + t2;
XList smalls;
XTensor first;
XTensor second;
InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
Split(tensor, smalls, 1, 2);
XTensor mul;
mul = Sum(first, second);
XNet net;
void MemTest()
XMem * mem;
mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
XTensor tensor;
InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
delete mem;
if (tensor.mem != NULL) {
printf("It isn't null!\n");
printf("%d\n", (int)tensor.mem->signature);
else {
printf("It's null\n");
\ No newline at end of file
......@@ -25,6 +25,7 @@
#include "XNoder.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
......@@ -40,7 +41,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
GradCopyIndexed(node, isEfficent);
else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent);
else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent);
......@@ -80,7 +81,7 @@ gradient computation for copying indexed sub-tensors
b = copyindexed(a)
we have
dE/da = spread(b)
dE/da = spreadforcopyindexed(b)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
......@@ -91,32 +92,14 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
int dim = income.GetParamInt(0);
int * srcIndex = (int *)income.GetParamPointer(1);
int indexSize = income.GetParamInt(2);
int * tgtIndex = (int *)income.GetParamPointer(3);
int copyNum = income.GetParamInt(4);
int realIndexSize = indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
int * realTgtIndex = new int[realIndexSize];
for(int i = 0; i < indexSize; i++) {
for(int j = 0; j < copyNum; j++) {
realSrcIndex[i * copyNum + j] = srcIndex[i] + j;
realTgtIndex[i * copyNum + j] = tgtIndex[i] + j;
int copyNum = income.GetParamInt(1);
XTensor * input = income.tails[0];
_Spread(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
XTensor * srcIndex = income.tails[1];
XTensor * tgtIndex = income.tails[2];
delete[] realSrcIndex;
delete[] realTgtIndex;
delete[] srcIndex;
delete[] tgtIndex;
node->visitMark = NODE_FINISHED;
_SpreadForCopyIndexed(input->grad, node->grad, dim, srcIndex, tgtIndex, copyNum);
......@@ -143,7 +126,6 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
node->visitMark = NODE_FINISHED;
gradient for merge
......@@ -181,6 +163,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
int * dims = new int[input->order];
memset(dims, 0, sizeof(int) * input->order);
for(int i = 0, j = 0; i < input->order; i++){
if(i >= leadDim){
dims[j++] = input->dimSize[i];
......@@ -437,4 +437,25 @@ void XNet::ClearGrad(XTensor * node)
show network topology
>> file - file to dump information
>> node - pointer to the node
void XNet::ShowNetwork(FILE * file, XTensor * node)
XList roots(1);
XLink::ShowNode(file, node);
/* go over nodes in its topological order */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * n = (XTensor*)nodes.Get(i);
XLink::ShowNode(file, n);
\ No newline at end of file
......@@ -108,6 +108,9 @@ struct XNet
/* clear the gradient information if the node is of no use */
void ClearGrad(XTensor * node);
/* show network topology */
void ShowNetwork(FILE * file, XTensor * node);
/* we make a unique id for every tensor */
......@@ -231,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], "-mempool"))
if (!strcmp(argv[i], "-mempool"))
model.mem = new XMem(model.devID);
......@@ -715,11 +715,9 @@ The indexed cell is set to 1, and 0 otherwise.
>> devID - device id
>> mem - memory pool
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, int itemNum, int devID, XMem * mem)
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
if(devID >= 0 || (mem != NULL && mem->devID >= 0))
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, -1);
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
......@@ -727,12 +725,6 @@ void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, in
/* set non-zero cells */
for(int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]);
if(devID >= 0 || (mem != NULL && mem->devID >= 0)){
XList list(1);
CPUToGPUFlush(&list, devID, mem);
......@@ -859,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = log-softmax(s) */
_LogSoftmax(&s, &y, 1);
......@@ -998,7 +988,6 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
XTensor embeddingBig;
XTensor hidden;
XTensor b;
XTensor srcIndex;
int size = batch * (n-1);
int * index = new int[size];
......@@ -1010,28 +999,25 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
InitTensor1D(&srcIndex, size, X_INT, model.devID, model.mem);
srcIndex.SetData(index, size);
InitTensor1D(&words, size, X_INT, model.devID, model.mem);
words.SetData(index, size);
XTensor embedding;
embedding = Gather(model.embeddingW, srcIndex);
embeddingBig = Gather(model.embeddingW, words);
delete[] index;
int dimSize[2];
dimSize[0] = embedding.GetDim(0) / (n - 1);
dimSize[1] = embedding.GetDim(1) * (n - 1);
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
hidden = Reshape(embedding, embedding.order, dimSize);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */
for(int i = 0; i < depth; i++)
hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
......@@ -1071,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
......@@ -60,7 +60,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
......@@ -69,11 +69,12 @@ void AttDecoder::InitModel(int argc, char ** argv,
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicate each position is valid
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
XTensor x;
......@@ -89,7 +90,6 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, b
XTensor ln;
XTensor fnn;
XTensor res;
XTensor nothing;
/* self attention */
......@@ -107,7 +107,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, b
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining);
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......@@ -48,7 +48,7 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
......@@ -150,7 +150,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* then we make word embeddings */
//wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
......@@ -93,10 +93,11 @@ void AttEncoder::InitModel(int argc, char ** argv,
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining)
XTensor x;
......@@ -144,4 +145,18 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
return x;
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
XTensor nothing;
return Make(input, mask, nothing, isTraining);
......@@ -40,7 +40,7 @@ class T2TEncoder
XTensor Make(XTensor &input, XTensor &mask, bool isTraining) = 0;
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining) = 0;
......@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining);
......@@ -118,6 +118,9 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
......@@ -75,7 +75,7 @@ void T2TModel::InitModel(int argc, char ** argv)
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
encoder->InitModel(argc, argv, isLM, 0, devID, mem);
encoder->InitModel(argc, argv, true, 0, devID, mem);
outputLayer->InitModel(argc, argv, devID, mem);
......@@ -99,7 +99,9 @@ make the encoding network
XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
return encoder->Make(input, mask, isTraining);
XTensor nothing;
return encoder->Make(input, mask, nothing, isTraining);
......@@ -107,13 +109,14 @@ make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
>> output - output tensor (distribution)
>> mask - the mask for positions that are/not involved in computation
>> mask - mask for positions that are/not involved in computation
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether we are training the model
<< return - decoding result
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
return decoder->Make(inputDec, outputEnc, mask, isTraining);
return decoder->Make(inputDec, outputEnc, mask, maskEncDec, isTraining);
......@@ -190,14 +193,16 @@ make the network for machine translation (with the output softmax layer)
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
>> paddingEnc - padding of the sequences (on the encoder side)
>> paddingDec - padding of the sequences (on the decoder side)
>> isTraining - indicates whether the model is for training
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining)
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining)
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
XTensor maskEncDec;
/* generate mask to see "previous" words on the decoder side */
//int len = inputDec.GetDim(inputDec.order - 2);
......@@ -222,6 +227,23 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents attention to padded dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
......@@ -252,7 +274,7 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output);
delete[] dims;
......@@ -72,13 +72,13 @@ public:
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
/* make the decoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining);
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, XTensor &MaskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining);
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* get parameter matrices */
void GetParams(XList &list);
......@@ -93,8 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
XTensor &x = input;
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
output = LogSoftmax(MMul(x, w), -1);
//output = Softmax(MMul(x, w), -1);
......@@ -116,6 +116,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, argv, "smallfootprint", &isSmallFootprint, false);
buf = new int[bufSize];
buf2 = new int[bufSize];
......@@ -164,6 +165,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
XMem * mem = model->mem;
XNet net;
double startT = GetClockSec();
......@@ -208,7 +212,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
model->MakeLM(batchEnc, output, paddingEnc, true);
else if(model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, true);
model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, true);
ShowNTErrors("Illegal model type!");
......@@ -218,11 +222,8 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
/* make paddings for the output */
//if (output.GetDim(0) > 1)
// PadOutput(&output, &gold, &paddingDec);
//output.Dump(tmpFILE, "output: ");
if (output.GetDim(0) > 1)
PadOutput(&output, &gold, &paddingDec);
/* get probabilities */
float prob = GetProb(&output, &gold, NULL);
......@@ -235,7 +236,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (doUpdate) {
/* rescale the output for normalized loss */
//RescaleOutput(&output, &g, &paddingDec);
RescaleOutput(&output, &g, &paddingDec);
/* back-propagation */
net.Backward(output, g, paddingDec, CROSSENTROPY);
......@@ -275,19 +276,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
XPRINT(0, stderr, "\n");
//XMem * mem = model->mem;
//MTYPE used = 0;
//MTYPE total = 0;
//for(int i = 0; i < mem->blockNum; i++){
// if(mem->blocks[i].mem != NULL){
// used += mem->blocks[i].used;
// total += mem->blocks[i].size;
// }
//fprintf(stderr, "%d %d %d %d mem: %lld %lld\n", paddingEnc.GetDim(0), paddingEnc.GetDim(1),
// paddingDec.GetDim(0), paddingDec.GetDim(1), used, total);
if(nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint){
MakeCheckpoint(model, validFN, modelFN, "step", step);
nStepCheck = 0;
......@@ -374,7 +362,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
model->MakeLM(batchEnc, output, paddingEnc, false);
else if(model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, false);
model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, false);
ShowNTErrors("Illegal model type!");
......@@ -705,70 +693,46 @@ int T2TTrainer::LoadBatchLM(FILE * file,
dims[2] = vs;
InitTensor2D(batchEnc, sc, max, X_INT, devID, mem);
//InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
if(isTraining) {
int seqSize = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE * goldOffsets = new MTYPE[gold->unitNum];
MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
//MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
//MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
int wGold = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w];
//batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
batchEncOffsets[wCount] = batchEnc->GetOffset2D(s - seq, w);
batchEncValues[wCount] = num;
//paddingEnc->Set2D(1.0F, s - seq, w);
//paddingDec->Set2D(1.0F, s - seq, w);
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
//paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
//paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
if (w > 0)
//gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
if (w == len - 1) {
if (isDoubledEnd)
//gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
//gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
fprintf(tf, "\n");*/
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
......@@ -779,16 +743,25 @@ int T2TTrainer::LoadBatchLM(FILE * file,
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
gold->SetDataBatched(goldOffsets, 1.0F, wGold);
//paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
//paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
delete[] batchEncOffsets;
delete[] batchEncValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
delete[] goldOffsets;
//delete[] paddingEncOffsets;
//delete[] paddingDecOffsets;
......@@ -884,35 +857,38 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int wCountEnc = 0;
int wCountDec = 0;
int wGold = 0;
wCount = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * batchDecOffsets = new MTYPE[batchDec->unitNum];
int * batchDecValues = new int[batchDec->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
//MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
//MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
/* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq)/2;
for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w];
batchEncOffsets[wCount] = batchEnc->GetOffset2D(sent, w);
batchEncValues[wCount] = num;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(sent, w);
batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
//paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
int wCountDec = 0;
int wGold = 0;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
//paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
/* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){
......@@ -921,10 +897,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w];
batchDecOffsets[wCountDec] = batchDec->GetOffset2D(sent, w);
batchDecValues[wCountDec] = num;
paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w > 0)
goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
......@@ -947,16 +921,20 @@ int T2TTrainer::LoadBatchMT(FILE * file,
batchDec->SetDataBatched(batchDecOffsets, batchDecValues, wCountDec);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
batchDec->SetData(batchDecValues, batchDec->unitNum);
//paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchDec, tmp2);
_NotEqual(tmp2, paddingDec, 0);
gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncOffsets;
delete[] batchEncValues;
delete[] batchDecOffsets;
delete[] batchDecValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
//delete[] paddingEncOffsets;
//delete[] paddingDecOffsets;
delete[] goldOffsets;
return sc;
......@@ -990,12 +968,13 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
XTensor probs;
InitTensor(&probs, output);
XTensor logOutput;
InitTensor(&logOutput, output);
_Log(output, &logOutput);
//XTensor logOutput;
//InitTensor(&logOutput, output);
//_Log(output, &logOutput);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&logOutput, gold, &probs);
//_Multiply(&logOutput, gold, &probs);
_Multiply(output, gold, &probs);
/* probability of each word */
XTensor wprobs;
......@@ -1170,6 +1149,7 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
_ScaleAndShiftMe(output, 1/count);
_ScaleAndShiftMe(gold, 1/count);
......@@ -142,6 +142,9 @@ public:
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* indicates whether we use small memory footprint for backward process */
bool isSmallFootprint;
/* constructor */
......@@ -20,6 +20,7 @@
#include <math.h>
#include <time.h>
#include "Transformer.h"
#include "T2TModel.h"
#include "T2TUtility.h"
......@@ -58,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer;
trainer.Init(argc, args);
......@@ -126,7 +126,7 @@ void SmallTest()
d = a + b + c.Lin(0.5F);
XLink::ShowNetwork(stderr, &d);
//XLink::ShowNetwork(stderr, &d);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
......@@ -65,10 +65,10 @@ namespace nts {
#ifndef MIN
#define MIN(a,b) ((a < b) ? a : b)
#define MIN(a,b) ((a) < (b) ? a : b)
#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#define MAX(a,b) ((a) > (b) ? a : b)
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
......@@ -593,21 +593,6 @@ void XLink::CheckNetwork(XTensor * root)
show the network encoded in a root node (tensor)
>> file - file to dump information
>> root - pointer to the root node
void XLink::ShowNetwork(FILE * file, XTensor * root)
XLink &income = root->income;
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
ShowNetwork(file, child);
show a node
>> file - file to dump information
>> root - pointer to the node
......@@ -178,10 +178,6 @@ struct XLink
void CheckNetwork(XTensor * root);
/* show the network encoded in a root node (tensor) */
void ShowNetwork(FILE * file, XTensor * root);
/* show a node */
void ShowNode(FILE * file, XTensor * node);
......@@ -79,7 +79,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -677,9 +677,9 @@ void XTensor::SetData(const void * d, int num, int beg)
CheckNTErrors(!isSparse, "TODO");
CheckNTErrors(num == unitNum - beg, "Illegal size!");
CheckNTErrors(num <= unitNum - beg, "Illegal size!");
XMemCopy(data, devID, d, -1, num * unitSize);
XMemCopy((char*)data + beg * unitSize, devID, d, -1, num * unitSize);
......@@ -804,7 +804,7 @@ set tensor items with an array of values
>> values - value for each data item
>> num - number of the data items
void XTensor::SetDataBatched(MTYPE * offsets, void * values, int num)
void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
_SetDataWithOffsetAndValue(this, offsets, values, num);
......@@ -1289,7 +1289,7 @@ int XTensor::GetNonzeroSize()
if(dataType == DEFAULT_DTYPE){
int count = 0;
for(int i = 0; i < unitNum; i++){
DTYPE value = *((DTYPE*)(char*)data + i * sizeof(DTYPE));
DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if(value == 0)
......@@ -2271,6 +2271,8 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
CheckNTErrors((a != NULL), "Empty input!");
memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
if(a->order > 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order);
......@@ -49,6 +49,8 @@ struct XLink;
/* computation flags */
......@@ -283,7 +285,7 @@ public:
void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
/* set tensor items with an array of values */
void SetDataBatched(MTYPE * offsets, void * values, int num);
void SetDataBatchedWithValues(MTYPE * offsets, void * values, int num);
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);
......@@ -16,8 +16,8 @@
* $Created by: XIAO Tong (email: 2018-04-24
* $Created by: XIAO Tong (email: 2018-04-24
/* this is a header to include all functions in the "core" workspace */
......@@ -46,16 +46,17 @@
#include "arithmetic/XTensorBLAS.h"
#include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h"
#include "getandset/SetData.h"
#include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h"
#include "math/Unary.h"
#include "movement/CopyBlocks.h"
#include "movement/CopyBlocksInGrid.h"
#include "movement/CopyBlocksOnSite.h"
......@@ -32,8 +32,6 @@ convert data type
void _ConvertDataType(const XTensor * input, XTensor * output)
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
......@@ -61,4 +59,29 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
ShowNTErrors("Unsupported data types for conversion!");
convert data type (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - input tensor
>> dataType - data type of the returned tensor
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
int order = input.order;
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, input.dimSize, dataType, dr, input.devID, input.mem);
_Gather(&s, &t, &index);
/* tensor connection */
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -23,12 +23,16 @@
#include "../../XTensor.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */
void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor * input, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-12-17
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
convert onehot tensor to index tensor
>> onehot - onehot tensor, whose values are 0 or 1
>> index - index tensor, whose values are integers
>> size - the last dimension size of the onehot tensor
/* Writes, for every row of `size` entries in `onehot`, the position of its
   non-zero entry into the matching cell of `index`. */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
/* shape/type contract: last dim of onehot is `size`, onehot has exactly one
   more dimension than index, and both tensors hold X_INT data */
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
/* all leading dimensions of the two tensors must agree */
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
#ifdef USE_CUDA
/* the CUDA implementation is used only when both tensors have devID >= 0 */
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, size);
/* CPU path: one row of `stride` ints per element of index */
int blockNum = index->unitNum;
int stride = size;
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) {
int * od = onehotData + i * stride;
/* -1 means "no non-zero entry seen yet"; it is also what gets stored
   when the row contains no non-zero entry at all */
int record = -1;
for (int j = 0; j < stride; j++) {
if (od[j] != 0) {
if (record == -1)
record = j;
/* NOTE(review): presumably the error below is the branch taken when a
   second non-zero entry is found in the same row - the branch keyword is
   not visible in this extract, confirm against the full file */
ShowNTErrors("The value of onehot tensor is illegal!");
indexData[i] = record;
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
<< return - the index tensor
/* Value-returning wrapper: builds the index tensor and fills it with
   _OnehotToIndex. */
XTensor OnehotToIndex(XTensor & onehot, int size)
CheckNTErrors(onehot.GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor index;
/* the result has order - 1 dimensions taken from the front of onehot.dimSize
   (i.e. the last dimension is dropped) and lives on the same device/memory
   pool as the input */
InitTensor(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
_OnehotToIndex(&onehot, &index, size);
return index;
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size);
int blockNum = index->unitNum;
int stride = size;
int * indexData = (int *)index->data;
int * onehotData = (int *)onehot->data;
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
int * od = onehotData + i * stride;
od[id] = 1;
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
<< return - the onehot tensor
XTensor IndexToOnehot(XTensor & index, int size)
CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor onehot;
int order = index.order;
int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size);
delete[] dim;
return onehot;
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-31
#include "OnehotAndIndex.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
convert onehot tensor to index tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of block
>> stride - stride of a data block
/* One thread per (row i, column offset) pair of the onehot data: a thread
   that finds a non-zero entry stores its column in indexData[i]. */
void KernelOnehotToIndex(int * onehotData, int * indexData, int blockNum, int stride)
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
/* threads outside the blockNum x stride range have nothing to do */
if (i >= blockNum || offset >= stride)
int * od = onehotData + i * stride;
int * id = indexData + i;
/* indexData[i] is written only when a non-zero entry exists; a row with no
   non-zero entry leaves it untouched, and nothing here detects a row with
   more than one non-zero entry */
if (od[offset] != 0)
*id = offset;
convert onehot tensor to index tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
/* Host-side launcher for KernelOnehotToIndex. */
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size)
int devID = onehot->devID;
/* one row per element of index, `size` entries per row */
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
/* presumably switches the active CUDA device to devID and remembers the
   previous one in devIDBackup - confirm against XDevice.h */
ProtectCudaDev(devID, devIDBackup);
/* launch configuration for a blockNum x stride problem; only the first two
   entries of each array are used below */
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelOnehotToIndex<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
convert index tensor to onehot tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of block
>> stride - stride of a data block
/* For row i, sets the entry selected by indexData[i] to 1 in the onehot data. */
void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride)
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
/* threads outside the blockNum x stride range have nothing to do */
if (i >= blockNum || offset >= stride)
int * od = onehotData + i * stride;
int id = indexData[i];
/* NOTE(review): `offset` is not used past the range test, so every in-range
   thread of row i performs this same store; `id` is not checked against
   [0, stride), and no entry is ever set to 0 here - the caller has to
   provide a cleared tensor and valid indices */
od[id] = 1;
convert index tensor to onehot tensor (cuda version)
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
/* Host-side launcher for KernelIndexToOnehot. */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
int devID = onehot->devID;
/* one row per element of index, `size` entries per row */
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
/* presumably switches the active CUDA device to devID and remembers the
   previous one in devIDBackup - confirm against XDevice.h */
ProtectCudaDev(devID, devIDBackup);
/* launch configuration for a blockNum x stride problem; only the first two
   entries of each array are used below */
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-12-17
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor (cuda version) */
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size);
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-12-17
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor
   >> size - the last dimension size of the onehot tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
   >> size - the last dimension size of the onehot tensor */
XTensor OnehotToIndex(XTensor & onehot, int size);
/* convert index tensor to onehot tensor
   >> size - the last dimension size of the onehot tensor */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it
   >> size - the last dimension size of the onehot tensor */
XTensor IndexToOnehot(XTensor & index, int size);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_H__
\ No newline at end of file
......@@ -70,8 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize;
DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
tensor->SetDataRand(-finfout, finfout);
DTYPE std = gain * (float)sqrt(2.0 / (fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0F) * std;
tensor->SetDataRand(-a, a);
//_SetDataRand(tensor, -finfout, finfout);
......@@ -499,36 +500,9 @@ void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values
else {
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE offsetSize = num * sizeof(MTYPE);
MTYPE valueSize;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
ShowNTErrors("TO DO!!!");
MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
_CudaSetDataWithOffsetAndValue(tensor, offsetsCuda, valuesCuda, num);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
else {
XMemFree(tensor->devID, offsetsCuda);
XMemFree(tensor->devID, valuesCuda);
if(tensor->devID >= 0) {
_CudaSetDataWithOffsetAndValue(tensor, offsets, values, num);
ShowNTErrors("Please recompile the code with USE_CUDA");
......@@ -26,6 +26,7 @@
#include "SetData.cuh"
#include <curand_kernel.h>
#include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -363,7 +364,7 @@ e.g., for a 3* 3 tensor,
2 2 0
void _KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
/* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -425,7 +426,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
BacktoCudaDev(tensor->devID, devIDBackup);
......@@ -474,12 +475,12 @@ set the data with an array of offsets (kernel version)
>> num - number of the data items
void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
void KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
/* index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < num)
if (i < num)
data[offsets[i]] = value;
......@@ -505,7 +506,7 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num);
KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num);
BacktoCudaDev(tensor->devID, devIDBackup);
......@@ -519,7 +520,7 @@ set the data with an array of offsets (kernel version)
>> dataType - the data type of the data and values
void _KernelSetDataWithOffset(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
void KernelSetDataWithOffsetAndValue(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
/* index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -541,6 +542,18 @@ set the data with an array of values
void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
XMem * mem = tensor->mem;
MTYPE offsetSize = num * sizeof(MTYPE);
MTYPE valueSize;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
ShowNTErrors("TO DO!!!");
int gridSize[3];
int blockSize[3];
......@@ -552,7 +565,32 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > (tensor->data, offsets, values, num, tensor->dataType);
MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, mem->devID, values, -1, valueSize);
else {
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
KernelSetDataWithOffsetAndValue<<<blocks, threads >>> (tensor->data, offsetsCuda, valuesCuda, num, tensor->dataType);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
else {
XMemFree(tensor->devID, valuesCuda);
XMemFree(tensor->devID, offsetsCuda);
BacktoCudaDev(tensor->devID, devIDBackup);
......@@ -15,7 +15,6 @@
* limitations under the License.
* $Created by: Lin Ye (email: 2018-08-03
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-12-10
#include "../../XTensor.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
return (a == b ? 1.0F : 0.0F);
/* Element-wise predicate used by the CPU compare functions:
   1.0 when a differs from b, 0.0 otherwise. */
DTYPE myIsNotEqual(DTYPE a, DTYPE b)
return (a != b ? 1.0F : 0.0F);
#ifdef USE_CUDA
/* define three macros separately, specify the respective function names (GPU mode)
   - _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) generates
     void _funcName(a, b, number): it requires a and b to pass
     XTensor::IsSameShaped and a to hold DEFAULT_DTYPE data, forwards to
     _cudaFuncName when a->devID >= 0, and otherwise stores
     origFunc(a[i], number) into b[i] for every element of a.
   - _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) generates the in-place
     variant, which calls _funcName with a as both input and output.
   - SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) generates the
     value-returning variant: b is constructed from &a, flagged with
     SetTMPFlag(), filled by _funcName and returned. operationId is only
     referenced by the commented-out MakeLink call.
   NOTE(review): the message of the first check says "same type" although the
   condition tested is IsSameShaped. */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, number); \
return; \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
// I think we needn't to make link.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
/* define three macros separately, specify the respective function names (CPU mode)
   Same three generators as the GPU-mode set, except that
   _SIMPLE_COMPARE_FUNCTION takes no CUDA function name and the generated
   function always runs the element loop: b[i] = origFunc(a[i], number) for
   every element of a, after checking XTensor::IsSameShaped(a, b) and
   a->dataType == DEFAULT_DTYPE. */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
// I think we needn't to make link.
// XLink::MakeLink(&a, NULL, &b, operationId);
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-31
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* Element-wise predicate: 1.0 when a equals b, 0.0 otherwise.
   NOTE(review): presumably declared as a device function so the generated
   kernels can call it - the qualifier is not visible in this extract. */
DTYPE cudaIsEqual(DTYPE a, DTYPE b)
return (a == b ? 1.0F : 0.0F);
/* Element-wise predicate: 1.0 when a differs from b, 0.0 otherwise.
   NOTE(review): presumably declared as a device function so the generated
   kernels can call it - the qualifier is not visible in this extract. */
DTYPE cudaIsNotEqual(DTYPE a, DTYPE b)
return (a != b ? 1.0F : 0.0F);
/* SIMPLE_COMPARE_FUNCTION_GPU(funcName, origFunc) generates three functions:
   - Kernel<funcName>(DTYPE*, DTYPE*, int size, DTYPE number): thread i stores
     origFunc(a[i], number) into b[i] when i < size;
   - Kernel<funcName>(__half*, __half*, int size, __half number): an overload
     whose body only returns, i.e. it writes nothing to b;
   - _Cuda<funcName>(a, b, number): the host launcher. It obtains a 1D launch
     configuration for a->unitNum elements and starts the DTYPE kernel when
     a->dataType == DEFAULT_DTYPE, the __half kernel when it is X_FLOAT16, and
     reports an error through ShowNTErrors for any other type. */
#define SIMPLE_COMPARE_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size, DTYPE number) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
if (i < size) \
b[i] = (DTYPE)origFunc(a[i], number); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size, __half number) \
{ \
return; \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
int gridSize[3]; \
int blockSize[3]; \
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, \
a->unitNum, (DTYPE)number); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, \
a->unitNum, (__half)number); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
BacktoCudaDev(a->devID, devIDBackup); \
} \
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-12-10
#ifndef __COMPARE_CUH__
#define __COMPARE_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compare whether every entry is equal to the specified value (cuda kernel)
   NOTE(review): the kernels generated by SIMPLE_COMPARE_FUNCTION_GPU take
   (DTYPE * a, DTYPE * b, int size, DTYPE number); this prototype has no size
   argument and takes number by pointer, so it presumably declares an overload
   that is never defined - confirm and align with the definition */
void KernelEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is equal to the specified value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda kernel)
   NOTE(review): same signature mismatch with the generated kernel as for
   KernelEqual - confirm and align with the definition */
void KernelNotEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is not equal to the specified value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE number);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif //end __COMPARE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-12-10
#ifndef __COMPARE_H__
#define __COMPARE_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compare whether every entry is equal to the specified value */
void _Equal(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _EqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Equal(const XTensor & a, DTYPE number);
/* compare whether every entry is not equal to the specified value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _NotEqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is not equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor NotEqual(const XTensor & a, DTYPE number);
} // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__
\ No newline at end of file
......@@ -223,4 +223,4 @@ _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -15,7 +15,6 @@
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-31
......@@ -15,7 +15,6 @@
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-31
......@@ -15,7 +15,6 @@
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-31
......@@ -145,5 +144,6 @@ void _TanMe(XTensor * a);
make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a);
#endif //end __UNARY_H__
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
#endif // end __UNARY_H__
\ No newline at end of file
......@@ -79,8 +79,13 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: XIAO Tong (email: 2018-04-24
* $Created by: XIAO Tong (email: 2018-04-24
#include "CopyIndexed.h"
#include "CopyIndexed.cuh"
#include "CopyBlocks.h"
#include "Gather.h"
#include "../../XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -40,7 +42,9 @@ copy indexed sub-tensors
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum)
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
......@@ -99,7 +103,148 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
copy indexed sub-tensors (return an XTensor structure)
copy selected sub-tensors where indeces are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(indexSize != 0, "NULL index!")
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((srcIndex && tgtIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == srcIndex->devID || (s->devID < 0 && t->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && t->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < order), "A too larget dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == tgtIndex->unitNum), "Unmatched index tensors!");
for (int i = 0; i < order; i++) {
if (i != dim) {
CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions");
else {
CheckNTErrors(t->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
#ifdef USE_CUDA
if (s->devID >= 0 && srcIndex->devID >= 0) {
_CudaCopyIndexed(s, t, dim, srcIndex, tgtIndex, copyNum);
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int*)srcIndex->data;
int * tIndex = (int*)tgtIndex->data;
for (int i = 0; i < indexSize; i++) {
for (int c = 0; c < copyNum; c++) {
int si = sIndex[i] + c;
int ti = tIndex[i] + c;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = tData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(td + k) = *(sd + k);
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
   (the number of indices is taken from srcIndex.unitNum; there is no separate indexSize argument)
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
/* Value-returning wrapper around the tensor-index _CopyIndexed: allocates the
   result, copies into it and registers the operation with XLink. */
XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum)
CheckNTErrors(dim >= 0 && dim < s.order, "A too larget dimension specified!");
int order = s.order;
int * dimSize = new int[order];
/* number of index entries = number of elements of srcIndex */
int indexSize = srcIndex.unitNum;
/* the result has the shape of s, except along dim where it holds
   indexSize * copyNum sub-tensors
   NOTE(review): the two assignments below are presumably the two arms of an
   if/else - the keyword is not visible in this extract */
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = indexSize * copyNum;
dimSize[i] = s.dimSize[i];
/* dense ratio: 1.0 for a dense source, otherwise the ratio of s */
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);
/* NOTE(review): the list is created with capacity 3 but no Add call is
   visible here; presumably s, srcIndex and tgtIndex are appended before
   MakeLink - confirm against the full file */
XList list(3);
/* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
copy indexed sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-11-30
#include "CopyIndexed.cuh"
#include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
copy selected sub-tensors where indices are kept in tensors (kernel version)
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
/* One thread per (sub-tensor copy i, item offset) pair: copies item `offset`
   of the selected source sub-tensor to the selected target sub-tensor. */
void KernelCopyIndexed(DTYPE * sData, DTYPE * tData, int * sIndex, int * tIndex,
int blockNum, int blockSizeSrc, int blockSizeTgt,
int stride, int indexSize, int copyNum)
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
/* there are blockNum * indexSize * copyNum sub-tensor copies of `stride`
   items each; threads outside that range have nothing to do */
if(i >= blockNum * indexSize * copyNum || offset >= stride)
/* split i into the block number and the position inside the block; the
   position is split again into the index entry (realIndex / copyNum) and
   the c-th consecutive sub-tensor for that entry (realIndex % copyNum) */
int realIndexSize = indexSize * copyNum;
int realBlockNum = i / realIndexSize;
int realIndex = i % realIndexSize;
int realSrcIndex = sIndex[realIndex / copyNum] + realIndex % copyNum;
int realTgtIndex = tIndex[realIndex / copyNum] + realIndex % copyNum;
/* the thread with threadIdx.y == 0 computes the base pointers for its
   threadIdx.x and stores them in sp/tp
   NOTE(review): the declarations of sp and tp, and any barrier between this
   store and the loads below, are not visible in this extract - presumably
   they are per-block shared arrays followed by a synchronisation; confirm */
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + realBlockNum * blockSizeSrc + realSrcIndex * stride;
tp[threadIdx.x] = tData + realBlockNum * blockSizeTgt + realTgtIndex * stride;
DTYPE * s = sp[threadIdx.x];
DTYPE * t = tp[threadIdx.x];
t[offset] = s[offset];
copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
/*
Host-side launcher: derives the block layout of s and t around dimension dim
and starts KernelCopyIndexed on the device of the source tensor.
NOTE(review): no argument validation is visible in this excerpt; braces are
also missing from the excerpt - confirm against the full file.
*/
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
/* the device and the order are both taken from the source tensor */
int devID = s->devID;
int order = s->order;
/* number of index entries (tgtIndex is not consulted for the count) */
int indexSize = srcIndex->unitNum;
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
/* blockNum = product of the dimensions in front of dim */
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
/* stride = product of the dimensions behind dim, i.e. elements per sub-tensor */
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
/* elements per outer block; source and target may differ along dim */
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
/* presumably switches the active device to devID and remembers the previous
   one in devIDBackup - macro defined elsewhere, TODO confirm */
ProtectCudaDev(devID, devIDBackup);
/* 2D launch configuration: x covers blockNum * indexSize * copyNum sub-tensor
   copies, y covers the stride elements of each one */
GDevs.GetCudaThread2D(devID, blockNum * indexSize * copyNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
/* the index buffers are handed to the kernel as they are, reinterpreted as int;
   assumes both index tensors hold ints readable from devID - TODO confirm */
int * sIndex = (int *)srcIndex->data;
int * tIndex = (int *)tgtIndex->data;
KernelCopyIndexed<<<blocks, threads >>>(sData, tData, sIndex, tIndex,
blockNum, blockSizeSrc, blockSizeTgt,
stride, indexSize, copyNum);
/* counterpart of ProtectCudaDev above */
BacktoCudaDev(devID, devIDBackup);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-11-30
* Tomorrow is the celebration of the laboratory, I'm so happy!
#ifndef __CopyIndexed_CUH__
#define __CopyIndexed_CUH__
#include "../../XTensor.h"
#include "CopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy selected sub-tensors where indices are kept in tensors (cuda version) */
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __CopyIndexed_CUH__
\ No newline at end of file
......@@ -27,22 +27,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum = 1);
/* copy selected sub-tensors where indeces are kept in tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex);
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum = 1);
copy selected sub-tensors (return an XTensor structure)
copy selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it (remove this???)
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
copy selected sub-tensors where indeces are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???)
make a new tensor to keep the result and return it
void CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex);
XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum = 1);
} // namespace nts(NiuTrans.Tensor)
......@@ -57,15 +57,14 @@ gather indexed sub-tensors
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex)
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID && t->devID == srcIndex->devID),
"the data must be kept on the same device!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0 && srcIndex->devID >= 0) {
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex);
......@@ -116,6 +115,8 @@ XTensor Gather(XTensor &s, XTensor &index)
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
delete[] dimSize;
_Gather(&s, &t, &index);
/* tensor connection */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: XIAO Tong (email: 2018-04-24
* $Created by: Xu Chen (email: 2018-11-27
#include "Gather.cuh"
#include "CopyBlocksSelected.cuh"
......@@ -41,7 +41,7 @@ __global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -54,15 +54,15 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + sIndex[i] * stride;
cp[threadIdx.x] = tData + i * stride;
tp[threadIdx.x] = tData + i * stride;
DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x];
DTYPE * t = tp[threadIdx.x];
c[offset] = s[offset];
t[offset] = s[offset];
......@@ -72,9 +72,10 @@ gather indexed sub-tensors(cuda version)
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
int devID = s->devID;
XMem * mem = s->mem;
int stride = s->GetDim(1);
int indexSize = srcIndex->unitNum;
......@@ -93,10 +94,26 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data;
int * sIndex = NULL;
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
sIndex = (int *)srcIndex->data;
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
XMemFree(mem->devID, sIndex);
BacktoCudaDev(devID, devIDBackup);
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: XIAO Tong (email: 2018-04-24
* $Created by: Xu Chen (email: 2018-11-27
#ifndef __GATHER_CUH__
#define __GATHER_CUH__
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* gather indexed sub-tensors(cuda version) */
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex);
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
#endif // USE_CUDA
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex);
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */
......@@ -134,63 +134,92 @@ void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum,
spread a collection tensor to source tensor.
And this is a special spread function for backward computation of gather function.
And this is a special spread function for backward computation of CopyIndexed function.
>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> s - the source tensor whose data would be modified
>> c - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> collIndex - the tensor to save the index of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize)
void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
XTensor * srcIndex, XTensor * collIndex,
int copyNum)
int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
/* validate the pointers first: every statement below dereferences them */
CheckNTErrors((s && c), "Invalid tensors!");
CheckNTErrors((srcIndex && collIndex), "Invalid index tensors!");

int order = s->order;
int indexSize = srcIndex->unitNum;

CheckNTErrors(indexSize != 0, "NULL index!");
CheckNTErrors((s->devID == c->devID || (s->devID < 0 && c->devID < 0)),
              "the data must be kept on the same device!");
/* the two index tensors are compared with each other
   (comparing srcIndex with itself can never fail) */
CheckNTErrors((srcIndex->devID == collIndex->devID || (s->devID < 0 && c->devID < 0)),
              "the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && c->devID < 0)),
              "the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < s->order), "A too larget dimension specified!");
CheckNTErrors((s->unitSize == c->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == collIndex->unitNum), "Unmatched index tensors!");

/* only the default data type is supported here */
CheckNTErrors(s->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
for (int i = 0; i < order; i++){
if (i == dim) {
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
for (int i = 0; i < order; i++) {
if (i != dim) {
CheckNTErrors(s->GetDim(i) == c->GetDim(i), "Unmatched dimensions");
else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
CheckNTErrors(c->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, dim, srcIndex, indexSize);
if(s->devID >= 0 && c->devID >= 0) {
_CudaSpreadForCopyIndexed(s, c, dim, srcIndex, collIndex, copyNum);
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = dim + 1; i < order; i++) {
stride *= source->GetDim(i);
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
blockSizeSrc = stride * source->GetDim(dim);
blockSizeColl = stride * collection->GetDim(dim);
blockNum = source->unitNum / blockSizeSrc;
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * c->GetDim(dim);
for(int i = 0; i < indexSize; i++){
int src = srcIndex[i];
int tgt = i;
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
_AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * cData = (DTYPE*)c->data;
int * sIndex = (int*)srcIndex->data;
int * cIndex = (int*)collIndex->data;
for (int i = 0; i < indexSize; i++) {
for (int c = 0; c < copyNum; c++) {
int si = sIndex[i] + c;
int ti = cIndex[i] + c;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = cData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(sd + k) += *(td + k);
......@@ -218,7 +247,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0 && index->devID >= 0) {
if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, index);
......@@ -19,13 +19,11 @@
* $Created by: Xu Chen (email: 2018-09-25
#ifndef __SPREAD_CUH__
#define __SPREAD_CUH__
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Spread.cuh"
#include "CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -60,7 +58,6 @@ void KernelSpread(DTYPE * sData, DTYPE * cData, int blockNum,
s[j] = c[j];
This is core assignment for spread function.
......@@ -201,53 +198,24 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
This is core assignment for backward computation of gather function.
Care of the operator "+=" instead of "=".
spread a collection tensor to source tensor (kernel version).
And this is a special spread function for backward computation of CopyIndexed function.
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> sIndex - index of the source sub-tensor
>> cIndex - index of the sub-tensor in the collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of collection data block
>> stride - stride of a data block
>> indexSize - number of indices
>> copyNum - number of the sub-tensors we copy for each source index
void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride)
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum || j >= stride)
DTYPE * s = sData + blockSizeSrc * i;
DTYPE * c = cData + blockSizeColl * i;
s[j] += c[j];
This is core assignment for backward computation of gather function.
Care of the operator "+=" instead of "=".
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of source data block
>> stride - stride of a data block
>> subtensorNum - number of sub-tensors
>> srcIndex - index of the source sub-tensor
void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride,
int subtensorNum,
int * srcIndex)
void KernelSpreadForCopyIndexed(DTYPE * sData, DTYPE * cData, int * sIndex, int * cIndex,
int blockNum, int blockSizeSrc, int blockSizeColl,
int stride, int indexSize, int copyNum)
......@@ -258,105 +226,140 @@ void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
int blockId = i % blockNum;
int subtensorId = i / blockNum;
int realIndexSize = indexSize * copyNum;
if(subtensorId >= subtensorNum || offset >= stride)
int realBlockNum = i / realIndexSize;
int tmp = i % realIndexSize;
int realIndex = tmp / copyNum;
int realCopyNum = tmp % copyNum;
if (realBlockNum >= blockNum || offset >= stride || realIndex >= indexSize || realCopyNum >= copyNum)
//if(i >= blockNum * indexSize * copyNum || offset >= stride)
// return;
int realSrcIndex = sIndex[realIndex] + realCopyNum;
int realCollIndex = cIndex[realIndex] + realCopyNum;
//int realSrcIndex = sIndex[realIndex / copyNum] + realIndex % copyNum;
//int realCollIndex = cIndex[realIndex / copyNum] + realIndex % copyNum;
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + srcIndex[subtensorId] * stride;
cp[threadIdx.x] = cData + subtensorId * stride;
sp[threadIdx.x] = sData + realBlockNum * blockSizeSrc + realSrcIndex * stride;
cp[threadIdx.x] = cData + realBlockNum * blockSizeColl + realCollIndex * stride;
DTYPE * s = sp[threadIdx.x] + blockSizeSrc * blockId;
DTYPE * c = cp[threadIdx.x] + blockSizeColl * blockId;
DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x];
atomicAdd(s + offset, c[offset]);
s[offset] += c[offset];
spread a collection tensor to source tensor (cuda version).
And this is a special spread function for backward computation of gather function.
spread a collection tensor to source tensor.
And this is a special spread function for backward computation of CopyIndexed function.
>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> s - the source tensor whose data would be modified
>> c - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> collIndex - the tensor to save the index of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize)
void _CudaSpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
XTensor * srcIndex, XTensor * collIndex,
int copyNum)
int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
int devID = s->devID;
int order = s->order;
int indexSize = srcIndex->unitNum;
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= source->GetDim(i);
stride *= s->GetDim(i);
blockSizeSrc = stride * source->GetDim(dim);
blockSizeColl = stride * collection->GetDim(dim);
blockNum = source->unitNum / blockSizeSrc;
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * c->GetDim(dim);
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup);
ProtectCudaDev(devID, devIDBackup);
if(indexSize < 4){
GDevs.GetCudaThread2D(source->devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
GDevs.GetCudaThread2D(devID, blockNum * indexSize * copyNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
for(int i = 0; i < indexSize; i++) {
int src = srcIndex[i];
int tgt = i;
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
DTYPE * sData = (DTYPE*)s->data;
DTYPE * cData = (DTYPE*)c->data;
KernelSpreadForGather<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
XMem * mem = source->mem;
int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(source->devID, sizeof(int) * indexSize);
int * sIndex = (int *)srcIndex->data;
int * cIndex = (int *)collIndex->data;
XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize);
KernelSpreadForCopyIndexed<<<blocks, threads >>>(sData, cData, sIndex, cIndex,
blockNum, blockSizeSrc, blockSizeTgt,
stride, indexSize, copyNum);
DTYPE * s = (DTYPE*)source->data;
DTYPE * c = (DTYPE*)collection->data;
BacktoCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(source->devID, blockNum * indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
This is core assignment for backward computation of gather function.
Care of the operator "+=" instead of "=".
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> srcIndex - index of the source sub-tensor
>> indexSize - the number of index
>> stride - stride of a data block
void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int * srcIndex,
int indexSize, int stride)
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si);
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
else {
XMemFree(source->devID, si);
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= indexSize || offset >= stride)
if (threadIdx.y == 0) {
sp[threadIdx.x] = sData + srcIndex[i] * stride;
cp[threadIdx.x] = cData + i * stride;
DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x];
//DTYPE * s = sData + srcIndex[i] * stride;
//DTYPE * c = cData + i * stride;
atomicAdd(s + offset, c[offset]);
......@@ -369,34 +372,44 @@ And this is a special spread function for backward computation of gather functio
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex)
int dim = 0;
int devID = source->devID;
XMem * mem = source->mem;
int blockNum = 1;
int stride = source->GetDim(1);
int indexSize = srcIndex->unitNum;
int blockSizeSrc = stride * source->GetDim(dim);
int blockSizeColl = stride * collection->GetDim(dim);
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
int * sIndex = NULL;
GDevs.GetCudaThread2D(devID, indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * s = (DTYPE*)source->data;
DTYPE * c = (DTYPE*)collection->data;
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
sIndex = (int *)srcIndex->data;
int * si = (int *)srcIndex->data;
KernelSpreadForGather<<<blocks, threads >>>(sData, cData, sIndex, indexSize, stride);
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si);
if (srcIndex->devID < 0) {
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
XMemFree(devID, sIndex);
BacktoCudaDev(source->devID, devIDBackup);
......@@ -404,5 +417,3 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
#endif // __SPREAD_CUH__
\ No newline at end of file
......@@ -32,9 +32,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _CudaSpread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
/* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize);
/* special spread function for backward computation of CopyIndexed function (cuda version) */
void _CudaSpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex);
......@@ -36,9 +36,10 @@ void Spread(XTensor * source, XTensor * collection,
XTensor * srcIndex, XTensor * collIndex,
int dim);
/* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize);
/* special spread function for backward computation of CopyIndexed function */
void _SpreadForCopyIndexed(XTensor * source, XTensor * collection, int dim,
XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index);
......@@ -46,6 +46,22 @@ sum all the items of the tensor (It should be optimized!)
DTYPE _ReduceSumAll(const XTensor * source)
int dims[2] = {1, source->unitNum};
int one = 1;
XTensor * all = NewTensorBuf(2, dims, source->dataType, source->denseRatio, source->devID, source->mem);
XTensor * result = NewTensorBuf(1, &one, source->dataType, 1.0F, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, result, 1);
DTYPE r = result->Get1D(0);
return r;
int order = source->order;
DTYPE summation;
......@@ -94,7 +94,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
gridSize = blockNum;
gridNum = s->unitNum / (blockSize * blockNum);
if (mergedNum * gridNum <= MIN_TENSOR_SPLIT_NUM) {
if (mergedNum * gridNum <= MIN_TENSOR_MERGE_NUM) {
int sPitch = blockSize * s->unitSize;
int tPtich = blockSize * mergedNum * t->unitSize;
int mSize = blockSize * t->unitSize;
......@@ -253,7 +253,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
gridNum = s0->unitNum / (blockSize * blockNum);
/* merging with fewer data copy operations */
if (mergedNum * gridNum <= MIN_TENSOR_SPLIT_LIST_NUM) {
if (mergedNum * gridNum <= MIN_TENSOR_MERGE_LIST_NUM) {
int sPitch = blockSize * s0->unitSize;
int tPtich = blockSize * mergedNum * big->unitSize;
int mSize = blockSize * big->unitSize;
......@@ -126,7 +126,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
......@@ -153,7 +153,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
int unitNum = x.dimSize[n];
DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
......@@ -166,4 +166,33 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
return MultiplyDim(x, mask, n, 0);
/*
dropout function without broadcast
(the mask has the shape of the input: one random draw per element of x)
>> x - input tensor
>> dropProb - probability to set an element to zero
<< return - the element-wise product of x and the random mask
*/
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
{
    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");

    /* scale handed to RandomBernoulli together with dropProb
       (presumably the value given to the elements that are kept - TODO confirm) */
    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);

    /* generate a mask tensor with probability p */
    int unitNum = x.unitNum;
    DTYPE * maskArray = new DTYPE[unitNum];

    /* The random generator is deliberately not re-seeded here: seeding from
       time(NULL) on every call gives identical masks to all calls made within
       the same second and resets the generator state of the whole program.
       Dropout() no longer re-seeds either; seeding belongs to the caller. */
    for (int i = 0; i < unitNum; i++)
        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);

    /* the mask is created with the order, shape, type, device and pool of x */
    XTensor mask;
    InitTensor(&mask, x.order, x.dimSize, x.dataType, x.denseRatio, x.devID, x.mem);
    mask.SetData(maskArray, unitNum);

    delete[] maskArray;

    return Multiply(x, mask);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -43,6 +43,9 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-12
* $Created by: Xu Chen (email: 2018-07-12
#ifndef __TEST_ABSOLUTE_H__
#define __TEST_ABSOLUTE_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Lin Ye (email: 2018-08-03
* $Created by: Lin Ye (email: 2018-08-03
#include "../XTensor.h"
#include "../core/math/Clip.h"
#include "TClip.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Lin Ye (email: 2018-08-03
* $Created by: Lin Ye (email: 2018-08-03
#ifndef __TEST_CLIP_H__
#define __TEST_CLIP_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-12
#include "../XTensor.h"
#include "../core/math/Compare.h"
#include "TCompare.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
case 1: test Equal function.
Compare whether every entry is equal to the specified value.
/* Case 1 driver: applies _Equal, _EqualMe and Equal with the value 1.0 to a (3, 2)
   tensor and compares every result with the expected 0/1 pattern in `answer`
   (1 where aData holds 1.0, 0 elsewhere). Returns the combined check result. */
bool TestCompare1()
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
/* total number of entries, i.e. the product of all dimension sizes */
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* input data */
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, 1.0F} };
/* expected output: 1 at the positions where aData equals 1.0, 0 elsewhere */
DTYPE answer[3][2] = { {1.0F, 0.0F},
{0.0F, 0.0F},
{0.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors: a is the input, b receives the output of _Equal,
   aMe is the operand of _EqualMe, bUser receives the result of Equal */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Equal function (all three variants, comparing against 1.0) */
_Equal(a, b, 1.0);
/* aMe starts as a copy of aData and is checked against answer below,
   so the test expects _EqualMe to overwrite it with the result */
_EqualMe(aMe, 1.0);
bUser = Equal(*a, 1.0);
/* check results (NOTE(review): 1e-4F is presumably the comparison tolerance - confirm) */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor (NOTE(review): the trailing 0 is presumably the device id - confirm) */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Equal function (same three variants as in the CPU test) */
_Equal(aGPU, bGPU, 1.0);
_EqualMe(aMeGPU, 1.0);
bUserGPU = Equal(*aGPU, 1.0);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables (both the CPU and the GPU tensors, plus the dimension array) */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
/* destroy variables
   NOTE(review): presumably the non-CUDA path (no #else is visible here) - confirm */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
/* other cases */
/* test for Compare Function: runs the Compare test cases, reports the outcome
   through XPRINT and returns the overall flag */
bool TestCompare()
XPRINT(0, stdout, "[TEST Compare] compare every entry with specified value \n");
/* returnFlag: overall result; caseFlag: result of the case that ran last */
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestCompare1();
/* a failed case clears the overall flag */
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
/* NOTE(review): presumably printed only when the case succeeded - the branch
   separating it from the message above is not visible here, confirm */
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/* final summary */
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-12-10
/* include guard, spelled in upper case like the guards of the other test headers
   (e.g. __TEST_ABSOLUTE_H__, __TEST_CLIP_H__) */
#ifndef __TEST_COMPARE_H__
#define __TEST_COMPARE_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Compare Function: runs the Compare test cases and
   returns true when all of them pass, false otherwise */
bool TestCompare();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_COMPARE_H__
......@@ -232,13 +232,12 @@ bool TestConvertDataType3()
/* initialize variables */
a->SetData(data1, unitNum1);
/* call ConvertDataType function */
/* call ConvertDataType function (We have not implemented this yet...) */
//_ConvertDataType(a, b);
//_ConvertDataType(b, c);
/* check results */
cpuTest = a->CheckData(data1, unitNum1, 1e-4F);
c->Dump(stderr, "");
//cpuTest = a->CheckData(data1, unitNum1, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -52,6 +52,15 @@ bool TestCopyIndexed1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
......@@ -76,19 +85,28 @@ bool TestCopyIndexed1()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -96,35 +114,55 @@ bool TestCopyIndexed1()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU;
delete tGPU;
delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
......@@ -159,6 +197,15 @@ bool TestCopyIndexed2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
......@@ -183,19 +230,28 @@ bool TestCopyIndexed2()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -203,35 +259,55 @@ bool TestCopyIndexed2()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU;
delete tGPU;
delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
......@@ -266,6 +342,15 @@ bool TestCopyIndexed3()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (1) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 1;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
......@@ -290,19 +375,28 @@ bool TestCopyIndexed3()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -310,35 +404,55 @@ bool TestCopyIndexed3()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU;
delete tGPU;
delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
......@@ -374,15 +488,13 @@ bool TestCopyIndexed4()
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int iOrder = 3;
int * iDimSize = new int[iOrder];
iDimSize[0] = 3;
iDimSize[1] = 2;
iDimSize[2] = 2;
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int iUnitNum = 1;
for (int i = 0; i < iOrder; i++)
iUnitNum *= iDimSize[i];
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
......@@ -408,21 +520,28 @@ bool TestCopyIndexed4()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(tOrder, tDimSize, X_INT);
XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
index->SetData(srcIndex, iUnitNum);
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t, dim, (int*)index->data, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, (int*)index->data, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -430,42 +549,60 @@ bool TestCopyIndexed4()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, (int*)index->data, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete index;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU;
delete tGPU;
delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
case 5: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 1,
......@@ -495,6 +632,15 @@ bool TestCopyIndexed5()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
......@@ -519,19 +665,28 @@ bool TestCopyIndexed5()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -539,35 +694,55 @@ bool TestCopyIndexed5()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU;
delete tGPU;
delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
/* destroy variables */
delete s;
delete t;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
......@@ -311,8 +311,8 @@ bool TestCrossEntropy3()
delete goldGPU;
delete lossGPU;
delete weightGPU;
delete[] dimSize;
delete[] wDimSize;
return cpuTest && gpuTest;
......@@ -322,6 +322,7 @@ bool TestCrossEntropy3()
delete loss;
delete weight;
delete[] dimSize;
delete[] wDimSize;
return cpuTest;
#endif // USE_CUDA
......@@ -212,6 +212,8 @@ bool TestDropout2()
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete[] dimSize;
return cpuTest;
......@@ -332,6 +332,7 @@ bool TestGather3()
/* destroy variables */
delete s;
delete t;
delete index;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......@@ -150,6 +150,7 @@ bool TestSetData2()
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest;
......@@ -157,6 +158,7 @@ bool TestSetData2()
delete s;
delete modify;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest;
#endif // USE_CUDA
......@@ -242,6 +244,7 @@ bool TestSetData3()
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest;
......@@ -249,6 +252,7 @@ bool TestSetData3()
delete s;
delete modify;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest;
#endif // USE_CUDA
......@@ -61,7 +61,9 @@ bool TestSort1()
_SortMe(aMe, index, 0);
Sort(*a, bUser, *index, 0);
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum);
cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -85,7 +87,9 @@ bool TestSort1()
Sort(*aGPU, bUserGPU, *indexGPU, 0);
/* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum);
gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete a;
......@@ -149,7 +153,9 @@ bool TestSort2()
Sort(*a, bUser, *index, 1);
/* check results */
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum);
cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -173,7 +179,9 @@ bool TestSort2()
Sort(*aGPU, bUserGPU, *indexGPU, 1);
/* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum);
gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete a;
......@@ -357,6 +357,7 @@ bool TestSplit3()
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
delete tList;
return cpuTest;
#endif // USE_CUDA
......@@ -182,6 +182,7 @@ bool TestSpread2()
int dim = 0;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {0, 1};
/* CPU test */
bool cpuTest = true;
......@@ -190,17 +191,19 @@ bool TestSpread2()
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * cIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum);
index->SetData(srcIndex, indexSize);
sIndex->SetData(srcIndex, indexSize);
cIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForGather(s1, t, dim, srcIndex, indexSize);
_SpreadForGather(s2, t, index);
_SpreadForCopyIndexed(s1, t, dim, sIndex, cIndex, 1);
_SpreadForGather(s2, t, sIndex);
/* check results */
cpuTest = s1->CheckData(answer, tUnitNum) &&
......@@ -214,17 +217,19 @@ bool TestSpread2()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * cIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
indexGPU->SetData(srcIndex, indexSize);
sIndexGPU->SetData(srcIndex, indexSize);
cIndexGPU->SetData(tgtIndex, indexSize);
/* call the spread functions on the GPU tensors */
_SpreadForGather(sGPU1, tGPU, dim, srcIndex, indexSize);
_SpreadForGather(sGPU2, tGPU, indexGPU);
/* pass the index tensors that were created and filled next to sGPU1/tGPU
   (sIndexGPU, cIndexGPU) instead of the CPU-test tensors sIndex/cIndex,
   in line with the _SpreadForGather call below */
_SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndexGPU, cIndexGPU, 1);
_SpreadForGather(sGPU2, tGPU, sIndexGPU);
/* check results */
gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
......@@ -234,11 +239,13 @@ bool TestSpread2()
delete s1;
delete s2;
delete t;
delete index;
delete sIndex;
delete cIndex;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete indexGPU;
delete sIndexGPU;
delete cIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......@@ -249,6 +256,8 @@ bool TestSpread2()
delete s1;
delete s2;
delete t;
delete sIndex;
delete cIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......@@ -31,6 +31,7 @@ bool Test()
wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong;
wrong = !TestCompare() || wrong;
wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong;
......@@ -24,6 +24,7 @@
#include "TAbsolute.h"
#include "TClip.h"
#include "TCompare.h"
#include "TConcatenate.h"
#include "TConcatenateSolely.h"
#include "TCos.h"
Markdown 格式
您添加了 0 到此讨论。请谨慎行事。
注册 或者 后发表评论