Commit 854a4835 by xuchen

1. Fixed a bug in the Spread function. 2. Added the Compare and OnehotAndIndex functions. 3. Optimized the code.
parent 61c4d15c
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "XNoder.h" #include "XNoder.h"
#include "XBackwardShape.h" #include "XBackwardShape.h"
#include "../tensor/XName.h" #include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h" #include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h" #include "../tensor/core/getandset/SetData.h"
...@@ -40,7 +41,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent) ...@@ -40,7 +41,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
if(operID == MOVEMENT_COPYINDEXED) if(operID == MOVEMENT_COPYINDEXED)
GradCopyIndexed(node, isEfficent); GradCopyIndexed(node, isEfficent);
if(operID == MOVEMENT_GATHER) else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent); GradGather(node, isEfficent);
else if(operID == SHAPE_MERGE) else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent); GradMerge(node, isEfficent);
...@@ -80,7 +81,7 @@ gradient computation for copying indexed sub-tensors ...@@ -80,7 +81,7 @@ gradient computation for copying indexed sub-tensors
for for
b = copyindexed(a) b = copyindexed(a)
we have we have
dE/da = spread(b) dE/da = spreadforcopyindexed(b)
>> node - the node (c) for backward computation >> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
an efficient manner an efficient manner
...@@ -91,32 +92,14 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent) ...@@ -91,32 +92,14 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!"); CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
int dim = income.GetParamInt(0); int dim = income.GetParamInt(0);
int * srcIndex = (int *)income.GetParamPointer(1); int copyNum = income.GetParamInt(1);
int indexSize = income.GetParamInt(2);
int * tgtIndex = (int *)income.GetParamPointer(3);
int copyNum = income.GetParamInt(4);
int realIndexSize = indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
int * realTgtIndex = new int[realIndexSize];
for(int i = 0; i < indexSize; i++) {
for(int j = 0; j < copyNum; j++) {
realSrcIndex[i * copyNum + j] = srcIndex[i] + j;
realTgtIndex[i * copyNum + j] = tgtIndex[i] + j;
}
}
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
XNoder::MakeGrad(input); XTensor * srcIndex = income.tails[1];
XTensor * tgtIndex = income.tails[2];
_Spread(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
delete[] realSrcIndex; XNoder::MakeGrad(input);
delete[] realTgtIndex; _SpreadForCopyIndexed(input->grad, node->grad, dim, srcIndex, tgtIndex, copyNum);
delete[] srcIndex;
delete[] tgtIndex;
node->visitMark = NODE_FINISHED;
} }
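For reference, here is a minimal sketch of the accumulation rule that the new _SpreadForCopyIndexed call relies on, written over plain C++ arrays along dimension 0 of a 2-D tensor; the helper name and layout are illustrative, not the library implementation. Since the forward pass copies sub-tensor srcIndex[i]+j of a into position tgtIndex[i]+j of b, the backward pass adds the matching slices of grad(b) back into grad(a):

/* illustrative only: grad(a)[srcIndex[i]+j] += grad(b)[tgtIndex[i]+j] */
void SpreadForCopyIndexedSketch(float * gradA, const float * gradB, int rowSize,
                                const int * srcIndex, const int * tgtIndex,
                                int indexSize, int copyNum)
{
    for (int i = 0; i < indexSize; i++) {
        for (int j = 0; j < copyNum; j++) {
            float * dst = gradA + (srcIndex[i] + j) * rowSize;
            const float * src = gradB + (tgtIndex[i] + j) * rowSize;
            for (int k = 0; k < rowSize; k++)
                dst[k] += src[k];
        }
    }
}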
/* /*
...@@ -143,7 +126,6 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent) ...@@ -143,7 +126,6 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
/* /*
gradient for merge gradient for merge
for for
...@@ -181,6 +163,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent) ...@@ -181,6 +163,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input); XNoder::MakeGrad(input);
int * dims = new int[input->order]; int * dims = new int[input->order];
memset(dims, 0, sizeof(int) * input->order);
for(int i = 0, j = 0; i < input->order; i++){ for(int i = 0, j = 0; i < input->order; i++){
if(i >= leadDim){ if(i >= leadDim){
dims[j++] = input->dimSize[i]; dims[j++] = input->dimSize[i];
......
...@@ -231,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model) ...@@ -231,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
} }
for(int i = 0; i < argc; i++){ for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], "-mempool")) if (!strcmp(argv[i], "-mempool"))
model.mem = new XMem(model.devID); model.mem = new XMem(model.devID);
} }
} }
...@@ -715,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise. ...@@ -715,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise.
>> devID - device id >> devID - device id
>> mem - memory pool >> mem - memory pool
*/ */
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, int itemNum, int devID, XMem * mem) void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{ {
if(devID >= 0 || (mem != NULL && mem->devID >= 0)) InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, -1);
else
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
tensor.SetZeroAll(); tensor.SetZeroAll();
/* set none-zero cells */ /* set none-zero cells */
for(int i = 0; i < itemNum; i++) for(int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]); tensor.Set2D(1.0F, rows[i], cols[i]);
if(devID >= 0 || (mem != NULL && mem->devID >= 0)){
XList list(1);
list.Add(&tensor);
CPUToGPUFlush(&list, devID, mem);
}
} }
/* /*
...@@ -859,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net) ...@@ -859,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = softmax(s) */ /* y = softmax(s) */
_LogSoftmax(&s, &y, 1); _LogSoftmax(&s, &y, 1);
} }
} }
/* /*
...@@ -998,7 +988,6 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model ...@@ -998,7 +988,6 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
XTensor embeddingBig; XTensor embeddingBig;
XTensor hidden; XTensor hidden;
XTensor b; XTensor b;
XTensor srcIndex;
int size = batch * (n-1); int size = batch * (n-1);
int * index = new int[size]; int * index = new int[size];
...@@ -1010,28 +999,25 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model ...@@ -1010,28 +999,25 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
} }
} }
InitTensor1D(&srcIndex, size, X_INT, model.devID, model.mem); InitTensor1D(&words, size, X_INT, model.devID, model.mem);
srcIndex.SetData(index, size); words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
XTensor embedding;
embedding = Gather(model.embeddingW, srcIndex);
delete[] index; delete[] index;
int dimSize[2]; int dimSize[2];
dimSize[0] = embedding.GetDim(0) / (n - 1); dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embedding.GetDim(1) * (n - 1); dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
hidden = Reshape(embedding, embedding.order, dimSize); hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */ /* hidden layers */
for(int i = 0; i < depth; i++) for(int i = 0; i < depth; i++)
hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]; hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */ /* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
} }
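A shape walk-through of the rewritten forward pass, with illustrative numbers (batch = 2, n = 3, eSize = 4, hSize = 8, vSize = 10 are assumptions, not values taken from the code):

/* index   : 2 * (3 - 1) = 4 word ids
   words   : 1-D X_INT tensor of size 4
   Gather(model.embeddingW, words)                  -> (4, 4)  one embedding row per id
   Reshape(embeddingBig, 2, {4 / 2, 4 * 2})         -> (2, 8)  the n-1 context embeddings of each sample, concatenated
   HardTanH(MMul(hidden, hiddenW[i]) + hiddenB[i])  -> (2, 8)  repeated depth times
   LogSoftmax(MMul(hidden, outputW) + outputB, 1)   -> (2, 10) log-probabilities over the vocabulary */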
/* /*
...@@ -1071,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model) ...@@ -1071,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
/* output layer */ /* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
} }
/* /*
......
...@@ -70,7 +70,7 @@ make the decoding network ...@@ -70,7 +70,7 @@ make the decoding network
>> inputDec - the input tensor of the decoder >> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder >> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid >> mask - mask that indicates which position is valid
>> mask - mask for the encoder-decoder attention >> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether the model is used for training >> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder << return - the output tensor of the encoder
*/ */
......
...@@ -150,7 +150,6 @@ XTensor T2TEmbedder::Make(XTensor &input) ...@@ -150,7 +150,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
} }
/* then we make word embeddings */ /* then we make word embeddings */
//wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
wordEmbedding = Gather(w, input); wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize)); wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
......
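For a 1-D index tensor, Gather(w, index) selects one row of the embedding matrix per index entry; a plain-array sketch of that semantics (hypothetical helper, not the library kernel):

void GatherRowsSketch(const float * w, int eSize,
                      const int * index, int indexNum, float * out)
{
    /* out[i] = w[index[i]] for every index entry */
    for (int i = 0; i < indexNum; i++)
        for (int k = 0; k < eSize; k++)
            out[i * eSize + k] = w[index[i] * eSize + k];
}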
...@@ -693,70 +693,46 @@ int T2TTrainer::LoadBatchLM(FILE * file, ...@@ -693,70 +693,46 @@ int T2TTrainer::LoadBatchLM(FILE * file,
dims[2] = vs; dims[2] = vs;
InitTensor2D(batchEnc, sc, max, X_INT, devID, mem); InitTensor2D(batchEnc, sc, max, X_INT, devID, mem);
//InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem); InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem); InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
batchEnc->SetZeroAll(); batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
gold->SetZeroAll(); gold->SetZeroAll();
paddingEnc->SetZeroAll();
paddingDec->SetZeroAll(); paddingDec->SetZeroAll();
if(isTraining) {
//XNoder::MakeGrad(batchEnc);
XNoder::MakeGrad(paddingEnc);
XNoder::MakeGrad(gold);
XNoder::MakeGrad(paddingDec);
//batchEnc->grad->SetZeroAll();
paddingEnc->grad->SetZeroAll();
gold->grad->SetZeroAll();
paddingDec->grad->SetZeroAll();
}
int seqSize = 0; int seqSize = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum]; int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE * goldOffsets = new MTYPE[gold->unitNum]; MTYPE * goldOffsets = new MTYPE[gold->unitNum];
MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum]; //MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
//MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
int wGold = 0; int wGold = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc); memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
for(int s = seq; s < seq + sc; s++){ for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1; int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!"); CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
//batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w); batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
batchEncOffsets[wCount] = batchEnc->GetOffset2D(s - seq, w); //paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
batchEncValues[wCount] = num; //paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
//paddingEnc->Set2D(1.0F, s - seq, w);
//paddingDec->Set2D(1.0F, s - seq, w);
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
if (w > 0) if (w > 0)
//gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
if (w == len - 1) { if (w == len - 1) {
if (isDoubledEnd) if (isDoubledEnd)
//gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
else else
//gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
} }
wCount++; wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
else
fprintf(tf, "\n");*/
if(seqs != NULL) if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w]; seqs[seqSize++] = buf[seqOffset[s] + w];
} }
...@@ -767,16 +743,25 @@ int T2TTrainer::LoadBatchLM(FILE * file, ...@@ -767,16 +743,25 @@ int T2TTrainer::LoadBatchLM(FILE * file,
} }
} }
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount); batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
gold->SetDataBatched(goldOffsets, 1.0F, wGold); gold->SetDataBatched(goldOffsets, 1.0F, wGold);
//paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
//paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);
delete[] batchEncOffsets;
delete[] batchEncValues; delete[] batchEncValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
delete[] goldOffsets; delete[] goldOffsets;
//delete[] paddingEncOffsets;
//delete[] paddingDecOffsets;
fflush(tf); fflush(tf);
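The _NotEqual blocks above derive the padding masks directly from the token ids instead of setting them offset by offset: a position is real exactly when its id is non-zero. A minimal CPU sketch of the same idea over plain arrays (hypothetical helper, assuming id 0 marks padding as the code above does):

void MakePaddingMaskSketch(const int * batchIds, float * padding, int unitNum)
{
    /* padding[i] = 1.0F for real tokens, 0.0F for padded positions */
    for (int i = 0; i < unitNum; i++)
        padding[i] = (batchIds[i] != 0) ? 1.0F : 0.0F;
}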
...@@ -877,33 +862,33 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -877,33 +862,33 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int wGold = 0; int wGold = 0;
wCount = 0; wCount = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum]; int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * batchDecOffsets = new MTYPE[batchDec->unitNum];
int * batchDecValues = new int[batchDec->unitNum]; int * batchDecValues = new int[batchDec->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2]; //MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2]; //MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2]; MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
/* batch of the source-side sequences */ /* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){ for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s]; int len = seqLen[s];
int sent = (s - seq)/2; int sent = (s - seq)/2;
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
batchEncOffsets[wCount] = batchEnc->GetOffset2D(sent, w); //paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
batchEncValues[wCount] = num; wCountEnc++;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(sent, w);
wCount++;
} }
} }
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount); batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount); //paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
wCountEnc = wCount; _ConvertDataType(batchEnc, tmp);
wCount = 0; _NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
/* batch of the target-side sequences */ /* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){ for(int s = seq + 1; s < seq + sc; s += 2){
...@@ -912,10 +897,8 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -912,10 +897,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int sent = (s - seq - 1)/2; int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
batchDecOffsets[wCountDec] = batchDec->GetOffset2D(sent, w); //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
batchDecValues[wCountDec] = num;
paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w > 0) if (w > 0)
goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]); goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
...@@ -938,16 +921,20 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -938,16 +921,20 @@ int T2TTrainer::LoadBatchMT(FILE * file,
} }
} }
batchDec->SetDataBatched(batchDecOffsets, batchDecValues, wCountDec); batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec); //paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchDec, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);
gold->SetDataBatched(goldOffsets, 1.0F, wGold); gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncOffsets;
delete[] batchEncValues; delete[] batchEncValues;
delete[] batchDecOffsets;
delete[] batchDecValues; delete[] batchDecValues;
delete[] paddingEncOffsets; //delete[] paddingEncOffsets;
delete[] paddingDecOffsets; //delete[] paddingDecOffsets;
delete[] goldOffsets; delete[] goldOffsets;
return sc; return sc;
...@@ -981,11 +968,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs) ...@@ -981,11 +968,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
XTensor probs; XTensor probs;
InitTensor(&probs, output); InitTensor(&probs, output);
/*XTensor logOutput; //XTensor logOutput;
InitTensor(&logOutput, output); //InitTensor(&logOutput, output);
_Log(output, &logOutput);*/ //_Log(output, &logOutput);
/* probs[i,j] = output[i,j] * gold[i,j] */ /* probs[i,j] = output[i,j] * gold[i,j] */
//_Multiply(&logOutput, gold, &probs);
_Multiply(output, gold, &probs); _Multiply(output, gold, &probs);
/* probability of each word */ /* probability of each word */
...@@ -1161,6 +1149,7 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi ...@@ -1161,6 +1149,7 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
_ExpMe(output); _ExpMe(output);
_ScaleAndShiftMe(output, 1/count); _ScaleAndShiftMe(output, 1/count);
_LogMe(output); _LogMe(output);
_ScaleAndShiftMe(gold, 1/count); _ScaleAndShiftMe(gold, 1/count);
} }
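Since output holds log-probabilities, the three in-place calls above amount to a shift in log space; written out:

/* log( exp(output) * (1 / count) ) = output + log(1 / count) = output - log(count)
   so RescaleOutput lowers every log-probability by log(count), and the gold
   distribution is scaled by 1 / count so that the two stay consistent */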
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include <math.h> #include <math.h>
#include <time.h>
#include "Transformer.h" #include "Transformer.h"
#include "T2TModel.h" #include "T2TModel.h"
#include "T2TUtility.h" #include "T2TUtility.h"
...@@ -58,6 +59,7 @@ int TransformerMain(int argc, const char ** argv) ...@@ -58,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, ""); LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, ""); LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer; T2TTrainer trainer;
trainer.Init(argc, args); trainer.Init(argc, args);
......
...@@ -65,10 +65,10 @@ namespace nts { ...@@ -65,10 +65,10 @@ namespace nts {
#endif #endif
#ifndef MIN #ifndef MIN
#define MIN(a,b) ((a < b) ? a : b) #define MIN(a,b) ((a) < (b) ? a : b)
#endif #endif
#ifndef MAX #ifndef MAX
#define MAX(a,b) ((a > b) ? a : b) #define MAX(a,b) ((a) > (b) ? a : b)
#endif #endif
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) #define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
......
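The extra parentheses in MIN/MAX matter whenever an argument contains an operator that binds more loosely than the comparison; a small worked case with hypothetical values:

/* with x = 2 and y = 3:
   old: MIN(x & 1, y) -> ((x & 1 < y) ? x & 1 : y)
        '<' binds tighter than '&', so the test is x & (1 < y) = 2 & 1 = 0 -> false -> result 3
   new: ((x & 1) < (y) ? x & 1 : y) -> 0 < 3 -> true -> result 0, the intended minimum */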
...@@ -79,7 +79,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -79,7 +79,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_COPYVALUES + 1 #define SHAPE MOVEMENT_GATHER + 1
#define SHAPE_CONCATENATE SHAPE + 1 #define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1 #define SHAPE_MERGE_LIST SHAPE_MERGE + 1
......
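The SHAPE change above removes an ID collision introduced when MOVEMENT_GATHER was added; with an assumed base value the expansion is:

/* assuming MOVEMENT_COPYINDEXED expands to 100:
   MOVEMENT_COPYVALUES = 101, MOVEMENT_GATHER = 102
   old: SHAPE = MOVEMENT_COPYVALUES + 1 = 102  (collides with MOVEMENT_GATHER)
   new: SHAPE = MOVEMENT_GATHER + 1     = 103  (unique again, as are the SHAPE_* IDs built on it) */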
...@@ -804,7 +804,7 @@ set tensor items with an array of values ...@@ -804,7 +804,7 @@ set tensor items with an array of values
>> values - value for each data item >> values - value for each data item
>> num - number of the data items >> num - number of the data items
*/ */
void XTensor::SetDataBatched(MTYPE * offsets, void * values, int num) void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
{ {
_SetDataWithOffsetAndValue(this, offsets, values, num); _SetDataWithOffsetAndValue(this, offsets, values, num);
} }
...@@ -1289,7 +1289,7 @@ int XTensor::GetNonzeroSize() ...@@ -1289,7 +1289,7 @@ int XTensor::GetNonzeroSize()
if(dataType == DEFAULT_DTYPE){ if(dataType == DEFAULT_DTYPE){
int count = 0; int count = 0;
for(int i = 0; i < unitNum; i++){ for(int i = 0; i < unitNum; i++){
DTYPE value = *((DTYPE*)(char*)data + i * sizeof(DTYPE)); DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if(value == 0) if(value == 0)
count++; count++;
} }
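The pointer-cast fix above changes which address is actually read; with DTYPE = float:

/* old: (DTYPE*)(char*)data + i * sizeof(DTYPE)
        advances a float pointer by 4 * i elements, i.e. 16 * i bytes, so it reads
        every 4th value and runs past the end of the buffer
   new: (DTYPE*)((char*)data + i * sizeof(DTYPE))
        advances the byte pointer by 4 * i bytes, i.e. element i, the same as ((DTYPE*)data)[i] */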
...@@ -2271,6 +2271,8 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData) ...@@ -2271,6 +2271,8 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
CheckNTErrors((a != NULL), "Empty input!"); CheckNTErrors((a != NULL), "Empty input!");
memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
if(a->order > 0) if(a->order > 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order); memcpy(dims, a->dimSize, sizeof(int) * a->order);
......
...@@ -285,7 +285,7 @@ public: ...@@ -285,7 +285,7 @@ public:
void SetDataBatched(MTYPE * offsets, DTYPE value, int num); void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
/* set tensor items with an array of values */ /* set tensor items with an array of values */
void SetDataBatched(MTYPE * offsets, void * values, int num); void SetDataBatchedWithValues(MTYPE * offsets, void * values, int num);
/* check whether the data array is the same as the answer */ /* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0); bool CheckData(const void * answer, int num, int beg = 0);
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
/* this is a header to include all functions in the "core" workspace */ /* this is a header to include all functions in the "core" workspace */
...@@ -46,16 +46,17 @@ ...@@ -46,16 +46,17 @@
#include "arithmetic/XTensorBLAS.h" #include "arithmetic/XTensorBLAS.h"
#include "getandset/ConvertDataType.h" #include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h" #include "getandset/Select.h"
#include "getandset/SetData.h" #include "getandset/SetData.h"
#include "math/Clip.h" #include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h" #include "math/Normalize.h"
#include "math/Power.h" #include "math/Power.h"
#include "math/ScaleAndShift.h" #include "math/ScaleAndShift.h"
#include "math/Unary.h" #include "math/Unary.h"
#include "movement/CopyBlocks.h" #include "movement/CopyBlocks.h"
#include "movement/CopyBlocksInGrid.h" #include "movement/CopyBlocksInGrid.h"
#include "movement/CopyBlocksOnSite.h" #include "movement/CopyBlocksOnSite.h"
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
convert onehot tensor to index tensor
>> onehot - onehot tensor, which value is 0 or 1
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, size);
return;
}
#endif
int blockNum = index->unitNum;
int stride = size;
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) {
int * od = onehotData + i * stride;
int record = -1;
for (int j = 0; j < stride; j++) {
if (od[j] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
}
indexData[i] = record;
}
}
/*
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
<< return - the index tensor
*/
XTensor OnehotToIndex(XTensor & onehot, int size)
{
CheckNTErrors(onehot.GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor index;
InitTensor(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
index.SetTMPFlag();
_OnehotToIndex(&onehot, &index, size);
return index;
}
/*
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
onehot->SetZeroAll();
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size);
return;
}
#endif
int blockNum = index->unitNum;
int stride = size;
int * indexData = (int *)index->data;
int * onehotData = (int *)onehot->data;
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
int * od = onehotData + i * stride;
od[id] = 1;
}
}
/*
convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
<< return - the onehot tensor
*/
XTensor IndexToOnehot(XTensor & index, int size)
{
CheckNTErrors(index.dataType == X_INT, "The index tensor must be in X_INT!")
XTensor onehot;
onehot.SetTMPFlag();
int order = index.order;
int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size);
delete[] dim;
return onehot;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
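A minimal usage sketch of the two new conversions (CPU tensor, illustrative sizes; the init call mirrors the forms used elsewhere in this commit):

int ids[3] = {1, 4, 0};

XTensor index;
InitTensor1D(&index, 3, X_INT, -1);            // devID = -1 keeps the tensor on the CPU
index.SetData(ids, 3);

XTensor onehot = IndexToOnehot(index, 5);      // (3, 5) X_INT tensor with a single 1 per row
XTensor recovered = OnehotToIndex(onehot, 5);  // 1-D X_INT tensor holding {1, 4, 0} again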
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include "OnehotAndIndex.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
convert onehot tensor to index tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of block
>> stride - stride of a data block
*/
__global__
void KernelOnehotToIndex(int * onehotData, int * indexData, int blockNum, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockNum || offset >= stride)
return;
int * od = onehotData + i * stride;
int * id = indexData + i;
if (od[offset] != 0)
*id = offset;
}
/*
convert onehot tensor to index tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
*/
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
int devID = onehot->devID;
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelOnehotToIndex<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
}
/*
convert index tensor to onehot tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of block
>> stride - stride of a data block
*/
__global__
void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockNum || offset >= stride)
return;
int * od = onehotData + i * stride;
int id = indexData[i];
od[id] = 1;
}
/*
convert index tensor to onehot tensor (cuda version)
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
{
int devID = onehot->devID;
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#ifndef __ONEHOTANDINDEX_CUH__
#define __ONEHOTANDINDEX_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor (cuda version) */
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#ifndef __ONEHOTANDINDEX_H__
#define __ONEHOTANDINDEX_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_H__
\ No newline at end of file
...@@ -70,8 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain) ...@@ -70,8 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize; fanOut = numOutputFmaps * receptiveFieldSize;
} }
DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut)); DTYPE std = gain * (float)sqrt(2.0 / (fanIn + fanOut));
tensor->SetDataRand(-finfout, finfout); DTYPE a = (DTYPE)sqrt(3.0F) * std;
tensor->SetDataRand(-a, a);
//_SetDataRand(tensor, -finfout, finfout); //_SetDataRand(tensor, -finfout, finfout);
} }
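For reference, the rewritten initialization keeps the same uniform range as before and only derives it from the standard deviation (Xavier/Glorot style): a uniform distribution on [-a, a] has standard deviation a / sqrt(3), so

/* a = sqrt(3) * std = gain * sqrt(3) * sqrt(2 / (fanIn + fanOut))
     = gain * sqrt(6 / (fanIn + fanOut))   -- the old finfout bound */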
...@@ -499,36 +500,9 @@ void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values ...@@ -499,36 +500,9 @@ void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values
} }
else { else {
#ifdef USE_CUDA #ifdef USE_CUDA
XMem * mem = tensor->mem; if(tensor->devID >= 0) {
MTYPE offsetSize = num * sizeof(MTYPE); _CudaSetDataWithOffsetAndValue(tensor, offsets, values, num);
MTYPE valueSize; return;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
_CudaSetDataWithOffsetAndValue(tensor, offsetsCuda, valuesCuda, num);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, offsetsCuda);
XMemFree(tensor->devID, valuesCuda);
} }
#else #else
ShowNTErrors("Please recompile the code with USE_CUDA"); ShowNTErrors("Please recompile the code with USE_CUDA");
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "SetData.cuh" #include "SetData.cuh"
#include <curand_kernel.h> #include <curand_kernel.h>
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -363,7 +364,7 @@ e.g., for a 3* 3 tensor, ...@@ -363,7 +364,7 @@ e.g., for a 3* 3 tensor,
2 2 0 2 2 0
*/ */
__global__ __global__
void _KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift) void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
{ {
/* offset in each block */ /* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -425,7 +426,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift) ...@@ -425,7 +426,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift); KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
...@@ -474,12 +475,12 @@ set the data with an array of offsets (kernel version) ...@@ -474,12 +475,12 @@ set the data with an array of offsets (kernel version)
>> num - number of the data items >> num - number of the data items
*/ */
__global__ __global__
void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num) void KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
{ {
/* index */ /* index */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < num) if (i < num)
data[offsets[i]] = value; data[offsets[i]] = value;
} }
...@@ -505,7 +506,7 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP ...@@ -505,7 +506,7 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num); KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num);
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
...@@ -519,7 +520,7 @@ set the data with an array of offsets (kernel version) ...@@ -519,7 +520,7 @@ set the data with an array of offsets (kernel version)
>> dataType - the data type of the data and values >> dataType - the data type of the data and values
*/ */
__global__ __global__
void _KernelSetDataWithOffset(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType) void KernelSetDataWithOffsetAndValue(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
{ {
/* index */ /* index */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -541,6 +542,18 @@ set the data with an array of values ...@@ -541,6 +542,18 @@ set the data with an array of values
*/ */
void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num) void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
{ {
XMem * mem = tensor->mem;
MTYPE offsetSize = num * sizeof(MTYPE);
MTYPE valueSize;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
...@@ -552,7 +565,32 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va ...@@ -552,7 +565,32 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > (tensor->data, offsets, values, num, tensor->dataType); MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, mem->devID, values, -1, valueSize);
}
else {
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
}
KernelSetDataWithOffsetAndValue<<<blocks, threads >>> (tensor->data, offsetsCuda, valuesCuda, num, tensor->dataType);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, valuesCuda);
XMemFree(tensor->devID, offsetsCuda);
}
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
DTYPE myIsEqual(DTYPE a, DTYPE b)
{
return (a == b ? 1.0F : 0.0F);
}
DTYPE myIsNotEqual(DTYPE a, DTYPE b)
{
return (a != b ? 1.0F : 0.0F);
}
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, number); \
return; \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we don't need to make the link here.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we don't need to make the link here.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
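A minimal usage sketch of the new comparison functions (CPU tensor, illustrative values):

float vals[6] = {0.0F, 2.0F, 5.0F, 0.0F, 7.0F, 0.0F};

XTensor a;
InitTensor2D(&a, 2, 3, X_FLOAT, -1);
a.SetData(vals, 6);

XTensor mask = NotEqual(a, 0.0F);  // 1.0F where a != 0, i.e. {{0, 1, 1}, {0, 1, 0}}
_EqualMe(&a, 0.0F);                // in-place: a becomes {{1, 0, 0}, {1, 0, 1}}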
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
__device__
DTYPE cudaIsEqual(DTYPE a, DTYPE b)
{
return (a == b ? 1.0F : 0.0F);
}
__device__
DTYPE cudaIsNotEqual(DTYPE a, DTYPE b)
{
return (a != b ? 1.0F : 0.0F);
}
#define SIMPLE_COMPARE_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size, DTYPE number) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (DTYPE)origFunc(a[i], number); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size, __half number) \
{ \
return; \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, \
a->unitNum, (DTYPE)number); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, \
a->unitNum, (__half)number); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
SIMPLE_COMPARE_FUNCTION_GPU(Equal, cudaIsEqual)
SIMPLE_COMPARE_FUNCTION_GPU(NotEqual, cudaIsNotEqual)
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __COMPARE_CUH__
#define __COMPARE_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compare whether every entry is equal to the specified value (cuda kernel) */
__global__
void KernelEqual(DTYPE * a, DTYPE * b, int size, DTYPE number);
/* compare whether every entry is equal to the specified value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda kernel) */
__global__
void KernelNotEqual(DTYPE * a, DTYPE * b, int size, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE number);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif //end __COMPARE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __COMPARE_H__
#define __COMPARE_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compare whether every entry is equal to the specified value */
void _Equal(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _EqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Equal(const XTensor & a, DTYPE number);
/* compare whether every entry is not equal to the specified value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _NotEqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is not equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor NotEqual(const XTensor & a, DTYPE number);
} // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__
\ No newline at end of file
...@@ -223,4 +223,4 @@ _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round) ...@@ -223,4 +223,4 @@ _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/ SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#endif #endif
} } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file \ No newline at end of file
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
...@@ -145,5 +144,6 @@ void _TanMe(XTensor * a); ...@@ -145,5 +144,6 @@ void _TanMe(XTensor * a);
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a); XTensor Tan(const XTensor & a);
} } // namespace nts(NiuTrans.Tensor)
#endif //end __UNARY_H__
\ No newline at end of file #endif // end __UNARY_H__
\ No newline at end of file
...@@ -79,8 +79,13 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i ...@@ -79,8 +79,13 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
ProtectCudaDev(devID, devIDBackup); ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */ /* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int)); int * sourceBlocksTMP = myMem != NULL ?
int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int)); (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int)); XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int)); XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "CopyIndexed.h" #include "CopyIndexed.h"
#include "CopyIndexed.cuh"
#include "CopyBlocks.h" #include "CopyBlocks.h"
#include "Gather.h"
#include "../../XName.h" #include "../../XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -40,7 +42,9 @@ copy indexed sub-tensors ...@@ -40,7 +42,9 @@ copy indexed sub-tensors
e.g., for srcIndex = [1,4] and copyNum = 2, e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5 we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum) void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
...@@ -99,7 +103,148 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i ...@@ -99,7 +103,148 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
} }
/* /*
copy indexed sub-tensors (return an XTensor structure) copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
{
int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(indexSize != 0, "NULL index!");
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((srcIndex && tgtIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == tgtIndex->devID || (srcIndex->devID < 0 && tgtIndex->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && srcIndex->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == tgtIndex->unitNum), "Unmatched index tensors!");
for (int i = 0; i < order; i++) {
if (i != dim) {
CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions");
}
else {
CheckNTErrors(t->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
}
}
#ifdef USE_CUDA
if (s->devID >= 0 && srcIndex->devID >= 0) {
_CudaCopyIndexed(s, t, dim, srcIndex, tgtIndex, copyNum);
return;
}
#endif
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int*)srcIndex->data;
int * tIndex = (int*)tgtIndex->data;
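/* copy the data: for each source index (and each of the copyNum consecutive
   sub-tensors that follow it), copy a contiguous run of "stride" elements
   inside every one of the blockNum leading blocks */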
for (int i = 0; i < indexSize; i++) {
for (int c = 0; c < copyNum; c++) {
int si = sIndex[i] + c;
int ti = tIndex[i] + c;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = tData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(td + k) = *(sd + k);
}
}
}
}
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum)
{
CheckNTErrors(dim >= 0 && dim < s.order, "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
int indexSize = srcIndex.unitNum;
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = indexSize * copyNum;
else
dimSize[i] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMPFlag();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);
XList list(3);
list.Add(&s);
list.Add(&srcIndex);
list.Add(&tgtIndex);
/* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
}
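/*
a minimal usage sketch of the tensor-index overload above, assuming the
NewTensor/SetData helpers behave as in the unit tests; the shapes, the X_INT
index type and the index values are illustrative only
*/
static void CopyIndexedUsageSketch()
{
    /* a source tensor of size (3, 2, 3) */
    int sDimSize[3] = {3, 2, 3};
    XTensor * s = NewTensor(3, sDimSize);
    s->SetZeroAll();
    /* two indices along dimension 2 */
    int indexDimSize[1] = {2};
    XTensor * srcIndex = NewTensor(1, indexDimSize, X_INT);
    XTensor * tgtIndex = NewTensor(1, indexDimSize, X_INT);
    int srcIndexData[2] = {0, 2};
    int tgtIndexData[2] = {0, 1};
    srcIndex->SetData(srcIndexData, 2);
    tgtIndex->SetData(tgtIndexData, 2);
    /* copy sub-tensors 0 and 2 of s (along dim 2) into positions 0 and 1 of the result */
    XTensor t = CopyIndexed(*s, 2, *srcIndex, *tgtIndex, 1);
    delete tgtIndex;
    delete srcIndex;
    delete s;
}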
/*
copy indexed sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
>> s - the source tensor >> s - the source tensor
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-30
*/
#include "CopyIndexed.cuh"
#include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
copy selected sub-tensors where indices are kept in tensors (kernel version)
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
__global__
void KernelCopyIndexed(DTYPE * sData, DTYPE * tData, int * sIndex, int * tIndex,
int blockNum, int blockSizeSrc, int blockSizeTgt,
int stride, int indexSize, int copyNum)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum * indexSize * copyNum || offset >= stride)
return;
int realIndexSize = indexSize * copyNum;
int realBlockNum = i / realIndexSize;
int realIndex = i % realIndexSize;
int realSrcIndex = sIndex[realIndex / copyNum] + realIndex % copyNum;
int realTgtIndex = tIndex[realIndex / copyNum] + realIndex % copyNum;
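/* the first thread of each x column caches the source/target base pointers in
   shared memory so that all threads of that column (differing only in y) reuse them */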
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + realBlockNum * blockSizeSrc + realSrcIndex * stride;
tp[threadIdx.x] = tData + realBlockNum * blockSizeTgt + realTgtIndex * stride;
}
__syncthreads();
DTYPE * s = sp[threadIdx.x];
DTYPE * t = tp[threadIdx.x];
t[offset] = s[offset];
}
/*
copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
{
int devID = s->devID;
int order = s->order;
int indexSize = srcIndex->unitNum;
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum * indexSize * copyNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data;
int * tIndex = (int *)tgtIndex->data;
KernelCopyIndexed<<<blocks, threads >>>(sData, tData, sIndex, tIndex,
blockNum, blockSizeSrc, blockSizeTgt,
stride, indexSize, copyNum);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-30
* Tomorrow is the celebration of the laboratory; I'm so happy!
*/
#ifndef __CopyIndexed_CUH__
#define __CopyIndexed_CUH__
#include "../../XTensor.h"
#include "CopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy selected sub-tensors where indices are kept in tensors (cuda version) */
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __CopyIndexed_CUH__
\ No newline at end of file
...@@ -27,22 +27,27 @@ ...@@ -27,22 +27,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */ /* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum = 1);
/* copy selected sub-tensors where indices are kept in tensors */ /* copy selected sub-tensors where indices are kept in tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex); void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum = 1);
/* /*
copy selected sub-tensors (return an XTensor structure) copy selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???) make a new tensor to keep the result and return it (remove this???)
*/ */
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* /*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure) copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???) make a new tensor to keep the result and return it
*/ */
void CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex); XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum = 1);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -57,15 +57,14 @@ gather indexed sub-tensors ...@@ -57,15 +57,14 @@ gather indexed sub-tensors
>> t - the target tensor >> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
*/ */
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex) void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID && t->devID == srcIndex->devID), CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
"the data must be kept on the same device!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0 && srcIndex->devID >= 0) { if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex); _CudaGather(s, t, srcIndex);
return; return;
} }
...@@ -116,6 +115,8 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -116,6 +115,8 @@ XTensor Gather(XTensor &s, XTensor &index)
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem); XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMPFlag(); t.SetTMPFlag();
delete[] dimSize;
_Gather(&s, &t, &index); _Gather(&s, &t, &index);
/* tensor connection */ /* tensor connection */
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-27
*/ */
#include "Gather.cuh" #include "Gather.cuh"
#include "CopyBlocksSelected.cuh" #include "CopyBlocksSelected.cuh"
...@@ -41,7 +41,7 @@ __global__ ...@@ -41,7 +41,7 @@ __global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride) void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
{ {
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */ /* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -54,15 +54,15 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int ...@@ -54,15 +54,15 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
if(threadIdx.y == 0){ if(threadIdx.y == 0){
sp[threadIdx.x] = sData + sIndex[i] * stride; sp[threadIdx.x] = sData + sIndex[i] * stride;
cp[threadIdx.x] = tData + i * stride; tp[threadIdx.x] = tData + i * stride;
} }
__syncthreads(); __syncthreads();
DTYPE * s = sp[threadIdx.x]; DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x]; DTYPE * t = tp[threadIdx.x];
c[offset] = s[offset]; t[offset] = s[offset];
} }
/* /*
...@@ -72,9 +72,10 @@ gather indexed sub-tensors(cuda version) ...@@ -72,9 +72,10 @@ gather indexed sub-tensors(cuda version)
>> t - the target tensor >> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
*/ */
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex) void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{ {
int devID = s->devID; int devID = s->devID;
XMem * mem = s->mem;
int stride = s->GetDim(1); int stride = s->GetDim(1);
int indexSize = srcIndex->unitNum; int indexSize = srcIndex->unitNum;
...@@ -93,10 +94,26 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -93,10 +94,26 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
DTYPE * sData = (DTYPE*)s->data; DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data; DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data; int * sIndex = NULL;
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
sIndex = (int *)srcIndex->data;
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride); KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
else
XMemFree(devID, sIndex);
}
BacktoCudaDev(devID, devIDBackup); BacktoCudaDev(devID, devIDBackup);
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-27
*/ */
#ifndef __GATHER_CUH__ #ifndef __GATHER_CUH__
#define __GATHER_CUH__ #define __GATHER_CUH__
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* gather indexed sub-tensors(cuda version) */ /* gather indexed sub-tensors(cuda version) */
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex); void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize); void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */ /* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex); void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors (return an XTensor structure) /* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
......
...@@ -134,63 +134,92 @@ void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum, ...@@ -134,63 +134,92 @@ void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum,
/* /*
spread a collection tensor to source tensor. spread a collection tensor to source tensor.
And this is a special spread function for backward computation of gather function. And this is a special spread function for backward computation of CopyIndexed function.
>> source - the source tensor whose data would be modified >> s - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor >> c - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors" >> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2, e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2) we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors >> srcIndex - the tensor to save the index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex) >> collIndex - the tensor to save the index of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim, void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
int * srcIndex, int indexSize) XTensor * srcIndex, XTensor * collIndex,
int copyNum)
{ {
int order = source->order; int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(indexSize != 0, "NULL index!");
CheckNTErrors((s && c), "Invalid tensors!");
CheckNTErrors((srcIndex && collIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == c->devID || (s->devID < 0 && c->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == collIndex->devID || (srcIndex->devID < 0 && collIndex->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && srcIndex->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < s->order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == c->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == collIndex->unitNum), "Unmatched index tensors!");
CheckNTErrors(s->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!"); CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
for (int i = 0; i < order; i++){ for (int i = 0; i < order; i++) {
if (i == dim) { if (i != dim) {
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!"); CheckNTErrors(s->GetDim(i) == c->GetDim(i), "Unmatched dimensions");
} }
else { else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!"); CheckNTErrors(c->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
} }
} }
#ifdef USE_CUDA #ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) { if(s->devID >= 0 && c->devID >= 0) {
_CudaSpreadForGather(source, collection, dim, srcIndex, indexSize); _CudaSpreadForCopyIndexed(s, c, dim, srcIndex, collIndex, copyNum);
return; return;
} }
#endif #endif
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1; int blockNum = 1;
int stride = 1; int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = dim + 1; i < order; i++) { for (int i = 0; i < dim; i++)
stride *= source->GetDim(i); blockNum *= s->GetDim(i);
}
blockSizeSrc = stride * source->GetDim(dim); for (int i = dim + 1; i < order; i++)
blockSizeColl = stride * collection->GetDim(dim); stride *= s->GetDim(i);
blockNum = source->unitNum / blockSizeSrc;
DTYPE * sData = (DTYPE*)source->data; blockSizeSrc = stride * s->GetDim(dim);
DTYPE * cData = (DTYPE*)collection->data; blockSizeTgt = stride * c->GetDim(dim);
for(int i = 0; i < indexSize; i++){ DTYPE * sData = (DTYPE*)s->data;
int src = srcIndex[i]; DTYPE * cData = (DTYPE*)c->data;
int tgt = i; int * sIndex = (int*)srcIndex->data;
DTYPE * s = sData + src * stride; int * cIndex = (int*)collIndex->data;
DTYPE * c = cData + tgt * stride;
_AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride); for (int i = 0; i < indexSize; i++) {
for (int copy = 0; copy < copyNum; copy++) {
int si = sIndex[i] + copy;
int ti = cIndex[i] + copy;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = cData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(sd + k) += *(td + k);
}
}
} }
} }
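/*
a small worked example of the accumulation above (illustrative values): with
dim = 0 (so blockNum = 1), srcIndex = [1], collIndex = [0] and copyNum = 2,
the loop performs s[1] += c[0] and s[2] += c[1] over whole sub-tensors of size
"stride"; duplicated source indices therefore accumulate rather than overwrite
*/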
/* /*
...@@ -218,7 +247,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) ...@@ -218,7 +247,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
} }
#ifdef USE_CUDA #ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0 && index->devID >= 0) { if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, index); _CudaSpreadForGather(source, collection, index);
return; return;
} }
......
...@@ -19,13 +19,11 @@ ...@@ -19,13 +19,11 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/ */
#ifndef __SPREAD_CUH__
#define __SPREAD_CUH__
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Spread.cuh" #include "Spread.cuh"
#include "CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -60,7 +58,6 @@ void KernelSpread(DTYPE * sData, DTYPE * cData, int blockNum, ...@@ -60,7 +58,6 @@ void KernelSpread(DTYPE * sData, DTYPE * cData, int blockNum,
s[j] = c[j]; s[j] = c[j];
} }
/* /*
This is core assignment for spread function. This is core assignment for spread function.
...@@ -76,9 +73,9 @@ This is core assignment for spread function. ...@@ -76,9 +73,9 @@ This is core assignment for spread function.
*/ */
__global__ __global__
void KernelSpreadFuzed(DTYPE * sData, DTYPE * cData, int blockNum, void KernelSpreadFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride, int blockSizeSrc, int blockSizeColl, int stride,
int subtensorNum, int subtensorNum,
int * srcIndex, int * colIndex) int * srcIndex, int * colIndex)
{ {
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
...@@ -189,7 +186,7 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim, ...@@ -189,7 +186,7 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
XMemCopy(ci, mem->devID, collIndex, -1, sizeof(int) * indexSize); XMemCopy(ci, mem->devID, collIndex, -1, sizeof(int) * indexSize);
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci); stride, indexSize, si, ci);
if(mem != NULL) if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2); mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
...@@ -200,54 +197,25 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim, ...@@ -200,54 +197,25 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
BacktoCudaDev(source->devID, devIDBackup); BacktoCudaDev(source->devID, devIDBackup);
} }
/* /*
This is core assignment for backward computation of gather function. spread a collection tensor to source tensor (kernel version).
Care of the operator "+=" instead of "=". And this is a special spread function for backward computation of CopyIndexed function.
>> sData - the data pointer of the source tensor >> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor >> cData - the data pointer of collection tensor
>> sIndex - index of the source sub-tensor
>> cIndex - index of the sub-tensor in the collection tensor
>> blockNum - number of data blocks >> blockNum - number of data blocks
>> blockSizeSrc - size of source data block >> blockSizeSrc - size of source data block
>> blockSizeColl - size of source data block >> blockSizeColl - size of source data block
>> stride - stride of a data block >> stride - stride of a data block
>> indexSize - number of indices
>> copyNum - number of the sub-tensors we copy for each source index
*/ */
__global__ __global__
void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int blockNum, void KernelSpreadForCopyIndexed(DTYPE * sData, DTYPE * cData, int * sIndex, int * cIndex,
int blockSizeSrc, int blockSizeColl, int stride) int blockNum, int blockSizeSrc, int blockSizeColl,
{ int stride, int indexSize, int copyNum)
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum || j >= stride)
return;
DTYPE * s = sData + blockSizeSrc * i;
DTYPE * c = cData + blockSizeColl * i;
s[j] += c[j];
}
/*
This is core assignment for backward computation of gather function.
Care of the operator "+=" instead of "=".
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of source data block
>> stride - stride of a data block
>> subtensorNum - number of sub-tensors
>> srcIndex - index of the source sub-tensor
*/
__global__
void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride,
int subtensorNum,
int * srcIndex)
{ {
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
...@@ -258,105 +226,140 @@ void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum, ...@@ -258,105 +226,140 @@ void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
/* offset in each block */ /* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y; int offset = blockDim.y * blockIdx.y + threadIdx.y;
int blockId = i % blockNum; int realIndexSize = indexSize * copyNum;
int subtensorId = i / blockNum;
int realBlockNum = i / realIndexSize;
if(subtensorId >= subtensorNum || offset >= stride) int tmp = i % realIndexSize;
int realIndex = tmp / copyNum;
int realCopyNum = tmp % copyNum;
if (realBlockNum >= blockNum || offset >= stride || realIndex >= indexSize || realCopyNum >= copyNum)
return; return;
int realSrcIndex = sIndex[realIndex] + realCopyNum;
int realCollIndex = cIndex[realIndex] + realCopyNum;
if(threadIdx.y == 0){ if(threadIdx.y == 0){
sp[threadIdx.x] = sData + srcIndex[subtensorId] * stride; sp[threadIdx.x] = sData + realBlockNum * blockSizeSrc + realSrcIndex * stride;
cp[threadIdx.x] = cData + subtensorId * stride; cp[threadIdx.x] = cData + realBlockNum * blockSizeColl + realCollIndex * stride;
} }
__syncthreads(); __syncthreads();
DTYPE * s = sp[threadIdx.x] + blockSizeSrc * blockId; DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x] + blockSizeColl * blockId; DTYPE * c = cp[threadIdx.x];
atomicAdd(s + offset, c[offset]);
s[offset] += c[offset];
} }
/* /*
spread a collection tensor to source tensor (cuda version). spread a collection tensor to source tensor.
And this is a special spread function for backward computation of gather function. And this is a special spread function for backward computation of CopyIndexed function.
>> source - the source tensor whose data would be modified >> s - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor >> c - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors" >> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2, e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2) we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors >> srcIndex - the tensor to save the index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex) >> collIndex - the tensor to save the index of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim, void _CudaSpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
int * srcIndex, int indexSize) XTensor * srcIndex, XTensor * collIndex,
int copyNum)
{ {
int order = source->order; int devID = s->devID;
int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1; int blockNum = 1;
int stride = 1; int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = dim + 1; i < order; i++) for (int i = 0; i < dim; i++)
stride *= source->GetDim(i); blockNum *= s->GetDim(i);
blockSizeSrc = stride * source->GetDim(dim); for (int i = dim + 1; i < order; i++)
blockSizeColl = stride * collection->GetDim(dim); stride *= s->GetDim(i);
blockNum = source->unitNum / blockSizeSrc;
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * c->GetDim(dim);
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
int devIDBackup; int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup); ProtectCudaDev(devID, devIDBackup);
if(indexSize < 4){ GDevs.GetCudaThread2D(devID, blockNum * indexSize * copyNum, stride, MAX_INT, cudaGrids, cudaBlocks);
GDevs.GetCudaThread2D(source->devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]); dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]); dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
for(int i = 0; i < indexSize; i++) {
int src = srcIndex[i];
int tgt = i;
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
KernelSpreadForGather<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride); DTYPE * sData = (DTYPE*)s->data;
} DTYPE * cData = (DTYPE*)c->data;
}
else{
XMem * mem = source->mem;
int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(source->devID, sizeof(int) * indexSize);
XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize); int * sIndex = (int *)srcIndex->data;
int * cIndex = (int *)collIndex->data;
DTYPE * s = (DTYPE*)source->data; KernelSpreadForCopyIndexed<<<blocks, threads >>>(sData, cData, sIndex, cIndex,
DTYPE * c = (DTYPE*)collection->data; blockNum, blockSizeSrc, blockSizeTgt,
stride, indexSize, copyNum);
GDevs.GetCudaThread2D(source->devID, blockNum * indexSize, stride, MAX_INT, cudaGrids, cudaBlocks); BacktoCudaDev(devID, devIDBackup);
}
dim3 blocks(cudaGrids[0], cudaGrids[1]); /*
dim3 threads(cudaBlocks[0], cudaBlocks[1]); This is core assignment for backward computation of gather function.
Note that the operator is "+=" instead of "=".
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> srcIndex - index of the source sub-tensor
>> indexSize - the number of index
>> stride - stride of a data block
*/
__global__
void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int * srcIndex,
int indexSize, int stride)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si); /* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (mem != NULL) { /* offset in each block */
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize); int offset = blockDim.y * blockIdx.y + threadIdx.y;
}
else { if(i >= indexSize || offset >= stride)
XMemFree(source->devID, si); return;
}
if (threadIdx.y == 0) {
sp[threadIdx.x] = sData + srcIndex[i] * stride;
cp[threadIdx.x] = cData + i * stride;
} }
__syncthreads();
DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x];
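/* different i may carry the same srcIndex[i], so the write back into the source
   gradient uses atomicAdd to avoid lost updates between threads */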
atomicAdd(s + offset, c[offset]);
} }
/* /*
...@@ -369,40 +372,48 @@ And this is a special spread function for backward computation of gather functio ...@@ -369,40 +372,48 @@ And this is a special spread function for backward computation of gather functio
*/ */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex) void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex)
{ {
int dim = 0;
int devID = source->devID; int devID = source->devID;
XMem * mem = source->mem;
int blockNum = 1;
int stride = source->GetDim(1); int stride = source->GetDim(1);
int indexSize = srcIndex->unitNum; int indexSize = srcIndex->unitNum;
int blockSizeSrc = stride * source->GetDim(dim);
int blockSizeColl = stride * collection->GetDim(dim);
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
int devIDBackup; int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup); ProtectCudaDev(source->devID, devIDBackup);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
int * sIndex = NULL;
GDevs.GetCudaThread2D(devID, indexSize, stride, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(devID, indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]); dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]); dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * s = (DTYPE*)source->data; if (srcIndex->devID < 0) {
DTYPE * c = (DTYPE*)collection->data; sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
sIndex = (int *)srcIndex->data;
int * si = (int *)srcIndex->data; KernelSpreadForGather<<<blocks, threads >>>(sData, cData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
else
XMemFree(devID, sIndex);
}
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si);
BacktoCudaDev(source->devID, devIDBackup); BacktoCudaDev(source->devID, devIDBackup);
} }
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SPREAD_CUH__
\ No newline at end of file
...@@ -32,9 +32,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,9 +32,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _CudaSpread(XTensor * source, XTensor * collection, int dim, void _CudaSpread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex); int * srcIndex, int indexSize, int * collIndex);
/* special spread function for backward computation of gather function (cuda version) */ /* special spread function for backward computation of CopyIndexed function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim, void _CudaSpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
int * srcIndex, int indexSize); XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function (cuda version) */ /* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex); void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex);
......
...@@ -36,9 +36,10 @@ void Spread(XTensor * source, XTensor * collection, ...@@ -36,9 +36,10 @@ void Spread(XTensor * source, XTensor * collection,
XTensor * srcIndex, XTensor * collIndex, XTensor * srcIndex, XTensor * collIndex,
int dim); int dim);
/* special spread function for backward computation of gather function */ /* special spread function for backward computation of CopyIndexed function */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim, void _SpreadForCopyIndexed(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize); XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function */ /* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index); void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index);
......
...@@ -48,18 +48,18 @@ DTYPE _ReduceSumAll(const XTensor * source) ...@@ -48,18 +48,18 @@ DTYPE _ReduceSumAll(const XTensor * source)
{ {
int dims[2] = {1, source->unitNum}; int dims[2] = {1, source->unitNum};
int one = 1; int one = 1;
XTensor * all = NewTensorBuf(2, dims, source->dataType, source->denseRatio, source->devID, source->mem); XTensor * all = NewTensorBuf(2, dims, source->dataType, source->denseRatio, source->devID, source->mem);
XTensor * result = NewTensorBuf(1, &one, source->dataType, 1.0F, source->devID, source->mem); XTensor * result = NewTensorBuf(1, &one, source->dataType, 1.0F, source->devID, source->mem);
_CopyValues(source, all); _CopyValues(source, all);
_ReduceSum(all, result, 1); _ReduceSum(all, result, 1);
DTYPE r = result->Get1D(0); DTYPE r = result->Get1D(0);
DelTensorBuf(result); DelTensorBuf(result);
DelTensorBuf(all); DelTensorBuf(all);
return r; return r;
int order = source->order; int order = source->order;
...@@ -76,7 +76,7 @@ DTYPE _ReduceSumAll(const XTensor * source) ...@@ -76,7 +76,7 @@ DTYPE _ReduceSumAll(const XTensor * source)
dimSize = getDimSize(big, leadingDim); dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem); source->devID, source->mem);
_ReduceSum(big, little, leadingDim); _ReduceSum(big, little, leadingDim);
delete big; delete big;
......
...@@ -126,7 +126,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -126,7 +126,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data; void * dataTMP = t->data;
if (!isOnSameDevice) if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size); dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
int realBlockSize = blockSize * t->unitSize; int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum; int blockSplitSize = blockNum / splitNum;
......
...@@ -153,7 +153,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) ...@@ -153,7 +153,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
int unitNum = x.dimSize[n]; int unitNum = x.dimSize[n];
DTYPE * maskArray = new DTYPE[unitNum]; DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL)); //srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++) for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor); maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
...@@ -166,4 +166,33 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) ...@@ -166,4 +166,33 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
return MultiplyDim(x, mask, n, 0); return MultiplyDim(x, mask, n, 0);
} }
/*
dropout function without broadcast
>> x - input tensor
>> dropProb - probability to set an element to zero
*/
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
{
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor mask;
InitTensor(&mask, x.order, x.dimSize, x.dataType, x.denseRatio, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
return Multiply(x, mask);
}
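/*
a minimal usage sketch (illustrative input and probability): for an activation x,
    XTensor y = DropoutWithoutBroadcast(x, 0.1F);
zeroes each element with probability 0.1 and scales the kept elements by
1/(1 - 0.1), so the expectation of y matches x
*/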
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -42,6 +42,9 @@ void _DropoutBackward(const XTensor * y, const XTensor * x, ...@@ -42,6 +42,9 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
/* dropout function */ /* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1); XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -373,9 +373,9 @@ better numerical stability. ...@@ -373,9 +373,9 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction) >> leadDim - leading dimension (along which we perform reduction)
*/ */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
leadDim = leadDim < 0 ? y->order - 1 : leadDim; leadDim = leadDim < 0 ? y->order - 1 : leadDim;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/ */
#ifndef __TEST_ABSOLUTE_H__ #ifndef __TEST_ABSOLUTE_H__
#define __TEST_ABSOLUTE_H__ #define __TEST_ABSOLUTE_H__
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
#include "../XTensor.h" #include "../XTensor.h"
#include "../core/math/Clip.h"
#include "TClip.h" #include "TClip.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
#ifndef __TEST_CLIP_H__ #ifndef __TEST_CLIP_H__
#define __TEST_CLIP_H__ #define __TEST_CLIP_H__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#include "../XTensor.h"
#include "../core/math/Compare.h"
#include "TCompare.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Equal function.
Compare whether every entry is equal to the specified value.
*/
bool TestCompare1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, 1.0F} };
DTYPE answer[3][2] = { {1.0F, 0.0F},
{0.0F, 0.0F},
{0.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Equal function */
_Equal(a, b, 1.0);
_EqualMe(aMe, 1.0);
bUser = Equal(*a, 1.0);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Equal function */
_Equal(aGPU, bGPU, 1.0);
_EqualMe(aMeGPU, 1.0);
bUserGPU = Equal(*aGPU, 1.0);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Compare Function */
bool TestCompare()
{
XPRINT(0, stdout, "[TEST Compare] compare every entry with specified value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestCompare1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __TEST_COMPARE_H__ #define __TEST_COMPARE_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Compare Function */
bool TestCompare();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_COMPARE_H__
...@@ -232,13 +232,12 @@ bool TestConvertDataType3() ...@@ -232,13 +232,12 @@ bool TestConvertDataType3()
/* initialize variables */ /* initialize variables */
a->SetData(data1, unitNum1); a->SetData(data1, unitNum1);
/* call ConvertDataType function */ /* call ConvertDataType function (not implemented yet) */
//_ConvertDataType(a, b); //_ConvertDataType(a, b);
//_ConvertDataType(b, c); //_ConvertDataType(b, c);
/* check results */ /* check results */
cpuTest = a->CheckData(data1, unitNum1, 1e-4F); //cpuTest = a->CheckData(data1, unitNum1, 1e-4F);
c->Dump(stderr, "");
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
......
...@@ -51,6 +51,15 @@ bool TestCopyIndexed1() ...@@ -51,6 +51,15 @@ bool TestCopyIndexed1()
int tUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
...@@ -76,19 +85,28 @@ bool TestCopyIndexed1() ...@@ -76,19 +85,28 @@ bool TestCopyIndexed1()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -96,35 +114,55 @@ bool TestCopyIndexed1() ...@@ -96,35 +114,55 @@ bool TestCopyIndexed1()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -159,6 +197,15 @@ bool TestCopyIndexed2() ...@@ -159,6 +197,15 @@ bool TestCopyIndexed2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F}, { {1.0F, 2.0F, 4.0F},
...@@ -183,19 +230,28 @@ bool TestCopyIndexed2() ...@@ -183,19 +230,28 @@ bool TestCopyIndexed2()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -203,35 +259,55 @@ bool TestCopyIndexed2() ...@@ -203,35 +259,55 @@ bool TestCopyIndexed2()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -265,6 +341,15 @@ bool TestCopyIndexed3() ...@@ -265,6 +341,15 @@ bool TestCopyIndexed3()
int tUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (1) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 1;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
...@@ -290,19 +375,28 @@ bool TestCopyIndexed3() ...@@ -290,19 +375,28 @@ bool TestCopyIndexed3()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -310,35 +404,55 @@ bool TestCopyIndexed3() ...@@ -310,35 +404,55 @@ bool TestCopyIndexed3()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -374,15 +488,13 @@ bool TestCopyIndexed4() ...@@ -374,15 +488,13 @@ bool TestCopyIndexed4()
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */ /* an index tensor of size (2) */
int iOrder = 3; int indexOrder = 1;
int * iDimSize = new int[iOrder]; int * indexDimSize = new int[indexOrder];
iDimSize[0] = 3; indexDimSize[0] = 2;
iDimSize[1] = 2;
iDimSize[2] = 2;
int iUnitNum = 1; int indexUnitNum = 1;
for (int i = 0; i < iOrder; i++) for (int i = 0; i < indexOrder; i++)
iUnitNum *= iDimSize[i]; indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
...@@ -408,21 +520,28 @@ bool TestCopyIndexed4() ...@@ -408,21 +520,28 @@ bool TestCopyIndexed4()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(tOrder, tDimSize, X_INT); XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
index->SetData(srcIndex, iUnitNum); t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, (int*)index->data, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, (int*)index->data, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -430,42 +549,60 @@ bool TestCopyIndexed4() ...@@ -430,42 +549,60 @@ bool TestCopyIndexed4()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, (int*)index->data, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete index; delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
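/*
A compact sketch (illustrative only, not part of the test file) of the two
_CopyIndexed call forms compared in the cases above: the original raw-pointer
form and the new index-tensor form. All shapes, index values and the helper
name CopyIndexedCallSketch are assumptions made for the example.
*/
void CopyIndexedCallSketch()
{
    int sDimSize[3] = {3, 2, 3};
    int tDimSize[3] = {3, 2, 2};
    int idxDimSize[1] = {2};
    int srcIdx[2] = {0, 2};
    int tgtIdx[2] = {0, 1};

    XTensor * s = NewTensor(3, sDimSize);
    XTensor * t = NewTensor(3, tDimSize);
    XTensor * sIndex = NewTensor(1, idxDimSize, X_INT);
    XTensor * tIndex = NewTensor(1, idxDimSize, X_INT);

    s->SetZeroAll();
    t->SetZeroAll();
    sIndex->SetData(srcIdx, 2);
    tIndex->SetData(tgtIdx, 2);

    /* raw-pointer form: indices passed as plain int arrays */
    _CopyIndexed(s, t, 2, srcIdx, 2, tgtIdx, 1);

    /* index-tensor form: indices live in X_INT tensors */
    _CopyIndexed(s, t, 2, sIndex, tIndex, 1);
    XTensor tUser = CopyIndexed(*s, 2, *sIndex, *tIndex, 1);

    delete s;
    delete t;
    delete sIndex;
    delete tIndex;
}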
/* /*
case 5: copy indexed sub-tensors case 5: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 1, In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 1,
...@@ -494,6 +631,15 @@ bool TestCopyIndexed5() ...@@ -494,6 +631,15 @@ bool TestCopyIndexed5()
int tUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
...@@ -519,19 +665,28 @@ bool TestCopyIndexed5() ...@@ -519,19 +665,28 @@ bool TestCopyIndexed5()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -539,35 +694,55 @@ bool TestCopyIndexed5() ...@@ -539,35 +694,55 @@ bool TestCopyIndexed5()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -311,8 +311,8 @@ bool TestCrossEntropy3() ...@@ -311,8 +311,8 @@ bool TestCrossEntropy3()
delete goldGPU; delete goldGPU;
delete lossGPU; delete lossGPU;
delete weightGPU; delete weightGPU;
delete[] dimSize; delete[] dimSize;
delete[] wDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -322,6 +322,7 @@ bool TestCrossEntropy3() ...@@ -322,6 +322,7 @@ bool TestCrossEntropy3()
delete loss; delete loss;
delete weight; delete weight;
delete[] dimSize; delete[] dimSize;
delete[] wDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -212,6 +212,8 @@ bool TestDropout2() ...@@ -212,6 +212,8 @@ bool TestDropout2()
/* destroy variables */ /* destroy variables */
delete x; delete x;
delete y; delete y;
delete dedx;
delete dedy;
delete[] dimSize; delete[] dimSize;
return cpuTest; return cpuTest;
......
...@@ -332,6 +332,7 @@ bool TestGather3() ...@@ -332,6 +332,7 @@ bool TestGather3()
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t;
delete index;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
......
...@@ -150,6 +150,7 @@ bool TestSetData2() ...@@ -150,6 +150,7 @@ bool TestSetData2()
delete sGPU; delete sGPU;
delete modifyGPU; delete modifyGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -157,6 +158,7 @@ bool TestSetData2() ...@@ -157,6 +158,7 @@ bool TestSetData2()
delete s; delete s;
delete modify; delete modify;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -242,6 +244,7 @@ bool TestSetData3() ...@@ -242,6 +244,7 @@ bool TestSetData3()
delete sGPU; delete sGPU;
delete modifyGPU; delete modifyGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -249,6 +252,7 @@ bool TestSetData3() ...@@ -249,6 +252,7 @@ bool TestSetData3()
delete s; delete s;
delete modify; delete modify;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -61,7 +61,9 @@ bool TestSort1() ...@@ -61,7 +61,9 @@ bool TestSort1()
_SortMe(aMe, index, 0); _SortMe(aMe, index, 0);
Sort(*a, bUser, *index, 0); Sort(*a, bUser, *index, 0);
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum); cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -85,7 +87,9 @@ bool TestSort1() ...@@ -85,7 +87,9 @@ bool TestSort1()
Sort(*aGPU, bUserGPU, *indexGPU, 0); Sort(*aGPU, bUserGPU, *indexGPU, 0);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum); gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a; delete a;
...@@ -149,7 +153,9 @@ bool TestSort2() ...@@ -149,7 +153,9 @@ bool TestSort2()
Sort(*a, bUser, *index, 1); Sort(*a, bUser, *index, 1);
/* check results */ /* check results */
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum); cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -173,7 +179,9 @@ bool TestSort2() ...@@ -173,7 +179,9 @@ bool TestSort2()
Sort(*aGPU, bUserGPU, *indexGPU, 1); Sort(*aGPU, bUserGPU, *indexGPU, 1);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum); gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a; delete a;
......
...@@ -357,6 +357,7 @@ bool TestSplit3() ...@@ -357,6 +357,7 @@ bool TestSplit3()
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize1; delete[] tDimSize1;
delete[] tDimSize2; delete[] tDimSize2;
delete tList;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -182,6 +182,7 @@ bool TestSpread2() ...@@ -182,6 +182,7 @@ bool TestSpread2()
int dim = 0; int dim = 0;
int indexSize = 2; int indexSize = 2;
int srcIndex[2] = {0, 2}; int srcIndex[2] = {0, 2};
int tgtIndex[2] = {0, 1};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -190,17 +191,19 @@ bool TestSpread2() ...@@ -190,17 +191,19 @@ bool TestSpread2()
XTensor * s1 = NewTensor(sOrder, sDimSize); XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize); XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT); XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * cIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */ /* initialize variables */
s1->SetData(sData, sUnitNum); s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum); s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum); t->SetData(tData, tUnitNum);
index->SetData(srcIndex, indexSize); sIndex->SetData(srcIndex, indexSize);
cIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */ /* call _SpreadForGather function */
_SpreadForGather(s1, t, dim, srcIndex, indexSize); _SpreadForCopyIndexed(s1, t, dim, sIndex, cIndex, 1);
_SpreadForGather(s2, t, index); _SpreadForGather(s2, t, sIndex);
/* check results */ /* check results */
cpuTest = s1->CheckData(answer, tUnitNum) && cpuTest = s1->CheckData(answer, tUnitNum) &&
...@@ -214,17 +217,19 @@ bool TestSpread2() ...@@ -214,17 +217,19 @@ bool TestSpread2()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0); XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * cIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
sGPU1->SetData(sData, sUnitNum); sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum); sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum); tGPU->SetData(tData, tUnitNum);
indexGPU->SetData(srcIndex, indexSize); sIndexGPU->SetData(srcIndex, indexSize);
cIndexGPU->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */ /* call _SpreadForGather function */
_SpreadForGather(sGPU1, tGPU, dim, srcIndex, indexSize); _SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndexGPU, cIndexGPU, 1);
_SpreadForGather(sGPU2, tGPU, indexGPU); _SpreadForGather(sGPU2, tGPU, sIndexGPU);
/* check results */ /* check results */
gpuTest = sGPU1->CheckData(answer, tUnitNum) && gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
...@@ -234,11 +239,13 @@ bool TestSpread2() ...@@ -234,11 +239,13 @@ bool TestSpread2()
delete s1; delete s1;
delete s2; delete s2;
delete t; delete t;
delete index; delete sIndex;
delete cIndex;
delete sGPU1; delete sGPU1;
delete sGPU2; delete sGPU2;
delete tGPU; delete tGPU;
delete indexGPU; delete sIndexGPU;
delete cIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
...@@ -249,6 +256,8 @@ bool TestSpread2() ...@@ -249,6 +256,8 @@ bool TestSpread2()
delete s1; delete s1;
delete s2; delete s2;
delete t; delete t;
delete sIndex;
delete cIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
......
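/*
A sketch of the relation the TestSpread2 changes above rely on, stated here as
an assumption read off the test rather than a library guarantee: with
copyNum = 1 and a collection index covering 0..n-1, _SpreadForCopyIndexed
accumulates the same rows into the source gradient that _SpreadForGather does.
The helper name and parameter names are hypothetical.
*/
void SpreadEquivalenceSketch(XTensor * s1, XTensor * s2, XTensor * t,
                             XTensor * rowIndex, XTensor * collIndex)
{
    /* both calls add the rows of t selected by rowIndex into s1/s2 along dim 0 */
    _SpreadForCopyIndexed(s1, t, 0, rowIndex, collIndex, 1);
    _SpreadForGather(s2, t, rowIndex);

    /* after the calls, s1 and s2 are expected to hold identical values */
}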
...@@ -31,6 +31,7 @@ bool Test() ...@@ -31,6 +31,7 @@ bool Test()
wrong = !TestAbsolute() || wrong; wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong; wrong = !TestClip() || wrong;
wrong = !TestCompare() || wrong;
wrong = !TestConcatenate() || wrong; wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong; wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong; wrong = !TestCos() || wrong;
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "TAbsolute.h" #include "TAbsolute.h"
#include "TClip.h" #include "TClip.h"
#include "TCompare.h"
#include "TConcatenate.h" #include "TConcatenate.h"
#include "TConcatenateSolely.h" #include "TConcatenateSolely.h"
#include "TCos.h" #include "TCos.h"
......