Commit 5f933fc6 by xuchen

cumulative update

parent 78954fad
...@@ -35,6 +35,8 @@ ...@@ -35,6 +35,8 @@
void BackwardTest(); void BackwardTest();
void TransposeTest(); void TransposeTest();
void SumDimTest(); void SumDimTest();
void SplitBackwardTest();
void MemTest();
using namespace nts; using namespace nts;
using namespace fnnlm; using namespace fnnlm;
...@@ -42,6 +44,10 @@ using namespace transformer; ...@@ -42,6 +44,10 @@ using namespace transformer;
int main( int argc, const char ** argv ) int main( int argc, const char ** argv )
{ {
//MemTest();
//return 0;
//SplitBackwardTest();
//return 0;
//_CrtSetBreakAlloc(896); //_CrtSetBreakAlloc(896);
//BackwardTest(); //BackwardTest();
//return 0; //return 0;
...@@ -89,7 +95,7 @@ void BackwardTest() ...@@ -89,7 +95,7 @@ void BackwardTest()
c = DivDim(a, b, 0); c = DivDim(a, b, 0);
c.Dump(stderr, "c:"); c.Dump(stderr, "c:");
XLink::ShowNetwork(stderr, &c); //XLink::ShowNetwork(stderr, &c);
net.Backward(c); net.Backward(c);
...@@ -208,4 +214,68 @@ void SumDimTest() ...@@ -208,4 +214,68 @@ void SumDimTest()
z.Dump(stderr, "z:"); z.Dump(stderr, "z:");
delete[] data; delete[] data;
}
void SplitBackwardTest()
{
int * dimSize = new int[2];
dimSize[0] = 2;
dimSize[1] = 4;
XTensor t1;
InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
XTensor t2;
InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
XTensor tensor;
//_SetDataFixedFloat(&t1, 1.0F);
//_SetDataFixedFloat(&t2, 2.0F);
t1.SetDataRand();
t2.SetDataRand();
tensor = t1 + t2;
XList smalls;
XTensor first;
XTensor second;
InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
smalls.Add(&first);
smalls.Add(&second);
Split(tensor, smalls, 1, 2);
XTensor mul;
mul = Sum(first, second);
XNet net;
net.Backward(mul);
net.Dump(stderr);
printf("Done!");
}
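For reference, the column split this test exercises (a 2 x 4 tensor cut into two 2 x 2 pieces along dimension 1) comes down to simple index arithmetic. The sketch below works on plain row-major arrays; SplitColumns is an illustration only, not the library's Split.

#include <cstdio>

/* split a row-major [rows x cols] matrix into splitNum pieces along the
   column dimension; piece p gets columns [p*w, (p+1)*w) with w = cols/splitNum */
void SplitColumns(const float * src, float * dst[], int rows, int cols, int splitNum)
{
    int w = cols / splitNum;
    for (int p = 0; p < splitNum; p++)
        for (int i = 0; i < rows; i++)
            for (int j = 0; j < w; j++)
                dst[p][i * w + j] = src[i * cols + p * w + j];
}

int main()
{
    float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};     /* a 2 x 4 matrix */
    float first[4], second[4];
    float * dst[2] = {first, second};
    SplitColumns(src, dst, 2, 4, 2);
    printf("%g %g %g %g\n", first[0], first[1], first[2], first[3]);     /* 0 1 4 5 */
    printf("%g %g %g %g\n", second[0], second[1], second[2], second[3]); /* 2 3 6 7 */
    return 0;
}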
void MemTest()
{
XMem * mem;
mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
XTensor tensor;
InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
tensor.SetZeroAll();
tensor.Dump(stderr);
delete mem;
if (tensor.mem != NULL) {
printf("It isn't null!\n");
printf("%d\n", (int)tensor.mem->signature);
}
else {
printf("It's null\n");
}
tensor.Dump(stderr);
} }
\ No newline at end of file
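The test above deliberately touches tensor.mem after the memory pool has been deleted, which appears intended to probe how a tensor behaves once its pool is gone under FREE_ON_THE_FLY; the signature field is the usual mechanism for spotting such staleness. A toy sketch of that pattern (not the XMem implementation):

#include <cstdio>

/* toy illustration of the "signature" idea: a pool stamps each allocation with
   its current signature; once the pool is reset the stored stamp no longer
   matches, so stale buffers can be detected */
struct ToyPool {
    unsigned long long signature = 1;
    void Reset() { signature++; }            /* invalidates earlier allocations */
};

struct ToyBuf {
    ToyPool * pool = nullptr;
    unsigned long long signature = 0;
    void Alloc(ToyPool * p) { pool = p; signature = p->signature; }
    bool IsValid() const { return pool != nullptr && signature == pool->signature; }
};

int main()
{
    ToyPool pool;
    ToyBuf buf;
    buf.Alloc(&pool);
    printf("valid: %d\n", buf.IsValid());    /* 1 */
    pool.Reset();
    printf("valid: %d\n", buf.IsValid());    /* 0: signature mismatch */
    return 0;
}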
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "XNoder.h" #include "XNoder.h"
#include "XBackwardShape.h" #include "XBackwardShape.h"
#include "../tensor/XName.h" #include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h" #include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h" #include "../tensor/core/getandset/SetData.h"
...@@ -40,7 +41,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent) ...@@ -40,7 +41,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
if(operID == MOVEMENT_COPYINDEXED) if(operID == MOVEMENT_COPYINDEXED)
GradCopyIndexed(node, isEfficent); GradCopyIndexed(node, isEfficent);
if(operID == MOVEMENT_GATHER) else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent); GradGather(node, isEfficent);
else if(operID == SHAPE_MERGE) else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent); GradMerge(node, isEfficent);
...@@ -80,7 +81,7 @@ gradient computation for copying indexed sub-tensors ...@@ -80,7 +81,7 @@ gradient computation for copying indexed sub-tensors
for for
b = copyindexed(a) b = copyindexed(a)
we have we have
dE/da = spread(b) dE/da = spreadforcopyindexed(dE/db)
>> node - the node (b) for backward computation >> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
an efficient manner an efficient manner
...@@ -91,32 +92,14 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent) ...@@ -91,32 +92,14 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!"); CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
int dim = income.GetParamInt(0); int dim = income.GetParamInt(0);
int * srcIndex = (int *)income.GetParamPointer(1); int copyNum = income.GetParamInt(1);
int indexSize = income.GetParamInt(2);
int * tgtIndex = (int *)income.GetParamPointer(3);
int copyNum = income.GetParamInt(4);
int realIndexSize = indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
int * realTgtIndex = new int[realIndexSize];
for(int i = 0; i < indexSize; i++) {
for(int j = 0; j < copyNum; j++) {
realSrcIndex[i * copyNum + j] = srcIndex[i] + j;
realTgtIndex[i * copyNum + j] = tgtIndex[i] + j;
}
}
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
XNoder::MakeGrad(input); XTensor * srcIndex = income.tails[1];
XTensor * tgtIndex = income.tails[2];
_Spread(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
delete[] realSrcIndex; XNoder::MakeGrad(input);
delete[] realTgtIndex; _SpreadForCopyIndexed(input->grad, node->grad, dim, srcIndex, tgtIndex, copyNum);
delete[] srcIndex;
delete[] tgtIndex;
node->visitMark = NODE_FINISHED;
} }
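For intuition, the gradient of CopyIndexed has to scatter dE/db back into dE/da and accumulate wherever a source row is copied more than once. The sketch below shows that rule on plain row-major arrays for the dimension-0 case; SpreadForCopyIndexed here is an illustration, not the library's _SpreadForCopyIndexed.

#include <cstdio>

/* backward of copyindexed on plain 2D row-major arrays:
   forward : b[tgtIndex[k]+j] = a[srcIndex[k]+j]       for j in [0, copyNum)
   backward: gradA[srcIndex[k]+j] += gradB[tgtIndex[k]+j]
   (accumulation matters because a source row may be copied more than once) */
void SpreadForCopyIndexed(float * gradA, const float * gradB, int rowSize,
                          const int * srcIndex, const int * tgtIndex,
                          int indexSize, int copyNum)
{
    for (int k = 0; k < indexSize; k++)
        for (int j = 0; j < copyNum; j++)
            for (int c = 0; c < rowSize; c++)
                gradA[(srcIndex[k] + j) * rowSize + c] +=
                gradB[(tgtIndex[k] + j) * rowSize + c];
}

int main()
{
    float gradA[3 * 2] = {0};                  /* dE/da, 3 rows of width 2 */
    float gradB[2 * 2] = {1, 1, 2, 2};         /* dE/db, 2 rows of width 2 */
    int srcIndex[2] = {0, 0};                  /* row 0 of a is copied twice */
    int tgtIndex[2] = {0, 1};
    SpreadForCopyIndexed(gradA, gradB, 2, srcIndex, tgtIndex, 2, 1);
    printf("%g %g | %g %g | %g %g\n",
           gradA[0], gradA[1], gradA[2], gradA[3], gradA[4], gradA[5]); /* 3 3 | 0 0 | 0 0 */
    return 0;
}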
/* /*
...@@ -143,7 +126,6 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent) ...@@ -143,7 +126,6 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
/* /*
gradient for merge gradient for merge
for for
...@@ -181,6 +163,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent) ...@@ -181,6 +163,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input); XNoder::MakeGrad(input);
int * dims = new int[input->order]; int * dims = new int[input->order];
memset(dims, 0, sizeof(int) * input->order);
for(int i = 0, j = 0; i < input->order; i++){ for(int i = 0, j = 0; i < input->order; i++){
if(i >= leadDim){ if(i >= leadDim){
dims[j++] = input->dimSize[i]; dims[j++] = input->dimSize[i];
......
...@@ -437,4 +437,25 @@ void XNet::ClearGrad(XTensor * node) ...@@ -437,4 +437,25 @@ void XNet::ClearGrad(XTensor * node)
} }
} }
/*
show network topology
>> file - file to dump information
>> node - pointer to the node
*/
void XNet::ShowNetwork(FILE * file, XTensor * node)
{
XList roots(1);
roots.Add(node);
Traverse(roots);
XLink::ShowNode(file, node);
/* go over nodes in its topological order */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * n = (XTensor*)nodes.Get(i);
XLink::ShowNode(file, n);
}
}
} }
\ No newline at end of file
...@@ -108,6 +108,9 @@ struct XNet ...@@ -108,6 +108,9 @@ struct XNet
/* clear the gradient information if the node is of no use */ /* clear the gradient information if the node is of no use */
void ClearGrad(XTensor * node); void ClearGrad(XTensor * node);
/* show network topology */
void ShowNetwork(FILE * file, XTensor * node);
}; };
/* we make a unique id for every tensor */ /* we make a unique id for every tensor */
......
...@@ -231,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model) ...@@ -231,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
} }
for(int i = 0; i < argc; i++){ for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], "-mempool")) if (!strcmp(argv[i], "-mempool"))
model.mem = new XMem(model.devID); model.mem = new XMem(model.devID);
} }
} }
...@@ -715,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise. ...@@ -715,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise.
>> devID - device id >> devID - device id
>> mem - memory pool >> mem - memory pool
*/ */
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, int itemNum, int devID, XMem * mem) void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{ {
if(devID >= 0 || (mem != NULL && mem->devID >= 0)) InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, -1);
else
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
tensor.SetZeroAll(); tensor.SetZeroAll();
/* set non-zero cells */ /* set non-zero cells */
for(int i = 0; i < itemNum; i++) for(int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]); tensor.Set2D(1.0F, rows[i], cols[i]);
if(devID >= 0 || (mem != NULL && mem->devID >= 0)){
XList list(1);
list.Add(&tensor);
CPUToGPUFlush(&list, devID, mem);
}
} }
/* /*
...@@ -859,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net) ...@@ -859,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = softmax(s) */ /* y = softmax(s) */
_LogSoftmax(&s, &y, 1); _LogSoftmax(&s, &y, 1);
} }
} }
/* /*
...@@ -998,7 +988,6 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model ...@@ -998,7 +988,6 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
XTensor embeddingBig; XTensor embeddingBig;
XTensor hidden; XTensor hidden;
XTensor b; XTensor b;
XTensor srcIndex;
int size = batch * (n-1); int size = batch * (n-1);
int * index = new int[size]; int * index = new int[size];
...@@ -1010,28 +999,25 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model ...@@ -1010,28 +999,25 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
} }
} }
InitTensor1D(&srcIndex, size, X_INT, model.devID, model.mem); InitTensor1D(&words, size, X_INT, model.devID, model.mem);
srcIndex.SetData(index, size); words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
XTensor embedding;
embedding = Gather(model.embeddingW, srcIndex);
delete[] index; delete[] index;
int dimSize[2]; int dimSize[2];
dimSize[0] = embedding.GetDim(0) / (n - 1); dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embedding.GetDim(1) * (n - 1); dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
hidden = Reshape(embedding, embedding.order, dimSize); hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */ /* hidden layers */
for(int i = 0; i < depth; i++) for(int i = 0; i < depth; i++)
hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]; hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */ /* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
} }
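The reworked forward pass gathers all batch*(n-1) context-word embeddings in one call and then views the result as [batch, (n-1)*eSize]. A standalone sketch of that gather-and-reshape step on plain arrays (GatherEmbeddings is an illustration, not the library's Gather):

#include <cstdio>

/* gather rows of emb[vocab x eSize] for num word ids; viewing the gathered
   block as [batch x (n-1)*eSize] needs no data movement because the rows are
   already contiguous in that order */
void GatherEmbeddings(const float * emb, int eSize, const int * ids, int num, float * out)
{
    for (int i = 0; i < num; i++)
        for (int c = 0; c < eSize; c++)
            out[i * eSize + c] = emb[ids[i] * eSize + c];
}

int main()
{
    const int vocab = 4, eSize = 2, batch = 2, n = 3;        /* (n-1) = 2 context words */
    float emb[vocab * eSize] = {0,0, 1,1, 2,2, 3,3};
    int ids[batch * (n - 1)] = {1, 3, 2, 0};                 /* flattened [batch x (n-1)] */
    float hidden[batch * (n - 1) * eSize];

    GatherEmbeddings(emb, eSize, ids, batch * (n - 1), hidden);

    /* row 0 of the [batch x (n-1)*eSize] view: embeddings of words 1 and 3 */
    printf("%g %g %g %g\n", hidden[0], hidden[1], hidden[2], hidden[3]);  /* 1 1 3 3 */
    return 0;
}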
/* /*
...@@ -1071,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model) ...@@ -1071,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
/* output layer */ /* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
} }
/* /*
......
...@@ -60,7 +60,7 @@ void AttDecoder::InitModel(int argc, char ** argv, ...@@ -60,7 +60,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */ /* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){ for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem); attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem); attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
} }
} }
...@@ -69,11 +69,12 @@ void AttDecoder::InitModel(int argc, char ** argv, ...@@ -69,11 +69,12 @@ void AttDecoder::InitModel(int argc, char ** argv,
make the decoding network make the decoding network
>> inputDec - the input tensor of the decoder >> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder >> outputEnc - the output tensor of the encoder
>> mask - the mask that indicate each position is valid >> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether the model is used for training >> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder << return - the output tensor of the decoder
*/ */
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining) XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{ {
XTensor x; XTensor x;
...@@ -89,7 +90,6 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, b ...@@ -89,7 +90,6 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, b
XTensor ln; XTensor ln;
XTensor fnn; XTensor fnn;
XTensor res; XTensor res;
XTensor nothing;
/******************/ /******************/
/* self attention */ /* self attention */
...@@ -107,7 +107,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, b ...@@ -107,7 +107,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, b
/*****************************/ /*****************************/
/* encoder-decoder attention */ /* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining); ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
......
...@@ -48,7 +48,7 @@ public: ...@@ -48,7 +48,7 @@ public:
int myDevID = -1, XMem * myMem = NULL); int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */ /* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining); XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
}; };
} }
......
...@@ -150,7 +150,6 @@ XTensor T2TEmbedder::Make(XTensor &input) ...@@ -150,7 +150,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
} }
/* then we make word embeddings */ /* then we make word embeddings */
//wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
wordEmbedding = Gather(w, input); wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize)); wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
......
...@@ -93,10 +93,11 @@ void AttEncoder::InitModel(int argc, char ** argv, ...@@ -93,10 +93,11 @@ void AttEncoder::InitModel(int argc, char ** argv,
make the encoding network make the encoding network
>> input - the input tensor of the encoder >> input - the input tensor of the encoder
>> mask - the mask that indicates each position is valid >> mask - the mask that indicates each position is valid
>> maskEncDec - not used by the encoder (kept for a uniform Make interface)
>> isTraining - indicates whether the model is used for training >> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder << return - the output tensor of the encoder
*/ */
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining) XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{ {
XTensor x; XTensor x;
...@@ -144,4 +145,18 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining) ...@@ -144,4 +145,18 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
return x; return x;
} }
/*
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicates each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
{
XTensor nothing;
return Make(input, mask, nothing, isTraining);
}
} }
...@@ -40,7 +40,7 @@ class T2TEncoder ...@@ -40,7 +40,7 @@ class T2TEncoder
{ {
public: public:
virtual virtual
XTensor Make(XTensor &input, XTensor &mask, bool isTraining) = 0; XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining) = 0;
}; };
/* /*
...@@ -49,7 +49,7 @@ the encoder based on RNN ...@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder class RNNEncoder : T2TEncoder
{ {
public: public:
XTensor Make(XTensor &input, XTensor &mask, bool isTraining); XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining);
}; };
...@@ -118,6 +118,9 @@ public: ...@@ -118,6 +118,9 @@ public:
int myDevID = -1, XMem * myMem = NULL); int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */ /* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor &input, XTensor &mask, bool isTraining); XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
}; };
......
...@@ -75,7 +75,7 @@ void T2TModel::InitModel(int argc, char ** argv) ...@@ -75,7 +75,7 @@ void T2TModel::InitModel(int argc, char ** argv)
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION); mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
} }
encoder->InitModel(argc, argv, isLM, 0, devID, mem); encoder->InitModel(argc, argv, true, 0, devID, mem);
outputLayer->InitModel(argc, argv, devID, mem); outputLayer->InitModel(argc, argv, devID, mem);
if(isMT) if(isMT)
...@@ -99,7 +99,9 @@ make the encoding network ...@@ -99,7 +99,9 @@ make the encoding network
*/ */
XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining) XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
{ {
return encoder->Make(input, mask, isTraining); XTensor nothing;
return encoder->Make(input, mask, nothing, isTraining);
} }
/* /*
...@@ -107,13 +109,14 @@ make the decoding network ...@@ -107,13 +109,14 @@ make the decoding network
>> inputDec - input tensor of the decoder >> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder >> outputEnc - output tensor of the encoder
>> output - output tensor (distribution) >> output - output tensor (distribution)
>> mask - the mask for positions that are/not involved in computation >> mask - mask for positions that are/not involved in computation
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether we are training the model >> isTraining - indicates whether we are training the model
<< return - decoding result << return - decoding result
*/ */
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining) XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{ {
return decoder->Make(inputDec, outputEnc, mask, isTraining); return decoder->Make(inputDec, outputEnc, mask, maskEncDec, isTraining);
} }
/* /*
...@@ -190,14 +193,16 @@ make the network for machine translation (with the output softmax layer) ...@@ -190,14 +193,16 @@ make the network for machine translation (with the output softmax layer)
>> inputDec - input tensor of the decoder >> inputDec - input tensor of the decoder
>> output - output tensor (distribution) >> output - output tensor (distribution)
>> paddingEnc - padding of the sequences (on the encoder side) >> paddingEnc - padding of the sequences (on the encoder side)
>> paddingDec - padding of the sequences (on the decoder side)
>> isTraining - indicates whether the model is for training >> isTraining - indicates whether the model is for training
*/ */
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining) void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining)
{ {
XTensor encoding; XTensor encoding;
XTensor decoding; XTensor decoding;
XTensor maskEnc; XTensor maskEnc;
XTensor maskDec; XTensor maskDec;
XTensor maskEncDec;
/* generate mask to see "previous" words on the decoder side */ /* generate mask to see "previous" words on the decoder side */
//int len = inputDec.GetDim(inputDec.order - 2); //int len = inputDec.GetDim(inputDec.order - 2);
...@@ -222,6 +227,23 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe ...@@ -222,6 +227,23 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_SetDataLowTri(&maskDec, 1e9F, 0); _SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F); _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents attention to the padding (dummy) words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
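For a single sentence pair, and ignoring the extra head/batch dimensions that the _Unsqueeze calls add, the encoder-decoder mask built above reduces to the rule mask[i][j] = paddingDec[i] * paddingEnc[j] * 1e9 - 1e9. A minimal standalone check of that rule:

#include <cstdio>

/* paddingEnc[j] and paddingDec[i] are 1 for real words and 0 for padding;
   the mask is 0 where attention is allowed and -1e9 (added to the attention
   logits before softmax) where it is not */
int main()
{
    const int lenDec = 2, lenEnc = 3;
    float paddingDec[lenDec] = {1, 1};
    float paddingEnc[lenEnc] = {1, 1, 0};        /* last source position is padding */
    float mask[lenDec][lenEnc];

    for (int i = 0; i < lenDec; i++)
        for (int j = 0; j < lenEnc; j++)
            mask[i][j] = paddingDec[i] * paddingEnc[j] * 1e9F - 1e9F;

    for (int i = 0; i < lenDec; i++)
        printf("%g %g %g\n", mask[i][0], mask[i][1], mask[i][2]);   /* 0 0 -1e+09 per row */
    return 0;
}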
/* padding on the source side */ /* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2]; int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++) for (int i = 0; i < paddingEnc.order - 1; i++)
...@@ -252,7 +274,7 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe ...@@ -252,7 +274,7 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum(&maskEnc, padding3, &maskEnc); _Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining); encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining); decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output); outputLayer->Make(decoding, output);
delete[] dims; delete[] dims;
......
...@@ -72,13 +72,13 @@ public: ...@@ -72,13 +72,13 @@ public:
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining); XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
/* make the decoding network */ /* make the decoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining); XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, XTensor &MaskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */ /* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining); void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */ /* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining); void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* get parameter matrices */ /* get parameter matrices */
void GetParams(XList &list); void GetParams(XList &list);
......
...@@ -93,8 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output) ...@@ -93,8 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{ {
XTensor &x = input; XTensor &x = input;
//output = LogSoftmax(MMul(x, w), -1); output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1); //output = Softmax(MMul(x, w), -1);
} }
} }
...@@ -116,6 +116,7 @@ void T2TTrainer::Init(int argc, char ** argv) ...@@ -116,6 +116,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false); LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true); LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false); LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, argv, "smallfootprint", &isSmallFootprint, false);
buf = new int[bufSize]; buf = new int[bufSize];
buf2 = new int[bufSize]; buf2 = new int[bufSize];
...@@ -163,6 +164,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -163,6 +164,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
int devID = model->devID; int devID = model->devID;
XMem * mem = model->mem; XMem * mem = model->mem;
XNet net; XNet net;
if(isSmallFootprint)
net.SetGradEfficientFlag();
PrepareModel(model); PrepareModel(model);
...@@ -208,7 +212,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -208,7 +212,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if(model->isLM) if(model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, true); model->MakeLM(batchEnc, output, paddingEnc, true);
else if(model->isMT) else if(model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, true); model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, true);
else{ else{
ShowNTErrors("Illegal model type!"); ShowNTErrors("Illegal model type!");
} }
...@@ -218,11 +222,8 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -218,11 +222,8 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
LabelSmooth(&gold, &goldSmoothed, labelSmoothingP); LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
/* make paddings for the output */ /* make paddings for the output */
//if (output.GetDim(0) > 1) if (output.GetDim(0) > 1)
// PadOutput(&output, &gold, &paddingDec); PadOutput(&output, &gold, &paddingDec);
//output.Dump(tmpFILE, "output: ");
//fflush(tmpFILE);
/* get probabilities */ /* get probabilities */
float prob = GetProb(&output, &gold, NULL); float prob = GetProb(&output, &gold, NULL);
...@@ -235,7 +236,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -235,7 +236,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (doUpdate) { if (doUpdate) {
/* rescale the output for the normalized loss */ /* rescale the output for the normalized loss */
//RescaleOutput(&output, &g, &paddingDec); RescaleOutput(&output, &g, &paddingDec);
/* back-propagation */ /* back-propagation */
net.Backward(output, g, paddingDec, CROSSENTROPY); net.Backward(output, g, paddingDec, CROSSENTROPY);
...@@ -275,19 +276,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -275,19 +276,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
XPRINT(0, stderr, "\n"); XPRINT(0, stderr, "\n");
} }
//XMem * mem = model->mem;
//MTYPE used = 0;
//MTYPE total = 0;
//for(int i = 0; i < mem->blockNum; i++){
// if(mem->blocks[i].mem != NULL){
// used += mem->blocks[i].used;
// total += mem->blocks[i].size;
// }
//}
//fprintf(stderr, "%d %d %d %d mem: %lld %lld\n", paddingEnc.GetDim(0), paddingEnc.GetDim(1),
// paddingDec.GetDim(0), paddingDec.GetDim(1), used, total);
if(nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint){ if(nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint){
MakeCheckpoint(model, validFN, modelFN, "step", step); MakeCheckpoint(model, validFN, modelFN, "step", step);
nStepCheck = 0; nStepCheck = 0;
...@@ -374,7 +362,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model) ...@@ -374,7 +362,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
if(model->isLM) if(model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, false); model->MakeLM(batchEnc, output, paddingEnc, false);
else if(model->isMT) else if(model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, false); model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, false);
else{ else{
ShowNTErrors("Illegal model type!"); ShowNTErrors("Illegal model type!");
} }
...@@ -705,70 +693,46 @@ int T2TTrainer::LoadBatchLM(FILE * file, ...@@ -705,70 +693,46 @@ int T2TTrainer::LoadBatchLM(FILE * file,
dims[2] = vs; dims[2] = vs;
InitTensor2D(batchEnc, sc, max, X_INT, devID, mem); InitTensor2D(batchEnc, sc, max, X_INT, devID, mem);
//InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem); InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem); InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
batchEnc->SetZeroAll(); batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
gold->SetZeroAll(); gold->SetZeroAll();
paddingEnc->SetZeroAll();
paddingDec->SetZeroAll(); paddingDec->SetZeroAll();
if(isTraining) {
//XNoder::MakeGrad(batchEnc);
XNoder::MakeGrad(paddingEnc);
XNoder::MakeGrad(gold);
XNoder::MakeGrad(paddingDec);
//batchEnc->grad->SetZeroAll();
paddingEnc->grad->SetZeroAll();
gold->grad->SetZeroAll();
paddingDec->grad->SetZeroAll();
}
int seqSize = 0; int seqSize = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum]; int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE * goldOffsets = new MTYPE[gold->unitNum]; MTYPE * goldOffsets = new MTYPE[gold->unitNum];
MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum]; //MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
//MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
int wGold = 0; int wGold = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc); memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
for(int s = seq; s < seq + sc; s++){ for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1; int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!"); CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
//batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w); batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
batchEncOffsets[wCount] = batchEnc->GetOffset2D(s - seq, w); //paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
batchEncValues[wCount] = num; //paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
//paddingEnc->Set2D(1.0F, s - seq, w);
//paddingDec->Set2D(1.0F, s - seq, w);
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
if (w > 0) if (w > 0)
//gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
if (w == len - 1) { if (w == len - 1) {
if (isDoubledEnd) if (isDoubledEnd)
//gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
else else
//gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
} }
wCount++; wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
else
fprintf(tf, "\n");*/
if(seqs != NULL) if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w]; seqs[seqSize++] = buf[seqOffset[s] + w];
} }
...@@ -779,16 +743,25 @@ int T2TTrainer::LoadBatchLM(FILE * file, ...@@ -779,16 +743,25 @@ int T2TTrainer::LoadBatchLM(FILE * file,
} }
} }
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount); batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
gold->SetDataBatched(goldOffsets, 1.0F, wGold); gold->SetDataBatched(goldOffsets, 1.0F, wGold);
//paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
//paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);
delete[] batchEncOffsets;
delete[] batchEncValues; delete[] batchEncValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
delete[] goldOffsets; delete[] goldOffsets;
//delete[] paddingEncOffsets;
//delete[] paddingDecOffsets;
fflush(tf); fflush(tf);
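The padding tensors are now derived directly from the token batch: a position holds a real word exactly when its id is non-zero, which is what the _ConvertDataType/_NotEqual pair computes. A standalone sketch of the same rule on plain arrays (it shares the assumption of the _NotEqual(..., 0) call that token id 0 is reserved for padding):

#include <cstdio>

/* padding[i] = (batch[i] != 0) ? 1 : 0 - replaces setting the 1-valued cells
   one offset at a time */
void PaddingFromTokens(const int * batch, int num, float * padding)
{
    for (int i = 0; i < num; i++)
        padding[i] = (batch[i] != 0) ? 1.0F : 0.0F;
}

int main()
{
    int batch[8] = {5, 9, 2, 0, 7, 0, 0, 0};   /* a 2 x 4 batch, 0 = padding */
    float padding[8];
    PaddingFromTokens(batch, 8, padding);
    for (int i = 0; i < 8; i++)
        printf("%g ", padding[i]);             /* 1 1 1 0 1 0 0 0 */
    printf("\n");
    return 0;
}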
...@@ -884,35 +857,38 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -884,35 +857,38 @@ int T2TTrainer::LoadBatchMT(FILE * file,
paddingDec->SetZeroAll(); paddingDec->SetZeroAll();
gold->SetZeroAll(); gold->SetZeroAll();
int wCountEnc = 0;
int wCountDec = 0;
int wGold = 0;
wCount = 0; wCount = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum]; int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * batchDecOffsets = new MTYPE[batchDec->unitNum];
int * batchDecValues = new int[batchDec->unitNum]; int * batchDecValues = new int[batchDec->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2]; //MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2]; //MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2]; MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
/* batch of the source-side sequences */ /* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){ for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s]; int len = seqLen[s];
int sent = (s - seq)/2; int sent = (s - seq)/2;
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
batchEncOffsets[wCount] = batchEnc->GetOffset2D(sent, w); //paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
batchEncValues[wCount] = num; wCountEnc++;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(sent, w);
wCount++;
} }
} }
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount); batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount); //paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
int wCountDec = 0; _ConvertDataType(batchEnc, tmp);
int wGold = 0; _NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
/* batch of the target-side sequences */ /* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){ for(int s = seq + 1; s < seq + sc; s += 2){
...@@ -921,10 +897,8 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -921,10 +897,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int sent = (s - seq - 1)/2; int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
batchDecOffsets[wCountDec] = batchDec->GetOffset2D(sent, w); //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
batchDecValues[wCountDec] = num;
paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w > 0) if (w > 0)
goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]); goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
...@@ -947,16 +921,20 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -947,16 +921,20 @@ int T2TTrainer::LoadBatchMT(FILE * file,
} }
} }
batchDec->SetDataBatched(batchDecOffsets, batchDecValues, wCountDec); batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec); //paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchDec, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);
gold->SetDataBatched(goldOffsets, 1.0F, wGold); gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncOffsets;
delete[] batchEncValues; delete[] batchEncValues;
delete[] batchDecOffsets;
delete[] batchDecValues; delete[] batchDecValues;
delete[] paddingEncOffsets; //delete[] paddingEncOffsets;
delete[] paddingDecOffsets; //delete[] paddingDecOffsets;
delete[] goldOffsets; delete[] goldOffsets;
return sc; return sc;
...@@ -990,12 +968,13 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs) ...@@ -990,12 +968,13 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
XTensor probs; XTensor probs;
InitTensor(&probs, output); InitTensor(&probs, output);
XTensor logOutput; //XTensor logOutput;
InitTensor(&logOutput, output); //InitTensor(&logOutput, output);
_Log(output, &logOutput); //_Log(output, &logOutput);
/* probs[i,j] = output[i,j] * gold[i,j] */ /* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&logOutput, gold, &probs); //_Multiply(&logOutput, gold, &probs);
_Multiply(output, gold, &probs);
/* probability of each word */ /* probability of each word */
XTensor wprobs; XTensor wprobs;
...@@ -1170,6 +1149,7 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi ...@@ -1170,6 +1149,7 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
_ExpMe(output); _ExpMe(output);
_ScaleAndShiftMe(output, 1/count); _ScaleAndShiftMe(output, 1/count);
_LogMe(output); _LogMe(output);
_ScaleAndShiftMe(gold, 1/count); _ScaleAndShiftMe(gold, 1/count);
} }
......
...@@ -142,6 +142,9 @@ public: ...@@ -142,6 +142,9 @@ public:
/* counterpart of "isSmallBatch" */ /* counterpart of "isSmallBatch" */
bool isBigBatch; bool isBigBatch;
/* indicates whether we use a small memory footprint for the backward process */
bool isSmallFootprint;
public: public:
/* constructor */ /* constructor */
T2TTrainer(); T2TTrainer();
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include <math.h> #include <math.h>
#include <time.h>
#include "Transformer.h" #include "Transformer.h"
#include "T2TModel.h" #include "T2TModel.h"
#include "T2TUtility.h" #include "T2TUtility.h"
...@@ -58,6 +59,7 @@ int TransformerMain(int argc, const char ** argv) ...@@ -58,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, ""); LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, ""); LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer; T2TTrainer trainer;
trainer.Init(argc, args); trainer.Init(argc, args);
......
...@@ -126,7 +126,7 @@ void SmallTest() ...@@ -126,7 +126,7 @@ void SmallTest()
d = a + b + c.Lin(0.5F); d = a + b + c.Lin(0.5F);
XLink::CheckNetwork(&d); XLink::CheckNetwork(&d);
XLink::ShowNetwork(stderr, &d); //XLink::ShowNetwork(stderr, &d);
a.Dump(stderr, "a:"); a.Dump(stderr, "a:");
b.Dump(stderr, "b:"); b.Dump(stderr, "b:");
......
...@@ -65,10 +65,10 @@ namespace nts { ...@@ -65,10 +65,10 @@ namespace nts {
#endif #endif
#ifndef MIN #ifndef MIN
#define MIN(a,b) ((a < b) ? a : b) #define MIN(a,b) ((a) < (b) ? a : b)
#endif #endif
#ifndef MAX #ifndef MAX
#define MAX(a,b) ((a > b) ? a : b) #define MAX(a,b) ((a) > (b) ? a : b)
#endif #endif
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) #define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
......
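The added parentheses matter whenever a macro argument contains an operator with lower precedence than the comparison; a minimal demonstration of the difference between the old and new forms:

#include <cstdio>

#define MIN_OLD(a,b) ((a < b) ? a : b)       /* before the fix */
#define MIN_NEW(a,b) ((a) < (b) ? a : b)     /* after the fix  */

int main()
{
    int x = 12, y = 10;
    /* the old form expands the test to (x & 7 < y), which the compiler parses
       as x & (7 < y) because '<' binds tighter than '&' */
    printf("old: %d\n", MIN_OLD(x & 7, y));   /* 10 - wrong   */
    printf("new: %d\n", MIN_NEW(x & 7, y));   /* 4  - correct */
    return 0;
}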
...@@ -593,21 +593,6 @@ void XLink::CheckNetwork(XTensor * root) ...@@ -593,21 +593,6 @@ void XLink::CheckNetwork(XTensor * root)
} }
/* /*
show the network encoded in a root node (tensor)
>> file - file to dump information
>> root - pointer to the root node
*/
void XLink::ShowNetwork(FILE * file, XTensor * root)
{
XLink &income = root->income;
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
ShowNetwork(file, child);
}
}
/*
show a node show a node
>> file - file to dump information >> file - file to dump information
>> root - pointer to the node >> root - pointer to the node
......
...@@ -178,10 +178,6 @@ struct XLink ...@@ -178,10 +178,6 @@ struct XLink
static static
void CheckNetwork(XTensor * root); void CheckNetwork(XTensor * root);
/* show the network encoded in a root node (tensor) */
static
void ShowNetwork(FILE * file, XTensor * root);
/* show a node */ /* show a node */
static static
void ShowNode(FILE * file, XTensor * node); void ShowNode(FILE * file, XTensor * node);
......
...@@ -79,7 +79,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -79,7 +79,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_COPYVALUES + 1 #define SHAPE MOVEMENT_GATHER + 1
#define SHAPE_CONCATENATE SHAPE + 1 #define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1 #define SHAPE_MERGE_LIST SHAPE_MERGE + 1
......
...@@ -677,9 +677,9 @@ void XTensor::SetData(const void * d, int num, int beg) ...@@ -677,9 +677,9 @@ void XTensor::SetData(const void * d, int num, int beg)
return; return;
CheckNTErrors(!isSparse, "TODO"); CheckNTErrors(!isSparse, "TODO");
CheckNTErrors(num == unitNum - beg, "Illegal size!"); CheckNTErrors(num <= unitNum - beg, "Illegal size!");
XMemCopy(data, devID, d, -1, num * unitSize); XMemCopy((char*)data + beg * unitSize, devID, d, -1, num * unitSize);
} }
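The corrected SetData now honours the beg offset by advancing the destination pointer by beg * unitSize bytes. A self-contained illustration of that pointer arithmetic, with memcpy standing in for XMemCopy:

#include <cstdio>
#include <cstring>

int main()
{
    float data[6] = {0, 0, 0, 0, 0, 0};
    float part[3] = {7, 8, 9};
    int beg = 2, num = 3;

    /* copy num values starting at element beg of the destination */
    memcpy((char*)data + beg * sizeof(float), part, num * sizeof(float));

    for (int i = 0; i < 6; i++)
        printf("%g ", data[i]);    /* 0 0 7 8 9 0 */
    printf("\n");
    return 0;
}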
/* /*
...@@ -804,7 +804,7 @@ set tensor items with an array of values ...@@ -804,7 +804,7 @@ set tensor items with an array of values
>> values - value for each data item >> values - value for each data item
>> num - number of the data items >> num - number of the data items
*/ */
void XTensor::SetDataBatched(MTYPE * offsets, void * values, int num) void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
{ {
_SetDataWithOffsetAndValue(this, offsets, values, num); _SetDataWithOffsetAndValue(this, offsets, values, num);
} }
...@@ -1289,7 +1289,7 @@ int XTensor::GetNonzeroSize() ...@@ -1289,7 +1289,7 @@ int XTensor::GetNonzeroSize()
if(dataType == DEFAULT_DTYPE){ if(dataType == DEFAULT_DTYPE){
int count = 0; int count = 0;
for(int i = 0; i < unitNum; i++){ for(int i = 0; i < unitNum; i++){
DTYPE value = *((DTYPE*)(char*)data + i * sizeof(DTYPE)); DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if(value == 0) if(value == 0)
count++; count++;
} }
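The corrected cast changes which element is read: in the old expression the addition was applied to a DTYPE pointer, so the offset was counted in elements rather than bytes. A self-contained demonstration:

#include <cstdio>

int main()
{
    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    void * p = data;
    int i = 1;

    /* old expression: (float*)(char*)p + i*sizeof(float)
       the '+' acts on a float*, so it advances i*sizeof(float) elements
       (4 elements, i.e. 16 bytes, for i = 1), skipping three values */
    float oldValue = *((float*)(char*)p + i * sizeof(float));

    /* fixed expression: advance the char* by i*sizeof(float) bytes first,
       then reinterpret - this reads data[i] */
    float newValue = *(float*)((char*)p + i * sizeof(float));

    printf("old: %g  new: %g\n", oldValue, newValue);   /* old: 5  new: 2 */
    return 0;
}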
...@@ -2271,6 +2271,8 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData) ...@@ -2271,6 +2271,8 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
CheckNTErrors((a != NULL), "Empty input!"); CheckNTErrors((a != NULL), "Empty input!");
memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
if(a->order > 0) if(a->order > 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order); memcpy(dims, a->dimSize, sizeof(int) * a->order);
......
...@@ -49,6 +49,8 @@ struct XLink; ...@@ -49,6 +49,8 @@ struct XLink;
#define USE_BATCHED_STRIDED_MAT_MUL #define USE_BATCHED_STRIDED_MAT_MUL
#define MIN_TENSOR_SPLIT_NUM 0 #define MIN_TENSOR_SPLIT_NUM 0
#define MIN_TENSOR_SPLIT_LIST_NUM 1024 #define MIN_TENSOR_SPLIT_LIST_NUM 1024
#define MIN_TENSOR_MERGE_NUM 0
#define MIN_TENSOR_MERGE_LIST_NUM 1024
#define MIN_TENSOR_CAT_NUM 8 #define MIN_TENSOR_CAT_NUM 8
/* computation flags */ /* computation flags */
...@@ -283,7 +285,7 @@ public: ...@@ -283,7 +285,7 @@ public:
void SetDataBatched(MTYPE * offsets, DTYPE value, int num); void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
/* set tensor items with an array of values */ /* set tensor items with an array of values */
void SetDataBatched(MTYPE * offsets, void * values, int num); void SetDataBatchedWithValues(MTYPE * offsets, void * values, int num);
/* check whether the data array is the same as the answer */ /* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0); bool CheckData(const void * answer, int num, int beg = 0);
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
/* this is a header to include all functions in the "core" workspace */ /* this is a header to include all functions in the "core" workspace */
...@@ -46,16 +46,17 @@ ...@@ -46,16 +46,17 @@
#include "arithmetic/XTensorBLAS.h" #include "arithmetic/XTensorBLAS.h"
#include "getandset/ConvertDataType.h" #include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h" #include "getandset/Select.h"
#include "getandset/SetData.h" #include "getandset/SetData.h"
#include "math/Clip.h" #include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h" #include "math/Normalize.h"
#include "math/Power.h" #include "math/Power.h"
#include "math/ScaleAndShift.h" #include "math/ScaleAndShift.h"
#include "math/Unary.h" #include "math/Unary.h"
#include "movement/CopyBlocks.h" #include "movement/CopyBlocks.h"
#include "movement/CopyBlocksInGrid.h" #include "movement/CopyBlocksInGrid.h"
#include "movement/CopyBlocksOnSite.h" #include "movement/CopyBlocksOnSite.h"
......
...@@ -32,8 +32,6 @@ convert data type ...@@ -32,8 +32,6 @@ convert data type
*/ */
void _ConvertDataType(const XTensor * input, XTensor * output) void _ConvertDataType(const XTensor * input, XTensor * output)
{ {
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType) if (input->dataType == output->dataType)
return; return;
...@@ -61,4 +59,29 @@ void _ConvertDataType(const XTensor * input, XTensor * output) ...@@ -61,4 +59,29 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
ShowNTErrors("Unsupported data types for conversion!"); ShowNTErrors("Unsupported data types for conversion!");
} }
/*
convert data type (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - the input tensor
>> dataType - the target data type
<< return - the output tensor in the target data type
*/
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
{
int order = input.order;
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, input.dimSize, dataType, dr, input.devID, input.mem);
output.SetTMPFlag();
_ConvertDataType(&input, &output);
/* a tensor connection (XLink::MakeLink with the corresponding op id) can be
added here if the conversion should take part in automatic differentiation */
return output;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -23,12 +23,16 @@ ...@@ -23,12 +23,16 @@
#define __CONVERTDATATYPE_H__ #define __CONVERTDATATYPE_H__
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */ /* convert data type */
void _ConvertDataType(const XTensor * input, XTensor * output); void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_H__ #endif // __CONVERTDATATYPE_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
convert onehot tensor to index tensor
>> onehot - the onehot tensor, whose values are 0 or 1
>> index - the index tensor, whose values are integer indices
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, size);
return;
}
#endif
int blockNum = index->unitNum;
int stride = size;
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) {
int * od = onehotData + i * stride;
int record = -1;
for (int j = 0; j < stride; j++) {
if (od[j] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
}
indexData[i] = record;
}
}
/*
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> onehot - the onehot tensor, whose values are 0 or 1
>> size - the last dimension size of the onehot tensor
<< return - the index tensor
*/
XTensor OnehotToIndex(XTensor & onehot, int size)
{
CheckNTErrors(onehot.GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor index;
InitTensor(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
index.SetTMPFlag();
_OnehotToIndex(&onehot, &index, size);
return index;
}
/*
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
onehot->SetZeroAll();
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size);
return;
}
#endif
int blockNum = index->unitNum;
int stride = size;
int * indexData = (int *)index->data;
int * onehotData = (int *)onehot->data;
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
int * od = onehotData + i * stride;
od[id] = 1;
}
}
/*
convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> index - the index tensor, whose values are integer indices
>> size - the last dimension size of the onehot tensor
<< return - the onehot tensor
*/
XTensor IndexToOnehot(XTensor & index, int size)
{
CheckNTErrors(index.dataType == X_INT, "The index tensor must be in X_INT!")
XTensor onehot;
onehot.SetTMPFlag();
int order = index.order;
int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size);
delete[] dim;
return onehot;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
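A self-contained illustration of the index/one-hot round trip that _IndexToOnehot and _OnehotToIndex implement, one block per row on plain arrays:

#include <cstdio>

int main()
{
    const int blockNum = 3, size = 4;
    int index[blockNum] = {2, 0, 3};
    int onehot[blockNum * size] = {0};
    int back[blockNum];

    /* index -> onehot: set the single non-zero entry per block */
    for (int i = 0; i < blockNum; i++)
        onehot[i * size + index[i]] = 1;

    /* onehot -> index: record the position of the non-zero entry */
    for (int i = 0; i < blockNum; i++) {
        back[i] = -1;
        for (int j = 0; j < size; j++)
            if (onehot[i * size + j] != 0)
                back[i] = j;
    }

    printf("%d %d %d\n", back[0], back[1], back[2]);   /* 2 0 3 */
    return 0;
}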
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include "OnehotAndIndex.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
convert onehot tensor to index tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of blocks
>> stride - stride of a data block
*/
__global__
void KernelOnehotToIndex(int * onehotData, int * indexData, int blockNum, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockNum || offset >= stride)
return;
int * od = onehotData + i * stride;
int * id = indexData + i;
if (od[offset] != 0)
*id = offset;
}
/*
convert onehot tensor to index tensor (cuda version)
>> onehot - the onehot tensor, whose values are 0 or 1
>> index - the index tensor, whose values are integer indices
>> size - the last dimension size of the onehot tensor
*/
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
int devID = onehot->devID;
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelOnehotToIndex<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
}
/*
convert index tensor to onehot tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of blocks
>> stride - stride of a data block
*/
__global__
void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockNum || offset >= stride)
return;
int * od = onehotData + i * stride;
int id = indexData[i];
od[id] = 1;
}
/*
convert index tensor to onehot tensor (cuda version)
>> index - the index tensor, whose values are integer indices
>> onehot - the onehot tensor, whose values are 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
{
int devID = onehot->devID;
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#ifndef __ONEHOTANDINDEX_CUH__
#define __ONEHOTANDINDEX_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor (cuda version) */
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#ifndef __ONEHOTANDINDEX_H__
#define __ONEHOTANDINDEX_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_H__
\ No newline at end of file
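/* Usage sketch (illustration only) for the two wrappers declared above, assuming the
   header is included, the nts namespace is in scope, and `onehot` is an integer tensor
   whose last dimension has size `num`; the function name below is hypothetical. */
void OnehotRoundTrip(XTensor & onehot, int num)
{
    XTensor index = OnehotToIndex(onehot, num);   /* one index per block         */
    XTensor back  = IndexToOnehot(index, num);    /* back to the onehot encoding */
}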
...@@ -70,8 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain) ...@@ -70,8 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize; fanOut = numOutputFmaps * receptiveFieldSize;
} }
DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut)); DTYPE std = gain * (float)sqrt(2.0 / (fanIn + fanOut));
tensor->SetDataRand(-finfout, finfout); DTYPE a = (DTYPE)sqrt(3.0F) * std;
tensor->SetDataRand(-a, a);
//_SetDataRand(tensor, -finfout, finfout); //_SetDataRand(tensor, -finfout, finfout);
} }
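/* Quick check (illustration only) that the new bound equals the old one: with
   std = gain * sqrt(2 / (fanIn + fanOut)), the uniform bound a = sqrt(3) * std is
   exactly gain * sqrt(6 / (fanIn + fanOut)), since sqrt(3) * sqrt(2/n) = sqrt(6/n).
   The helper below is hypothetical and only verifies the arithmetic. */
#include <cassert>
#include <cmath>

static void CheckFanInOutBound(float gain, float fanIn, float fanOut)
{
    float n = fanIn + fanOut;
    float a = sqrtf(3.0F) * (gain * sqrtf(2.0F / n));   /* new formulation */
    float finfout = gain * sqrtf(6.0F / n);             /* old formulation */
    assert(fabsf(a - finfout) <= 1e-6F * finfout);      /* identical up to rounding */
}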
...@@ -499,36 +500,9 @@ void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values ...@@ -499,36 +500,9 @@ void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values
} }
else { else {
#ifdef USE_CUDA #ifdef USE_CUDA
XMem * mem = tensor->mem; if(tensor->devID >= 0) {
MTYPE offsetSize = num * sizeof(MTYPE); _CudaSetDataWithOffsetAndValue(tensor, offsets, values, num);
MTYPE valueSize; return;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
_CudaSetDataWithOffsetAndValue(tensor, offsetsCuda, valuesCuda, num);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, offsetsCuda);
XMemFree(tensor->devID, valuesCuda);
} }
#else #else
ShowNTErrors("Please recompile the code with USE_CUDA"); ShowNTErrors("Please recompile the code with USE_CUDA");
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "SetData.cuh" #include "SetData.cuh"
#include <curand_kernel.h> #include <curand_kernel.h>
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -363,7 +364,7 @@ e.g., for a 3* 3 tensor, ...@@ -363,7 +364,7 @@ e.g., for a 3* 3 tensor,
2 2 0 2 2 0
*/ */
__global__ __global__
void _KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift) void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
{ {
/* offset in each block */ /* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -425,7 +426,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift) ...@@ -425,7 +426,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift); KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
...@@ -474,12 +475,12 @@ set the data with an array of offsets (kernel version) ...@@ -474,12 +475,12 @@ set the data with an array of offsets (kernel version)
>> num - number of the data items >> num - number of the data items
*/ */
__global__ __global__
void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num) void KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
{ {
/* index */ /* index */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < num) if (i < num)
data[offsets[i]] = value; data[offsets[i]] = value;
} }
...@@ -505,7 +506,7 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP ...@@ -505,7 +506,7 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num); KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num);
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
...@@ -519,7 +520,7 @@ set the data with an array of offsets (kernel version) ...@@ -519,7 +520,7 @@ set the data with an array of offsets (kernel version)
>> dataType - the data type of the data and values >> dataType - the data type of the data and values
*/ */
__global__ __global__
void _KernelSetDataWithOffset(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType) void KernelSetDataWithOffsetAndValue(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
{ {
/* index */ /* index */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -541,6 +542,18 @@ set the data with an array of values ...@@ -541,6 +542,18 @@ set the data with an array of values
*/ */
void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num) void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
{ {
XMem * mem = tensor->mem;
MTYPE offsetSize = num * sizeof(MTYPE);
MTYPE valueSize;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
...@@ -552,7 +565,32 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va ...@@ -552,7 +565,32 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > (tensor->data, offsets, values, num, tensor->dataType); MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, mem->devID, values, -1, valueSize);
}
else {
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
}
KernelSetDataWithOffsetAndValue<<<blocks, threads >>> (tensor->data, offsetsCuda, valuesCuda, num, tensor->dataType);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, valuesCuda);
XMemFree(tensor->devID, offsetsCuda);
}
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
......
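/* CPU sketch (illustration only, not part of this file) of the effect of
   _CudaSetDataWithOffsetAndValue for float tensors: entry offsets[i] of the flattened
   data receives values[i]. MTYPE is assumed here to be a 64-bit integer type. */
static void SetDataWithOffsetAndValueRef(float * data, const long long * offsets,
                                         const float * values, long long num)
{
    for (long long i = 0; i < num; i++)
        data[offsets[i]] = values[i];
}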
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
DTYPE myIsEqual(DTYPE a, DTYPE b)
{
return (a == b ? 1.0F : 0.0F);
}
DTYPE myIsNotEqual(DTYPE a, DTYPE b)
{
return (a != b ? 1.0F : 0.0F);
}
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, number); \
return; \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we don't need to make a link here.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we don't need to make a link here.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
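/* Usage sketch (illustration only): Equal/NotEqual compare every entry of a tensor
   against a scalar and return a tensor of 1.0/0.0 flags. The tensor `a` is assumed to
   be a float tensor prepared elsewhere; the function name below is hypothetical. */
void CompareAgainstOne(const XTensor & a)
{
    XTensor hits   = Equal(a, 1.0F);      /* 1.0 where a[i] == 1.0, 0.0 elsewhere */
    XTensor misses = NotEqual(a, 1.0F);   /* the complementary mask               */
}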
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
__device__
DTYPE cudaIsEqual(DTYPE a, DTYPE b)
{
return (a == b ? 1.0F : 0.0F);
}
__device__
DTYPE cudaIsNotEqual(DTYPE a, DTYPE b)
{
return (a != b ? 1.0F : 0.0F);
}
#define SIMPLE_COMPARE_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size, DTYPE number) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (DTYPE)origFunc(a[i], number); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size, __half number) \
{ \
return; \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, \
a->unitNum, (DTYPE)number); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, \
a->unitNum, (__half)number); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
SIMPLE_COMPARE_FUNCTION_GPU(Equal, cudaIsEqual)
SIMPLE_COMPARE_FUNCTION_GPU(NotEqual, cudaIsNotEqual)
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __COMPARE_CUH__
#define __COMPARE_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compare whether every entry is equal to the specified value (cuda kernel) */
__global__
void KernelEqual(DTYPE * a, DTYPE * b, int size, DTYPE number);
/* compare whether every entry is equal to the specified value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda kernel) */
__global__
void KernelNotEqual(DTYPE * a, DTYPE * b, int size, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE number);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif //end __COMPARE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __COMPARE_H__
#define __COMPARE_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compare whether every entry is equal to the specified value */
void _Equal(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _EqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Equal(const XTensor & a, DTYPE number);
/* compare whether every entry is not equal to the specified value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _NotEqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is not equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor NotEqual(const XTensor & a, DTYPE number);
} // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__
\ No newline at end of file
...@@ -223,4 +223,4 @@ _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round) ...@@ -223,4 +223,4 @@ _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/ SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#endif #endif
} } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file \ No newline at end of file
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
...@@ -145,5 +144,6 @@ void _TanMe(XTensor * a); ...@@ -145,5 +144,6 @@ void _TanMe(XTensor * a);
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a); XTensor Tan(const XTensor & a);
} } // namespace nts(NiuTrans.Tensor)
#endif //end __UNARY_H__
\ No newline at end of file #endif // end __UNARY_H__
\ No newline at end of file
...@@ -79,8 +79,13 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i ...@@ -79,8 +79,13 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
ProtectCudaDev(devID, devIDBackup); ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */ /* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int)); int * sourceBlocksTMP = myMem != NULL ?
int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int)); (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int)); XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int)); XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "CopyIndexed.h" #include "CopyIndexed.h"
#include "CopyIndexed.cuh"
#include "CopyBlocks.h" #include "CopyBlocks.h"
#include "Gather.h"
#include "../../XName.h" #include "../../XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -40,7 +42,9 @@ copy indexed sub-tensors ...@@ -40,7 +42,9 @@ copy indexed sub-tensors
e.g., for srcIndex = [1,4] and copyNum = 2, e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5 we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum) void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
...@@ -99,7 +103,148 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i ...@@ -99,7 +103,148 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
} }
/* /*
copy indexed sub-tensors (return an XTensor structure) copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
{
int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(indexSize != 0, "NULL index!");
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((srcIndex && tgtIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == tgtIndex->devID || (srcIndex->devID < 0 && tgtIndex->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && t->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == tgtIndex->unitNum), "Unmatched index tensors!");
for (int i = 0; i < order; i++) {
if (i != dim) {
CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions");
}
else {
CheckNTErrors(t->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
}
}
#ifdef USE_CUDA
if (s->devID >= 0 && srcIndex->devID >= 0) {
_CudaCopyIndexed(s, t, dim, srcIndex, tgtIndex, copyNum);
return;
}
#endif
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int*)srcIndex->data;
int * tIndex = (int*)tgtIndex->data;
for (int i = 0; i < indexSize; i++) {
for (int c = 0; c < copyNum; c++) {
int si = sIndex[i] + c;
int ti = tIndex[i] + c;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = tData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(td + k) = *(sd + k);
}
}
}
}
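/* Worked toy example (illustration only, not part of this file) of the copy rule above
   for a 4 x 3 source with dim = 0, srcIndex = {1}, tgtIndex = {0} and copyNum = 2:
   source rows 1 and 2 end up as target rows 0 and 1. */
#include <cstdio>

static void CopyIndexedToy()
{
    float s[4][3] = { {0,0,0}, {1,1,1}, {2,2,2}, {3,3,3} };
    float t[2][3] = { {0,0,0}, {0,0,0} };
    int srcIndex[1] = { 1 };
    int tgtIndex[1] = { 0 };
    int copyNum = 2;

    for (int i = 0; i < 1; i++)
        for (int c = 0; c < copyNum; c++)
            for (int k = 0; k < 3; k++)
                t[tgtIndex[i] + c][k] = s[srcIndex[i] + c][k];

    printf("%g %g\n", t[0][0], t[1][0]);   /* prints "1 2" */
}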
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum)
{
CheckNTErrors(dim >= 0 && dim < s.order, "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
int indexSize = srcIndex.unitNum;
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = indexSize * copyNum;
else
dimSize[i] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMPFlag();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);
XList list(3);
list.Add(&s);
list.Add(&srcIndex);
list.Add(&tgtIndex);
/* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
}
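/* Usage sketch (illustration only): select sub-tensors of `s` along dimension 0, where
   `srcIndex` and `tgtIndex` are assumed to be 1-D int tensors prepared elsewhere. With
   srcIndex = {1, 4} and copyNum = 2, sub-tensors 1, 2, 4, 5 are copied; the function
   name below is hypothetical. */
XTensor CopyRowPairs(const XTensor & s, const XTensor & srcIndex, const XTensor & tgtIndex)
{
    return CopyIndexed(s, 0, srcIndex, tgtIndex, 2);
}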
/*
copy indexed sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
>> s - the source tensor >> s - the source tensor
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-30
*/
#include "CopyIndexed.cuh"
#include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
copy selected sub-tensors where indices are kept in tensors (kernel version)
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
__global__
void KernelCopyIndexed(DTYPE * sData, DTYPE * tData, int * sIndex, int * tIndex,
int blockNum, int blockSizeSrc, int blockSizeTgt,
int stride, int indexSize, int copyNum)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum * indexSize * copyNum || offset >= stride)
return;
int realIndexSize = indexSize * copyNum;
int realBlockNum = i / realIndexSize;
int realIndex = i % realIndexSize;
int realSrcIndex = sIndex[realIndex / copyNum] + realIndex % copyNum;
int realTgtIndex = tIndex[realIndex / copyNum] + realIndex % copyNum;
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + realBlockNum * blockSizeSrc + realSrcIndex * stride;
tp[threadIdx.x] = tData + realBlockNum * blockSizeTgt + realTgtIndex * stride;
}
__syncthreads();
DTYPE * s = sp[threadIdx.x];
DTYPE * t = tp[threadIdx.x];
t[offset] = s[offset];
}
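/* Standalone sketch (illustration only) of the index decomposition used by the kernel:
   thread i is split into a block id and an (index, copy) pair. For example, with
   indexSize = 2 and copyNum = 2, i = 5 maps to block 1 and srcIndex[0] + 1. */
#include <cstdio>

static void DecomposeThreadIndex(int i, int indexSize, int copyNum)
{
    int realIndexSize = indexSize * copyNum;
    int realBlockNum  = i / realIndexSize;    /* which leading-dimension block       */
    int realIndex     = i % realIndexSize;    /* which (index, copy) pair            */
    int whichIndex    = realIndex / copyNum;  /* position inside srcIndex / tgtIndex */
    int whichCopy     = realIndex % copyNum;  /* offset added to the selected index  */
    printf("i = %d -> block %d, srcIndex[%d] + %d\n", i, realBlockNum, whichIndex, whichCopy);
}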
/*
copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
{
int devID = s->devID;
int order = s->order;
int indexSize = srcIndex->unitNum;
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum * indexSize * copyNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data;
int * tIndex = (int *)tgtIndex->data;
KernelCopyIndexed<<<blocks, threads >>>(sData, tData, sIndex, tIndex,
blockNum, blockSizeSrc, blockSizeTgt,
stride, indexSize, copyNum);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-30
 * Tomorrow is the celebration of the laboratory; I'm so happy!
*/
#ifndef __CopyIndexed_CUH__
#define __CopyIndexed_CUH__
#include "../../XTensor.h"
#include "CopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy selected sub-tensors where indices are kept in tensors (cuda version) */
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __CopyIndexed_CUH__
\ No newline at end of file
...@@ -27,22 +27,27 @@ ...@@ -27,22 +27,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */ /* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum = 1);
/* copy selected sub-tensors where indices are kept in tensors */ /* copy selected sub-tensors where indices are kept in tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex); void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum = 1);
/* /*
copy selected sub-tensors (return an XTensor structure) copy selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???) make a new tensor to keep the result and return it (remove this???)
*/ */
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* /*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure) copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???) make a new tensor to keep the result and return it
*/ */
void CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex); XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum = 1);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -57,15 +57,14 @@ gather indexed sub-tensors ...@@ -57,15 +57,14 @@ gather indexed sub-tensors
>> t - the target tensor >> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
*/ */
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex) void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID && t->devID == srcIndex->devID), CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
"the data must be kept on the same device!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0 && srcIndex->devID >= 0) { if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex); _CudaGather(s, t, srcIndex);
return; return;
} }
...@@ -116,6 +115,8 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -116,6 +115,8 @@ XTensor Gather(XTensor &s, XTensor &index)
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem); XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMPFlag(); t.SetTMPFlag();
delete[] dimSize;
_Gather(&s, &t, &index); _Gather(&s, &t, &index);
/* tensor connection */ /* tensor connection */
...@@ -137,4 +138,4 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -137,4 +138,4 @@ XTensor Gather(XTensor &s, XTensor &index)
} }
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
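/* CPU sketch (illustration only, not part of this file) of what _Gather computes for a
   2-D source whose rows have length `stride`: row i of the target is row srcIndex[i]
   of the source. The helper name is an illustrative assumption. */
static void GatherRef(const float * sData, float * tData, const int * sIndex,
                      int indexSize, int stride)
{
    for (int i = 0; i < indexSize; i++)
        for (int k = 0; k < stride; k++)
            tData[i * stride + k] = sData[sIndex[i] * stride + k];
}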
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-27
*/ */
#include "Gather.cuh" #include "Gather.cuh"
#include "CopyBlocksSelected.cuh" #include "CopyBlocksSelected.cuh"
...@@ -41,7 +41,7 @@ __global__ ...@@ -41,7 +41,7 @@ __global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride) void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
{ {
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */ /* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -54,15 +54,15 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int ...@@ -54,15 +54,15 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
if(threadIdx.y == 0){ if(threadIdx.y == 0){
sp[threadIdx.x] = sData + sIndex[i] * stride; sp[threadIdx.x] = sData + sIndex[i] * stride;
cp[threadIdx.x] = tData + i * stride; tp[threadIdx.x] = tData + i * stride;
} }
__syncthreads(); __syncthreads();
DTYPE * s = sp[threadIdx.x]; DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x]; DTYPE * t = tp[threadIdx.x];
c[offset] = s[offset]; t[offset] = s[offset];
} }
/* /*
...@@ -72,9 +72,10 @@ gather indexed sub-tensors(cuda version) ...@@ -72,9 +72,10 @@ gather indexed sub-tensors(cuda version)
>> t - the target tensor >> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
*/ */
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex) void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{ {
int devID = s->devID; int devID = s->devID;
XMem * mem = s->mem;
int stride = s->GetDim(1); int stride = s->GetDim(1);
int indexSize = srcIndex->unitNum; int indexSize = srcIndex->unitNum;
...@@ -93,10 +94,26 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -93,10 +94,26 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
DTYPE * sData = (DTYPE*)s->data; DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data; DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data; int * sIndex = NULL;
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
sIndex = (int *)srcIndex->data;
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride); KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
else
XMemFree(devID, sIndex);
}
BacktoCudaDev(devID, devIDBackup); BacktoCudaDev(devID, devIDBackup);
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-27
*/ */
#ifndef __GATHER_CUH__ #ifndef __GATHER_CUH__
#define __GATHER_CUH__ #define __GATHER_CUH__
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* gather indexed sub-tensors(cuda version) */ /* gather indexed sub-tensors(cuda version) */
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex); void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize); void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */ /* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex); void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors (return an XTensor structure) /* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
......
...@@ -134,63 +134,92 @@ void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum, ...@@ -134,63 +134,92 @@ void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum,
/* /*
spread a collection tensor to source tensor. spread a collection tensor to source tensor.
And this is a special spread function for backward computation of gather function. And this is a special spread function for backward computation of CopyIndexed function.
>> source - the source tensor whose data would be modified >> s - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor >> c - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors" >> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2, e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2) we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors >> srcIndex - the tensor to save the index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex) >> collIndex - the tensor to save the index of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim, void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
int * srcIndex, int indexSize) XTensor * srcIndex, XTensor * collIndex,
int copyNum)
{ {
int order = source->order; int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(indexSize != 0, "NULL index!");
CheckNTErrors((s && c), "Invalid tensors!");
CheckNTErrors((srcIndex && collIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == c->devID || (s->devID < 0 && c->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == collIndex->devID || (srcIndex->devID < 0 && collIndex->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && c->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < s->order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == c->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == collIndex->unitNum), "Unmatched index tensors!");
CheckNTErrors(s->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!"); CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
for (int i = 0; i < order; i++){ for (int i = 0; i < order; i++) {
if (i == dim) { if (i != dim) {
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!"); CheckNTErrors(s->GetDim(i) == c->GetDim(i), "Unmatched dimensions");
} }
else { else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!"); CheckNTErrors(c->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
} }
} }
#ifdef USE_CUDA #ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) { if(s->devID >= 0 && c->devID >= 0) {
_CudaSpreadForGather(source, collection, dim, srcIndex, indexSize); _CudaSpreadForCopyIndexed(s, c, dim, srcIndex, collIndex, copyNum);
return; return;
} }
#endif #endif
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1; int blockNum = 1;
int stride = 1; int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = dim + 1; i < order; i++) { for (int i = 0; i < dim; i++)
stride *= source->GetDim(i); blockNum *= s->GetDim(i);
}
blockSizeSrc = stride * source->GetDim(dim); for (int i = dim + 1; i < order; i++)
blockSizeColl = stride * collection->GetDim(dim); stride *= s->GetDim(i);
blockNum = source->unitNum / blockSizeSrc;
DTYPE * sData = (DTYPE*)source->data; blockSizeSrc = stride * s->GetDim(dim);
DTYPE * cData = (DTYPE*)collection->data; blockSizeTgt = stride * c->GetDim(dim);
for(int i = 0; i < indexSize; i++){ DTYPE * sData = (DTYPE*)s->data;
int src = srcIndex[i]; DTYPE * cData = (DTYPE*)c->data;
int tgt = i; int * sIndex = (int*)srcIndex->data;
DTYPE * s = sData + src * stride; int * cIndex = (int*)collIndex->data;
DTYPE * c = cData + tgt * stride;
_AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride); for (int i = 0; i < indexSize; i++) {
for (int c = 0; c < copyNum; c++) {
int si = sIndex[i] + c;
int ti = cIndex[i] + c;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = cData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(sd + k) += *(td + k);
}
}
} }
} }
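/* Toy example (illustration only, not part of this file) of why the backward spread
   uses "+=": when srcIndex refers to the same source sub-tensor more than once, the
   incoming contributions must accumulate. Here source row 1 receives both collection rows. */
#include <cstdio>

static void SpreadForCopyIndexedToy()
{
    float s[3][2] = { {0,0}, {0,0}, {0,0} };   /* spread target, e.g. a gradient buffer */
    float c[2][2] = { {1,1}, {2,2} };          /* collection to be spread back          */
    int srcIndex[2]  = { 1, 1 };               /* the same source row selected twice    */
    int collIndex[2] = { 0, 1 };

    for (int i = 0; i < 2; i++)
        for (int k = 0; k < 2; k++)
            s[srcIndex[i]][k] += c[collIndex[i]][k];

    printf("%g\n", s[1][0]);                   /* prints "3": contributions add up      */
}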
/* /*
...@@ -218,7 +247,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) ...@@ -218,7 +247,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
} }
#ifdef USE_CUDA #ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0 && index->devID >= 0) { if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, index); _CudaSpreadForGather(source, collection, index);
return; return;
} }
...@@ -241,4 +270,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) ...@@ -241,4 +270,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
} }
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -19,13 +19,11 @@ ...@@ -19,13 +19,11 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/ */
#ifndef __SPREAD_CUH__
#define __SPREAD_CUH__
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Spread.cuh" #include "Spread.cuh"
#include "CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -60,7 +58,6 @@ void KernelSpread(DTYPE * sData, DTYPE * cData, int blockNum, ...@@ -60,7 +58,6 @@ void KernelSpread(DTYPE * sData, DTYPE * cData, int blockNum,
s[j] = c[j]; s[j] = c[j];
} }
/* /*
This is core assignment for spread function. This is core assignment for spread function.
...@@ -76,9 +73,9 @@ This is core assignment for spread function. ...@@ -76,9 +73,9 @@ This is core assignment for spread function.
*/ */
__global__ __global__
void KernelSpreadFuzed(DTYPE * sData, DTYPE * cData, int blockNum, void KernelSpreadFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride, int blockSizeSrc, int blockSizeColl, int stride,
int subtensorNum, int subtensorNum,
int * srcIndex, int * colIndex) int * srcIndex, int * colIndex)
{ {
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
...@@ -189,7 +186,7 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim, ...@@ -189,7 +186,7 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
XMemCopy(ci, mem->devID, collIndex, -1, sizeof(int) * indexSize); XMemCopy(ci, mem->devID, collIndex, -1, sizeof(int) * indexSize);
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci); stride, indexSize, si, ci);
if(mem != NULL) if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2); mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
...@@ -200,54 +197,25 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim, ...@@ -200,54 +197,25 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
BacktoCudaDev(source->devID, devIDBackup); BacktoCudaDev(source->devID, devIDBackup);
} }
/* /*
This is core assignment for backward computation of gather function. spread a collection tensor to source tensor (kernel version).
Care of the operator "+=" instead of "=". And this is a special spread function for backward computation of CopyIndexed function.
>> sData - the data pointer of the source tensor >> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor >> cData - the data pointer of collection tensor
>> sIndex - index of the source sub-tensor
>> cIndex - index of the sub-tensor in the collection tensor
>> blockNum - number of data blocks >> blockNum - number of data blocks
>> blockSizeSrc - size of source data block >> blockSizeSrc - size of source data block
>> blockSizeColl - size of source data block >> blockSizeColl - size of source data block
>> stride - stride of a data block >> stride - stride of a data block
>> indexSize - number of indexs
>> copyNum - number of the sub-tensors we copy for each source index
*/ */
__global__ __global__
void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int blockNum, void KernelSpreadForCopyIndexed(DTYPE * sData, DTYPE * cData, int * sIndex, int * cIndex,
int blockSizeSrc, int blockSizeColl, int stride) int blockNum, int blockSizeSrc, int blockSizeColl,
{ int stride, int indexSize, int copyNum)
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum || j >= stride)
return;
DTYPE * s = sData + blockSizeSrc * i;
DTYPE * c = cData + blockSizeColl * i;
s[j] += c[j];
}
/*
This is core assignment for backward computation of gather function.
Care of the operator "+=" instead of "=".
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of source data block
>> stride - stride of a data block
>> subtensorNum - number of sub-tensors
>> srcIndex - index of the source sub-tensor
*/
__global__
void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride,
int subtensorNum,
int * srcIndex)
{ {
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
...@@ -258,105 +226,140 @@ void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum, ...@@ -258,105 +226,140 @@ void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
/* offset in each block */ /* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y; int offset = blockDim.y * blockIdx.y + threadIdx.y;
int blockId = i % blockNum; int realIndexSize = indexSize * copyNum;
int subtensorId = i / blockNum;
int realBlockNum = i / realIndexSize;
if(subtensorId >= subtensorNum || offset >= stride) int tmp = i % realIndexSize;
int realIndex = tmp / copyNum;
int realCopyNum = tmp % copyNum;
if (realBlockNum >= blockNum || offset >= stride || realIndex >= indexSize || realCopyNum >= copyNum)
return; return;
//if(i >= blockNum * indexSize * copyNum || offset >= stride)
// return;
int realSrcIndex = sIndex[realIndex] + realCopyNum;
int realCollIndex = cIndex[realIndex] + realCopyNum;
//int realSrcIndex = sIndex[realIndex / copyNum] + realIndex % copyNum;
//int realCollIndex = cIndex[realIndex / copyNum] + realIndex % copyNum;
if(threadIdx.y == 0){ if(threadIdx.y == 0){
sp[threadIdx.x] = sData + srcIndex[subtensorId] * stride; sp[threadIdx.x] = sData + realBlockNum * blockSizeSrc + realSrcIndex * stride;
cp[threadIdx.x] = cData + subtensorId * stride; cp[threadIdx.x] = cData + realBlockNum * blockSizeColl + realCollIndex * stride;
} }
__syncthreads(); __syncthreads();
DTYPE * s = sp[threadIdx.x] + blockSizeSrc * blockId; DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x] + blockSizeColl * blockId; DTYPE * c = cp[threadIdx.x];
atomicAdd(s + offset, c[offset]);
s[offset] += c[offset];
} }
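
For reference, here is a minimal CPU-only sketch of the thread-to-data mapping that KernelSpreadForCopyIndexed above performs: each flat work item i is decomposed into a (block, index, copy) triple, and the corresponding stride-sized row of the collection is added into the source. The helper name spreadForCopyIndexedCPU and the toy sizes are illustrative only, not part of the library.

#include <cstdio>

/* hypothetical CPU-only helper: every flat work item i covers one
   (block, index, copy) triple and adds one stride-sized row of the
   collection into the source, mirroring the kernel's index arithmetic */
void spreadForCopyIndexedCPU(float * sData, const float * cData,
                             const int * sIndex, const int * cIndex,
                             int blockNum, int blockSizeSrc, int blockSizeColl,
                             int stride, int indexSize, int copyNum)
{
    int workNum = blockNum * indexSize * copyNum;
    for (int i = 0; i < workNum; i++) {
        int realBlockNum = i / (indexSize * copyNum);
        int tmp          = i % (indexSize * copyNum);
        int realIndex    = tmp / copyNum;
        int realCopyNum  = tmp % copyNum;
        float * s       = sData + realBlockNum * blockSizeSrc  + (sIndex[realIndex] + realCopyNum) * stride;
        const float * c = cData + realBlockNum * blockSizeColl + (cIndex[realIndex] + realCopyNum) * stride;
        for (int offset = 0; offset < stride; offset++)
            s[offset] += c[offset];    /* "+=": duplicated indices must accumulate */
    }
}

int main()
{
    float s[9] = {0};                       /* source: 1 block, 3 rows, stride 3   */
    float c[6] = {1, 2, 3, 4, 5, 6};        /* collection: 1 block, 2 rows         */
    int sIndex[1] = {1};                    /* spread into source rows 1 and 2 ... */
    int cIndex[1] = {0};                    /* ... from collection rows 0 and 1    */
    spreadForCopyIndexedCPU(s, c, sIndex, cIndex, 1, 9, 6, 3, 1, 2);
    for (int i = 0; i < 9; i++)
        printf("%g ", s[i]);                /* prints 0 0 0 1 2 3 4 5 6 */
    printf("\n");
    return 0;
}
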
/* /*
spread a collection tensor to source tensor (cuda version). spread a collection tensor to source tensor.
And this is a special spread function for backward computation of gather function. And this is a special spread function for backward computation of CopyIndexed function.
>> source - the source tensor whose data would be modified >> s - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor >> c - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors" >> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2, e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2) we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors >> srcIndex - the tensor that keeps the indices of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex) >> collIndex - the tensor that keeps the indices of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim, void _CudaSpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
int * srcIndex, int indexSize) XTensor * srcIndex, XTensor * collIndex,
int copyNum)
{ {
int order = source->order; int devID = s->devID;
int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1; int blockNum = 1;
int stride = 1; int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = dim + 1; i < order; i++) for (int i = 0; i < dim; i++)
stride *= source->GetDim(i); blockNum *= s->GetDim(i);
blockSizeSrc = stride * source->GetDim(dim); for (int i = dim + 1; i < order; i++)
blockSizeColl = stride * collection->GetDim(dim); stride *= s->GetDim(i);
blockNum = source->unitNum / blockSizeSrc;
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * c->GetDim(dim);
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
int devIDBackup; int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup); ProtectCudaDev(devID, devIDBackup);
if(indexSize < 4){ GDevs.GetCudaThread2D(devID, blockNum * indexSize * copyNum, stride, MAX_INT, cudaGrids, cudaBlocks);
GDevs.GetCudaThread2D(source->devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]); dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]); dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
for(int i = 0; i < indexSize; i++) {
int src = srcIndex[i];
int tgt = i;
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
KernelSpreadForGather<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride); DTYPE * sData = (DTYPE*)s->data;
} DTYPE * cData = (DTYPE*)c->data;
}
else{
XMem * mem = source->mem;
int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(source->devID, sizeof(int) * indexSize);
XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize); int * sIndex = (int *)srcIndex->data;
int * cIndex = (int *)collIndex->data;
DTYPE * s = (DTYPE*)source->data; KernelSpreadForCopyIndexed<<<blocks, threads >>>(sData, cData, sIndex, cIndex,
DTYPE * c = (DTYPE*)collection->data; blockNum, blockSizeSrc, blockSizeTgt,
stride, indexSize, copyNum);
GDevs.GetCudaThread2D(source->devID, blockNum * indexSize, stride, MAX_INT, cudaGrids, cudaBlocks); BacktoCudaDev(devID, devIDBackup);
}
dim3 blocks(cudaGrids[0], cudaGrids[1]); /*
dim3 threads(cudaBlocks[0], cudaBlocks[1]); This is core assignment for backward computation of gather function.
Note that the operator is "+=" instead of "=".
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> srcIndex - index of the source sub-tensor
>> indexSize - the number of indices
>> stride - stride of a data block
*/
__global__
void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int * srcIndex,
int indexSize, int stride)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si); /* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (mem != NULL) { /* offset in each block */
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize); int offset = blockDim.y * blockIdx.y + threadIdx.y;
}
else { if(i >= indexSize || offset >= stride)
XMemFree(source->devID, si); return;
}
if (threadIdx.y == 0) {
sp[threadIdx.x] = sData + srcIndex[i] * stride;
cp[threadIdx.x] = cData + i * stride;
} }
__syncthreads();
DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x];
//DTYPE * s = sData + srcIndex[i] * stride;
//DTYPE * c = cData + i * stride;
atomicAdd(s + offset, c[offset]);
} }
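
The comment above insists on "+=" rather than "=" (and the kernel uses atomicAdd) because srcIndex may contain the same source row more than once, in which case the gradients of all matching collection rows have to be summed into that single row. A hedged CPU-only illustration follows; spreadForGatherCPU is a made-up name for this sketch, not a library function.

#include <cstdio>

/* hypothetical CPU-only sketch: accumulate each collection row into the
   source row selected by srcIndex; "=" instead of "+=" would silently
   drop the contribution of a duplicated index */
void spreadForGatherCPU(float * sData, const float * cData,
                        const int * srcIndex, int indexSize, int stride)
{
    for (int i = 0; i < indexSize; i++) {
        float * s       = sData + srcIndex[i] * stride;
        const float * c = cData + i * stride;
        for (int offset = 0; offset < stride; offset++)
            s[offset] += c[offset];
    }
}

int main()
{
    float source[4]     = {0, 0, 0, 0};          /* 2 rows, stride 2         */
    float collection[6] = {1, 1, 2, 2, 3, 3};    /* 3 gathered rows          */
    int srcIndex[3]     = {0, 1, 0};             /* row 0 was gathered twice */
    spreadForGatherCPU(source, collection, srcIndex, 3, 2);
    printf("%g %g | %g %g\n", source[0], source[1], source[2], source[3]);
    /* prints "4 4 | 2 2": both gradients for row 0 are summed */
    return 0;
}
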
/* /*
...@@ -369,40 +372,48 @@ And this is a special spread function for backward computation of gather functio ...@@ -369,40 +372,48 @@ And this is a special spread function for backward computation of gather functio
*/ */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex) void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex)
{ {
int dim = 0;
int devID = source->devID; int devID = source->devID;
XMem * mem = source->mem;
int blockNum = 1;
int stride = source->GetDim(1); int stride = source->GetDim(1);
int indexSize = srcIndex->unitNum; int indexSize = srcIndex->unitNum;
int blockSizeSrc = stride * source->GetDim(dim);
int blockSizeColl = stride * collection->GetDim(dim);
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
int devIDBackup; int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup); ProtectCudaDev(source->devID, devIDBackup);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
int * sIndex = NULL;
GDevs.GetCudaThread2D(devID, indexSize, stride, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(devID, indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]); dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]); dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * s = (DTYPE*)source->data; if (srcIndex->devID < 0) {
DTYPE * c = (DTYPE*)collection->data; sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
sIndex = (int *)srcIndex->data;
int * si = (int *)srcIndex->data; KernelSpreadForGather<<<blocks, threads >>>(sData, cData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
else
XMemFree(devID, sIndex);
}
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si);
BacktoCudaDev(source->devID, devIDBackup); BacktoCudaDev(source->devID, devIDBackup);
} }
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
#endif // __SPREAD_CUH__
\ No newline at end of file
...@@ -32,9 +32,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,9 +32,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _CudaSpread(XTensor * source, XTensor * collection, int dim, void _CudaSpread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex); int * srcIndex, int indexSize, int * collIndex);
/* special spread function for backward computation of gather function (cuda version) */ /* special spread function for backward computation of CopyIndexed function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim, void _CudaSpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
int * srcIndex, int indexSize); XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function (cuda version) */ /* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex); void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex);
......
...@@ -36,9 +36,10 @@ void Spread(XTensor * source, XTensor * collection, ...@@ -36,9 +36,10 @@ void Spread(XTensor * source, XTensor * collection,
XTensor * srcIndex, XTensor * collIndex, XTensor * srcIndex, XTensor * collIndex,
int dim); int dim);
/* special spread function for backward computation of gather function */ /* special spread function for backward computation of CopyIndexed function */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim, void _SpreadForCopyIndexed(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize); XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function */ /* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index); void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index);
......
...@@ -46,6 +46,22 @@ sum all the items of the tensor (It should be optimized!) ...@@ -46,6 +46,22 @@ sum all the items of the tensor (It should be optimized!)
*/ */
DTYPE _ReduceSumAll(const XTensor * source) DTYPE _ReduceSumAll(const XTensor * source)
{ {
int dims[2] = {1, source->unitNum};
int one = 1;
XTensor * all = NewTensorBuf(2, dims, source->dataType, source->denseRatio, source->devID, source->mem);
XTensor * result = NewTensorBuf(1, &one, source->dataType, 1.0F, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, result, 1);
DTYPE r = result->Get1D(0);
DelTensorBuf(result);
DelTensorBuf(all);
return r;
int order = source->order; int order = source->order;
DTYPE summation; DTYPE summation;
...@@ -60,7 +76,7 @@ DTYPE _ReduceSumAll(const XTensor * source) ...@@ -60,7 +76,7 @@ DTYPE _ReduceSumAll(const XTensor * source)
dimSize = getDimSize(big, leadingDim); dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem); source->devID, source->mem);
_ReduceSum(big, little, leadingDim); _ReduceSum(big, little, leadingDim);
delete big; delete big;
......
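
The rewritten _ReduceSumAll above copies the tensor into a (1, unitNum) buffer and calls _ReduceSum once along dimension 1, so the early return short-circuits the old dimension-by-dimension loop kept below it. A minimal CPU-only sketch of this flatten-then-reduce idea, using plain std::accumulate rather than the library API:

#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
    /* a 2 x 3 tensor, viewed as one flat (1, 6) buffer */
    std::vector<float> data = {1, 2, 3, 4, 5, 6};

    /* reducing the flattened buffer in a single pass gives the sum of all items */
    float sum = std::accumulate(data.begin(), data.end(), 0.0F);
    printf("sum = %g\n", sum);    /* prints "sum = 21" */
    return 0;
}
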
...@@ -94,7 +94,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -94,7 +94,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
gridSize = blockNum; gridSize = blockNum;
gridNum = s->unitNum / (blockSize * blockNum); gridNum = s->unitNum / (blockSize * blockNum);
if (mergedNum * gridNum <= MIN_TENSOR_SPLIT_NUM) { if (mergedNum * gridNum <= MIN_TENSOR_MERGE_NUM) {
int sPitch = blockSize * s->unitSize; int sPitch = blockSize * s->unitSize;
int tPtich = blockSize * mergedNum * t->unitSize; int tPtich = blockSize * mergedNum * t->unitSize;
int mSize = blockSize * t->unitSize; int mSize = blockSize * t->unitSize;
...@@ -253,7 +253,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge) ...@@ -253,7 +253,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
gridNum = s0->unitNum / (blockSize * blockNum); gridNum = s0->unitNum / (blockSize * blockNum);
/* merging with fewer data copy operations */ /* merging with fewer data copy operations */
if (mergedNum * gridNum <= MIN_TENSOR_SPLIT_LIST_NUM) { if (mergedNum * gridNum <= MIN_TENSOR_MERGE_LIST_NUM) {
int sPitch = blockSize * s0->unitSize; int sPitch = blockSize * s0->unitSize;
int tPtich = blockSize * mergedNum * big->unitSize; int tPtich = blockSize * mergedNum * big->unitSize;
int mSize = blockSize * big->unitSize; int mSize = blockSize * big->unitSize;
......
...@@ -126,7 +126,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -126,7 +126,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data; void * dataTMP = t->data;
if (!isOnSameDevice) if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size); dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
int realBlockSize = blockSize * t->unitSize; int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum; int blockSplitSize = blockNum / splitNum;
......
...@@ -153,7 +153,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) ...@@ -153,7 +153,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
int unitNum = x.dimSize[n]; int unitNum = x.dimSize[n];
DTYPE * maskArray = new DTYPE[unitNum]; DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL)); //srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++) for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor); maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
...@@ -166,4 +166,33 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) ...@@ -166,4 +166,33 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
return MultiplyDim(x, mask, n, 0); return MultiplyDim(x, mask, n, 0);
} }
/*
dropout function without broadcast
>> x - input tensor
>> dropProb - probability to set an element to zero
*/
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
{
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor mask;
InitTensor(&mask, x.order, x.dimSize, x.dataType, x.denseRatio, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
return Multiply(x, mask);
}
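
DropoutWithoutBroadcast above builds a full-size Bernoulli mask whose kept entries carry the scale factor 1/(1 - dropProb), so that in expectation the output matches the input (the usual inverted-dropout trick). A small standalone sketch of the same mask-and-scale idea, independent of the XTensor API; the function name and the fixed seed are illustrative only.

#include <cstdio>
#include <cstdlib>
#include <vector>

/* hypothetical sketch of inverted dropout on a flat buffer: zero an element
   with probability dropProb, otherwise scale it by 1 / (1 - dropProb) so
   that the expectation is unchanged */
std::vector<float> dropoutSketch(const std::vector<float> & x, float dropProb)
{
    float scale = 1.0F / (1.0F - dropProb);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); i++) {
        float r = (float)rand() / RAND_MAX;
        y[i] = (r < dropProb) ? 0.0F : x[i] * scale;
    }
    return y;
}

int main()
{
    srand(42);    /* fixed seed only to make the sketch reproducible */
    std::vector<float> x = {1, 2, 3, 4};
    std::vector<float> y = dropoutSketch(x, 0.5F);
    for (float v : y)
        printf("%g ", v);    /* surviving elements are doubled, the rest are 0 */
    printf("\n");
    return 0;
}
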
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -42,6 +42,9 @@ void _DropoutBackward(const XTensor * y, const XTensor * x, ...@@ -42,6 +42,9 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
/* dropout function */ /* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1); XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -373,9 +373,9 @@ better numerical stability. ...@@ -373,9 +373,9 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction) >> leadDim - leading dimension (along which we perform reduction)
*/ */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
leadDim = leadDim < 0 ? y->order - 1 : leadDim; leadDim = leadDim < 0 ? y->order - 1 : leadDim;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/ */
#ifndef __TEST_ABSOLUTE_H__ #ifndef __TEST_ABSOLUTE_H__
#define __TEST_ABSOLUTE_H__ #define __TEST_ABSOLUTE_H__
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
#include "../XTensor.h" #include "../XTensor.h"
#include "../core/math/Clip.h"
#include "TClip.h" #include "TClip.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
#ifndef __TEST_CLIP_H__ #ifndef __TEST_CLIP_H__
#define __TEST_CLIP_H__ #define __TEST_CLIP_H__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#include "../XTensor.h"
#include "../core/math/Compare.h"
#include "TCompare.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Equal function.
Compare whether every entry is equal to the specified value.
*/
bool TestCompare1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, 1.0F} };
DTYPE answer[3][2] = { {1.0F, 0.0F},
{0.0F, 0.0F},
{0.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Equal function */
_Equal(a, b, 1.0);
_EqualMe(aMe, 1.0);
bUser = Equal(*a, 1.0);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Equal function */
_Equal(aGPU, bGPU, 1.0);
_EqualMe(aMeGPU, 1.0);
bUserGPU = Equal(*aGPU, 1.0);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Compare Function */
bool TestCompare()
{
XPRINT(0, stdout, "[TEST Compare] compare every entry with specified value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestCompare1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __TEST_Compare_H__
#define __TEST_Compare_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Compare Function */
bool TestCompare();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_Compare_H__
...@@ -232,13 +232,12 @@ bool TestConvertDataType3() ...@@ -232,13 +232,12 @@ bool TestConvertDataType3()
/* initialize variables */ /* initialize variables */
a->SetData(data1, unitNum1); a->SetData(data1, unitNum1);
/* call ConvertDataType function */ /* call ConvertDataType function (We have not implemented this yet...) */
//_ConvertDataType(a, b); //_ConvertDataType(a, b);
//_ConvertDataType(b, c); //_ConvertDataType(b, c);
/* check results */ /* check results */
cpuTest = a->CheckData(data1, unitNum1, 1e-4F); //cpuTest = a->CheckData(data1, unitNum1, 1e-4F);
c->Dump(stderr, "");
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
......
...@@ -51,6 +51,15 @@ bool TestCopyIndexed1() ...@@ -51,6 +51,15 @@ bool TestCopyIndexed1()
int tUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
...@@ -76,19 +85,28 @@ bool TestCopyIndexed1() ...@@ -76,19 +85,28 @@ bool TestCopyIndexed1()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -96,35 +114,55 @@ bool TestCopyIndexed1() ...@@ -96,35 +114,55 @@ bool TestCopyIndexed1()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -159,6 +197,15 @@ bool TestCopyIndexed2() ...@@ -159,6 +197,15 @@ bool TestCopyIndexed2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F}, { {1.0F, 2.0F, 4.0F},
...@@ -183,19 +230,28 @@ bool TestCopyIndexed2() ...@@ -183,19 +230,28 @@ bool TestCopyIndexed2()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -203,35 +259,55 @@ bool TestCopyIndexed2() ...@@ -203,35 +259,55 @@ bool TestCopyIndexed2()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -265,6 +341,15 @@ bool TestCopyIndexed3() ...@@ -265,6 +341,15 @@ bool TestCopyIndexed3()
int tUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (1) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 1;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
...@@ -290,19 +375,28 @@ bool TestCopyIndexed3() ...@@ -290,19 +375,28 @@ bool TestCopyIndexed3()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -310,35 +404,55 @@ bool TestCopyIndexed3() ...@@ -310,35 +404,55 @@ bool TestCopyIndexed3()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -374,15 +488,13 @@ bool TestCopyIndexed4() ...@@ -374,15 +488,13 @@ bool TestCopyIndexed4()
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* a index tensor of size(2) */ /* an index tensor of size (2) */
int iOrder = 3; int indexOrder = 1;
int * iDimSize = new int[iOrder]; int * indexDimSize = new int[indexOrder];
iDimSize[0] = 3; indexDimSize[0] = 2;
iDimSize[1] = 2;
iDimSize[2] = 2;
int iUnitNum = 1; int indexUnitNum = 1;
for (int i = 0; i < iOrder; i++) for (int i = 0; i < indexOrder; i++)
iUnitNum *= iDimSize[i]; indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
...@@ -408,21 +520,28 @@ bool TestCopyIndexed4() ...@@ -408,21 +520,28 @@ bool TestCopyIndexed4()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(tOrder, tDimSize, X_INT); XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
index->SetData(srcIndex, iUnitNum); t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, (int*)index->data, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, (int*)index->data, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -430,42 +549,60 @@ bool TestCopyIndexed4() ...@@ -430,42 +549,60 @@ bool TestCopyIndexed4()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, (int*)index->data, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete index; delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
/* /*
case 5: copy indexed sub-tensors case 5: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 1, In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 1,
...@@ -494,6 +631,15 @@ bool TestCopyIndexed5() ...@@ -494,6 +631,15 @@ bool TestCopyIndexed5()
int tUnitNum = 1; int tUnitNum = 1;
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} }, {2.0F, 1.0F, 3.0F} },
...@@ -519,19 +665,28 @@ bool TestCopyIndexed5() ...@@ -519,19 +665,28 @@ bool TestCopyIndexed5()
/* create tensors */ /* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize); XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t1 = NewTensor(tOrder, tDimSize);
XTensor * t2 = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
t->SetZeroAll(); t1->SetZeroAll();
t2->SetZeroAll();
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
tUser = CopyIndexed(*s, dim, *sIndex, *tIndex, copyNum);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum); cpuTest = t1->CheckData(answer, tUnitNum) &&
t2->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -539,35 +694,55 @@ bool TestCopyIndexed5() ...@@ -539,35 +694,55 @@ bool TestCopyIndexed5()
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU1 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */ /* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, *sIndexGPU, *tIndexGPU, copyNum);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum); gpuTest = tGPU1->CheckData(answer, tUnitNum) &&
tGPU2->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU; delete sGPU;
delete tGPU; delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -311,8 +311,8 @@ bool TestCrossEntropy3() ...@@ -311,8 +311,8 @@ bool TestCrossEntropy3()
delete goldGPU; delete goldGPU;
delete lossGPU; delete lossGPU;
delete weightGPU; delete weightGPU;
delete[] dimSize; delete[] dimSize;
delete[] wDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -322,6 +322,7 @@ bool TestCrossEntropy3() ...@@ -322,6 +322,7 @@ bool TestCrossEntropy3()
delete loss; delete loss;
delete weight; delete weight;
delete[] dimSize; delete[] dimSize;
delete[] wDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -212,6 +212,8 @@ bool TestDropout2() ...@@ -212,6 +212,8 @@ bool TestDropout2()
/* destroy variables */ /* destroy variables */
delete x; delete x;
delete y; delete y;
delete dedx;
delete dedy;
delete[] dimSize; delete[] dimSize;
return cpuTest; return cpuTest;
......
...@@ -332,6 +332,7 @@ bool TestGather3() ...@@ -332,6 +332,7 @@ bool TestGather3()
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t;
delete index;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
......
...@@ -150,6 +150,7 @@ bool TestSetData2() ...@@ -150,6 +150,7 @@ bool TestSetData2()
delete sGPU; delete sGPU;
delete modifyGPU; delete modifyGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -157,6 +158,7 @@ bool TestSetData2() ...@@ -157,6 +158,7 @@ bool TestSetData2()
delete s; delete s;
delete modify; delete modify;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -242,6 +244,7 @@ bool TestSetData3() ...@@ -242,6 +244,7 @@ bool TestSetData3()
delete sGPU; delete sGPU;
delete modifyGPU; delete modifyGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -249,6 +252,7 @@ bool TestSetData3() ...@@ -249,6 +252,7 @@ bool TestSetData3()
delete s; delete s;
delete modify; delete modify;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -61,7 +61,9 @@ bool TestSort1() ...@@ -61,7 +61,9 @@ bool TestSort1()
_SortMe(aMe, index, 0); _SortMe(aMe, index, 0);
Sort(*a, bUser, *index, 0); Sort(*a, bUser, *index, 0);
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum); cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -85,7 +87,9 @@ bool TestSort1() ...@@ -85,7 +87,9 @@ bool TestSort1()
Sort(*aGPU, bUserGPU, *indexGPU, 0); Sort(*aGPU, bUserGPU, *indexGPU, 0);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum); gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a; delete a;
...@@ -149,7 +153,9 @@ bool TestSort2() ...@@ -149,7 +153,9 @@ bool TestSort2()
Sort(*a, bUser, *index, 1); Sort(*a, bUser, *index, 1);
/* check results */ /* check results */
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum); cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -173,7 +179,9 @@ bool TestSort2() ...@@ -173,7 +179,9 @@ bool TestSort2()
Sort(*aGPU, bUserGPU, *indexGPU, 1); Sort(*aGPU, bUserGPU, *indexGPU, 1);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum); gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a; delete a;
......
...@@ -357,6 +357,7 @@ bool TestSplit3() ...@@ -357,6 +357,7 @@ bool TestSplit3()
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize1; delete[] tDimSize1;
delete[] tDimSize2; delete[] tDimSize2;
delete tList;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -182,6 +182,7 @@ bool TestSpread2() ...@@ -182,6 +182,7 @@ bool TestSpread2()
int dim = 0; int dim = 0;
int indexSize = 2; int indexSize = 2;
int srcIndex[2] = {0, 2}; int srcIndex[2] = {0, 2};
int tgtIndex[2] = {0, 1};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -190,17 +191,19 @@ bool TestSpread2() ...@@ -190,17 +191,19 @@ bool TestSpread2()
XTensor * s1 = NewTensor(sOrder, sDimSize); XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize); XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT); XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * cIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */ /* initialize variables */
s1->SetData(sData, sUnitNum); s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum); s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum); t->SetData(tData, tUnitNum);
index->SetData(srcIndex, indexSize); sIndex->SetData(srcIndex, indexSize);
cIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */ /* call _SpreadForGather function */
_SpreadForGather(s1, t, dim, srcIndex, indexSize); _SpreadForCopyIndexed(s1, t, dim, sIndex, cIndex, 1);
_SpreadForGather(s2, t, index); _SpreadForGather(s2, t, sIndex);
/* check results */ /* check results */
cpuTest = s1->CheckData(answer, tUnitNum) && cpuTest = s1->CheckData(answer, tUnitNum) &&
...@@ -214,17 +217,19 @@ bool TestSpread2() ...@@ -214,17 +217,19 @@ bool TestSpread2()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0); XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * cIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
sGPU1->SetData(sData, sUnitNum); sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum); sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum); tGPU->SetData(tData, tUnitNum);
indexGPU->SetData(srcIndex, indexSize); sIndexGPU->SetData(srcIndex, indexSize);
cIndexGPU->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */ /* call _SpreadForGather function */
_SpreadForGather(sGPU1, tGPU, dim, srcIndex, indexSize); _SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndex, cIndex, 1);
_SpreadForGather(sGPU2, tGPU, indexGPU); _SpreadForGather(sGPU2, tGPU, sIndexGPU);
/* check results */ /* check results */
gpuTest = sGPU1->CheckData(answer, tUnitNum) && gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
...@@ -234,11 +239,13 @@ bool TestSpread2() ...@@ -234,11 +239,13 @@ bool TestSpread2()
delete s1; delete s1;
delete s2; delete s2;
delete t; delete t;
delete index; delete sIndex;
delete cIndex;
delete sGPU1; delete sGPU1;
delete sGPU2; delete sGPU2;
delete tGPU; delete tGPU;
delete indexGPU; delete sIndexGPU;
delete cIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
...@@ -249,6 +256,8 @@ bool TestSpread2() ...@@ -249,6 +256,8 @@ bool TestSpread2()
delete s1; delete s1;
delete s2; delete s2;
delete t; delete t;
delete sIndex;
delete cIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
......
...@@ -31,6 +31,7 @@ bool Test() ...@@ -31,6 +31,7 @@ bool Test()
wrong = !TestAbsolute() || wrong; wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong; wrong = !TestClip() || wrong;
wrong = !TestCompare() || wrong;
wrong = !TestConcatenate() || wrong; wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong; wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong; wrong = !TestCos() || wrong;
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "TAbsolute.h" #include "TAbsolute.h"
#include "TClip.h" #include "TClip.h"
#include "TCompare.h"
#include "TConcatenate.h" #include "TConcatenate.h"
#include "TConcatenateSolely.h" #include "TConcatenateSolely.h"
#include "TCos.h" #include "TCos.h"
......