Commit a79523f9 by liyinqiao

Merge with the xiaotong branch and add a mutex for operations on the memory pool.

parent 7d4bc44a
@@ -27,6 +27,7 @@
 #include "./tensor/test/Test.h"
 #include "./sample/fnnlm/FNNLM.h"
 #include "./sample/transformer/NMT.h"
+#include "./train/TTrain.h"
 
 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
@@ -38,8 +39,14 @@ using namespace nmt;
 int main( int argc, const char ** argv )
 {
-    if(argc > 1 && !strcmp(argv[1], "-test"))
+    XConfig config;
+    config.Create(argc - 1, argv + 1);
+    verboseLevel = config.GetInt("verbose", 1);
+
+    if (argc > 1 && !strcmp(argv[1], "-test"))
         Test();
+    else if (argc > 1 && !strcmp(argv[1], "-testtrain"))
+        TestTrain();
     else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
         FNNLMMain(argc - 1, argv + 1);
     else if(argc > 1 && !strcmp(argv[1], "-t2t"))
@@ -47,7 +54,8 @@ int main( int argc, const char ** argv )
     else{
         fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
         fprintf(stderr, "neural networks in an easy way. \n\n");
-        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
+        fprintf(stderr, " Run this program with \"-test\" for unit test!\n");
+        fprintf(stderr, "Or run this program with \"-testtrain\" for test of the trainer!\n");
         fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
         fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
     }
...
@@ -93,6 +93,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /* indicates whether the node is for an activation function */
...
@@ -89,6 +89,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /* indicates whether the node is for a loss computation */
...
@@ -105,12 +105,19 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
     if (!isEfficient || a->isGrad) {
         XNoder::MakeGrad(a);
 
+        if (a->mem != NULL)
+            a->mem->LockBuf();
         XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
         _ConvertDataType(node->grad, tmp);
         _SumMe(a->grad, tmp);
 
         DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
     }
 
+    node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
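The LockBuf/UnlockBuf bracketing introduced above is repeated verbatim around every NewTensorBufV2/DelTensorBuf pair in the hunks that follow (and, via GMems.GetMem(...)->LockBuf(), in Model::MakeMTMask and Trainer::Update further below). A minimal sketch of an exception-safe alternative, assuming only the XMem::LockBuf/UnlockBuf calls used in this commit — the guard class itself is hypothetical and not part of the change:

```cpp
/* Hypothetical RAII guard (not part of this commit): acquires the memory-pool
   buffer lock on construction and releases it on scope exit, mirroring the
   "if (mem != NULL) mem->LockBuf(); ... if (mem != NULL) mem->UnlockBuf();"
   pattern used throughout the hunks below. */
struct BufLockGuard {
    XMem * mem;

    explicit BufLockGuard(XMem * m) : mem(m) {
        if (mem != NULL)
            mem->LockBuf();
    }
    ~BufLockGuard() {
        if (mem != NULL)
            mem->UnlockBuf();
    }

    /* non-copyable: the lock must be released exactly once */
    BufLockGuard(const BufLockGuard &) = delete;
    BufLockGuard & operator=(const BufLockGuard &) = delete;
};
```

With such a guard, each hunk would shrink to a single `BufLockGuard guard(input->mem);` before the `NewTensorBufV2` call, and an early return between allocation and release could no longer leave the pool locked.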
@@ -138,12 +145,19 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
 
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
         _SumMe(input->grad, tmp);
 
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
 
+    node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
@@ -167,15 +181,20 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
 
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         tmp->SetZeroAll();
 
         _SpreadForGather(tmp, node->grad, index);
         _SumMe(input->grad, tmp);
 
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
@@ -193,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
 
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _CopyValues(node->grad, tmp);
@@ -205,9 +226,12 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
         _SumMe(input->grad, tmp);
 
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
@@ -246,13 +270,16 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
             dims[j++] = input->dimSize[i];
         }
     }
-    dims[0] = -dims[0];
+    dims[0] = -abs(dims[0]);
     XTensor gradInputSmall(input->order - leadDim, dims,
                            input->dataType, input->denseRatio,
                            input->devID, input->mem);
 
-    dims[whereToMerge - leadDim] *= dims[0];
-    XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
+    dims[whereToMerge - leadDim] *= abs(dims[0]);
+    int * dimsNode = dims + 1;
+    dimsNode[0] = -abs(dimsNode[0]);
+    XTensor gradNodeSmall(node->order - leadDim, dimsNode,
                           node->dataType, node->denseRatio,
                           node->devID, node->mem);
@@ -296,6 +323,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
@@ -379,6 +407,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
...
@@ -407,6 +436,7 @@ void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
@@ -442,16 +472,21 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
         /* if the tensor is used somewhere else, we need another SUM
            for gradient accumulation */
         else {
+            if (input->mem != NULL)
+                input->mem->LockBuf();
             XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
 
             _Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
             _Sum(input->grad, inputGradTMP, input->grad);
 
             DelTensorBuf(inputGradTMP);
+            if (input->mem != NULL)
+                input->mem->UnlockBuf();
         }
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
@@ -528,14 +563,21 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
            somewhere else, we need another SUM for gradient
            accumulation */
         else {
+            if (node->mem != NULL)
+                node->mem->LockBuf();
             XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
 
             _Merge(&splits, nodeGradTMP, whereToSplit + 1);
             _Sum(node->grad, nodeGradTMP, node->grad);
 
             DelTensorBuf(nodeGradTMP);
+            if (node->mem != NULL)
+                node->mem->UnlockBuf();
         }
     }
 
+    node->visitMark = NODE_DOING;
+    node->isGradFinished = true;
 }
 
 /*
@@ -566,14 +608,19 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
         CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
         CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
 
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _Transpose(output->grad, tmp, i, j);
         _Sum(input->grad, tmp, input->grad);
 
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
 /*
@@ -603,15 +650,20 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
 
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
 
         _ReduceSum(output->grad, tmp, dim);
         _Sum(input->grad, tmp, input->grad);
 
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
 
     node->visitMark = NODE_FINISHED;
+    node->isGradFinished = true;
 }
 
-}
\ No newline at end of file
+}
@@ -101,6 +101,7 @@ void XNet::Backward(TensorList &roots)
     for(int i = 0; i < nodes.count; i++){
        XTensor * node = (XTensor*)nodes.Get(i);
        node->visitMark = NODE_UNFINISHED;
+       node->isGradFinished = false;
     }
 
     /* back-propagation from output to input */
@@ -162,6 +163,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
     }
     else{
         node->visitMark = NODE_FINISHED;
+        node->isGradFinished = true;
     }
 }
...
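Every backward routine in this commit now sets `isGradFinished = true` next to `visitMark = NODE_FINISHED`, and `XNet::Backward` clears both flags before propagation starts. A minimal sketch of the invariant this pair of edits establishes (illustrative only, not part of the commit):

```cpp
/* Illustrative only: after XNet::Backward returns, a consumer of a node's
   gradient can assert completeness instead of relying on visitMark alone. */
void AssertGradReady(const XTensor * node)
{
    CheckNTErrors(node != NULL && node->isGradFinished,
                  "the gradient of this node has not been computed yet!");
}
```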
@@ -21,8 +21,8 @@
 #include "Decoder.h"
 #include "Utility.h"
-#include "module/LayerNorm.h"
-#include "module/CommonModules.h"
+#include "submodel/LayerNorm.h"
+#include "submodel/CommonModules.h"
 #include "../../tensor/core/CHeader.h"
 
 namespace nmt
...
@@ -21,8 +21,8 @@
 #include "Encoder.h"
 #include "Utility.h"
-#include "module/LayerNorm.h"
-#include "module/CommonModules.h"
+#include "submodel/LayerNorm.h"
+#include "submodel/CommonModules.h"
 #include "../../tensor/core/CHeader.h"
 
 namespace nmt
...
@@ -23,10 +23,10 @@
 #define __ENCODER_H__
 
 #include "Utility.h"
-#include "module/FNN.h"
-#include "module/Attention.h"
-#include "module/Embedding.h"
-#include "module/LayerNorm.h"
+#include "submodel/FNN.h"
+#include "submodel/Attention.h"
+#include "submodel/Embedding.h"
+#include "submodel/LayerNorm.h"
 #include "../../network/XNet.h"
 
 using namespace nts;
...
@@ -265,6 +265,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
     InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
 
+    GMems.GetMem(paddingEnc.devID)->LockBuf();
     XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
                                              paddingEnc.dataType, paddingEnc.devID);
     XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
@@ -275,6 +276,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
 
     DelTensorBuf(maskEncDecTMPDec);
     DelTensorBuf(maskEncDecTMPEnc);
+    GMems.GetMem(paddingEnc.devID)->UnlockBuf();
 
     /* padding on the source side */
     int* dimsPadding = new int[paddingEnc.order + 2];
@@ -283,6 +285,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
     dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
 
+    GMems.GetMem(paddingEnc.devID)->LockBuf();
     XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
                                      paddingEnc.devID);
@@ -309,6 +312,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
 
     DelTensorBuf(padding3);
     DelTensorBuf(padding2);
+    GMems.GetMem(paddingEnc.devID)->UnlockBuf();
 }
 
 /*
@@ -490,7 +494,7 @@ void Model::Read(FILE* file)
     TensorList params;
     GetParams(params);
-    LOG("params count: %lu", params.Size());
+    LOG("params count: %lu", (unsigned long)params.Size());
     int size = 0;
     for (int i = 0; i < params.Size(); i++) {
         size += params[i]->unitNum;
...
@@ -24,10 +24,10 @@
 #include "Encoder.h"
 #include "Decoder.h"
-#include "module/FNN.h"
-#include "module/Output.h"
+#include "submodel/FNN.h"
+#include "submodel/Output.h"
 #include "Utility.h"
-#include "module/Attention.h"
+#include "submodel/Attention.h"
 
 namespace nmt
 {
...
@@ -28,6 +28,7 @@
 #include "Utility.h"
 #include "../../tensor/XGlobal.h"
+#include "../../tensor/XConfig.h"
 
 using namespace nts;
 using namespace std;
@@ -91,9 +92,9 @@ Config::Config(int argc, const char** argv)
     LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
     isTraining = (strcmp(trainFN, "") == 0) ? false : true;
     LoadParamBool(argsNum, args, "mt", &isMT, true);
-    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
-    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
-    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
+    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3F);
+    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1F);
+    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1F);
 
     LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
     LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
@@ -106,7 +107,7 @@ Config::Config(int argc, const char** argv)
     LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
     LoadParamFloat(argc, args, "adamdelta", &adamDelta, 1e-9F);
     LoadParamBool(argc, args, "shuffled", &isShuffled, true);
-    LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
+    LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1F);
     LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
     LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
     LoadParamInt(argc, args, "updatestep", &updateStep, 1);
@@ -124,8 +125,8 @@ Config::Config(int argc, const char** argv)
     LoadParamString(argsNum, args, "output", outputFN, "");
     LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
     LoadParamBool(argsNum, args, "fp16", &useFP16, false);
-    LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
-    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
+    LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6F);
+    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2F);
 
     for (int i = 0; i < argc; i++)
         delete[] args[i];
@@ -157,90 +158,6 @@ int Config::LoadFromFile(const char* configFN, char** args) {
     return argsNum;
 }
-void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for (int i = 0; i < argc; i++) {
-        if (!strcmp(argv[i], vname) && i + 1 < argc) {
-            strcpy(p, argv[i + 1]);
-            hit = true;
-            break;
-        }
-    }
-    if (!hit)
-        strcpy(p, defaultP);
-}
-
-void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for (int i = 0; i < argc; i++) {
-        if (!strcmp(argv[i], vname) && i + 1 < argc) {
-            *(int*)p = atoi(argv[i + 1]);
-            hit = true;
-            break;
-        }
-    }
-    if (!hit)
-        *p = defaultP;
-}
-
-void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for (int i = 0; i < argc; i++) {
-        if (!strcmp(argv[i], vname)) {
-            *(bool*)p = true;
-            hit = true;
-            break;
-        }
-    }
-    if (!hit)
-        *p = defaultP;
-}
-
-void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for (int i = 0; i < argc; i++) {
-        if (!strcmp(argv[i], vname) && i + 1 < argc) {
-            *p = (float)atof(argv[i + 1]);
-            hit = true;
-            break;
-        }
-    }
-    if (!hit)
-        *p = defaultP;
-}
-
-void ShowParams(int argc, char** argv)
-{
-    fprintf(stderr, "args:\n");
-    for (int i = 0; i < argc; i++) {
-        if (argv[i][1] == 0)
-            continue;
-        if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
-            if (i + 1 < argc && argv[i + 1][0] != '-')
-                fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
-            else
-                fprintf(stderr, " %s=yes\n", argv[i]);
-        }
-    }
-    fprintf(stderr, "\n");
-}
 #define MAX_WORD_NUM 120
 
 /*
@@ -275,7 +192,9 @@ IntList SplitInt(const string& s, const string& delimiter)
     IntList values;
     auto indices = SplitToPos(s, delimiter);
     for (int i = 0; i < indices.Size(); i++) {
-        values.Add(strtol(s.data() + indices[i], nullptr, 10));
+        /* this line is problematic: why do we need an IntList to keep an int64? */
+        values.Add((int)strtol(s.data() + indices[i], nullptr, 10));
     }
     return values;
 }
@@ -291,4 +210,4 @@ FloatList SplitFloat(const string& s, const string& delimiter)
     return values;
 }
 
-}
\ No newline at end of file
+}
@@ -33,17 +33,6 @@ using namespace nts;
 namespace nmt
 {
 
-#define MAX_PARAM_NUM 100
-
-/* load arguments */
-void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
-void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
-void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
-void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
-
-/* show arguments */
-void ShowParams(int argc, char** argv);
-
 /* split string */
 IntList SplitInt(const string& s, const string& delimiter);
 FloatList SplitFloat(const string& s, const string& delimiter);
...
@@ -226,7 +226,6 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
     XTensor qheads;
     XTensor vheads;
 
-    const int batchSize = q.GetDim(0);
     const int lenQ = q.GetDim(1);
     const int lenKV = k.GetDim(1);
@@ -255,7 +254,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
         relativeKey = ConvertDataType(relativeKey, X_FLOAT);
     }
 
-    float scaling = sqrt(d / nhead);
+    float scaling = (float)sqrt(d / nhead);
     qheads = ScaleAndShift(qheads, 1.0F / scaling);
 
     dot = RPDotProduct(qheads, kheads, relativeKey, true);
@@ -402,4 +401,4 @@ void Cache::Reorder(XTensor& reorder)
     value = AutoGather(value, reorder);
 }
 }
-}
\ No newline at end of file
+}
@@ -48,8 +48,6 @@ void GLU::InitModel(Config& config)
 {
     devID = config.devID;
 
-    float minmax = 0;
-
     inSize = config.modelSize;
     outSize = config.modelSize;
@@ -84,4 +82,4 @@ XTensor GLU::Make(XTensor& input)
     return t1 * Sigmoid(t2);
 }
 
-}
\ No newline at end of file
+}
@@ -92,10 +92,10 @@ generate the weight sum vector of all previous layer output in the history as th
 XTensor LayerHistory::Pop()
 {
     /* the number of layer output in the history */
-    size_t size = history.Size();
+    int size = (int)history.Size();
 
     TensorList historyList;
-    for (size_t i = 0; i < size; i++)
+    for (int i = 0; i < size; i++)
         historyList.Add(history[i]);
 
     /* we need stack the tensor along the first dim*/
...
@@ -134,13 +134,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
     UInt64List info;
     size_t srcTokenNum = 0;
     size_t tgtTokenNum = 0;
 
-    int realBatchSize = 1;
+    size_t realBatchSize = 1;
 
     if (!isTraining)
         realBatchSize = minSentBatch;
 
     /* get the maximum source sentence length in a mini-batch */
-    size_t maxSrcLen = buffer[curIdx]->srcSent.Size();
+    size_t maxSrcLen = buffer[(int)curIdx]->srcSent.Size();
 
     /* max batch size */
     const int MAX_BATCH_SIZE = 512;
@@ -150,9 +150,9 @@
         while ((realBatchSize < (buffer.Size() - curIdx))
             && (realBatchSize * maxSrcLen < batchSize)
            && (realBatchSize < MAX_BATCH_SIZE)
-            && (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
-            if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
-                maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
+            && (realBatchSize * buffer[(int)(curIdx + realBatchSize)]->srcSent.Size() < batchSize)) {
+            if (maxSrcLen < buffer[(int)(curIdx + realBatchSize)]->srcSent.Size())
+                maxSrcLen = buffer[(int)(curIdx + realBatchSize)]->srcSent.Size();
             realBatchSize++;
         }
     }
@@ -165,14 +165,14 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
     CheckNTErrors(realBatchSize > 0, "Invalid batch size");
 
     /* get the maximum target sentence length in a mini-batch */
-    size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
+    size_t maxTgtLen = buffer[(int)curIdx]->tgtSent.Size();
     for (size_t i = 0; i < realBatchSize; i++) {
-        if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
-            maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
+        if (maxTgtLen < buffer[(int)(curIdx + i)]->tgtSent.Size())
+            maxTgtLen = buffer[(int)(curIdx + i)]->tgtSent.Size();
     }
     for (size_t i = 0; i < realBatchSize; i++) {
-        if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
-            maxSrcLen = buffer[curIdx + i]->srcSent.Size();
+        if (maxSrcLen < buffer[(int)(curIdx + i)]->srcSent.Size())
+            maxSrcLen = buffer[(int)(curIdx + i)]->srcSent.Size();
     }
 
     CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");
@@ -204,19 +204,19 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
     */
     for (int i = 0; i < realBatchSize; ++i) {
-        srcTokenNum += buffer[curIdx + i]->srcSent.Size();
-        tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();
+        srcTokenNum += buffer[(int)(curIdx + i)]->srcSent.Size();
+        tgtTokenNum += buffer[(int)(curIdx + i)]->tgtSent.Size();
 
         curSrc = maxSrcLen * i;
-        for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
-            batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
+        for (int j = 0; j < buffer[(int)(curIdx + i)]->srcSent.Size(); j++) {
+            batchEncValues[curSrc++] = buffer[(int)(curIdx + i)]->srcSent[j];
         }
 
         curTgt = maxTgtLen * i;
-        for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
+        for (int j = 0; j < buffer[(int)(curIdx + i)]->tgtSent.Size(); j++) {
             if (j > 0)
-                labelVaues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
-            batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
+                labelVaues[curTgt - 1] = buffer[(int)(curIdx + i)]->tgtSent[j];
+            batchDecValues[curTgt++] = buffer[(int)(curIdx + i)]->tgtSent[j];
         }
         labelVaues[curTgt - 1] = EOS;
 
         while (curSrc < maxSrcLen * (i + 1))
@@ -226,11 +226,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
     }
 
-    InitTensor2D(batchEnc, realBatchSize, maxSrcLen, X_INT, devID);
-    InitTensor2D(paddingEnc, realBatchSize, maxSrcLen, X_FLOAT, devID);
-    InitTensor2D(batchDec, realBatchSize, maxTgtLen, X_INT, devID);
-    InitTensor2D(paddingDec, realBatchSize, maxTgtLen, X_FLOAT, devID);
-    InitTensor2D(label, realBatchSize, maxTgtLen, X_INT, devID);
+    int rbs = (int)realBatchSize;
+    int msl = (int)maxSrcLen;
+    InitTensor2D(batchEnc, rbs, msl, X_INT, devID);
+    InitTensor2D(paddingEnc, rbs, msl, X_FLOAT, devID);
+    InitTensor2D(batchDec, rbs, msl, X_INT, devID);
+    InitTensor2D(paddingDec, rbs, msl, X_FLOAT, devID);
+    InitTensor2D(label, rbs, msl, X_INT, devID);
 
     curIdx += realBatchSize;
@@ -304,14 +306,14 @@ void TrainDataSet::BuildBucket()
         size_t sentNum = 1;
 
         /* get the maximum source sentence length in a bucket */
-        size_t maxSrcLen = buffer[idx]->srcSent.Size();
+        size_t maxSrcLen = buffer[(int)idx]->srcSent.Size();
 
         /* bucketing for sentences */
         while ((sentNum < (buffer.Size() - idx))
             && (sentNum * maxSrcLen < bucketSize)
-            && (sentNum * buffer[curIdx + sentNum]->srcSent.Size() < bucketSize)) {
-            if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
-                maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
+            && (sentNum * buffer[(int)(curIdx + sentNum)]->srcSent.Size() < bucketSize)) {
+            if (maxSrcLen < buffer[(int)(idx + sentNum)]->srcSent.Size())
+                maxSrcLen = buffer[(int)(idx + sentNum)]->srcSent.Size();
             sentNum++;
         }
@@ -324,7 +326,7 @@
         /* shuffle items in a bucket */
         for (size_t i = 0; i < sentNum; i++) {
-            buffer[idx + i]->bucketKey = randomKey;
+            buffer[(int)(idx + i)]->bucketKey = randomKey;
         }
 
         idx += sentNum;
@@ -335,13 +337,13 @@
     idx = 0;
     while (idx < buffer.Size()) {
         size_t sentNum = 0;
-        int bucketKey = buffer[idx + sentNum]->bucketKey;
+        int bucketKey = buffer[(int)(idx + sentNum)]->bucketKey;
         while (sentNum < (buffer.Size() - idx)
-            && buffer[idx + sentNum]->bucketKey == bucketKey) {
-            buffer[idx + sentNum]->key = buffer[idx + sentNum]->srcSent.Size();
+            && buffer[(int)(idx + sentNum)]->bucketKey == bucketKey) {
+            buffer[(int)(idx + sentNum)]->key = (int)buffer[(int)(idx + sentNum)]->srcSent.Size();
             sentNum++;
         }
-        SortInBucket(idx, idx + sentNum);
+        SortInBucket((int)idx, (int)(idx + sentNum));
 
         idx += sentNum;
     }
 }
...
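Nearly all of the edits in TrainDataSet.cpp above (and in the translate-side DataSet.cpp further below) follow one mechanical pattern: the list containers index with `int` while the batching arithmetic is carried out in `size_t`, so every subscript gains an explicit `(int)` conversion and every size difference an explicit `(size_t)`. Presumably this is to silence 64-bit truncation warnings (e.g. MSVC C4267) rather than to change behavior; the values involved stay far below `INT_MAX` for realistic batch and bucket sizes.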
@@ -98,6 +98,21 @@ public:
                           XTensor* batchDec, XTensor* paddingDec, XTensor* label,
                           size_t minSentBatch, size_t batchSize, int devID);
 
+    /* load the samples into the buffer (a list) */
+    bool LoadBatchToBuf(XList * buf);
+
+    /* load the samples into tensors from the buffer */
+    static
+    bool LoadBatch(XList * buf,
+                   XTensor* batchEnc, XTensor* paddingEnc,
+                   XTensor* batchDec, XTensor* paddingDec, XTensor* label,
+                   size_t minSentBatch, size_t batchSize, int devID,
+                   int &wc, int &sc);
+
+    /* release the samples in a buffer */
+    static
+    void ClearSamples(XList * buf);
+
     /* initialization function */
     void Init(const char* dataFile, int bucketSize, bool training);
...
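The new buffer-based loader splits batching into three steps: fill a buffer, turn it into tensors, and release the samples. A call sketch (illustrative only: the surrounding variables and literal sizes are hypothetical; the three calls use the signatures declared above):

```cpp
/* Hypothetical usage of the new TrainDataSet API declared above. */
void LoadOneBatchSketch(TrainDataSet & trainSet, int devID)
{
    XTensor batchEnc, paddingEnc, batchDec, paddingDec, label;
    XList buf;
    int wc = 0, sc = 0;   /* word count and sentence count, set by LoadBatch */

    /* step 1: read raw samples into the buffer */
    trainSet.LoadBatchToBuf(&buf);

    /* step 2: build the batch tensors from the buffered samples */
    TrainDataSet::LoadBatch(&buf, &batchEnc, &paddingEnc,
                            &batchDec, &paddingDec, &label,
                            /* minSentBatch */ 8, /* batchSize */ 4096,
                            devID, wc, sc);

    /* ... run the network on the batch ... */

    /* step 3: free the buffered samples */
    TrainDataSet::ClearSamples(&buf);
}
```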
@@ -163,8 +163,8 @@ void Trainer::Train(const char* fn, const char* validFN,
             UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
                                                     sBatchSize, wBatchSize, devID);
-            wc = info[0];
-            ws = info[1];
+            wc = (int)info[0];
+            ws = (int)info[1];
             CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
 
             /* output probabilities */
@@ -206,7 +206,7 @@ void Trainer::Train(const char* fn, const char* validFN,
             if (gradStep == updateStep) {
                 float warmupEndLR = lrate;
-                float warmupInitLR = 1e-7;
+                float warmupInitLR = 1e-7F;
                 float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
                 float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
@@ -320,8 +320,8 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
         UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
                                                 sBatchSize, 0, model->devID);
-        wc = info[0];
-        ws = info[1];
+        wc = (int)info[0];
+        ws = (int)info[1];
         CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
 
         /* make the network */
@@ -334,7 +334,7 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
         }
 
         int bSize = output.GetDim(0);
-        int length = output.GetDim(1);
+        //int length = output.GetDim(1);
         labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
         lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
@@ -428,6 +428,7 @@ void Trainer::Update(Model* model, const float lr)
             _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
 
             /* v2 = m / (sqrt(v) + delta) */
+            GMems.GetMem(v->devID)->LockBuf();
             XTensor* v2 = NewTensorBuf(v, v->devID);
             _Power(v, v2, 0.5F);
             _ScaleAndShiftMe(v2, 1.0F, d);
@@ -437,6 +438,7 @@ void Trainer::Update(Model* model, const float lr)
             _Sum(para, v2, para, -e);
 
             DelTensorBuf(v2);
+            GMems.GetMem(v->devID)->UnlockBuf();
         }
         else {
             /* the delta rule */
@@ -479,4 +481,4 @@ void Trainer::PrepareModel(Model* model)
     adamBeta2T = 1.0F;
 }
 
-}
\ No newline at end of file
+}
@@ -70,10 +70,10 @@ void DataSet::LoadDataToBuffer()
         size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();
 
         for (size_t i = 0; i < maxLen; i++) {
-            auto offset = (i != (indices.Size() - 1)) ?
-                          indices[i + 1] - indices[i] - tokenDelimiter.size()
-                          : line.size() - indices[i];
-            string word = line.substr(indices[i], offset);
+            size_t offset = (i != (indices.Size() - 1)) ?
+                            (size_t)indices[(int)i + 1] - (size_t)indices[(int)i] - tokenDelimiter.size()
+                            : line.size() - (size_t)indices[(int)i];
+            string word = line.substr((size_t)indices[(int)i], offset);
             if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
                 values.Add(UNK);
             else
@@ -110,12 +110,12 @@ load a mini-batch to the device (for translating)
 << indices of the sentences
 */
 UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
-                              size_t minSentBatch, size_t batchSize, int devID)
+                              int minSentBatch, int batchSize, int devID)
 {
-    size_t realBatchSize = minSentBatch;
+    int realBatchSize = minSentBatch;
 
     /* get the maximum sentence length in a mini-batch */
-    size_t maxLen = inputBuffer[bufferUsed]->values.Size();
+    int maxLen = (int)inputBuffer[(int)bufferUsed]->values.Size();
 
     /* dynamic batching for sentences */
     //while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
@@ -125,7 +125,7 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
     /* real batch size */
     if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
-        realBatchSize = inputBuffer.Size() - bufferUsed;
+        realBatchSize = (int)(inputBuffer.Size() - bufferUsed);
     }
 
     CheckNTErrors(maxLen != 0, "invalid length");
@@ -144,15 +144,15 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
     UInt64List infos;
     size_t totalLength = 0;
 
-    for (int i = 0; i < realBatchSize; ++i) {
-        infos.Add(inputBuffer[bufferUsed + i]->id);
-        totalLength += inputBuffer[bufferUsed + i]->values.Size();
+    for (size_t i = 0; i < (size_t)realBatchSize; ++i) {
+        infos.Add(inputBuffer[(int)(bufferUsed + i)]->id);
+        totalLength += inputBuffer[(int)(bufferUsed + i)]->values.Size();
 
         curSrc = maxLen * i;
-        for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
-            batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
+        for (size_t j = 0; j < inputBuffer[(int)(bufferUsed + i)]->values.Size(); j++)
+            batchValues[(int)(curSrc++)] = (int)inputBuffer[(int)(bufferUsed + i)]->values[(int)j];
 
         while (curSrc < maxLen * (i + 1))
-            paddingValues[curSrc++] = 0;
+            paddingValues[(int)(curSrc++)] = 0;
     }
     infos.Add(totalLength);
...
@@ -85,7 +85,7 @@ public:
     /* generate a mini-batch */
     UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
-                         size_t sBatch, size_t wBatch, int devID);
+                         int sBatch, int wBatch, int devID);
 
     /* initialization function */
     void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);
...
@@ -42,7 +42,7 @@ float LengthPenalizer::GNMT(float length, float alpha)
     base = (length + 5.0F) / (1.0F + 5.0F);
 
-    lp = pow(base, alpha);
+    lp = (float)pow(base, alpha);
 
     return lp;
 }
...
@@ -22,7 +22,7 @@
 #include <iostream>
 #include "Predictor.h"
-#include "../module/NNUtil.h"
+#include "../submodel/NNUtil.h"
 
 using namespace nts;
...
@@ -322,7 +322,7 @@ void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
     /* keep the most promising candidates in the beam */
     TopK(score, scoreTopK, index, -1, beamSize, true);
 
-    float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
+    //float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
     CopyValues(index, indexCPU);
     CopyValues(index, preID);
@@ -493,8 +493,8 @@ void BeamSearch::Collect(StateBundle* beam)
         /* check if this is the first end symbol. It is false
            if there have been end symbols in previously generated words. */
-        bool isCompleted = state.isCompleted &&
-                           (state.last == NULL || !state.last->isCompleted);
+        //bool isCompleted = state.isCompleted &&
+        //                   (state.last == NULL || !state.last->isCompleted);
 
         /* we push the hypothesis into the heap when it is completed */
         if ((state.isEnd || state.isCompleted)) {
@@ -557,7 +557,6 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
         }
     }
 
-    int count = 0;
     bool isCompleted = true;
 
     /* we track the state from the end to the beginning */
@@ -873,4 +872,4 @@ void GreedySearch::Search(Model* model, XTensor& input,
     delete[] finishedFlags;
 }
 
-}
\ No newline at end of file
+}
@@ -155,7 +155,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
         batchLoader.outputBuffer.Add(emptyRes);
     }
 
-    double startDump = GetClockSec();
+    //double startDump = GetClockSec();
 
     /* reorder the result */
     batchLoader.SortOutput();
@@ -163,7 +163,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
     /* print the result to a file */
     batchLoader.DumpRes(ofn);
 
-    double elapsed = GetClockSec() - startDump;
+    //double elapsed = GetClockSec() - startDump;
 
     LOG("translation completed (word=%d, sent=%zu)",
         wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
@@ -196,4 +196,4 @@ void Translator::Dump(FILE* file, XTensor* output)
     }
 }
-}
\ No newline at end of file
+}
@@ -34,14 +34,14 @@ void Vocab::Load(const string& src)
     /* get the vocab size and the start id */
     f >> vsz >> sid;
-    startID = stol(sid);
-    vocabSize = stol(vsz);
+    startID = (int)stol(sid);
+    vocabSize = (int)stol(vsz);
 
     string word, id;
     for (int i = 0; i < vocabSize - startID; i++) {
         f >> word >> id;
-        word2id[word] = stol(id);
-        id2word[stol(id)] = word;
+        word2id[word] = (int)stol(id);
+        id2word[(int)stol(id)] = word;
     }
 
     f.close();
@@ -75,4 +75,4 @@ void Vocab::CopyFrom(const Vocab& v)
         id2word.insert(i2w);
 }
 
-}
\ No newline at end of file
+}
@@ -847,6 +847,7 @@ XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE
     XTensor * tensor = NewTensor1D(unitNum, myDataType, myDevID, isEnableGrad);
     tensor->Range(lower, upper, step);
+
     return tensor;
 }
...
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class keeps a batch of parameters.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
*/
#include "XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XConfig::XConfig()
{
n = 0;
args = NULL;
nReal = 0;
}
/* de-constructor */
XConfig::~XConfig()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
}
/* clear it */
void XConfig::Clear()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
n = 0;
args = NULL;
nReal = 0;
}
/*
create a config
>> myN - number of the input arguments
>> myArgs - the input arguments
*/
void XConfig::Create(const int myN, const char ** myArgs)
{
CheckNTErrors(myN > 0, "No input parameters to XConfig!");
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
args = NULL;
n = myN;
nReal = n * 2;
args = new char*[nReal];
for (int i = 0; i < nReal; i++) {
args[i] = NULL;
}
for (int i = 0; i < n; i++) {
CheckNTErrors(myArgs[i] != NULL, "Illegal parameter input!");
args[i] = new char[strlen(myArgs[i]) + 1];
strcpy(args[i], myArgs[i]);
}
}
/*
add an argument
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, const char * myValue)
{
CheckNTErrors(myArg != NULL, "No argument!");
if (n + 2 > nReal) {
nReal = MAX(n * 2 + 1, 128);
char ** newArgs = new char*[nReal];
memset(newArgs, 0, sizeof(char*) * n);
memcpy(newArgs, args, sizeof(char*) * n);
delete[] args;
args = newArgs;
}
args[n] = new char[strlen(myArg) + 2];
args[n][0] = '-';
strcpy(args[n] + 1, myArg);
n++;
if (myValue != NULL) {
args[n] = new char[strlen(myValue) + 1];
strcpy(args[n], myValue);
n++;
}
}
/*
add an argument (in integer)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, int myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%d", myValue);
Add(myArg, value);
}
/*
add an argument (in bool)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, bool myValue)
{
char value[2];
if (myValue)
value[0] = '1';
else
value[0] = '0';
value[1] = 0;
Add(myArg, value);
}
/*
add an argument (in float)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, float myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%f", myValue);
Add(myArg, value);
}
/*
load the value of an argument (in integer)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadInt(const char * name, int * p, int defaultP)
{
LoadParamInt(n, args, name, p, defaultP);
}
/*
load the value of an argument (in boolean)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadBool(const char * name, bool * p, bool defaultP)
{
LoadParamBool(n, args, name, p, defaultP);
}
/*
load the value of an argument (in float)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadFloat(const char * name, float * p, float defaultP)
{
LoadParamFloat(n, args, name, p, defaultP);
}
/*
load the value of an argument (in char string)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadString(const char * name, char * p, const char* defaultP)
{
LoadParamString(n, args, name, p, defaultP);
}
/*
get the value of an argument (in integer)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
int XConfig::GetInt(const char * name, int defaultP)
{
int r;
LoadInt(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in bool)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
bool XConfig::GetBool(const char * name, bool defaultP)
{
bool r;
LoadBool(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in float)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
float XConfig::GetFloat(const char * name, float defaultP)
{
float r;
LoadFloat(name, &r, defaultP);
return r;
}
/* get item number */
int XConfig::GetItemNum()
{
return n;
}
/*
get the item with offset i
>> i - offset
*/
char * XConfig::GetItem(int i)
{
if (i < n && i >= 0)
return args[i];
else
return NULL;
}
/*
initialize with another config model
>> myConfig - the configure model that we want to copy
*/
void XConfig::CreateFromMe(XConfig & myConfig)
{
Clear();
for (int i = 0; i < myConfig.GetItemNum(); i++)
Add(myConfig.GetItem(i), i);
}
/*
load the value of an argument (in integer)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in boolean)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in float)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in char string)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
/*
show the argument list
>> argc - number of arguments
>> argv - arguments
*/
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
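The new XConfig.cpp above is self-contained; a short usage sketch (illustrative only, exercising just the API implemented above — the variable names and option values are hypothetical):

```cpp
/* Hypothetical usage of nts::XConfig, e.g. for a command line like
   "prog -verbose 2 -lrate 0.0015". */
#include <cstdio>
#include "XConfig.h"

using namespace nts;

int main(int argc, const char ** argv)
{
    XConfig config;

    /* Create() checks myN > 0, so only call it when arguments exist */
    if (argc > 1)
        config.Create(argc - 1, argv + 1);

    /* read options, falling back to the defaults when they are absent */
    int verbose = config.GetInt("verbose", 1);
    float lrate = config.GetFloat("lrate", 0.001F);
    bool fp16 = config.GetBool("fp16", false);

    /* arguments can also be appended programmatically */
    config.Add("devid", 0);

    fprintf(stderr, "verbose=%d lrate=%f fp16=%d\n", verbose, lrate, fp16);
    return 0;
}
```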
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class defines a parameter keeper.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
* A new semester begins today.
*/
#ifndef __XCONFIG_H__
#define __XCONFIG_H__
#include "XGlobal.h"
#include "XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_WORD_LENGTH_IN_CONFIG 256
/* the parameter keeper */
class XConfig
{
private:
/* number of arguments */
int n;
/* argument list (in char*) */
char ** args;
/* number of items we reallocate for these arguments */
int nReal;
public:
/* constructor */
XConfig();
/* de-constructor */
~XConfig();
/* clear it */
void Clear();
/* create a config */
void Create(const int myN, const char ** myArgs);
/* add an argument */
void Add(const char * myArg, const char * myValue);
/* add an argument (in integer) */
void Add(const char * myArg, int myValue);
/* add an argument (in bool) */
void Add(const char * myArg, bool myValue);
/* add an argument (in float) */
void Add(const char * myArg, float myValue);
/* load the value of an argument to a variable (in integer) */
void LoadInt(const char * name, int * p, int defaultP);
/* load the value of an argument to a variable (in boolean) */
void LoadBool(const char * name, bool * p, bool defaultP);
/* load the value of an argument to a variable (in float) */
void LoadFloat(const char * name, float * p, float defaultP);
/* load the value of an argument to a variable (in char string) */
void LoadString(const char * name, char * p, const char* defaultP);
/* get the value of an argument (in integer) */
int GetInt(const char * name, int defaultP);
/* get the value of an argument (in boolean) */
bool GetBool(const char * name, bool defaultP);
/* get the value of an argument (in float) */
float GetFloat(const char * name, float defaultP);
/* get item number */
int GetItemNum();
/* get the item with offset i */
char * GetItem(int i);
/* initialize with another config */
void CreateFromMe(XConfig &myConfig);
};
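/* A minimal usage sketch of the keeper above (illustrative only; whether
   argument names carry the leading '-' internally is an assumption here). */
inline void XConfigDemo()
{
    XConfig config;
    config.Add("dev", -1);                       /* register an integer argument */
    config.Add("lrate", 0.01F);                  /* register a float argument    */
    int dev = config.GetInt("dev", 0);           /* -> -1                        */
    float lr = config.GetFloat("lrate", 0.001F); /* -> 0.01F                     */
    XConfig backup;
    backup.CreateFromMe(config);                 /* clone the argument list      */
}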
#define MAX_PARAM_NUM 100
/* load arguments */
extern void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
extern void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
extern void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
extern void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
extern void ShowParams(int argc, char** argv);
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
...@@ -42,7 +42,6 @@ XDevManager GDevs; ...@@ -42,7 +42,6 @@ XDevManager GDevs;
/* constructor */ /* constructor */
XDevice::XDevice() XDevice::XDevice()
{ {
stream = NULL;
isInitialized = false; isInitialized = false;
Clear(); Clear();
...@@ -141,8 +140,6 @@ void XDevice::Init(int myDevID) ...@@ -141,8 +140,6 @@ void XDevice::Init(int myDevID)
} }
else else
sprintf(name2, "GPU-%d %s", devID, name); sprintf(name2, "GPU-%d %s", devID, name);
stream = new XStream(0, devID);
#endif #endif
} }
...@@ -176,10 +173,6 @@ void XDevice::Clear() ...@@ -176,10 +173,6 @@ void XDevice::Clear()
curandDestroyGenerator(gen); curandDestroyGenerator(gen);
isGenReady = false; isGenReady = false;
} }
if (stream != NULL) {
delete stream;
stream = NULL;
}
#endif #endif
isInitialized = false; isInitialized = false;
} }
...@@ -189,10 +182,11 @@ void XDevice::Reset() ...@@ -189,10 +182,11 @@ void XDevice::Reset()
XMem * mem = GMems.GetMem(devID); XMem * mem = GMems.GetMem(devID);
mem->Free(); mem->Free();
#ifdef USE_CUDA
int devIDReset = devID; int devIDReset = devID;
Clear(); Clear();
#ifdef USE_CUDA
if (devIDReset >= 0) { if (devIDReset >= 0) {
int devIDBackup = -1; int devIDBackup = -1;
cudaGetDevice(&devIDBackup); cudaGetDevice(&devIDBackup);
...@@ -202,6 +196,8 @@ void XDevice::Reset() ...@@ -202,6 +196,8 @@ void XDevice::Reset()
cudaSetDevice(devIDBackup); cudaSetDevice(devIDBackup);
} }
#else
Clear();
#endif #endif
} }
...@@ -227,17 +223,6 @@ cublasHandle_t * XDevice::GetCublasHandle() ...@@ -227,17 +223,6 @@ cublasHandle_t * XDevice::GetCublasHandle()
return &cublasHandle; return &cublasHandle;
} }
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
}
#endif // USE_CUDA #endif // USE_CUDA
/* switch to a device */ /* switch to a device */
...@@ -286,6 +271,28 @@ int XDevice::GetGPUDevice() ...@@ -286,6 +271,28 @@ int XDevice::GetGPUDevice()
#endif #endif
} }
/*
switch to a device (CPU or GPU)
>> devID - device id
*/
void XDevice::SetDevice(int devID)
{
if(devID >= 0)
SetGPUDevice(devID);
}
/*
switch to a device (CPU or GPU) with a backup of the device id
>> devID - device id
>> backupDevID - backup of the device id
*/
void XDevice::SetDevice(int devID, int &backupDevID)
{
backupDevID = GetGPUDevice();
if (devID >= 0)
SetGPUDevice(devID);
}
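/* A minimal sketch of the save/switch/restore pattern the two overloads above
   are meant for (see DequeueJobs in XQueue.cpp for a real call site). */
void SetDeviceDemo(int devID)
{
    int backupDevID = -1;
    XDevice::SetDevice(devID, backupDevID); /* switch and remember the old device */
    /* ... do device-side work here ... */
    XDevice::SetDevice(backupDevID);        /* restore the previous device */
}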
/* reset cuda flag for more efficient cuda execution. It should be called after "SetGPUDevice" when /* reset cuda flag for more efficient cuda execution. It should be called after "SetGPUDevice" when
no GPU context has been established. */ no GPU context has been established. */
void XDevice::SetFastFlags() void XDevice::SetFastFlags()
...@@ -312,13 +319,6 @@ void XDevice::SetFastFlagsAllDevices() ...@@ -312,13 +319,6 @@ void XDevice::SetFastFlagsAllDevices()
#endif #endif
} }
/* delete the default stream for the device */
void XDevice::DelDeviceStream()
{
if(stream != NULL)
delete stream;
}
/* constructor */ /* constructor */
XDevManager::XDevManager() XDevManager::XDevManager()
{ {
...@@ -391,14 +391,6 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID) ...@@ -391,14 +391,6 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle(); return GPUs[devID].GetCublasHandle();
} }
/* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
return GPUs[devID].GetCudaStream();
}
#endif #endif
/* /*
...@@ -620,16 +612,5 @@ char * XDevManager::GetDevString(int devID) ...@@ -620,16 +612,5 @@ char * XDevManager::GetDevString(int devID)
} }
} }
/* delete the streams for all devices */
void XDevManager::DelDeviceStream()
{
for(int i = 0; i < GDevs.nCPU; i++) {
GDevs.CPUs[i].DelDeviceStream();
}
for(int i = 0; i < GDevs.nGPU; i++) {
GDevs.GPUs[i].DelDeviceStream();
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#define __XDEVICE_H__ #define __XDEVICE_H__
#include "XThread.h" #include "XThread.h"
#include "XStream.h"
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -97,9 +96,6 @@ public: ...@@ -97,9 +96,6 @@ public:
/* specify whether Unified Virtual Address Space (UVA) is supported */ /* specify whether Unified Virtual Address Space (UVA) is supported */
bool isUVASupported; bool isUVASupported;
/* default stream for the device */
XStream * stream;
/* seed for random number generation */ /* seed for random number generation */
int seed; int seed;
...@@ -140,12 +136,9 @@ public: ...@@ -140,12 +136,9 @@ public:
#ifdef USE_CUDA #ifdef USE_CUDA
/* get cublas handle */ /* get cublas handle */
cublasHandle_t * GetCublasHandle(); cublasHandle_t * GetCublasHandle();
/* get the stream of cuda */
cudaStream_t * GetCudaStream();
#endif #endif
/* switch to a device */ /* switch to a GPU device */
static static
void SetGPUDevice(int devID); void SetGPUDevice(int devID);
...@@ -153,10 +146,18 @@ public: ...@@ -153,10 +146,18 @@ public:
static static
void SetGPUDeviceFast(int devID); void SetGPUDeviceFast(int devID);
/* switch to a get current dev */ /* get current dev */
static static
int GetGPUDevice(); int GetGPUDevice();
/* switch to a device (CPU or GPU) */
static
void SetDevice(int devID);
/* switch to a device (CPU or GPU) with a backup of the device id */
static
void SetDevice(int devID, int &backupDevID);
/* reset cuda flag for more efficient cuda execution */ /* reset cuda flag for more efficient cuda execution */
static static
void SetFastFlags(); void SetFastFlags();
...@@ -164,9 +165,6 @@ public: ...@@ -164,9 +165,6 @@ public:
/* reset cuda flag for more efficient cuda execution (all devices) */ /* reset cuda flag for more efficient cuda execution (all devices) */
static static
void SetFastFlagsAllDevices(); void SetFastFlagsAllDevices();
/* delete the default stream for the device (call it before deleting the XDevice) */
void DelDeviceStream();
}; };
/* /*
...@@ -206,9 +204,6 @@ public: ...@@ -206,9 +204,6 @@ public:
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the handle of GPU */ /* get the handle of GPU */
cublasHandle_t * GetCudaHandle(const int devID); cublasHandle_t * GetCudaHandle(const int devID);
/* get the stream of cuda */
cudaStream_t * GetCudaStream(const int devID);
#endif #endif
/* get grid and block sizes that max potential */ /* get grid and block sizes that max potential */
...@@ -228,10 +223,6 @@ public: ...@@ -228,10 +223,6 @@ public:
/* get the device information in string */ /* get the device information in string */
char * GetDevString(int devID); char * GetDevString(int devID);
/* delete the streams for all devices */
static
void DelDeviceStream();
}; };
/* managing the devices */ /* managing the devices */
......
...@@ -132,6 +132,36 @@ extern int TRAINING_SAMPLE_BUF_SIZE; ...@@ -132,6 +132,36 @@ extern int TRAINING_SAMPLE_BUF_SIZE;
extern int CONST_MINUSONE; extern int CONST_MINUSONE;
extern bool CONST_TRUE; extern bool CONST_TRUE;
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x ) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
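/* A minimal sketch of how these portable wrappers are used (cf. the
   allocMutex/bufMutex members added to XMem below; "myMutex" is illustrative). */
static MUTEX_HANDLE myMutex;
static void MutexDemo()
{
    MUTEX_INIT(myMutex);    /* once, before any use                     */
    MUTEX_LOCK(myMutex);    /* enter the critical section               */
    /* ... touch the shared state here ... */
    MUTEX_UNLOCK(myMutex);  /* leave the critical section               */
    MUTEX_DELE(myMutex);    /* once, when the mutex is no longer needed */
}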
//#define USE_CUDA_RESURSION 1 //#define USE_CUDA_RESURSION 1
#define NIUTRANSNNDEBUG #define NIUTRANSNNDEBUG
......
...@@ -26,8 +26,6 @@ ...@@ -26,8 +26,6 @@
#ifndef __XLINK_H__ #ifndef __XLINK_H__
#define __XLINK_H__ #define __XLINK_H__
#include "XGlobal.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */ /* cross reference */
......
...@@ -75,6 +75,9 @@ public: ...@@ -75,6 +75,9 @@ public:
/* de-constructor */ /* de-constructor */
~TensorListBase(); ~TensorListBase();
/* reallocate */
void Reallocate(int itemNum);
/* add an item into the list */ /* add an item into the list */
void Add(T&& item); void Add(T&& item);
...@@ -84,6 +87,15 @@ public: ...@@ -84,6 +87,15 @@ public:
/* add an item into the list */ /* add an item into the list */
void Add(const T& item); void Add(const T& item);
/* add an item (as an integer) into the list */
void AddInt(const int item);
/* add an item (as a float) into the list */
void AddFloat(const float item);
/* add an item (as a long long) into the list */
void AddLLong(const long long item);
/* add a number of items into the list */ /* add a number of items into the list */
void Add(const T* inputItems, int inputItemCount); void Add(const T* inputItems, int inputItemCount);
...@@ -99,12 +111,30 @@ public: ...@@ -99,12 +111,30 @@ public:
/* get the item at position i */ /* get the item at position i */
T& GetItem(int i) const; T& GetItem(int i) const;
/* get the item at position i and force it to an integer */
int GetItemInt(int i) const;
/* get the item at position i and force it to a float number */
float GetItemFloat(int i) const;
/* get the item at position i and force it to a long long number */
long long GetItemLLong(int i) const;
/* set the item at position i */ /* set the item at position i */
void SetItem(int i, const T& item); void SetItem(int i, const T& item);
/* set the item at position i */ /* set the item at position i */
void SetItem(int i, T&& item); void SetItem(int i, T&& item);
/* set the item (as an integer) at position i */
void SetItemInt(int i, const int item);
/* set the item (as a float) at position i */
void SetItemFloat(int i, const float item);
/* set the item (as a long long) at position i */
void SetItemLLong(int i, const long long item);
/* find the position of the first matched item */ /* find the position of the first matched item */
int FindFirst(const T& item); int FindFirst(const T& item);
...@@ -135,7 +165,13 @@ public: ...@@ -135,7 +165,13 @@ public:
/* short */ /* short */
T& operator[] (int i) const { return GetItem(i); }; T& operator[] (int i) const { return GetItem(i); };
T& Get(int i) const { return GetItem(i); }; T& Get(int i) const { return GetItem(i); };
int GetInt(int i) const { return GetItemInt(i); };
float GetFloat(int i) const { return GetItemFloat(i); };
long long GetLLong(int i) const { return GetItemLLong(i); };
void Set(int i, T item) { SetItem(i, item); }; void Set(int i, T item) { SetItem(i, item); };
void SetInt(int i, int item) { SetItemInt(i, item); };
void SetFloat(int i, float item) { SetItemFloat(i, item); };
void SetLLong(int i, long long item) { SetItemLLong(i, item); };
}; };
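/* A minimal sketch of the typed accessors above, assuming IntList is the
   TensorListBase<int> instantiation provided by this header. */
inline void TypedListDemo()
{
    IntList ids;
    ids.AddInt(3);              /* append via AddInt          */
    ids.AddInt(7);
    int first = ids.GetInt(0);  /* read back via GetItemInt   */
    ids.SetInt(1, first + 6);   /* overwrite via SetItemInt   */
}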
struct XTensor; struct XTensor;
......
...@@ -54,6 +54,8 @@ XMem::XMem() ...@@ -54,6 +54,8 @@ XMem::XMem()
signature = 0; signature = 0;
mergeFreeOTF = true; mergeFreeOTF = true;
isInitialized = false; isInitialized = false;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
} }
/* /*
...@@ -77,6 +79,8 @@ XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum, ...@@ -77,6 +79,8 @@ XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum,
strcpy(name, "xmem"); strcpy(name, "xmem");
signature = 0; signature = 0;
mergeFreeOTF = true; mergeFreeOTF = true;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize); Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize);
} }
...@@ -99,6 +103,8 @@ XMem::~XMem() ...@@ -99,6 +103,8 @@ XMem::~XMem()
delete[] memIndex; delete[] memIndex;
delete[] memIndex2; delete[] memIndex2;
delete[] minSizeIndex; delete[] minSizeIndex;
MUTEX_DELE(allocMutex);
MUTEX_DELE(bufMutex);
} }
/* /*
...@@ -379,12 +385,18 @@ require a piece of memory ...@@ -379,12 +385,18 @@ require a piece of memory
*/ */
void * XMem::Alloc(int myDevID, MTYPE mySize) void * XMem::Alloc(int myDevID, MTYPE mySize)
{ {
void * p = NULL;
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY) if(mode == FREE_ON_THE_FLY)
return AllocStandard(myDevID, mySize); p = AllocStandard(myDevID, mySize);
else if(isStatic) else if(isStatic)
return AllocStatic(myDevID, mySize); p = AllocStatic(myDevID, mySize);
else else
return AllocDynamic(myDevID, mySize); p = AllocDynamic(myDevID, mySize);
MUTEX_UNLOCK(allocMutex);
return p;
} }
/* /*
...@@ -521,6 +533,11 @@ void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch) ...@@ -521,6 +533,11 @@ void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch)
{ {
MTYPE backOffset = 0; MTYPE backOffset = 0;
/* NOTE THAT this is tricky because we lock the buffer
but DO NOT unlock it in this function. The unlock
happens when we call ReleaseBuf() */
//MUTEX_LOCK(bufMutex);
if(pitch > 1){ if(pitch > 1){
MTYPE address = (MTYPE)((char*)buf + bufUsed); MTYPE address = (MTYPE)((char*)buf + bufUsed);
int offset = address % pitch; int offset = address % pitch;
...@@ -560,8 +577,10 @@ release a piece of memory ...@@ -560,8 +577,10 @@ release a piece of memory
*/ */
void XMem::Release(int myDevID, void * p, MTYPE size) void XMem::Release(int myDevID, void * p, MTYPE size)
{ {
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY) if(mode == FREE_ON_THE_FLY)
ReleaseStandard(myDevID, p, size); ReleaseStandard(myDevID, p, size);
MUTEX_UNLOCK(allocMutex);
} }
/* /*
...@@ -583,6 +602,9 @@ void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch) ...@@ -583,6 +602,9 @@ void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch)
} }
bufUsed -= (mySize + backOffset); bufUsed -= (mySize + backOffset);
/* NOTE THAT this is a response to the lock in AllocBuf() */
//MUTEX_UNLOCK(bufMutex);
} }
/* /*
...@@ -825,6 +847,18 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex) ...@@ -825,6 +847,18 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
return result; return result;
} }
/* lock the buffer mutex */
void XMem::LockBuf()
{
MUTEX_LOCK(bufMutex);
}
/* unlock the buffer mutex */
void XMem::UnlockBuf()
{
MUTEX_UNLOCK(bufMutex);
}
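/* A minimal sketch of the discipline these two calls enforce, matching the
   callers below (Div, Multiply, MulAndShift): lock before taking memory from
   the buffer, unlock only after the buffer has been given back. Note that
   NewTensorBufV2/DelTensorBuf live in the tensor layer. */
void LockBufDemo(XTensor * a)
{
    if (a->mem != NULL)
        a->mem->LockBuf();
    XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem); /* buffer-backed tensor */
    /* ... use tmp as scratch space ... */
    DelTensorBuf(tmp);                                   /* return the buffer    */
    if (a->mem != NULL)
        a->mem->UnlockBuf();
}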
/* /*
find the highest set bit (or most significant set bit) in an integer-64 find the highest set bit (or most significant set bit) in an integer-64
>> mySize - required size >> mySize - required size
...@@ -1511,12 +1545,12 @@ void XMem::ShowMemUsage(FILE * file) ...@@ -1511,12 +1545,12 @@ void XMem::ShowMemUsage(FILE * file)
} }
MTYPE bufTotal = bufSize; MTYPE bufTotal = bufSize;
MTYPE bufUsed = bufUsed; MTYPE bufUsedTotal = bufUsed;
fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n", fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal); (DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal);
fprintf(file, "buffer mem:%.1fMB used:%.1fMB usage:%.3f\n", fprintf(file, "buffer mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsed / 1024 / 1024, (DTYPE)bufUsed / bufTotal); (DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsedTotal / 1024 / 1024, (DTYPE)bufUsedTotal / bufTotal);
} }
...@@ -1560,7 +1594,7 @@ MTYPE XMemManager::GetAvailableMemory() ...@@ -1560,7 +1594,7 @@ MTYPE XMemManager::GetAvailableMemory()
MEMORYSTATUSEX memoryStatus; MEMORYSTATUSEX memoryStatus;
memoryStatus.dwLength = sizeof(memoryStatus); memoryStatus.dwLength = sizeof(memoryStatus);
if (GlobalMemoryStatusEx(&memoryStatus)){ if (GlobalMemoryStatusEx(&memoryStatus)){
freeMem = memoryStatus.ullAvailPhys; freeMem = (MTYPE)memoryStatus.ullAvailPhys;
} }
#else #else
long pages = sysconf(_SC_AVPHYS_PAGES); long pages = sysconf(_SC_AVPHYS_PAGES);
...@@ -1604,6 +1638,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize) ...@@ -1604,6 +1638,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
} }
} }
} }
else {
ShowNTErrors("No enough memory for buffer allocation!");
}
} }
/* initialize it and set the global memory information */ /* initialize it and set the global memory information */
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#ifndef __XMEM_H__ #ifndef __XMEM_H__
#define __XMEM_H__ #define __XMEM_H__
#include "XGlobal.h"
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
...@@ -249,6 +250,13 @@ public: ...@@ -249,6 +250,13 @@ public:
/* indicates whether we merge free memory pieces on the fly */ /* indicates whether we merge free memory pieces on the fly */
bool mergeFreeOTF; bool mergeFreeOTF;
private:
/* a mutex for memory allocation and release */
MUTEX_HANDLE allocMutex;
/* a mutex for buffer memory allocation and release */
MUTEX_HANDLE bufMutex;
public: public:
/* constructor */ /* constructor */
...@@ -337,6 +345,12 @@ public: ...@@ -337,6 +345,12 @@ public:
/* allocate a piece of memory as "malloc" */ /* allocate a piece of memory as "malloc" */
void * AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex = false); void * AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex = false);
/* lock the buffer mutex */
void LockBuf();
/* unlock the buffer mutex */
void UnlockBuf();
/* find the highest set bit (or most significant set bit) in an integer-64 */ /* find the highest set bit (or most significant set bit) in an integer-64 */
int GetMSB(MTYPE mySize); int GetMSB(MTYPE mySize);
......
...@@ -146,7 +146,7 @@ run a set of jobs in parallel ...@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job >> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round >> sleepTime - time to sleep (in ms) for each round
*/ */
void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime) void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
{ {
if(threadNum <= 0){ if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n"); XPRINT(1, stderr, "Error! No threads were created!\n");
...@@ -195,13 +195,12 @@ void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepT ...@@ -195,13 +195,12 @@ void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepT
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c); TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */ /* the arguments that are passed to the function */
volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c); XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */ /* thread */
XThread * thread = threads + availableThreads[i]; XThread * thread = threads + availableThreads[i];
thread->argv = args; thread->SetFunc(function, args);
thread->function = function;
MUTEX_LOCK(thread->workingMutex); MUTEX_LOCK(thread->workingMutex);
thread->working = 1; thread->working = 1;
......
...@@ -106,7 +106,7 @@ public: ...@@ -106,7 +106,7 @@ public:
void KillThreads(); void KillThreads();
/* run a set of jobs in parallel */ /* run a set of jobs in parallel */
void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0); void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */ /* get the number of parallel jobs to run */
int GetJobNum(int size); int GetJobNum(int size);
......
...@@ -42,7 +42,7 @@ job item used in queues ...@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode() JobQueueNode::JobQueueNode()
{ {
job = NULL; job = NULL;
args = new TensorList(1); args = new XList(1);
} }
/* de-constructor */ /* de-constructor */
...@@ -67,12 +67,9 @@ XQueue::XQueue(int mySize) ...@@ -67,12 +67,9 @@ XQueue::XQueue(int mySize)
head = 0; head = 0;
tail = 0; tail = 0;
isJobQueue = false; isJobQueue = false;
jobDequeuerArgs = new TensorList(1); jobDequeuerArgs = new XList(1);
jobDequeuerBreak = false; jobDequeuerBreak = false;
runningJobCount = 0; runningJobCount = 0;
jobStream = NULL;
jobStream1 = NULL;
jobStream2 = NULL;
MUTEX_INIT(enqueueMutex); MUTEX_INIT(enqueueMutex);
MUTEX_INIT(dequeueMutex); MUTEX_INIT(dequeueMutex);
...@@ -85,9 +82,6 @@ XQueue::~XQueue() ...@@ -85,9 +82,6 @@ XQueue::~XQueue()
{ {
delete[] queue; delete[] queue;
delete jobDequeuerArgs; delete jobDequeuerArgs;
delete jobStream;
delete jobStream1;
delete jobStream2;
//if(isJobQueue) //if(isJobQueue)
// StopJobConsumer(); // StopJobConsumer();
...@@ -160,19 +154,6 @@ void XQueue::WaitForEmptyJobQueue() ...@@ -160,19 +154,6 @@ void XQueue::WaitForEmptyJobQueue()
while(runningJobCount > 0){ while(runningJobCount > 0){
XSleep(10); XSleep(10);
} }
if(jobStream != NULL){
CheckNTErrors((jobStream->IsFinished()), "None fineished jobs remain");
jobStream->Clear();
}
if(jobStream1 != NULL){
CheckNTErrors((jobStream1->IsFinished()), "None fineished jobs remain");
jobStream1->Clear();
}
if(jobStream2 != NULL){
CheckNTErrors((jobStream2->IsFinished()), "None fineished jobs remain");
jobStream2->Clear();
}
} }
int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
...@@ -189,12 +170,11 @@ void XQueue::RunJobConsumer(int jobDevID) ...@@ -189,12 +170,11 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true; isJobQueue = true;
jobDequeuerArgs->Clear(); jobDequeuerArgs->Clear();
// warning: this may cause unknown error /* warning: this may cause unknown errors */
jobDequeuerArgs->Add((XTensor*)this); jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid); jobDequeuerArgs->Add(jobDevID >= 0 ? (devids + jobDevID) : &cpuid);
jobDequeuer.function = (TFunction)DequeueJobs; jobDequeuer.SetFunc((TFunction)DequeueJobs, jobDequeuerArgs);
jobDequeuer.argv = jobDequeuerArgs;
jobDequeuer.Start(); jobDequeuer.Start();
jobDequeuer.LetItGo(); jobDequeuer.LetItGo();
...@@ -213,7 +193,7 @@ void XQueue::StopJobConsumer() ...@@ -213,7 +193,7 @@ void XQueue::StopJobConsumer()
} }
/* add a job item to process */ /* add a job item to process */
void XQueue::EnqueueJob(void * job, TensorList * jobArgs) void XQueue::EnqueueJob(void * job, XList * jobArgs)
{ {
MUTEX_LOCK(jobQueueMutex); MUTEX_LOCK(jobQueueMutex);
runningJobCount++; runningJobCount++;
...@@ -227,17 +207,16 @@ void XQueue::EnqueueJob(void * job, TensorList * jobArgs) ...@@ -227,17 +207,16 @@ void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
} }
/* job item consumer */ /* job item consumer */
void XQueue::DequeueJobs(TensorList * args) void XQueue::DequeueJobs(XList * args)
{ {
CheckNTErrors((args->count == 2), "Illegal arguments!"); CheckNTErrors((args->count == 2), "Illegal arguments!");
XQueue * q = (XQueue*)args->GetItem(0); XQueue * q = (XQueue*)args->GetItem(0);
int devID = *(int*)args->GetItem(1); int devID = *(int*)args->GetItem(1);
int devIDBackup = XDevice::GetGPUDevice(); int devIDBackup = -1;
if(devID >= 0) if(devID >= 0)
XDevice::SetGPUDevice(devID); XDevice::SetDevice(devID, devIDBackup);
while(1){ while(1){
JobQueueNode * node = (JobQueueNode*)q->Dequeue(); JobQueueNode * node = (JobQueueNode*)q->Dequeue();
...@@ -259,7 +238,7 @@ void XQueue::DequeueJobs(TensorList * args) ...@@ -259,7 +238,7 @@ void XQueue::DequeueJobs(TensorList * args)
} }
if(devID >= 0) if(devID >= 0)
XDevice::SetGPUDevice(devIDBackup); XDevice::SetDevice(devIDBackup);
} }
/* get the break flag */ /* get the break flag */
...@@ -268,31 +247,14 @@ bool XQueue::GetJobBreak() ...@@ -268,31 +247,14 @@ bool XQueue::GetJobBreak()
return jobDequeuerBreak; return jobDequeuerBreak;
} }
/* get job stream */ /* get the number of jobs */
XStream * XQueue::GetJobStream(int n) int XQueue::GetJobNum()
{ {
if(n == 0) MUTEX_LOCK(jobQueueMutex);
return jobStream; int c = runningJobCount;
else if(n == 1) MUTEX_UNLOCK(jobQueueMutex);
return jobStream1;
else if(n == 2)
return jobStream2;
else{
ShowNTErrors("invalid stream id!");
}
return NULL;
}
/* make job streams */ return c;
void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
{
if(devID != INVALID_DEVICE_ID)
jobStream = new XStream(0, devID);
if(devID1 != INVALID_DEVICE_ID)
jobStream1 = new XStream(0, devID1);
if(devID2 != INVALID_DEVICE_ID)
jobStream2 = new XStream(0, devID2);
} }
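/* A minimal sketch of the producer/consumer cycle after this change; the job
   function and its argument list are whatever the caller prepares. */
void JobQueueDemo(XQueue * q, void * job, XList * jobArgs)
{
    q->RunJobConsumer(-1);        /* start the dequeuer thread (-1 = CPU)  */
    q->EnqueueJob(job, jobArgs);  /* runningJobCount++ under jobQueueMutex */
    q->WaitForEmptyJobQueue();    /* spin until all enqueued jobs have run */
    q->StopJobConsumer();         /* shut the dequeuer down                */
}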
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
...@@ -33,7 +33,6 @@ ...@@ -33,7 +33,6 @@
#include "XGlobal.h" #include "XGlobal.h"
#include "XThread.h" #include "XThread.h"
#include "XStream.h"
#include "XDevice.h" #include "XDevice.h"
#include "XList.h" #include "XList.h"
...@@ -52,7 +51,7 @@ public: ...@@ -52,7 +51,7 @@ public:
void * job; void * job;
/* arguments of the job */ /* arguments of the job */
TensorList * args; XList * args;
public: public:
/* constructor */ /* constructor */
...@@ -102,7 +101,7 @@ private: ...@@ -102,7 +101,7 @@ private:
XThread jobDequeuer; XThread jobDequeuer;
/* argument list of jobDequeuer */ /* argument list of jobDequeuer */
TensorList * jobDequeuerArgs; XList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */ /* indicates whether jobDequeuer stops */
bool jobDequeuerBreak; bool jobDequeuerBreak;
...@@ -110,11 +109,6 @@ private: ...@@ -110,11 +109,6 @@ private:
/* running job count */ /* running job count */
int runningJobCount; int runningJobCount;
/* job streams (we think that three streams is enough :)) */
XStream * jobStream;
XStream * jobStream1;
XStream * jobStream2;
public: public:
/* constructor */ /* constructor */
XQueue(int mySize = MAX_QUEUE_SIZE); XQueue(int mySize = MAX_QUEUE_SIZE);
...@@ -135,26 +129,23 @@ public: ...@@ -135,26 +129,23 @@ public:
void WaitForEmptyJobQueue(); void WaitForEmptyJobQueue();
/* run the job consumer */ /* run the job consumer */
void RunJobConsumer(int jobDevID = 0); void RunJobConsumer(int jobDevID = -1);
/* stop the job consumer */ /* stop the job consumer */
void StopJobConsumer(); void StopJobConsumer();
/* add a job item to process */ /* add a job item to process */
void EnqueueJob(void * job, TensorList * jobArgs); void EnqueueJob(void * job, XList * jobArgs);
/* job item consumer */ /* job item consumer */
static static
void DequeueJobs(TensorList * args); void DequeueJobs(XList * args);
/* get the break flag */ /* get the break flag */
bool GetJobBreak(); bool GetJobBreak();
/* get job stream */ /* get the number of jobs */
XStream * GetJobStream(int n = 0); int GetJobNum();
/* make job streams */
void MakeJobStreams(int devID = INVALID_DEVICE_ID, int devID1 = INVALID_DEVICE_ID, int devID2 = INVALID_DEVICE_ID);
}; };
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., run jobs in different stream for
* GPU Async capabilities.
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include "stdio.h"
#include "stdlib.h"
#include "XGlobal.h"
#include "XStream.h"
#include "XDevice.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiply streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
/* constructor */
XStream::XStream(int priority, int myDevID, int myMaxEventNum)
{
devID = myDevID;
#ifdef USE_CUDA
if(myDevID >= 0){
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
events = new cudaEvent_t[myMaxEventNum];
XDevice::SetGPUDevice(backupDevID);
maxEventNum = myMaxEventNum;
usedEventNum = 0;
}
else{
maxEventNum = 0;
usedEventNum = 0;
}
#endif
Create(priority, devID);
}
/* deconstructor */
XStream::~XStream()
{
Destroy();
#ifdef USE_CUDA
delete[] events;
#endif
}
/* create the stream */
void XStream::Create(int priority, int myDevID)
{
if(myDevID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
//cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
CheckNTErrors((cudaStreamCreate(&stream) == cudaSuccess),
"cannot create the cuda stream!");
XDevice::SetGPUDevice(backupDevID);
#endif
devID = myDevID;
}
/* destroy the stream */
void XStream::Destroy()
{
if(devID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
cudaStreamDestroy(stream);
XDevice::SetGPUDevice(backupDevID);
Clear();
#endif
}
/* clear it */
void XStream::Clear()
{
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
for(int i = 0; i < usedEventNum; i++){
cudaEventDestroy(events[i]);
}
usedEventNum = 0;
XDevice::SetGPUDevice(backupDevID);
#endif
}
/* judge if all the jobs in the stream have been finished */
bool XStream::IsFinished()
{
#ifdef USE_CUDA
if(cudaStreamQuery(stream) == cudaSuccess)
return true;
else
return false;
#else
return true;
#endif
}
void XStream::StreamSynchronize()
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaStreamSynchronize(stream);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
void XStream::ThreadSynchronize()
{
#ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif
}
void XStream::DeviceSynchronize(int devID)
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
cudaGetDevice(&devIDBackup);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaDeviceSynchronize();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* make a dependency of two streams. i.e., current stream must wait for the last job finished in another stream */
void XStream::MakeDependency(XStream * precedingStream)
{
#ifdef USE_CUDA
cudaEvent_t * e = precedingStream->MakeEvent();
cudaEventRecord(*e, precedingStream->stream);
cudaStreamWaitEvent(stream, *e, 0);
#endif
}
/* get the stream */
#ifdef USE_CUDA
inline cudaStream_t * XStream::Get()
{
return &stream;
}
/* make a event */
inline cudaEvent_t * XStream::MakeEvent()
{
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
CheckNTErrors((usedEventNum < maxEventNum), "Too many events are required!");
cudaEvent_t * e = events + usedEventNum++;
cudaEventCreate(e);
XDevice::SetGPUDevice(backupDevID);
return e;
}
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., run jobs in different stream for
* GPU Async capabilities.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XSTREAM_H__
#define __XSTREAM_H__
/* the CUDA stuff */
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asychronously do something else. Basically
we can use multiply streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
class XStream
{
public:
#ifdef USE_CUDA
/* the cuda stream */
cudaStream_t stream;
/* list of cuda events for synchronize different streams */
cudaEvent_t * events;
/* max number of the events */
int maxEventNum;
/* number of used events */
int usedEventNum;
#else
/* virtual pointer */
void * stream;
#endif
/* device that holds the stream */
int devID;
public:
/* constructor */
XStream(int priority = 0, int devID = 0, int maxEventNum = MAX_CUDA_EVENT_NUM_IN_A_STREAM);
/* deconstructor */
~XStream();
/* create the stream */
void Create(int priority = 0, int devID = 0);
/* destroy the stream */
void Destroy();
/* clear it */
void Clear();
/* judge if all the jobs in the stream have been finished */
bool IsFinished();
/* stream synchronize */
void StreamSynchronize();
/* thread synchronize */
static
void ThreadSynchronize();
/* device synchronize */
static
void DeviceSynchronize(int devID);
/* make a dependency of two streams. i.e., current stream must wait for the last job finished in another stream */
void MakeDependency(XStream * precedingStream);
#ifdef USE_CUDA
/* get the stream */
cudaStream_t * Get();
/* make a event */
cudaEvent_t * MakeEvent();
#endif
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
...@@ -89,10 +89,6 @@ XTensor::XTensor() ...@@ -89,10 +89,6 @@ XTensor::XTensor()
Init(); Init();
id = MakeTensorID(); id = MakeTensorID();
isDefaultDType = true;
isInGlobalMem = false;
isInit = false;
isTmp = false;
reserved = 0; reserved = 0;
} }
...@@ -277,6 +273,7 @@ void XTensor::Init() ...@@ -277,6 +273,7 @@ void XTensor::Init()
isTmp = false; isTmp = false;
isGrad = false; isGrad = false;
isVar = false; isVar = false;
isGradFinished = false;
enableGrad = X_ENABLE_GRAD; enableGrad = X_ENABLE_GRAD;
visitMark = 0; visitMark = 0;
grad = NULL; grad = NULL;
...@@ -772,10 +769,9 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const ...@@ -772,10 +769,9 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
} }
/* /*
a vector with all entries of 0 a tensor with all entries of 0
>> stream - stream for the job pipeline
*/ */
void XTensor::SetZeroAll(XStream* stream) void XTensor::SetZeroAll()
{ {
if(data == NULL) if(data == NULL)
return; return;
...@@ -788,12 +784,7 @@ void XTensor::SetZeroAll(XStream* stream) ...@@ -788,12 +784,7 @@ void XTensor::SetZeroAll(XStream* stream)
int devIDBackup = 0; int devIDBackup = 0;
cudaGetDevice(&devIDBackup); cudaGetDevice(&devIDBackup);
cudaSetDevice(devID); cudaSetDevice(devID);
cudaMemset(data, 0, size);
if(stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
cudaSetDevice(devIDBackup); cudaSetDevice(devIDBackup);
#endif #endif
} }
...@@ -807,13 +798,8 @@ void XTensor::SetZeroAll(XStream* stream) ...@@ -807,13 +798,8 @@ void XTensor::SetZeroAll(XStream* stream)
#ifdef USE_CUDA #ifdef USE_CUDA
int devIDBackup = 0; int devIDBackup = 0;
cudaGetDevice(&devIDBackup); cudaGetDevice(&devIDBackup);
cudaSetDevice(devID); cudaSetDevice(devID);
cudaMemset(data, 0, unitNum * unitSize);
if(stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
cudaSetDevice(devIDBackup); cudaSetDevice(devIDBackup);
#endif #endif
} }
...@@ -845,11 +831,11 @@ void XTensor::Rand(int rNum, int cNum) ...@@ -845,11 +831,11 @@ void XTensor::Rand(int rNum, int cNum)
} }
/* generate data items with a range by start, end and the step /* generate data items with a range by start, end and the step
>> start - the begin of the array >> start - the beginning of the array
>> end - the end of the array (not included self) >> end - the end of the array (it does not includes itself)
>> step - the step of two items >> step - the step we take along the array
*/ */
void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step) void XTensor::Range(int lower, int upper, int step)
{ {
_SetDataRange(this, lower, upper, step); _SetDataRange(this, lower, upper, step);
} }
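/* A minimal sketch of the new integer interface (assuming t is an initialized
   1-D tensor with 5 items). */
void RangeDemo(XTensor &t)
{
    t.Range(0, 10, 2); /* fills 0, 2, 4, 6, 8; the end value itself is excluded */
}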
......
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#include <math.h> #include <math.h>
#include "XGlobal.h" #include "XGlobal.h"
#include "XPRunner.h" #include "XPRunner.h"
#include "XStream.h"
#include "XHeap.h" #include "XHeap.h"
#include "XList.h" #include "XList.h"
#include "XDataType.h" #include "XDataType.h"
...@@ -157,6 +156,11 @@ public: ...@@ -157,6 +156,11 @@ public:
/* mark for traversing the graph */ /* mark for traversing the graph */
unsigned int visitMark; unsigned int visitMark;
/* indicates whether the gradient of the tensor has been computed (in the backward process)
Note that the indicator could be modified by XNet (in back propagation) and be accessed
in XTrainer (and related classes). */
bool isGradFinished;
/* gradient (for back-propagation) */ /* gradient (for back-propagation) */
XTensor * grad; XTensor * grad;
...@@ -303,7 +307,7 @@ public: ...@@ -303,7 +307,7 @@ public:
MTYPE GetOffset3D(int d0, int d1, int d2) const; MTYPE GetOffset3D(int d0, int d1, int d2) const;
/* a tensor with all entries of 0 */ /* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL); void SetZeroAll();
/* set the tensor with an data array */ /* set the tensor with an data array */
void SetData(const void * d, int num, int beg = 0); void SetData(const void * d, int num, int beg = 0);
...@@ -311,8 +315,8 @@ public: ...@@ -311,8 +315,8 @@ public:
/* generate data items with a uniform distribution in [0, 1] */ /* generate data items with a uniform distribution in [0, 1] */
void Rand(int rNum, int cNum); void Rand(int rNum, int cNum);
/* generate data items with a range by start, end and the step */ /* generate data items with a range by start, end and step */
void Range(DTYPE lower, DTYPE upper, DTYPE step); void Range(int lower, int upper, int step);
/* generate data items with a fixed value */ /* generate data items with a fixed value */
template<class T> template<class T>
......
...@@ -38,7 +38,7 @@ XThread::XThread() ...@@ -38,7 +38,7 @@ XThread::XThread()
#endif #endif
MUTEX_INIT(gMutex); MUTEX_INIT(gMutex);
function = NULL; function = NULL;
argv = NULL; argv.Clear();
toBreak = false; toBreak = false;
jobCount = 0; jobCount = 0;
working = 0; working = 0;
...@@ -69,6 +69,18 @@ void * XThread::Wrapper(void * ptr) ...@@ -69,6 +69,18 @@ void * XThread::Wrapper(void * ptr)
return 0; return 0;
} }
/*
initialize the thread with the function and its parameters
>> myFunc - the function to run
>> myArgv - arguments of the function
*/
void XThread::SetFunc(TFunction myFunc, XList * myArgv)
{
function = myFunc;
argv.Clear();
argv.AddList(myArgv);
}
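/* A minimal sketch of starting a thread with the new interface, mirroring the
   call sites in XPRunner::Run and XQueue::RunJobConsumer. */
void ThreadDemo(TFunction myJob, XList * myArgs)
{
    XThread worker;
    worker.SetFunc(myJob, myArgs); /* copy the argument list into worker.argv */
    worker.Start();                /* create the underlying thread            */
    worker.LetItGo();              /* trigger one round of myJob(&argv)       */
}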
/* /*
The main loop of this thread. It is a very basic implementation.
...@@ -77,6 +89,10 @@ After that, we wait again if there is no new job. ...@@ -77,6 +89,10 @@ After that, we wait again if there is no new job.
*/ */
void XThread::Run() void XThread::Run()
{ {
if (function == NULL) {
ShowNTErrors("You are running a thread with no function specified!");
}
#ifdef _WIN32 #ifdef _WIN32
//COND_RESET(gCond); //COND_RESET(gCond);
#endif #endif
...@@ -104,7 +120,7 @@ void XThread::Run() ...@@ -104,7 +120,7 @@ void XThread::Run()
} }
/* do what you want to do*/ /* do what you want to do*/
function(argv); function(&argv);
#ifdef USE_PTHREAD #ifdef USE_PTHREAD
jobCount--; jobCount--;
......
...@@ -54,38 +54,7 @@ namespace nts{ ...@@ -54,38 +54,7 @@ namespace nts{
(unsigned)(flag), (unsigned *)(id)) (unsigned)(flag), (unsigned *)(id))
#endif #endif
////////////////////////////////////////////////// typedef void (*TFunction) (volatile XList*);
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
typedef void (*TFunction) (volatile TensorList*);
/* /*
This is a class that wraps the standard implementation of threading This is a class that wraps the standard implementation of threading
...@@ -128,12 +97,10 @@ public: ...@@ -128,12 +97,10 @@ public:
public: public:
/* function to run */ /* function to run */
volatile
TFunction function; TFunction function;
/* arguments (for the function to run) */ /* arguments (for the function to run) */
volatile XList argv;
TensorList * argv;
/* a flag to break */ /* a flag to break */
volatile volatile
...@@ -154,6 +121,9 @@ public: ...@@ -154,6 +121,9 @@ public:
/* a wrapper for the start-routine parameter in pthread_create */ /* a wrapper for the start-routine parameter in pthread_create */
static void * Wrapper(void * ptr); static void * Wrapper(void * ptr);
/* initialize the thread with the function and its parameters */
void SetFunc(TFunction myFunc, XList * myArgv);
/* /*
Core of the thread. It is a very basic implementation. Core of the thread. It is a very basic implementation.
We loop and wait for a signal to activate the job processing. We loop and wait for a signal to activate the job processing.
......
...@@ -155,13 +155,13 @@ void XMemSet(int devID, void * p, int value, size_t size) ...@@ -155,13 +155,13 @@ void XMemSet(int devID, void * p, int value, size_t size)
cudaMemcpyKind GetMemcpyKind(int devIDFrom, int devIDTo) cudaMemcpyKind GetMemcpyKind(int devIDFrom, int devIDTo)
{ {
if(devIDFrom < 0 && devIDTo < 0) if(devIDFrom < 0 && devIDTo < 0)
return cudaMemcpyHostToHost; return cudaMemcpyKind::cudaMemcpyHostToHost;
else if(devIDFrom < 0 && devIDTo >= 0) else if(devIDFrom < 0 && devIDTo >= 0)
return cudaMemcpyHostToDevice; return cudaMemcpyKind::cudaMemcpyHostToDevice;
else if(devIDFrom >= 0 && devIDTo < 0) else if(devIDFrom >= 0 && devIDTo < 0)
return cudaMemcpyDeviceToHost; return cudaMemcpyKind::cudaMemcpyDeviceToHost;
else else
return cudaMemcpyDeviceToDevice; return cudaMemcpyKind::cudaMemcpyDeviceToDevice;
} }
#endif #endif
...@@ -311,44 +311,6 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit ...@@ -311,44 +311,6 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
#endif #endif
} }
void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
{
if (t == s)
return;
if (devIDT < 0 && devIDS < 0) {
for(int i = 0; i < n; i++)
memcpy((char*)t + tPitch * i, (char*)s + sPitch * i, mSize);
return;
}
#ifdef USE_CUDA
else{
CheckNTErrors(stream != NULL, "No stream found!");
cudaStream_t &cstream = stream->stream;
if (devIDT >= 0 && devIDS < 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
}
}
else if (devIDT < 0 && devIDS >= 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
}
else {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, cstream);
if (error != cudaSuccess) {
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
}
}
}
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
void * XMemAlloc(int devID, size_t size) void * XMemAlloc(int devID, size_t size)
{ {
void * p = NULL; void * p = NULL;
...@@ -523,6 +485,9 @@ unsigned int GetNextPower2(unsigned int n) ...@@ -523,6 +485,9 @@ unsigned int GetNextPower2(unsigned int n)
/* sleep for a while */ /* sleep for a while */
void XSleep(int sleepTime) void XSleep(int sleepTime)
{ {
if (sleepTime <= 0)
return;
#ifdef _WIN32 #ifdef _WIN32
Sleep((DWORD)sleepTime); Sleep((DWORD)sleepTime);
#else #else
...@@ -591,9 +556,9 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com ...@@ -591,9 +556,9 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0; stackptr = 0;
lo = (char*)data; lo = (char*)data;
hi = (char*)data + realStride * (num - 1); hi = (char*)data + (long long)realStride * (num - 1);
indexlo = (int*)index; indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL; indexhi = index != NULL ? (int*)index + (long long)stride * (num - 1) : NULL;
recurse: recurse:
...@@ -603,8 +568,8 @@ recurse: ...@@ -603,8 +568,8 @@ recurse:
if(size <= MIN_QSORT_NUM) if(size <= MIN_QSORT_NUM)
XShortSort(lo, hi, indexlo, indexhi, width, stride, comp); XShortSort(lo, hi, indexlo, indexhi, width, stride, comp);
else { else {
mid = lo + (size/2) * realStride; mid = lo + (long long)(size/2) * realStride;
indexmid = indexlo + (size/2) * stride; indexmid = indexlo + (long long)(size/2) * stride;
/* sort the first, last and middle elements into order */ /* sort the first, last and middle elements into order */
if(comp(lo, mid) > 0) if(comp(lo, mid) > 0)
...@@ -872,8 +837,7 @@ int SplitALine(char* inputString, const char* seperator, StrList* items) ...@@ -872,8 +837,7 @@ int SplitALine(char* inputString, const char* seperator, StrList* items)
return 0; return 0;
if (sepLen == 0) { if (sepLen == 0) {
char* item = new char[(long)inputLen + 1];
char* item = new char[inputLen + 1];
strcpy(item, inputString); strcpy(item, inputString);
items->Add(item); items->Add(item);
} }
......
...@@ -42,7 +42,6 @@ extern void XMemSet(void * p, int value, size_t size); ...@@ -42,7 +42,6 @@ extern void XMemSet(void * p, int value, size_t size);
extern void XMemSet(int devID, void * p, int value, size_t size); extern void XMemSet(int devID, void * p, int value, size_t size);
extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size); extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n); extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
extern void * XMemAlloc(int devID, size_t size); extern void * XMemAlloc(int devID, size_t size);
extern void * XMemAllocOnDev(int devID, size_t size); extern void * XMemAllocOnDev(int devID, size_t size);
extern void XMemFree(int devID, void * p); extern void XMemFree(int devID, void * p);
......
...@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea ...@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
if (b.order == 0){ if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D(); DTYPE scale = 1.0F / b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem); XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem); XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F); ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F); ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c); Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2); DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
} }
else { else {
int n = GetBroadcastDimIndex(a, b); int n = GetBroadcastDimIndex(a, b);
......
...@@ -42,12 +42,11 @@ where trans() return the transposed matrix if the flag is fired ...@@ -42,12 +42,11 @@ where trans() return the transposed matrix if the flag is fired
>> alpha - a coefficient >> alpha - a coefficient
>> beta - another coefficient >> beta - another coefficient
>> parallelRunner - parallel processing module >> parallelRunner - parallel processing module
>> stream - the string for creating the job pipeline
*/ */
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream) XPRunner * parallelRunner)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!"); CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
...@@ -69,7 +68,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -69,7 +68,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA #ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream); _CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta);
return; return;
} }
#endif #endif
......
...@@ -119,11 +119,10 @@ where trans() return the transposed matrix if the flag is fired ...@@ -119,11 +119,10 @@ where trans() return the transposed matrix if the flag is fired
>> c - where we put a*b >> c - where we put a*b
>> alpha - a coefficient >> alpha - a coefficient
>> beta - another coefficient >> beta - another coefficient
>> stream - the string for creating the job pipeline
*/ */
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream) XTensor * c, DTYPE alpha, DTYPE beta)
{ {
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0]; int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1]; int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
...@@ -152,10 +151,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -152,10 +151,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle(); cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (beta == 0) if (beta == 0)
c->SetZeroAll(); c->SetZeroAll();
......
...@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta ...@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
*/ */
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta ...@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
*/ */
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b, ...@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio); float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem); XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */ /* call _MatrixMul function */
...@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b, ...@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
/* destroy variables */ /* destroy variables */
delete[] dimSize; delete[] dimSize;
DelTensorBuf(tmp); DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c; return c;
} }
...@@ -121,8 +125,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX, ...@@ -121,8 +125,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have an order >= 2!"); CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have an order >= 2!");
int xn = transposedX == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2]; int xn = transposedX == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1]; //int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2]; //int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedW == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1]; int wm = transposedW == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2; int order = x.order + w.order - 2;
...@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX, ...@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio); float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem); XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */ /* call _MatrixMul function */
...@@ -175,8 +181,10 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX, ...@@ -175,8 +181,10 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
/* destroy variables */ /* destroy variables */
delete[] dimSize; delete[] dimSize;
DelTensorBuf(tmp); DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c; return c;
} }
} }
\ No newline at end of file
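The MulAndShift changes above are the template for the whole commit: every NewTensorBufV2/AllocBuf on a memory pool is now bracketed by LockBuf/UnlockBuf, because the pool's buffer is a shared stack, while the pool-less path keeps using XMemAlloc/XMemFree and needs no lock. A condensed sketch of the idiom; the helper itself is illustrative, not from the patch:

// hedged sketch of the locking idiom applied throughout this commit
void WorkOnPoolBuffer(const XTensor &x)
{
    if (x.mem != NULL)
        x.mem->LockBuf();                    // serialize access to the pool buffer

    XTensor * tmp = NewTensorBufV2(&x, x.devID, x.mem);

    /* ... use tmp ... */

    DelTensorBuf(tmp);                       // return the buffer to the pool

    if (x.mem != NULL)
        x.mem->UnlockBuf();                  // unlock only after the buffer is freed
}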
...@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l ...@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
if (b.order == 0){ if (b.order == 0){
DTYPE scale = b.Get0D(); DTYPE scale = b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem); XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem); XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F); ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F); ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c); Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2); DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
} }
else { else {
int n = GetBroadcastDimIndex(a, b); int n = GetBroadcastDimIndex(a, b);
......
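The reordered cleanup here is deliberate. Buffers taken from one pool behave like a stack, so when a.mem and c.mem are the same pool, tmp2 (allocated last) must be returned before tmp1, and each pool is unlocked only once its own buffer is back; the (c.mem != a.mem) guard avoids locking the same mutex twice. A hedged sketch of the invariant for the shared-pool case:

// hedged sketch: pool buffers are stack-like, so release in reverse order
XMem * mem = a.mem;                                  // assume a and c share this pool
mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, mem);   // pushed first
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, mem);   // pushed second
/* ... compute with tmp1 and tmp2 ... */
DelTensorBuf(tmp2);                                  // popped first (LIFO)
DelTensorBuf(tmp1);                                  // popped second
mem->UnlockBuf();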
...@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE ...@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
source = target; source = target;
} }
target = t->mem != NULL ? /*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize): t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize); XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source; s->data = source;
t->data = target; t->data = target;
...@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE ...@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
/* free the memory space of the one before the last allocation */ /* free the memory space of the one before the last allocation */
if(count > 0){ if(count > 0){
int size = s->unitNum * s->unitSize; int size = s->unitNum * s->unitSize;
if(t->mem != NULL) if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, size); t->mem->ReleaseBuf(t->devID, size);
}
else else
XMemFree(t->devID, source); XMemFree(t->devID, source);
} }
...@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE ...@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
if(isLast){ if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!"); CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta); _Multiply(a, t, c, beta);
if(t->mem != NULL) if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize); t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else else
XMemFree(t->devID, target); XMemFree(t->devID, target);
target = NULL; target = NULL;
......
...@@ -147,25 +147,27 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -147,25 +147,27 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
int * bp = (int*)b->data; int * bp = (int*)b->data;
int * cp = (int*)c->data; int * cp = (int*)c->data;
/* TODO: new code for beta = 1. the follow code might be slow because it introduces
additional floating-point computation. */
/* unrolling */ /* unrolling */
int num = a->unitNum; int num = a->unitNum;
if (num % 4 == 0) { if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) { for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] + bp[i] * beta; cp[i] = ap[i] + (int)(bp[i] * beta);
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta; cp[i + 1] = ap[i + 1] + (int)(bp[i + 1] * beta);
cp[i + 2] = ap[i + 2] + bp[i + 2] * beta; cp[i + 2] = ap[i + 2] + (int)(bp[i + 2] * beta);
cp[i + 3] = ap[i + 3] + bp[i + 3] * beta; cp[i + 3] = ap[i + 3] + (int)(bp[i + 3] * beta);
} }
} }
else if (num % 2 == 0) { else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) { for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] + bp[i] * beta; cp[i] = ap[i] + (int)(bp[i] * beta);
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta; cp[i + 1] = ap[i + 1] + (int)(bp[i + 1] * beta);
} }
} }
else { else {
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
cp[i] = ap[i] + bp[i] * beta; cp[i] = ap[i] + (int)(bp[i] * beta);
} }
} }
} }
......
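The added casts matter because beta is a DTYPE (float): bp[i] * beta promotes the product to floating point, so storing it into the int array cp is a narrowing conversion. The explicit (int) makes the truncation visible and silences compiler warnings; the same fix appears further down in _Clip and _ScaleAndShift. A two-line illustration:

// hedged illustration of the promotion the casts address
int   b    = 7;
float beta = 0.5F;
int   c    = (int)(b * beta);   // 7 * 0.5f = 3.5f, truncated to 3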
...@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta ...@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
source = target; source = target;
} }
target = t->mem != NULL ? /*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize): t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize); XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source; s->data = source;
t->data = target; t->data = target;
...@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta ...@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
if(isLast){ if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!"); CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta); _Sum(a, t, c, beta);
if(t->mem != NULL) if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize); t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else else
XMemFree(t->devID, target); XMemFree(t->devID, target);
target = NULL; target = NULL;
......
...@@ -113,6 +113,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -113,6 +113,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
int count, int na, int ma, int nb, int mb, int nc, int mc, int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta) DTYPE alpha, DTYPE beta)
{ {
int version = 0;
cudaRuntimeGetVersion(&version);
/* /*
matrix-matrix multiplication matrix-matrix multiplication
For row-major matrices (as in c/c++), the trick used here is (AB)^T = B^T * A^T For row-major matrices (as in c/c++), the trick used here is (AB)^T = B^T * A^T
...@@ -327,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -327,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
DTYPE ** cpGPU = NULL; DTYPE ** cpGPU = NULL;
if (mem != NULL) { if (mem != NULL) {
mem->LockBuf();
mem->SetPinBuf(); mem->SetPinBuf();
apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256); apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256); bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
...@@ -353,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -353,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
delete[] bp; delete[] bp;
delete[] cp; delete[] cp;
if(mem != NULL) if (mem != NULL) {
mem->BackToPinBuf(); mem->BackToPinBuf();
mem->UnlockBuf();
}
else { else {
XMemFree(a0->devID, apGPU); XMemFree(a0->devID, apGPU);
XMemFree(a0->devID, bpGPU); XMemFree(a0->devID, bpGPU);
......
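SetPinBuf/BackToPinBuf mark a rewind point in the pool buffer so the three pointer arrays can be popped in one step; the new LockBuf/UnlockBuf pair simply encloses that whole pinned region. A condensed sketch of the sequence (count is illustrative):

// hedged sketch of the pinned-buffer sequence under the new lock
mem->LockBuf();
mem->SetPinBuf();                                    // remember the current buffer top
DTYPE ** apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * count, 256);
DTYPE ** bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * count, 256);
DTYPE ** cpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * count, 256);
/* ... launch the batched GEMMs ... */
mem->BackToPinBuf();                                 // pop everything since SetPinBuf
mem->UnlockBuf();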
...@@ -96,9 +96,12 @@ XTensor OnehotToIndex(const XTensor & onehot, int size) ...@@ -96,9 +96,12 @@ XTensor OnehotToIndex(const XTensor & onehot, int size)
/* /*
convert index tensor to onehot tensor convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num >> index - index of the output dimension (over the vocabulary)
>> onehot - onehot tensor, which value is 0 or 1 >> onehot - one-hot representation of the index
>> size - the last dimension size of the onehot tensor >> size - vocabulary size (the last dimension size of onehot)
>> labelSmoothingP - the parameter that controls how smooth the output is.
E.g., p = 0 means no smoothing
p = 1 means a uniform distribution (almost)
*/ */
void _IndexToOnehot(const XTensor * index, XTensor * onehot, void _IndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float labelSmoothingP) int size, float labelSmoothingP)
......
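For reference, with smoothing parameter p and vocabulary size V, a standard label-smoothing fill puts p/V on every entry and the remaining 1 - p on the gold index, which matches the "almost uniform at p = 1" note above. A hedged CPU-side sketch of that distribution, not the library's kernel:

// hedged sketch: one standard label-smoothing fill (the actual kernel may differ)
void SmoothOnehotRow(float * row, int size, int goldIndex, float p)
{
    for (int i = 0; i < size; i++)
        row[i] = p / size;            // spread mass p uniformly over the vocabulary
    row[goldIndex] += 1.0F - p;       // put the remaining mass on the gold label
}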
...@@ -483,7 +483,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -483,7 +483,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
else if (tensor->dataType == X_FLOAT16) { else if (tensor->dataType == X_FLOAT16) {
unsigned short* d = (unsigned short*)tensor->data; unsigned short* d = (unsigned short*)tensor->data;
for (int i = 0; i < tensor->unitNum; i++) { for (int i = 0; i < tensor->unitNum; i++) {
d[i] = variance * ((unsigned short)rand() / RAND_MAX) + lower; d[i] = (unsigned short)(variance * ((unsigned short)rand() / RAND_MAX) + lower);
} }
} }
else if(tensor->dataType == X_DOUBLE){ else if(tensor->dataType == X_DOUBLE){
...@@ -538,17 +538,17 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -538,17 +538,17 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
/* generate data items with a range by start, end and the step /* generate data items with a range by start, end and the step
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
>> start - the begin of the array >> beg - the beginning of the array
>> end - the end of the array (not included self) >> end - the end of the array (exclusive; end itself is not included)
>> step - the step of two items >> step - the step we take along the array
*/ */
void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step) void _SetDataRange(XTensor * tensor, int beg, int end, int step)
{ {
CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!"); CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!");
/* compute the true length according to the (start, end, step) */ /* compute the true length according to the (start, end, step) */
DTYPE size = (DTYPE)fabs(upper - lower); DTYPE size = (DTYPE)fabs(end - beg);
int num = ceil(size / fabs(step)); int num = (int)ceil(size / fabs(step));
CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor does not match the range."); CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor does not match the range.");
/* init a integer array to store the sequence */ /* init a integer array to store the sequence */
...@@ -556,12 +556,13 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step) ...@@ -556,12 +556,13 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
if (tensor->dataType == X_INT) { if (tensor->dataType == X_INT) {
data = new int[num]; data = new int[num];
for (int i = 0; i < num; i++) for (int i = 0; i < num; i++)
*((int*)data + i) = lower + i * step; *((int*)data + i) = beg + i * step;
} }
else if (tensor->dataType == X_FLOAT) { else if (tensor->dataType == X_FLOAT) {
data = new float[num]; ShowNTErrors("TODO! Unsupported datatype!")
for (int i = 0; i < num; i++) //data = new float[num];
*((float*)data + i) = lower + i * step; //for (int i = 0; i < num; i++)
// *((float*)data + i) = beg + i * step;
} }
else { else {
ShowNTErrors("TODO! Unsupported datatype!") ShowNTErrors("TODO! Unsupported datatype!")
...@@ -695,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu ...@@ -695,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
#ifdef USE_CUDA #ifdef USE_CUDA
XMem * mem = tensor->mem; XMem * mem = tensor->mem;
MTYPE size = num * sizeof(MTYPE); MTYPE size = num * sizeof(MTYPE);
MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size); //MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
MTYPE * offsetsCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
}
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE)); XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
_CudaSetDataWithOffset(tensor, offsetsCuda, value, num); _CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
if (mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size); mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else else
XMemFree(tensor->devID, offsetsCuda); XMemFree(tensor->devID, offsetsCuda);
#else #else
......
...@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va ...@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
MTYPE * offsetsCuda = mem != NULL ? /*MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) : (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize); (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ? void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) : mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize); XMemAlloc(tensor->devID, valueSize);*/
MTYPE * offsetsCuda;
void * valuesCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
valuesCuda = mem->AllocBuf(mem->devID, valueSize);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
valuesCuda = XMemAlloc(tensor->devID, valueSize);
}
if (mem != NULL) { if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize); XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
...@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va ...@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
if (mem != NULL) { if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize); mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize); mem->ReleaseBuf(mem->devID, offsetSize);
mem->UnlockBuf();
} }
else { else {
XMemFree(tensor->devID, valuesCuda); XMemFree(tensor->devID, valuesCuda);
......
...@@ -57,8 +57,8 @@ void _SetDataRand(XTensor * tensor, int rNum, int cNum); ...@@ -57,8 +57,8 @@ void _SetDataRand(XTensor * tensor, int rNum, int cNum);
/* generate data items with a uniform distribution in [lower, upper] */ /* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper); void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a range by start, end and the step */ /* generate data items in the range [beg, end) with a step */
void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step); void _SetDataRange(XTensor * tensor, int beg, int end, int step);
/* generate data items with a uniform distribution in [lower, upper] and set /* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */ the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
......
...@@ -63,9 +63,9 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper) ...@@ -63,9 +63,9 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
int* db = (int*)b->data; int* db = (int*)b->data;
for (int i = 0; i < a->unitNum; i++) { for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper) if (d[i] > upper)
db[i] = upper; db[i] = (int)upper;
else if (d[i] < lower) else if (d[i] < lower)
db[i] = lower; db[i] = (int)lower;
else else
db[i] = d[i]; db[i] = d[i];
} }
......
...@@ -86,7 +86,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -86,7 +86,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
for(int i = 0; i < num; i++){ for(int i = 0; i < num; i++){
int * v = (int*)f; int * v = (int*)f;
int * vb = (int*)fb; int * vb = (int*)fb;
*vb = *v * scale + shift; *vb = (int)(*v * scale + shift);
f += sizeof(int) + sizeof(int); f += sizeof(int) + sizeof(int);
fb += sizeof(int) + sizeof(int); fb += sizeof(int) + sizeof(int);
} }
...@@ -96,7 +96,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -96,7 +96,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
int * va = (int*)a->data; int * va = (int*)a->data;
int * vb = (int*)b->data; int * vb = (int*)b->data;
for(int i = 0; i < b->unitNum; i++){ for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift; *vb = (int)(*va * scale + shift);
va++; va++;
vb++; vb++;
} }
......
...@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void ...@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
if (devID >= 0) { if (devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
/* copy the index from host to device */ /* copy the index from host to device */
int * targetBlocksTMP = myMem != NULL ? /*int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(devID, blockNum * sizeof(int)): (int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int)); (int*)XMemAlloc(devID, blockNum * sizeof(int));*/
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
}
else {
targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int)); XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID); _CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
if(myMem != NULL) if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int)); myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else else
XMemFree(devID, targetBlocksTMP); XMemFree(devID, targetBlocksTMP);
#else #else
......
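Since the Lock/Alloc/.../Release/Unlock bracket now recurs at dozens of call sites, a scope guard would make the unlock automatic on every exit path. A hypothetical RAII wrapper, not part of this patch:

// hypothetical helper (not in the codebase): unlocks the pool on scope exit
class XMemBufLock
{
    XMem * mem;
public:
    explicit XMemBufLock(XMem * m) : mem(m)
    {
        if (mem != NULL)
            mem->LockBuf();
    }
    ~XMemBufLock()
    {
        if (mem != NULL)
            mem->UnlockBuf();
    }
};

With such a guard, a single XMemBufLock lock(myMem); at the top of the allocation scope would replace the explicit LockBuf call and both UnlockBuf branches, since the destructor runs after the ReleaseBuf calls in the same scope.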
...@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, ...@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
#ifdef USE_CUDA #ifdef USE_CUDA
int * indexGPU = index; int * indexGPU = index;
if (!isIndexOnDev) { if (!isIndexOnDev) {
myMem->LockBuf();
indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int)); indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int)); XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
} }
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem); _CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev) if (!isIndexOnDev) {
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int)); myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
myMem->UnlockBuf();
}
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
......
...@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s ...@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
ProtectCudaDev(devID, devIDBackup); ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */ /* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ? /*int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int)); (int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ? int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int)); (int *)XMemAlloc(devID, blockNum * sizeof(int));*/
int * sourceBlocksTMP;
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
}
else {
sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int)); XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int)); XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
...@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s ...@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
if (myMem != NULL) { if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int)); myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int)); myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
} }
else { else {
XMemFree(devID, sourceBlocksTMP); XMemFree(devID, sourceBlocksTMP);
......
...@@ -32,9 +32,8 @@ copy s to t ...@@ -32,9 +32,8 @@ copy s to t
>> s - source >> s - source
>> t - target >> t - target
>> stream - the stream for creating the job pipeline
*/ */
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream) void _CopyValues(const XTensor * s, XTensor * t)
{ {
if(s->data == NULL && t->data == NULL) if(s->data == NULL && t->data == NULL)
return; return;
...@@ -55,7 +54,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -55,7 +54,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0) { if (s->devID >= 0 || t->devID >= 0) {
_CudaCopyValues(s, t, stream); _CudaCopyValues(s, t);
return; return;
} }
#endif #endif
...@@ -82,9 +81,8 @@ copy s to t ...@@ -82,9 +81,8 @@ copy s to t
>> sLen - length of the segment >> sLen - length of the segment
>> t - target >> t - target
>> tBeg - beginning of the segment on the target side >> tBeg - beginning of the segment on the target side
>> stream - the stream for creating the job pipeline
*/ */
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream) void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg)
{ {
if(s->data == NULL && t->data == NULL) if(s->data == NULL && t->data == NULL)
return; return;
...@@ -108,13 +106,12 @@ void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, ...@@ -108,13 +106,12 @@ void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t,
/* /*
copy s to t (rename _CopyValues) copy s to t (rename _CopyValues)
>> s - source >> s - source
>> t - target >> t - target
>> stream - the stream for creating the job pipeline
*/ */
void CopyValues(const XTensor &s, XTensor &t, XStream * stream) void CopyValues(const XTensor &s, XTensor &t)
{ {
_CopyValues(&s, &t, stream); _CopyValues(&s, &t);
} }
/* /*
...@@ -122,16 +119,15 @@ copy s to t (return an XTensor structure) ...@@ -122,16 +119,15 @@ copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
>> s - source >> s - source
>> stream - the stream for creating the job pipeline
<< return - the copied tensor t << return - the copied tensor t
*/ */
XTensor CopyValues(const XTensor &s, XStream * stream) XTensor CopyValues(const XTensor &s)
{ {
XTensor t(&s); XTensor t(&s);
t.SetTMPFlag(); t.SetTMPFlag();
/* call _CopyValues function */ /* call _CopyValues function */
_CopyValues(&s, &t, stream); _CopyValues(&s, &t);
/* tensor connection */ /* tensor connection */
if (s.enableGrad) { if (s.enableGrad) {
......
...@@ -32,10 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,10 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
copy a range of elements from a source vector to a target vector copy a range of elements from a source vector to a target vector
>> s - source matrix >> s - source matrix
>> t - target matrix >> t - target matrix
>> stream - the stream for creating the job pipeline
<< return - succeed or not << return - succeed or not
*/ */
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream) void _CudaCopyValues(const XTensor * s, XTensor * t)
{ {
CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!"); CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!"); CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
...@@ -45,10 +44,7 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -45,10 +44,7 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
/* dense -> dense */ /* dense -> dense */
if (!s->isSparse && !t->isSparse) { if (!s->isSparse && !t->isSparse) {
if (stream == NULL) XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum, stream->stream, stream->devID);
} }
/* dense -> sparse */ /* dense -> sparse */
else if (!s->isSparse && t->isSparse && else if (!s->isSparse && t->isSparse &&
...@@ -72,11 +68,8 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -72,11 +68,8 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
int num = s->unitNumNonZero; int num = s->unitNumNonZero;
int size = sizeof(int) + num * (s->unitSize + sizeof(int)); int size = sizeof(int) + num * (s->unitSize + sizeof(int));
if (stream == NULL) XMemCopy(t->data, t->devID, s->data, s->devID, size);
XMemCopy(t->data, t->devID, s->data, s->devID, size);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, size, stream->stream, stream->devID);
t->unitNumNonZero = num; t->unitNumNonZero = num;
} }
else { else {
......
...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */ /* copy all elements from a source matrix to a target matrix */
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CudaCopyValues(const XTensor * s, XTensor * t);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,19 +27,19 @@ ...@@ -27,19 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */ /* copy s to t */
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CopyValues(const XTensor * s, XTensor * t);
/* copy a segment of s to t */ /* copy a segment of s to t */
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL); void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg);
/* copy s to t (rename _CopyValues) */ /* copy s to t (rename _CopyValues) */
void CopyValues(const XTensor &s, XTensor &t, XStream * stream = NULL); void CopyValues(const XTensor &s, XTensor &t);
/* /*
copy s to t (return an XTensor structure) copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
*/ */
XTensor CopyValues(const XTensor &s, XStream * stream = NULL); XTensor CopyValues(const XTensor &s);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
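With the defaulted XStream parameters gone from the header, every copy is synchronous and call sites simply drop the trailing argument. A minimal sketch, assuming two tensors of identical shape (setup illustrative):

// hedged sketch: the stream-free copy API after this change
XTensor s, t;
InitTensor2D(&s, 2, 3, X_FLOAT, -1);
InitTensor2D(&t, 2, 3, X_FLOAT, -1);
s.SetDataRand(0.0F, 1.0F);
CopyValues(s, t);                 // was CopyValues(s, t, stream)
XTensor u = CopyValues(s);        // returning variant, also stream-free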
...@@ -115,7 +115,7 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -115,7 +115,7 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) { for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride; int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!"); CheckNTErrors(sIndex < s->unitNum && sIndex >= 0, "Wrong index!");
for (int j = 0; j < stride; j++) for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j]; tData[i * stride + j] = sData[sIndex + j];
} }
......
...@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!"); CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
} }
sIndex = mem != NULL ? /*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize); (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize); XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
} }
else { else {
...@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
} }
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
if(mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize); mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else else
XMemFree(devID, sIndex); XMemFree(devID, sIndex);
} }
...@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!"); CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
} }
sIndex = mem != NULL ? /*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize); (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize); XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
} }
else { else {
...@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
else { else {
ShowNTErrors("Unsupported dataType!"); ShowNTErrors("Unsupported dataType!");
} }
if (srcIndex->devID < 0) {
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(devID, sIndex);
}
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -231,8 +231,8 @@ And this is a special spread function for backward computation of gather functio ...@@ -231,8 +231,8 @@ And this is a special spread function for backward computation of gather functio
*/ */
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
{ {
int dim = 0; //int dim = 0;
int order = source->order; //int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!"); CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
...@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) ...@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
} }
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim, ...@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
DTYPE * c = (DTYPE*)collection->data; DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem; XMem * mem = source->mem;
int * si = mem != NULL ? /*int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2); (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
int * si;
if (mem != NULL) {
mem->LockBuf();
si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
}
else {
si = (int*)XMemAlloc(source->devID, sizeof(int) * indexSize * 2);
}
int * ci = si + indexSize; int * ci = si + indexSize;
XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize); XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize);
...@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim, ...@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci); stride, indexSize, si, ci);
if(mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2); mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
mem->UnlockBuf();
}
else else
XMemFree(source->devID, si); XMemFree(source->devID, si);
} }
...@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI ...@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
dim3 threads(cudaBlocks[0], cudaBlocks[1]); dim3 threads(cudaBlocks[0], cudaBlocks[1]);
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
sIndex = mem != NULL ? /*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize); (int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize); XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
} }
else else
...@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI ...@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
} }
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
if(mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize); mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else else
XMemFree(devID, sIndex); XMemFree(devID, sIndex);
} }
......
...@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum, ...@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN) KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT) KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/* /*
get the max-valued items along a dimension of the tensor (cuda version). get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a, For a 1-dimensional data array a,
output_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor >> input - the input tensor
...@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim) ...@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
XMem * mem = input->mem; \ XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \ GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \ int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \ DTYPE * buf; \
if (mem != NULL) { \
mem->LockBuf(); \
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
} \
else { \
buf = (DTYPE*)XMemAlloc(devID, bufSize); \
} \
DTYPE * buf1 = buf; \ DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \ DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \ do { \
...@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim) ...@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
\ \
} while (strideNum > 1); \ } while (strideNum > 1); \
\ \
if (mem != NULL) \ if (mem != NULL) { \
mem->ReleaseBuf(mem->devID, bufSize); \ mem->ReleaseBuf(mem->devID, bufSize); \
mem->UnlockBuf(); \
} \
else \ else \
XMemFree(input->devID, buf); \ XMemFree(input->devID, buf); \
} \ } \
......
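Because this reduction is generated by a function-defining macro, every statement added to the body has to keep its trailing backslash, or the macro ends early and the generated function is cut off. For clarity, the same alloc-under-lock block extracted into a standalone macro; a hedged sketch, not a macro in the codebase:

// hedged sketch: the alloc-under-lock block as its own macro;
// note the line continuations, which the in-tree macro must also preserve
#define ALLOC_REDUCE_BUF(buf, mem, devID, bufSize)                  \
    if ((mem) != NULL) {                                            \
        (mem)->LockBuf();                                           \
        buf = (DTYPE*)(mem)->AllocBuf((mem)->devID, (bufSize));     \
    }                                                               \
    else {                                                          \
        buf = (DTYPE*)XMemAlloc((devID), (bufSize));                \
    }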
...@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); //DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf;
if (mem != NULL) {
mem->LockBuf();
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
}
else {
buf = (DTYPE*)XMemAlloc(devID, bufSize);
}
DTYPE * buf1 = buf; DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do { do {
...@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
} while (strideNum > 1); } while (strideNum > 1);
if (mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, bufSize); mem->ReleaseBuf(mem->devID, bufSize);
mem->UnlockBuf();
}
else else
XMemFree(devID, buf); XMemFree(devID, buf);
} }
......
...@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target) ...@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
int dims[1] = {source->unitNum}; int dims[1] = {source->unitNum};
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem); XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_CopyValues(source, all); _CopyValues(source, all);
_ReduceSum(all, target, 0); _ReduceSum(all, target, 0);
DelTensorBuf(all); DelTensorBuf(all);
if (source->mem != NULL)
source->mem->UnlockBuf();
} }
/* /*
...@@ -72,7 +76,8 @@ sum all the items of the tensor (It should be optimized!) ...@@ -72,7 +76,8 @@ sum all the items of the tensor (It should be optimized!)
void _ReduceSumAll(const XTensor * source, DTYPE * value) void _ReduceSumAll(const XTensor * source, DTYPE * value)
{ {
int * dimSize = new int[MAX_TENSOR_DIM_NUM]; int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source->isSparse) ? 1.0F : source->denseRatio; if (source->mem != NULL)
source->mem->LockBuf();
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem); XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag(); target->SetTMPFlag();
...@@ -82,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value) ...@@ -82,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
delete[] dimSize; delete[] dimSize;
DelTensorBuf(target); DelTensorBuf(target);
if (source->mem != NULL)
source->mem->UnlockBuf();
} }
/* /*
...@@ -122,4 +129,4 @@ DTYPE ReduceSumAllValue(const XTensor & source) ...@@ -122,4 +129,4 @@ DTYPE ReduceSumAllValue(const XTensor & source)
return target.Get0D(); return target.Get0D();
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -32,14 +32,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,14 +32,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
transform a tensor by merging it along a dimension. transform a tensor by merging it along a dimension.
e.g., (N/3, M, 3) -> (N, M) e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor >> s - the source tensor
>> t - the target tensor (for return) >> t - the target tensor (for return)
>> whereToMerge - the merging operation is along which dimension >> whereToMerge - the merging operation is along which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M) >> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 0 (i.e., the dimension for "N/3") for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3") leadingDim = 0 (i.e., the dimension for "3")
*/ */
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
{ {
...@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void * dataTMP = t->data; void * dataTMP = t->data;
if (!isOnSameDevice) if (!isOnSameDevice) {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size); /*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int blockNumInMerge = s->dimSize[leadingDim]; int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge; int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize; int realBlockSize = blockSize * t->unitSize;
int * blockIndex = (int*)(mem != NULL ? /*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) : mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int))); XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
}
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID); _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID); _CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int)); mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else else
XMemFree(s->devID, blockIndex); XMemFree(s->devID, blockIndex);
if (!isOnSameDevice) { if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size); XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size); mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else else
XMemFree(s->devID, dataTMP); XMemFree(s->devID, dataTMP);
} }
...@@ -185,13 +209,13 @@ bool CheckMergeSize(const XTensor * s, const XTensor * t, int whereToMerge, int ...@@ -185,13 +209,13 @@ bool CheckMergeSize(const XTensor * s, const XTensor * t, int whereToMerge, int
transform a tensor by merging it along a dimension (return an XTensor structure) transform a tensor by merging it along a dimension (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
e.g., (N/3, M, 3) -> (N, M) e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor >> s - the source tensor
>> whereToMerge - the merging operation is along which dimension >> whereToMerge - the merging operation is along which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M) >> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 0 (i.e., the dimension for "N/3") for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3") leadingDim = 0 (i.e., the dimension for "3")
<< return - the transformed tensor by merging along with a dimension << return - the transformed tensor by merging along with a dimension
*/ */
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim) XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
...@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge) ...@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
void * dataTMP = NULL; void * dataTMP = NULL;
if (uniform) if (uniform)
dataTMP = smallsItem0->data; dataTMP = smallsItem0->data;
else else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size); //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(t->devID, size);
}
}
tensorTMP->data = dataTMP; tensorTMP->data = dataTMP;
...@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge) ...@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
tensorTMP->data = NULL; tensorTMP->data = NULL;
delete tensorTMP; delete tensorTMP;
if ((!uniform) && (mem != NULL)) if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size); mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else else
XMemFree(t->devID, dataTMP); XMemFree(t->devID, dataTMP);
} }
......
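The conditional locking in _Merge (and in _Split below) is the subtlest spot in the commit: when source and target live on different devices, the dataTMP allocation has already taken the pool lock, and the buffer mutex is evidently treated as non-reentrant, so the blockIndex allocation must not lock again; only in the same-device case, where dataTMP aliases t->data and uses no pool buffer, does blockIndex own the lock. In outline (indexBytes is illustrative):

// hedged outline of the conditional lock in _Merge; _Split is symmetric
if (mem != NULL) {
    if (isOnSameDevice)
        mem->LockBuf();          // blockIndex is the only buffer user here
    /* otherwise dataTMP already holds the lock */
    blockIndex = (int*)mem->AllocBuf(mem->devID, indexBytes);
}
/* ... make the block index, copy the blocks ... */
if (mem != NULL) {
    mem->ReleaseBuf(mem->devID, indexBytes);
    if (isOnSameDevice)
        mem->UnlockBuf();        // the cross-device path unlocks later, after dataTMP's ReleaseBuf
}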
...@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block ...@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT, GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
cudaGridSizes, cudaBlockSizes); cudaGridSizes, cudaBlockSizes);
myMem->LockBuf();
myMem->SetPinBuf(); myMem->SetPinBuf();
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256); int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
...@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block ...@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU); (sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
myMem->BackToPinBuf(); myMem->BackToPinBuf();
myMem->UnlockBuf();
delete[] sourceArrays; delete[] sourceArrays;
delete[] targetArrays; delete[] targetArrays;
......
...@@ -96,25 +96,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -96,25 +96,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
} }
} }
else{ else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[t->devID].stream;
for (int k = 0; k < splitNum; k++) {
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) { for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID, XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID, (char*)s->data + k * sStep, sPitch, s->devID,
mSize, n); mSize, n);
} }
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
} }
} }
else { else {
...@@ -124,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -124,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data; void * dataTMP = t->data;
if (!isOnSameDevice) if (!isOnSameDevice) {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size); //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int realBlockSize = blockSize * t->unitSize; int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum; int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ? /*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) : mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int))); XMemAlloc(s->devID, blockNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
}
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID); _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID); _CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int)); mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else else
XMemFree(s->devID, blockIndex); XMemFree(s->devID, blockIndex);
...@@ -147,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -147,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
if (!isOnSameDevice) { if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size); XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL) if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size); mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else else
XMemFree(s->devID, dataTMP); XMemFree(s->devID, dataTMP);
} }
...@@ -321,27 +331,12 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli ...@@ -321,27 +331,12 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
} }
} }
else{ else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[big->devID].stream;
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) { for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k); XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID, XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID, (char*)big->data + k * sStep, sPitch, big->devID,
mSize, n); mSize, n);
} }
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
} }
} }
 /* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
...@@ -362,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int splitNum)
         dataTMP = first->data;
     }
     else {
-        dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
+        //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
+        if (mem != NULL) {
+            mem->LockBuf();
+            dataTMP = mem->AllocBuf(mem->devID, size);
+        }
+        else {
+            dataTMP = XMemAlloc(big->devID, size);
+        }
     }
     tensorTMP->data = dataTMP;
...@@ -383,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int splitNum)
     tensorTMP->data = NULL;
     delete tensorTMP;
-    if ((!uniform) && (mem != NULL))
+    if ((!uniform) && (mem != NULL)) {
         mem->ReleaseBuf(mem->devID, size);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(big->devID, dataTMP);
 }
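In this overload the lock is taken only on the non-uniform path (the else branch that draws dataTMP from the pool), and the release above is reached under the same (!uniform) && (mem != NULL) condition, so each LockBuf() pairs with exactly one UnlockBuf(). Pairing the calls by hand across branches like this is easy to get wrong; a scope-based guard would pair them automatically. A minimal sketch of such a guard (a hypothetical helper, not part of NiuTensor; it assumes only a pool type exposing LockBuf()/UnlockBuf() as in this diff):

    #include <cstddef>

    /* hypothetical RAII guard over any pool exposing LockBuf()/UnlockBuf() */
    template <typename MemT>
    struct BufLockGuard {
        MemT * mem;
        explicit BufLockGuard(MemT * m) : mem(m) {
            if (mem != NULL)
                mem->LockBuf();
        }
        ~BufLockGuard() {
            /* runs on every exit path of the scope, including early returns */
            if (mem != NULL)
                mem->UnlockBuf();
        }
        BufLockGuard(const BufLockGuard &) = delete;
        BufLockGuard & operator=(const BufLockGuard &) = delete;
    };

With such a guard, the conditional unlock bookkeeping (the isOnSameDevice checks earlier in this file) collapses into choosing whether to construct the guard at all.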
...
...@@ -43,13 +43,11 @@ void _Stack(const TensorList * smalls, XTensor * t, int dim)
     int blockSize = 1;
     int blockNum = 1;
-    int gridSize = 1;
     int gridNum = 1;
     XTensor * smallsItem0 = smalls->GetItem(0);
-    int unitNum = smallsItem0->unitNum;
+    //int unitNum = smallsItem0->unitNum;
     int unitSize = smallsItem0->unitSize;
-    int itemSize = unitNum * unitSize;
     for (int i = 0; i < smallsItem0->order; i++) {
         if (i >= dim)
...@@ -129,7 +127,7 @@ bool CheckStackShape(const TensorList &smalls, XTensor &t, int dim)
     XTensor * tensor = (XTensor*)smalls.GetItem(0);
     int order = tensor->order;
-    for (int i = 0; i < tensor->order; i++) {
+    for (int i = 0; i < order; i++) {
         if (i < dim) {
             if (t.GetDim(i) != tensor->GetDim(i))
                 return false;
...
...@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
     int m = GetNextPower2(strideNum);
     int n = stride * blockNum;
-    void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
+    //void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
+    void * buf;
+    if (mem != NULL) {
+        mem->LockBuf();
+        buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
+    }
+    else {
+        buf = XMemAlloc(a->devID, n * m * a->unitSize);
+    }
     void * bufIndex = NULL;
     if (indexA != NULL && indexB != NULL) {
         bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
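Note that bufIndex is allocated without a second LockBuf(): the lock taken for buf above is still held at this point, so the index buffer presumably rides on the same critical section rather than taking the (non-recursive) lock again.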
...@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
     KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
                                 (bufIndex, indexB->data, m, n, stride, k, blockNum);
-    if (mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(a->devID, n * m * a->unitSize);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(a->devID, buf);
     if (indexA != NULL && indexB != NULL)
...