Commit a79523f9 by liyinqiao

Merge with the xiaotong branch and add mutexes to guard memory pool operations.

parent 7d4bc44a
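The change applies one recurring pattern: every temporary tensor drawn from a memory pool's buffer is now wrapped in a LockBuf()/UnlockBuf() pair, so that concurrent threads cannot be handed the same buffer region twice. A minimal sketch of the discipline, assuming a pool-bound tensor "input" (the surrounding gradient code is elided):

    if (input->mem != NULL)
        input->mem->LockBuf();          /* take the buffer mutex */
    XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
    /* ... compute into tmp and accumulate into input->grad ... */
    DelTensorBuf(tmp);                  /* return the buffer memory */
    if (input->mem != NULL)
        input->mem->UnlockBuf();        /* release the buffer mutex */

The critical section spans AllocBuf() through ReleaseBuf(), which is why the locking is exposed to callers via LockBuf()/UnlockBuf() rather than being done inside those two functions (their internal MUTEX_LOCK/MUTEX_UNLOCK calls are left commented out below). The NULL check matters: tensors not bound to a pool fall back to plain allocation, which needs no buffer lock.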
......@@ -27,6 +27,7 @@
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/NMT.h"
#include "./train/TTrain.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -38,8 +39,14 @@ using namespace nmt;
int main( int argc, const char ** argv )
{
if(argc > 1 && !strcmp(argv[1], "-test"))
XConfig config;
config.Create(argc - 1, argv + 1);
verboseLevel = config.GetInt("verbose", 1);
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if (argc > 1 && !strcmp(argv[1], "-testtrain"))
TestTrain();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
......@@ -47,7 +54,8 @@ int main( int argc, const char ** argv )
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, " Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-testtrain\" for test of the trainer!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
......
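For context, main() now routes flags through XConfig, so verbosity can be set on the command line; a hypothetical invocation (the binary name is illustrative):

    ./NiuTensor -test -verbose 2

config.Create(argc - 1, argv + 1) skips the program name, and GetInt("verbose", 1) falls back to 1 when the flag is absent.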
......@@ -93,6 +93,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for an activation function */
......
......@@ -89,6 +89,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for a loss computation */
......
......@@ -105,12 +105,19 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -138,12 +145,19 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -167,15 +181,20 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
tmp->SetZeroAll();
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -193,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_CopyValues(node->grad, tmp);
......@@ -205,9 +226,12 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -246,13 +270,16 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
dims[j++] = input->dimSize[i];
}
}
dims[0] = -dims[0];
dims[0] = -abs(dims[0]);
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
dims[whereToMerge - leadDim] *= abs(dims[0]);
int * dimsNode = dims + 1;
dimsNode[0] = -abs(dimsNode[0]);
XTensor gradNodeSmall(node->order - leadDim, dimsNode,
node->dataType, node->denseRatio,
node->devID, node->mem);
......@@ -296,6 +323,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -379,6 +407,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -407,6 +436,7 @@ void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -442,16 +472,21 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else {
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -528,14 +563,21 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else {
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
}
node->visitMark = NODE_DOING;
node->isGradFinished = true;
}
/*
......@@ -566,14 +608,19 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -603,15 +650,20 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
}
\ No newline at end of file
......@@ -101,6 +101,7 @@ void XNet::Backward(TensorList &roots)
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
node->isGradFinished = false;
}
/* back-propagation from output to input */
......@@ -162,6 +163,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
}
else{
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
}
......
......@@ -21,8 +21,8 @@
#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "submodel/LayerNorm.h"
#include "submodel/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace nmt
......
......@@ -21,8 +21,8 @@
#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "submodel/LayerNorm.h"
#include "submodel/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace nmt
......
......@@ -23,10 +23,10 @@
#define __ENCODER_H__
#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "submodel/FNN.h"
#include "submodel/Attention.h"
#include "submodel/Embedding.h"
#include "submodel/LayerNorm.h"
#include "../../network/XNet.h"
using namespace nts;
......
......@@ -265,6 +265,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
......@@ -275,6 +276,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
......@@ -283,6 +285,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
......@@ -309,6 +312,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(padding3);
DelTensorBuf(padding2);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
}
/*
......@@ -490,7 +494,7 @@ void Model::Read(FILE* file)
TensorList params;
GetParams(params);
LOG("params count: %lu", params.Size());
LOG("params count: %lu", (unsigned long)params.Size());
int size = 0;
for (int i = 0; i < params.Size(); i++) {
size += params[i]->unitNum;
......
......@@ -24,10 +24,10 @@
#include "Encoder.h"
#include "Decoder.h"
#include "module/FNN.h"
#include "module/Output.h"
#include "submodel/FNN.h"
#include "submodel/Output.h"
#include "Utility.h"
#include "module/Attention.h"
#include "submodel/Attention.h"
namespace nmt
{
......
......@@ -28,6 +28,7 @@
#include "Utility.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XConfig.h"
using namespace nts;
using namespace std;
......@@ -91,9 +92,9 @@ Config::Config(int argc, const char** argv)
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = (strcmp(trainFN, "") == 0) ? false : true;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3F);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1F);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1F);
LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
......@@ -106,7 +107,7 @@ Config::Config(int argc, const char** argv)
LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
LoadParamFloat(argc, args, "adamdelta", &adamDelta, 1e-9F);
LoadParamBool(argc, args, "shuffled", &isShuffled, true);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1F);
LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argc, args, "updatestep", &updateStep, 1);
......@@ -124,8 +125,8 @@ Config::Config(int argc, const char** argv)
LoadParamString(argsNum, args, "output", outputFN, "");
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6F);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2F);
for (int i = 0; i < argc; i++)
delete[] args[i];
......@@ -157,90 +158,6 @@ int Config::LoadFromFile(const char* configFN, char** args) {
return argsNum;
}
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
#define MAX_WORD_NUM 120
/*
......@@ -275,7 +192,9 @@ IntList SplitInt(const string& s, const string& delimiter)
IntList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtol(s.data() + indices[i], nullptr, 10));
/* FIXME: this line is problematic. Why do we need an IntList to keep an int64? */
values.Add((int)strtol(s.data() + indices[i], nullptr, 10));
}
return values;
}
......@@ -291,4 +210,4 @@ FloatList SplitFloat(const string& s, const string& delimiter)
return values;
}
}
\ No newline at end of file
}
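For reference, these helpers turn delimiter-separated fields into typed lists; a hedged sketch of a call site (the values shown are illustrative):

    IntList ids = SplitInt("3,14,159", ",");      /* -> [3, 14, 159] */
    FloatList ws = SplitFloat("0.1,0.9", ",");    /* -> [0.1, 0.9]   */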
......@@ -33,17 +33,6 @@ using namespace nts;
namespace nmt
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
......
......@@ -226,7 +226,6 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor qheads;
XTensor vheads;
const int batchSize = q.GetDim(0);
const int lenQ = q.GetDim(1);
const int lenKV = k.GetDim(1);
......@@ -255,7 +254,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
float scaling = sqrt(d / nhead);
float scaling = (float)sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
......@@ -402,4 +401,4 @@ void Cache::Reorder(XTensor& reorder)
value = AutoGather(value, reorder);
}
}
}
\ No newline at end of file
}
......@@ -48,8 +48,6 @@ void GLU::InitModel(Config& config)
{
devID = config.devID;
float minmax = 0;
inSize = config.modelSize;
outSize = config.modelSize;
......@@ -84,4 +82,4 @@ XTensor GLU::Make(XTensor& input)
return t1 * Sigmoid(t2);
}
}
\ No newline at end of file
}
......@@ -92,10 +92,10 @@ generate the weight sum vector of all previous layer output in the history as th
XTensor LayerHistory::Pop()
{
/* the number of layer output in the history */
size_t size = history.Size();
int size = (int)history.Size();
TensorList historyList;
for (size_t i = 0; i < size; i++)
for (int i = 0; i < size; i++)
historyList.Add(history[i]);
/* we need to stack the tensors along the first dim */
......
......@@ -134,13 +134,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
UInt64List info;
size_t srcTokenNum = 0;
size_t tgtTokenNum = 0;
int realBatchSize = 1;
size_t realBatchSize = 1;
if (!isTraining)
realBatchSize = minSentBatch;
/* get the maximum source sentence length in a mini-batch */
size_t maxSrcLen = buffer[curIdx]->srcSent.Size();
size_t maxSrcLen = buffer[(int)curIdx]->srcSent.Size();
/* max batch size */
const int MAX_BATCH_SIZE = 512;
......@@ -150,9 +150,9 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
while ((realBatchSize < (buffer.Size() - curIdx))
&& (realBatchSize * maxSrcLen < batchSize)
&& (realBatchSize < MAX_BATCH_SIZE)
&& (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
&& (realBatchSize * buffer[(int)(curIdx + realBatchSize)]->srcSent.Size() < batchSize)) {
if (maxSrcLen < buffer[(int)(curIdx + realBatchSize)]->srcSent.Size())
maxSrcLen = buffer[(int)(curIdx + realBatchSize)]->srcSent.Size();
realBatchSize++;
}
}
......@@ -165,14 +165,14 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
CheckNTErrors(realBatchSize > 0, "Invalid batch size");
/* get the maximum target sentence length in a mini-batch */
size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
size_t maxTgtLen = buffer[(int)curIdx]->tgtSent.Size();
for (size_t i = 0; i < realBatchSize; i++) {
if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
if (maxTgtLen < buffer[(int)(curIdx + i)]->tgtSent.Size())
maxTgtLen = buffer[(int)(curIdx + i)]->tgtSent.Size();
}
for (size_t i = 0; i < realBatchSize; i++) {
if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
maxSrcLen = buffer[curIdx + i]->srcSent.Size();
if (maxSrcLen < buffer[(int)(curIdx + i)]->srcSent.Size())
maxSrcLen = buffer[(int)(curIdx + i)]->srcSent.Size();
}
CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");
......@@ -204,19 +204,19 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
*/
for (int i = 0; i < realBatchSize; ++i) {
srcTokenNum += buffer[curIdx + i]->srcSent.Size();
tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();
srcTokenNum += buffer[(int)(curIdx + i)]->srcSent.Size();
tgtTokenNum += buffer[(int)(curIdx + i)]->tgtSent.Size();
curSrc = maxSrcLen * i;
for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
for (int j = 0; j < buffer[(int)(curIdx + i)]->srcSent.Size(); j++) {
batchEncValues[curSrc++] = buffer[(int)(curIdx + i)]->srcSent[j];
}
curTgt = maxTgtLen * i;
for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
for (int j = 0; j < buffer[(int)(curIdx + i)]->tgtSent.Size(); j++) {
if (j > 0)
labelVaues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
labelVaues[curTgt - 1] = buffer[(int)(curIdx + i)]->tgtSent[j];
batchDecValues[curTgt++] = buffer[(int)(curIdx + i)]->tgtSent[j];
}
labelVaues[curTgt - 1] = EOS;
while (curSrc < maxSrcLen * (i + 1))
......@@ -226,11 +226,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
}
InitTensor2D(batchEnc, realBatchSize, maxSrcLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxSrcLen, X_FLOAT, devID);
InitTensor2D(batchDec, realBatchSize, maxTgtLen, X_INT, devID);
InitTensor2D(paddingDec, realBatchSize, maxTgtLen, X_FLOAT, devID);
InitTensor2D(label, realBatchSize, maxTgtLen, X_INT, devID);
int rbs = (int)realBatchSize;
int msl = (int)maxSrcLen;
int mtl = (int)maxTgtLen;
InitTensor2D(batchEnc, rbs, msl, X_INT, devID);
InitTensor2D(paddingEnc, rbs, msl, X_FLOAT, devID);
InitTensor2D(batchDec, rbs, mtl, X_INT, devID);
InitTensor2D(paddingDec, rbs, mtl, X_FLOAT, devID);
InitTensor2D(label, rbs, mtl, X_INT, devID);
curIdx += realBatchSize;
......@@ -304,14 +306,14 @@ void TrainDataSet::BuildBucket()
size_t sentNum = 1;
/* get the maximum source sentence length in a bucket */
size_t maxSrcLen = buffer[idx]->srcSent.Size();
size_t maxSrcLen = buffer[(int)idx]->srcSent.Size();
/* bucketing for sentences */
while ((sentNum < (buffer.Size() - idx))
&& (sentNum * maxSrcLen < bucketSize)
&& (sentNum * buffer[curIdx + sentNum]->srcSent.Size() < bucketSize)) {
if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
&& (sentNum * buffer[(int)(curIdx + sentNum)]->srcSent.Size() < bucketSize)) {
if (maxSrcLen < buffer[(int)(idx + sentNum)]->srcSent.Size())
maxSrcLen = buffer[(int)(idx + sentNum)]->srcSent.Size();
sentNum++;
}
......@@ -324,7 +326,7 @@ void TrainDataSet::BuildBucket()
/* shuffle items in a bucket */
for (size_t i = 0; i < sentNum; i++) {
buffer[idx + i]->bucketKey = randomKey;
buffer[(int)(idx + i)]->bucketKey = randomKey;
}
idx += sentNum;
......@@ -335,13 +337,13 @@ void TrainDataSet::BuildBucket()
idx = 0;
while (idx < buffer.Size()) {
size_t sentNum = 0;
int bucketKey = buffer[idx + sentNum]->bucketKey;
int bucketKey = buffer[(int)(idx + sentNum)]->bucketKey;
while (sentNum < (buffer.Size() - idx)
&& buffer[idx + sentNum]->bucketKey == bucketKey) {
buffer[idx + sentNum]->key = buffer[idx + sentNum]->srcSent.Size();
&& buffer[(int)(idx + sentNum)]->bucketKey == bucketKey) {
buffer[(int)(idx + sentNum)]->key = (int)buffer[(int)(idx + sentNum)]->srcSent.Size();
sentNum++;
}
SortInBucket(idx, idx + sentNum);
SortInBucket((int)idx, (int)(idx + sentNum));
idx += sentNum;
}
}
......
......@@ -98,6 +98,21 @@ public:
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* load the samples into the buffer (a list) */
bool LoadBatchToBuf(XList * buf);
/* load the samples into tensors from the buffer */
static
bool LoadBatch(XList * buf,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID,
int &wc, int &sc);
/* release the samples in a buffer */
static
void ClearSamples(XList * buf);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
......
......@@ -163,8 +163,8 @@ void Trainer::Train(const char* fn, const char* validFN,
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, wBatchSize, devID);
wc = info[0];
ws = info[1];
wc = (int)info[0];
ws = (int)info[1];
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
......@@ -206,7 +206,7 @@ void Trainer::Train(const char* fn, const char* validFN,
if (gradStep == updateStep) {
float warmupEndLR = lrate;
float warmupInitLR = 1e-7;
float warmupInitLR = 1e-7F;
float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
......@@ -320,8 +320,8 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, 0, model->devID);
wc = info[0];
ws = info[1];
wc = (int)info[0];
ws = (int)info[1];
CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
/* make the network */
......@@ -334,7 +334,7 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
}
int bSize = output.GetDim(0);
int length = output.GetDim(1);
//int length = output.GetDim(1);
labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
......@@ -428,6 +428,7 @@ void Trainer::Update(Model* model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBuf(v, v->devID);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
......@@ -437,6 +438,7 @@ void Trainer::Update(Model* model, const float lr)
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
else {
/* the delta rule */
......@@ -479,4 +481,4 @@ void Trainer::PrepareModel(Model* model)
adamBeta2T = 1.0F;
}
}
\ No newline at end of file
}
......@@ -70,10 +70,10 @@ void DataSet::LoadDataToBuffer()
size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();
for (size_t i = 0; i < maxLen; i++) {
auto offset = (i != (indices.Size() - 1)) ?
indices[i + 1] - indices[i] - tokenDelimiter.size()
: line.size() - indices[i];
string word = line.substr(indices[i], offset);
size_t offset = (i != (indices.Size() - 1)) ?
(size_t)indices[(int)i + 1] - (size_t)indices[(int)i] - tokenDelimiter.size()
: line.size() - (size_t)indices[(int)i];
string word = line.substr((size_t)indices[(int)i], offset);
if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
values.Add(UNK);
else
......@@ -110,12 +110,12 @@ load a mini-batch to the device (for translating)
<< indices of the sentences
*/
UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t minSentBatch, size_t batchSize, int devID)
int minSentBatch, int batchSize, int devID)
{
size_t realBatchSize = minSentBatch;
int realBatchSize = minSentBatch;
/* get the maximum sentence length in a mini-batch */
size_t maxLen = inputBuffer[bufferUsed]->values.Size();
int maxLen = (int)inputBuffer[(int)bufferUsed]->values.Size();
/* dynamic batching for sentences */
//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
......@@ -125,7 +125,7 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
/* real batch size */
if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
realBatchSize = inputBuffer.Size() - bufferUsed;
realBatchSize = (int)(inputBuffer.Size() - bufferUsed);
}
CheckNTErrors(maxLen != 0, "invalid length");
......@@ -144,15 +144,15 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
UInt64List infos;
size_t totalLength = 0;
for (int i = 0; i < realBatchSize; ++i) {
infos.Add(inputBuffer[bufferUsed + i]->id);
totalLength += inputBuffer[bufferUsed + i]->values.Size();
for (size_t i = 0; i < (size_t)realBatchSize; ++i) {
infos.Add(inputBuffer[(int)(bufferUsed + i)]->id);
totalLength += inputBuffer[(int)(bufferUsed + i)]->values.Size();
curSrc = maxLen * i;
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
for (size_t j = 0; j < inputBuffer[(int)(bufferUsed + i)]->values.Size(); j++)
batchValues[(int)(curSrc++)] = (int)inputBuffer[(int)(bufferUsed + i)]->values[(int)j];
while (curSrc < maxLen * (i + 1))
paddingValues[curSrc++] = 0;
paddingValues[(int)(curSrc++)] = 0;
}
infos.Add(totalLength);
......
......@@ -85,7 +85,7 @@ public:
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t sBatch, size_t wBatch, int devID);
int sBatch, int wBatch, int devID);
/* initialization function */
void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);
......
......@@ -42,7 +42,7 @@ float LengthPenalizer::GNMT(float length, float alpha)
base = (length + 5.0F) / (1.0F + 5.0F);
lp = pow(base, alpha);
lp = (float)pow(base, alpha);
return lp;
}
......
......@@ -22,7 +22,7 @@
#include <iostream>
#include "Predictor.h"
#include "../module/NNUtil.h"
#include "../submodel/NNUtil.h"
using namespace nts;
......
......@@ -322,7 +322,7 @@ void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
//float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
......@@ -493,8 +493,8 @@ void BeamSearch::Collect(StateBundle* beam)
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted &&
(state.last == NULL || !state.last->isCompleted);
//bool isCompleted = state.isCompleted &&
// (state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if ((state.isEnd || state.isCompleted)) {
......@@ -557,7 +557,6 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
}
}
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
......@@ -873,4 +872,4 @@ void GreedySearch::Search(Model* model, XTensor& input,
delete[] finishedFlags;
}
}
\ No newline at end of file
}
......@@ -155,7 +155,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
batchLoader.outputBuffer.Add(emptyRes);
}
double startDump = GetClockSec();
//double startDump = GetClockSec();
/* reorder the result */
batchLoader.SortOutput();
......@@ -163,7 +163,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
/* print the result to a file */
batchLoader.DumpRes(ofn);
double elapsed = GetClockSec() - startDump;
//double elapsed = GetClockSec() - startDump;
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
......@@ -196,4 +196,4 @@ void Translator::Dump(FILE* file, XTensor* output)
}
}
}
\ No newline at end of file
}
......@@ -34,14 +34,14 @@ void Vocab::Load(const string& src)
/* get the vocab size and the start id */
f >> vsz >> sid;
startID = stol(sid);
vocabSize = stol(vsz);
startID = (int)stol(sid);
vocabSize = (int)stol(vsz);
string word, id;
for (int i = 0; i < vocabSize - startID; i++) {
f >> word >> id;
word2id[word] = stol(id);
id2word[stol(id)] = word;
word2id[word] = (int)stol(id);
id2word[(int)stol(id)] = word;
}
f.close();
......@@ -75,4 +75,4 @@ void Vocab::CopyFrom(const Vocab& v)
id2word.insert(i2w);
}
}
\ No newline at end of file
}
......@@ -847,6 +847,7 @@ XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE
XTensor * tensor = NewTensor1D(unitNum, myDataType, myDevID, isEnableGrad);
tensor->Range(lower, upper, step);
return tensor;
}
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class keeps a batch of parameters.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
*/
#include "XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XConfig::XConfig()
{
n = 0;
args = NULL;
nReal = 0;
}
/* de-constructor */
XConfig::~XConfig()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
}
/* clear it */
void XConfig::Clear()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
n = 0;
args = NULL;
nReal = 0;
}
/*
create a config
>> myN - number of the input arguments
>> myArgs - the input arguments
*/
void XConfig::Create(const int myN, const char ** myArgs)
{
CheckNTErrors(myN > 0, "No input parameters to XConfig!");
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
args = NULL;
n = myN;
nReal = n * 2;
args = new char*[nReal];
for (int i = 0; i < nReal; i++) {
args[i] = NULL;
}
for (int i = 0; i < n; i++) {
CheckNTErrors(myArgs[i] != NULL, "Illegal parameter input!");
args[i] = new char[strlen(myArgs[i]) + 1];
strcpy(args[i], myArgs[i]);
}
}
/*
add an argument
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, const char * myValue)
{
CheckNTErrors(myArg != NULL, "No argument!");
if (n + 2 > nReal) {
nReal = MAX(n * 2 + 1, 128);
char ** newArgs = new char*[nReal];
memset(newArgs, 0, sizeof(char*) * n);
memcpy(newArgs, args, sizeof(char*) * n);
delete[] args;
args = newArgs;
}
args[n] = new char[strlen(myArg) + 2];
args[n][0] = '-';
strcpy(args[n] + 1, myArg);
n++;
if (myValue != NULL) {
args[n] = new char[strlen(myValue) + 1];
strcpy(args[n], myValue);
n++;
}
}
/*
add an argument (in integer)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, int myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%d", myValue);
Add(myArg, value);
}
/*
add an argument (in bool)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, bool myValue)
{
char value[2];
if (myValue)
value[0] = '1';
else
value[0] = '0';
value[1] = 0;
Add(myArg, value);
}
/*
add an argument (in float)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, float myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%f", myValue);
Add(myArg, value);
}
/*
load the value of an argument (in integer)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadInt(const char * name, int * p, int defaultP)
{
LoadParamInt(n, args, name, p, defaultP);
}
/*
load the value of an argument (in boolean)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadBool(const char * name, bool * p, bool defaultP)
{
LoadParamBool(n, args, name, p, defaultP);
}
/*
load the value of an argument (in float)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadFloat(const char * name, float * p, float defaultP)
{
LoadParamFloat(n, args, name, p, defaultP);
}
/*
load the value of an argument (in char string)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadString(const char * name, char * p, const char* defaultP)
{
LoadParamString(n, args, name, p, defaultP);
}
/*
get the value of an argument (in integer)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
int XConfig::GetInt(const char * name, int defaultP)
{
int r;
LoadInt(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in bool)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
bool XConfig::GetBool(const char * name, bool defaultP)
{
bool r;
LoadBool(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in float)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
float XConfig::GetFloat(const char * name, float defaultP)
{
float r;
LoadFloat(name, &r, defaultP);
return r;
}
/* get item number */
int XConfig::GetItemNum()
{
return n;
}
/*
get the item with offset i
>> i - offset
*/
char * XConfig::GetItem(int i)
{
if (i < n && i >= 0)
return args[i];
else
return NULL;
}
/*
initialize with another config model
>> myConfig - the config that we want to copy
*/
void XConfig::CreateFromMe(XConfig & myConfig)
{
Clear();
for (int i = 0; i < myConfig.GetItemNum(); i++)
Add(myConfig.GetItem(i), i);
}
/*
load the value of an argument (in integer)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in boolean)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in float)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in char string)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
/*
show the argument list
>> argc - number of arguments
>> argv - arguments
*/
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
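Taken together, XConfig stores flat "-name value" pairs and the LoadParam* helpers search them linearly. A hypothetical usage sketch (the argument names below are made up for illustration):

    #include "XConfig.h"
    using namespace nts;

    int Run(int argc, const char ** argv)
    {
        XConfig config;
        config.Create(argc - 1, argv + 1);    /* copy the raw arguments */
        config.Add("beamsize", 4);            /* stored as "-beamsize" "4" */

        int beam = config.GetInt("beamsize", 1);
        bool verbose = config.GetBool("verbose", false);
        float lr = config.GetFloat("lrate", 0.001F);
        return beam;
    }

One caveat visible in LoadParamBool(): booleans are flag-style, reported true whenever "-name" appears at all, so "-verbose 0" still reads as true.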
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class defines a parameter keeper.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
* A new semester begins today.
*/
#ifndef __XCONFIG_H__
#define __XCONFIG_H__
#include "XGlobal.h"
#include "XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_WORD_LENGTH_IN_CONFIG 256
/* the parameter keeper */
class XConfig
{
private:
/* number of arguments */
int n;
/* argument list (in char*) */
char ** args;
/* number of items we reallocate for these arguments */
int nReal;
public:
/* constructor */
XConfig();
/* de-constructor */
~XConfig();
/* clear it */
void Clear();
/* create a config */
void Create(const int myN, const char ** myArgs);
/* add an argument */
void Add(const char * myArg, const char * myValue);
/* add an argument (in integer) */
void Add(const char * myArg, int myValue);
/* add an argument (in bool) */
void Add(const char * myArg, bool myValue);
/* add an argument (in float) */
void Add(const char * myArg, float myValue);
/* load the value of an argument to a variable (in integer) */
void LoadInt(const char * name, int * p, int defaultP);
/* load the value of an argument to a variable (in boolean) */
void LoadBool(const char * name, bool * p, bool defaultP);
/* load the value of an argument to a variable (in float) */
void LoadFloat(const char * name, float * p, float defaultP);
/* load the value of an argument to a variable (in char string) */
void LoadString(const char * name, char * p, const char* defaultP);
/* get the value of an argument (in integer) */
int GetInt(const char * name, int defaultP);
/* get the value of an argument (in boolean) */
bool GetBool(const char * name, bool defaultP);
/* get the value of an argument (in float) */
float GetFloat(const char * name, float defaultP);
/* get item number */
int GetItemNum();
/* get the item with offset i */
char * GetItem(int i);
/* initialize with another config model */
void CreateFromMe(XConfig &myConfig);
};
#define MAX_PARAM_NUM 100
/* load arguments */
void extern LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void extern LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void extern LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void extern LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void extern ShowParams(int argc, char** argv);
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
......@@ -42,7 +42,6 @@ XDevManager GDevs;
/* constructor */
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
......@@ -141,8 +140,6 @@ void XDevice::Init(int myDevID)
}
else
sprintf(name2, "GPU-%d %s", devID, name);
stream = new XStream(0, devID);
#endif
}
......@@ -176,10 +173,6 @@ void XDevice::Clear()
curandDestroyGenerator(gen);
isGenReady = false;
}
if (stream != NULL) {
delete stream;
stream = NULL;
}
#endif
isInitialized = false;
}
......@@ -189,10 +182,11 @@ void XDevice::Reset()
XMem * mem = GMems.GetMem(devID);
mem->Free();
#ifdef USE_CUDA
int devIDReset = devID;
Clear();
#ifdef USE_CUDA
if (devIDReset >= 0) {
int devIDBackup = -1;
cudaGetDevice(&devIDBackup);
......@@ -202,6 +196,8 @@ void XDevice::Reset()
cudaSetDevice(devIDBackup);
}
#else
Clear();
#endif
}
......@@ -227,17 +223,6 @@ cublasHandle_t * XDevice::GetCublasHandle()
return &cublasHandle;
}
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
}
#endif // USE_CUDA
/* switch to a device */
......@@ -286,6 +271,28 @@ int XDevice::GetGPUDevice()
#endif
}
/*
switch to a device (CPU or GPU)
>> devID - device id
*/
void XDevice::SetDevice(int devID)
{
if(devID >= 0)
SetGPUDevice(devID);
}
/*
switch to a device (CPU or GPU) with a backup of the device id
>> devID - device id
>> backupDevID - backup of the device id
*/
void XDevice::SetDevice(int devID, int &backupDevID)
{
backupDevID = GetGPUDevice();
if (devID >= 0)
SetGPUDevice(devID);
}
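This pair mirrors how DequeueJobs() uses it later in this commit: remember the current device, switch for the work, then restore. A sketch (workerDevID is an illustrative name):

    int devIDBackup = -1;
    XDevice::SetDevice(workerDevID, devIDBackup);   /* save and switch */
    /* ... run the job on workerDevID ... */
    XDevice::SetDevice(devIDBackup);                /* restore */

Since SetDevice() ignores negative ids, the restore is a no-op for CPU-only runs.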
/* reset cuda flag for more efficient cuda execution. It should be called after "SetGPUDevice" when
no GPU context has been established. */
void XDevice::SetFastFlags()
......@@ -312,13 +319,6 @@ void XDevice::SetFastFlagsAllDevices()
#endif
}
/* delete the default stream for the device */
void XDevice::DelDeviceStream()
{
if(stream != NULL)
delete stream;
}
/* constructor */
XDevManager::XDevManager()
{
......@@ -391,14 +391,6 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle();
}
/* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
return GPUs[devID].GetCudaStream();
}
#endif
/*
......@@ -620,16 +612,5 @@ char * XDevManager::GetDevString(int devID)
}
}
/* delete the streams for all devices */
void XDevManager::DelDeviceStream()
{
for(int i = 0; i < GDevs.nCPU; i++) {
GDevs.CPUs[i].DelDeviceStream();
}
for(int i = 0; i < GDevs.nGPU; i++) {
GDevs.GPUs[i].DelDeviceStream();
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -25,7 +25,6 @@
#define __XDEVICE_H__
#include "XThread.h"
#include "XStream.h"
#ifdef USE_CUDA
......@@ -97,9 +96,6 @@ public:
/* specify whether Unified Virtual Address Space (UVA) is supported */
bool isUVASupported;
/* default stream for the device */
XStream * stream;
/* seed for random number generation */
int seed;
......@@ -140,12 +136,9 @@ public:
#ifdef USE_CUDA
/* get cublas handle */
cublasHandle_t * GetCublasHandle();
/* get the stream of cuda */
cudaStream_t * GetCudaStream();
#endif
/* switch to a device */
/* switch to a GPU device */
static
void SetGPUDevice(int devID);
......@@ -153,10 +146,18 @@ public:
static
void SetGPUDeviceFast(int devID);
/* switch to a get current dev */
/* get current dev */
static
int GetGPUDevice();
/* switch to a device (CPU or GPU) */
static
void SetDevice(int devID);
/* switch to a device (CPU or GPU) with a backup of the device id */
static
void SetDevice(int devID, int &backupDevID);
/* reset cuda flag for more efficient cuda execution */
static
void SetFastFlags();
......@@ -164,9 +165,6 @@ public:
/* reset cuda flag for more efficient cuda execution (all devices) */
static
void SetFastFlagsAllDevices();
/* delete the default stream for the device (call it before deleting the XDevice) */
void DelDeviceStream();
};
/*
......@@ -206,9 +204,6 @@ public:
#ifdef USE_CUDA
/* get the handle of GPU */
cublasHandle_t * GetCudaHandle(const int devID);
/* get the stream of cuda */
cudaStream_t * GetCudaStream(const int devID);
#endif
/* get grid and block sizes that max potential */
......@@ -228,10 +223,6 @@ public:
/* get the device information in string */
char * GetDevString(int devID);
/* delete the streams for all devices */
static
void DelDeviceStream();
};
/* managing the devices */
......
......@@ -132,6 +132,36 @@ extern int TRAINING_SAMPLE_BUF_SIZE;
extern int CONST_MINUSONE;
extern bool CONST_TRUE;
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
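These macros give mutexes and condition variables one portable spelling: critical sections and events on Windows, pthreads elsewhere. A small usage sketch under the same macros (the counter is illustrative, not part of this commit):

    MUTEX_HANDLE countMutex;
    int count = 0;

    void InitCounter() { MUTEX_INIT(countMutex); }
    void Bump()
    {
        MUTEX_LOCK(countMutex);
        count++;
        MUTEX_UNLOCK(countMutex);
    }
    void FreeCounter() { MUTEX_DELE(countMutex); }

Note that COND_WAIT takes a condition and a mutex, but the Windows version ignores the mutex argument, so portable callers must still pass both; COND_BROADCAST is only defined on the pthread side here.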
//#define USE_CUDA_RESURSION 1
#define NIUTRANSNNDEBUG
......
......@@ -26,8 +26,6 @@
#ifndef __XLINK_H__
#define __XLINK_H__
#include "XGlobal.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */
......
......@@ -75,6 +75,9 @@ public:
/* de-constructor */
~TensorListBase();
/* reallocate */
void Reallocate(int itemNum);
/* add an item into the list */
void Add(T&& item);
......@@ -84,6 +87,15 @@ public:
/* add an item into the list */
void Add(const T& item);
/* add an item (as an integer) into the list */
void AddInt(const int item);
/* add an item (as a float) into the list */
void AddFloat(const float item);
/* add an item (as a long long) into the list */
void AddLLong(const long long item);
/* add a number of items into the list */
void Add(const T* inputItems, int inputItemCount);
......@@ -99,12 +111,30 @@ public:
/* get the item at position i */
T& GetItem(int i) const;
/* get the item at position i and force it to an integer */
int GetItemInt(int i) const;
/* get the item at position i and force it to a float number */
float GetItemFloat(int i) const;
/* get the item at position i and force it to a long long number */
long long GetItemLLong(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* set the item (as an integer) at position i */
void SetItemInt(int i, const int item);
/* set the item (as a float) at position i */
void SetItemFloat(int i, const float item);
/* set the item (as a long long) at position i */
void SetItemLLong(int i, const long long item);
/* find the position of the first matched item */
int FindFirst(const T& item);
......@@ -135,7 +165,13 @@ public:
/* short */
T& operator[] (int i) const { return GetItem(i); };
T& Get(int i) const { return GetItem(i); };
int GetInt(int i) const { return GetItemInt(i); };
float GetFloat(int i) const { return GetItemFloat(i); };
long long GetLLong(int i) const { return GetItemLLong(i); };
void Set(int i, T item) { SetItem(i, item); };
void SetInt(int i, int item) { SetItemInt(i, item); };
void SetFloat(int i, float item) { SetItemFloat(i, item); };
void SetLLong(int i, long long item) { SetItemLLong(i, item); };
};
struct XTensor;
......
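The typed accessors above let one generic list carry scalars next to pointers, which is what the reworked XQueue/XPRunner code later in this commit relies on. A small sketch, assuming the usual XList typedef over void* items (someObject stands in for any object):

    XList args(4);
    args.Add(&someObject);   /* a pointer item                          */
    args.AddInt(128);        /* an int packed into a pointer-sized slot */
    args.AddFloat(0.5F);

    void * p = args.Get(0);
    int n = args.GetInt(1);
    float r = args.GetFloat(2);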
......@@ -54,6 +54,8 @@ XMem::XMem()
signature = 0;
mergeFreeOTF = true;
isInitialized = false;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
}
/*
......@@ -77,6 +79,8 @@ XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum,
strcpy(name, "xmem");
signature = 0;
mergeFreeOTF = true;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize);
}
......@@ -99,6 +103,8 @@ XMem::~XMem()
delete[] memIndex;
delete[] memIndex2;
delete[] minSizeIndex;
MUTEX_DELE(allocMutex);
MUTEX_DELE(bufMutex);
}
/*
......@@ -379,12 +385,18 @@ require a piece of memory
*/
void * XMem::Alloc(int myDevID, MTYPE mySize)
{
void * p = NULL;
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
return AllocStandard(myDevID, mySize);
p = AllocStandard(myDevID, mySize);
else if(isStatic)
return AllocStatic(myDevID, mySize);
p = AllocStatic(myDevID, mySize);
else
return AllocDynamic(myDevID, mySize);
p = AllocDynamic(myDevID, mySize);
MUTEX_UNLOCK(allocMutex);
return p;
}
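Note why the early returns above became assignments: with the new MUTEX_LOCK(allocMutex) at the top of Alloc(), a direct

    return AllocStandard(myDevID, mySize);   /* would exit with the lock held */

would leave allocMutex locked forever and deadlock the next caller. Funneling every branch through the single unlock-then-return keeps the lock balanced on all paths.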
/*
......@@ -521,6 +533,11 @@ void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch)
{
MTYPE backOffset = 0;
/* NOTE THAT this is tricky because we lock the buffer
but DO NOT unlock it in this function. The unlock
happens when we call ReleaseBuf() */
//MUTEX_LOCK(bufMutex);
if(pitch > 1){
MTYPE address = (MTYPE)((char*)buf + bufUsed);
int offset = address % pitch;
......@@ -560,8 +577,10 @@ release a piece of memory
*/
void XMem::Release(int myDevID, void * p, MTYPE size)
{
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
ReleaseStandard(myDevID, p, size);
MUTEX_UNLOCK(allocMutex);
}
/*
......@@ -583,6 +602,9 @@ void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch)
}
bufUsed -= (mySize + backOffset);
/* NOTE THAT this is a response to the lock in AllocBuf() */
//MUTEX_UNLOCK(bufMutex);
}
/*
......@@ -825,6 +847,18 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
return result;
}
/* lock the buffer mutex */
void XMem::LockBuf()
{
MUTEX_LOCK(bufMutex);
}
/* unlock the buffer mutex */
void XMem::UnlockBuf()
{
MUTEX_UNLOCK(bufMutex);
}
/*
find the highest set bit (or most significant set bit) in an integer-64
>> mySize - required size
......@@ -1511,12 +1545,12 @@ void XMem::ShowMemUsage(FILE * file)
}
MTYPE bufTotal = bufSize;
MTYPE bufUsed = bufUsed;
MTYPE bufUsedTotal = bufUsed;
fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal);
fprintf(file, "buffer mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsed / 1024 / 1024, (DTYPE)bufUsed / bufTotal);
(DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsedTotal / 1024 / 1024, (DTYPE)bufUsed / bufTotal);
}
......@@ -1560,7 +1594,7 @@ MTYPE XMemManager::GetAvailableMemory()
MEMORYSTATUSEX memoryStatus;
memoryStatus.dwLength = sizeof(memoryStatus);
if (GlobalMemoryStatusEx(&memoryStatus)){
freeMem = memoryStatus.ullAvailPhys;
freeMem = (unsigned long)memoryStatus.ullAvailPhys;
}
#else
long pages = sysconf(_SC_AVPHYS_PAGES);
......@@ -1604,6 +1638,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
}
}
}
else {
ShowNTErrors("No enough memory for buffer allocation!");
}
}
/* initialize it and set the global memory information */
......
......@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#define __XMEM_H__
#include "XGlobal.h"
#include <stdio.h>
#include <stdlib.h>
......@@ -249,6 +250,13 @@ public:
/* indicates whether we merge free memory pieces on the fly */
bool mergeFreeOTF;
private:
/* a mutex for memory allocation and release */
MUTEX_HANDLE allocMutex;
/* a mutex for buffer memory allocation and release */
MUTEX_HANDLE bufMutex;
public:
/* constructor */
......@@ -337,6 +345,12 @@ public:
/* allocate a piece of memory as "malloc" */
void * AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex = false);
/* lock the buffer mutex */
void LockBuf();
/* unlock the buffer mutex */
void UnlockBuf();
/* find the highest set bit (or most significant set bit) in an integer-64 */
int GetMSB(MTYPE mySize);
......
......@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round
*/
void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
{
if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n");
......@@ -195,13 +195,12 @@ void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepT
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */
volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */
XThread * thread = threads + availableThreads[i];
thread->argv = args;
thread->function = function;
thread->SetFunc(function, args);
MUTEX_LOCK(thread->workingMutex);
thread->working = 1;
......
......@@ -106,7 +106,7 @@ public:
void KillThreads();
/* run a set of jobs in parallel */
void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */
int GetJobNum(int size);
......
......@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode()
{
job = NULL;
args = new TensorList(1);
args = new XList(1);
}
/* de-constructor */
......@@ -67,12 +67,9 @@ XQueue::XQueue(int mySize)
head = 0;
tail = 0;
isJobQueue = false;
jobDequeuerArgs = new TensorList(1);
jobDequeuerArgs = new XList(1);
jobDequeuerBreak = false;
runningJobCount = 0;
jobStream = NULL;
jobStream1 = NULL;
jobStream2 = NULL;
MUTEX_INIT(enqueueMutex);
MUTEX_INIT(dequeueMutex);
......@@ -85,9 +82,6 @@ XQueue::~XQueue()
{
delete[] queue;
delete jobDequeuerArgs;
delete jobStream;
delete jobStream1;
delete jobStream2;
//if(isJobQueue)
// StopJobConsumer();
......@@ -160,19 +154,6 @@ void XQueue::WaitForEmptyJobQueue()
while(runningJobCount > 0){
XSleep(10);
}
if(jobStream != NULL){
CheckNTErrors((jobStream->IsFinished()), "Unfinished jobs remain");
jobStream->Clear();
}
if(jobStream1 != NULL){
CheckNTErrors((jobStream1->IsFinished()), "Unfinished jobs remain");
jobStream1->Clear();
}
if(jobStream2 != NULL){
CheckNTErrors((jobStream2->IsFinished()), "Unfinished jobs remain");
jobStream2->Clear();
}
}
int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
......@@ -189,12 +170,11 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true;
jobDequeuerArgs->Clear();
// warning: this may cause unknown error
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
/* warning: this may cause unknown errors */
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (devids + jobDevID) : &cpuid);
jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs;
jobDequeuer.SetFunc((TFunction)DequeueJobs, jobDequeuerArgs);
jobDequeuer.Start();
jobDequeuer.LetItGo();
......@@ -213,7 +193,7 @@ void XQueue::StopJobConsumer()
}
/* add a job item to process */
void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
void XQueue::EnqueueJob(void * job, XList * jobArgs)
{
MUTEX_LOCK(jobQueueMutex);
runningJobCount++;
......@@ -227,17 +207,16 @@ void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
}
/* job item consumer */
void XQueue::DequeueJobs(TensorList * args)
void XQueue::DequeueJobs(XList * args)
{
CheckNTErrors((args->count == 2), "Illegal arguments!");
XQueue * q = (XQueue*)args->GetItem(0);
int devID = *(int*)args->GetItem(1);
int devIDBackup = XDevice::GetGPUDevice();
int devIDBackup = -1;
if(devID >= 0)
XDevice::SetGPUDevice(devID);
XDevice::SetDevice(devID, devIDBackup);
while(1){
JobQueueNode * node = (JobQueueNode*)q->Dequeue();
......@@ -259,7 +238,7 @@ void XQueue::DequeueJobs(TensorList * args)
}
if(devID >= 0)
XDevice::SetGPUDevice(devIDBackup);
XDevice::SetDevice(devIDBackup);
}
/* get the break flag */
......@@ -268,31 +247,14 @@ bool XQueue::GetJobBreak()
return jobDequeuerBreak;
}
/* get job stream */
XStream * XQueue::GetJobStream(int n)
/* get the number of jobs */
int XQueue::GetJobNum()
{
if(n == 0)
return jobStream;
else if(n == 1)
return jobStream1;
else if(n == 2)
return jobStream2;
else{
ShowNTErrors("invalid stream id!");
}
return NULL;
}
MUTEX_LOCK(jobQueueMutex);
int c = runningJobCount;
MUTEX_UNLOCK(jobQueueMutex);
/* make job streams */
void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
{
if(devID != INVALID_DEVICE_ID)
jobStream = new XStream(0, devID);
if(devID1 != INVALID_DEVICE_ID)
jobStream1 = new XStream(0, devID1);
if(devID2 != INVALID_DEVICE_ID)
jobStream2 = new XStream(0, devID2);
return c;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -33,7 +33,6 @@
#include "XGlobal.h"
#include "XThread.h"
#include "XStream.h"
#include "XDevice.h"
#include "XList.h"
......@@ -52,7 +51,7 @@ public:
void * job;
/* arguments of the job */
TensorList * args;
XList * args;
public:
/* constructor */
......@@ -102,7 +101,7 @@ private:
XThread jobDequeuer;
/* argument list of jobDequeuer */
TensorList * jobDequeuerArgs;
XList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */
bool jobDequeuerBreak;
......@@ -110,11 +109,6 @@ private:
/* running job count */
int runningJobCount;
/* job streams (we think that three streams is enough :)) */
XStream * jobStream;
XStream * jobStream1;
XStream * jobStream2;
public:
/* constructor */
XQueue(int mySize = MAX_QUEUE_SIZE);
......@@ -135,26 +129,23 @@ public:
void WaitForEmptyJobQueue();
/* run the job consumer */
void RunJobConsumer(int jobDevID = 0);
void RunJobConsumer(int jobDevID = -1);
/* stop the job consumer */
void StopJobConsumer();
/* add a job item to process */
void EnqueueJob(void * job, TensorList * jobArgs);
void EnqueueJob(void * job, XList * jobArgs);
/* job item consumer */
static
void DequeueJobs(TensorList * args);
void DequeueJobs(XList * args);
/* get the break flag */
bool GetJobBreak();
/* get job stream */
XStream * GetJobStream(int n = 0);
/* make job streams */
void MakeJobStreams(int devID = INVALID_DEVICE_ID, int devID1 = INVALID_DEVICE_ID, int devID2 = INVALID_DEVICE_ID);
/* get the number of jobs */
int GetJobNum();
};
} /* end of the nts (NiuTrans.Tensor) namespace */
......
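A usage sketch of the reworked queue (q, MyJob, and x are illustrative; the consumer now defaults to the CPU since jobDevID = -1):

XQueue q;
q.RunJobConsumer(-1);                      /* run jobs on the CPU */

XList * jobArgs = new XList(1);
jobArgs->Add(&x);                          /* x is some argument object */
q.EnqueueJob((void*)(TFunction)MyJob, jobArgs);

while (q.GetJobNum() > 0)                  /* poll the new job counter */
    XSleep(10);
q.StopJobConsumer();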
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
 * This is for streaming (on GPUs), i.e., running jobs in different streams for
 * GPU async capabilities.
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include "stdio.h"
#include "stdlib.h"
#include "XGlobal.h"
#include "XStream.h"
#include "XDevice.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
/* constructor */
XStream::XStream(int priority, int myDevID, int myMaxEventNum)
{
devID = myDevID;
#ifdef USE_CUDA
if(myDevID >= 0){
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
events = new cudaEvent_t[myMaxEventNum];
XDevice::SetGPUDevice(backupDevID);
maxEventNum = myMaxEventNum;
usedEventNum = 0;
}
else{
maxEventNum = 0;
usedEventNum = 0;
}
#endif
Create(priority, devID);
}
/* destructor */
XStream::~XStream()
{
Destroy();
#ifdef USE_CUDA
delete[] events;
#endif
}
/* create the stream */
void XStream::Create(int priority, int myDevID)
{
if(myDevID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
//cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
CheckNTErrors((cudaStreamCreate(&stream) == cudaSuccess),
"cannot create the cuda stream!");
XDevice::SetGPUDevice(backupDevID);
#endif
devID = myDevID;
}
/* destroy the stream */
void XStream::Destroy()
{
if(devID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
cudaStreamDestroy(stream);
XDevice::SetGPUDevice(backupDevID);
Clear();
#endif
}
/* clear it */
void XStream::Clear()
{
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
for(int i = 0; i < usedEventNum; i++){
cudaEventDestroy(events[i]);
}
usedEventNum = 0;
XDevice::SetGPUDevice(backupDevID);
#endif
}
/* judge if all the jobs in the stream have been finished */
bool XStream::IsFinished()
{
#ifdef USE_CUDA
if(cudaStreamQuery(stream) == cudaSuccess)
return true;
else
return false;
#else
return true;
#endif
}
void XStream::StreamSynchronize()
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaStreamSynchronize(stream);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
void XStream::ThreadSynchronize()
{
#ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif
}
void XStream::DeviceSynchronize(int devID)
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
cudaGetDevice(&devIDBackup);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaDeviceSynchronize();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* make a dependency between two streams, i.e., the current stream must wait until the last job in another stream is finished */
void XStream::MakeDependency(XStream * precedingStream)
{
#ifdef USE_CUDA
cudaEvent_t * e = precedingStream->MakeEvent();
cudaEventRecord(*e, precedingStream->stream);
cudaStreamWaitEvent(stream, *e, 0);
#endif
}
/* get the stream */
#ifdef USE_CUDA
inline cudaStream_t * XStream::Get()
{
return &stream;
}
/* make an event */
inline cudaEvent_t * XStream::MakeEvent()
{
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
CheckNTErrors((usedEventNum < maxEventNum), "Too many events are required!");
cudaEvent_t * e = events + usedEventNum++;
cudaEventCreate(e);
XDevice::SetGPUDevice(backupDevID);
return e;
}
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
 * This is for streaming (on GPUs), i.e., running jobs in different streams for
 * GPU async capabilities.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XSTREAM_H__
#define __XSTREAM_H__
/* the CUDA stuff */
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
class XStream
{
public:
#ifdef USE_CUDA
/* the cuda stream */
cudaStream_t stream;
/* list of cuda events for synchronizing different streams */
cudaEvent_t * events;
/* max number of the events */
int maxEventNum;
/* number of used events */
int usedEventNum;
#else
/* virtual pointer */
void * stream;
#endif
/* device that holds the stream */
int devID;
public:
/* constructor */
XStream(int priority = 0, int devID = 0, int maxEventNum = MAX_CUDA_EVENT_NUM_IN_A_STREAM);
/* destructor */
~XStream();
/* create the stream */
void Create(int priority = 0, int devID = 0);
/* destroy the stream */
void Destroy();
/* clear it */
void Clear();
/* judge if all the jobs in the stream have been finished */
bool IsFinished();
/* stream synchronize */
void StreamSynchronize();
/* thread synchronize */
static
void ThreadSynchronize();
/* device synchronize */
static
void DeviceSynchronize(int devID);
/* make a dependency between two streams, i.e., the current stream must wait until the last job in another stream is finished */
void MakeDependency(XStream * precedingStream);
#ifdef USE_CUDA
/* get the stream */
cudaStream_t * Get();
/* make an event */
cudaEvent_t * MakeEvent();
#endif
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -89,10 +89,6 @@ XTensor::XTensor()
Init();
id = MakeTensorID();
isDefaultDType = true;
isInGlobalMem = false;
isInit = false;
isTmp = false;
reserved = 0;
}
......@@ -277,6 +273,7 @@ void XTensor::Init()
isTmp = false;
isGrad = false;
isVar = false;
isGradFinished = false;
enableGrad = X_ENABLE_GRAD;
visitMark = 0;
grad = NULL;
......@@ -772,10 +769,9 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
}
/*
a vector with all entries of 0
>> stream - stream for the job pipeline
a tensor with all entries of 0
*/
void XTensor::SetZeroAll(XStream* stream)
void XTensor::SetZeroAll()
{
if(data == NULL)
return;
......@@ -788,12 +784,7 @@ void XTensor::SetZeroAll(XStream* stream)
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
cudaMemset(data, 0, size);
cudaSetDevice(devIDBackup);
#endif
}
......@@ -807,13 +798,8 @@ void XTensor::SetZeroAll(XStream* stream)
#ifdef USE_CUDA
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
cudaSetDevice(devID);
cudaMemset(data, 0, unitNum * unitSize);
cudaSetDevice(devIDBackup);
#endif
}
......@@ -845,11 +831,11 @@ void XTensor::Rand(int rNum, int cNum)
}
/* generate data items with a range by start, end and the step
>> start - the begin of the array
>> end - the end of the array (not included self)
>> step - the step of two items
>> start - the beginning of the array
>> end - the end of the array (it does not includes itself)
>> step - the step we take along the array
*/
void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step)
void XTensor::Range(int lower, int upper, int step)
{
_SetDataRange(this, lower, upper, step);
}
......
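For example, under the new integer-only Range (a sketch; InitTensor1DV2 stands in for whichever initializer the caller uses):

XTensor v;
InitTensor1DV2(&v, 5, X_INT, -1);   /* a 1-D int tensor with 5 items on the CPU */
v.Range(0, 10, 2);                  /* v = {0, 2, 4, 6, 8}; 10 itself is excluded */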
......@@ -31,7 +31,6 @@
#include <math.h>
#include "XGlobal.h"
#include "XPRunner.h"
#include "XStream.h"
#include "XHeap.h"
#include "XList.h"
#include "XDataType.h"
......@@ -157,6 +156,11 @@ public:
/* mark for traversing the graph */
unsigned int visitMark;
/* indicates whether the gradient of the tensor has been computed (in the backward process)
Note that the indicator could be modified by XNet (in back propagation) and be accessed
in XTrainer (and related classes). */
bool isGradFinished;
/* gradient (for back-propagation) */
XTensor * grad;
......@@ -303,7 +307,7 @@ public:
MTYPE GetOffset3D(int d0, int d1, int d2) const;
/* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL);
void SetZeroAll();
/* set the tensor with a data array */
void SetData(const void * d, int num, int beg = 0);
......@@ -311,8 +315,8 @@ public:
/* generate data items with a uniform distribution in [0, 1] */
void Rand(int rNum, int cNum);
/* generate data items with a range by start, end and the step */
void Range(DTYPE lower, DTYPE upper, DTYPE step);
/* generate data items with a range by start, end and step */
void Range(int lower, int upper, int step);
/* generate data items with a fixed value */
template<class T>
......
......@@ -38,7 +38,7 @@ XThread::XThread()
#endif
MUTEX_INIT(gMutex);
function = NULL;
argv = NULL;
argv.Clear();
toBreak = false;
jobCount = 0;
working = 0;
......@@ -69,6 +69,18 @@ void * XThread::Wrapper(void * ptr)
return 0;
}
/*
initialize the thread with the function and its parameters
>> myFunc - the function to run
>> myArgv - arguments of the function
*/
void XThread::SetFunc(TFunction myFunc, XList * myArgv)
{
function = myFunc;
argv.Clear();
argv.AddList(myArgv);
}
/*
The running loop of this thread. It is a very naive implementation.
......@@ -77,6 +89,10 @@ After that, we wait again if there is no new job.
*/
void XThread::Run()
{
if (function == NULL) {
ShowNTErrors("You are running a thread with no function specified!");
}
#ifdef _WIN32
//COND_RESET(gCond);
#endif
......@@ -104,7 +120,7 @@ void XThread::Run()
}
/* do what you want to do*/
function(argv);
function(&argv);
#ifdef USE_PTHREAD
jobCount--;
......
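A sketch of launching a worker with the new SetFunc interface (worker, MyJob, and x are illustrative); the thread now owns a private copy of the argument list instead of a raw pointer:

XThread worker;
XList args(1);
args.Add(&x);
worker.SetFunc((TFunction)MyJob, &args);   /* copies args into worker.argv */
worker.Start();                            /* create the underlying thread */
worker.LetItGo();                          /* signal it to process the job */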
......@@ -54,38 +54,7 @@ namespace nts{
(unsigned)(flag), (unsigned *)(id))
#endif
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
typedef void (*TFunction) (volatile TensorList*);
typedef void (*TFunction) (volatile XList*);
/*
This is a class that wraps the standard implementation of threading
......@@ -128,12 +97,10 @@ public:
public:
/* function to run */
volatile
TFunction function;
/* arguments (for the function to run) */
volatile
TensorList * argv;
XList argv;
/* a flag to break */
volatile
......@@ -154,6 +121,9 @@ public:
/* a wrapper for the start-routine parameter in pthread_create */
static void * Wrapper(void * ptr);
/* initialize the thread with the function and its parameters */
void SetFunc(TFunction myFunc, XList * myArgv);
/*
Core of the thread. It is a very naive implementation.
We loop and wait for a signal to activate the job processing.
......
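Under the new typedef a job function receives a volatile XList of its arguments; a minimal shape (names illustrative):

void MyJob(volatile XList * args)
{
    XList * argv = (XList*)args;              /* cast away volatile for access */
    XTensor * x = (XTensor*)argv->GetItem(0); /* unpack the first argument */
    /* ... do the actual work on x ... */
}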
......@@ -155,13 +155,13 @@ void XMemSet(int devID, void * p, int value, size_t size)
cudaMemcpyKind GetMemcpyKind(int devIDFrom, int devIDTo)
{
if(devIDFrom < 0 && devIDTo < 0)
return cudaMemcpyHostToHost;
return cudaMemcpyKind::cudaMemcpyHostToHost;
else if(devIDFrom < 0 && devIDTo >= 0)
return cudaMemcpyHostToDevice;
return cudaMemcpyKind::cudaMemcpyHostToDevice;
else if(devIDFrom >= 0 && devIDTo < 0)
return cudaMemcpyDeviceToHost;
return cudaMemcpyKind::cudaMemcpyDeviceToHost;
else
return cudaMemcpyDeviceToDevice;
return cudaMemcpyKind::cudaMemcpyDeviceToDevice;
}
#endif
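For example, under the convention that a negative devID denotes the host:

cudaMemcpyKind kind = GetMemcpyKind(-1, 0);   /* host -> GPU 0: cudaMemcpyHostToDevice */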
......@@ -311,44 +311,6 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
#endif
}
void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
{
if (t == s)
return;
if (devIDT < 0 && devIDS < 0) {
for(int i = 0; i < n; i++)
memcpy((char*)t + tPitch * i, (char*)s + sPitch * i, mSize);
return;
}
#ifdef USE_CUDA
else{
CheckNTErrors(stream != NULL, "No stream found!");
cudaStream_t &cstream = stream->stream;
if (devIDT >= 0 && devIDS < 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
}
}
else if (devIDT < 0 && devIDS >= 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
}
else {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, cstream);
if (error != cudaSuccess) {
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
}
}
}
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
void * XMemAlloc(int devID, size_t size)
{
void * p = NULL;
......@@ -523,6 +485,9 @@ unsigned int GetNextPower2(unsigned int n)
/* sleep for a while */
void XSleep(int sleepTime)
{
if (sleepTime <= 0)
return;
#ifdef _WIN32
Sleep((DWORD)sleepTime);
#else
......@@ -591,9 +556,9 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0;
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
hi = (char*)data + (long)realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
indexhi = index != NULL ? (int*)index + (long)stride * (num - 1) : NULL;
recurse:
......@@ -603,8 +568,8 @@ recurse:
if(size <= MIN_QSORT_NUM)
XShortSort(lo, hi, indexlo, indexhi, width, stride, comp);
else {
mid = lo + (size/2) * realStride;
indexmid = indexlo + (size/2) * stride;
mid = lo + (long)(size/2) * realStride;
indexmid = indexlo + (long)(size/2) * stride;
/* sort the first, last and middle elements into order */
if(comp(lo, mid) > 0)
......@@ -872,8 +837,7 @@ int SplitALine(char* inputString, const char* seperator, StrList* items)
return 0;
if (sepLen == 0) {
char* item = new char[inputLen + 1];
char* item = new char[(long)inputLen + 1];
strcpy(item, inputString);
items->Add(item);
}
......
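The (long) casts added above are overflow guards rather than cosmetics: the products are used as offsets and can exceed 32 bits. An illustrative case (made-up numbers; the widening helps on LP64 platforms, where long is 64-bit):

int num        = 70000000;                 /* 70M items      */
int realStride = 64;                       /* bytes per item */
long ok  = (long)realStride * (num - 1);   /* ~4.48e9, computed in 64-bit */
int  bad = realStride * (num - 1);         /* wraps: signed 32-bit overflow */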
......@@ -42,7 +42,6 @@ extern void XMemSet(void * p, int value, size_t size);
extern void XMemSet(int devID, void * p, int value, size_t size);
extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
extern void * XMemAlloc(int devID, size_t size);
extern void * XMemAllocOnDev(int devID, size_t size);
extern void XMemFree(int devID, void * p);
......
......@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
where trans() returns the transposed matrix if the flag is fired
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
>> stream - the string for creating the job pipeline
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
......@@ -69,7 +68,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta);
return;
}
#endif
......
......@@ -119,11 +119,10 @@ where trans() return the transposed matrix if the flag is fired
>> c - where we put a*b
>> alpha - a coefficient
>> beta - another coefficient
>> stream - the string for creating the job pipeline
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
XTensor * c, DTYPE alpha, DTYPE beta)
{
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
......@@ -152,10 +151,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
......
......@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#endif // USE_CUDA
......
......@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......@@ -121,8 +125,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = transposedX == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
//int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
//int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedW == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2;
......@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -175,8 +181,10 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
}
\ No newline at end of file
}
......@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
if (b.order == 0){
DTYPE scale = b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, size);
}
else
XMemFree(t->devID, source);
}
......@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -147,25 +147,27 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
int * bp = (int*)b->data;
int * cp = (int*)c->data;
/* TODO: new code for beta = 1. the following code might be slow because it introduces
additional floating-point computation. */
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
cp[i] = ap[i] + (int)(bp[i] * beta);
cp[i + 1] = ap[i + 1] + (int)(bp[i + 1] * beta);
cp[i + 2] = ap[i + 2] + (int)(bp[i + 2] * beta);
cp[i + 3] = ap[i + 3] + (int)(bp[i + 3] * beta);
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i] = ap[i] + (int)(bp[i] * beta);
cp[i + 1] = ap[i + 1] + (int)(bp[i + 1] * beta);
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] + bp[i] * beta;
cp[i] = ap[i] + (int)(bp[i] * beta);
}
}
}
......
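A worked example of the integer path above: the multiply still happens in floating point, and the explicit cast truncates the result back to int.

int a = 3, b = 5;
DTYPE beta = 0.5F;
int c = a + (int)(b * beta);   /* b * beta = 2.5F, truncated to 2, so c = 5 */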
......@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -113,6 +113,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
{
int version = 0;
cudaRuntimeGetVersion(&version);
/*
matrix-matrix multiplication
For row-major matrices (as in c/c++), the trick used here is (AB)^T = B^T * A^T
......@@ -327,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
DTYPE ** cpGPU = NULL;
if (mem != NULL) {
mem->LockBuf();
mem->SetPinBuf();
apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
......@@ -353,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
delete[] bp;
delete[] cp;
if(mem != NULL)
if (mem != NULL) {
mem->BackToPinBuf();
mem->UnlockBuf();
}
else {
XMemFree(a0->devID, apGPU);
XMemFree(a0->devID, bpGPU);
......
......@@ -96,9 +96,12 @@ XTensor OnehotToIndex(const XTensor & onehot, int size)
/*
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
>> index - index of the output dimension (over the vocabulary)
>> onehot - one-hot representation of the index
>> size - vocabuary size (last dimension size of onehot)
>> labelSmoothingP - the parameter that controls how smooth the output is.
E.g., p = 0 means no smoothing
p = 1 means a uniform distribution (almost)
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float labelSmoothingP)
......
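As a concrete sketch of one common smoothing scheme (assumed here; the kernel body is not shown in this hunk), the gold index receives 1 - p and every other entry receives p / size:

/* e.g., size = 4, index = 2, labelSmoothingP = 0.1:
       onehot = {0.025, 0.025, 0.9, 0.025}
   p = 0  ->  {0, 0, 1, 0}              a hard one-hot
   p = 1  ->  {0.25, 0.25, 0, 0.25}     nearly uniform, hence "(almost)" */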
......@@ -483,7 +483,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
else if (tensor->dataType == X_FLOAT16) {
unsigned short* d = (unsigned short*)tensor->data;
for (int i = 0; i < tensor->unitNum; i++) {
d[i] = variance * ((unsigned short)rand() / RAND_MAX) + lower;
d[i] = (unsigned short)(variance * ((unsigned short)rand() / RAND_MAX) + lower);
}
}
else if(tensor->dataType == X_DOUBLE){
......@@ -538,17 +538,17 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
/* generate data items with a range by start, end and the step
>> tensor - the tensor whose data array would be initialized
>> start - the begin of the array
>> end - the end of the array (not included self)
>> step - the step of two items
>> beg - the beginning of the array
>> end - the end of the array (it does not include itself)
>> step - the step we take along the array
*/
void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
void _SetDataRange(XTensor * tensor, int beg, int end, int step)
{
CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!");
/* compute the true length according to the (start, end, step) */
DTYPE size = (DTYPE)fabs(upper - lower);
int num = ceil(size / fabs(step));
DTYPE size = (DTYPE)fabs(end - beg);
int num = (int)ceil(size / fabs(step));
CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor is not matched.");
/* init an integer array to store the sequence */
......@@ -556,12 +556,13 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
if (tensor->dataType == X_INT) {
data = new int[num];
for (int i = 0; i < num; i++)
*((int*)data + i) = lower + i * step;
*((int*)data + i) = beg + i * step;
}
else if (tensor->dataType == X_FLOAT) {
data = new float[num];
for (int i = 0; i < num; i++)
*((float*)data + i) = lower + i * step;
ShowNTErrors("TODO! Unsupported datatype!")
//data = new float[num];
//for (int i = 0; i < num; i++)
// *((float*)data + i) = beg + i * step;
}
else {
ShowNTErrors("TODO! Unsupported datatype!")
......@@ -695,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE size = num * sizeof(MTYPE);
MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
//MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
MTYPE * offsetsCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
}
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
_CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(tensor->devID, offsetsCuda);
#else
......
......@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
MTYPE * offsetsCuda = mem != NULL ?
/*MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);*/
MTYPE * offsetsCuda;
void * valuesCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
valuesCuda = mem->AllocBuf(mem->devID, valueSize);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
valuesCuda = XMemAlloc(tensor->devID, valueSize);
}
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
......@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
mem->UnlockBuf();
}
else {
XMemFree(tensor->devID, valuesCuda);
......
......@@ -57,8 +57,8 @@ void _SetDataRand(XTensor * tensor, int rNum, int cNum);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a range by start, end and the step */
void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step);
/* generate data items in a range [begin, end) with a step */
void _SetDataRange(XTensor * tensor, int beg, int end, int step);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
......
......@@ -63,9 +63,9 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
int* db = (int*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
db[i] = (int)upper;
else if (d[i] < lower)
db[i] = lower;
db[i] = (int)lower;
else
db[i] = d[i];
}
......
......@@ -86,7 +86,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
for(int i = 0; i < num; i++){
int * v = (int*)f;
int * vb = (int*)fb;
*vb = *v * scale + shift;
*vb = (int)(*v * scale + shift);
f += sizeof(int) + sizeof(int);
fb += sizeof(int) + sizeof(int);
}
......@@ -96,7 +96,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
int * va = (int*)a->data;
int * vb = (int*)b->data;
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
*vb = (int)(*va * scale + shift);
va++;
vb++;
}
......
......@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
if (devID >= 0) {
#ifdef USE_CUDA
/* copy the index from host to device */
int * targetBlocksTMP = myMem != NULL ?
/*int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
(int*)XMemAlloc(devID, blockNum * sizeof(int));*/
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
}
else {
targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
if(myMem != NULL)
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else
XMemFree(devID, targetBlocksTMP);
#else
......
......@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
#ifdef USE_CUDA
int * indexGPU = index;
if (!isIndexOnDev) {
myMem->LockBuf();
indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
}
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev)
if (!isIndexOnDev) {
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
myMem->UnlockBuf();
}
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ?
/*int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
(int *)XMemAlloc(devID, blockNum * sizeof(int));*/
int * sourceBlocksTMP;
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
}
else {
sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else {
XMemFree(devID, sourceBlocksTMP);
......
......@@ -32,9 +32,8 @@ copy s to t
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CopyValues(const XTensor * s, XTensor * t)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -55,7 +54,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
#ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0) {
_CudaCopyValues(s, t, stream);
_CudaCopyValues(s, t);
return;
}
#endif
......@@ -82,9 +81,8 @@ copy s to t
>> sLen - length of the segment
>> t - target
>> tBeg - beginning of the segment on the target side
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -108,13 +106,12 @@ void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t,
/*
copy s to t (rename _CopyValues)
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
>> s - source
>> t - target
*/
void CopyValues(const XTensor &s, XTensor &t, XStream * stream)
void CopyValues(const XTensor &s, XTensor &t)
{
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
}
/*
......@@ -122,16 +119,15 @@ copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copyed tensor t
*/
XTensor CopyValues(const XTensor &s, XStream * stream)
XTensor CopyValues(const XTensor &s)
{
XTensor t(&s);
t.SetTMPFlag();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
/* tensor connection */
if (s.enableGrad) {
......
......@@ -32,10 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
copy a range of elements from a source vector to a target vector
>> s - source matrix
>> t - target matrix
>> stream - the stream for creating the job pipeline
<< return - succeed or not
*/
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CudaCopyValues(const XTensor * s, XTensor * t)
{
CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
......@@ -45,10 +44,7 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
/* dense -> dense */
if (!s->isSparse && !t->isSparse) {
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
}
/* dense -> sparse */
else if (!s->isSparse && t->isSparse &&
......@@ -72,11 +68,8 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
int num = s->unitNumNonZero;
int size = sizeof(int) + num * (s->unitSize + sizeof(int));
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, size);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, size, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, size);
t->unitNumNonZero = num;
}
else {
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CudaCopyValues(const XTensor * s, XTensor * t);
#endif // USE_CUDA
......
......@@ -27,19 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CopyValues(const XTensor * s, XTensor * t);
/* copy a segment of s to t */
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL);
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg);
/* copy s to t (rename _CopyValues) */
void CopyValues(const XTensor &s, XTensor &t, XStream * stream = NULL);
void CopyValues(const XTensor &s, XTensor &t);
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
XTensor CopyValues(const XTensor &s);
} // namespace nts(NiuTrans.Tensor)
......
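With the stream parameter gone, a copy is now always synchronous; usage shrinks to (s is any tensor, names illustrative):

XTensor t = CopyValues(s);   /* deep copy of s, blocking until the data has moved */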
......@@ -115,7 +115,7 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
CheckNTErrors(sIndex < s->unitNum && sIndex >= 0, "Wrong index!");
for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j];
}
......
......@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
......@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
else {
ShowNTErrors("Unsupported dataType!");
}
if (srcIndex->devID < 0) {
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
}
#endif // USE_CUDA
......
......@@ -231,8 +231,8 @@ And this is a special spread function for backward computation of gather functio
*/
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
{
int dim = 0;
int order = source->order;
//int dim = 0;
//int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
......@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem;
int * si = mem != NULL ?
/*int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
int * si;
if (mem != NULL) {
mem->LockBuf();
si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
}
else {
si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
}
int * ci = si + indexSize;
XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
......@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci);
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, si);
}
......@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
(int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
......@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(devID, sIndex);
}
......
......@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
max_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
......@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf; \
if (mem != NULL) { \
mem->LockBuf(); \
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
} \
else { \
buf = (DTYPE*)XMemAlloc(devID, bufSize); \
} \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
......@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
\
} while (strideNum > 1); \
\
if (mem != NULL) \
if (mem != NULL) { \
mem->ReleaseBuf(mem->devID, bufSize); \
mem->UnlockBuf(); \
} \
else \
XMemFree(input->devID, buf); \
} \
......
......@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
//DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf;
if (mem != NULL) {
mem->LockBuf();
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
}
else {
buf = (DTYPE*)XMemAlloc(devID, bufSize);
}
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
......@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
} while (strideNum > 1);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, bufSize);
mem->UnlockBuf();
}
else
XMemFree(devID, buf);
}
......
......@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
int dims[1] = {source->unitNum};
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(all);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -72,7 +76,8 @@ sum all the items of the tensor (It should be optimized!)
void _ReduceSumAll(const XTensor * source, DTYPE * value)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source->isSparse) ? 1.0F : source->denseRatio;
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
......@@ -82,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
delete[] dimSize;
DelTensorBuf(target);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -122,4 +129,4 @@ DTYPE ReduceSumAllValue(const XTensor & source)
return target.Get0D();
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
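A usage sketch of the scalar reduction (a is an illustrative dense tensor):

DTYPE total = ReduceSumAllValue(a);   /* sum of all items, returned as a host scalar */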
......@@ -32,14 +32,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by merging it along a dimension.
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> t - the target tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
*/
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
{
......@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
if (!isOnSameDevice) {
/*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(mem->devID, size);
}
}
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
}
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -185,13 +209,13 @@ bool CheckMergeSize(const XTensor * s, const XTensor * t, int whereToMerge, int
transform a tensor by merging it along with a dimension (return an XTensor structure)
make a new tensor to keep the result and return it
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
<< return - the transformed tensor by merging along with a dimension
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
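A shape-level sketch of the corrected documentation (sizes illustrative, InitTensor3DV2 assumed as the initializer): merging folds the leading dimension of size 3 into the last dimension.

XTensor s;
InitTensor3DV2(&s, 3, 2, 4, X_FLOAT, -1);   /* s has shape (3, 2, 4)            */
XTensor t = Merge(s, 2, 0);                 /* whereToMerge = 2, leadingDim = 0 */
                                            /* t has shape (2, 12)              */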
......@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
void * dataTMP = NULL;
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
else {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(t->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(t->devID, dataTMP);
}
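A subtlety in this overload: when uniform holds, dataTMP aliases smallsItem0->data and is only borrowed, so nothing should be released or freed at all. A hedged sketch of a release path that pairs one-for-one with the allocation above (a reading of the intent, not the committed code):

#include <cstdlib>

// Sketch: release only what was actually allocated. `fromPool` plays the
// role of (mem != NULL); when uniform, dataTMP is borrowed and left alone.
void ReleasePairingSketch(bool uniform, bool fromPool, void * dataTMP)
{
    if (!uniform) {
        if (fromPool) {
            /* mem->ReleaseBuf(mem->devID, size); */
            /* mem->UnlockBuf();                  */
        }
        else {
            std::free(dataTMP);   // stands in for XMemFree(t->devID, dataTMP)
        }
    }
    /* when uniform: no release, dataTMP aliases an existing tensor */
}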
......
......@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
cudaGridSizes, cudaBlockSizes);
myMem->LockBuf();
myMem->SetPinBuf();
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
......@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
myMem->BackToPinBuf();
myMem->UnlockBuf();
delete[] sourceArrays;
delete[] targetArrays;
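Here the new lock brackets a pinned region. Judging by the call names, SetPinBuf records the current top of the buffer and BackToPinBuf rewinds to it, reclaiming every intermediate AllocBuf at once; the lock therefore has to span the whole region. A sketch under that assumption:

#include <cstddef>

// Assumed semantics of SetPinBuf/BackToPinBuf, inferred from the call
// names only: pin the current top, allocate freely, rewind in one step.
struct PinnedBuffer {
    size_t used = 0;
    size_t pin = 0;
    void SetPinBuf()    { pin = used; }   // remember the current top
    void BackToPinBuf() { used = pin; }   // drop everything since the pin
    size_t AllocBuf(size_t size) {        // returns an offset, for brevity
        size_t off = used;
        used += size;
        return off;
    }
};
// The lock must cover the whole pinned region, exactly as in the hunk:
// LockBuf -> SetPinBuf -> AllocBuf... -> BackToPinBuf -> UnlockBuf.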
......
......@@ -96,25 +96,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[t->devID].stream;
for (int k = 0; k < splitNum; k++) {
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
else {
......@@ -124,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (!isOnSameDevice) {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
}
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
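_Split mirrors _Merge here: when source and target share a device, dataTMP simply aliases t->data and never touches the pool, so the critical section shrinks to the index scratch alone. As a self-contained sketch (malloc/free stand in for AllocBuf/ReleaseBuf):

#include <cstdlib>
#include <mutex>

// Same-device case in _Split (sketch): only the block index uses the
// pool's scratch, so only it needs the lock.
static std::mutex poolMutex;

void SplitIndexScratchSketch(int blockNum)
{
    poolMutex.lock();                                             // LockBuf()
    int * blockIndex = (int*)std::malloc(blockNum * sizeof(int)); // AllocBuf
    /* ..._MakeSplitBlockIndex and _CopyBlocksOnSite run here... */
    std::free(blockIndex);                                        // ReleaseBuf
    poolMutex.unlock();                                           // UnlockBuf()
}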
......@@ -147,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -321,27 +331,12 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[big->devID].stream;
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
/* splitting with fewer kernel/API calls (unverified optimization; this path may be removed later) */
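For reference, the branch deleted in the two hunks above copied each split slice with a pitched 2D copy; the XMemCopy2D calls amount to n rows of mSize bytes with independent source and target pitches. A host-only simplification that ignores devices and streams:

#include <cstddef>
#include <cstring>

// Host-only sketch of a pitched 2D copy: n rows of mSize bytes each,
// where source and target rows are spaced by their own pitches.
void MemCopy2DHostSketch(char * tData, size_t tPitch,
                         const char * sData, size_t sPitch,
                         size_t mSize, int n)
{
    for (int i = 0; i < n; i++)
        std::memcpy(tData + i * tPitch, sData + i * sPitch, mSize);
}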
......@@ -362,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
dataTMP = first->data;
}
else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(big->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -383,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(big->devID, dataTMP);
}
......
......@@ -43,13 +43,11 @@ void _Stack(const TensorList * smalls, XTensor * t, int dim)
int blockSize = 1;
int blockNum = 1;
int gridSize = 1;
int gridNum = 1;
XTensor * smallsItem0 = smalls->GetItem(0);
int unitNum = smallsItem0->unitNum;
//int unitNum = smallsItem0->unitNum;
int unitSize = smallsItem0->unitSize;
int itemSize = unitNum * unitSize;
for (int i = 0; i < smallsItem0->order; i++) {
if (i >= dim)
......@@ -129,7 +127,7 @@ bool CheckStackShape(const TensorList &smalls, XTensor &t, int dim)
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
for (int i = 0; i < tensor->order; i++) {
for (int i = 0; i < order; i++) {
if (i < dim) {
if (t.GetDim(i) != tensor->GetDim(i))
return false;
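The fix above just caches order before the loop. For context, a hedged sketch of what the full check plausibly verifies; the diff truncates after the i < dim branch, so the handling of the remaining dimensions is an assumption:

#include <vector>

// Sketch (assumed semantics): does stacking `count` tensors of shape
// `small` along `dim` produce shape `target`? Only the i < dim branch
// is visible in the library's CheckStackShape above.
bool CheckStackShapeSketch(const std::vector<int> & small,
                           const std::vector<int> & target,
                           int dim, int count)
{
    if ((int)target.size() != (int)small.size() + 1)
        return false;
    for (int i = 0; i < (int)target.size(); i++) {
        if (i < dim  && target[i] != small[i])      return false;
        if (i == dim && target[i] != count)         return false;
        if (i > dim  && target[i] != small[i - 1])  return false;
    }
    return true;
}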
......
......@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
//void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
void * buf;
if (mem != NULL) {
mem->LockBuf();
buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
}
else {
buf = XMemAlloc(a->devID, n * m * a->unitSize);
}
void * bufIndex = NULL;
if (indexA != NULL && indexB != NULL) {
bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
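Two things are worth noting here: bufIndex reuses the lock already taken for buf (a second LockBuf would deadlock if the lock is not reentrant), and the scratch is sized n * m elements because the sort, presumably bitonic-style, wants runs padded to a power of two. A sketch of the round-up arithmetic (the real GetNextPower2 implementation is not shown in the diff, so this one is an assumption):

// Assumed implementation of the power-of-two round-up that sizes the
// sort buffer: m = GetNextPower2(strideNum), scratch = n * m units.
unsigned int GetNextPower2Sketch(unsigned int x)
{
    if (x <= 1)
        return 1;
    x--;                              // handle exact powers of two
    x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
    x |= x >> 8;  x |= x >> 16;       // smear the highest set bit down
    return x + 1;
}
// e.g., strideNum = 1000 -> m = 1024; with n = stride * blockNum runs,
// the scratch holds n * m elements of a->unitSize bytes each.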
......@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
(bufIndex, indexB->data, m, n, stride, k, blockNum);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(a->devID, n * m * a->unitSize);
mem->UnlockBuf();
}
else
XMemFree(a->devID, buf);
if (indexA != NULL && indexB != NULL)
......