Commit 4336f2f9 by xuchen

1. add dropout function
2. add some functions in the tensor/core/math/unary files
3. merge with the xiaotong-working branch
parent 98db6f24
@@ -39,7 +39,6 @@ void SumDimTest();
 using namespace nts;
 using namespace fnnlm;
 using namespace transformer;
-using namespace GAN;

 int main( int argc, const char ** argv )
 {
@@ -47,9 +46,7 @@ int main( int argc, const char ** argv )
     //BackwardTest();
     //return 0;

-    if(argc > 1 && !strcmp(argv[1], "-test"))
-        Test();
-    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
         FNNLMMain(argc - 1, argv + 1);
     else if(argc > 1 && !strcmp(argv[1], "-t2t"))
         TransformerMain(argc - 1, argv + 1);
...
@@ -451,10 +451,10 @@ void XMathGrad::GradDivDim(XTensor * node)
         DelTensorBuf(interGrad);
     }

-    DelTensorBuf(aTMP1);
-    DelTensorBuf(aTMP2);
-    DelTensorBuf(bTMP);
     DelTensorBuf(interGradTMP);
+    DelTensorBuf(bTMP);
+    DelTensorBuf(aTMP2);
+    DelTensorBuf(aTMP1);

     node->visitMark = NODE_FINISHED;
 }
@@ -499,8 +499,8 @@ void XMathGrad::GradMatrixMul(XTensor * node)
         a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
         c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));

-        deda->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
-        dedc->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
+        deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
+        dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));

         GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
@@ -760,7 +760,7 @@ void XMathGrad::GradMultiplyDim(XTensor * node)
         DelTensorBuf(interGrad);
     }

-    DelTensor(bGradTMP);
+    DelTensorBuf(bGradTMP);

     node->visitMark = NODE_FINISHED;
 }
@@ -796,6 +796,8 @@ gradient for normalize
 */
 void XMathGrad::GradNormalize(XTensor * node)
 {
+    ShowNTErrors("This is really a bad piece of code!!!");
+
     XLink &income = node->income;
     CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
@@ -902,7 +904,7 @@ void XMathGrad::GradPower(XTensor * node)
     _ScaleAndShiftMe(b, p);
     _Multiply(node->grad, b, a->grad, 1.0F);

-    DelTensor(b);
+    DelTensorBuf(b);

     node->visitMark = NODE_FINISHED;
 }
@@ -1229,7 +1231,7 @@ void XMathGrad::GradReduceSum(XTensor * node)
     _Unsqueeze(node->grad, b, dim, n);
     _Sum(a->grad, b, a->grad);

-    DelTensor(b);
+    DelTensorBuf(b);

     node->visitMark = NODE_FINISHED;
 }
@@ -1274,10 +1276,10 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
     _ScaleAndShiftMe(f, -2.0F);
     _Multiply(node->grad, f, b->grad, 1.0F);

-    DelTensorBuf(c);
-    DelTensorBuf(d);
-    DelTensorBuf(e);
     DelTensorBuf(f);
+    DelTensorBuf(e);
+    DelTensorBuf(d);
+    DelTensorBuf(c);

     node->visitMark = NODE_FINISHED;
 }
@@ -1323,10 +1325,10 @@ void XMathGrad::GradReduceVariance(XTensor * node)
     _ScaleAndShiftMe(f, -2.0F /n);
     _Multiply(node->grad, f, b->grad, 1.0F);

-    DelTensorBuf(c);
-    DelTensorBuf(d);
-    DelTensorBuf(e);
     DelTensorBuf(f);
+    DelTensorBuf(e);
+    DelTensorBuf(d);
+    DelTensorBuf(c);

     node->visitMark = NODE_FINISHED;
 }
...
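A pattern worth noting across these hunks: every temporary created with NewTensorBuf now dies via DelTensorBuf (not DelTensor), and the frees run in reverse order of the allocations. A minimal sketch of the stack discipline this implies, reusing the NewTensorBuf signature seen elsewhere in this commit (the tensors and dims below are hypothetical):

```cpp
/* sketch only: the pool buffer behaves like a stack, so pieces must be
   returned in reverse order of allocation */
int dims[2] = {16, 16};
XTensor * a = NewTensorBuf(2, dims, X_FLOAT, 1.0F, devID, mem); // pushed first
XTensor * b = NewTensorBuf(2, dims, X_FLOAT, 1.0F, devID, mem); // pushed second

/* ... use a and b ... */

DelTensorBuf(b); // popped first
DelTensorBuf(a); // popped second
```

This also explains the guard added to XNet::Backward in the next hunk: a missed or out-of-order DelTensorBuf leaves bufUsed non-zero between nodes and is caught early.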
@@ -145,14 +145,19 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
             lossGrad.Compute(gold, root, root->grad, loss);
         }
     }

     /* back-propagation from output to input */
     for(int i = nodes.count - 1; i >= 0; i--){
-        XTensor * node = (XTensor*)nodes.Get(i);;
+        XTensor * node = (XTensor*)nodes.Get(i);
+
+        if(node->mem != NULL){
+            CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
+        }

         if(node->visitMark == NODE_FINISHED)
             continue;

         BackwardNode(node);
     }
 }
...
@@ -116,15 +116,25 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
     XTensor att;
     XTensor dot;
     XTensor scalar;

     /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
     dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);

     if(isMasked)
         dot = dot + mask;

-    scalar = Softmax(Linear(dot, 1/(float)sqrt((float)dk)), -1);
-    if(ignored > 0)
-        _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
+    dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
+
+    //if(llnum == 1)
+    //    dot.Dump(tf, "dot:");
+
+    scalar = Softmax(dot, -1);
+
+    //if(llnum == 1)
+    //    scalar.Dump(tf, "scalar:");
+
+    //if(ignored > 0)
+    //    _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);

     att = BMMul(scalar, vheads);
...
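For reference, the quantity this block computes is unchanged by the refactoring; splitting the Linear scaling out of the Softmax call just exposes the intermediate dot for the (commented-out) Dump debugging hooks. In equation form, matching the comment in the code:

```latex
\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}} + M\right)V
```

where M is the additive mask holding -1e9 at positions the query must not attend to, so those logits vanish after the softmax.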
@@ -111,6 +111,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
     memcpy(dims, input.dimSize, input.order * sizeof(int));
     dims[input.order - 1] = eSize;

+    XTensor wordEmbedding;
+    XTensor posEmbedding;
+
     bool match = (posEmbedding.order == input.order);
     if(match){
         for(int i = 0; i < input.order; i++){
@@ -120,7 +123,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
     }

     /* we make positional embeddings first */
-    if(!match){
+    //if(!match){
+    if(true){
         InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);

         XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
@@ -130,8 +134,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
         DelTensorBuf(posTMP);
     }

-    XTensor wordEmbedding;
-
     /* then we make word embeddings */
     wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
...
@@ -63,9 +63,6 @@ public:
        the embedding processing by re-loading. */
     XTensor posEmbeddingBase;

-    /* positional embeddings */
-    XTensor posEmbedding;
-
 public:
     /* constructor */
     T2TEmbedder();
...
@@ -103,6 +103,10 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
         XTensor fnn;
         XTensor res;

+        llnum = -1;
+
+        /* we skip the residual connection for the first layer if
+           the encoder is used in language modeling. */
         if(skipInputRes && i == 0){
             /* self attention */
             att = attentions[i].Make(x, x, x, mask);
@@ -113,6 +117,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
             x = attLayerNorms[i].Make(att);
         }
         else{
             /* self attention */
             att = attentions[i].Make(x, x, x, mask);
@@ -123,6 +128,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
             /* layer normalization */
             x = attLayerNorms[i].Make(res);
+
+            llnum = -1;
         }

         /* fnn */
...
@@ -27,7 +27,7 @@
 namespace transformer
 {

 /* constructor */
 T2TLN::T2TLN()
 {
...
@@ -51,19 +51,24 @@ initialize the model
 void T2TModel::InitModel(int argc, const char ** argv)
 {
     bool useMem = false;
+    int memSize = 0;
+    bool isMemFreeOTF = false;

     LoadParamInt(argc, argv, "dev", &devID, -1);
     LoadParamBool(argc, argv, "mem", &useMem, useMem);
+    LoadParamInt(argc, argv, "memsize", &memSize, 1024);
     LoadParamBool(argc, argv, "lm", &isLM, true);
     LoadParamBool(argc, argv, "mt", &isMT, false);
     LoadParamInt(argc, argv, "nhead", &nhead, 8);
+    LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);

     if(useMem){
         delete mem;
-        mem = new XMem(devID);
+        mem = new XMem(devID, isMemFreeOTF ? FREE_ON_THE_FLY : UNI_FREE, (MTYPE)MILLION * 256, 1024, MILLION * 128);
+        mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
     }

-    encoder.InitModel(argc, argv, isLM, isLM ? 1 : 0, devID, mem);
+    encoder.InitModel(argc, argv, isLM, 0, devID, mem);
     outputLayer.InitModel(argc, argv, devID, mem);
 }
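The rewritten constructor call wires up the two pool modes declared in XMem.h (see the XMem hunks below). A hedged reading of the arguments, based only on how they are used here; the exact parameter meanings past the mode are assumptions, not confirmed by this diff:

```cpp
/* sketch: what the -mem/-memsize/-freeotf options now select */
mem = new XMem(devID,
               isMemFreeOTF ? FREE_ON_THE_FLY  /* pieces freed one by one */
                            : UNI_FREE,        /* everything freed at once */
               (MTYPE)MILLION * 256,           /* presumably the block size */
               1024,                           /* presumably the block count */
               MILLION * 128);                 /* presumably the buffer size */
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION); /* -memsize, in millions of bytes */
```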
@@ -83,8 +88,9 @@ XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
 make the entire network (with the output softmax layer)
 >> input - input tensor
 >> output - output tensor (distribution)
+>> padding - padding of the sequences
 */
-void T2TModel::Make(XTensor &input, XTensor &output)
+void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
 {
     XTensor encoding;
@@ -98,18 +104,118 @@ void T2TModel::Make(XTensor &input, XTensor &output)
         dims[input.order] = len;
         XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);

-        /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9 */
-        _SetDataLowTri(&mask, 1e9F, -1);
+        /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
+           this matrix can be used to prevent the attention to current or following words in
+           a given sequence. */
+        _SetDataLowTri(&mask, 1e9F, 0);
         _ScaleAndShiftMe(&mask, 1.0F, -1e9F);

+        int * dimsPadding = new int[padding.order + 2];
+        for(int i = 0; i < padding.order - 1; i++)
+            dimsPadding[i] = padding.GetDim(i);
+        dimsPadding[padding.order - 1] = padding.GetDim(-1);
+        dimsPadding[padding.order] = padding.GetDim(-1);
+
+        XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
+                                          padding.denseRatio, padding.devID, padding.mem);
+
+        for(int i = 0; i < padding2->order; i++)
+            dimsPadding[i + 1] = padding2->GetDim(i);
+        dimsPadding[0] = nhead;
+
+        XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
+                                          padding.denseRatio, padding.devID, padding.mem);
+
+        /* mask of the padding */
+        _Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
+        _Unsqueeze(padding2, padding3, 0, nhead);
+
+        _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+
+        //_Sum(&mask, padding3, &mask);
+
         encoding = MakeEncoding(input, mask, true);
         outputLayer.Make(encoding, output);

         delete[] dims;
+        delete[] dimsPadding;
+
+        DelTensorBuf(padding2);
+        DelTensorBuf(padding3);
     }
     else{
         ShowNTErrors("TODO!");
     }
 }
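The padding-mask block is easiest to follow as a shape walk-through. Assuming padding enters as [batch, len] (its layout is not shown in this hunk), the two _Unsqueeze calls build a per-head mask; note that the final _Sum that would fold it into the attention mask is still commented out in this commit:

```cpp
/* shape walk-through (padding assumed to be [batch, len]) */
// padding  : [batch, len]               1 for real tokens, 0 for pads
// padding2 : [batch, len, len]          repeat along a new last dimension
// padding3 : [nhead, batch, len, len]   repeat along a new head dimension
// _ScaleAndShiftMe(padding3, 1e9F, -1e9F) maps 1 -> 0 and 0 -> -1e9,
// i.e. padded positions would get a large negative logit before the softmax
```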

+/*
+get parameter matrics
+>> list - the list that keeps the parameter matrics
+*/
+void T2TModel::GetParams(XList &list)
+{
+    list.Clear();
+    list.Add(&outputLayer.w);
+
+    for(int i = 0; i < encoder.nlayer; i++){
+        list.Add(&encoder.fnns[i].w1);
+        list.Add(&encoder.fnns[i].b1);
+        list.Add(&encoder.fnns[i].w2);
+        list.Add(&encoder.fnns[i].b2);
+        list.Add(&encoder.attentions[i].wk);
+        list.Add(&encoder.attentions[i].wq);
+        list.Add(&encoder.attentions[i].wv);
+        list.Add(&encoder.fnnLayerNorms[i].w);
+        list.Add(&encoder.fnnLayerNorms[i].b);
+        list.Add(&encoder.attLayerNorms[i].w);
+        list.Add(&encoder.attLayerNorms[i].b);
+    }
+
+    list.Add(&encoder.embedder.w);
+}
+
+/*
+dump the parameters
+>> fn - where to keep the model
+>> model - the model
+*/
+void T2TModel::Dump(const char * fn)
+{
+    FILE * file = fopen(fn, "wb");
+    CheckNTErrors(file, "Cannot open the model file");
+
+    XList params(100);
+    GetParams(params);
+
+    for(int i = 0; i < params.count; i++){
+        XTensor * p = (XTensor*)params.Get(i);
+        p->Dump(file, "param:");
+    }
+
+    fclose(file);
+
+    XPRINT(0, stderr, "[INFO] model saved\n");
+}
+
+/* read the parameters */
+void T2TModel::Read(const char * fn)
+{
+    FILE * file = fopen(fn, "rb");
+    CheckNTErrors(file, "Cannot open the model file");
+
+    XList params(100);
+    GetParams(params);
+
+    for(int i = 0; i < params.count; i++){
+        XTensor * p = (XTensor*)params.Get(i);
+        p->Read(file, "param:");
+    }
+
+    fclose(file);
+
+    XPRINT(0, stderr, "[INFO] model loaded\n");
+}

 }
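The new serialization trio is deliberately simple: GetParams fixes a single traversal order, and Dump/Read stream the matrices through XTensor::Dump/Read in that same order, so a file is only readable by a model built with identical hyperparameters. A hypothetical round-trip ("t2t.model" is a placeholder path):

```cpp
/* hypothetical round-trip with the new interface */
T2TModel model;
model.InitModel(argc, argv);   // must match the dumping model's shape
model.Dump("t2t.model");       // writes every matrix from GetParams()
model.Read("t2t.model");       // reads them back in the same order
```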
...
@@ -72,9 +72,18 @@ public:
     XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);

     /* make the entire network (with the output softmax layer) */
-    void Make(XTensor &input, XTensor &output);
+    void Make(XTensor &input, XTensor &output, XTensor &padding);
+
+    /* get parameter matrics */
+    void GetParams(XList &list);
+
+    /* dump the parameters */
+    void Dump(const char * fn);
+
+    /* read the parameters */
+    void Read(const char * fn);
 };

 }

 #endif
\ No newline at end of file
...
@@ -37,12 +37,6 @@ namespace transformer
 class T2TTrainer
 {
 public:
-    /* device id */
-    int devID;
-
-    /* memory pool */
-    XMem * mem;
-
     /* buffer for loading words */
     int * buf;
@@ -75,6 +69,9 @@ public:
     /* learning rate */
     float lrate;

+    /* the parameter that controls the maximum learning rate in training */
+    float lrbias;
+
     /* sentence batch size */
     int sBatchSize;
@@ -88,6 +85,22 @@ public:
     /* traing step number */
     int nstep;

+    /* indicates whether we use adam */
+    bool useAdam;
+
+    /* hyper parameters of adam */
+    float adamBeta1;
+    float adamBeta2;
+    float adamDelta;
+    float adamBeta1T;
+    float adamBeta2T;
+
+    /* list of the moment of the parameter matrics */
+    XList moments;
+
+    /* list of the 2nd order moment of the parameter matrics */
+    XList moments2nd;
+
 public:
     /* constructor */
     T2TTrainer();
@@ -101,17 +114,34 @@ public:
     /* train the model */
     void Train(const char * fn, T2TModel * model);

+    /* test the model */
+    void Test(const char * fn, const char * ofn, T2TModel * model);
+
     /* load data to buffer */
     int LoadBuf(FILE * file);

+    /* clear data buffer */
+    void ClearBuf();
+
     /* load a batch of sequences */
-    int LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount);
+    int LoadBatch(FILE * file, bool isLM,
+                  XTensor * batch, XTensor * padding, XTensor * output,
+                  int * seqs,
+                  int step, int vs, int sBatch, int wBatch,
+                  bool isSorted, int &wCount,
+                  int devID, XMem * mem);

     /* get word probabilities for a batch of sequences */
     float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);

     /* update the model by delta rule */
     void Update(T2TModel * model, const float lr);

+    /* prepare model for training */
+    void PrepareModel(T2TModel * model);
+
+    /* do padding on the output */
+    void PadOutput(XTensor * output, XTensor * padding);
 };
...
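The new trainer fields are exactly the state a standard Adam optimizer needs. The Update/PrepareModel bodies are not in this diff, so the following is the textbook update these fields support, not the commit's literal code:

```cpp
/* textbook Adam step (sketch; not the literal Update() body) */
// for each parameter p with gradient g:
//   m = adamBeta1 * m + (1 - adamBeta1) * g        -> one entry of `moments`
//   v = adamBeta2 * v + (1 - adamBeta2) * g * g    -> one entry of `moments2nd`
//   adamBeta1T *= adamBeta1;  adamBeta2T *= adamBeta2;   // running beta^t
//   e = lrate * sqrt(1 - adamBeta2T) / (1 - adamBeta1T); // bias-corrected rate
//   p = p - e * m / (sqrt(v) + adamDelta);
```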
@@ -27,6 +27,8 @@ namespace transformer
 {

 FILE * tmpFILE;
+int llnum = 0;
+FILE * tf = NULL;

 void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
 {
...
@@ -38,6 +38,9 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
 /* show arguments */
 void ShowParams(int argc, const char ** argv);

+extern int llnum;
+extern FILE * tf;
+
 }

 #endif
@@ -38,20 +38,42 @@ int TransformerMain(int argc, const char ** argv)
     ShowParams(argc, argv);

     char * trainFN = new char[MAX_LINE_LENGTH];
+    char * modelFN = new char[MAX_LINE_LENGTH];
+    char * testFN = new char[MAX_LINE_LENGTH];
+    char * outputFN = new char[MAX_LINE_LENGTH];

     LoadParamString(argc, argv, "train", trainFN, "");
+    LoadParamString(argc, argv, "model", modelFN, "");
+    LoadParamString(argc, argv, "test", testFN, "");
+    LoadParamString(argc, argv, "output", outputFN, "");

+    T2TTrainer trainer;
+    trainer.Init(argc, argv);
+
     T2TModel model;
     model.InitModel(argc, argv);

-    if(strcmp(trainFN, "")){
-        T2TTrainer trainer;
-        trainer.Init(argc, argv);
+    /* learn model parameters */
+    if(strcmp(trainFN, ""))
         trainer.Train(trainFN, &model);
-    }
+
+    /* save the final model */
+    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+        model.Dump(modelFN);
+
+    /* load the model if neccessary */
+    if(strcmp(modelFN, ""))
+        model.Read(modelFN);
+
+    /* test the model on the new data */
+    if(strcmp(testFN, "") && strcmp(outputFN, ""))
+        trainer.Test(testFN, outputFN, &model);

     delete[] trainFN;
+    delete[] modelFN;
+    delete[] testFN;
+    delete[] outputFN;

     fclose(tmpFILE);
...
@@ -41,6 +41,7 @@ XDevManager GDevs;
 XDevice::XDevice()
 {
     stream = NULL;
+    isInitialized = false;
     Clear();

 #ifdef USE_CUDA
@@ -126,6 +127,7 @@ void XDevice::Init(int myDevID)
 #endif
     }
+    isInitialized = true;
 }

 /* clear it */
@@ -152,11 +154,14 @@ void XDevice::Clear()

 /* get cublas handle */
 cublasHandle_t * XDevice::GetCublasHandle()
 {
+    if (!isInitialized)
+        Init(devID);
+
     if(!isHandleReady){
         MUTEX_LOCK(cublasMutex);
         int devIDBackup = 0;
         ProtectCudaDev(devID, devIDBackup);
-        CheckNTErrors(cublasCreate(&cublasHandle) == cudaSuccess,
+        CheckNTErrors(cublasCreate(&cublasHandle) == CUBLAS_STATUS_SUCCESS,
                       "Cannot create the cublas handle.");
         isHandleReady = true;
         BacktoCudaDev(devID, devIDBackup);
@@ -169,6 +174,9 @@ cublasHandle_t * XDevice::GetCublasHandle()

 /* get the stream of cuda */
 cudaStream_t * XDevice::GetCudaStream()
 {
+    if (!isInitialized)
+        Init(devID);
+
     CheckNTErrors(stream != NULL, "the stream is not initialized!");

     return &stream->stream;
@@ -279,33 +287,13 @@ void XDevManager::Init()
         exit(1);
     }

-    cudaDeviceProp prop[64];
-
     for(int i = 0; i < GPUCount; i++){
-        GPUs[i].Init(i);
-        cudaGetDeviceProperties(&prop[i], i);
+        GPUs[i].devID = i;
+        //GPUs[i].Init(i);
     }

-#ifdef USA_CUDA_P2P
-    for(int i = 0; i < GPUCount; i++){
-        cudaSetDevice(i);
-        for(int j = 0; j < GPUCount; j++){
-            if(i == j)
-                continue;
-            int access;
-            cudaDeviceCanAccessPeer(&access, i, j);
-            bool hasUVA = (prop[i].unifiedAddressing && prop[j].unifiedAddressing);
-            fprintf(stderr, "device %d -> device %d access:%d UVA:%d\n", i, j, access, hasUVA ? 1 : 0);
-            if(access != 0){
-                CheckNTErrors((hasUVA == true), "at least one GPU does not support UVA.")
-                CheckNTErrors((cudaDeviceEnablePeerAccess(j, 0)==cudaSuccess), "cannot set cuda p2t mode!");
-            }
-        }
-    }
-#endif
 #endif

     nGPU = GPUCount;
 }
@@ -351,6 +339,9 @@ into blocks
 */
 int XDevManager::GetCudaThread(const int devID, const int n, int * gridSize, int * blockSize)
 {
+    if (!GPUs[devID].isInitialized)
+        GPUs[devID].Init(devID);
+
     memset(gridSize, 0, sizeof(int) * 3);
     memset(blockSize, 0, sizeof(int) * 3);
@@ -402,6 +393,9 @@ into blocks
 */
 int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int nLimit, int * gridSize, int * blockSize)
 {
+    if (!GPUs[devID].isInitialized)
+        GPUs[devID].Init(devID);
+
     memset(gridSize, 0, sizeof(int) * 3);
     memset(blockSize, 0, sizeof(int) * 3);
...
@@ -67,6 +67,9 @@ public:
     /* warp size of an (Navida) GPU */
     int GPUWarpSize;

+    /* indicates whether the device class has been initialized */
+    bool isInitialized;
+
     /*
     max grid size (or number of blocks) of an (Navida) GPU
     NOTE: the grid size is alone with three dimensions (x, y, z)
...
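Taken together, these hunks switch device setup from eager (every GPU initialized in XDevManager::Init) to lazy: Init now only records device ids, and each CUDA entry point starts with the same guard, presumably so a run touching one GPU no longer pays the cost of initializing all of them. Only the guard itself appears in the diff; the sketch just isolates the idiom:

```cpp
/* the guard now repeated at each CUDA entry point
   (GetCublasHandle, GetCudaStream, GetCudaThread, GetCudaThread2D) */
if (!GPUs[devID].isInitialized)   // first touch pays the init cost
    GPUs[devID].Init(devID);      // later calls fall through immediately
```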
@@ -147,6 +147,7 @@ extern bool useCUDA;
 #define XPRINT4(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4);FFLUSH(FILEH);}}
 #define XPRINT5(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5);FFLUSH(FILEH);}}
 #define XPRINT6(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6);FFLUSH(FILEH);}}
+#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}

 #define B2I(V) V==0?false:true
...
@@ -53,28 +53,14 @@ typedef long long INT_64;
 #define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
 #define MIN_BLOCK_NUM_FOR_MEMPOOL 1024

-/* memory block */
-struct XMemBlock
-{
-    /* pointer to where to start */
-    void * mem;
-
-    /* size of the block */
-    MTYPE size;
-
-    /* size of the used memory in this block */
-    MTYPE used;
-
-    /* disired size of the block */
-    MTYPE sizeDesired;
-};
-
 /*
 mode of runnig a memory pool
 - UNI_FREE: free all memory space when the memory allocation is no use
 - FREE_ON_THE_FLY: run in normal "malloc" and "free" ways
 */
 enum MEMPOOL_MODE {UNI_FREE, FREE_ON_THE_FLY};

+struct MPieceNode;
+
 /* header of a memory piece (FREE_ON_THE_FLY) */
 struct MHeader
@@ -96,6 +82,9 @@ struct MHeader
     /* id of the memory block */
     int blockID;

+    /* pointer to the index node */
+    MPieceNode * indexNode;
 };

 /* index of memory piece */
@@ -112,6 +101,31 @@ struct MPieceNode
     /* pointer to the head of a memory piece */
     void * p;

+    /* pointer to the head of memory that is returned back to the user */
+    void * pReal;
+
+    /* header of the memory piece */
+    MHeader head;
+};
+
+/* memory block */
+struct XMemBlock
+{
+    /* pointer to where to start */
+    void * mem;
+
+    /* size of the block */
+    MTYPE size;
+
+    /* size of the used memory in this block */
+    MTYPE used;
+
+    /* desired size of the block */
+    MTYPE sizeDesired;
+
+    /* first head of the block */
+    MHeader * head;
 };

 /*
@@ -138,6 +152,9 @@ public:
     /* mode of running the memory pool */
     MEMPOOL_MODE mode;

+    /* signature */
+    MTYPE signature;
+
     /* indicates whether the memory allocation is static */
     bool isStatic;
@@ -194,13 +211,16 @@ public:
 public:
     /* index of the free memory pieces */
-    MPieceNode * freeMemIndex;
+    MPieceNode * memIndex;
+
+    /* for double buffering */
+    MPieceNode * memIndex2;

     /* maximum number of index nodes */
-    INT_64 indexNodeNum;
+    INT_64 nodeNum;

     /* count of the used nodes */
-    INT_64 indexNodeNumUsed;
+    INT_64 nodeNumUsed;

     /* minimal size allocation for each index entry */
     MTYPE * minSizeIndex;
@@ -235,6 +255,9 @@ public:
     /* free a piece of memory */
     void Free(int myDevID, void * mem);

+    /* get signature */
+    MTYPE GetSignature();
+
     /* use string as the name of the memory pool */
     void SetName(const char * myName);
@@ -282,10 +305,10 @@ public:
     void * AllocBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);

     /* release a piece of memory */
-    void Release(void * p);
+    void Release(void * p, MTYPE size, MTYPE code);

     /* release a piece of memory */
-    void Release(int myDevID, void * p);
+    void Release(int myDevID, void * p, MTYPE size);

     /* release a piece of memory in the buffer */
     void ReleaseBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
@@ -302,14 +325,20 @@ public:
     /* find the index entry for allocation query */
     int FindIndexEntry(MTYPE mySize);

-    /* remove an index node */
-    void RemoveIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
+    /* remove an index node for available memory pieces */
+    void RemoveFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);

-    /* add an index node */
-    void AddIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
+    /* add an index node for available memory pieces */
+    void AddFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
+
+    /* remove an index node for memory pieces in use */
+    void RemoveAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
+
+    /* add an index node for available memory pieces */
+    void AddAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);

     /* release a piece of memory as "free" */
-    void ReleaseStandard(int myDevID, void * p);
+    void ReleaseStandard(int myDevID, void * p, MTYPE size);

     /* rebuild index to merge small fragments of memory and free the block with no use */
     void RebuildIndex();
@@ -379,6 +408,9 @@ public:

 extern XMem * GMem;

+extern int testxmemid;
+extern void * recordp;
+
 } /* end of the nts (NiuTrans.Tensor) namespace */

 #endif
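How the new signature travels can be pieced together from the XTensor hunks further down: a tensor is stamped with the pool's signature when its data is allocated, and hands the signature and size back on every Release. The check inside Release itself is not shown, so treat this as the intended contract rather than verified code:

```cpp
/* intended contract, assembled from this commit's call sites */
t.signature = (t.mem != NULL) ? t.mem->GetSignature() : 0;  // stamped in Resize()
/* ... later, when the data is destroyed ... */
t.mem->Release(t.data, t.GetDataSizeInChar(), t.signature); // pool can reject a
                                                            // stale generation
```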
@@ -29,10 +29,18 @@ const char * GetOPName(int type)
     if ((type & MATH_BASE) != 0){
         if (type == MATH_ABSOLUTE)
             return "M_ABSOLUTE";
+        else if (type == MATH_CEIL)
+            return "M_CEIL";
         else if (type == MATH_EXP)
             return "M_EXP";
+        else if (type == MATH_FLOOR)
+            return "M_FLOOR";
         else if (type == MATH_LOG)
             return "M_LOG";
+        else if (type == MATH_SQRT)
+            return "M_SQRT";
+        else if (type == MATH_SQUARE)
+            return "M_SQUARE";
         else if (type == MATH_SIN)
             return "M_SIN";
         else if (type == MATH_COS)
@@ -113,7 +121,9 @@ const char * GetOPName(int type)
             return "S_TOPK";
     }
     else if ((type & FUNCTION_BASE) != 0){
-        if (type == FUNC_HARDTANH)
+        if (type == FUNC_DROPOUT)
+            return "F_DROPOUT";
+        else if (type == FUNC_HARDTANH)
             return "F_HARDTANH";
         else if (type == FUNC_IDENTITY)
             return "F_IDENTITY";
...
@@ -32,9 +32,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_BASE 0x00001000

 #define MATH_ABSOLUTE MATH_BASE + 1
-#define MATH_EXP MATH_ABSOLUTE + 1
-#define MATH_LOG MATH_EXP + 1
-#define MATH_SIN MATH_LOG + 1
+#define MATH_CEIL MATH_ABSOLUTE + 1
+#define MATH_EXP MATH_CEIL + 1
+#define MATH_FLOOR MATH_EXP + 1
+#define MATH_LOG MATH_FLOOR + 1
+#define MATH_SQRT MATH_LOG + 1
+#define MATH_SQUARE MATH_SQRT + 1
+#define MATH_SIN MATH_SQUARE + 1
 #define MATH_COS MATH_SIN + 1
 #define MATH_TAN MATH_COS + 1
 #define MATH_ROUND MATH_TAN + 1
@@ -88,7 +92,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* activation functions */
 #define FUNCTION_BASE DATA_BASE * 2
-#define FUNC_HARDTANH FUNCTION_BASE + 1
+#define FUNC_DROPOUT FUNCTION_BASE + 1
+#define FUNC_HARDTANH FUNC_DROPOUT + 1
 #define FUNC_IDENTITY FUNC_HARDTANH + 1
 #define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1
 #define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1
...
@@ -162,6 +162,7 @@ XTensor::XTensor(const XTensor &reference)
     devID = reference.devID;
     mem = reference.mem;
     data = reference.data;
+    signature = reference.signature;

     /* what we really want to do is "reference.data = NULL;"
        As "reference" is constant, we cannot reset reference.data
@@ -221,7 +222,8 @@ XTensor::~XTensor()
 void XTensor::Init()
 {
     id = -1;
-    mem = NULL;;
+    mem = NULL;
+    signature = 0;
     data = NULL;
     dataHost = NULL;
     dataP = NULL;
@@ -254,7 +256,7 @@ void XTensor::DestroyData()
     else if(data != NULL && isInGlobalMem)
         FreeData(this, mem);
     else if(data != NULL)
-        mem->Release(data);
+        mem->Release(data, GetDataSizeInChar(), signature);

     data = NULL;

     if(dataHost != NULL)
@@ -298,6 +300,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
         newTensor->SetTMP();
         newTensor->data = data;
         newTensor->dataHost = dataHost;
+        newTensor->signature = tensor.signature;

         XLink::Replace(this, newTensor);
         XLink::ClearOutgoing(this);
@@ -1135,19 +1138,21 @@ resize a tensor with a specified tensor size
 bool XTensor::Resize(const int myOrder, const int * myDimSize,
                      const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
 {
-    order = myOrder;
-    unitNum = 1;
-    unitNumNonZero = 0;
-    isInit = true;
-
     /* free old mem */
     if(data != NULL){
         if (mem == NULL)
             XMemFree(devID, data);
         else
-            mem->Release(data);
+            mem->Release(data, GetDataSizeInChar(), signature);
     }

+    signature = mem != NULL ? mem->GetSignature() : 0;
+
+    order = myOrder;
+    unitNum = 1;
+    unitNumNonZero = 0;
+    isInit = true;
+
     bool filledData = true;
     bool zeroData = false;
     for(int i = 0; i < order; i++){
@@ -1243,56 +1248,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
 }

 /*
-resize a tensor with a specified tensor size (with no data filled)
->> myOrder - order of the tensor
->> myDimSize - the size of each dimension
->> myDataType - unit size (e.g., int, float, and double)
->> myDenseRatio - how often an element has non-zero value
-<< return - succeeded or not
-*/
-bool XTensor::ResizeWithNoData(const int myOrder, const int * myDimSize,
-                               const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
-{
-    order = myOrder;
-    unitNum = 1;
-    unitNumNonZero = 0;
-
-    /* free old mem */
-    if(data != NULL && mem == NULL)
-        delete[] (char*)data;
-
-    bool filledData = true;
-    bool zeroData = false;
-    for(int i = 0; i < order; i++){
-        dimSize[i] = abs(myDimSize[i]);
-        dimSizeRDI[order - i - 1] = dimSize[i];
-        if(myDimSize[i] < 0)
-            filledData = false;
-        if(myDimSize[i] == 0)
-            zeroData = true;
-        unitNum *= dimSize[i];
-    }
-
-    data = NULL;
-    denseRatio = myDenseRatio;
-    isSparse = denseRatio < 1.0F ? true : false;
-    dataType = myDataType;
-    unitSize = GetUnitSize(dataType);
-
-    if(myDataType != DEFAULT_DTYPE)
-        isDefaultDType = false;
-    else
-        isDefaultDType = true;
-
-    if(zeroData){
-        unitNum = 0;
-        return false;
-    }
-
-    return true;
-}
-
-/*
 resize a tensor by another one
 >> myTensor - tensor for reference
 */
@@ -1377,9 +1332,10 @@ dump data to a file
 >> file - where to domp the data
 >> label - label of the tensor
 >> n - number of items to dump
+>> beg - the first item id
 >> verbose - verbose level
 */
-void XTensor::Dump(FILE * file, const char * label, const int n, const int verbose)
+void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
     if (verbose > verboseLevel)
         return;
@@ -1437,28 +1393,26 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int verbose)
     }

     if (!isSparse) {
         if (dataType == DEFAULT_DTYPE) {
-            if (unitNum > 0) {
-                DTYPE f = *(DTYPE*)d;
-                fprintf(file, "%e", f);
-            }
-            int num = unitNum;
-            if (n > 0)
-                num = MIN(num, n);
-            for (int i = 1; i < num; i++) {
-                DTYPE * f = ((DTYPE*)d) + i;
-                fprintf(file, " %e", *f);
+            int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
+            for(int i = beg; i < end; i++){
+                DTYPE f = ((DTYPE*)d)[i];
+                if(i == beg)
+                    fprintf(file, "%e", f);
+                else
+                    fprintf(file, " %e", f);
             }
         }
         else {
-            ShowNTErrors("Cannot dump the tensor to the file in non-float values!");
+            ShowNTErrors("TODO!");
         }
     }
     else {
         int num = this->unitNumNonZero > 0 ? *(int*)d : 0;
-        if (n > 0)
-            num = MIN(num, n);
+        if (beg + n > 0)
+            num = MIN(num, beg + n);
         fprintf(file, "%d ", num);
-        for (int i = 0; i < num; i++) {
+        for (int i = beg; i < num; i++) {
             int key = GetKeyInSparse(i);
             DTYPE value = GetInSparse(i);
             fprintf(file, "[%d]%e ", key, value);
@@ -1481,13 +1435,14 @@ dump data to a file
 >> file - where to domp the data
 >> label - label of the tensor
 >> n - number of items to dump
+>> beg - the first item id
 >> verbose - verbose level
 */
-void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int verbose)
+void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
     XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
     _CopyValues(tensor, &a);
-    a.Dump(file, label, n, verbose);
+    a.Dump(file, label, n, beg, verbose);
 }

 /*
@@ -1670,6 +1625,8 @@ void XTensor::AllocateData(XTensor * tensor, XMem * myMem, bool useBuf)
             tensor->isInGlobalMem = true;
         }
     }
+
+    tensor->signature = 0;
 }

 /*
...
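The new beg parameter slots in before verbose, and the defaults (n = -1, beg = 0) keep old call sites working. A hypothetical call:

```cpp
/* hypothetical: print items 100..107 of a dense float tensor t */
t.Dump(stderr, "sample:", 8, 100);   // n = 8 items, beg = 100
```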
@@ -51,7 +51,6 @@ struct XLink;
 #define MIN_TENSOR_SPLIT_LIST_NUM 1024
 #define MIN_TENSOR_CAT_NUM 8

-
 /* computation flags */
 #define UNSAFE_BUT_FAST_MEM
 #define FAST_MATRIX
@@ -66,6 +65,9 @@ public:
     /* memory pool */
     XMem * mem;

+    /* signature of the memory pool */
+    MTYPE signature;
+
     /* data array to keep the elements */
     void * data;
@@ -327,11 +329,6 @@ public:
                 const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
                 const float myDenseRatio = 1.0F);

-    /* resize a matrix with a specified matrix size (with no data filled) */
-    bool ResizeWithNoData(const int myOrder, const int * myDimSize,
-                          const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
-                          const float myDenseRatio = 1.0F);
-
     /* resize a matrix by another one */
     bool Resize(const XTensor * myTensor);
@@ -339,11 +336,11 @@ public:
     bool BinarySearch(int key, DTYPE &value, void * &position) const;

     /* dump data to a file */
-    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);

     /* dump data to a file */
     static
-    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);

     /* read data from a file */
     void Read(FILE * file, const char * label = NULL);
...
@@ -203,7 +203,7 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
         /* tensor connections */
         XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
         XLink::AddParamToHeadInt(&c, n);
-        XLink::AddParamToHeadInt(&c, alpha);
+        XLink::AddParamToHead(&c, alpha);
     }
     else{
         ShowNTErrors("Something is wrong!");
...
@@ -204,7 +204,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
         /* tensor connections */
         XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
         XLink::AddParamToHeadInt(&c, n);
-        XLink::AddParamToHeadInt(&c, alpha);
+        XLink::AddParamToHead(&c, alpha);
     }
     else{
         ShowNTErrors("Something is wrong!");
...
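Both one-line hunks fix the same bug: alpha is a DTYPE (float), and routing it through AddParamToHeadInt converted it to int before it was stored on the graph edge, so the backward pass saw a truncated value. A minimal illustration (the XLink calls are the real ones; the surrounding lines are hypothetical):

```cpp
DTYPE alpha = 0.3F;
XLink::AddParamToHeadInt(&c, alpha); // implicit float->int: stored as 0
XLink::AddParamToHead(&c, alpha);    // stored as a float, recovered intact
```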
@@ -50,7 +50,6 @@ void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor
     CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
     CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
     CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
-    CheckNTErrors((dimRDI == a->order - 1), "Incorrect reduction dimension!");
     CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");

     int stride = 1;
...
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+ */
+
 #include <math.h>
 #include "../../XName.h"
 #include "Unary.h"
@@ -5,9 +26,18 @@
 namespace nts{

+DTYPE square(DTYPE x)
+{
+    return x * x;
+}
+
+DTYPE round(DTYPE r)
+{
+    return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
+}
+
 #ifdef USE_CUDA

-/* define three marco separately, specify the respective function names */
+/* define three marco separately, specify the respective function names (GPU mode) */
 #define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
 void _funcName(const XTensor * a, XTensor * b) \
 { \
@@ -45,14 +75,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
 _SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
 SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)

+_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
+_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
+SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
+
 _SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
 _SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
 SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)

+_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
+_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
+SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
+
 _SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
 _SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
 SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)

+_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
+_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
+SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
+
+_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
+_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
+SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
+
+_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
+_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
+SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
+
 _SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
 _SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
 SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
@@ -65,11 +116,8 @@ _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
 _SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
 SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)

-/*_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
-_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
-SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
-
 #else

-/* define three marco separately, specify the respective function names */
+/* define three marco separately, specify the respective function names (CPU mode) */
 #define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \
 void _funcName(const XTensor * a, XTensor * b) \
 { \
@@ -102,14 +150,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
 _SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
 SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)

+_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
+_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
+SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
+
 _SIMPLE_UNARY_FUNCTION(_Exp, exp)
 _SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
 SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)

+_SIMPLE_UNARY_FUNCTION(_Floor, floor)
+_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
+SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
+
 _SIMPLE_UNARY_FUNCTION(_Log, log)
 _SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
 SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)

+_SIMPLE_UNARY_FUNCTION(_Round, round)
+_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
+SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
+
+_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
+_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
+SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
+
+_SIMPLE_UNARY_FUNCTION(_Square, square)
+_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
+SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
+
 _SIMPLE_UNARY_FUNCTION(_Sin, sin)
 _SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
 SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
...
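For orientation, each registration line expands into a family of three entry points. The macro bodies are collapsed in this diff, so the expansion below for Square is an approximation of their shape, not their exact text:

```cpp
/* approximate expansion of the three macros for Square */
void _Square(const XTensor * a, XTensor * b);  // b[i] = square(a[i])
void _SquareMe(XTensor * a);                   // in place: _Square(a, a)
XTensor Square(const XTensor & a);             // returns a new tensor and
                                               // records MATH_SQUARE on the graph
```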
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+ */
+
 #include <math.h>
 #include "../../XDevice.h"
 #include "../../XName.h"
+#include "Unary.h"
 #include "Unary.cuh"

 namespace nts {

+__device__
+DTYPE CudaSquare(DTYPE x)
+{
+    return x * x;
+}
+
+__device__
+DTYPE CudaRound(DTYPE r)
+{
+    return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
+}
+
 #define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
 __global__ \
 void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
@@ -15,7 +49,7 @@ void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
         b[i] = (DTYPE)origFunc(a[i]); \
 } \
 __global__ \
 void Kernel##funcName(__half * a, __half * b, int size) \
 { \
     return; \
 } \
@@ -37,12 +71,12 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
     ProtectCudaDev(a->devID, devIDBackup); \
 \
     if (a->dataType == DEFAULT_DTYPE) { \
-        Kernel##funcName << <blocks, threads >> > \
+        Kernel##funcName<<<blocks, threads>>> \
             ((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
     } \
     else if (a->dataType == X_FLOAT16) { \
-        Kernel##funcName << <blocks, threads >> > \
+        Kernel##funcName<<<blocks, threads>>> \
             ((__half*)a->data, (__half*)b->data, a->unitNum); \
     } \
     else { \
         ShowNTErrors("TODO!"); \
@@ -52,11 +86,16 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
 } \

 SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
+SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
 SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
+SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
 SIMPLE_UNARY_FUNCTION_GPU(Log, log)
+SIMPLE_UNARY_FUNCTION_GPU(Round, CudaRound)
+SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
+SIMPLE_UNARY_FUNCTION_GPU(Square, CudaSquare)
 SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
 SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
 SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
-//SIMPLE_UNARY_FUNCTION_GPU(Round, round)

 }
\ No newline at end of file
@@ -38,6 +38,15 @@ void KernelAbsolute(__half * a, __half * b, int size);
 /* set each entry to its absolute value */
 void _CudaAbsolute(const XTensor * a, XTensor * b);

+/* set each entry to its ceil value (CUDA Kernel) */
+__global__
+void KernelCeil(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its ceil value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelCeil(__half * a, __half * b, int size);
+/* set each entry to its ceil value */
+void _CudaCeil(const XTensor * a, XTensor * b);
+
 /* set each entry to its exponent value (CUDA Kernel) */
 __global__
 void KernelExp(DTYPE * a, DTYPE * b, int size);
@@ -47,6 +56,15 @@ void KernelExp(__half * a, __half * b, int size);
 /* set each entry to its exponent value */
 void _CudaExp(const XTensor * a, XTensor * b);

+/* set each entry to its floor value (CUDA Kernel) */
+__global__
+void KernelFloor(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its floor value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelFloor(__half * a, __half * b, int size);
+/* set each entry to its floor value */
+void _CudaFloor(const XTensor * a, XTensor * b);
+
 /* set each entry to its logarithm value (CUDA Kernel) */
 __global__
 void KernelLog(DTYPE * a, DTYPE * b, int size);
@@ -56,6 +74,34 @@ void KernelLog(__half * a, __half * b, int size);
 /* set each entry to its logarithm value */
 void _CudaLog(const XTensor * a, XTensor * b);

+/* set each entry to its round value (CUDA Kernel) */
+__global__
+void KernelRound(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its round value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelRound(__half * a, __half * b, int size);
+/* set each entry to its round value */
+void _CudaRound(const XTensor * a, XTensor * b);
+
+/* set each entry to its sqrt value (CUDA Kernel) */
+__global__
+void KernelSqrt(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its sqrt value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelSqrt(__half * a, __half * b, int size);
+/* set each entry to its sqrt value */
+void _CudaSqrt(const XTensor * a, XTensor * b);
+
+/* set each entry to its square value (CUDA Kernel) */
+__global__
+void KernelSquare(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its square value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelSquare(__half * a, __half * b, int size);
+/* set each entry to its square value */
+void _CudaSquare(const XTensor * a, XTensor * b);
+
 /* set each entry to its sine value (CUDA Kernel) */
 __global__
 void KernelSin(DTYPE * a, DTYPE * b, int size);
@@ -83,15 +129,6 @@ void KernelTan(__half * a, __half * b, int size);
 /* set each entry to its tangent value */
 void _CudaTan(const XTensor * a, XTensor * b);

-/* set each entry to its round value (CUDA Kernel) */
-//__global__
-//void KernelRound(DTYPE * a, DTYPE * b, int size);
-
-/* set each entry to its round value (CUDA Kernel) with float16 data type*/
-//__global__
-//void KernelRound(__half * a, __half * b, int size);
-
-/* set each entry to its round value */
-//void _CudaRound(const XTensor * a, XTensor * b);
-
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
...
...@@ -28,95 +28,103 @@ namespace nts{ ...@@ -28,95 +28,103 @@ namespace nts{
/* set every entry to its absolute value */ /* set every entry to its absolute value */
void _Absolute(const XTensor * a, XTensor * b); void _Absolute(const XTensor * a, XTensor * b);
/* /* set every entry to its absolute value (do it on site)
set every entry to its absolute value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _AbsoluteMe(XTensor * a); void _AbsoluteMe(XTensor * a);
/* /* set every entry to its absolute value (return a XTensor structure)
set every entry to its absolute value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Absolute(const XTensor & a); XTensor Absolute(const XTensor & a);
/* set every entry to its ceil value */
void _Ceil(const XTensor * a, XTensor * b);
/* set every entry to its ceil value (do it on site)
keep the result in the input tensor a and return nothing */
void _CeilMe(XTensor * a);
/* set every entry to its ceil value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Ceil(const XTensor & a);
/* set every entry to its exponent value */ /* set every entry to its exponent value */
void _Exp(const XTensor * a, XTensor * b); void _Exp(const XTensor * a, XTensor * b);
/* /* set every entry to its exponent value (do it on site)
set every entry to its exponent value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _ExpMe(XTensor * a); void _ExpMe(XTensor * a);
/* /* set every entry to its exponent value (return a XTensor structure)
set every entry to its exponent value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Exp(const XTensor & a); XTensor Exp(const XTensor & a);
/* set every entry to its floor value */
void _Floor(const XTensor * a, XTensor * b);
/* set every entry to its floor value (do it on site)
keep the result in the input tensor a and return nothing */
void _FloorMe(XTensor * a);
/* set every entry to its floor value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* set every entry to its logarithm value */ /* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b); void _Log(const XTensor * a, XTensor * b);
/* /* set every entry to its logarithm value (do it on site)
set every entry to its logarithm value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _LogMe(XTensor * a); void _LogMe(XTensor * a);
/* /* set every entry to its logarithm value (return a XTensor structure)
set every entry to its logarithm value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Log(const XTensor & a); XTensor Log(const XTensor & a);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */
void _RoundMe(XTensor * a);
/* set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */
void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a);
/* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b);
/* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */
void _SquareMe(XTensor * a);
/* set every entry to its square value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a);
/* set every entry to its sine value */ /* set every entry to its sine value */
void _Sin(const XTensor * a, XTensor * b); void _Sin(const XTensor * a, XTensor * b);
/* /* set every entry to its sine value (do it on site)
set every entry to its sine value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _SinMe(XTensor * a); void _SinMe(XTensor * a);
/* /* set every entry to its sine value (return a XTensor structure)
set every entry to its sine value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Sin(const XTensor & a); XTensor Sin(const XTensor & a);
/* set every entry to its cosine value */ /* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b); void _Cos(const XTensor * a, XTensor * b);
/* /* set every entry to its cosine value (do it on site)
set every entry to its cosine value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _CosMe(XTensor * a); void _CosMe(XTensor * a);
/* /* set every entry to its cosine value (return a XTensor structure)
set every entry to its cosine value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Cos(const XTensor & a); XTensor Cos(const XTensor & a);
/* set every entry to its tangent value */ /* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b); void _Tan(const XTensor * a, XTensor * b);
/* /* set every entry to its tangent value (do it on site)
set every entry to its tangent value (do it on site) keep the result in the input tensor a and return nothing */
keep the result in the input tensor a and return nothing
*/
void _TanMe(XTensor * a); void _TanMe(XTensor * a);
/* /* set every entry to its tangent value (return a XTensor structure)
set every entry to its tangent value (return a XTensor structure) make a new tensor to keep the result and return it */
make a new tensor to keep the result and return it
*/
XTensor Tan(const XTensor & a); XTensor Tan(const XTensor & a);
/* set every entry to its round value */
//void _Round(const XTensor * a, XTensor * b);
/*
set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing
*/
//void _RoundMe(XTensor * a);
/*
set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
//XTensor Round(const XTensor & a);
} }
#endif //end __UNARY_H__ #endif //end __UNARY_H__
\ No newline at end of file
...@@ -77,7 +77,7 @@ void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize, ...@@ -77,7 +77,7 @@ void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize,
int targetBlockID = targetBlocks[i / blockSize]; int targetBlockID = targetBlocks[i / blockSize];
int targetOffset = i % blockSize; int targetOffset = i % blockSize;
*(target + blockSize * targetBlockID + targetOffset) = source[i]; target[blockSize * targetBlockID + targetOffset] = source[i];
} }
/* /*
...@@ -98,16 +98,6 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, ...@@ -98,16 +98,6 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target,
int devIDBackup; int devIDBackup;
ProtectCudaDev(devID, devIDBackup); ProtectCudaDev(devID, devIDBackup);
if(blockSize % sizeof(double) == 0){
int bSize = blockSize / sizeof(double);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
//GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
//KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
// ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
}
else
if(blockSize % sizeof(float) == 0){ if(blockSize % sizeof(float) == 0){
int bSize = blockSize / sizeof(float); int bSize = blockSize / sizeof(float);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks); GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
......
...@@ -405,7 +405,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long ...@@ -405,7 +405,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
if (vectorSize % 32 != 0) minWarpNum++; if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum); warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum; grid.x = (unsigned int)vectorNum;
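/* explicit cast: vectorNum is a long long while the members of dim3 are unsigned int */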
grid.y = 1; grid.y = 1;
grid.z = 1; grid.z = 1;
block.x = 1; block.x = 1;
...@@ -482,7 +482,7 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum, ...@@ -482,7 +482,7 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
if (tid < 32){ if (tid < 32){
if (tid < blockDim.y / 32) if (tid < blockDim.y / 32)
threadMax = data[tid]; threadMax = data[tid];
else threadMax = 0; else threadMax = FLOAT_MIN;
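/* FLOAT_MIN, not 0, is the identity element for max-reduction: padding idle
   lanes with 0 would corrupt the result when all inputs are negative */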
threadMax = shflDownReduceMax(threadMax); threadMax = shflDownReduceMax(threadMax);
if (tid == 0 && blockIdx.y < reducedStrideNum) if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax; output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
......
...@@ -480,8 +480,8 @@ void KernelReduceSumFast(__half * input, __half * output, ...@@ -480,8 +480,8 @@ void KernelReduceSumFast(__half * input, __half * output,
if data storage is discontinuous, use this way to reduce if data storage is discontinuous, use this way to reduce
*/ */
__global__ __global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum, void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
int strideNum, DTYPE * shift, DTYPE power, bool isExp) int blockNum, DTYPE * shift, DTYPE power, bool isExp)
{ {
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -629,7 +629,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long ...@@ -629,7 +629,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
if (vectorSize % 32 != 0) minWarpNum++; if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum); warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum; grid.x = (unsigned int)vectorNum;
grid.y = 1; grid.y = 1;
grid.z = 1; grid.z = 1;
block.x = 1; block.x = 1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "../XName.h"
#include <math.h>
#include <time.h>
#include "Dropout.h"
#include "Dropout.cuh"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/ScaleAndShift.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
generate a random Bernoulli sample: 1 with probability (1 - prob), 0 with probability prob
*/
DTYPE RandomBernoulli(DTYPE prob)
{
return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)1.0 : (DTYPE)0.0;
}
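/* the mask entry is 1 with probability (1 - prob) and 0 with probability prob,
   so E[mask * x / (1 - prob)] = x, which is why dropout rescales by 1/(1 - prob) */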
/*
dropout function
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of 1/(1-p) during training,
so that during evaluation the module simply computes an identity function.
>> x - input tensor
>> y - output tensor
>> seed - random seed used to generate the dropout mask
>> prob - probability of setting an element to zero
*/
void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
{
CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor with the given probability */
srand(seed);
int unitNum = x->unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaDropout(x, y, maskTensor, scaleFactor);
DelTensorBuf(maskTensor);
delete[] maskArray;
return;
}
#endif
XTensor * inter = NewTensorBuf(x, x->devID, x->mem);
_Multiply(x, maskTensor, inter);
_ScaleAndShift(inter, y, scaleFactor, 0);
DelTensorBuf(inter);
DelTensorBuf(maskTensor);
delete[] maskArray;
}
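/* a minimal usage sketch (mirroring the test in TDropout.cpp):

   int dims[2] = {4, 5};
   XTensor * x = NewTensor(2, dims);
   XTensor * y = NewTensor(2, dims);
   x->SetDataRand(0, 1);
   _Dropout(x, y, 20, 0.2F);   // ~20% of the entries of y are zero,
                               // the rest are the entries of x scaled by 1/0.8
*/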
/*
dropout function (return a XTensor structure)
make a new tensor to keep the result and return it
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of 1/(1-p) during training,
so that during evaluation the module simply computes an identity function.
>> x - input tensor
>> prob - probability of setting an element to zero
<< return - the result of dropout
*/
XTensor Dropout(const XTensor &x, DTYPE prob)
{
XTensor y(&x);
y.SetTMP();
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor with the given probability */
srand((unsigned int)time(NULL));
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor maskTensor(&x);
maskTensor.SetData(maskArray, unitNum);
XTensor inter;
inter = Multiply(x, maskTensor);
y = ScaleAndShift(inter, scaleFactor, 0);
delete[] maskArray;
///* tensor connection */
//XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
//XLink::AddParamToHead(&y, prob);
return y;
}
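/* note: unlike _Dropout, this version seeds the generator with the current time,
   so the mask is not reproducible across calls; use _Dropout with an explicit
   seed when the same mask must be regenerated, e.g. in the backward pass */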
/*
backward computation of dropout function
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> seed - random seed; it must match the seed used in the forward pass
>> prob - probability of setting an element to zero
*/
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
int unitNum = y->unitNum;
DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - prob);
/* regenerate the mask tensor with the same seed and probability as the forward pass */
srand(seed);
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
DelTensorBuf(maskTensor);
delete[] maskArray;
return;
}
#endif
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < unitNum; i++)
dedxp[i] = dedyp[i] * maskArray[i] * scaleFactor;
DelTensorBuf(maskTensor);
delete[] maskArray;
}
else
ShowNTErrors("TODO!");
}
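/* a worked example: with prob = 0.5 the scale factor is 1/(1-0.5) = 2, so for a
   kept element dy/dx = mask * scale = 2 and dE/dx = 2 * dE/dy, while for a
   dropped element the mask is 0 and dE/dx = 0 */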
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "Dropout.h"
#include "Dropout.cuh"
#include "Loss.cuh"
#include "../XDevice.h"
#ifdef USE_CUDA
// the CUDA stuff
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#endif
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
dropout function (Cuda kernel)
>> x - input data pointer
>> y - output data pointer
>> m - mask indicator to set zero
>> s - the scale factor
>> size - size of input/output
*/
__global__
void KernelDropoutCompute(DTYPE * x, DTYPE * y, DTYPE * m, DTYPE s, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
y[i] = x[i] * m[i] * s;
}
}
/*
dropout function (Cuda version)
>> x - input tensor
>> y - output tensor
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor
*/
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * mask, DTYPE scaleFactor)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "the dropout function does not support sparse matrices.");
CheckNTErrors(x->unitNum == y->unitNum, "the input and output tensors must be of the same size.");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelDropoutCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, (DTYPE*)mask->data, scaleFactor, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
}
/*
backward computation of dropout function (Cuda kernel)
dE/dx = dE/dy * dy/dx
>> dedy - dE/dy
>> dedx - dE/dx
>> m - mask indicator to set zero
>> s - the scale factor
>> size - size of input/output
*/
__global__
void KernelDropoutBackward(DTYPE * dedy, DTYPE * dedx,
DTYPE * m, DTYPE s, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
dedx[i] = dedy[i] * m[i] * s;
}
}
/*
backward computation of dropout function (Cuda version)
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor
*/
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
const XTensor * mask, DTYPE scaleFactor)
{
int gridSize[3], blockSize[3];
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/ds = dE/dy * dy/ds */
KernelDropoutBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data, (DTYPE*)dedx->data,
(DTYPE*)mask->data, scaleFactor, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
}
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __DROPOUT_CUH__
#define __DROPOUT_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* dropout function (Cuda version) */
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * r, DTYPE scaleFactor);
/* de/dx (Cuda version) */
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
const XTensor * mask, DTYPE scaleFactor);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __DROPOUT_H__
#define __DROPOUT_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* dropout function */
void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
/* de/dx */
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob = 0.5);
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_H__
\ No newline at end of file
...@@ -51,8 +51,7 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, ...@@ -51,8 +51,7 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!"); CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
"TODO!");
int leadDimRDI = output->order - leadDim - 1; int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI]; int dimensionSize = output->dimSizeRDI[leadDimRDI];
......
...@@ -58,8 +58,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -58,8 +58,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!"); CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
"TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!"); CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!"); CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!");
CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!"); CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");
......
...@@ -48,19 +48,19 @@ loss function to measure the "number" of errors ...@@ -48,19 +48,19 @@ loss function to measure the "number" of errors
/* compute the loss */ /* compute the loss */
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg); bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss (log version) */ /* compute the loss (log version) */
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg); int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element */ /* backward computation for a single element */
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName); DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors */ /* backward computation for (dense) vectors */
void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y, void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName, LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0); int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/ */
#include "../XName.h" #include "../XName.h"
#include <math.h> #include <math.h>
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/ */
#include "Sigmoid.h" #include "Sigmoid.h"
#include "Sigmoid.cuh" #include "Sigmoid.cuh"
......
...@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* rectify function y = max(0, x) (Cuda version) */ /* sigmoid function y = 1/(1+exp(-x)) (Cuda version) */
void _CudaSigmoid(const XTensor * input, XTensor * output); void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */ /* de/dx (Cuda version) */
......
...@@ -45,20 +45,17 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -45,20 +45,17 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
int * dimSize = new int[x->order - 1]; int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){ for(int i = 0; i < x->order; i++){
if(i < leadDim) if(i < leadDim)
dimSize[i] = -x->dimSize[i]; dimSize[i] = x->dimSize[i];
else if(i > leadDim) else if(i > leadDim)
dimSize[i - 1] = -x->dimSize[i]; dimSize[i - 1] = x->dimSize[i];
} }
XMem * mem = x->mem; XMem * mem = x->mem;
XTensor * max = NULL; XTensor * max = NULL;
XTensor * sum = NULL; XTensor * sum = NULL;
max = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem); max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem); sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
max->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, max->unitNum * max->unitSize) : XMemAlloc(max->devID, max->unitNum * max->unitSize);
sum->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, sum->unitNum * sum->unitSize) : XMemAlloc(sum->devID, sum->unitNum * sum->unitSize);
_ReduceMax(x, max, leadDim); _ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true); _ReduceSum(x, sum, leadDim, max, 1.0F, true);
...@@ -114,18 +111,9 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -114,18 +111,9 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
} }
} }
if(mem != NULL){ DelTensorBuf(sum);
mem->ReleaseBuf(mem->devID, max->unitNum * max->unitSize); DelTensorBuf(max);
mem->ReleaseBuf(mem->devID, sum->unitNum * sum->unitSize);
}
else{
XMemFree(max->devID, max->data);
XMemFree(sum->devID, sum->data);
max->data = NULL;
sum->data = NULL;
}
delete max;
delete sum;
delete[] dimSize; delete[] dimSize;
} }
else else
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "../XUtility.h"
#include "TDropout.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Dropout function.
*/
bool TestDropout1()
{
/* an input tensor of size (40, 50) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 40;
dimSize[1] = 50;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetDataRand(0, 1);
y->SetZeroAll();
/* call Dropout function */
float prob = 0.2F;
int seed = 20;
_Dropout(x, y, seed, prob);
yUser = Dropout(*x);
/* check result */
int zeroNum1 = 0;
int zeroNum2 = 0;
float * data1 = (float*)y->data;
float * data2 = (float*)yUser.data;
for (int i = 0; i < unitNum; i++){
DTYPE tmp1 = data1[i];
DTYPE tmp2 = data2[i];
if(tmp1 == 0.0F)
zeroNum1 += 1;
if(tmp2 == 0.0F)
zeroNum2 += 1;
}
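/* with prob = 0.2 and 40 * 50 = 2000 units, roughly 400 zeros are expected in y;
   yUser is produced with the default prob = 0.5, so roughly 1000 zeros */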
printf("CPU Test:\n");
printf("In tensor y, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
printf("In tensor yUser, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetDataRand(0, 1);
yGPU->SetZeroAll();
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, prob);
yUserGPU = Dropout(*xGPU);
/* check result */
zeroNum1 = 0;
zeroNum2 = 0;
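/* note: device memory cannot be dereferenced on the host; a complete check would
   copy yGPU/yUserGPU back to CPU tensors first, so the counts below still read
   the CPU results */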
data1 = (float*)y->data;
data2 = (float*)yUser.data;
for (int i = 0; i < unitNum; i++){
DTYPE tmp1 = data1[i];
DTYPE tmp2 = data2[i];
if(tmp1 == 0.0F)
zeroNum1 += 1;
if(tmp2 == 0.0F)
zeroNum2 += 1;
}
printf("CPU Test:\n");
printf("In tensor y, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
printf("In tensor yUser, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
/* destroy variables */
delete x;
delete y;
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: test Dropout function and backward computation.
*/
bool TestDropout2()
{
/* an input tensor of size (4, 5) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 4;
dimSize[1] = 5;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
/* initialize variables */
_SetDataFixedFloat(x, 1.0F);
y->SetZeroAll();
dedx->SetZeroAll();
_SetDataFixedFloat(dedy, 1.0F);
/* call Dropout function */
float prob = 0.5F;
int seed = 1;
_Dropout(x, y, seed, prob);
_DropoutBackward(y, x, dedy, dedx, 1, prob);
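/* the literal 1 passed to _DropoutBackward is the same seed used in the forward
   call, so the regenerated mask matches the one applied to y */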
/* check result */
y->Dump(stderr, "y");
dedx->Dump(stderr, "dedx");
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
_SetDataFixedFloat(xGPU, 1.0F);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
_SetDataFixedFloat(dedyGPU, 1.0F);
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, prob);
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, prob);
/* check result */
yGPU->Dump(stderr, "yGPU");
dedxGPU->Dump(stderr, "dedxGPU");
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Dropout Function */
bool TestDropout()
{
XPRINT(0, stdout, "[TEST DROPOUT] dropout function and its backward computation \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestDropout1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDropout2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __TEST_DROPOUT_H__
#define __TEST_DROPOUT_H__
#include "../function/Dropout.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Dropout Function */
bool TestDropout();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_DROPOUT_H__
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/ */
#include <math.h>
#include "../core/math/ScaleAndShift.h" #include "../core/math/ScaleAndShift.h"
#include "TLoss.h" #include "TLoss.h"
...@@ -62,7 +63,7 @@ bool TestLoss1() ...@@ -62,7 +63,7 @@ bool TestLoss1()
error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0); error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
cpuTest = (error == answer); cpuTest = (fabs(error - answer) < 1e-4);
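/* exact equality on floats is brittle; comparing within a 1e-4 tolerance
   absorbs rounding differences across platforms */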
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -82,7 +83,7 @@ bool TestLoss1() ...@@ -82,7 +83,7 @@ bool TestLoss1()
error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0); error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
gpuTest = (error == answer); gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */ /* destroy variables */
delete output; delete output;
...@@ -140,7 +141,7 @@ bool TestLoss2() ...@@ -140,7 +141,7 @@ bool TestLoss2()
error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0); error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
cpuTest = (error == answer); cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -160,7 +161,7 @@ bool TestLoss2() ...@@ -160,7 +161,7 @@ bool TestLoss2()
error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0); error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
gpuTest = (error == answer); gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */ /* destroy variables */
delete output; delete output;
...@@ -226,7 +227,7 @@ bool TestLoss3() ...@@ -226,7 +227,7 @@ bool TestLoss3()
error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0); error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
cpuTest = (error == answer); cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -244,7 +245,7 @@ bool TestLoss3() ...@@ -244,7 +245,7 @@ bool TestLoss3()
error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0); error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */ /* check results */
gpuTest = (error == answer); gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */ /* destroy variables */
delete output; delete output;
......
...@@ -33,52 +33,130 @@ bool TestXMemCase1() ...@@ -33,52 +33,130 @@ bool TestXMemCase1()
int blcokSize = 16; int blcokSize = 16;
int testNum = caseNum * 10; int testNum = caseNum * 10;
for(int i = 0, scalar = 1; i < 3; i++){ int devIDs[2];
XMem mem; int devNum = 1;
mem.Initialize(-1, FREE_ON_THE_FLY, blcokSize * sizeof(int) * scalar * scalar, 1000, 0); devIDs[0] = -1;
mem.SetIndex(10000, blcokSize * sizeof(int) / 2);
/*if (GDevs.nGPU > 0) {
srand(907); devIDs[1] = 0;
devNum = 2;
int ** p = new int*[caseNum]; devIDs[0] = 0;
int * size = new int[caseNum]; devNum = 1;
}*/
for(int i = 0; i < caseNum; i++){
p[i] = NULL; int * buf = new int[blcokSize * 10];
size[i] = rand() % (2*blcokSize);
} for (int id = 0; id < devNum; id++) {
int devID = devIDs[id];
for (int iter = 0, scalar = 1; iter < 3; iter++) {
XMem mem;
mem.Initialize(devID, FREE_ON_THE_FLY, blcokSize * sizeof(int) * scalar * scalar, 1000, 0);
mem.SetIndex(10000, blcokSize * sizeof(int) / 2);
srand(907);
int ** p = new int*[caseNum];
int * size = new int[caseNum];
for (int i = 0; i < caseNum; i++) {
p[i] = NULL;
size[i] = rand() % (2 * blcokSize);
}
for(int i = 0; i < testNum * scalar; i++){ for (int i = 0; i < testNum * scalar; i++) {
int j = rand() % caseNum; testxmemid++;
//fprintf(stderr, "%d %d\n", testxmemid, ok);
int j = rand() % caseNum;
if(p[j] == NULL){ if (p[j] == NULL) {
p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int)); p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
for(int k = 0; k < size[j]; k++) for (int k = 0; k < size[j]; k++)
p[j][k] = j; buf[k] = j;
} XMemCopy(p[j], devID, buf, -1, sizeof(int) * size[j]);
else{ }
mem.ReleaseStandard(mem.devID, p[j]); else {
for(int k = 0; k < size[j]; k++) mem.ReleaseStandard(mem.devID, p[j], size[j] * sizeof(int));
p[j][k] = -1; for (int k = 0; k < size[j]; k++)
p[j] = NULL; buf[k] = -1;
} XMemCopy(p[j], devID, buf, -1, sizeof(int) * size[j]);
p[j] = NULL;
}
for(int k = 0; k < caseNum; k++){ for (int k = 0; k < caseNum; k++) {
if(p[k] != NULL){ if (p[k] != NULL) {
for(int o = 0; o < size[k]; o++){ XMemCopy(buf, -1, p[k], devID, sizeof(int) * size[k]);
if(p[k][o] != k){ for (int o = 0; o < size[k]; o++) {
ok = false; if (buf[o] != k) {
ok = false;
}
}
}
}
/*MPieceNode * entry = NULL;
MPieceNode * node = NULL;
entry = mem.memIndex + mem.indexEntryNum + mem.FindIndexEntry(112);
int cc = 0;
node = entry->next;
while(node != NULL){
fprintf(stderr, "%d ", cc++);
if(node->size == 0){
MPieceNode * next = node->next;
node = next;
}
else{
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
node = node->next;
}
}
fprintf(stderr, "\n");*/
/*int ccc = 0;
bool hhh = recordp != NULL ? false : true;
for(int i = 0; i < mem.indexEntryNum; i++){
MPieceNode * entry = mem.memIndex + mem.indexEntryNum + i;
MPieceNode * last = entry;
MPieceNode * node = entry->next;
ccc = 0;
while(node != NULL){
CheckNTErrors(node->pre == last, "XSomething is wrong!");
CheckNTErrors(last->next == node, "XSomething is wrong!");
last = node;
ccc++;
if(node->pReal == recordp){
hhh = true;
}
if(node->size == 0){
MPieceNode * next = node->next;
node = next;
}
else{
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
node = node->next;
} }
} }
} }
if(!hhh){
int nnn = 0;
}*/
} }
}
delete[] p; delete[] p;
delete[] size; delete[] size;
scalar *= 2; scalar *= 2;
}
} }
delete[] buf;
return ok; return ok;
} }
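/* the loop above stress-tests FREE_ON_THE_FLY pooling: blocks of random size are
   allocated and released in random order, each filled with a distinct pattern
   that is copied back and verified, on CPU and (optionally) GPU */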
...@@ -113,4 +191,4 @@ bool TestXMem() ...@@ -113,4 +191,4 @@ bool TestXMem()
return returnFlag; return returnFlag;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -76,6 +76,7 @@ bool Test() ...@@ -76,6 +76,7 @@ bool Test()
wrong = !TestUnsqueeze() || wrong; wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong; wrong = !TestXMem() || wrong;
wrong = !TestDropout() || wrong;
wrong = !TestHardTanH() || wrong; wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong; wrong = !TestIdentity() || wrong;
wrong = !TestLogSoftmax() || wrong; wrong = !TestLogSoftmax() || wrong;
......
...@@ -69,6 +69,7 @@ ...@@ -69,6 +69,7 @@
#include "TUnsqueeze.h" #include "TUnsqueeze.h"
#include "TXMem.h" #include "TXMem.h"
#include "TDropout.h"
#include "THardTanH.h" #include "THardTanH.h"
#include "TIdentity.h" #include "TIdentity.h"
#include "TLogSoftmax.h" #include "TLogSoftmax.h"
......