Commit 4336f2f9 by xuchen

1. add dropout function 2. add some function in tensor/core/math/unary file 3.…

1. add dropout function 2. add some function in tensor/core/math/unary file 3. merge with xiaotong-working branch
parent 98db6f24
......@@ -39,7 +39,6 @@ void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
using namespace GAN;
int main( int argc, const char ** argv )
......@@ -47,9 +46,7 @@ int main( int argc, const char ** argv )
//return 0;
if(argc > 1 && !strcmp(argv[1], "-test"))
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
......@@ -451,10 +451,10 @@ void XMathGrad::GradDivDim(XTensor * node)
node->visitMark = NODE_FINISHED;
......@@ -499,8 +499,8 @@ void XMathGrad::GradMatrixMul(XTensor * node)
a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
deda->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
dedc->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
......@@ -760,7 +760,7 @@ void XMathGrad::GradMultiplyDim(XTensor * node)
node->visitMark = NODE_FINISHED;
......@@ -796,6 +796,8 @@ gradient for normalize
void XMathGrad::GradNormalize(XTensor * node)
ShowNTErrors("This is really a bad piece of code!!!");
XLink &income = node->income;
CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
......@@ -902,7 +904,7 @@ void XMathGrad::GradPower(XTensor * node)
_ScaleAndShiftMe(b, p);
_Multiply(node->grad, b, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
......@@ -1229,7 +1231,7 @@ void XMathGrad::GradReduceSum(XTensor * node)
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
......@@ -1274,10 +1276,10 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
node->visitMark = NODE_FINISHED;
......@@ -1323,10 +1325,10 @@ void XMathGrad::GradReduceVariance(XTensor * node)
_ScaleAndShiftMe(f, -2.0F /n);
_Multiply(node->grad, f, b->grad, 1.0F);
node->visitMark = NODE_FINISHED;
......@@ -145,14 +145,19 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
lossGrad.Compute(gold, root, root->grad, loss);
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i);;
XTensor * node = (XTensor*)nodes.Get(i);
if(node->mem != NULL){
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
if(node->visitMark == NODE_FINISHED)
......@@ -116,15 +116,25 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
dot = dot + mask;
scalar = Softmax(Linear(dot, 1/(float)sqrt((float)dk)), -1);
if(ignored > 0)
_SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
//if(llnum == 1)
// dot.Dump(tf, "dot:");
scalar = Softmax(dot, -1);
//if(llnum == 1)
// scalar.Dump(tf, "scalar:");
//if(ignored > 0)
// _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
att = BMMul(scalar, vheads);
......@@ -111,6 +111,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order - 1] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
bool match = (posEmbedding.order == input.order);
for(int i = 0; i < input.order; i++){
......@@ -120,7 +123,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* we make positional embeddings first */
InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
......@@ -130,8 +134,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
XTensor wordEmbedding;
/* then we make word embeddings */
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
......@@ -63,9 +63,6 @@ public:
the embedding processing by re-loading. */
XTensor posEmbeddingBase;
/* positional embeddings */
XTensor posEmbedding;
/* constructor */
......@@ -103,6 +103,10 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
XTensor fnn;
XTensor res;
llnum = -1;
/* we skip the residual connection for the first layer if
the encoder is used in language modeling. */
if(skipInputRes && i == 0){
/* self attention */
att = attentions[i].Make(x, x, x, mask);
......@@ -113,6 +117,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
x = attLayerNorms[i].Make(att);
/* self attention */
att = attentions[i].Make(x, x, x, mask);
......@@ -123,6 +128,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
/* layer normalization */
x = attLayerNorms[i].Make(res);
llnum = -1;
/* fnn */
......@@ -27,7 +27,7 @@
namespace transformer
/* constructor */
......@@ -51,19 +51,24 @@ initialize the model
void T2TModel::InitModel(int argc, const char ** argv)
bool useMem = false;
int memSize = 0;
bool isMemFreeOTF = false;
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamInt(argc, argv, "memsize", &memSize, 1024);
LoadParamBool(argc, argv, "lm", &isLM, true);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
delete mem;
mem = new XMem(devID);
mem = new XMem(devID, isMemFreeOTF ? FREE_ON_THE_FLY : UNI_FREE, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
encoder.InitModel(argc, argv, isLM, isLM ? 1 : 0, devID, mem);
encoder.InitModel(argc, argv, isLM, 0, devID, mem);
outputLayer.InitModel(argc, argv, devID, mem);
......@@ -83,8 +88,9 @@ XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
make the entire network (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
void T2TModel::Make(XTensor &input, XTensor &output)
void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
XTensor encoding;
......@@ -98,18 +104,118 @@ void T2TModel::Make(XTensor &input, XTensor &output)
dims[input.order] = len;
XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
/* a upper triangular matrix where the cells of the upper triangular are set to -1e-9 */
_SetDataLowTri(&mask, 1e9F, -1);
/* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
/* mask of the padding */
_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//_Sum(&mask, padding3, &mask);
encoding = MakeEncoding(input, mask, true);
outputLayer.Make(encoding, output);
delete[] dims;
delete[] dimsPadding;
get parameter matrics
>> list - the list that keeps the parameter matrics
void T2TModel::GetParams(XList &list)
for(int i = 0; i < encoder.nlayer; i++){
dump the parameters
>> fn - where to keep the model
>> model - the model
void T2TModel::Dump(const char * fn)
FILE * file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
p->Dump(file, "param:");
XPRINT(0, stderr, "[INFO] model saved\n");
/* read the parameters */
void T2TModel::Read(const char * fn)
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
p->Read(file, "param:");
XPRINT(0, stderr, "[INFO] model loaded\n");
......@@ -72,9 +72,18 @@ public:
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
/* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output);
void Make(XTensor &input, XTensor &output, XTensor &padding);
/* get parameter matrics */
void GetParams(XList &list);
/* dump the parameters */
void Dump(const char * fn);
/* read the parameters */
void Read(const char * fn);
\ No newline at end of file
......@@ -24,6 +24,7 @@
#include "T2TUtility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
namespace transformer
......@@ -31,8 +32,6 @@ namespace transformer
/* constructor */
devID = -1;
mem = NULL;
seqLen = NULL;
nseqBuf = 0;
nextSeq = -1;
......@@ -44,6 +43,16 @@ T2TTrainer::~T2TTrainer()
delete[] buf;
delete[] seqLen;
delete[] seqOffset;
for(int i = 0; i < moments.count; i++){
XTensor * m = (XTensor*)moments.Get(i);
delete m;
for(int i = 0; i < moments2nd.count; i++){
XTensor * m = (XTensor*)moments2nd.Get(i);
delete m;
......@@ -53,8 +62,11 @@ initialization
void T2TTrainer::Init(int argc, const char ** argv)
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamFloat(argc, argv, "lrate", &lrate, 0.001F);
bool useMem = false;
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamFloat(argc, argv, "lrate", &lrate, 1.0F);
LoadParamFloat(argc, argv, "lrbias", &lrbias, 0);
LoadParamInt(argc, argv, "sbatch", &sBatchSize, 1);
LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1);
LoadParamInt(argc, argv, "nepoch", &nepoch, 1);
......@@ -64,12 +76,22 @@ void T2TTrainer::Init(int argc, const char ** argv)
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
LoadParamBool(argc, argv, "adam", &useAdam, false);
LoadParamFloat(argc, argv, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, argv, "adambeta2", &adamBeta2, 0.999F);
LoadParamFloat(argc, argv, "adamdelta", &adamDelta, 1e-8F);
buf = new int[bufSize];
seqLen = new int[bufSize];
seqOffset = new int[bufSize];
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
int tc = 0;
train the model
>> fn - training data file
......@@ -86,40 +108,74 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
float loss = 0;
float lr = 0;
int devID = model->devID;
XMem * mem = model->mem;
if(mem != NULL && mem->mode == UNI_FREE)
XNet net;
tf = fopen("tmp.xx.txt", "wb");
tc = 0;
double startT = GetClockSec();
for(epoch = 0; epoch < nepoch; epoch++){
for(epoch = 1; epoch <= nepoch; epoch++){
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "cannot open training file!");
wordCount = 0;
loss = 0;
if(mem != NULL)
/* batch of input sequences */
XTensor batch;
/* padding */
XTensor padding;
/* gold standard */
XTensor gold;
while(LoadBatch(file, &batch, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc)){
while(LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)){
/* output probabilities */
XTensor output;
/* make the network */
model->Make(batch, output);
model->Make(batch, output, padding);
/* make paddings for the output */
if(output.GetDim(0) > 1)
PadOutput(&output, &padding);
/* back-propagation for obtaining gradients */
net.Backward(output, batch, CROSSENTROPY);
net.Backward(output, gold, CROSSENTROPY);
/* learning rate */
lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
//lr = 0.00005F;
lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)step + 1, -0.5F - lrbias), ((float)step + 1) * pow((float)nwarmup, -1.5F - lrbias));
/* update the parameters */
Update(model, lr);
/* get probabilities */
float prob = GetProb(&output, &batch, NULL);
float prob = GetProb(&output, &gold, NULL);
MTYPE totalUsed = 0;
MTYPE totalSize = 0;
for (int i = 0; i <= mem->curBlockID; i++) {
totalSize += mem->blocks[i].size;
totalUsed += mem->blocks[i].used;
//fprintf(stderr, "%d(%ld,%ld,%f)\n", mem->curBlockID, totalUsed, totalSize, (float)totalUsed/totalSize);
loss += -prob;
wordCount += wc;
......@@ -132,9 +188,12 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f, sppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount), exp(-prob/wc));
if(mem != NULL && mem->mode == UNI_FREE)
......@@ -142,8 +201,13 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (isEnd)
if(mem != NULL && mem->mode == UNI_FREE)
double elapsed = GetClockSec() - startT;
XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
......@@ -151,6 +215,122 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
elapsed, step, epoch);
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
int wc = 0;
int wordCount = 0;
int wordCountTotal = 0;
float loss = 0;
/* data files */
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
tf = fopen("tmp.xx.txt", "wb");
tc = 0;
double startT = GetClockSec();
wordCount = 0;
if(mem != NULL && mem->mode == UNI_FREE)
/* batch of input sequences */
XTensor batch;
/* padding */
XTensor padding;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, isLenSorted, wc, devID, mem)){
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
/* make the network */
model->Make(batch, output, padding);
int bSize = batch.GetDim(0);
int length = batch.GetDim(1);
/* prediction probabilities */
XTensor probs;
InitTensor1D(&probs, bSize * length);
/* get probabilities */
float prob = GetProb(&output, &gold, &probs);
/* dump the test result */
for(int s = 0; s < bSize; s++){
DTYPE sum = 0;
int * seq = seqs + s * length;
for(int i = 0; i < length; i++){
if(seq[i] >= 0){
fprintf(ofile, "%d ", seq[i]);
fprintf(ofile, "||| ");
for(int i = 0; i < length; i++){
if(seq[i] >= 0){
DTYPE p = probs.Get1D(s * length + i);
fprintf(ofile, "%.3e ", p);
sum += p;
fprintf(ofile, "||| %e\n", sum);
loss += -prob;
wordCount += wc;
wordCountTotal += wc;
if(mem != NULL && mem->mode == UNI_FREE)
if(mem != NULL && mem->mode == UNI_FREE)
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed,wordCountTotal, exp(loss / wordCount));
......@@ -226,18 +406,36 @@ int T2TTrainer::LoadBuf(FILE * file)
return lineCount;
/* clear the data buffer */
void T2TTrainer::ClearBuf()
nseqBuf = 0;
nextSeq = -1;
load a batch of sequences
>> file - the handle to the data file
>> batch - the batch
>> isLM - indicates whether the data is used for training lms
>> batch - the batch of the input sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> seqs - keep the sequences in an array
>> step - the step we go over when move to the next sequence
>> vs - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
int T2TTrainer::LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount)
int T2TTrainer::LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int step, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
if(nextSeq < 0 || nextSeq >= nseqBuf)
......@@ -262,26 +460,71 @@ int T2TTrainer::LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sB
wCount = 0;
nextSeq = seq + sc;
if(sc > 0){
if(sc <= 0)
return 0;
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
if(batch->order != 3 || batch->GetDim(0) != dims[0] ||
batch->GetDim(1) != dims[1] || batch->GetDim(2) != dims[2]){
InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(batch->grad == NULL)
InitTensor(batch->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(padding->grad == NULL)
InitTensor2D(padding->grad, sc, max, X_FLOAT, devID, mem);
if(output->grad == NULL)
InitTensor(output->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
for(int w = 0; w < seqLen[s]; w++){
batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
padding->Set2D(1.0F, s - seq, w);
if(w > 0)
output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if(w == seqLen[s] - 1)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
fprintf(tf, "\n");*/
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
if(seqs != NULL){
for(int w = seqLen[s]; w < max; w++)
seqs[seqSize++] = -1;
return sc;
......@@ -338,23 +581,7 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
XList ws(100);
for(int i = 0; i < model->encoder.nlayer; i++){
for(int i = 0; i < ws.count; i++){
XTensor * para = (XTensor*)ws.Get(i);
......@@ -366,32 +593,103 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
DTYPE * d = new DTYPE[para->unitNum * para->unitSize];
DTYPE * g = new DTYPE[para->unitNum * para->unitSize];
adamBeta1T *= adamBeta1;
adamBeta2T *= adamBeta2;
DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T);
XMemCopy(d, -1, para->data, para->devID, para->unitNum * para->unitSize);
XMemCopy(g, -1, paraGrad->data, paraGrad->devID, para->unitNum * para->unitSize);
/* m = beat_1 * m + (1-beta_1) * grad */
XTensor * m = (XTensor*)moments.Get(i);
_ScaleAndShiftMe(m, adamBeta1, 0);
_Sum(m, paraGrad, m, (1.0F - adamBeta1));
/* v = beat_2 * v + (1-beta_2) * grad * grad*/
XTensor * v = (XTensor*)moments2nd.Get(i);
_Multiply(paraGrad, paraGrad, v, adamBeta2/(1.0F - adamBeta2));
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
for (int i = 0; i < para->unitNum; i++) {
if (IsNAN(d[i]) || IsINF(d[i])) {
int nnn = 0;
if (IsNAN(g[i]) || IsINF(g[i])) {
int nnn = 0;
/* v2 = m / (sqrt(v) + delta) */
XTensor * v2 = NewTensorBuf(v, v->devID, v->mem);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
_Div(m, v2, v2);
/* the delta rule */
_Sum(para, v2, para, -e);
delete[] d;
delete[] g;
/* the delta rule */
_Sum(para, paraGrad, para, -lr);
/* the delta rule */
_Sum(para, paraGrad, para, -lr);
/* clear gradient */
prepare model for training
>> model - the model for training
void T2TTrainer::PrepareModel(T2TModel * model)
XList ws(100);
for(int i = 0; i < ws.count; i++){
XTensor * para = (XTensor*)ws.Get(i);
XTensor * m = new XTensor(para);
XTensor * m2 = new XTensor(para);
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
do padding on the output
>> output - output tensor of the network
>> padding - padding of a batch of sentences
void T2TTrainer::PadOutput(XTensor * output, XTensor * padding)
if(output == NULL || padding == NULL)
int on = output->order;
int * dimso = new int[on];
memcpy(dimso, output->dimSize, sizeof(int) * on);
output->Reshape(output->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
_CopyValues(padding, padding2);
_ScaleAndShiftMe(padding2, 1e9F, -1e9F);
_SumDim(output, padding2, output, 0);
output->Reshape(on, dimso);
delete[] dimso;
......@@ -37,12 +37,6 @@ namespace transformer
class T2TTrainer
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* buffer for loading words */
int * buf;
......@@ -75,6 +69,9 @@ public:
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* sentence batch size */
int sBatchSize;
......@@ -88,6 +85,22 @@ public:
/* traing step number */
int nstep;
/* indicates whether we use adam */
bool useAdam;
/* hyper parameters of adam*/
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
/* list of the moment of the parameter matrics */
XList moments;
/* list of the 2nd order moment of the parameter matrics */
XList moments2nd;
/* constructor */
......@@ -101,17 +114,34 @@ public:
/* train the model */
void Train(const char * fn, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* load data to buffer */
int LoadBuf(FILE * file);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount);
int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int step, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel * model);
/* do padding on the output */
void PadOutput(XTensor * output, XTensor * padding);
......@@ -27,6 +27,8 @@ namespace transformer
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
......@@ -38,6 +38,9 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
/* show arguments */
void ShowParams(int argc, const char ** argv);
extern int llnum;
extern FILE * tf;
......@@ -38,20 +38,42 @@ int TransformerMain(int argc, const char ** argv)
ShowParams(argc, argv);
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
LoadParamString(argc, argv, "train", trainFN, "");
LoadParamString(argc, argv, "model", modelFN, "");
LoadParamString(argc, argv, "test", testFN, "");
LoadParamString(argc, argv, "output", outputFN, "");
T2TTrainer trainer;
trainer.Init(argc, argv);
T2TModel model;
model.InitModel(argc, argv);
if(strcmp(trainFN, "")){
T2TTrainer trainer;
trainer.Init(argc, argv);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, &model);
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
/* load the model if neccessary */
if(strcmp(modelFN, ""))
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, ""))
trainer.Test(testFN, outputFN, &model);
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
......@@ -41,6 +41,7 @@ XDevManager GDevs;
stream = NULL;
isInitialized = false;
#ifdef USE_CUDA
......@@ -126,6 +127,7 @@ void XDevice::Init(int myDevID)
isInitialized = true;
/* clear it */
......@@ -152,11 +154,14 @@ void XDevice::Clear()
/* get cublas handle */
cublasHandle_t * XDevice::GetCublasHandle()
if (!isInitialized)
int devIDBackup = 0;
ProtectCudaDev(devID, devIDBackup);
CheckNTErrors(cublasCreate(&cublasHandle) == cudaSuccess,
CheckNTErrors(cublasCreate(&cublasHandle) == CUBLAS_STATUS_SUCCESS,
"Cannot create the cublas handle.");
isHandleReady = true;
BacktoCudaDev(devID, devIDBackup);
......@@ -169,6 +174,9 @@ cublasHandle_t * XDevice::GetCublasHandle()
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
if (!isInitialized)
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
......@@ -279,33 +287,13 @@ void XDevManager::Init()
cudaDeviceProp prop[64];
for(int i = 0; i < GPUCount; i++){
cudaGetDeviceProperties(&prop[i], i);
GPUs[i].devID = i;
#ifdef USA_CUDA_P2P
for(int i = 0; i < GPUCount; i++){
for(int j = 0; j < GPUCount; j++){
if(i == j)
int access;
cudaDeviceCanAccessPeer(&access, i, j);
bool hasUVA = (prop[i].unifiedAddressing && prop[j].unifiedAddressing);
fprintf(stderr, "device %d -> device %d access:%d UVA:%d\n", i, j, access, hasUVA ? 1 : 0);
if(access != 0){
CheckNTErrors((hasUVA == true), "at least one GPU does not support UVA.")
CheckNTErrors((cudaDeviceEnablePeerAccess(j, 0)==cudaSuccess), "cannot set cuda p2t mode!");
nGPU = GPUCount;
......@@ -351,6 +339,9 @@ into blocks
int XDevManager::GetCudaThread(const int devID, const int n, int * gridSize, int * blockSize)
if (!GPUs[devID].isInitialized)
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......@@ -402,6 +393,9 @@ into blocks
int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int nLimit, int * gridSize, int * blockSize)
if (!GPUs[devID].isInitialized)
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......@@ -67,6 +67,9 @@ public:
/* warp size of an (Navida) GPU */
int GPUWarpSize;
/* indicates whether the device class has been initialized */
bool isInitialized;
max grid size (or number of blocks) of an (Navida) GPU
NOTE: the grid size is alone with three dimensions (x, y, z)
......@@ -147,6 +147,7 @@ extern bool useCUDA;
#define B2I(V) V==0?false:true
......@@ -30,6 +30,9 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
int testxmemid = 0;
void * recordp = NULL;
XMem * GMem;
......@@ -43,6 +46,7 @@ XMem::XMem()
indexOffset = -1;
name = new char[64];
strcpy(name, "xmem");
signature = 0;
......@@ -64,6 +68,7 @@ XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum,
indexOffset = -1;
name = new char[64];
strcpy(name, "xmem");
signature = 0;
Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize);
......@@ -83,6 +88,9 @@ XMem::~XMem()
delete[] name;
delete[] memIndex;
delete[] memIndex2;
delete[] minSizeIndex;
......@@ -131,8 +139,8 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
CheckNTErrors(cudaMalloc((void **)&buf, myBufSize) == cudaSuccess, "Cannot allocate the memory.");
CheckNTErrors(cudaMemset(buf, 0, myBufSize) == cudaSuccess, "Cannot update the memory.");
CheckNTErrors(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_DEFAULT) == cudaSuccess, "Cannot make the cuda random number generator!");
CheckNTErrors(curandSetPseudoRandomGeneratorSeed(randGen, (unsigned)time(NULL)) == cudaSuccess, "Cannot generate the seed!");
CheckNTErrors(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_DEFAULT) == CURAND_STATUS_SUCCESS, "Cannot make the cuda random number generator!");
CheckNTErrors(curandSetPseudoRandomGeneratorSeed(randGen, (unsigned)time(NULL)) == CURAND_STATUS_SUCCESS, "Cannot generate the seed!");
......@@ -144,6 +152,11 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
bufSize = myBufSize;
if (myMode == FREE_ON_THE_FLY)
/* free memory */
......@@ -180,11 +193,16 @@ void XMem::Free(int myDevID, void * mem)
/* on GPUs */
#ifdef USE_CUDA
int devIDBackup = -1;
cudaError_t error = cudaFree((char*)mem);
if(error != cudaSuccess){
ShowNTErrors("Cannot free the memory.");
ShowNTErrors("Please specify USE_CUDA for compiling this program.");
......@@ -192,6 +210,15 @@ void XMem::Free(int myDevID, void * mem)
get signature
<< return - return the signature
MTYPE XMem::GetSignature()
return signature;
use string as the name of the memory pool
>> myName - name of the memory pool
......@@ -265,7 +292,7 @@ void XMem::SetComputationMode(bool myIsForComputation)
if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
CheckNTErrors(cublasCreate(&cublasHandle) == cudaSuccess, "Cannot create the cublas handle.");
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle.");
......@@ -279,16 +306,19 @@ initialize the index
void XMem::SetIndex(INT_64 indexSize, MTYPE minSizeFirst, int minSizeNum)
delete[] memIndex;
delete[] memIndex2;
delete[] minSizeIndex;
delete freeMemIndex;
delete minSizeIndex;
indexNodeNum = indexSize;
indexNodeNumUsed = minSizeNum;
nodeNum = indexSize;
nodeNumUsed = minSizeNum * 2;
indexEntryNum = minSizeNum;
freeMemIndex = new MPieceNode[indexNodeNum];
memset(freeMemIndex, 0, sizeof(MPieceNode) * indexNodeNum);
memIndex = new MPieceNode[nodeNum];
memset(memIndex, 0, sizeof(MPieceNode) * nodeNum);
memIndex2 = new MPieceNode[nodeNum];
memset(memIndex2, 0, sizeof(MPieceNode) * nodeNum);
minSizeIndex = new MTYPE[indexEntryNum];
memset(minSizeIndex, 0, sizeof(MTYPE) * indexEntryNum);
......@@ -395,6 +425,7 @@ void * XMem::AllocDynamic(int myDevID, MTYPE mySize)
ShowNTErrors("Please specify USE_CUDA for compiling this program.");
......@@ -496,21 +527,25 @@ void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch)
release a piece of memory
>> p - pointer to the memory piece we intend to release
>> size - size of the memory piece to release
>> code - code the memory
void XMem::Release(void * p)
void XMem::Release(void * p, MTYPE size, MTYPE code)
Release(devID, p);
if(code == signature)
Release(devID, p, size);
release a piece of memory
>> myDevID - device id
>> p - pointer to the memory piece we intend to release
>> size - size of the memory piece to release
void XMem::Release(int myDevID, void * p)
void XMem::Release(int myDevID, void * p, MTYPE size)
if(mode == FREE_ON_THE_FLY)
ReleaseStandard(myDevID, p);
ReleaseStandard(myDevID, p, size);
......@@ -522,7 +557,7 @@ release a piece of memory in the buffer
void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch)
CheckNTErrors((bufUsed >= mySize),
"Cannot allocate the memory. Please specify a larger buffer in XMem!");
"Cannot allocate the memory. Please specify a larger buffer in XMem!");
MTYPE backOffset = 0;
......@@ -553,7 +588,7 @@ allocate a piece of memory as "malloc"
void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
CheckNTErrors(freeMemIndex != NULL, "The index of the memory pool is not initialized!");
CheckNTErrors(memIndex != NULL, "The index of the memory pool is not initialized!");
if(mySize <= minSizeIndex[0])
mySize = minSizeIndex[0];
......@@ -567,20 +602,17 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
/* search for the memory piece avialable for the allocation */
for(int i = indexEntryNum; i > index; i--){
if(i == indexEntryNum){
entry = freeMemIndex + index;
entry = memIndex + index;
CheckNTErrors(mySize >= minSizeIndex[index], "Wrong index!");
entry = freeMemIndex + i;
if(entry->size == 0)
entry = memIndex + i;
node = entry;
node = entry->next;
while(node != NULL){
if(node->size == 0){
MPieceNode * next = node->next;
RemoveIndexNode(node, entry);
RemoveFreeIndexNode(node, entry);
node = next;
......@@ -598,41 +630,48 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
/* if a free memory piece is found, we allocate the memory on it. */
if(hit != NULL){
MHeader * head = (MHeader*)hit->p;
CheckNTErrors(head->state == 1, "Accessing the wrong memory piece!");
char * beg = (char*)GetPitchedAddress((char*)head + sizeof(MHeader) + sizeof(void*), MY_PITCH);
MHeader * head = &hit->head;
char * beg = (char*)GetPitchedAddress((char*)hit->p, MY_PITCH);
char * end = (char*)beg + mySize;
MTYPE needed = end - (char*)head;
MTYPE needed = end - (char*)hit->p;
MTYPE remaining = head->size - needed;
if(remaining >= minSizeIndex[0] + sizeof(MHeader) + sizeof(void*)){
/* make a new header for the remaining segment in the memory piece */
MHeader * newHead = (MHeader*)end;
newHead->state = 1;
newHead->size = remaining;
newHead->pre = head;
newHead->next = head->next;
newHead->blockID = head->blockID;
head->next = newHead;
head->size = needed;
if(newHead->next != NULL)
newHead->next->pre = newHead;
if(remaining >= minSizeIndex[0]){
/* make a new index node */
MPieceNode * newNode = freeMemIndex + indexNodeNumUsed++;
MPieceNode * newNode = memIndex + nodeNumUsed++;
newNode->head.indexNode = newNode;
newNode->p = end;
newNode->size = (char*)newHead + newHead->size -
(char*)GetPitchedAddress((char*)newNode->p + sizeof(MHeader) + sizeof(void*), MY_PITCH);
newNode->pReal = NULL;
newNode->size = (char*)end + remaining -
(char*)GetPitchedAddress((char*)end, MY_PITCH);
/* connections for headers */
MHeader &cur = hit->head;
MHeader &next = newNode->head;
next.pre = &cur; =; = &next;
if( != NULL)>pre = &next;
cur.size = needed;
next.state = 1;
next.size = remaining;
next.blockID = cur.blockID;
head->state = 2;
void * backPointer = (char*)beg - sizeof(void*);
*((MHeader**)backPointer) = head;
hit->size = 0;
hit->size = mySize;
hit->head.state = 2;
hit->pReal = beg;
blocks[hit->head.blockID].used += mySize;
result = beg;
......@@ -647,13 +686,16 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
int bi;
for(bi = 0; bi < blockNum; bi++){
XMemBlock * block = blocks + bi;
if(block->mem == NULL){
block->size = MAX(maxBlockSize, mySize + sizeof(MHeader) + sizeof(void*) + 2 * MY_PITCH);
if (block->mem != NULL && (block->head != NULL || block->size < mySize + 2 * MY_PITCH))
if (block->mem == NULL) {
block->size = MAX(maxBlockSize, mySize + 2 * MY_PITCH);
if (myDevID < 0) {
block->mem = new char[block->size];
memset(block->mem, 0, block->size);
else {
#ifdef USE_CUDA
int devIDBackup = -1;
......@@ -668,36 +710,41 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
ShowNTErrors("Please specify USE_CUDA for compiling this program.");
curBlockID = MAX(curBlockID, bi);
/* make a new header for the remaining segment in the memory piece */
MHeader * newHead = (MHeader*)block->mem;
newHead->state = 1;
newHead->size = block->size;
newHead->pre = NULL;
newHead->next = NULL;
newHead->blockID = bi;
/* make a new index node */
MPieceNode * newNode = freeMemIndex + indexNodeNumUsed++;
newNode->p = newHead;
newNode->size = (char*)newHead + newHead->size -
(char*)GetPitchedAddress((char*)newHead + sizeof(MHeader) + sizeof(void*), MY_PITCH);
result = AllocStandard(myDevID, mySize, myIsRebuiltIndex);
curBlockID = MAX(curBlockID, bi);
/* make a new index node */
MPieceNode * newNode = memIndex + nodeNumUsed++;
newNode->head.indexNode = newNode;
newNode->p = block->mem;
newNode->pReal = NULL;
newNode->size = (char*)block->mem + mySize -
(char*)GetPitchedAddress(block->mem, MY_PITCH);
MHeader &header = newNode->head;
header.state = 1;
header.size = block->size;
header.pre = NULL; = NULL;
header.blockID = bi;
block->head = &header;
block->used = 0;
result = AllocStandard(myDevID, mySize, myIsRebuiltIndex);
CheckNTErrors(bi < blockNum, "No enough memory is available!");
/* if all index nodes are used, we rebuild the index to release the nodes that are free */
if(indexNodeNumUsed == indexNodeNum){
if(nodeNumUsed == nodeNum){
CheckNTErrors(indexNodeNumUsed < indexNodeNum, "No enough index nodes for the memory pool!");
CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
return result;
......@@ -749,61 +796,73 @@ int XMem::FindIndexEntry(MTYPE mySize)
remove an index node
remove an index node for available memory pieces
>> node - node to remove
>> - the entry of the list that keeps the node
void XMem::RemoveIndexNode(MPieceNode * node, MPieceNode * entry)
void XMem::RemoveFreeIndexNode(MPieceNode * node, MPieceNode * entry)
MPieceNode * pre = node->pre;
MPieceNode * next = node->next;
CheckNTErrors(pre != NULL, "cannot free the entry node!");
if(pre == NULL){
if(entry == NULL){
entry = freeMemIndex + FindIndexEntry(node->size);
CheckNTErrors(entry == node, "Illegal index node!");
*entry = *next;
entry->pre = NULL;
pre->next = next;
if(next != NULL)
next->pre = pre;
memset(node, 0, sizeof(MPieceNode));
pre->next = next;
if(next != NULL)
next->pre = pre;
node->pre = NULL;
node->next = NULL;
add an index node
add an index node for available memory pieces
>> node - node to add
>> entry - the entry of the list to append the node
void XMem::AddIndexNode(MPieceNode * node, MPieceNode * entry)
void XMem::AddFreeIndexNode(MPieceNode * node, MPieceNode * entry)
MPieceNode * entryForMe = entry != NULL ? entry :
freeMemIndex + FindIndexEntry(node->size);
memIndex + FindIndexEntry(node->size);
MPieceNode * backup = entryForMe->next;
entryForMe->next = node;
node->pre = entryForMe;
node->next = backup;
if(backup != NULL)
backup->pre = node;
if(entryForMe->size == 0){
entryForMe->size = node->size;
entryForMe->p = node->p;
entryForMe->pre = NULL;
entryForMe->next = NULL;
MTYPE tmpSize = entryForMe->size;
void * tmpP = entryForMe->p;
entryForMe->size = node->size;
entryForMe->p = node->p;
node->size = tmpSize;
node->p = tmpP;
node->next = entryForMe->next;
node->pre = entryForMe;
entryForMe->next = node;
CheckNTErrors(node != node->next, "Something wrong with the index node!");
CheckNTErrors(node != node->pre, "Something wrong with the index node!");
remove an index node for memory pieces in use
>> node - node to remove
>> - the entry of the list that keeps the node
void XMem::RemoveAllocIndexNode(MPieceNode * node, MPieceNode * entry)
RemoveFreeIndexNode(node, entry);
add an index node for memory pieces in use
>> node - node to add
>> entry - the entry of the list to append the node
void XMem::AddAllocIndexNode(MPieceNode * node, MPieceNode * entry)
MPieceNode * entryForMe = entry != NULL ? entry :
memIndex + indexEntryNum + FindIndexEntry(node->size);
MPieceNode * backup = entryForMe->next;
entryForMe->next = node;
node->pre = entryForMe;
node->next = backup;
if(backup != NULL)
backup->pre = node;
CheckNTErrors(node != node->next, "Something wrong with the index node!");
CheckNTErrors(node != node->pre, "Something wrong with the index node!");
......@@ -812,39 +871,74 @@ void XMem::AddIndexNode(MPieceNode * node, MPieceNode * entry)
release a piece of memory as "free"
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> p - the pointer to the address of the memory we intend to free
>> size - size of the memory piece to release
void XMem::ReleaseStandard(int myDevID, void * p)
void XMem::ReleaseStandard(int myDevID, void * p, MTYPE size)
if(p == NULL)
if(size <= minSizeIndex[0])
size = minSizeIndex[0];
MPieceNode * entry = NULL;
MPieceNode * node = NULL;
MPieceNode * hit = NULL;
MPieceNode * last = NULL;
entry = memIndex + indexEntryNum + FindIndexEntry(size);
last = entry;
node = entry->next;
while(node != NULL){
CheckNTErrors(node->pre == last, "Something is wrong!");
CheckNTErrors(last->next == node, "Something is wrong!");
CheckNTErrors(node->head.state == 2, "Something is wrong!");
last = node;
if(node->size == 0){
MPieceNode * next = node->next;
RemoveFreeIndexNode(node, entry);
node = next;
ShowNTErrors("Something is wrong!");
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
if(node->pReal == p){
hit = node;
node = node->next;
CheckNTErrors(hit != NULL, "No header is found!");
hit->head.state = 1;
void * back = (char*)p - sizeof(void*);
MHeader * head = *(MHeader**)back;
CheckNTErrors(head->state == 2, "Illegal header of a memory piece!");
head->state = 1;
/* make a new index node */
MPieceNode * newNode = freeMemIndex + indexNodeNumUsed++;
newNode->p = head;
newNode->size = (char*)head + head->size -
(char*)GetPitchedAddress((char*)head + sizeof(MHeader) + sizeof(void*), MY_PITCH);
blocks[hit->head.blockID].used -= hit->head.size;
/* rebuild index to merge small fragments of memory and free the block with no use */
void XMem::RebuildIndex()
indexNodeNumUsed = indexEntryNum;
memset(freeMemIndex, 0, sizeof(MPieceNode) * indexEntryNum);
int nodeNumUsed2 = indexEntryNum * 2;
memset(memIndex2, 0, sizeof(MPieceNode) * indexEntryNum * 2);
for(int bi = 0; bi <= curBlockID; bi++){
XMemBlock * block = blocks + bi;
if(block->mem == NULL)
if(block->mem == NULL || block->head == NULL)
MHeader * head = (MHeader*)block->mem;
MHeader * head = block->head;
CheckNTErrors(head->size <= block->size, "Illegal memory block!");
block->head = NULL;
block->used = 0;
/* if the block is not used, we delete it */
if(head->state == 1 && head->size == block->size){
......@@ -870,6 +964,7 @@ void XMem::RebuildIndex()
/* if the block is in use, we build the index */
int pieceCount = 0;
MTYPE size = 0;
MHeader * newLast = NULL;
while(head != NULL){
MHeader * next = head->next;
if(head->state == 1){
......@@ -878,21 +973,63 @@ void XMem::RebuildIndex()
next = next->next;
head->next = next;
/* make a new index node */
MPieceNode * newNode = freeMemIndex + indexNodeNumUsed++;
newNode->p = head;
newNode->size = (char*)head + head->size -
(char*)GetPitchedAddress((char*)head + sizeof(MHeader) + sizeof(void*), MY_PITCH);
size += head->size;
MPieceNode * node = head->indexNode;
void * p = node->p;
/* make a new index node */
MPieceNode * newNode = memIndex2 + nodeNumUsed2++;
newNode->p = p;
newNode->size = (char*)p + head->size -
( head->state == 1 ? (char*)GetPitchedAddress((char*)p, MY_PITCH) : (char*)head->indexNode->pReal);
newNode->pre = NULL;
newNode->next = NULL;
CheckNTErrors(newNode->size > 0, "Illegal index node!");
MHeader * newHeader = &newNode->head;
newHeader->indexNode = newNode;
newHeader->pre = newLast;
newHeader->next = NULL;
newHeader->blockID = bi;
newHeader->size = head->size;
newHeader->state = head->state;
if(newLast != NULL)
newLast->next = newHeader;
newHeader->pre = newLast;
if(head->state == 1){
newNode->pReal = NULL;
MPieceNode * entry = memIndex2 + FindIndexEntry(newNode->size);
AddFreeIndexNode(newNode, entry);
newNode->pReal = head->indexNode->pReal;
MPieceNode * entry = memIndex2 + indexEntryNum + FindIndexEntry(newNode->size);
AddAllocIndexNode(newNode, entry);
block->used += head->size;
if(newLast == NULL)
block->head = newHeader;
head = next;
size += head->size;
CheckNTErrors(size <= block->size, "Illegal block size!");
newLast = newHeader;
head = next;
MPieceNode * backup = memIndex2;
memIndex2 = memIndex;
memIndex = backup;
nodeNumUsed = nodeNumUsed2;
......@@ -964,10 +1101,32 @@ void * XMem::GetAddress()
/* clear it */
void XMem::Clear()
for(int i = 0; i < blockNum; i++)
blocks[i].used = 0;
curBlock = blocks;
curBlockID = 0;
if (mode == UNI_FREE) {
for (int i = 0; i < blockNum; i++)
blocks[i].used = 0;
curBlock = blocks;
curBlockID = 0;
else if (mode == FREE_ON_THE_FLY) {
nodeNumUsed = indexEntryNum * 2;
memset(memIndex, 0, sizeof(MPieceNode) * indexEntryNum * 2);
for (int i = 0; i <= curBlockID; i++) {
blocks[i].head = NULL;
blocks[i].used = 0;
if (i > 0) {
blocks[i].size = blocks[i].sizeDesired;
Free(devID, blocks[i].mem);
blocks[i].mem = NULL;
curBlock = blocks;
curBlockID = 0;
else {
ShowNTErrors("Something is wrong!");
/* clear the buffer */
......@@ -1162,12 +1321,12 @@ void XMem::CreateBLASHandle()
#ifdef USE_CUDA
if(cublasHandle != NULL){
CheckNTErrors(cublasDestroy(cublasHandle) == cudaSuccess,
"Cannot destroy the cublas handle.");
CheckNTErrors(cublasDestroy(cublasHandle) == CUBLAS_STATUS_SUCCESS,
"Cannot destroy the cublas handle.");
CheckNTErrors(cublasCreate(&cublasHandle) == cudaSuccess,
"Cannot create the cublas handle.");
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
......@@ -53,28 +53,14 @@ typedef long long INT_64;
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
/* memory block */
struct XMemBlock
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* disired size of the block */
MTYPE sizeDesired;
mode of runnig a memory pool
- UNI_FREE: free all memory space when the memory allocation is no use
- FREE_ON_THE_FLY: run in normal "malloc" and "free" ways
struct MPieceNode;
/* header of a memory piece (FREE_ON_THE_FLY) */
struct MHeader
......@@ -96,6 +82,9 @@ struct MHeader
/* id of the memory block */
int blockID;
/* pointer to the index node */
MPieceNode * indexNode;
/* index of memory piece */
......@@ -112,6 +101,31 @@ struct MPieceNode
/* pointer to the head of a memory piece */
void * p;
/* pointer to the head of memory that is returned back to the user */
void * pReal;
/* header of the memory piece */
MHeader head;
/* memory block */
struct XMemBlock
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
/* first head of the block */
MHeader * head;
......@@ -138,6 +152,9 @@ public:
/* mode of running the memory pool */
/* signature */
MTYPE signature;
/* indicates whether the memory allocation is static */
bool isStatic;
......@@ -194,13 +211,16 @@ public:
/* index of the free memory pieces */
MPieceNode * freeMemIndex;
MPieceNode * memIndex;
/* for double buffering */
MPieceNode * memIndex2;
/* maximum number of index nodes */
INT_64 indexNodeNum;
INT_64 nodeNum;
/* count of the used nodes */
INT_64 indexNodeNumUsed;
INT_64 nodeNumUsed;
/* minimal size allocation for each index entry */
MTYPE * minSizeIndex;
......@@ -235,6 +255,9 @@ public:
/* free a piece of memory */
void Free(int myDevID, void * mem);
/* get signature */
MTYPE GetSignature();
/* use string as the name of the memory pool */
void SetName(const char * myName);
......@@ -282,10 +305,10 @@ public:
void * AllocBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
/* release a piece of memory */
void Release(void * p);
void Release(void * p, MTYPE size, MTYPE code);
/* release a piece of memory */
void Release(int myDevID, void * p);
void Release(int myDevID, void * p, MTYPE size);
/* release a piece of memory in the buffer */
void ReleaseBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
......@@ -302,14 +325,20 @@ public:
/* find the index entry for allocation query */
int FindIndexEntry(MTYPE mySize);
/* remove an index node */
void RemoveIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* remove an index node for available memory pieces */
void RemoveFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node */
void AddIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for available memory pieces */
void AddFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* remove an index node for memory pieces in use */
void RemoveAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for available memory pieces */
void AddAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* release a piece of memory as "free" */
void ReleaseStandard(int myDevID, void * p);
void ReleaseStandard(int myDevID, void * p, MTYPE size);
/* rebuild index to merge small fragments of memory and free the block with no use */
void RebuildIndex();
......@@ -379,6 +408,9 @@ public:
extern XMem * GMem;
extern int testxmemid;
extern void * recordp;
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -29,10 +29,18 @@ const char * GetOPName(int type)
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if (type == MATH_CEIL)
return "M_CEIL";
else if (type == MATH_EXP)
return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_LOG)
return "M_LOG";
else if (type == MATH_SQRT)
return "M_SQRT";
else if (type == MATH_SQUARE)
return "M_SQUARE";
else if (type == MATH_SIN)
return "M_SIN";
else if (type == MATH_COS)
......@@ -113,7 +121,9 @@ const char * GetOPName(int type)
return "S_TOPK";
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
if (type == FUNC_DROPOUT)
return "F_DROPOUT";
else if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
......@@ -32,9 +32,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_BASE 0x00001000
#define MATH_LOG MATH_EXP + 1
#define MATH_SIN MATH_LOG + 1
#define MATH_EXP MATH_CEIL + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1
......@@ -88,7 +92,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* activation functions */
......@@ -162,6 +162,7 @@ XTensor::XTensor(const XTensor &reference)
devID = reference.devID;
mem = reference.mem;
data =;
signature = reference.signature;
/* what we really want to do is " = NULL;"
As "reference" is constant, we cannot reset
......@@ -221,7 +222,8 @@ XTensor::~XTensor()
void XTensor::Init()
id = -1;
mem = NULL;;
mem = NULL;
signature = 0;
data = NULL;
dataHost = NULL;
dataP = NULL;
......@@ -254,7 +256,7 @@ void XTensor::DestroyData()
else if(data != NULL && isInGlobalMem)
FreeData(this, mem);
else if(data != NULL)
mem->Release(data, GetDataSizeInChar(), signature);
data = NULL;
if(dataHost != NULL)
......@@ -298,6 +300,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
XLink::Replace(this, newTensor);
......@@ -1135,19 +1138,21 @@ resize a tensor with a specified tensor size
bool XTensor::Resize(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
isInit = true;
/* free old mem */
if(data != NULL){
if (mem == NULL)
XMemFree(devID, data);
mem->Release(data, GetDataSizeInChar(), signature);
signature = mem != NULL ? mem->GetSignature() : 0;
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
isInit = true;
bool filledData = true;
bool zeroData = false;
for(int i = 0; i < order; i++){
......@@ -1243,56 +1248,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
resize a tensor with a specified tensor size (with no data filled)
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
>> myDataType - unit size (e.g., int, float, and double)
>> myDenseRatio - how often an element has non-zero value
<< return - succeeded or not
bool XTensor::ResizeWithNoData(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
/* free old mem */
if(data != NULL && mem == NULL)
delete[] (char*)data;
bool filledData = true;
bool zeroData = false;
for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]);
dimSizeRDI[order - i - 1] = dimSize[i];
if(myDimSize[i] < 0)
filledData = false;
if(myDimSize[i] == 0)
zeroData = true;
unitNum *= dimSize[i];
data = NULL;
denseRatio = myDenseRatio;
isSparse = denseRatio < 1.0F ? true : false;
dataType = myDataType;
unitSize = GetUnitSize(dataType);
if(myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
isDefaultDType = true;
unitNum = 0;
return false;
return true;
resize a tensor by another one
>> myTensor - tensor for reference
......@@ -1377,9 +1332,10 @@ dump data to a file
>> file - where to domp the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
void XTensor::Dump(FILE * file, const char * label, const int n, const int verbose)
void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
if (verbose > verboseLevel)
......@@ -1437,28 +1393,26 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int verbo
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
if (unitNum > 0) {
DTYPE f = *(DTYPE*)d;
fprintf(file, "%e", f);
int num = unitNum;
if (n > 0)
num = MIN(num, n);
for (int i = 1; i < num; i++) {
DTYPE * f = ((DTYPE*)d) + i;
fprintf(file, " %e", *f);
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
DTYPE f = ((DTYPE*)d)[i];
if(i == beg)
fprintf(file, "%e", f);
fprintf(file, " %e", f);
else {
ShowNTErrors("Cannot dump the tensor to the file in non-float values!");
else {
int num = this->unitNumNonZero > 0 ? *(int*)d : 0;
if (n > 0)
num = MIN(num, n);
if (beg + n > 0)
num = MIN(num, beg + n);
fprintf(file, "%d ", num);
for (int i = 0; i < num; i++) {
for (int i = beg; i < num; i++) {
int key = GetKeyInSparse(i);
DTYPE value = GetInSparse(i);
fprintf(file, "[%d]%e ", key, value);
......@@ -1481,13 +1435,14 @@ dump data to a file
>> file - where to domp the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int verbose)
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, verbose);
a.Dump(file, label, n, beg, verbose);
......@@ -1670,6 +1625,8 @@ void XTensor::AllocateData(XTensor * tensor, XMem * myMem, bool useBuf)
tensor->isInGlobalMem = true;
tensor->signature = 0;
......@@ -51,7 +51,6 @@ struct XLink;
/* computation flags */
......@@ -66,6 +65,9 @@ public:
/* memory pool */
XMem * mem;
/* signature of the memory pool */
MTYPE signature;
/* data array to keep the elements */
void * data;
......@@ -327,11 +329,6 @@ public:
const float myDenseRatio = 1.0F);
/* resize a matrix with a specified matrix size (with no data filled) */
bool ResizeWithNoData(const int myOrder, const int * myDimSize,
const float myDenseRatio = 1.0F);
/* resize a matrix by another one */
bool Resize(const XTensor * myTensor);
......@@ -339,11 +336,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* read data from a file */
void Read(FILE * file, const char * label = NULL);
......@@ -203,7 +203,7 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadInt(&c, alpha);
XLink::AddParamToHead(&c, alpha);
ShowNTErrors("Something is wrong!");
......@@ -204,7 +204,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadInt(&c, alpha);
XLink::AddParamToHead(&c, alpha);
ShowNTErrors("Something is wrong!");
......@@ -50,7 +50,6 @@ void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
CheckNTErrors((dimRDI == a->order - 1), "Incorrect reduction dimension!");
CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");
int stride = 1;
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-31
#include <math.h>
#include "../../XName.h"
#include "Unary.h"
......@@ -5,9 +26,18 @@
namespace nts{
DTYPE square(DTYPE x)
return x * x;
DTYPE round(DTYPE r)
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
#ifdef USE_CUDA
/* define three marco separately, specify the respective function names */
/* define three marco separately, specify the respective function names (GPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
......@@ -45,14 +75,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
......@@ -65,11 +116,8 @@ _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
/*_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
/* define three marco separately, specify the respective function names */
/* define three marco separately, specify the respective function names (CPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
......@@ -102,14 +150,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-07-31
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "Unary.h"
#include "Unary.cuh"
namespace nts {
DTYPE CudaSquare(DTYPE x)
return x * x;
DTYPE CudaRound(DTYPE r)
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
......@@ -15,7 +49,7 @@ void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
b[i] = (DTYPE)origFunc(a[i]); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size) \
void Kernel##funcName(__half * a, __half * b, int size) \
{ \
return; \
} \
......@@ -37,12 +71,12 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
ProtectCudaDev(a->devID, devIDBackup); \
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName << <blocks, threads >> > \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName << <blocks, threads >> > \
((__half*)a->data, (__half*)b->data, a->unitNum); \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, a->unitNum); \
} \
else { \
ShowNTErrors("TODO!"); \
......@@ -52,11 +86,16 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
} \
\ No newline at end of file
......@@ -38,6 +38,15 @@ void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
void _CudaAbsolute(const XTensor * a, XTensor * b);
/* set each entry to its ceil value (CUDA Kernel) */
void KernelCeil(DTYPE * a, DTYPE * b, int size);
/* set each entry to its ceil value (CUDA Kernel) with float16 data type*/
void KernelCeil(__half * a, __half * b, int size);
/* set each entry to its ceil value */
void _CudaCeil(const XTensor * a, XTensor * b);
/* set each entry to its exponent value (CUDA Kernel) */
void KernelExp(DTYPE * a, DTYPE * b, int size);
......@@ -47,6 +56,15 @@ void KernelExp(__half * a, __half * b, int size);
/* set each entry to its exponent value */
void _CudaExp(const XTensor * a, XTensor * b);
/* set each entry to its floor value (CUDA Kernel) */
void KernelFloor(DTYPE * a, DTYPE * b, int size);
/* set each entry to its floor value (CUDA Kernel) with float16 data type*/
void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* set each entry to its logarithm value (CUDA Kernel) */
void KernelLog(DTYPE * a, DTYPE * b, int size);
......@@ -56,6 +74,34 @@ void KernelLog(__half * a, __half * b, int size);
/* set each entry to its logarithm value */
void _CudaLog(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
void _CudaRound(const XTensor * a, XTensor * b);
/* set each entry to its sqrt value (CUDA Kernel) */
void KernelSqrt(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sqrt value (CUDA Kernel) with float16 data type*/
void KernelSqrt(__half * a, __half * b, int size);
/* set each entry to its sqrt value */
void _CudaSqrt(const XTensor * a, XTensor * b);
/* set each entry to its square value (CUDA Kernel) */
void KernelSquare(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square value (CUDA Kernel) with float16 data type*/
void KernelSquare(__half * a, __half * b, int size);
/* set each entry to its square value */
void _CudaSquare(const XTensor * a, XTensor * b);
/* set each entry to its sine value (CUDA Kernel) */
void KernelSin(DTYPE * a, DTYPE * b, int size);
......@@ -83,15 +129,6 @@ void KernelTan(__half * a, __half * b, int size);
/* set each entry to its tangent value */
void _CudaTan(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
//void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
//void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
//void _CudaRound(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......@@ -28,95 +28,103 @@ namespace nts{
/* set every entry to its absolute value */
void _Absolute(const XTensor * a, XTensor * b);
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
/* set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing */
void _AbsoluteMe(XTensor * a);
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
/* set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Absolute(const XTensor & a);
/* set every entry to its ceil value */
void _Ceil(const XTensor * a, XTensor * b);
/* set every entry to its ceil value (do it on site)
keep the result in the input tensor a and return nothing */
void _CeilMe(XTensor * a);
/* set every entry to its ceil value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Ceil(const XTensor & a);
/* set every entry to its exponent value */
void _Exp(const XTensor * a, XTensor * b);
set every entry to its exponent value (do it on site)
keep the result in the input tensor a and return nothing
/* set every entry to its exponent value (do it on site)
keep the result in the input tensor a and return nothing */
void _ExpMe(XTensor * a);
set every entry to its exponent value (return a XTensor structure)
make a new tensor to keep the result and return it
/* set every entry to its exponent value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Exp(const XTensor & a);
/* set every entry to its floor value */
void _Floor(const XTensor * a, XTensor * b);
/* set every entry to its floor value (do it on site)
keep the result in the input tensor a and return nothing */
void _FloorMe(XTensor * a);
/* set every entry to its floor value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b);
set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing
/* set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing */
void _LogMe(XTensor * a);
set every entry to its logarithm value (return a XTensor structure)
make a new tensor to keep the result and return it
/* set every entry to its logarithm value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Log(const XTensor & a);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */
void _RoundMe(XTensor * a);
/* set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */
void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a);
/* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b);
/* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */
void _SquareMe(XTensor * a);
/* set every entry to its square value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a);
/* set every entry to its sine value */
void _Sin(const XTensor * a, XTensor * b);
set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing
/* set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing */
void _SinMe(XTensor * a);
set every entry to its sine value (return a XTensor structure)
make a new tensor to keep the result and return it
/* set every entry to its sine value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sin(const XTensor & a);
/* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b);
set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing
/* set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing */
void _CosMe(XTensor * a);
set every entry to its cosine value (return a XTensor structure)
make a new tensor to keep the result and return it
/* set every entry to its cosine value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Cos(const XTensor & a);
/* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b);
set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing
/* set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing */
void _TanMe(XTensor * a);
set every entry to its tangent value (return a XTensor structure)
make a new tensor to keep the result and return it
/* set every entry to its tangent value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a);
/* set every entry to its round value */
//void _Round(const XTensor * a, XTensor * b);
set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing
//void _RoundMe(XTensor * a);
set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it
//XTensor Round(const XTensor & a);
#endif //end __UNARY_H__
\ No newline at end of file
......@@ -77,7 +77,7 @@ void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize,
int targetBlockID = targetBlocks[i / blockSize];
int targetOffset = i % blockSize;
*(target + blockSize * targetBlockID + targetOffset) = source[i];
target[blockSize * targetBlockID + targetOffset] = source[i];
......@@ -98,16 +98,6 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target,
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if(blockSize % sizeof(double) == 0){
int bSize = blockSize / sizeof(double);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
//GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
//KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
// ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
if(blockSize % sizeof(float) == 0){
int bSize = blockSize / sizeof(float);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
......@@ -405,7 +405,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.x = (unsigned int)vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
......@@ -482,7 +482,7 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
if (tid < 32){
if (tid < blockDim.y / 32)
threadMax = data[tid];
else threadMax = 0;
else threadMax = FLOAT_MIN;
threadMax = shflDownReduceMax(threadMax);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
......@@ -480,8 +480,8 @@ void KernelReduceSumFast(__half * input, __half * output,
if data storage is discontinuius ,use this way to reduce
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum,
int strideNum, DTYPE * shift, DTYPE power, bool isExp)
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
int blockNum, DTYPE * shift, DTYPE power, bool isExp)
int idx = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -629,7 +629,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.x = (unsigned int)vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-09-12
#include "../XName.h"
#include <math.h>
#include <time.h>
#include "Dropout.h"
#include "Dropout.cuh"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/ScaleAndShift.h"
namespace nts{ // namespace nts(NiuTrans.Tensor
generate a random bernoulli number
DTYPE RandomBernoulli(DTYPE prob)
return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)1.0 : (DTYPE)0.0;
dropout function
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
>> x - input tensor
>> y - output tensor
>> prob - probability to set an element zero
void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor again with special probability */
int unitNum = x->unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
if(x->devID >=0 || y->devID >= 0){
_CudaDropout(x, y, maskTensor, scaleFactor);
delete[] maskArray;
XTensor * inter = NewTensorBuf(x, x->devID, x->mem);
_Multiply(x, maskTensor, inter);
_ScaleAndShift(inter, y, scaleFactor, 0);
delete[] maskArray;
dropout function (return a XTensor structure)
make a new tensor to keep the result and return it
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
>> x - input tensor
>> y - output tensor
>> prob - probability to set an element zero
XTensor Dropout(const XTensor &x, DTYPE prob)
XTensor y(&x);
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor again with special probability */
srand((unsigned int)time(NULL));
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor maskTensor(&x);
maskTensor.SetData(maskArray, unitNum);
XTensor inter;
inter = Multiply(x, maskTensor);
y = ScaleAndShift(inter, scaleFactor, 0);
delete[] maskArray;
///* tensor connection */
//XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
//XLink::AddParamToHead(&y, prob);
return y;
backward computation of dropout function
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> prob - probability to set an element zero
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob)
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
int unitNum = y->unitNum;
DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - prob);
/* generate a mask tensor again with special probability */
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
delete[] maskArray;
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < unitNum; i++)
dedxp[i] = dedyp[i] * maskArray[i] * scaleFactor;
delete[] maskArray;
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-09-12
#include "Dropout.h"
#include "Dropout.cuh"
#include "Loss.cuh"
#include "../XDevice.h"
#ifdef USE_CUDA
// the CUDA stuff
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
dropout function (Cuda kernel)
>> x - input data pointer
>> y - output data pointer
>> m - mask indicator to set zero
>> s - the scale factor
>> size - size of input/output
void KernelDropoutCompute(DTYPE * x, DTYPE * y, DTYPE * m, DTYPE s, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
y[i] = x[i] * m[i] * s;
dropout function (Cuda version)
>> x - input tensor
>> y - output tensor
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * mask, DTYPE scaleFactor)
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "the activation function (rectify) does not support sparse matrices.");
CheckNTErrors(x->unitNum && y->unitNum, "we require two vectors with the same length.");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelDropoutCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, (DTYPE*)mask->data, scaleFactor, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
backward computation of dropout function (Cuda kernel)
dE/dx = dE/dy * dy/dx
>> dedy - dE/dy
>> dedx - dE/dx
>> m - mask indicator to set zero
>> s - the scale factor
>> size - size of input/output
void KernelDropoutBackward(DTYPE * dedy, DTYPE * dedx,
DTYPE * m, DTYPE s, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
dedx[i] = dedy[i] * m[i] * s;
backward computation of dropout function (Cuda version)
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
const XTensor * mask, DTYPE scaleFactor)
int gridSize[3], blockSize[3];
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/ds = dE/dy * dy/ds */
((DTYPE*)dedy->data, (DTYPE*)dedx->data,
(DTYPE*)mask->data, scaleFactor, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-09-12
#ifndef __DROPOUT_CUH__
#define __DROPOUT_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* dropout function (Cuda version) */
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * r, DTYPE scaleFactor);
/* de/dx (Cuda version) */
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
const XTensor * mask, DTYPE scaleFactor);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-09-12
#ifndef __DROPOUT_H__
#define __DROPOUT_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* dropout function */
void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
/* de/dx */
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob = 0.5);
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_H__
\ No newline at end of file
......@@ -51,8 +51,7 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE),
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI];
......@@ -58,8 +58,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!");
CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");
......@@ -48,19 +48,19 @@ loss function to measure the "number" of errors
/* compute the loss */
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss (log version) */
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
int leadDim, int gBeg, int gLen, int oBeg);
/* backward compuation for a single element */
/* backward compuation for (dense) vectors */
void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
} // namespace nts(NiuTrans.Tensor)
......@@ -16,8 +16,8 @@
* $Created by: XIAO Tong (email: 2018-04-25
* $Created by: XIAO Tong (email: 2018-04-25
#include "../XName.h"
#include <math.h>
......@@ -16,8 +16,8 @@
* $Created by: XIAO Tong (email: 2018-04-25
* $Created by: XIAO Tong (email: 2018-04-25
#include "Sigmoid.h"
#include "Sigmoid.cuh"
......@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* rectify function y = max(0, x) (Cuda version) */
/* sigmoid function y = 1/(1+exp(-x)) (Cuda version) */
void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
......@@ -45,20 +45,17 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){
if(i < leadDim)
dimSize[i] = -x->dimSize[i];
dimSize[i] = x->dimSize[i];
else if(i > leadDim)
dimSize[i - 1] = -x->dimSize[i];
dimSize[i - 1] = x->dimSize[i];
XMem * mem = x->mem;
XTensor * max = NULL;
XTensor * sum = NULL;
max = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
max->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, max->unitNum * max->unitSize) : XMemAlloc(max->devID, max->unitNum * max->unitSize);
sum->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, sum->unitNum * sum->unitSize) : XMemAlloc(sum->devID, sum->unitNum * sum->unitSize);
max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
_ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
......@@ -114,18 +111,9 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
if(mem != NULL){
mem->ReleaseBuf(mem->devID, max->unitNum * max->unitSize);
mem->ReleaseBuf(mem->devID, sum->unitNum * sum->unitSize);
XMemFree(max->devID, max->data);
XMemFree(sum->devID, sum->data);
max->data = NULL;
sum->data = NULL;
delete max;
delete sum;
delete[] dimSize;
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-09-12
#include "../XUtility.h"
#include "TDropout.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
case 1: test Dropout function.
bool TestDropout1()
/* a input tensor of size (4, 5) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 40;
dimSize[1] = 50;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetDataRand(0, 1);
/* call Dropout function */
float prob = 0.2F;
int seed = 20;
_Dropout(x, y, seed, prob);
yUser = Dropout(*x);
/* check result */
int zeroNum1 = 0;
int zeroNum2 = 0;
float * data1 = (float*)y->data;
float * data2 = (float*);
for (int i = 0; i < unitNum; i++){
DTYPE tmp1 = data1[i];
DTYPE tmp2 = data2[i];
if(tmp1 == 0.0F)
zeroNum1 += 1;
if(tmp2 == 0.0F)
zeroNum2 += 1;
printf("CPU Test:\n");
printf("In tensor y, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
printf("In tensor yUser, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetDataRand(0, 1);
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, prob);
yUserGPU = Dropout(*xGPU);
/* check result */
zeroNum1 = 0;
zeroNum2 = 0;
data1 = (float*)y->data;
data2 = (float*);
for (int i = 0; i < unitNum; i++){
DTYPE tmp1 = data1[i];
DTYPE tmp2 = data2[i];
if(tmp1 == 0.0F)
zeroNum1 += 1;
if(tmp2 == 0.0F)
zeroNum2 += 1;
printf("CPU Test:\n");
printf("In tensor y, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
printf("In tensor yUser, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
/* destroy variables */
delete x;
delete y;
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
/* destroy variables */
delete x;
delete y;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
case 2: test Dropout function and backward computation.
bool TestDropout2()
/* a input tensor of size (4, 5) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 4;
dimSize[1] = 5;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
/* initialize variables */
_SetDataFixedFloat(x, 1.0F);
_SetDataFixedFloat(dedy, 1.0F);
/* call Dropout function */
float prob = 0.5F;
int seed = 1;
_Dropout(x, y, seed, prob);
_DropoutBackward(y, x, dedy, dedx, 1, prob);
/* check result */
y->Dump(stderr, "y");
dedx->Dump(stderr, "dedy");
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
_SetDataFixedFloat(xGPU, 1.0F);
_SetDataFixedFloat(dedyGPU, 1.0F);
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, prob);
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, prob);
/* check result */
yGPU->Dump(stderr, "yGPU");
dedxGPU->Dump(stderr, "dedyGPU");
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
/* destroy variables */
delete x;
delete y;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
/* other cases */
/* test for Dropout Function */
bool TestDropout()
XPRINT(0, stdout, "[TEST DROPOUT] dropout function and its backward computation \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestDropout1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDropout2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: Xu Chen (email: 2018-09-12
#ifndef __TEST_DROPOUT_H__
#define __TEST_DROPOUT_H__
#include "../function/Dropout.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Dropout Function */
bool TestDropout();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_DROPOUT_H__
......@@ -19,6 +19,7 @@
* $Created by: LI Yinqiao (email: 2018-04-30
#include "../core/math/ScaleAndShift.h"
#include "TLoss.h"
......@@ -62,7 +63,7 @@ bool TestLoss1()
error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -82,7 +83,7 @@ bool TestLoss1()
error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......@@ -140,7 +141,7 @@ bool TestLoss2()
error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -160,7 +161,7 @@ bool TestLoss2()
error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......@@ -226,7 +227,7 @@ bool TestLoss3()
error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -244,7 +245,7 @@ bool TestLoss3()
error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......@@ -33,52 +33,130 @@ bool TestXMemCase1()
int blcokSize = 16;
int testNum = caseNum * 10;
for(int i = 0, scalar = 1; i < 3; i++){
XMem mem;
mem.Initialize(-1, FREE_ON_THE_FLY, blcokSize * sizeof(int) * scalar * scalar, 1000, 0);
mem.SetIndex(10000, blcokSize * sizeof(int) / 2);
int ** p = new int*[caseNum];
int * size = new int[caseNum];
for(int i = 0; i < caseNum; i++){
p[i] = NULL;
size[i] = rand() % (2*blcokSize);
int devIDs[2];
int devNum = 1;
devIDs[0] = -1;
/*if (GDevs.nGPU > 0) {
devIDs[1] = 0;
devNum = 2;
devIDs[0] = 0;
devNum = 1;
int * buf = new int[blcokSize * 10];
for (int id = 0; id < devNum; id++) {
int devID = devIDs[id];
for (int iter = 0, scalar = 1; iter < 3; iter++) {
XMem mem;
mem.Initialize(devID, FREE_ON_THE_FLY, blcokSize * sizeof(int) * scalar * scalar, 1000, 0);
mem.SetIndex(10000, blcokSize * sizeof(int) / 2);
int ** p = new int*[caseNum];
int * size = new int[caseNum];
for (int i = 0; i < caseNum; i++) {
p[i] = NULL;
size[i] = rand() % (2 * blcokSize);
for(int i = 0; i < testNum * scalar; i++){
int j = rand() % caseNum;
for (int i = 0; i < testNum * scalar; i++) {
//fprintf(stderr, "%d %d\n", testxmemid, ok);
int j = rand() % caseNum;
if(p[j] == NULL){
p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
for(int k = 0; k < size[j]; k++)
p[j][k] = j;
mem.ReleaseStandard(mem.devID, p[j]);
for(int k = 0; k < size[j]; k++)
p[j][k] = -1;
p[j] = NULL;
if (p[j] == NULL) {
p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
for (int k = 0; k < size[j]; k++)
buf[k] = j;
XMemCopy(p[j], devID, buf, -1, sizeof(int) * size[j]);
else {
mem.ReleaseStandard(mem.devID, p[j], size[j] * sizeof(int));
for (int k = 0; k < size[j]; k++)
buf[k] = -1;
XMemCopy(p[j], devID, buf, -1, sizeof(int) * size[j]);
p[j] = NULL;
for(int k = 0; k < caseNum; k++){
if(p[k] != NULL){
for(int o = 0; o < size[k]; o++){
if(p[k][o] != k){
ok = false;
for (int k = 0; k < caseNum; k++) {
if (p[k] != NULL) {
XMemCopy(buf, -1, p[k], devID, sizeof(int) * size[k]);
for (int o = 0; o < size[k]; o++) {
if (buf[o] != k) {
ok = false;
/*MPieceNode * entry = NULL;
MPieceNode * node = NULL;
entry = mem.memIndex + mem.indexEntryNum + mem.FindIndexEntry(112);
int cc = 0;
node = entry->next;
while(node != NULL){
fprintf(stderr, "%d ", cc++);
if(node->size == 0){
MPieceNode * next = node->next;
node = next;
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
node = node->next;
fprintf(stderr, "\n");*/
/*int ccc = 0;
bool hhh = recordp != NULL ? false : true;
for(int i = 0; i < mem.indexEntryNum; i++){
MPieceNode * entry = mem.memIndex + mem.indexEntryNum + i;
MPieceNode * last = entry;
MPieceNode * node = entry->next;
ccc = 0;
while(node != NULL){
CheckNTErrors(node->pre == last, "XSomething is wrong!");
CheckNTErrors(last->next == node, "XSomething is wrong!");
last = node;
if(node->pReal == recordp){
hhh = true;
if(node->size == 0){
MPieceNode * next = node->next;
node = next;
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
node = node->next;
int nnn = 0;
delete[] p;
delete[] size;
scalar *= 2;
delete[] p;
delete[] size;
scalar *= 2;
delete[] buf;
return ok;
......@@ -113,4 +191,4 @@ bool TestXMem()
return returnFlag;
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -76,6 +76,7 @@ bool Test()
wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong;
wrong = !TestDropout() || wrong;
wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong;
wrong = !TestLogSoftmax() || wrong;
......@@ -69,6 +69,7 @@
#include "TUnsqueeze.h"
#include "TXMem.h"
#include "TDropout.h"
#include "THardTanH.h"
#include "TIdentity.h"
#include "TLogSoftmax.h"
Markdown 格式
您添加了 0 到此讨论。请谨慎行事。
注册 或者 后发表评论