Merge branch 'xuchen' into xiaotong-working

7809ed05 · xuchen · f4be1882 · 03a9836e · 7809ed05 · 7809ed05
Commit 7809ed05 authored Nov 13, 2018 by xuchen
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    else if(operID == FUNC_LOGSOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else if(operID == FUNC_RECTIFY)
        _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    else if(operID == FUNC_SOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else{
        ShowNTErrors("Wrong activation function type!");

--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                        XTensor * dedy, XTensor * dedx,
+                        XTensor * dedy, XTensor * dedx, XTensor * padding,
                        int funcID, void * params,
                        LOSS_FUNCTION_NAME lossName)
 {
@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
    }
    else if(funcID == FUNC_LOGSOFTMAX){
        int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
    }
    else if(funcID == FUNC_RECTIFY){
        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
    }else if(funcID == FUNC_SOFTMAX){
        int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
    }
    else{
        ShowNTErrors("wrong function found when call the backward process!");
@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, 
-                        XTensor * dedy, 
+                        XTensor * dedy, XTensor * padding,
                        LOSS_FUNCTION_NAME lossName)
 {
-    _LossBackward(dedy, gold, y, lossName);
+    //_LossBackward(dedy, gold, y, lossName);
+    if(lossName == CROSSENTROPY)
+        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
 }
 }
\ No newline at end of file
--- a/source/network/XBackwardLoss.h
+++ b/source/network/XBackwardLoss.h
@@ -36,13 +36,13 @@ class XLossGrad
 public:
    /* compute dE/dx for a given function y = f(x) */
    void Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                 XTensor * dedy, XTensor * dedx,
+                 XTensor * dedy, XTensor * dedx, XTensor * padding,
                 int funcID, void * params,
                 LOSS_FUNCTION_NAME lossName);
    /* compute dE/dy for variable y and error(loss) function E */
    void Compute(XTensor * gold, XTensor * y, 
-                 XTensor * dedy, 
+                 XTensor * dedy, XTensor * padding,
                 LOSS_FUNCTION_NAME lossName);
 };

--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
    DelTensorBuf(b);
    node->visitMark = NODE_FINISHED;
-    delete b;
 }
 /* 

--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -55,7 +55,7 @@ void XNetClearAll()
 XNet::XNet()
 {
    nodes.Clear();
-    isGradEfficient = true;
+    isGradEfficient = false;
 }
 /* de-constructor */
@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
    XList golds(1);
    golds.Add(&gold);
-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+    Backward(roots, golds, paddings, loss);
+}
+/* 
+backward propagation to obtain gradient wrt. the loss/error function 
+>> root - root node (output) of the network
+>> gold - gold standard for the output
+>> padding - specify a target value that is ignored and does not contribute to the loss computation
+>> loss - name of loss function
+*/
+void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
+{
+    XList roots(1);
+    roots.Add(&root);
+    XList golds(1);
+    golds.Add(&gold);
+    XList paddings(1);
+    paddings.Add(&padding);
+    Backward(roots, golds, paddings, loss);
 }
 /* 
@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
    XList golds(1);
    golds.Add(NULL);
-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+    Backward(roots, golds, paddings, loss);
 }
 /* 
@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes 
 >> root - a list of root nodes (output) of the network
 >> gold - a list of gold standard for the output
+>> padding - specify a target value that is ignored
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
 {
    Traverse(roots);
@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
    for(int i = 0; i < roots.count; i++){
        XTensor * root = (XTensor*)roots.Get(i);
        XTensor * gold = (XTensor*)golds.Get(i);
+        XTensor * padding = (XTensor*)paddings.Get(i);
        XLink &income = root->income;
        int funcID = income.typeID;
        void * params = income.params;
@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
           Note that we do not need to obtain dE/dy here because it is no use in the 
           following process of back-propagation */
        if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
+            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
                XTensor * x = income.tails[0];
                XNoder::MakeGrad(x);
-            lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
+                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
                root->visitMark = NODE_FINISHED;
            }
+            else {
+                XNoder::MakeGrad(root);
+                lossGrad.Compute(gold, root, root->grad, padding, loss);
+            }
+        }
        /* we compute dE/dy (y is the output) if no predefined activation function is used */
        else{
            XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, loss);
+            lossGrad.Compute(gold, root, root->grad, NULL, loss);
        }
    }
@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
 /* 
 backward propagation to obtain gradient
 with a number of root nodes 
->> root - a list of root nodes (output) of the network
+>> roots - a list of root nodes (output) of the network
 >> loss - name of loss function
 */
 void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
 {
    XList golds(roots.count);
-    for(int i = 0; i < roots.count; i++)
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++) {
        golds.Add(NULL);
+        paddings.Add(NULL);
+    }
+    Backward(roots, golds, paddings, loss);
+}
+/* 
+backward propagation to obtain gradient
+with a number of root nodes 
+>> roots - a list of root nodes (output) of the network
+>> golds - a list of gold standard for the output
+>> loss - name of loss function
+*/
+void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+{
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++)
+        paddings.Add(NULL);
-    Backward(roots, golds, loss);
+    Backward(roots, golds, paddings, loss);
 }
 /* 

--- a/source/network/XNet.h
+++ b/source/network/XNet.h
@@ -62,17 +62,24 @@ struct XNet
    /* backward propagation to obtain gradient wrt. the loss/error function */
    void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
+    /* backward propagation to obtain gradient wrt. the loss/error function */
+    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
    /* backward propagation to obtain gradient */
    void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
    /* backward propagation to obtain gradient wrt. the loss/error function
       with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
    /* backward propagation to obtain gradient
       with a number of root nodes */
    void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
+    /* backward propagation to obtain gradient
+       with a number of root nodes */
+    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
    /* backward computation for a given node */
    void BackwardNode(XTensor * node, bool isEfficent = false);

--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
        if(isEnd)
            break;
+        Test(testFN, outputFN, model);
    }
    double elapsed = GetClockSec() - startT;
@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
    /* for y = softmax(s), we get dE/ds
        where E is the error function (defined by loss) */
-    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
+    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
    /* for s = x * w, we get 
       dE/w_{i,j} = dE/ds_j * ds/dw_{i,j} 

--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
@@ -68,9 +68,10 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
 }
 /* 
-make positional embeddings (of size eSize * length
+make positional embeddings (of size eSize * length)
-eSize - embedding size
+>> eSize - embedding size
-length - length of the sequenc
+>> d - dimension size of the hidden layers
+>> length - length of the sequence
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
@@ -114,15 +115,15 @@ make the network
 */
 XTensor T2TEmbedder::Make(XTensor &input)
 {
-    CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
+    //CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-    CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
+    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
    int dims[MAX_TENSOR_DIM_NUM];
    memcpy(dims, input.dimSize, input.order * sizeof(int));
-    dims[input.order - 1] = eSize;
+    dims[input.order] = eSize;
    XTensor wordEmbedding;
    XTensor posEmbedding;
@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
    /* we make positional embeddings first */
    //if(!match){
    if(true){
-        InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
+        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
    }
    /* then we make word embeddings */
-    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    //wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    wordEmbedding = Gather(w, input);
+    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
    /* we sum over the two embeddings */
    return wordEmbedding + posEmbedding;

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -121,13 +121,21 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
    XTensor encoding;
    /* generate mask to see "previous" words only */
-    int len = input.GetDim(input.order - 2);
+    //int len = input.GetDim(input.order - 2);
-    int * dims = new int[input.order + 1];
+    //int * dims = new int[input.order + 1];
+    //for(int i = 0; i < input.order; i++)
+    //    dims[i + 1] = input.GetDim(i);
+    //dims[0] = nhead;
+    //dims[input.order] = len;
+    //XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
+    int len = input.GetDim(input.order - 1);
+    int * dims = new int[input.order + 2];
    for(int i = 0; i < input.order; i++)
        dims[i + 1] = input.GetDim(i);
    dims[0] = nhead;
-    dims[input.order] = len;
+    dims[input.order + 1] = len;
-    XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
+    XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
    /* an upper triangular matrix where the cells of the upper triangular are set to -1e-9.
        this matrix can be used to prevent the attention to current or following words in
@@ -148,16 +156,16 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;
-    XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
+    //XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
-                                        padding.denseRatio, padding.devID, padding.mem);
+    //                                  padding.denseRatio, padding.devID, padding.mem);
+    //    
-    /* mask of the padding */
+    ///* mask of the padding */
-    _Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
+    //_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
-    _Unsqueeze(padding2, padding3, 0, nhead);
+    //_Unsqueeze(padding2, padding3, 0, nhead);
+    //    
-    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+    //_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+    //    
-    //_Sum(&mask, padding3, &mask);
+    ////_Sum(&mask, padding3, &mask);
    encoding = MakeEncoder(input, mask, isTraining);
    outputLayer.Make(encoding, output);
@@ -165,8 +173,8 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
    delete[] dims;
    delete[] dimsPadding;
+    //DelTensorBuf(padding3);
    DelTensorBuf(padding2);
-    DelTensorBuf(padding3);
 }
 /* 
@@ -235,8 +243,8 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
    delete[] dims;
    delete[] dimsPadding;
-    DelTensorBuf(padding2);
    DelTensorBuf(padding3);
+    DelTensorBuf(padding2);
 }
 /* 

--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
@@ -93,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
 {
    XTensor &x = input;
-    output = LogSoftmax(MMul(x, w), -1);
+    //output = LogSoftmax(MMul(x, w), -1);
+    output = Softmax(MMul(x, w), -1);
 }
 }
--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
@@ -124,6 +124,9 @@ void T2TTrainer::Init(int argc, char ** argv)
    adamBeta1T = 1.0F;
    adamBeta2T = 1.0F;
+    validStep = 0;
+    curEpoch = 0;
 }
 int tc = 0;
@@ -135,9 +138,10 @@ train the model
 >> modelFN - where we keep the model
 >> model - model to train
 */
-void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
+bool T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
 {
-    int epoch = 0;
+    curEpoch += 1;
    int step = 0;
    int wc = 0;
    int wordCount = 0;
@@ -149,7 +153,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
    int nCheckpoint = 0;
    int nSkipped = 0;
    int gradStep = 0;
-    int validStep = 0;
+    //int validStep = 0;
    char * trainFN = new char[(int)strlen(fn) + 10];
    strcpy(trainFN, fn);
@@ -159,15 +163,15 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
        sprintf(trainFN, "%s.random", fn);
 #endif
-    PrepareModel(model);
    int devID = model->devID;
    XMem * mem = model->mem;
    XNet net;
+    PrepareModel(model);
    double startT = GetClockSec();
-    for(epoch = 1; epoch <= nepoch; epoch++){
+    //for(epoch = 1; epoch <= nepoch; epoch++){
 #ifndef WIN32
    if(isShuffled)
        Shuffle(fn, trainFN);
@@ -195,10 +199,11 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
        while (LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, 
                         NULL, vSize, vSizeTgt,
-                         sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)) 
+                         sBatchSize, wBatchSize, isLenSorted, wc, devID, mem, true)) 
        {
-            CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
+            CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
+            //CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
            /* output probabilities */
            XTensor output;
@@ -217,14 +222,15 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
                LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
            /* make paddings for the output */
-            if (output.GetDim(0) > 1)
+            //if (output.GetDim(0) > 1)
-                PadOutput(&output, &gold, &paddingDec);
+            //    PadOutput(&output, &gold, &paddingDec);
            //output.Dump(tmpFILE, "output: ");
            //fflush(tmpFILE);
            /* get probabilities */
            float prob = GetProb(&output, &gold, NULL);
            DTYPE lossLocal = -prob / wc;
            bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
@@ -233,10 +239,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
            if (doUpdate) {
                /* rescale the output for normalized loss */
-                RescaleOutput(&output, &g, &paddingDec);
+                //RescaleOutput(&output, &g, &paddingDec);
                /* back-propagation */
-                net.Backward(output, g, CROSSENTROPY);
+                net.Backward(output, g, paddingDec, CROSSENTROPY);
                gradStep += 1;
                loss += -prob;
@@ -264,10 +270,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
                break;
            }
-            if (step % 1 == 0) {
+            if (step % 100 == 0) {
                double elapsed = GetClockSec() - startT;
                XPRINT8(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
-                        lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
+                        lr, elapsed, step, curEpoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
                if (!doUpdate)
                    XPRINT(0, stderr, " (no update)");
                XPRINT(0, stderr, "\n");
@@ -283,20 +289,20 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
        fclose(file);
        if (isEnd)
-            break;
+            return false;
+        return true;
-        if(useEpochCheckpoint)
+        //if(useEpochCheckpoint)
-            MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
+        //    MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
-    }
+    //}
-    double elapsed = GetClockSec() - startT;
+    //double elapsed = GetClockSec() - startT;
+    //
-    epoch = MIN(epoch, nepoch);
+    //epoch = MIN(epoch, nepoch);
+    //
-    XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
+    //XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
-            lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
+    //        lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
-    XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
+    //XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
-            elapsed, step, nSkipped, epoch);
+    //        elapsed, step, nSkipped, epoch);
    delete[] trainFN;
 }
@@ -348,10 +354,11 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
    while(LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, 
                    seqs, vSize, vSizeTgt,
-                    1, 1, false, wc, devID, mem))
+                    1, 1, false, wc, devID, mem, false))
    {
-        CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
+        //CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
+        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
        /* output probabilities */
        XTensor output;
@@ -601,6 +608,7 @@ load a batch of sequences
 >> wCount - word count
 >> devID - device id
 >> mem - memory pool
+>> isTraining - indicates whether we are training the model
 */
 int T2TTrainer::LoadBatch(FILE * file, bool isLM, 
                          XTensor * batchEnc, XTensor * paddingEnc, 
@@ -609,17 +617,18 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
                          int * seqs,
                          int vsEnc, int vsDec, int sBatch, int wBatch, 
                          bool isSorted, int &wCount,
-                          int devID, XMem * mem)
+                          int devID, XMem * mem, 
+						  bool isTraining)
 {
    if(isLM){
        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, 
                           seqs, vsEnc, sBatch, wBatch, 
-                           isSorted, wCount, devID, mem);
+                           isSorted, wCount, devID, mem, isTraining);
    }
    else{
        return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, 
                           seqs, vsEnc, vsDec, sBatch, wBatch, 
-                           isSorted, wCount, devID, mem);
+                           isSorted, wCount, devID, mem, isTraining);
    }
 }
@@ -640,6 +649,7 @@ load a batch of sequences (for LM)
 >> wCount - word count
 >> devID - device id
 >> mem - memory pool
+>> isTraining - indicates whether we are training the model
 */
 int T2TTrainer::LoadBatchLM(FILE * file, 
                            XTensor * batchEnc, XTensor * paddingEnc,
@@ -648,7 +658,8 @@ int T2TTrainer::LoadBatchLM(FILE * file,
                            int * seqs,
                            int vs, int sBatch, int wBatch, 
                            bool isSorted, int &wCount,
-                            int devID, XMem * mem)
+                            int devID, XMem * mem,
+							bool isTraining)
 {
    if(nextSeq < 0 || nextSeq >= nseqBuf)
        LoadBuf(file, isSorted, 1);
@@ -684,24 +695,27 @@ int T2TTrainer::LoadBatchLM(FILE * file,
    dims[1] = max;
    dims[2] = vs;
-    InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
+    InitTensor(batchEnc, 2, dims, X_INT, 1.0F, -1);
+    //InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
    InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
    InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
    InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
-    XNoder::MakeGrad(batchEnc);
-    XNoder::MakeGrad(paddingEnc);
-    XNoder::MakeGrad(gold);
-    XNoder::MakeGrad(paddingDec);
    batchEnc->SetZeroAll();
    paddingEnc->SetZeroAll();
    gold->SetZeroAll();
    paddingDec->SetZeroAll();
-    batchEnc->grad->SetZeroAll();
+    if(isTraining) {
+        //XNoder::MakeGrad(batchEnc);
+        XNoder::MakeGrad(paddingEnc);
+        XNoder::MakeGrad(gold);
+        XNoder::MakeGrad(paddingDec);
+        //batchEnc->grad->SetZeroAll();
        paddingEnc->grad->SetZeroAll();
        gold->grad->SetZeroAll();
        paddingDec->grad->SetZeroAll();
+    }
    int seqSize = 0;
@@ -712,7 +726,8 @@ int T2TTrainer::LoadBatchLM(FILE * file,
        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
        CheckNTErrors(len <= max, "Something is wrong!");
        for(int w = 0; w < len; w++){
-            batchEnc->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+            batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
+            //batchEnc->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
            paddingEnc->Set2D(1.0F, s - seq, w);
            paddingDec->Set2D(1.0F, s - seq, w);
            if (w > 0)
@@ -763,6 +778,7 @@ load a batch of sequences (for MT)
 >> wCount - word count
 >> devID - device id
 >> mem - memory pool
+>> isTraining - indicates whether we are training the model
 */
 int T2TTrainer::LoadBatchMT(FILE * file, 
                            XTensor * batchEnc, XTensor * paddingEnc, 
@@ -771,7 +787,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
                            int * seqs,
                            int vsEnc, int vsDec, int sBatch, int wBatch, 
                            bool isSorted, int &wCount,
-                            int devID, XMem * mem)
+                            int devID, XMem * mem, 
+							bool isTraining)
 {
    if(nextSeq < 0 || nextSeq >= nseqBuf)
        LoadBuf(file, isSorted, 2);
@@ -905,8 +922,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
    XTensor probs;
    InitTensor(&probs, output);
+    XTensor logOutput;
+    InitTensor(&logOutput, output);
+    _Log(output, &logOutput);
    /* probs[i,j] = output[i,j] * gold[i,j] */
-    _Multiply(output, gold, &probs);
+    _Multiply(&logOutput, gold, &probs);
    /* probability of each word */
    XTensor wprobs;

--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
@@ -103,6 +103,10 @@ public:
    /* indicates whether we use adam */
    bool useAdam;
+    int validStep;
+    int curEpoch;
    /* hyper parameters of adam*/
    float adamBeta1;
    float adamBeta2;
@@ -131,7 +135,7 @@ public:
    /* number of batches on which we do model update */
    int updateStep;
-    /* indicates whether we double the </s> symble for the output of lms */
+    /* indicates whether we double the </s> symbol for the output of lms */
    bool isDoubledEnd;
    /* indicates whether we use batchsize = max * sc
@@ -150,7 +154,7 @@ public:
    void Init(int argc, char ** argv);
    /* train the model */
-    void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
+    bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
    /* test the model */
    void Test(const char * fn, const char * ofn, T2TModel * model);
@@ -172,7 +176,28 @@ public:
                  int * seqs,
                  int vsEnc, int vsDec, int sBatch, int wBatch, 
                  bool isSorted, int &wCount,
-                  int devID, XMem * mem);
+                  int devID, XMem * mem, 
+				  bool isTraining);
+    /* load a batch of sequences (for language modeling) */
+    int LoadBatchLM(FILE * file, 
+                    XTensor * batchEnc, XTensor * paddingEnc,
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold,
+                    int * seqs, int vs, int sBatch, int wBatch, 
+                    bool isSorted, int &wCount,
+                    int devID, XMem * mem, 
+					bool isTraining);
+    /* load a batch of sequences (for machine translation) */
+    int LoadBatchMT(FILE * file, 
+                    XTensor * batchEnc, XTensor * paddingEnc, 
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold,
+                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
+                    bool isSorted, int &wCount,
+                    int devID, XMem * mem, 
+					bool isTraining);
    /* load a batch of sequences (for language modeling) */
    int LoadBatchLM(FILE * file, 

--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
@@ -25,6 +25,8 @@
 #include "T2TUtility.h"
 #include "T2TTrainer.h"
 #include "../../tensor/XDevice.h"
+#include "../../tensor/XUtility.h"
+#include "../../tensor/XGlobal.h"
 namespace transformer
 {
@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "test", testFN, "");
    LoadParamString(argc, args, "output", outputFN, "");
+    /* learn model parameters */
+    if(strcmp(trainFN, "")) {
+        double startT = GetClockSec();
        T2TTrainer trainer;
        trainer.Init(argc, args);
+        char * fn = new char[MAX_LINE_LENGTH];
+        char * fn1 = new char[MAX_LINE_LENGTH];
+        char * fn2 = new char[MAX_LINE_LENGTH];
+        modelFN = strcmp(modelFN, "") ? modelFN : (char *)"checkpoint.model";
+        int epoch;
+        bool isTrain;
+        for(epoch = 1; epoch <= trainer.nepoch; epoch++) {
+            sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
+            sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
+            sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
+            if(epoch == 1) {
                T2TModel model;
+                model.InitModel(argc, args);
+                isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
+                model.Dump(fn1);
+            }
+            else {
+                T2TModel model;
                model.InitModel(argc, args);
+                model.Read(fn);
-    /* learn model parameters */
+                isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
-    if(strcmp(trainFN, ""))
+                model.Dump(fn1);
-        trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
+            }
+            if(trainer.useEpochCheckpoint && strcmp(testFN, "")) {
+                T2TTrainer tester;
+                tester.Init(argc, args);
+                T2TModel model;
+                model.InitModel(argc, args);
+                model.Read(fn1);
+                tester.Test(testFN, fn2, &model);
+            }
+            if(!isTrain)
+                break;
+        }
+        double elapsed = GetClockSec() - startT;
+        epoch = MIN(epoch, trainer.nepoch);
+        XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);
+        delete[] fn;
+        delete[] fn1;
+        delete[] fn2;
+    }
+    /* don't dump the final model */
    /* save the final model */
-    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+    //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-        model.Dump(modelFN);
+    //    model.Dump(modelFN);
+    T2TModel model;
+    model.InitModel(argc, args);
    /* load the model if necessary */
    if(strcmp(modelFN, ""))

--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
    if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
        cublasDestroy(cublasHandle);
    if(myIsForComputation)
-        CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle.");
+        CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
+				      "Cannot create the cublas handle.");
    SetDevice(devIDBackup);
 #endif
@@ -1392,7 +1393,7 @@ void XMem::CreateBLASHandle()
                      "Cannot destroy the cublas handle.");
    }
-    CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
+    CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
                  "Cannot create the cublas handle.");
 #endif
 }

--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
@@ -35,6 +35,8 @@ const char * GetOPName(int type)
            return "M_EXP";
        else if (type == MATH_FLOOR)
            return "M_FLOOR";
+        else if (type == MATH_ISNONZERO)
+            return "M_ISNONZERO";
        else if (type == MATH_ISZERO)
            return "M_ISZERO";
        else if (type == MATH_LOG)

--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
@@ -35,7 +35,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_CEIL               MATH_ABSOLUTE + 1
 #define MATH_EXP                MATH_CEIL + 1
 #define MATH_FLOOR              MATH_EXP + 1
-#define MATH_ISZERO             MATH_FLOOR + 1
+#define MATH_ISNONZERO          MATH_FLOOR + 1
+#define MATH_ISZERO             MATH_ISNONZERO + 1
 #define MATH_LOG                MATH_ISZERO + 1
 #define MATH_SQRT               MATH_LOG + 1
 #define MATH_SQUARE             MATH_SQRT + 1

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)
 /* 
 set the value of a cell 
->> value - value to assign to the cell
+>> value - value we tend to set
 >> index - index of the cell for each dimension
->> 
+>> size - size of the index
 */
 bool XTensor::Set(DTYPE value, int index[], int size)
 {
@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)
 /* 
 set the value of a cell in a 1d tensor 
->> value - value to assign to the cell
+>> value - value we tend to set
 >> i - item offset
+<< return - succeeded or not
 */
 bool XTensor::Set1D(DTYPE value, int i)
 {
@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
    return SetToDevice(devID, GetCell(dims, 3), value);
 }
+/* 
+store an integer into the cell addressed by a multi-dimensional index
+>> value - the integer to store
+>> index - position of the cell along each dimension
+>> size - number of entries in index
+<< return - succeeded or not
+*/
+bool XTensor::SetInt(int value, int index[], int size)
+{
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+    /* locate the cell first, then hand it to the device-aware writer */
+    void * cell = GetCell(index, size);
+    return SetToDeviceInt(devID, cell, value);
+}
+/* 
+set the integer value of a cell in a 1d tensor 
+>> value - the integer to store
+>> i - item offset
+<< return - succeeded or not
+*/
+bool XTensor::Set1DInt(int value, int i)
+{
+    /* the message names the order this setter actually requires (1, not 2) */
+    CheckNTErrors((order == 1), "Cannot set a 1d cell for a tensor whose order is not 1!");
+    CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+    int dims[1] = {i};
+    return SetToDeviceInt(devID, GetCell(dims, 1), value);
+}
+/* 
+write an integer into one cell of a 2d integer tensor
+>> value - the integer to store
+>> ni - row index
+>> mi - column index
+<< return - succeeded or not
+*/
+bool XTensor::Set2DInt(int value, int ni, int mi)
+{
+    CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+    /* build the (row, column) index and resolve it to a cell address */
+    int cellIndex[2];
+    cellIndex[0] = ni;
+    cellIndex[1] = mi;
+    void * cell = GetCell(cellIndex, 2);
+    return SetToDeviceInt(devID, cell, value);
+}
+/* 
+set the integer value of a cell in a 3d integer tensor
+>> value - the integer to store
+>> d0 - index of dimension 0
+>> d1 - index of dimension 1
+>> d2 - index of dimension 2
+<< return - succeeded or not
+*/
+bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
+{
+    /* the message names the order this setter actually requires (3, not 2) */
+    CheckNTErrors(order == 3, "Cannot set a 3d cell for a tensor whose order is not 3!");
+    CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
+    CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
+    CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+    int dims[3] = {d0, d1, d2};
+    return SetToDeviceInt(devID, GetCell(dims, 3), value);
+}
 /* 
 increase the value of a cell in a 2d tensor
 >> value - value we tend to set
@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
    XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
+    if (tensor->unitNum * tensor->unitSize == 176657664) {
+        tensor->Dump(stderr, "", 200);
+    }
    if(myMem != NULL)
        tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
    else
@@ -2135,7 +2211,7 @@ generate a copy of XTensor
 >> isFilledData - indicates whether we allocate the data for
                  the newly-generated tensor
 */
-XTensor * NewTensor(XTensor * a, bool isFilledData)
+XTensor * NewTensor(const XTensor * a, bool isFilledData)
 {
    int dims[MAX_TENSOR_DIM_NUM];

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -327,6 +327,18 @@ public:
    /* set the value of a cell in a 3d tensor */
    bool Set3D(DTYPE value, int d0, int d1, int d2);
+    /* set the integer value of a cell */
+    bool SetInt(int value, int index[], int size = -1);
+    /* set the integer value of a cell in a 1d tensor */
+    bool Set1DInt(int value, int i);
+    /* set the integer value of a cell in a 2d tensor */
+    bool Set2DInt(int value, int ni, int mi);
+    /* set the integer value of a cell in a 3d tensor */
+    bool Set3DInt(int value, int d0, int d1, int d2);
    /* increase the value of a cell in a 2d */
    bool Add2D(DTYPE value, int ni, int mi);
@@ -450,7 +462,7 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
                      const int myDevID = -1, XMem * myMem = NULL);
 /* generate a copy of XTensor (with a reference to a given tensor) */
-XTensor * NewTensor(XTensor * a, bool isFilledData = true);
+XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
 /* free the data space of a given tensor */
 void DelTensor(XTensor * tensor);

--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
    return true;
 }
+/* 
+store an integer into a variable that is kept on a specified device
+>> devID - device id; a negative id means p is written directly
+>> p - address of the target variable
+>> value - the integer to store
+<< return - false if p is NULL, true otherwise
+*/
+bool SetToDeviceInt(int devID, void * p, int value)
+{
+    if(p == NULL)
+        return false;
+    if(devID >= 0){
+        /* non-negative device id: go through the memory-copy helper */
+        XMemCopy(p, devID, &value, -1, sizeof(int));
+        return true;
+    }
+    *(int*)p = value;
+    return true;
+}
 /* get the next number with power of 2 */
 unsigned int GetNextPower2(unsigned int n)
 {

--- a/source/tensor/XUtility.h
+++ b/source/tensor/XUtility.h
@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
 extern DTYPE ToCPU(int devID, void * value);
 extern int ToCPUInt(int devID, void * value);
 extern bool SetToDevice(int devID, void * p, DTYPE value);
+extern bool SetToDeviceInt(int devID, void * p, int value);
 extern unsigned int GetNextPower2(unsigned int n);
 extern void XSleep(int sleepTime);
 extern double GetClock();

--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
        fanOut = numOutputFmaps * receptiveFieldSize;
    }
-    DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
+    DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
-    DTYPE a = (DTYPE)sqrt(3.0) * std;
+    tensor->SetDataRand(-finfout, finfout);
-    _SetDataRand(tensor, -a, a);
+    //_SetDataRand(tensor, -finfout, finfout);
 }
 /* 
@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
    if(tensor == NULL)
        return;
-    /* GPU code */
+    /* CPU code */
    if(tensor->devID < 0){
        DTYPE variance = upper - lower;

--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
@@ -37,6 +37,11 @@ DTYPE round(DTYPE r)
 	return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
 }
+/* map an entry to 1.0 if it is non-zero and to 0.0 if it equals zero */
+DTYPE isnonzero(DTYPE r)
+{
+    if(r == 0.0)
+        return (DTYPE)0.0;
+    return (DTYPE)1.0;
+}
 DTYPE iszero(DTYPE r)
 {
    return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
@@ -93,6 +98,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
 _SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
 SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
+_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
+_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
+SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
 _SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
 _SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
 SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
@@ -173,6 +182,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, floor)
 _SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
 SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
+_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
+_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
+SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
 _SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
 _SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
 SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)

--- a/source/tensor/core/math/Unary.cu
+++ b/source/tensor/core/math/Unary.cu
@@ -41,11 +41,18 @@ DTYPE cudaround(DTYPE r)
 }
 __device__
+/* device function: 1.0 for a non-zero entry, 0.0 for an entry equal to zero */
+DTYPE cudaisnonzero(DTYPE r)
+{
+    if(r == 0.0)
+        return (DTYPE)0.0;
+    return (DTYPE)1.0;
+}
+__device__
 DTYPE cudaiszero(DTYPE r)
 {
    return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
 }
 #define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc)                       \
 __global__                                                                  \
 void Kernel##funcName(DTYPE * a, DTYPE * b, int size)                       \
@@ -96,6 +103,7 @@ SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
 SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
 SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
 SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
+SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, cudaisnonzero)
 SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
 SIMPLE_UNARY_FUNCTION_GPU(Log, log)
 SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)

--- a/source/tensor/core/math/Unary.cuh
+++ b/source/tensor/core/math/Unary.cuh
@@ -66,6 +66,15 @@ void KernelFloor(__half * a, __half * b, int size);
 /* set each entry to its floor value */
 void _CudaFloor(const XTensor * a, XTensor * b);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) */
+__global__
+void KernelIsNonZero(DTYPE * a, DTYPE * b, int size);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type*/
+__global__
+void KernelIsNonZero(__half * a, __half * b, int size);
+/* if source entry is non-zero, set target entry to be one, otherwise zero */
+void _CudaIsNonZero(const XTensor * a, XTensor * b);
 /* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) */
 __global__
 void KernelIsZero(DTYPE * a, DTYPE * b, int size);

--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
@@ -63,6 +63,15 @@ void _FloorMe(XTensor * a);
 make a new tensor to keep the result and return it */
 XTensor Floor(const XTensor & a);
+/* if source entry is non-zero, set target entry to be one, otherwise zero */
+void _IsNonZero(const XTensor *a, XTensor *b);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
+keep the result in the input tensor a and return nothing */
+void _IsNonZeroMe(XTensor *a);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (return a XTensor structure)
+make a new tensor to keep the result and return it */
+XTensor IsNonZero(const XTensor &a);
 /* if source entry is zero, set target entry to be one, otherwise zero */
 void _IsZero(const XTensor *a, XTensor *b);
 /* if source entry is zero, set target entry to be one, otherwise zero (do it on site)

--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
@@ -21,6 +21,8 @@
 #include "Gather.h"
 #include "CopyIndexed.h"
+#include "../../XUtility.h"
+#include "../shape/Reshape.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
    return result;
 }
+/*
+gather indexed sub-tensors (return a XTensor structure)
+make a new tensor to keep the result and return it
+>> s - the source tensor (2D)
+>> index - the index tensor; its entries are read as int, float or double
+           according to index.dataType and converted to int
+<< return - the gathered sub-tensors; when index has more than one dimension,
+            the result is reshaped to the dimensions of index followed by the
+            last dimension of the gathered tensor
+*/
+XTensor Gather(const XTensor &s, const XTensor &index)
+{
+    int indexSize = index.unitNum;
+    CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
+    /* reject unsupported index types up front; otherwise srcIndex would be
+       passed on without ever being filled */
+    CheckNTErrors(index.dataType == X_INT || index.dataType == X_FLOAT || index.dataType == X_DOUBLE,
+                  "The index tensor must be of type int, float or double!");
+    int * srcIndex = new int[indexSize];
+    if(index.dataType == X_INT) {
+        XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * index.unitSize);
+    }
+    else if(index.dataType == X_FLOAT) {
+        /* stage the values in a buffer of the index's own element type rather
+           than DTYPE, so the copy can neither overrun the buffer nor misread
+           the values (assumes X_FLOAT data is stored as float - TODO confirm) */
+        float * tmp = new float[indexSize];
+        XMemCopy(tmp, -1, index.data, index.devID, indexSize * sizeof(float));
+        for(int i = 0; i < indexSize; i++)
+            srcIndex[i] = (int)tmp[i];
+        delete[] tmp;
+    }
+    else {
+        /* X_DOUBLE (assumes the data is stored as double - TODO confirm) */
+        double * tmp = new double[indexSize];
+        XMemCopy(tmp, -1, index.data, index.devID, indexSize * sizeof(double));
+        for(int i = 0; i < indexSize; i++)
+            srcIndex[i] = (int)tmp[i];
+        delete[] tmp;
+    }
+    XTensor tensor;
+    tensor = Gather(s, 0, srcIndex, indexSize);
+    delete[] srcIndex;
+    if(index.order > 1) {
+        /* restore the shape of the index tensor and append the last dimension
+           of the gathered result */
+        int * dims = new int[index.order + 1];
+        memcpy(dims, index.dimSize, index.order * sizeof(int));
+        dims[index.order] = tensor.GetDim(-1);
+        XTensor t;
+        t = Reshape(tensor, index.order + 1, dims);
+        delete[] dims;
+        return t;
+    }
+    else {
+        return tensor;
+    }   
+}
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/movement/Gather.h
+++ b/source/tensor/core/movement/Gather.h
@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
   make a new tensor to keep the result and return it */
 XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);
+/* gather selected sub-tensors (return a XTensor structure)
+   make a new tensor to keep the result and return it */
+XTensor Gather(const XTensor &s, const XTensor &index);
 } // namespace nts(NiuTrans.Tensor)
 #endif // __GATHER_H__
--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
@@ -16,8 +16,8 @@
 */
 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
+ */
 #include <math.h>
 #include "ReduceSum.h"

--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
@@ -105,15 +105,15 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    int idx = threadIdx.x * blockDim.y + threadIdx.y;
+    int idx = threadIdx.y * blockDim.x + threadIdx.x;
-    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+    unsigned int i = blockIdx.y*blockDim.y + threadIdx.y;
-    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
+    unsigned int j = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= stride * blockNum)
        return;
-    if(threadIdx.y == 0)
+    if(threadIdx.x == 0)
-        bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
+        bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
    __syncthreads();
@@ -121,7 +121,7 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
    int iOffset = i % stride;
    bool isValid = (i < stride * blockNum && j < strideNum);
-    DTYPE value =  isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.x] : 0;
+    DTYPE value =  isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.y] : 0;
    if(power != (DTYPE)1.0){
        if(power == (DTYPE)2.0)
@@ -136,21 +136,20 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
        value = exp(value);
    /* load data into the shared mem */
-    iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
+    iData[threadIdx.y * blockDim.x + threadIdx.x] = value;
    __syncthreads();
    /* do reduction in shared mem */
-    for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){
+    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
-        if (threadIdx.y < s)
+        if (threadIdx.x < s)
            iData[idx] += iData[idx + s];
        __syncthreads();
    }
    /* write result for this block to the output array */
-    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) 
+    if (threadIdx.x == 0 && blockIdx.x < reducedStrideNum) 
-        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
+        output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = iData[threadIdx.y * blockDim.x];
 }
 /* 
@@ -282,15 +281,15 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    unsigned int tid = threadIdx.y;
+    unsigned int tid = threadIdx.x;
-    unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
+    unsigned int j = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int i = blockIdx.y * blockDim.y + threadIdx.y;
    if(i >= stride * blockNum)
        return;
-    if (threadIdx.y == 0)
+    if (threadIdx.x == 0)
-        bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
+        bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
    __syncthreads();
@@ -299,17 +298,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
    int iOffset = i % stride;
    bool isValid = j < strideNum;
-    bool isValid2 = j + blockDim.y < strideNum;
+    bool isValid2 = j + blockDim.x < strideNum;
-    DTYPE * data =  iData + threadIdx.x * blockDim.y;
+    DTYPE * data =  iData + threadIdx.y * blockDim.x;
    DTYPE * inputData = input  + k * blockSize;
-    DTYPE value  = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.x]: 0;
+    DTYPE value  = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.y]: 0;
-    DTYPE value2 = isValid2 ? inputData[(j + blockDim.y) * stride + iOffset] - bias[threadIdx.x]: 0;
+    DTYPE value2 = isValid2 ? inputData[(j + blockDim.x) * stride + iOffset] - bias[threadIdx.y]: 0;
    if(power != (DTYPE)1.0){
        if(power == (DTYPE)2.0){
            value = value * value;
-            value2 = value2 *value2;
+            value2 = value2 * value2;
        }
        else if(power == (DTYPE)0.5){
            value = sqrt(value);
@@ -329,17 +328,25 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
    }
    value = value + value2;
    __syncthreads();
    value = shflDownReduceSum(value);
-    if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
+    if ((tid & 0x1f) == 0) 
+        data[tid / 32] = value;
    __syncthreads();
    if (tid < 32){
-        if (tid < blockDim.y / 32)
+        if (tid < blockDim.x / 32)
            value = data[tid];
-        else value = 0;
+        else
+	        value = 0;
        value = shflDownReduceSum(value);
-        if (tid == 0 && blockIdx.y < reducedStrideNum)
-            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
+        if (tid == 0 && blockIdx.x < reducedStrideNum) {
+            output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = value;
+        }
    }
 }
@@ -568,7 +575,8 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output,
    if (tid < 32){
        if (tid < blockDim.y / 32)
            threadSum = data[tid];
-        else threadSum = 0;
+        else 
+            threadSum = 0;
        threadSum = shflDownReduceSum(threadSum);
        if (tid == 0 && blockIdx.y < reducedStrideNum)
            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
@@ -640,29 +648,28 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
 /* 
 this situation we use block.x * grid.x deal one vector for continuous read
 */
-inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
+void discontinuousStorageNoShareMemThreadAllocation(dim3* grid, dim3* block, int stride, int blockNum)
 {
-    block.x = 512;
+    block->x = 512;
-    block.y = 1;
+    block->y = 1;
    if ((stride * blockNum) % 512 == 0)
-        grid.x = (stride * blockNum) / 512;
+        grid->x = (stride * blockNum) / 512;
    else
-        grid.x = (stride * blockNum) / 512 + 1;
+        grid->x = (stride * blockNum) / 512 + 1;
-    grid.y = 1;
+    grid->y = 1;
 }
 /*
 adjust threads.x number then we can use warp optimization
 */
-inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
+void adjustThreadForUseWarpOptimization(dim3* blocks, dim3* threads)
 {
-    if (threads.x > 1){
+    if (threads->y > 1){
-        blocks.x *= threads.x;
+        blocks->y *= threads->y;
-        threads.x = 1;
+        threads->y = 1;
    }
-    if (threads.y < 32)
+    if (threads->x < 32)
-        threads.y = 32;
+        threads->x = 32;
 }
 /* 
@@ -733,19 +740,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
        dim3 blocks;
        continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
        if (blocks.y >= 128)
-            KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
+            KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, 
+                                                    strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
        else {
-            if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
+            if (blockNum % 4 != 0) 
-            else blockNum = blockNum / 4;
+                blockNum = (int)(blockNum / 4) + 1;
-            KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
+            else 
+                blockNum = blockNum / 4;
+            KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data, 
+                                                              strideNum, blockNum, sp, power, isExp);
        }
    }
    else if (stride != 1 && stride * blockNum > 4096){
        //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
        //unsigned int* goutput = (unsigned int *)input->data;
-        //convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
+        //convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
        dim3 grid, block;
-        discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
+        discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
        KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, 
                                                                strideNum, blockNum,sp, power, isExp);
    }
@@ -769,50 +780,50 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
                /* unroll the reduction procedure. The code is messy but it is faster. */
                if (strideNum <= 32) {
                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
-                    KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
+                    KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                           blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 128) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
-                    adjustThreadForUseWarpOptimization(blocks, threads);
+                    adjustThreadForUseWarpOptimization(&blocks, &threads);
-                    KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
+                    KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                                   blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 256) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
-                    adjustThreadForUseWarpOptimization(blocks, threads);
+                    adjustThreadForUseWarpOptimization(&blocks, &threads);
-                    KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
+                    KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                                    blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 512) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
-                    adjustThreadForUseWarpOptimization(blocks, threads);
+                    adjustThreadForUseWarpOptimization(&blocks, &threads);
-                    KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
+                    KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                                    blockSize, blockNum, sp, power, isExp);
                }
                else {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
-                    adjustThreadForUseWarpOptimization(blocks, threads);
+                    adjustThreadForUseWarpOptimization(&blocks, &threads);
-                    KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
+                    KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                                    blockSize, blockNum, sp, power, isExp);
                }
            }

--- a/source/tensor/core/reduce/ReduceSumAll.cpp
+++ b/source/tensor/core/reduce/ReduceSumAll.cpp
@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
 >> source - the inpute tensor
 << return - the total summation
 */
-DTYPE _ReduceSumAll(XTensor * source)
+DTYPE _ReduceSumAll(const XTensor * source)
 {
    int order = source->order;
    DTYPE summation;
    XTensor * big = NewTensor(source);
    _CopyValues(source, big);
-    for(int i = 0; i < order; i++) {
+    for(int i = order - 1; i >= 0; i--) {
+        if(i == 0)
-        if(i == order - 1)
+            big->Reshape(1, big->unitNum);
-            big->Reshape(big->unitNum, 1);
+        int leadingDim = big->order - 1;
        int * dimSize;
-        dimSize = getDimSize(big, 0);
+        dimSize = getDimSize(big, leadingDim);
-        XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
+        XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, 
+                                     source->devID, source->mem);
-        _ReduceSum(big, little, 0);
+        _ReduceSum(big, little, leadingDim);
        delete big;
        delete dimSize;
@@ -81,7 +82,7 @@ sum all the items of the tensor
 >> source - the inpute tensor
 << return - the total summation   
 */
-DTYPE ReduceSumAll(XTensor & source)
+DTYPE ReduceSumAll(const XTensor & source)
 {
    return _ReduceSumAll(&source);
 }

--- a/source/tensor/core/reduce/ReduceSumAll.h
+++ b/source/tensor/core/reduce/ReduceSumAll.h
@@ -28,10 +28,10 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* sum all the items of the tensor */
-DTYPE _ReduceSumAll(XTensor * source);
+DTYPE _ReduceSumAll(const XTensor * source);
 /* sum all the items of the tensor */
-DTYPE ReduceSumAll(XTensor & source);
+DTYPE ReduceSumAll(const XTensor & source);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/function/CrossEntropy.cpp
+++ b/source/tensor/function/CrossEntropy.cpp
@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
                   const XTensor * padding, int leadingDim)
 {
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
-    CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
    int unitNum = output->dimSize[n];
+    CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
    CheckNTErrors(XTensor::IsSameShaped(output, gold), 
                 "The output tensor and gold tensor must be of the same size!");
    CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
-    CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss), "The loss tensor and padding tensor must be same shape!");
+    CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss), 
+                 "The loss tensor and padding tensor must be same shape!");
    CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
-    XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
+    XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
-    XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
+    XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
-    /* l = log(output) */
+    _Log(output, interBuf1);
-    _Log(output, logBuf);
+    _Multiply(gold, interBuf1, interBuf2);
-    if(weight != NULL){
-        XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
-        /* multiply gold with weight by broadcast wg = mulDim(g * w) */
-        _MultiplyDim(gold, weight, weightBuf, n, 0);
-        /* multiply weighted gold with log(output) wgl = mul(wg, l) */
-        _Multiply(weightBuf, logBuf, mulBuf, 0);
-        DelTensorBuf(weightBuf);
-    }
-    else{
-        /* multiply gold with log(output) gl = mul(g, l) */
-        _Multiply(gold, logBuf, mulBuf, 0);
-    }
-    /* negate result n = negate(mul) */
+    if(weight != NULL)
-    _NegateMe(mulBuf);
+        _MultiplyDimMe(interBuf2, weight, n);
+    _NegateMe(interBuf2);
+    _ReduceSum(interBuf2, loss, n);
-    _ReduceSum(mulBuf, loss, n);
+    if(padding != NULL)
+        _MultiplyMe(loss, padding);
-    DelTensorBuf(mulBuf);
+    DelTensorBuf(interBuf2);
-    DelTensorBuf(logBuf);
+    DelTensorBuf(interBuf1);
 }
 /*
@@ -109,19 +96,12 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                       XTensor * loss, const XTensor * weight,
                       const XTensor * padding, int leadingDim)
 {
-#ifdef USE_CUDA
-    if(output->devID >= 0) {
-        _CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
-        return;
-    }
-#endif
    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
    int leadingDimSize = output->GetDim(n);
    CheckNTErrors(n >= 0 && n < output->order, 
-                 "Wrong leadingDim!");
+                 "Wrong leading dimension!");
    CheckNTErrors(XTensor::IsSameShaped(output, gold), 
                 "The output tensor and gold tensor must be of the same size!");
    CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize, 
@@ -133,6 +113,22 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, 
                 "TODO!");
+    for(int i = 0; i < order; i++){
+        if(i < n){
+            CheckNTErrors((output->GetDim(i) == loss->GetDim(i)), "Unmatched tensors!");
+        }
+        else if(i > n){
+            CheckNTErrors((output->GetDim(i) == loss->GetDim(i - 1)), "Unmatched tensors!");
+        }
+    }
+#ifdef USE_CUDA
+    if(output->devID >= 0) {
+        _CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
+        return;
+    }
+#endif
    int blockNum = 1;
    int blockSize = 1;
    int stride = 1;
@@ -148,31 +144,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    DTYPE * lossData = (DTYPE*)loss->data;
    DTYPE tmpLoss;
+    int lossPos;
+    int goldPos;
    if(weight == NULL) {
        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
                    tmpLoss = 0;
-                for(int j = 0; j < blockSize; j++) 
+                    lossPos = i * stride + j;
-                    tmpLoss += -(*(goldData + beg + j)) * 
+                    for(int k = 0; k < leadingDimSize; k++) {
-                                (DTYPE)log(*(outputData + beg + j));
+                        goldPos = i * blockSize + j + k * stride;
-                *(lossData + i) = tmpLoss;
+                        tmpLoss += -(*(goldData + goldPos)) * 
+                                    (DTYPE)log(*(outputData + goldPos));
+                    }
+                    *(lossData + lossPos) = tmpLoss;
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
+                    lossPos = i * stride + j;
-                if(*(paddingData + i) == 0)
+                    if(*(paddingData + lossPos) == 0)
-                    *(lossData + i) = 0;
+                        *(lossData + lossPos) = 0;
-                else{
+                    else {
                        tmpLoss = 0;
-                    for(int j = 0; j < blockSize; j++)
+                        for(int k = 0; k < leadingDimSize; k++) {
-                        tmpLoss += -(*(goldData + beg + j)) * 
+                            goldPos = i * blockSize + j + k * stride;
-                                    (DTYPE)log(*(outputData + beg + j));
+                            tmpLoss += -(*(goldData + goldPos)) * 
-                    *(lossData + i) = tmpLoss;
+                                        (DTYPE)log(*(outputData + goldPos));
+                        }
+                        *(lossData + lossPos) = tmpLoss;
+                    }
                }
            }            
        }
@@ -181,54 +186,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
        DTYPE * weightData = (DTYPE*)weight->data;
        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
                    tmpLoss = 0;
-                for(int j = 0; j < blockSize; j++)
+                    lossPos = i * stride + j;
-                    tmpLoss += -(*(goldData + beg + j)) * 
+                    for(int k = 0; k < leadingDimSize; k++) {
-                                (DTYPE)log(*(outputData + beg + j)) * 
+                        goldPos = i * blockSize + j + k * stride;
-                                (*(weightData + j));
+                        tmpLoss += -(*(goldData + goldPos)) * 
-                *(lossData + i) = tmpLoss;
+                                    (DTYPE)log(*(outputData + goldPos)) *
+                                    (*(weightData + k));
+                    }
+                    *(lossData + lossPos) = tmpLoss;                    
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
+                    lossPos = i * stride + j;
-                if(*(paddingData + i) == 0)
+                    if(*(paddingData + lossPos) == 0)
-                    *(lossData + i) = 0;
+                        *(lossData + lossPos) = 0;
-                else{
+                    else {
                        tmpLoss = 0;
-                    for(int j = 0; j < blockSize; j++)
+                        for(int k = 0; k < leadingDimSize; k++) {
-                        tmpLoss += -(*(goldData + beg + j)) * 
+                            goldPos = i * blockSize + j + k * stride;
-                                    (DTYPE)log(*(outputData + beg + j)) * 
+                            tmpLoss += -(*(goldData + goldPos)) * 
-                                    (*(weightData + j));
+                                        (DTYPE)log(*(outputData + goldPos)) *
-                    *(lossData + i) = tmpLoss;
+                                        (*(weightData + k));
+                        }
+                        *(lossData + lossPos) = tmpLoss;
                    }
                }
            }              
        }
-}
-/*
-get the dimSize after reduce operation
->> tensor - a tensor to be reduced
->> n - the reduce dimension 
-<< return - the pointer of dimSize
-*/
-int * reduceDimSize(const XTensor * tensor, int n)
-{
-    int order = tensor->order;
-    int * dimSize = new int[order - 1];
-    for (int i = 0; i < order; i++) {
-        if(i < n)
-            dimSize[i] = tensor->dimSize[i];
-        else if(i > n)
-            dimSize[i - 1] = tensor->dimSize[i];
    }
-    return dimSize;
 }
 /*
@@ -247,75 +238,48 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
                    LOSS_COMPUTE_WAY reduceWay, const XTensor * weight, 
                    const XTensor * padding, int leadingDim)
 {
-    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
+    DTYPE loss = 0;
-    CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
+    int order = output->order;
+    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
    int unitNum = output->dimSize[n];
+    CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
    CheckNTErrors(XTensor::IsSameShaped(output, gold), 
                 "The output tensor and gold tensor must be of the same size!");
    CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
-    CheckNTErrors(padding == NULL || padding->order == output->order - 1, "The loss tensor and padding tensor must be same shape!");
+    CheckNTErrors(padding == NULL || padding->order == output->order - 1, 
+                 "The loss tensor and padding tensor must be same shape!");
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
-    XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
+    int * dimSize = new int[order - 1];
-    XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
+    for (int i = 0; i < order; i++) {
+        if(i < n)
-    /* l = log(output) */
+            dimSize[i] = output->dimSize[i];
-    _Log(output, logBuf);
+        else if(i > n)
+            dimSize[i - 1] = output->dimSize[i];
-    if(weight != NULL){
-        XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
-        /* multiply gold with weight by broadcast wg = mulDim(g * w) */
-        _MultiplyDim(gold, weight, weightBuf, n, 0);
-        /* multiply weighted gold with log(output) wgl = mul(wg, l) */
-        _Multiply(weightBuf, logBuf, mulBuf, 0);
-        DelTensorBuf(weightBuf);
-    }
-    else{
-        /* multiply gold with log(output) gl = mul(g, l) */
-        _Multiply(gold, logBuf, mulBuf, 0);
    }
-    /* negate multiply result n = negate(mul) */
+    XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio, 
-    _NegateMe(mulBuf);
+                                     output->devID, output->mem);
-    int * dimSize;
+    _CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
-    dimSize = reduceDimSize(output, n);
-    XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
-    /* reduce sum all classes */
+    loss = _ReduceSumAll(lossBuf);
-    _ReduceSum(mulBuf, lossInter, n);
-    DelTensorBuf(mulBuf);
-    DelTensorBuf(logBuf);
-    DTYPE loss;
-    /* compute the total loss */
-    if(padding != NULL) {
-        XTensor * temp(lossInter);
-        _Multiply(lossInter, padding, temp);
-        loss = _ReduceSumAll(temp);
-        delete temp;
-    }
-    else 
-        loss = _ReduceSumAll(lossInter);
    if(reduceWay == REDUCE_MEAN) {
-        if(padding != NULL) {
+        int nonZeroNum;
-            XTensor * zeroIndicator = NewTensorBuf(padding, padding->devID, padding->mem);
+        if(padding == NULL) {
+            nonZeroNum = lossBuf->unitNum;
-            _IsZero(padding, zeroIndicator);
-            int reduceSize = (int)_ReduceSumAll(zeroIndicator);
-            loss = loss / (DTYPE)(padding->unitNum - reduceSize);
-            DelTensorBuf(zeroIndicator);
        }
-        else 
+        else {
-            loss = loss / (DTYPE)lossInter->unitNum;
+            XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
+            _IsNonZero(padding, tmp);
+            nonZeroNum = (int)_ReduceSumAll(tmp);
+            DelTensorBuf(tmp);
+        }
+        loss = loss / (DTYPE)nonZeroNum;
    }
    else if(reduceWay == REDUCE_SUM) {
        /* don't need to do anything */
@@ -325,7 +289,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
    }
    delete[] dimSize;
-    delete lossInter;
+    DelTensorBuf(lossBuf);
    return loss;
 }
@@ -348,11 +312,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
                        const XTensor * padding, int leadingDim)
 {
-#ifdef USE_CUDA
+    DTYPE loss = 0;
-    if(output->devID >= 0) {
-        return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
-    }
-#endif
    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
@@ -369,6 +329,23 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, 
                 "TODO!");
+    if(padding != NULL) {
+        for(int i = 0; i < order; i++){
+            if(i < n){
+                CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
+            }
+            else if(i > n){
+                CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
+            }
+        }
+    }
+#ifdef USE_CUDA
+    if(output->devID >= 0) {
+        return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
+    }
+#endif
    int blockNum = 1;
    int blockSize = 1;
    int stride = 1;
@@ -382,32 +359,40 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    DTYPE * outputData = (DTYPE*)output->data;
    DTYPE * goldData = (DTYPE*)gold->data;
-    DTYPE loss = 0;
+    int paddingPos;
+    int goldPos;
    int nonZeroNum = 0;
    if(weight == NULL) {
        if(padding == NULL) {
-            nonZeroNum = blockNum;
+            nonZeroNum = blockNum * stride;
-            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
-                for(int j = 0; j < blockSize; j++) 
+            for(int i = 0; i < blockNum; i++) {
-                    loss += -(*(goldData + beg + j)) * 
+                for(int j = 0; j < stride; j++) {
-                             (DTYPE)log(*(outputData + beg + j));
+                    paddingPos = i * stride + j;
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        loss += -(*(goldData + goldPos)) * 
+                                 (DTYPE)log(*(outputData + goldPos));
+                    }
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                if(*(paddingData + i) == 0)
+                for(int j = 0; j < stride; j++) {
+                    paddingPos = i * stride + j;
+                    if(*(paddingData + paddingPos) == 0)
                        continue;
-                else{
+                    else {
                        nonZeroNum += 1;
+                        for(int k = 0; k < leadingDimSize; k++) {
-                    int beg = i * blockSize;
+                            goldPos = i * blockSize + j + k * stride;
-                    for(int j = 0; j < blockSize; j++)
+                            loss += -(*(goldData + goldPos)) * 
-                        loss += -(*(goldData + beg + j)) * 
+                                     (DTYPE)log(*(outputData + goldPos));
-                                 (DTYPE)log(*(outputData + beg + j));
+                        }    
+                    }
                }
            }
        }
@@ -415,32 +400,39 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
    else {
        DTYPE * weightData = (DTYPE*)weight->data;
        if(padding == NULL) {
-            nonZeroNum = blockNum;
+            nonZeroNum = blockNum * stride;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
-                for(int j = 0; j < blockSize; j++)
+                    paddingPos = i * stride + j;
-                    loss += -(*(goldData + beg + j)) * 
+                    for(int k = 0; k < leadingDimSize; k++) {
-                             (DTYPE)log(*(outputData + beg + j)) * 
+                        goldPos = i * blockSize + j + k * stride;
-                             (*(weightData + j));
+                        loss += -(*(goldData + goldPos)) * 
+                                 (DTYPE)log(*(outputData + goldPos)) *
+                                 (*(weightData + k));
+                    }
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                if(*(paddingData + i) == 0)
+                for(int j = 0; j < stride; j++) {
+                    paddingPos = i * stride + j;
+                    if(*(paddingData + paddingPos) == 0)
                        continue;
-                else{
+                    else {
                        nonZeroNum += 1;
+                        for(int k = 0; k < leadingDimSize; k++) {
-                    int beg = i * blockSize;
+                            goldPos = i * blockSize + j + k * stride;
-                    for(int j = 0; j < blockSize; j++)
+                            loss += -(*(goldData + goldPos)) * 
-                        loss += -(*(goldData + beg + j)) * 
+                                     (DTYPE)log(*(outputData + goldPos)) *
-                                 (DTYPE)log(*(outputData + beg + j)) * 
                                     (*(weightData + j));
                        }    
                    }
                }
            }
+        }
+    }
    if(reduceWay == REDUCE_MEAN) {
        loss = loss / (DTYPE)nonZeroNum;
@@ -470,17 +462,10 @@ with respect to gold standard, and y is the model output
 >> padding - specify a target value that is ignored and does not contribute to the loss computation
 >> leadingDim - the leading dimension for the output
 */
-void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, 
+void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, 
-                           const XTensor * weight, XTensor * padding, 
+                           const XTensor * gold, const XTensor * weight,
-                           int leadingDim)
+                           XTensor * padding, int leadingDim)
 {
-#ifdef USE_CUDA
-    if(output->devID >= 0) {
-        _CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
-        return;
-    }
-#endif
    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
    int leadingDimSize = output->GetDim(n);
@@ -497,6 +482,25 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, 
                 "TODO!");
+    if(padding != NULL) {
+        for(int i = 0; i < order; i++){
+            if(i < n){
+                CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
+            }
+            else if(i > n){
+                CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
+            }
+        }    
+    }
+#ifdef USE_CUDA
+    if(output->devID >= 0) {
+        _CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
+        return;
+    }
+#endif
    int blockNum = 1;
    int blockSize = 1;
    int stride = 1;
@@ -511,25 +515,35 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
    DTYPE * outputData = (DTYPE*)output->data;
    DTYPE * goldData = (DTYPE*)gold->data;
+    int paddingPos;
+    int goldPos;
    if(weight == NULL) {
        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
-                for(int j = 0; j < blockSize; j++)
+                    for(int k = 0; k < leadingDimSize; k++) {
-                    *(dedyData + beg + j) = -(*(goldData + beg + j)) / 
+                        goldPos = i * blockSize + j + k * stride;
-                                             (*(outputData + beg + j));
+                        *(dedyData + goldPos) = -(*(goldData + goldPos)) / 
+                                                 (*(outputData + goldPos));
+                    }
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
-                if(*(paddingData + i) == 0)
+                    paddingPos = i * stride + j;
-                    memset(dedyData + beg, 0, blockSize * unitSize);
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        if(*(paddingData + paddingPos) == 0)
+                            *(dedyData + goldPos) = 0;
                        else
-                    for(int j = 0; j < blockSize; j++)
+                            *(dedyData + goldPos) = -(*(goldData + goldPos)) / 
-                        *(dedyData + beg + j) = -(*(goldData + beg + j)) / 
+                                                     (*(outputData + goldPos));
-                                                 (*(outputData + beg + j));
+                    }
+                }
            }
        }
    }
@@ -537,39 +551,45 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
        DTYPE * weightData = (DTYPE*)weight->data;
        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
-                for(int j = 0; j < blockSize; j++)
+                    for(int k = 0; k < leadingDimSize; k++) {
-                    *(dedyData + beg + j) = -(*(weightData + j)) * 
+                        goldPos = i * blockSize + j + k * stride;
-                                             (*(goldData + beg + j)) / 
+                        *(dedyData + goldPos) = -(*(weightData + k)) * 
-                                             (*(outputData + beg + j));
+                                                 (*(goldData + goldPos)) / 
+                                                 (*(outputData + goldPos));
+                    }
+                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
-                int beg = i * blockSize;
+                for(int j = 0; j < stride; j++) {
-                if(*(paddingData + i) == 0)
+                    paddingPos = i * stride + j;
-                    memset(dedyData + beg, 0, blockSize * unitSize);
+                    for(int k = 0; k < leadingDimSize; k++) {
+                        goldPos = i * blockSize + j + k * stride;
+                        if(*(paddingData + paddingPos) == 0)
+                            *(dedyData + goldPos) = 0;
                        else
-                    for(int j = 0; j < blockSize; j++) {
+                            *(dedyData + goldPos) = -(*(weightData + k)) * 
-                        *(dedyData + beg + j) = -(*(weightData + j)) * 
+                                                     (*(goldData + goldPos)) / 
-                                                 (*(goldData + beg + j)) / 
+                                                     (*(outputData + goldPos));
-                                                 (*(outputData + beg + j));
                    }
                }
            }
        }
-    if(padding != NULL) {
-        XTensor * tmp(padding);
-        _IsZero(padding, tmp);
-        int nonZeroNum = (int)_ReduceSumAll(tmp);
-        _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
-        delete tmp;
-    }
-    else {
-        _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
    }
+    //if(padding != NULL) {
+    //    XTensor * tmp = NewTensor(padding);
+    //    _IsNonZero(padding, tmp);
+    //    int nonZeroNum = (int)_ReduceSumAll(tmp);
+    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
+    //    delete tmp;
+    //}
+    //else {
+    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
+    //}
 }
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/function/CrossEntropy.cu
+++ b/source/tensor/function/CrossEntropy.cu
@@ -26,80 +26,20 @@
 #include "../XDevice.h"
 #include "CrossEntropy.cuh"
 #include "CrossEntropy.h"
-#include "../core/reduce/ReduceSumAll.h"
+#include "../core/arithmetic/Div.h"
+#include "../core/arithmetic/Multiply.h"
+#include "../core/arithmetic/MultiplyDim.h"
+#include "../core/arithmetic/Negate.h"
 #include "../core/math/Unary.h"
 #include "../core/math/ScaleAndShift.h"
+#include "../core/reduce/ReduceSum.h"
+#include "../core/reduce/ReduceSumAll.h"
+#include "../core/shape/Transpose.h"
+#include "../core/shape/Unsqueeze.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
-compute the cross entropy loss (cuda kernel) 
->> outputData - the data pointer of output tensor
->> goldData - the data pointer of gold tensor
->> lossData - the data pointer of loss tensor
->> weightData - the data pointer of weight tensor
->> paddingData - the data pointer of padding tensor
->> blockNum - the number of data blocks
->> stride - the size of a data block
-*/
-__global__
-void KernelCrossEntropy(DTYPE * outputData, DTYPE * goldData,
-                        DTYPE * lossData, DTYPE * weightData, 
-                        DTYPE * paddingData, int blockNum, int blockSize)
-{
-    /* block id */
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if(i >= blockNum)
-        return;
-    int beg = i * blockSize;
-    DTYPE tmpLoss = 0;
-    if(weightData == NULL) {
-        if(paddingData == NULL) {
-            tmpLoss = 0;
-            for(int j = 0; j < blockSize; j++) 
-                tmpLoss += -(*(goldData + beg + j)) * 
-                            (DTYPE)log(*(outputData + beg + j));
-            *(lossData + i) = tmpLoss;
-        }
-        else {
-            if(*(paddingData + i) == 0)
-                *(lossData + i) = tmpLoss;
-            else{
-                for(int j = 0; j < blockSize; j++)
-                    tmpLoss += -(*(goldData + beg + j)) * 
-                                (DTYPE)log(*(outputData + beg + j));
-                *(lossData + i) = tmpLoss;
-            }
-        }
-    }
-    else {
-        if(paddingData == NULL) {
-            for(int j = 0; j < blockSize; j++)
-                tmpLoss += -(*(goldData + beg + j)) * 
-                            (DTYPE)log(*(outputData + beg + j)) * 
-                            (*(weightData + j));
-            *(lossData + i) = tmpLoss;
-        }
-        else {
-            if(*(paddingData + i) == 0)
-                *(lossData + i) = tmpLoss;
-            else{
-                tmpLoss = 0;
-                for(int j = 0; j < blockSize; j++)
-                    tmpLoss += -(*(goldData + beg + j)) * 
-                                (DTYPE)log(*(outputData + beg + j)) * 
-                                (*(weightData + j));
-                *(lossData + i) = tmpLoss;
-            }
-        }
-    }
-}
-/* 
 compute the cross entropy loss (cuda version) 
 loss = sum_{i} (-gold_i * log(output_i))
 where gold and output are distributions 
@@ -115,76 +55,24 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
                           XTensor * loss, const XTensor * weight, 
                           const XTensor * padding, int leadingDim)
 {
-    int order = output->order;
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
-    int leadingDimSize = output->GetDim(n);
-    CheckNTErrors(n >= 0 && n < output->order, 
+    XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
-                 "Wrong leadingDim!");
+    XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
-    CheckNTErrors(XTensor::IsSameShaped(output, gold), 
-                 "The output tensor and gold tensor must be of the same size!");
-    CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize, 
-                 "Wrong weight tensor!");
-    CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss), 
-                 "The loss tensor and padding tensor must be same shape!");
-    CheckNTErrors(loss->order == output->order - 1, 
-                 "Wrong loss dimension!");
-    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, 
-                 "TODO!");
-    int blockNum = 1;
-    int blockSize = 1;
-    int stride = 1;
-    for(int i = n + 1; i < order; i++)
-        stride *= output->GetDim(i);
-    blockSize = stride * leadingDimSize;
-    blockNum = output->unitNum / blockSize;
-    int cudaGrids[3];
-    int cudaBlocks[3];
-    //GDevs.GetCudaThread2D(output->devID, blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
-    GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
-    dim3 blocks(cudaGrids[0], cudaGrids[1]);
+    _Log(output, interBuf1);
-    dim3 threads(cudaBlocks[0], cudaBlocks[1]);
+    _Multiply(gold, interBuf1, interBuf2);
-    int devIDBackup;
+    if(weight != NULL)
-    ProtectCudaDev(output->devID, devIDBackup);
+        _MultiplyDimMe(interBuf2, weight, n);
+    _NegateMe(interBuf2);
+    _ReduceSum(interBuf2, loss, n);
-    DTYPE * outputData = (DTYPE*)output->data;
+    if(padding != NULL)
-    DTYPE * goldData = (DTYPE*)gold->data;
+        _MultiplyMe(loss, padding);
-    DTYPE * lossData = (DTYPE*)loss->data;
-    if(weight == NULL) {
-        if(padding == NULL)
-            KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                (outputData, goldData, lossData, 
-                                 NULL, NULL,
-                                 blockNum, blockSize);
-        else
-            KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                (outputData, goldData, lossData, 
-                                 NULL, (DTYPE*)padding->data,
-                                 blockNum, blockSize);
-    }
-    else {
-        if(padding == NULL)
-            KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                (outputData, goldData, lossData, 
-                                 (DTYPE*)weight->data, NULL,
-                                 blockNum, blockSize);
-        else
-            KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                (outputData, goldData, lossData, 
-                                 (DTYPE*)weight->data, (DTYPE*)padding->data,
-                                 blockNum, blockSize);
-    }
-    BacktoCudaDev(output->devID, devIDBackup);
+    DelTensorBuf(interBuf2);
+    DelTensorBuf(interBuf1);
 }
 /*
@@ -230,87 +118,38 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
            dimSize[i - 1] = output->dimSize[i];
    }
-    XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
+    XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio, 
+                                     output->devID, output->mem);
-    _CudaCrossEntropyFast(output, gold, lossInter, weight, padding, leadingDim);
+    _CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
-    loss = _ReduceSumAll(lossInter);
+    loss = _ReduceSumAll(lossBuf);
    if(reduceWay == REDUCE_MEAN) {
-        int totalNum;
+        int nonZeroNum;
        if(padding == NULL) {
-            totalNum = lossInter->unitNum;
+            nonZeroNum = lossBuf->unitNum;
        }
        else {
-            XTensor * zeroIndicator = NewTensorBuf(output, output->devID, output->mem);
+            XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
-            _IsZero(padding, zeroIndicator);
+            _IsNonZero(padding, tmp);
-            totalNum = lossInter->unitNum - (int)_ReduceSumAll(zeroIndicator);
+            nonZeroNum = (int)_ReduceSumAll(tmp);
-            DelTensorBuf(zeroIndicator);
+            DelTensorBuf(tmp);
        }
-        loss = loss / (DTYPE)totalNum;
+        loss = loss / (DTYPE)nonZeroNum;
    }
+    else if(reduceWay == REDUCE_SUM) {
-    return loss;
+        /* don't need to do anything */
-}
-/* 
-backward computation of cross entropy function (kernel version)
->> dedyData - the data pointer of dedy tensor
->> outputData - the data pointer of output tensor
->> goldData - the data pointer of gold tensor
->> weightData - the data pointer of weight tensor
->> paddingData - the data pointer of padding tensor
->> blockNum - the number of data blocks
->> blockSize - the size of a data block
-*/
-__global__
-void KernelCrossEntropyBackward(DTYPE * dedyData, DTYPE * outputData, DTYPE * goldData,
-                                DTYPE * weightData, DTYPE * paddingData,
-                                int blockNum, int blockSize)
-{
-    /* block id */
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if(i >= blockNum)
-        return;
-    int beg = i * blockSize;
-    if(weightData == NULL) {
-        if(paddingData == NULL) {
-            for(int j = 0; j < blockSize; j++) 
-                *(dedyData + beg + j) = -(*(goldData + beg + j)) / 
-                                         (*(outputData + beg + j));
    }
    else {
-            if(*(paddingData + i) == 0)
+        ShowNTErrors("TODO");
-                memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
-            else
-                for(int j = 0; j < blockSize; j++)
-                    *(dedyData + beg + j) = -(*(goldData + beg + j)) / 
-                                             (*(outputData + beg + j));
-        }
-    }
-    else {
-        if(paddingData == NULL) {
-            for(int j = 0; j < blockSize; j++)
-                *(dedyData + beg + j) = -(*(weightData + j)) * 
-                                         (*(goldData + beg + j)) / 
-                                         (*(outputData + beg + j));
-        }
-        else {
-            if(*(paddingData + i) == 0)
-                memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
-            else
-                for(int j = 0; j < blockSize; j++) {
-                    *(dedyData + beg + j) = -(*(weightData + j)) * 
-                                             (*(goldData + beg + j)) / 
-                                             (*(outputData + beg + j));
-            }
-        }
    }
+    delete[] dimSize;
+    DelTensorBuf(lossBuf);
+    return loss;
 }
 /* 
@@ -328,87 +167,45 @@ with respect to gold standard, and y this the model output
 >> padding - specify a target value that is ignored and does not contribute to the loss computation
 >> leadingDim - the leading dimension for the output
 */
-void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, 
+void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, 
-                               const XTensor * weight, XTensor * padding,
+                               const XTensor * gold, const XTensor * weight,
-                               int leadingDim)
+                               XTensor * padding, int leadingDim)
 {
-    int order = output->order;
     int n = leadingDim < 0 ? output->order - 1 : leadingDim; /* a negative leadingDim selects the last dimension of output */
-    int leadingDimSize = output->GetDim(n);
-    CheckNTErrors(n >= 0 && n < output->order, 
-                 "Wrong leading dimension!");
-    CheckNTErrors(XTensor::IsSameShaped(dedy, output, gold), 
-                 "The output tensor and gold tensor must be of the same size!");
-    CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize, 
-                 "Wrong weight tensor!");
-    CheckNTErrors(padding == NULL || padding->order == output->order - 1, 
-                 "Wrong padding tensor!");
-    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, 
-                 "TODO!");
-    int blockNum = 1;
-    int blockSize = 1;
-    int stride = 1;
-    for(int i = n + 1; i < order; i++)
-        stride *= output->GetDim(i);
-    blockSize = stride * leadingDimSize;
-    blockNum = output->unitNum / blockSize;
-    int cudaGrids[3];
-    int cudaBlocks[3];
-    GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
-    dim3 blocks(cudaGrids[0], cudaGrids[1]);
-    dim3 threads(cudaBlocks[0], cudaBlocks[1]);
-    int devIDBackup;
-    ProtectCudaDev(output->devID, devIDBackup);
-    DTYPE * dedyData = (DTYPE*)dedy->data;
-    DTYPE * outputData = (DTYPE*)output->data;
-    DTYPE * goldData = (DTYPE*)gold->data;
-    if(weight == NULL) {
-        if(padding == NULL)
-            KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                        (dedyData, outputData, goldData,
-                                         NULL, NULL,
-                                         blockNum, blockSize);
-        else
-            KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                        (dedyData, outputData, goldData,
-                                         NULL, (DTYPE*)padding->data,
-                                         blockNum, blockSize);
-    }
-    else {
-        if(padding == NULL)
-            KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                        (dedyData, outputData, goldData,
-                                        (DTYPE*)weight->data, NULL,
-                                         blockNum, blockSize);
-        else
-            KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-                                        (dedyData, outputData, goldData,
-                                        (DTYPE*)weight->data, (DTYPE*)padding->data,
-                                         blockNum, blockSize);
-    }
+    _Div(gold, output, dedy); /* dedy = gold / output - presumably item by item; TODO confirm _Div semantics */
+    _NegateMe(dedy); /* with the division above this gives -gold/output, the value the removed kernel launches wrote into dedy */
+    if(weight != NULL)
+        _MultiplyDimMe(dedy, weight, n); /* scale dedy by weight along dimension n - presumably broadcast over the other dimensions; TODO confirm */
     if(padding != NULL) { /* mask dedy with the padding tensor */
-        XTensor * tmp(padding);
+        int paddingOrder = padding->order; /* remember the shape of padding so that it can be restored below */
-        _IsZero(padding, tmp);
+        int * paddingDims = new int[paddingOrder];
-        int nonZeroNum = (int)_ReduceSumAll(tmp);
+        memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
-        _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
+        padding->Reshape(padding->unitNum); /* temporarily view padding as a 1-D tensor (this modifies the caller's tensor until it is restored) */
-        delete tmp;
-    }
+        int order = dedy->order; /* remember the shape of dedy as well */
-    else {
+        int * dims = new int[order];
-        _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
+        memcpy(dims, dedy->dimSize, dedy->order * sizeof(int));
-    }
+        dedy->Reshape(dedy->unitNum/dedy->GetDim(n), dedy->GetDim(n)); /* 2-D view with GetDim(n) as the second dimension; NOTE(review): this looks layout-correct only when n is the last dimension - confirm for other n */
+        _MultiplyDimMe(dedy, padding, 0); /* multiply along dimension 0 of the 2-D view by the flattened padding - presumably one padding value per row; TODO confirm */
-    BacktoCudaDev(output->devID, devIDBackup);
+        padding->Reshape(paddingOrder, paddingDims); /* restore the original shapes */
+        dedy->Reshape(order, dims);
+        delete[] paddingDims;
+        delete[] dims;
+    }
+    //if(padding != NULL) {
+    //    XTensor * tmp = NewTensor(padding);
+    //    _IsNonZero(padding, tmp);
+    //    int nonZeroNum = (int)_ReduceSumAll(tmp);
+    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
+    //    delete tmp;
+    //}
+    //else {
+    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
+    //}
 }

--- a/source/tensor/function/CrossEntropy.cuh
+++ b/source/tensor/function/CrossEntropy.cuh
@@ -38,9 +38,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
                            const XTensor * padding = NULL, int leadingDim = -1);
 /* backward computation of cross entropy function */
-void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, 
+void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, 
-                               const XTensor * weight = NULL, XTensor * padding = NULL, 
+                               const XTensor * gold, const XTensor * weight = NULL, 
-                               int leadingDim = -1);
+                               XTensor * padding = NULL, int leadingDim = -1);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/function/CrossEntropy.h
+++ b/source/tensor/function/CrossEntropy.h
@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        const XTensor * padding = NULL, int leadingDim = -1);
 /* backward computation of cross entropy function */
-void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, 
+void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, 
-                           const XTensor * weight = NULL, XTensor * padding = NULL, 
+                           const XTensor * gold, const XTensor * weight = NULL, 
-                           int leadingDim = -1);
+                           XTensor * padding = NULL, int leadingDim = -1);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/function/LogSoftmax.cpp
+++ b/source/tensor/function/LogSoftmax.cpp
@@ -280,7 +280,7 @@ better numerical stability.
 */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                         XTensor * dedy, XTensor * dedx, 
-                         int leadDim,
+                         XTensor * padding, int leadDim, 
                         LOSS_FUNCTION_NAME lossName)
 {
    CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
    if (gold->devID >= 0) {
-        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
 #endif

--- a/source/tensor/function/LogSoftmax.cu
+++ b/source/tensor/function/LogSoftmax.cu
@@ -22,6 +22,7 @@
 #include "LogSoftmax.h"
 #include "LogSoftmax.cuh"
 #include "Loss.cuh"
+#include "../core/arithmetic/MultiplyDim.h"
 #include "../core/reduce/ReduceSum.cuh"
 #include "../core/reduce/ReduceMax.cuh"
 #include "../XDevice.h"
@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
 >> lossName - name of the loss function
 */
 __global__
-void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size, LOSS_FUNCTION_NAME lossName)
+void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, 
+                                  int size, LOSS_FUNCTION_NAME lossName)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -372,9 +374,11 @@ better numerical stability.
 */
 void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                            XTensor * dedy, XTensor * dedx, 
-                            int leadDim,
+                            XTensor * padding, int leadDim, 
                            LOSS_FUNCTION_NAME lossName)
 {
+    leadDim = leadDim < 0 ? y->order - 1 : leadDim;
    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
                  "Tensors used in log softmax are not on the same GPU.");
@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                                                    dimensionSize * stride, lossName);
                }
            }
+            if(padding != NULL) {
+                int n = leadDim;
+                int paddingOrder = padding->order;
+                int * paddingDims = new int[paddingOrder];
+                memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+                padding->Reshape(padding->unitNum);
+                int order = dedx->order;
+                int * dims = new int[order];
+                memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+                dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
+                _MultiplyDimMe(dedx, padding, 0);
+                padding->Reshape(paddingOrder, paddingDims);
+                dedx->Reshape(order, dims);
+                delete[] paddingDims;
+                delete[] dims;
+            }
        }
        else {
            ShowNTErrors("TODO!");

--- a/source/tensor/function/LogSoftmax.cuh
+++ b/source/tensor/function/LogSoftmax.cuh
@@ -38,7 +38,7 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
 /* de/dx (Cuda version) */
 void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                            XTensor * dedy, XTensor * dedx, 
-                            int leadDim, 
+                            XTensor * padding, int leadDim, 
                            LOSS_FUNCTION_NAME lossName);
 #endif // USE_CUDA

--- a/source/tensor/function/LogSoftmax.h
+++ b/source/tensor/function/LogSoftmax.h
@@ -39,7 +39,7 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
 /* de/dx */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
                         XTensor * dedy, XTensor * dedx, 
-                         int leadDim,
+                         XTensor * padding, int leadDim, 
                         LOSS_FUNCTION_NAME lossName);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/function/Loss.cpp
+++ b/source/tensor/function/Loss.cpp
@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                for (int i = 0; i < blockNum; i++) {
                    for (int j = 0; j < stride; j++) {
                        for (int k = 0; k < tLen; k++) {
-                            *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = -(DTYPE)*(tp + i * stride * dimensionSize
+                            *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = 
-                                + j + stride * (tBeg + k)) / (DTYPE)*(yp +  i * stride * dimensionSize + j + stride * (yBeg + k));
+                            -(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) / 
+                             (DTYPE)*(yp +  i * stride * dimensionSize + j + stride * (yBeg + k));
                        }
                    }
                }

--- a/source/tensor/function/Softmax.cpp
+++ b/source/tensor/function/Softmax.cpp
@@ -175,7 +175,7 @@ See more details in LogSoftmaxBackward(...)
 */
 void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
                      XTensor * dedy, XTensor * dedx, 
-                      int leadDim,
+                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName)
 {
    CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
 #ifdef USE_CUDA
    if(y->devID >= 0){
-        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
 #endif
@@ -297,9 +297,10 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
            \beta = \sum_i (dE/dy_i * y_i) 
            */
-            for(int k = 0; k < blockNum; k++){
+            for(int m = 0; m < blockNum; m++){
-                op = (DTYPE*)y->data + k * blockSize;
+                yp = (DTYPE*)dedy->data + m * blockSize;
-                sp = (DTYPE*)dedx->data + k * blockSize;
+                op = (DTYPE*)y->data + m * blockSize;
+                sp = (DTYPE*)dedx->data + m * blockSize;
                int nCols = stride;
                for(int k = 0; k < stride; k++){

--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
@@ -24,6 +24,7 @@
 #include "Loss.cuh"
 #include "../core/reduce/ReduceSum.h"
 #include "../core/arithmetic/Multiply.h"
+#include "../core/arithmetic/MultiplyDim.h"
 #include "../core/shape/Unsqueeze.h"
 #include "../core/arithmetic/Sum.h"
 #include "../XDevice.h"
@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
 */
 void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
                          XTensor * dedy, XTensor * dedx,
-                          int leadDim,
+                          XTensor * padding, int leadDim,
                          LOSS_FUNCTION_NAME lossName)
 {
+    int n = leadDim < 0 ? y->order - 1 : leadDim;
    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID), "Matrices used in log softmax are not on the same GPU.");
    CheckNTErrors((y->order >= 1), "Empty tensor!");
@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
        if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
            _Sum(y, gold, dedx, -1.0F);
+            if(padding != NULL) {
+                int paddingOrder = padding->order;
+                int * paddingDims = new int[paddingOrder];
+                memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+                padding->Reshape(padding->unitNum);
+                int order = dedx->order;
+                int * dims = new int[order];
+                memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+                dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
+                _MultiplyDimMe(dedx, padding, 0);
+                padding->Reshape(paddingOrder, paddingDims);
+                dedx->Reshape(order, dims);
+                delete[] paddingDims;
+                delete[] dims;
+            }
        }
        else if(lossName == ONEHOTERROR){
            ShowNTErrors("TODO!");

--- a/source/tensor/function/Softmax.cuh
+++ b/source/tensor/function/Softmax.cuh
@@ -38,7 +38,7 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
 /* de/dx (Cuda version) */
 void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                          XTensor * dedy, XTensor * dedx, 
-                          int leadDim, 
+                          XTensor * padding, int leadDim, 
                          LOSS_FUNCTION_NAME lossName);
 #endif // USE_CUDA

--- a/source/tensor/function/Softmax.h
+++ b/source/tensor/function/Softmax.h
@@ -36,7 +36,7 @@ XTensor Softmax(const XTensor &x, int leadDim);
 /* de/dx */
 void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
                      XTensor * dedy, XTensor * dedx, 
-                      int leadDim,
+                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/test/TDropout.cpp
+++ b/source/tensor/test/TDropout.cpp
@@ -169,8 +169,8 @@ bool TestDropout2()
    _DropoutBackward(y, x, dedy, dedx, 1, dropProb);
    /* check result */
-    y->Dump(stderr, "y");
+    //y->Dump(stderr, "y");
-    dedx->Dump(stderr, "dedy");
+    //dedx->Dump(stderr, "dedy");
 #ifdef USE_CUDA
    /* GPU test */
@@ -193,8 +193,8 @@ bool TestDropout2()
    _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);
    /* check result */
-    yGPU->Dump(stderr, "yGPU");
+    //yGPU->Dump(stderr, "yGPU");
-    dedxGPU->Dump(stderr, "dedyGPU");
+    //dedxGPU->Dump(stderr, "dedyGPU");
    /* destroy variables */
    delete x;

--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(x, y, 1);
    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) 
@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(xGPU, yGPU, 1);
    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(x, y, 1);
    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) 
@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(xGPU, yGPU, 1);
    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) 

--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
@@ -66,7 +66,9 @@ bool TestPower1()
    bUser = Power(*a, 2.0F);
 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -88,7 +90,9 @@ bool TestPower1()
    bUserGPU = Power(*aGPU, 2.0F);
 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
 	/* destroy variables */
 	delete a;
@@ -153,7 +157,9 @@ bool TestPower2()
    bUser = Power(*a, 1.0F);
 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -175,7 +181,9 @@ bool TestPower2()
    bUserGPU = Power(*aGPU, 1.0F);
 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
 	/* destroy variables */
 	delete a;
@@ -214,7 +222,7 @@ bool TestPower3()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];
-	DTYPE aData[3][2] = { {0.0F, 1.0F},
+	DTYPE aData[3][2] = { {1.0F, 1.0F},
 	                      {2.0F, 3.0F},
 	                      {4.0F, 5.0F} };
 	DTYPE answer[3][2] = { {1.0F, 1.0F},
@@ -240,7 +248,9 @@ bool TestPower3()
    bUser = Power(*a, 0.0F);
 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -262,7 +272,9 @@ bool TestPower3()
    bUserGPU = Power(*aGPU, 0.0F);
 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
 	/* destroy variables */
 	delete a;

--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
-* All rights reserved.
+ * All rights reserved.
-*
+ *
-* Licensed under the Apache License, Version 2.0 (the "License");
+ * Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
+ * you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
+ * You may obtain a copy of the License at
-*
+ *
-*   http://www.apache.org/licenses/LICENSE-2.0
+ *   http://www.apache.org/licenses/LICENSE-2.0
-*
+ *
-* Unless required by applicable law or agreed to in writing, software
+ * Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
+ * distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
+ * See the License for the specific language governing permissions and
-* limitations under the License.
+ * limitations under the License.
-*/
+ */
 /*
-* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
+ * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
-*/
+ */
 #include "TReduceSum.h"
+#include "../core/getandset/SetData.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -155,6 +156,457 @@ bool TestReduceSum1()
 #endif // USE_CUDA
 }
+/* 
+case 2: test ReduceSum function.
+Sum the items along a dimension of the tensor.
+In this case, 
+C = 1, A >= 10, B >= 128
+(50, 1000000) -> (50), dim = 1
+NOTE(review): A, B and C are not defined in this test; judging from the shape
+they appear to be the leading size (50), the size of the reduced dimension
+(1000000) and the trailing size (1) - TODO confirm against the ReduceSum code.
+The source tensor is filled with 1.0, so every reduced item is expected to
+equal the size of dimension 1.
+Returns true if the CPU results match the expected answer and, when USE_CUDA
+is defined, the GPU results match it as well.
+*/
+bool TestReduceSum2()
+{
+    /* a tensor of size (50, 1000000) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 50;
+    sDimSize[1] = 1000000;
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i]; /* NOTE(review): sUnitNum is not read again in this function */
+    /* a tensor of size (50) */
+    int tOrder = 1;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 50;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors: source s, result t and the expected answer */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * answer = NewTensor(tOrder, tDimSize);
+    XTensor tUser;
+    /* initialize variables: s is all ones, so each expected sum is the size of dimension 1 */
+    _SetDataFixedFloat(s, 1.0F);
+    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    /* call ReduceSum function (both the pointer-based form and the form returning an XTensor) */
+    _ReduceSum(s, t, 1);
+    tUser = ReduceSum(*s, 1);
+    /* check results: both outputs must match the answer tensor */
+    cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors (the trailing argument 0 is presumably the device ID - TODO confirm) */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor tUserGPU;
+    /* initialize variables */
+    _SetDataFixedFloat(sGPU, 1.0F);
+    /* call ReduceSum function */
+    _ReduceSum(sGPU, tGPU, 1);
+    tUserGPU = ReduceSum(*sGPU, 1);
+    /* check results against the same answer tensor that the CPU test uses */
+    gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
+    /* destroy variables (everything created with NewTensor or new[] above) */
+    delete s;
+    delete t;
+    delete answer;
+    delete sGPU;
+    delete tGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables (everything created with NewTensor or new[] above) */
+    delete s;
+    delete t;
+    delete answer;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 3: test ReduceSum function.
+Sum the items along a dimension of the tensor.
+In this case, 
+C = 1, A >= 10, B < 128
+(1000000, 50) -> (1000000), dim = 1
+NOTE(review): A, B and C are not defined in this test; judging from the shape
+they appear to be the leading size (1000000), the size of the reduced
+dimension (50) and the trailing size (1) - TODO confirm against the ReduceSum code.
+The source tensor is filled with 1.0, so every reduced item is expected to
+equal the size of dimension 1.
+Returns true if the CPU results match the expected answer and, when USE_CUDA
+is defined, the GPU results match it as well.
+*/
+bool TestReduceSum3()
+{
+    /* a tensor of size (1000000, 50) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 1000000;
+    sDimSize[1] = 50;
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i]; /* NOTE(review): sUnitNum is not read again in this function */
+    /* a tensor of size (1000000) */
+    int tOrder = 1;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 1000000;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors: source s, result t and the expected answer */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * answer = NewTensor(tOrder, tDimSize);
+    XTensor tUser;
+    /* initialize variables: s is all ones, so each expected sum is the size of dimension 1 */
+    _SetDataFixedFloat(s, 1.0F);
+    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    /* call ReduceSum function (both the pointer-based form and the form returning an XTensor) */
+    _ReduceSum(s, t, 1);
+    tUser = ReduceSum(*s, 1);
+    /* check results: both outputs must match the answer tensor */
+    cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors (the trailing argument 0 is presumably the device ID - TODO confirm) */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor tUserGPU;
+    /* initialize variables */
+    _SetDataFixedFloat(sGPU, 1.0F);
+    /* call ReduceSum function */
+    _ReduceSum(sGPU, tGPU, 1);
+    tUserGPU = ReduceSum(*sGPU, 1);
+    /* check results against the same answer tensor that the CPU test uses */
+    gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
+    /* destroy variables (everything created with NewTensor or new[] above) */
+    delete s;
+    delete t;
+    delete answer;
+    delete sGPU;
+    delete tGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables (everything created with NewTensor or new[] above) */
+    delete s;
+    delete t;
+    delete answer;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 4: test ReduceSum function.
+Sum the items along a dimension of the tensor.
+In this case, 
+C = 1, A < 10, B is free
+(5, 1000000) -> (5), dim = 1
+NOTE(review): A, B and C are not defined in this test; judging from the shape
+they appear to be the leading size (5), the size of the reduced dimension
+(1000000) and the trailing size (1) - TODO confirm against the ReduceSum code.
+The source tensor is filled with 1.0, so every reduced item is expected to
+equal the size of dimension 1.
+Returns true if the CPU results match the expected answer and, when USE_CUDA
+is defined, the GPU results match it as well.
+*/
+bool TestReduceSum4()
+{
+    /* a tensor of size (5, 1000000) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 5;
+    sDimSize[1] = 1000000;
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i]; /* NOTE(review): sUnitNum is not read again in this function */
+    /* a tensor of size (5) */
+    int tOrder = 1;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 5;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors: source s, result t and the expected answer */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * answer = NewTensor(tOrder, tDimSize);
+    XTensor tUser;
+    /* initialize variables: s is all ones, so each expected sum is the size of dimension 1 */
+    _SetDataFixedFloat(s, 1.0F);
+    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    /* call ReduceSum function (both the pointer-based form and the form returning an XTensor) */
+    _ReduceSum(s, t, 1);
+    tUser = ReduceSum(*s, 1);
+    /* check results: both outputs must match the answer tensor */
+    cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors (the trailing argument 0 is presumably the device ID - TODO confirm) */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor tUserGPU;
+    /* initialize variables */
+    _SetDataFixedFloat(sGPU, 1.0F);
+    /* call ReduceSum function */
+    _ReduceSum(sGPU, tGPU, 1);
+    tUserGPU = ReduceSum(*sGPU, 1);
+    /* check results against the same answer tensor that the CPU test uses */
+    gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
+    /* destroy variables (everything created with NewTensor or new[] above) */
+    delete s;
+    delete t;
+    delete answer;
+    delete sGPU;
+    delete tGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables (everything created with NewTensor or new[] above) */
+    delete s;
+    delete t;
+    delete answer;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 5: test ReduceSum function.
+Sum the items along a dimension of the tensor.
+In this case, 
+C != 1, A*C > 4096
+(500, 1000, 500) -> (500, 500), dim = 1
+Returns true when every checked result matches the expected sums.
+NOTE: the source tensor holds 500 * 1000 * 500 = 250,000,000 items,
+so this case needs a lot of memory.
+*/
+bool TestReduceSum5()
+{
+    /* a tensor of size (500, 1000, 500) */
+    int sOrder = 3;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 500;
+    sDimSize[1] = 1000;
+    sDimSize[2] = 500;
+    /* a tensor of size (500, 500): the source shape with dimension 1 removed.
+       The target, the answer and tUnitNum are all derived from this shape,
+       so it has to agree with the source for the check to be meaningful. */
+    int tOrder = 2;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 500;
+    tDimSize[1] = 500;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * answer = NewTensor(tOrder, tDimSize);
+    XTensor tUser;
+    /* initialize variables: the source is filled with 1.0, so each sum
+       is expected to equal the size of dimension 1 */
+    _SetDataFixedFloat(s, 1.0F);
+    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    /* call ReduceSum function (explicit output form and tensor-returning form) */
+    _ReduceSum(s, t, 1);
+    tUser = ReduceSum(*s, 1);
+    /* check results */
+    cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor tUserGPU;
+    /* initialize variables */
+    _SetDataFixedFloat(sGPU, 1.0F);
+    /* call ReduceSum function */
+    _ReduceSum(sGPU, tGPU, 1);
+    tUserGPU = ReduceSum(*sGPU, 1);
+    /* check results against the CPU-side answer tensor */
+    gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
+    /* destroy variables */
+    delete s;
+    delete t;
+    delete answer;
+    delete sGPU;
+    delete tGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s;
+    delete t;
+    delete answer;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 6: test ReduceSum function.
+Sum the items along a dimension of the tensor.
+In this case, 
+C != 1, A*C <= 4096
+(50, 10000, 50) -> (50, 50), dim = 1
+Returns true when every checked result matches the expected sums.
+*/
+bool TestReduceSum6()
+{
+    /* shape of the input tensor: (50, 10000, 50) */
+    int inOrder = 3;
+    int * inDims = new int[inOrder];
+    inDims[0] = 50;
+    inDims[1] = 10000;
+    inDims[2] = 50;
+    int inUnitNum = 1;
+    for (int k = 0; k < inOrder; k++)
+        inUnitNum *= inDims[k];
+    /* shape of the output tensor: (50, 50) */
+    int outOrder = 2;
+    int * outDims = new int[outOrder];
+    outDims[0] = 50;
+    outDims[1] = 50;
+    int outUnitNum = 1;
+    for (int k = 0; k < outOrder; k++)
+        outUnitNum *= outDims[k];
+    /* CPU-side tensors */
+    XTensor * in = NewTensor(inOrder, inDims);
+    XTensor * out = NewTensor(outOrder, outDims);
+    XTensor * golden = NewTensor(outOrder, outDims);
+    XTensor outUser;
+    /* input is filled with 1.0; the expected sum is the size of the reduced dimension */
+    _SetDataFixedFloat(in, 1.0F);
+    _SetDataFixedFloat(golden, (float)in->GetDim(1));
+    /* run both call forms of the reduction */
+    _ReduceSum(in, out, 1);
+    outUser = ReduceSum(*in, 1);
+    /* verify the CPU results */
+    bool cpuOk = out->CheckData(golden->data, outUnitNum);
+    cpuOk = cpuOk && outUser.CheckData(golden->data, outUnitNum);
+    bool verdict = cpuOk;
+#ifdef USE_CUDA
+    /* repeat with tensors created through the GPU form of NewTensor
+       (the trailing 0 is presumably the device id - TODO confirm) */
+    XTensor * inGPU = NewTensor(inOrder, inDims, X_FLOAT, 1.0F, 0);
+    XTensor * outGPU = NewTensor(outOrder, outDims, X_FLOAT, 1.0F, 0);
+    XTensor outUserGPU;
+    _SetDataFixedFloat(inGPU, 1.0F);
+    _ReduceSum(inGPU, outGPU, 1);
+    outUserGPU = ReduceSum(*inGPU, 1);
+    /* verify the GPU results against the CPU-side golden tensor */
+    bool gpuOk = outGPU->CheckData(golden->data, outUnitNum);
+    gpuOk = gpuOk && outUserGPU.CheckData(golden->data, outUnitNum);
+    verdict = cpuOk && gpuOk;
+#endif // USE_CUDA
+    /* free tensors first, then the shape arrays */
+    delete in;
+    delete out;
+    delete golden;
+#ifdef USE_CUDA
+    delete inGPU;
+    delete outGPU;
+#endif // USE_CUDA
+    delete[] inDims;
+    delete[] outDims;
+    return verdict;
+}
 /* other cases */
 /*
 TODO!!
@@ -175,6 +627,51 @@ bool TestReduceSum()
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");
+    /* case 2 test */
+    caseFlag = TestReduceSum2();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    ///* case 3 test */
+    //caseFlag = TestReduceSum3();
+    //if (!caseFlag) {
+    //    returnFlag = false;
+    //    XPRINT(0, stdout, ">> case 3 failed!\n");
+    //}
+    //else
+    //    XPRINT(0, stdout, ">> case 3 passed!\n");
+    /* case 4 test */
+    caseFlag = TestReduceSum4();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 4 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 4 passed!\n");
+    ///* case 5 test */
+    //caseFlag = TestReduceSum5();
+    //if (!caseFlag) {
+    //    returnFlag = false;
+    //    XPRINT(0, stdout, ">> case 5 failed!\n");
+    //}
+    //else
+    //    XPRINT(0, stdout, ">> case 5 passed!\n");
+    /* case 6 test */
+    caseFlag = TestReduceSum6();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 6 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 6 passed!\n");
    /* other cases test */
    /*
    TODO!!

--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
@@ -146,7 +146,7 @@ bool TestSoftmax2()
    _Softmax(x, y, 1);
    /* call SoftmaxBackward function */
-    _SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
@@ -174,7 +174,7 @@ bool TestSoftmax2()
    _Softmax(xGPU, yGPU, 1);
    /* call SoftmaxBackward function */
-    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)

--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
@@ -20,8 +20,9 @@
 */
 #include "TSumDim.h"
-#include "../core/arithmetic/SumDim.h"
 #include "../XTensor.h"
+#include "../core/arithmetic/SumDim.h"
+#include "../core/getandset/SetData.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -251,6 +252,225 @@ bool TestSumDim2()
 #endif // USE_CUDA
 }
+/* 
+case 3: tensor summation c = a + b * \beta 
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting.
+In this case, 
+(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
+Returns true when every checked result matches the expected tensor.
+*/
+bool TestSumDim3()
+{
+    /* a tensor of size (20, 40, 4000) */
+    int aOrder = 3;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 20;
+    aDimSize[1] = 40;
+    aDimSize[2] = 4000;
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+    /* a tensor of size (40) */
+    int bOrder = 1;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 40;
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(aOrder, aDimSize);
+    XTensor * cMe = NewTensor(aOrder, aDimSize);
+    XTensor * answer = NewTensor(aOrder, aDimSize);
+    XTensor cUser;
+    /* initialize variables: a and cMe start at zero and b is all ones,
+       so every item of the sum is expected to be 1.0 */
+    a->SetZeroAll();
+    cMe->SetZeroAll();
+    _SetDataFixedFloat(b, 1.0F);
+    _SetDataFixedFloat(answer, 1.0F);
+    /* call SumDim function: explicit output, in-place on cMe, and tensor-returning form */
+    _SumDim(a, b, c, 1);
+    _SumDim(cMe, b, 1);
+    cUser = SumDim(*a, *b, 1);
+    /* check results */
+    cpuTest = c->CheckData(answer->data, aUnitNum) && 
+              cMe->CheckData(answer->data, aUnitNum) && 
+              cUser.CheckData(answer->data, aUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+    /* Initialize variables. cMeGPU is the in-place operand below, so it is
+       the tensor that must start at zero (not the CPU-side cMe). */
+    aGPU->SetZeroAll();
+    cMeGPU->SetZeroAll();
+    _SetDataFixedFloat(bGPU, 1.0F);
+    /* call sum function */
+    _SumDim(aGPU, bGPU, cGPU, 1);
+    _SumDim(cMeGPU, bGPU, 1);
+    cUserGPU = SumDim(*aGPU, *bGPU, 1);
+    /* check results against the CPU-side answer tensor */
+    gpuTest = cGPU->CheckData(answer->data, aUnitNum) && 
+              cMeGPU->CheckData(answer->data, aUnitNum) && 
+              cUserGPU.CheckData(answer->data, aUnitNum);
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete[] aDimSize;
+    delete[] bDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 4: tensor summation c = a + b * \beta 
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting.
+In this case, 
+(1000000, 50) + (50) = (1000000, 50), dim = 1.
+Returns true when every checked result matches the expected tensor.
+*/
+bool TestSumDim4()
+{
+    /* a tensor of size (1000000, 50) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 1000000;
+    aDimSize[1] = 50;
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+    /* a tensor of size (50) */
+    int bOrder = 1;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 50;
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(aOrder, aDimSize);
+    XTensor * cMe = NewTensor(aOrder, aDimSize);
+    XTensor * answer = NewTensor(aOrder, aDimSize);
+    XTensor cUser;
+    /* initialize variables: a and cMe start at zero and b is all ones,
+       so every item of the sum is expected to be 1.0 */
+    a->SetZeroAll();
+    cMe->SetZeroAll();
+    _SetDataFixedFloat(b, 1.0F);
+    _SetDataFixedFloat(answer, 1.0F);
+    /* call SumDim function: explicit output, in-place on cMe, and tensor-returning form */
+    _SumDim(a, b, c, 1);
+    _SumDim(cMe, b, 1);
+    cUser = SumDim(*a, *b, 1);
+    /* check results */
+    cpuTest = c->CheckData(answer->data, aUnitNum) && 
+              cMe->CheckData(answer->data, aUnitNum) && 
+              cUser.CheckData(answer->data, aUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+    /* Initialize variables. cMeGPU is the in-place operand below, so it is
+       the tensor that must start at zero (not the CPU-side cMe). */
+    aGPU->SetZeroAll();
+    cMeGPU->SetZeroAll();
+    _SetDataFixedFloat(bGPU, 1.0F);
+    /* call sum function */
+    _SumDim(aGPU, bGPU, cGPU, 1);
+    _SumDim(cMeGPU, bGPU, 1);
+    cUserGPU = SumDim(*aGPU, *bGPU, 1);
+    /* check results against the CPU-side answer tensor */
+    gpuTest = cGPU->CheckData(answer->data, aUnitNum) && 
+              cMeGPU->CheckData(answer->data, aUnitNum) && 
+              cUserGPU.CheckData(answer->data, aUnitNum);
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete[] aDimSize;
+    delete[] bDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
 /* other cases */
 /*
    TODO!!
@@ -280,6 +500,24 @@ bool TestSumDim()
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");
+    /* case 3 test */
+    caseFlag = TestSumDim3();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 3 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 3 passed!\n");
+    ///* case 4 test */
+    //caseFlag = TestSumDim4();
+    //if (!caseFlag) {
+    //    returnFlag = false;
+    //    XPRINT(0, stdout, ">> case 4 failed!\n");
+    //}
+    //else
+    //    XPRINT(0, stdout, ">> case 4 passed!\n");
    /* other cases test */
    /*
        TODO!!