Merge branch 'xuchen' into xiaotong-working

7809ed05 · xuchen · f4be1882 · 03a9836e · 7809ed05 · 7809ed05
Commit 7809ed05 authored Nov 13, 2018 by xuchen
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    else if(operID == FUNC_LOGSOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else if(operID == FUNC_RECTIFY)
        _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    else if(operID == FUNC_SOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else{
        ShowNTErrors("Wrong activation function type!");

--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                        XTensor * dedy, XTensor * dedx,
+                        XTensor * dedy, XTensor * dedx, XTensor * padding,
                        int funcID, void * params,
                        LOSS_FUNCTION_NAME lossName)
 {
@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
    }
    else if(funcID == FUNC_LOGSOFTMAX){
        int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
    }
    else if(funcID == FUNC_RECTIFY){
        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
    }else if(funcID == FUNC_SOFTMAX){
        int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
    }
    else{
        ShowNTErrors("wrong function found when call the backward process!");
@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, 
-                        XTensor * dedy, 
+                        XTensor * dedy, XTensor * padding,
                        LOSS_FUNCTION_NAME lossName)
 {
-    _LossBackward(dedy, gold, y, lossName);
+    /* only cross entropy takes the padding; any other loss keeps the former _LossBackward path */
+    if(lossName == CROSSENTROPY) _CrossEntropyBackward(dedy, y, gold, NULL, padding);
+    else                         _LossBackward(dedy, gold, y, lossName);
 }

 }
\ No newline at end of file
--- a/source/network/XBackwardLoss.h
+++ b/source/network/XBackwardLoss.h
@@ -36,13 +36,13 @@ class XLossGrad
 public:
    /* compute dE/dx for a given function y = f(x) */
    void Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                 XTensor * dedy, XTensor * dedx,
+                 XTensor * dedy, XTensor * dedx, XTensor * padding,
                 int funcID, void * params,
                 LOSS_FUNCTION_NAME lossName);

    /* compute dE/dy for variable y and error(loss) function E */
    void Compute(XTensor * gold, XTensor * y, 
-                 XTensor * dedy, 
+                 XTensor * dedy, XTensor * padding,
                 LOSS_FUNCTION_NAME lossName);
 };


--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
    DelTensorBuf(b);

    node->visitMark = NODE_FINISHED;
-
-    delete b;
 }

 /* 

--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -55,7 +55,7 @@ void XNetClearAll()
 XNet::XNet()
 {
    nodes.Clear();
-    isGradEfficient = true;
+    isGradEfficient = false;
 }

 /* de-constructor */
@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
    XList golds(1);
    golds.Add(&gold);

-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
+}
+
+/* 
+backward propagation to obtain gradient wrt. the loss/error function 
+>> root - root node (output) of the network
+>> gold - gold standard for the output
+>> padding - specify a target value that is ignored and does not contribute to the loss computation
+>> loss - name of loss function
+*/
+void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
+{
+    XList roots(1);
+    roots.Add(&root);
+
+    XList golds(1);
+    golds.Add(&gold);
+
+    XList paddings(1);
+    paddings.Add(&padding);
+
+    Backward(roots, golds, paddings, loss);
 }

 /* 
@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
    XList golds(1);
    golds.Add(NULL);

-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
 }

 /* 
@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes 
 >> root - a list of root nodes (output) of the network
 >> gold - a list of gold standard for the output
+>> paddings - a list of paddings, one per root node (an entry may be NULL)
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
 {
    Traverse(roots);

@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
    for(int i = 0; i < roots.count; i++){
        XTensor * root = (XTensor*)roots.Get(i);
        XTensor * gold = (XTensor*)golds.Get(i);
+        XTensor * padding = (XTensor*)paddings.Get(i);
        XLink &income = root->income;
        int funcID = income.typeID;
        void * params = income.params;
@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
           Note that we do not need to obtain dE/dy here because it is no use in the 
           folloing process of back-propagation */
        if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
-            XTensor * x = income.tails[0];
-            XNoder::MakeGrad(x);
-            lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
-            root->visitMark = NODE_FINISHED;
+            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
+                XTensor * x = income.tails[0];
+                XNoder::MakeGrad(x);
+                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
+                root->visitMark = NODE_FINISHED;
+            }
+            else {
+                XNoder::MakeGrad(root);
+                lossGrad.Compute(gold, root, root->grad, padding, loss);
+            }
        }
        /* we compuate dE/dy (y is the output) if no predefined activation function is used */
        else{
            XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, loss);
+            lossGrad.Compute(gold, root, root->grad, NULL, loss);
        }
    }
    
@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
 /* 
 backward propagation to obtain gradient
 with a number of root nodes 
->> root - a list of root nodes (output) of the network
+>> roots - a list of root nodes (output) of the network
 >> loss - name of loss function
 */
 void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
 {
    XList golds(roots.count);
-    for(int i = 0; i < roots.count; i++)
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++) {
        golds.Add(NULL);
+        paddings.Add(NULL);
+    }
+
+    Backward(roots, golds, paddings, loss);
+}
+
+/* 
+backward propagation to obtain gradient
+with a number of root nodes 
+>> roots - a list of root nodes (output) of the network
+>> golds - a list of gold standard for the output
+>> loss - name of loss function
+*/
+void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+{
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++)
+        paddings.Add(NULL);

-    Backward(roots, golds, loss);
+    Backward(roots, golds, paddings, loss);
 }

 /* 

--- a/source/network/XNet.h
+++ b/source/network/XNet.h
@@ -62,17 +62,24 @@ struct XNet
    /* backward propagation to obtain gradient wrt. the loss/error function */
    void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);

+    /* backward propagation to obtain gradient wrt. the loss/error function */
+    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
+
    /* backward propagation to obtain gradient */
    void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward propagation to obtain gradient wrt. the loss/error function
       with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward propagation to obtain gradient
       with a number of root nodes */
    void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);

+    /* backward propagation to obtain gradient
+       with a number of root nodes */
+    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+
    /* backward computation for a given node */
    void BackwardNode(XTensor * node, bool isEfficent = false);


--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
        
        if(isEnd)
            break;
+
+        Test(testFN, outputFN, model);
    }

    double elapsed = GetClockSec() - startT;
@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA

    /* for y = softmax(s), we get dE/ds
        where E is the error function (define by loss) */
-    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
+    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);

    /* for s = x * w, we get 
       dE/w_{i,j} = dE/ds_j * ds/dw_{i,j} 

--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
@@ -68,9 +68,10 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
 }

 /* 
-make positional embeddings (of size eSize * length
-eSize - embedding size
-length - length of the sequenc
+make positional embeddings (of size eSize * length)
+>> eSize - embedding size
+>> d - dimension size of the hidden layers
+>> length - length of the sequence
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
@@ -114,15 +115,15 @@ make the network
 */
 XTensor T2TEmbedder::Make(XTensor &input)
 {
-    CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
+    //CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-    CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
+    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

    int dims[MAX_TENSOR_DIM_NUM];
    memcpy(dims, input.dimSize, input.order * sizeof(int));
-    dims[input.order - 1] = eSize;
+    dims[input.order] = eSize;

    XTensor wordEmbedding;
    XTensor posEmbedding;
@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
    /* we make positional embeddings first */
    //if(!match){
    if(true){
-        InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
+        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
+
        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);

        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
    }

    /* then we make word embeddings */
-    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    //wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    wordEmbedding = Gather(w, input);
+    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

    /* we sum over the two embeddings */
    return wordEmbedding + posEmbedding;

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -121,14 +121,22 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
    XTensor encoding;
    
    /* generate mask to see "previous" words only */
-    int len = input.GetDim(input.order - 2);
-    int * dims = new int[input.order + 1];
+    //int len = input.GetDim(input.order - 2);
+    //int * dims = new int[input.order + 1];
+    //for(int i = 0; i < input.order; i++)
+    //    dims[i + 1] = input.GetDim(i);
+    //dims[0] = nhead;
+    //dims[input.order] = len;
+    //XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
+
+    int len = input.GetDim(input.order - 1);
+    int * dims = new int[input.order + 2];
    for(int i = 0; i < input.order; i++)
        dims[i + 1] = input.GetDim(i);
    dims[0] = nhead;
-    dims[input.order] = len;
-    XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
-        
+    dims[input.order + 1] = len;
+    XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
+
    /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
        this matrix can be used to prevent the attention to current or following words in
        a given sequence. */
@@ -140,24 +148,24 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
        dimsPadding[i] = padding.GetDim(i);
    dimsPadding[padding.order - 1] = padding.GetDim(-1);
    dimsPadding[padding.order] = padding.GetDim(-1);
-        
+
    XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
-                                        padding.denseRatio, padding.devID, padding.mem);
-        
+                                      padding.denseRatio, padding.devID, padding.mem);
+
    for(int i = 0; i < padding2->order; i++)
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;
-        
-    XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
-                                        padding.denseRatio, padding.devID, padding.mem);
-        
-    /* mask of the padding */
-    _Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
-    _Unsqueeze(padding2, padding3, 0, nhead);
-        
-    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-        
-    //_Sum(&mask, padding3, &mask);
+
+    //XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
+    //                                  padding.denseRatio, padding.devID, padding.mem);
+    //    
+    ///* mask of the padding */
+    //_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
+    //_Unsqueeze(padding2, padding3, 0, nhead);
+    //    
+    //_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+    //    
+    ////_Sum(&mask, padding3, &mask);

    encoding = MakeEncoder(input, mask, isTraining);
    outputLayer.Make(encoding, output);
@@ -165,8 +173,8 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
    delete[] dims;
    delete[] dimsPadding;
        
+    //DelTensorBuf(padding3);
    DelTensorBuf(padding2);
-    DelTensorBuf(padding3);
 }

 /* 
@@ -235,8 +243,8 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
    delete[] dims;
    delete[] dimsPadding;

-    DelTensorBuf(padding2);
    DelTensorBuf(padding3);
+    DelTensorBuf(padding2);
 }

 /* 

--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
@@ -93,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
 {
    XTensor &x = input;

-    output = LogSoftmax(MMul(x, w), -1);
+    //output = LogSoftmax(MMul(x, w), -1);
+    output = Softmax(MMul(x, w), -1);
 }

 }
--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
@@ -103,6 +103,10 @@ public:
    /* indicates whether we use adam */
    bool useAdam;

+    int validStep;
+
+    int curEpoch;
+
    /* hyper parameters of adam*/
    float adamBeta1;
    float adamBeta2;
@@ -131,7 +135,7 @@ public:
    /* number of batches on which we do model update */
    int updateStep;
    
-    /* indicates whether we double the </s> symble for the output of lms */
+    /* indicates whether we double the </s> symbol for the output of lms */
    bool isDoubledEnd;
    
    /* indicates whether we use batchsize = max * sc
@@ -150,7 +154,7 @@ public:
    void Init(int argc, char ** argv);

    /* train the model */
-    void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
+    bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);

    /* test the model */
    void Test(const char * fn, const char * ofn, T2TModel * model);
@@ -172,7 +176,28 @@ public:
                  int * seqs,
                  int vsEnc, int vsDec, int sBatch, int wBatch, 
                  bool isSorted, int &wCount,
-                  int devID, XMem * mem);
+                  int devID, XMem * mem, 
+				  bool isTraining);
+
+    /* load a batch of sequences (for language modeling) */
+    int LoadBatchLM(FILE * file, 
+                    XTensor * batchEnc, XTensor * paddingEnc,
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold,
+                    int * seqs, int vs, int sBatch, int wBatch, 
+                    bool isSorted, int &wCount,
+                    int devID, XMem * mem, 
+					bool isTraining);
+
+    /* load a batch of sequences (for machine translation) */
+    int LoadBatchMT(FILE * file, 
+                    XTensor * batchEnc, XTensor * paddingEnc, 
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold,
+                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
+                    bool isSorted, int &wCount,
+                    int devID, XMem * mem, 
+					bool isTraining);

    /* load a batch of sequences (for language modeling) */
    int LoadBatchLM(FILE * file, 

--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
@@ -25,6 +25,8 @@
 #include "T2TUtility.h"
 #include "T2TTrainer.h"
 #include "../../tensor/XDevice.h"
+#include "../../tensor/XUtility.h"
+#include "../../tensor/XGlobal.h"

 namespace transformer
 {
@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "test", testFN, "");
    LoadParamString(argc, args, "output", outputFN, "");

-    T2TTrainer trainer;
-    trainer.Init(argc, args);
-
-    T2TModel model;
-
-    model.InitModel(argc, args);
-
    /* learn model parameters */
-    if(strcmp(trainFN, ""))
-        trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
+    if(strcmp(trainFN, "")) {
+        double startT = GetClockSec();
+
+        T2TTrainer trainer;
+        trainer.Init(argc, args);
+
+        char * fn = new char[MAX_LINE_LENGTH];
+        char * fn1 = new char[MAX_LINE_LENGTH];
+        char * fn2 = new char[MAX_LINE_LENGTH];
+        modelFN = strcmp(modelFN, "") ? modelFN : (char *)"checkpoint.model";
+
+        int epoch;
+        bool isTrain;
+
+        for(epoch = 1; epoch <= trainer.nepoch; epoch++) {
+            sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
+            sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
+            sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
+
+            if(epoch == 1) {
+                T2TModel model;
+                model.InitModel(argc, args);
+
+                isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
+                model.Dump(fn1);
+            }
+            else {
+                T2TModel model;
+                model.InitModel(argc, args);
+                model.Read(fn);
+
+                isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
+                model.Dump(fn1);
+            }
+
+            if(trainer.useEpochCheckpoint && strcmp(testFN, "")) {
+                T2TTrainer tester;
+                tester.Init(argc, args);
+
+                T2TModel model;
+                model.InitModel(argc, args);
+                model.Read(fn1);
+
+                tester.Test(testFN, fn2, &model);
+            }
+
+            if(!isTrain)
+                break;
+        }
+
+        double elapsed = GetClockSec() - startT;
+        epoch = MIN(epoch, trainer.nepoch);
+    
+        XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);

+        delete[] fn;
+        delete[] fn1;
+        delete[] fn2;
+    }
+
+    /* don't dump the final model */
    /* save the final model */
-    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-        model.Dump(modelFN);
+    //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+    //    model.Dump(modelFN);
+    
+    T2TModel model;
+    model.InitModel(argc, args);

    /* load the model if neccessary */
    if(strcmp(modelFN, ""))

--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
@@ -446,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int 

    CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x");
    CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] && 
-                         gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
+                   gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");

    blockSize[0] = bXSize;
    blockSize[1] = bYSize;

--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
    if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
        cublasDestroy(cublasHandle);
    if(myIsForComputation)
-        CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle.");
+        CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
+				      "Cannot create the cublas handle.");

    SetDevice(devIDBackup);
 #endif
@@ -1392,8 +1393,8 @@ void XMem::CreateBLASHandle()
                      "Cannot destroy the cublas handle.");
    }

-    CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
-                 "Cannot create the cublas handle.");
+    CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
+                  "Cannot create the cublas handle.");
 #endif
 }


--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
@@ -35,6 +35,8 @@ const char * GetOPName(int type)
            return "M_EXP";
        else if (type == MATH_FLOOR)
            return "M_FLOOR";
+        else if (type == MATH_ISNONZERO)
+            return "M_ISNONZERO";
        else if (type == MATH_ISZERO)
            return "M_ISZERO";
        else if (type == MATH_LOG)

--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
@@ -35,7 +35,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_CEIL               MATH_ABSOLUTE + 1
 #define MATH_EXP                MATH_CEIL + 1
 #define MATH_FLOOR              MATH_EXP + 1
-#define MATH_ISZERO             MATH_FLOOR + 1
+#define MATH_ISNONZERO          MATH_FLOOR + 1
+#define MATH_ISZERO             MATH_ISNONZERO + 1
 #define MATH_LOG                MATH_ISZERO + 1
 #define MATH_SQRT               MATH_LOG + 1
 #define MATH_SQUARE             MATH_SQRT + 1

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)

 /* 
 set the value of a cell 
->> value - value to assign to the cell
+>> value - value we tend to set
 >> index - index of the cell for each dimension
->> 
+>> size - size of the index
 */
 bool XTensor::Set(DTYPE value, int index[], int size)
 {
@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)

 /* 
 set the value of a cell in a 1d tensor 
->> value - value to assign to the cell
+>> value - value we tend to set
 >> i - item offset
+<< return - succeeded or not
 */
 bool XTensor::Set1D(DTYPE value, int i)
 {
@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
    return SetToDevice(devID, GetCell(dims, 3), value);
 }

+
+/* 
+set the integer value of a cell (the tensor must be of type X_INT)
+>> value - integer value to store in the cell
+>> index - index of the cell for each dimension
+>> size - number of entries in index (the declaration defaults it to -1)
+<< return - result of SetToDeviceInt for the addressed cell
+*/
+bool XTensor::SetInt(int value, int index[], int size)
+{
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    return SetToDeviceInt(devID, GetCell(index, size), value);
+}
+
+/* 
+set the integer value of a cell in a 1d tensor (order 1, type X_INT)
+>> value - integer value to store in the cell
+>> i - item offset, checked against dimSize[0]
+<< return - succeeded or not
+*/
+bool XTensor::Set1DInt(int value, int i)
+{
+    CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
+    CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int dims[1] = {i};
+
+    return SetToDeviceInt(devID, GetCell(dims, 1), value);
+}
+
+/* 
+set the integer value of a cell in a 2d tensor (order 2, type X_INT)
+>> value - integer value to store in the cell
+>> ni - row index, checked against dimSize[0]
+>> mi - column index, checked against dimSize[1]
+<< return - succeeded or not
+*/
+bool XTensor::Set2DInt(int value, int ni, int mi)
+{
+    CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int dims[2] = {ni, mi};
+
+    return SetToDeviceInt(devID, GetCell(dims, 2), value);
+}
+
+/* 
+set the integer value of a cell in a 3d tensor (order 3, type X_INT)
+>> value - integer value to store in the cell
+>> d0 - index of dimension 0
+>> d1 - index of dimension 1
+>> d2 - index of dimension 2
+<< return - succeeded or not
+*/
+bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
+{
+    CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 3!");
+    CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
+    CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
+    CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int dims[3] = {d0, d1, d2};
+
+    return SetToDeviceInt(devID, GetCell(dims, 3), value);
+}
+
 /* 
 increase the value of a cell in a 2d tensor
 >> value - value we tend to set
@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,

    XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);

+    if (tensor->unitNum * tensor->unitSize == 176657664) {
+        tensor->Dump(stderr, "", 200);
+    }
    if(myMem != NULL)
        tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
    else
@@ -2135,7 +2211,7 @@ generate a copy of XTensor
 >> isFilledData - indicates whether we allocate the data for
                  the newly-generated tensor
 */
-XTensor * NewTensor(XTensor * a, bool isFilledData)
+XTensor * NewTensor(const XTensor * a, bool isFilledData)
 {
    int dims[MAX_TENSOR_DIM_NUM];
    

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -326,6 +326,18 @@ public:

    /* set the value of a cell in a 3d tensor */
    bool Set3D(DTYPE value, int d0, int d1, int d2);
+    
+    /* set the integer value of a cell */
+    bool SetInt(int value, int index[], int size = -1);
+
+    /* set the integer value of a cell in a 1d tensor */
+    bool Set1DInt(int value, int i);
+
+    /* set the integer value of a cell in a 2d tensor */
+    bool Set2DInt(int value, int ni, int mi);
+
+    /* set the integer value of a cell in a 3d tensor */
+    bool Set3DInt(int value, int d0, int d1, int d2);

    /* increase the value of a cell in a 2d */
    bool Add2D(DTYPE value, int ni, int mi);
@@ -450,7 +462,7 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
                      const int myDevID = -1, XMem * myMem = NULL);

 /* generate a copy of XTensor (with a reference to a given tensor) */
-XTensor * NewTensor(XTensor * a, bool isFilledData = true);
+XTensor * NewTensor(const XTensor * a, bool isFilledData = true);

 /* free the data space of a given tensor */
 void DelTensor(XTensor * tensor);

--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
    return true;
 }

+/* assign an integer number to a variable that is kept on a specified device; returns false only for a NULL pointer */
+bool SetToDeviceInt(int devID, void * p, int value)
+{
+    if(p == NULL)
+        return false;  /* nothing to write to */
+
+    if(devID < 0)
+        *(int*)p = value;  /* negative devID: plain store through the pointer */
+    else{
+        XMemCopy(p, devID, &value, -1, sizeof(int));  /* presumably a host(-1)-to-device copy of one int - TODO confirm */
+    }
+
+    return true;
+}
+
 /* get the next number with power of 2 */
 unsigned int GetNextPower2(unsigned int n)
 {

--- a/source/tensor/XUtility.h
+++ b/source/tensor/XUtility.h
@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
 extern DTYPE ToCPU(int devID, void * value);
 extern int ToCPUInt(int devID, void * value);
 extern bool SetToDevice(int devID, void * p, DTYPE value);
+extern bool SetToDeviceInt(int devID, void * p, int value);
 extern unsigned int GetNextPower2(unsigned int n);
 extern void XSleep(int sleepTime);
 extern double GetClock();

--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
        fanOut = numOutputFmaps * receptiveFieldSize;
    }

-    DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
-    DTYPE a = (DTYPE)sqrt(3.0) * std;
-    _SetDataRand(tensor, -a, a);
+    DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
+    tensor->SetDataRand(-finfout, finfout);
+    //_SetDataRand(tensor, -finfout, finfout);
 }

 /* 
@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
    if(tensor == NULL)
        return;
    
-    /* GPU code */
+    /* CPU code */
    if(tensor->devID < 0){
        DTYPE variance = upper - lower;
        

--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
@@ -37,6 +37,11 @@ DTYPE round(DTYPE r)
 	return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
 }

+DTYPE isnonzero(DTYPE r)
+{
+    return (r == 0.0) ? (DTYPE)0.0 : (DTYPE)1.0; /* 1 for a non-zero entry, 0 for a zero entry */
+}
+
 DTYPE iszero(DTYPE r)
 {
    return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
@@ -93,6 +98,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
 _SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
 SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)

+_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
+_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
+SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
+
 _SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
 _SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
 SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
@@ -173,6 +182,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, floor)
 _SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
 SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)

+_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
+_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
+SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
+
 _SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
 _SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
 SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)

--- a/source/tensor/core/math/Unary.cu
+++ b/source/tensor/core/math/Unary.cu
@@ -41,11 +41,18 @@ DTYPE cudaround(DTYPE r)
 }

 __device__
+DTYPE cudaisnonzero(DTYPE r)
+{
+    return (r == 0.0) ? (DTYPE)0.0 : (DTYPE)1.0; /* 1 for a non-zero entry, 0 for a zero entry */
+}
+
+__device__
 DTYPE cudaiszero(DTYPE r)
 {
    return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
 }

+
 #define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc)                       \
 __global__                                                                  \
 void Kernel##funcName(DTYPE * a, DTYPE * b, int size)                       \
@@ -96,6 +103,7 @@ SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
 SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
 SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
 SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
+SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, cudaisnonzero)
 SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
 SIMPLE_UNARY_FUNCTION_GPU(Log, log)
 SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)

--- a/source/tensor/core/math/Unary.cuh
+++ b/source/tensor/core/math/Unary.cuh
@@ -66,6 +66,15 @@ void KernelFloor(__half * a, __half * b, int size);
 /* set each entry to its floor value */
 void _CudaFloor(const XTensor * a, XTensor * b);

+/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) */
+__global__
+void KernelIsNonZero(DTYPE * a, DTYPE * b, int size);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type*/
+__global__
+void KernelIsNonZero(__half * a, __half * b, int size);
+/* if source entry is non-zero, set target entry to be one, otherwise zero */
+void _CudaIsNonZero(const XTensor * a, XTensor * b);
+
 /* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) */
 __global__
 void KernelIsZero(DTYPE * a, DTYPE * b, int size);

--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
@@ -63,6 +63,15 @@ void _FloorMe(XTensor * a);
 make a new tensor to keep the result and return it */
 XTensor Floor(const XTensor & a);

+/* if source entry is non-zero, set target entry to be one, otherwise zero */
+void _IsNonZero(const XTensor *a, XTensor *b);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
+keep the result in the input tensor a and return nothing */
+void _IsNonZeroMe(XTensor *a);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (return a XTensor structure)
+make a new tensor to keep the result and return it */
+XTensor IsNonZero(const XTensor &a);
+
 /* if source entry is zero, set target entry to be one, otherwise zero */
 void _IsZero(const XTensor *a, XTensor *b);
 /* if source entry is zero, set target entry to be one, otherwise zero (do it on site)

--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
@@ -21,6 +21,8 @@

 #include "Gather.h"
 #include "CopyIndexed.h"
+#include "../../XUtility.h"
+#include "../shape/Reshape.h"

 namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
    return result;
 }

+/*
+gather indexed sub-tensors (return a XTensor structure)
+make a new tensor to keep the result and return it
+the sub-tensors are picked along dimension 0 of s, one per entry of index
+>> s - the source tensor(2D)
+>> index - the index tensor (X_INT, or X_FLOAT/X_DOUBLE whose item size equals sizeof(DTYPE))
+<< return - the gathered sub-tensors; if index.order > 1 the shape is index.dimSize plus one trailing dimension
+*/
+XTensor Gather(const XTensor &s, const XTensor &index)
+{
+    int indexSize = index.unitNum;
+    CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
+    CheckNTErrors(index.dataType == X_INT || index.dataType == X_FLOAT || index.dataType == X_DOUBLE,
+                  "The index tensor must be of int, float or double type!");
+    /* non-int indices are staged in a DTYPE buffer below, so their item size must match DTYPE */
+    CheckNTErrors(index.dataType == X_INT || index.unitSize == (int)sizeof(DTYPE),
+                  "The item size of the index tensor must match DTYPE!");
+
+    int * srcIndex = new int[indexSize];
+    if(index.dataType == X_INT) {
+        XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * index.unitSize);
+    }
+    else {
+        DTYPE * tmp = new DTYPE[indexSize];
+        XMemCopy(tmp, -1, index.data, index.devID, indexSize * index.unitSize);
+        for(int i = 0; i < indexSize; i++)
+            srcIndex[i] = (int)tmp[i];
+        delete[] tmp;
+    }
+
+    XTensor tensor;
+    tensor = Gather(s, 0, srcIndex, indexSize);
+    delete[] srcIndex;
+    if(index.order <= 1)
+        return tensor;
+
+    /* give the result the shape of index, followed by the last dimension of the gathered tensor */
+    int * dims = new int[index.order + 1];
+    memcpy(dims, index.dimSize, index.order * sizeof(int));
+    dims[index.order] = tensor.GetDim(-1);
+    XTensor t;
+    t = Reshape(tensor, index.order + 1, dims);
+    delete[] dims;
+    return t;
+}
+
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/movement/Gather.h
+++ b/source/tensor/core/movement/Gather.h
@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
   make a new tensor to keep the result and return it */
 XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);

+/* gather selected sub-tensors (return a XTensor structure)
+   make a new tensor to keep the result and return it */
+XTensor Gather(const XTensor &s, const XTensor &index);
+
 } // namespace nts(NiuTrans.Tensor)

 #endif // __GATHER_H__
--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
@@ -16,8 +16,8 @@
 */

 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+ */

 #include <math.h>
 #include "ReduceSum.h"

--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
--- a/source/tensor/core/reduce/ReduceSumAll.cpp
+++ b/source/tensor/core/reduce/ReduceSumAll.cpp
@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
 >> source - the inpute tensor
 << return - the total summation
 */
-DTYPE _ReduceSumAll(XTensor * source)
+DTYPE _ReduceSumAll(const XTensor * source)
 {
    int order = source->order;
    DTYPE summation;

    XTensor * big = NewTensor(source);
    _CopyValues(source, big);
-    for(int i = 0; i < order; i++) {
-
-        if(i == order - 1)
-            big->Reshape(big->unitNum, 1);
+    for(int i = order - 1; i >= 0; i--) {
+        if(i == 0)
+            big->Reshape(1, big->unitNum);

+        int leadingDim = big->order - 1;
        int * dimSize;
-        dimSize = getDimSize(big, 0);
-        XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
+        dimSize = getDimSize(big, leadingDim);
+        XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, 
+                                     source->devID, source->mem);
           
-        _ReduceSum(big, little, 0);
+        _ReduceSum(big, little, leadingDim);

        delete big;
        delete dimSize;
@@ -81,7 +82,7 @@ sum all the items of the tensor
 >> source - the inpute tensor
 << return - the total summation   
 */
-DTYPE ReduceSumAll(XTensor & source)
+DTYPE ReduceSumAll(const XTensor & source)
 {
    return _ReduceSumAll(&source);
 }

--- a/source/tensor/core/reduce/ReduceSumAll.h
+++ b/source/tensor/core/reduce/ReduceSumAll.h
@@ -28,10 +28,10 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* sum all the items of the tensor */
-DTYPE _ReduceSumAll(XTensor * source);
+DTYPE _ReduceSumAll(const XTensor * source);

 /* sum all the items of the tensor */
-DTYPE ReduceSumAll(XTensor & source);
+DTYPE ReduceSumAll(const XTensor & source);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/function/CrossEntropy.cpp
+++ b/source/tensor/function/CrossEntropy.cpp
--- a/source/tensor/function/CrossEntropy.cu
+++ b/source/tensor/function/CrossEntropy.cu
--- a/source/tensor/function/CrossEntropy.cuh
+++ b/source/tensor/function/CrossEntropy.cuh
@@ -29,20 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* compute the cross entropy loss */
 void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
-                             XTensor * loss, const XTensor * weight = NULL, 
-                             const XTensor * padding = NULL, int leadingDim = -1);
+                           XTensor * loss, const XTensor * weight = NULL, 
+                           const XTensor * padding = NULL, int leadingDim = -1);

 /* compute the cross entropy loss */
 DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
-                              LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL, 
-                              const XTensor * padding = NULL, int leadingDim = -1);
+                            LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL, 
+                            const XTensor * padding = NULL, int leadingDim = -1);

 /* backward computation of cross entropy function */
-void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, 
-                               const XTensor * weight = NULL, XTensor * padding = NULL, 
-                               int leadingDim = -1);
+void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, 
+                               const XTensor * gold, const XTensor * weight = NULL, 
+                               XTensor * padding = NULL, int leadingDim = -1);


 } // namespace nts(NiuTrans.Tensor)

-#endif // __CROSSENTROPY_CUH__
+#endif // __CROSSENTROPY_CUH__
\ No newline at end of file
--- a/source/tensor/function/CrossEntropy.h
+++ b/source/tensor/function/CrossEntropy.h
@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        const XTensor * padding = NULL, int leadingDim = -1);

 /* backward computation of cross entropy function */
-void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, 
-                           const XTensor * weight = NULL, XTensor * padding = NULL, 
-                           int leadingDim = -1);
+void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, 
+                           const XTensor * gold, const XTensor * weight = NULL, 
+                           XTensor * padding = NULL, int leadingDim = -1);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/function/LogSoftmax.cpp
+++ b/source/tensor/function/LogSoftmax.cpp
@@ -279,8 +279,8 @@ better numerical stability.
 >> leadDim - leading dimension (along which we perform reduction)
 */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                         XTensor * dedy, XTensor * dedx,
-                         int leadDim,
+                         XTensor * dedy, XTensor * dedx, 
+                         XTensor * padding, int leadDim, 
                         LOSS_FUNCTION_NAME lossName)
 {
    CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
    if (gold->devID >= 0) {
-        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
 #endif

--- a/source/tensor/function/LogSoftmax.cu
+++ b/source/tensor/function/LogSoftmax.cu
@@ -22,6 +22,7 @@
 #include "LogSoftmax.h"
 #include "LogSoftmax.cuh"
 #include "Loss.cuh"
+#include "../core/arithmetic/MultiplyDim.h"
 #include "../core/reduce/ReduceSum.cuh"
 #include "../core/reduce/ReduceMax.cuh"
 #include "../XDevice.h"
@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
 >> lossName - name of the loss function
 */
 __global__
-void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size, LOSS_FUNCTION_NAME lossName)
+void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, 
+                                  int size, LOSS_FUNCTION_NAME lossName)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

@@ -371,10 +373,12 @@ better numerical stability.
 >> leadDim - leading dimension (along which we perform reduction)
 */
 void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                            XTensor * dedy, XTensor * dedx,
-                            int leadDim,
+                            XTensor * dedy, XTensor * dedx, 
+                            XTensor * padding, int leadDim, 
                            LOSS_FUNCTION_NAME lossName)
 {
+    leadDim = leadDim < 0 ? y->order - 1 : leadDim;
+
    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
                  "Tensors used in log softmax are not on the same GPU.");
@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                                                    dimensionSize * stride, lossName);
                }
            }
+            if(padding != NULL) {
+                int n = leadDim;
+
+                int paddingOrder = padding->order;
+                int * paddingDims = new int[paddingOrder];
+                memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+                padding->Reshape(padding->unitNum);
+
+                int order = dedx->order;
+                int * dims = new int[order];
+                memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+                dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
+                _MultiplyDimMe(dedx, padding, 0);
+
+                padding->Reshape(paddingOrder, paddingDims);
+                dedx->Reshape(order, dims);
+
+                delete[] paddingDims;
+                delete[] dims;
+            }
        }
        else {
            ShowNTErrors("TODO!");

--- a/source/tensor/function/LogSoftmax.cuh
+++ b/source/tensor/function/LogSoftmax.cuh
@@ -37,8 +37,8 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,

 /* de/dx (Cuda version) */
 void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                            XTensor * dedy, XTensor * dedx,
-                            int leadDim, 
+                            XTensor * dedy, XTensor * dedx, 
+                            XTensor * padding, int leadDim, 
                            LOSS_FUNCTION_NAME lossName);

 #endif // USE_CUDA

--- a/source/tensor/function/LogSoftmax.h
+++ b/source/tensor/function/LogSoftmax.h
@@ -38,8 +38,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);

 /* de/dx */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
-                         XTensor * dedy, XTensor * dedx,
-                         int leadDim,
+                         XTensor * dedy, XTensor * dedx, 
+                         XTensor * padding, int leadDim, 
                         LOSS_FUNCTION_NAME lossName);

 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/function/Loss.cpp
+++ b/source/tensor/function/Loss.cpp
@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                for (int i = 0; i < blockNum; i++) {
                    for (int j = 0; j < stride; j++) {
                        for (int k = 0; k < tLen; k++) {
-                            *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = -(DTYPE)*(tp + i * stride * dimensionSize
-                                + j + stride * (tBeg + k)) / (DTYPE)*(yp +  i * stride * dimensionSize + j + stride * (yBeg + k));
+                            *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = 
+                            -(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) / 
+                             (DTYPE)*(yp +  i * stride * dimensionSize + j + stride * (yBeg + k));
                        }
                    }
                }

--- a/source/tensor/function/Softmax.cpp
+++ b/source/tensor/function/Softmax.cpp
@@ -174,8 +174,8 @@ See more details in LogSoftmaxBackward(...)
 >> leadDim - leading dimension (along which we perform reduction)
 */
 void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
-                      XTensor * dedy, XTensor * dedx,
-                      int leadDim,
+                      XTensor * dedy, XTensor * dedx, 
+                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName)
 {
    CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,

 #ifdef USE_CUDA
    if(y->devID >= 0){
-        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
 #endif
@@ -297,9 +297,10 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
            \beta = \sum_i (dE/dy_i * y_i) 
            */

-            for(int k = 0; k < blockNum; k++){
-                op = (DTYPE*)y->data + k * blockSize;
-                sp = (DTYPE*)dedx->data + k * blockSize;
+            for(int m = 0; m < blockNum; m++){
+                yp = (DTYPE*)dedy->data + m * blockSize;
+                op = (DTYPE*)y->data + m * blockSize;
+                sp = (DTYPE*)dedx->data + m * blockSize;
                
                int nCols = stride;
                for(int k = 0; k < stride; k++){

--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
@@ -24,6 +24,7 @@
 #include "Loss.cuh"
 #include "../core/reduce/ReduceSum.h"
 #include "../core/arithmetic/Multiply.h"
+#include "../core/arithmetic/MultiplyDim.h"
 #include "../core/shape/Unsqueeze.h"
 #include "../core/arithmetic/Sum.h"
 #include "../XDevice.h"
@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
 */
 void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
                          XTensor * dedy, XTensor * dedx,
-                          int leadDim,
+                          XTensor * padding, int leadDim,
                          LOSS_FUNCTION_NAME lossName)
 {
+    int n = leadDim < 0 ? y->order - 1 : leadDim;
+
    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID), "Matrices used in log softmax are not on the same GPU.");
    CheckNTErrors((y->order >= 1), "Empty tensor!");
@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,

        if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
            _Sum(y, gold, dedx, -1.0F);
+            if(padding != NULL) {
+                int paddingOrder = padding->order;
+                int * paddingDims = new int[paddingOrder];
+                memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+                padding->Reshape(padding->unitNum);
+
+                int order = dedx->order;
+                int * dims = new int[order];
+                memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+                dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
+                _MultiplyDimMe(dedx, padding, 0);
+
+                padding->Reshape(paddingOrder, paddingDims);
+                dedx->Reshape(order, dims);
+
+                delete[] paddingDims;
+                delete[] dims;
+            }
        }
        else if(lossName == ONEHOTERROR){
            ShowNTErrors("TODO!");

--- a/source/tensor/function/Softmax.cuh
+++ b/source/tensor/function/Softmax.cuh
@@ -37,8 +37,8 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s

 /* de/dx (Cuda version) */
 void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                          XTensor * dedy, XTensor * dedx,
-                          int leadDim, 
+                          XTensor * dedy, XTensor * dedx, 
+                          XTensor * padding, int leadDim, 
                          LOSS_FUNCTION_NAME lossName);

 #endif // USE_CUDA

--- a/source/tensor/function/Softmax.h
+++ b/source/tensor/function/Softmax.h
@@ -35,8 +35,8 @@ XTensor Softmax(const XTensor &x, int leadDim);

 /* de/dx */
 void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
-                      XTensor * dedy, XTensor * dedx,
-                      int leadDim,
+                      XTensor * dedy, XTensor * dedx, 
+                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName);

 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/test/TDropout.cpp
+++ b/source/tensor/test/TDropout.cpp
@@ -169,8 +169,8 @@ bool TestDropout2()
    _DropoutBackward(y, x, dedy, dedx, 1, dropProb);

    /* check result */
-    y->Dump(stderr, "y");
-    dedx->Dump(stderr, "dedy");
+    //y->Dump(stderr, "y");
+    //dedx->Dump(stderr, "dedy");

 #ifdef USE_CUDA
    /* GPU test */
@@ -193,8 +193,8 @@ bool TestDropout2()
    _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);

    /* check result */
-    yGPU->Dump(stderr, "yGPU");
-    dedxGPU->Dump(stderr, "dedyGPU");
+    //yGPU->Dump(stderr, "yGPU");
+    //dedxGPU->Dump(stderr, "dedyGPU");

    /* destroy variables */
    delete x;

--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(x, y, 1);
    
    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
    
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) 
@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(xGPU, yGPU, 1);

    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
    
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(x, y, 1);
    
    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
    
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) 
@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(xGPU, yGPU, 1);

    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
    
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) 

--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
@@ -66,7 +66,9 @@ bool TestPower1()
    bUser = Power(*a, 2.0F);

 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
    
 #ifdef USE_CUDA
 	/* GPU test */
@@ -88,7 +90,9 @@ bool TestPower1()
    bUserGPU = Power(*aGPU, 2.0F);

 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
    
 	/* destroy variables */
 	delete a;
@@ -153,7 +157,9 @@ bool TestPower2()
    bUser = Power(*a, 1.0F);

 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
    
 #ifdef USE_CUDA
 	/* GPU test */
@@ -175,7 +181,9 @@ bool TestPower2()
    bUserGPU = Power(*aGPU, 1.0F);

 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

 	/* destroy variables */
 	delete a;
@@ -214,7 +222,7 @@ bool TestPower3()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];

-	DTYPE aData[3][2] = { {0.0F, 1.0F},
+	DTYPE aData[3][2] = { {1.0F, 1.0F},
 	                      {2.0F, 3.0F},
 	                      {4.0F, 5.0F} };
 	DTYPE answer[3][2] = { {1.0F, 1.0F},
@@ -240,7 +248,9 @@ bool TestPower3()
    bUser = Power(*a, 0.0F);

 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
    
 #ifdef USE_CUDA
 	/* GPU test */
@@ -262,7 +272,9 @@ bool TestPower3()
    bUserGPU = Power(*aGPU, 0.0F);

 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

 	/* destroy variables */
 	delete a;

--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
@@ -146,7 +146,7 @@ bool TestSoftmax2()
    _Softmax(x, y, 1);
    
    /* call SoftmaxBackward function */
-    _SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
    
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
@@ -174,7 +174,7 @@ bool TestSoftmax2()
    _Softmax(xGPU, yGPU, 1);

    /* call SoftmaxBackward function */
-    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
    
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)

--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
@@ -20,8 +20,9 @@
 */

 #include "TSumDim.h"
-#include "../core/arithmetic/SumDim.h"
 #include "../XTensor.h"
+#include "../core/arithmetic/SumDim.h"
+#include "../core/getandset/SetData.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -251,6 +252,225 @@ bool TestSumDim2()
 #endif // USE_CUDA
 }

+/* 
+case 3: tensor summation c = a + b * \beta (no \beta is passed in the calls below)
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting.
+In this case, a is all zeros and b is all ones, so every entry of the result must be one:
+(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
+*/
+bool TestSumDim3()
+{
+    /* a tensor of size (20, 40, 4000) */
+    int aOrder = 3;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 20;
+    aDimSize[1] = 40;
+    aDimSize[2] = 4000;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    /* a tensor of size (40) */
+    int bOrder = 1;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 40;
+
+    int bUnitNum = 1;
+    for (int i = 0; i < bOrder; i++)
+        bUnitNum *= bDimSize[i];
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors (answer holds the expected result and is compared against all outputs) */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(aOrder, aDimSize);
+    XTensor * cMe = NewTensor(aOrder, aDimSize);
+    XTensor * answer = NewTensor(aOrder, aDimSize);
+    XTensor cUser;
+
+    /* initialize variables: a = cMe = 0, b = 1, expected answer = 1 */
+    a->SetZeroAll();
+    cMe->SetZeroAll();
+    _SetDataFixedFloat(b, 1.0F);
+    _SetDataFixedFloat(answer, 1.0F);
+
+    /* call SumDim function (three forms: separate output, in place, returned tensor) */
+    _SumDim(a, b, c, 1);
+    _SumDim(cMe, b, 1);
+    cUser = SumDim(*a, *b, 1);
+
+    /* check results */
+    cpuTest = c->CheckData(answer->data, aUnitNum) && 
+              cMe->CheckData(answer->data, aUnitNum) && 
+              cUser.CheckData(answer->data, aUnitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+
+    /* Initialize variables (cMeGPU is summed in place, so it has to start from zero like cMe) */
+    aGPU->SetZeroAll();
+    cMeGPU->SetZeroAll();
+    _SetDataFixedFloat(bGPU, 1.0F);
+
+    /* call sum function */
+    _SumDim(aGPU, bGPU, cGPU, 1);
+    _SumDim(cMeGPU, bGPU, 1);
+    cUserGPU = SumDim(*aGPU, *bGPU, 1);
+
+    /* check results against the expected answer built for the CPU test */
+    gpuTest = cGPU->CheckData(answer->data, aUnitNum) && 
+              cMeGPU->CheckData(answer->data, aUnitNum) && 
+              cUserGPU.CheckData(answer->data, aUnitNum);
+
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/* 
+case 4: tensor summation c = a + b * \beta (no \beta is passed in the calls below)
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting.
+In this case, a is all zeros and b is all ones, so every entry of the result must be one:
+(1000000, 50) + (50) = (1000000, 50), dim = 1.
+*/
+bool TestSumDim4()
+{
+    /* a tensor of size (1000000, 50) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 1000000;
+    aDimSize[1] = 50;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    /* a tensor of size (50) */
+    int bOrder = 1;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 50;
+
+    int bUnitNum = 1;
+    for (int i = 0; i < bOrder; i++)
+        bUnitNum *= bDimSize[i];
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors (answer holds the expected result and is compared against all outputs) */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(aOrder, aDimSize);
+    XTensor * cMe = NewTensor(aOrder, aDimSize);
+    XTensor * answer = NewTensor(aOrder, aDimSize);
+    XTensor cUser;
+
+    /* initialize variables: a = cMe = 0, b = 1, expected answer = 1 */
+    a->SetZeroAll();
+    cMe->SetZeroAll();
+    _SetDataFixedFloat(b, 1.0F);
+    _SetDataFixedFloat(answer, 1.0F);
+
+    /* call SumDim function (three forms: separate output, in place, returned tensor) */
+    _SumDim(a, b, c, 1);
+    _SumDim(cMe, b, 1);
+    cUser = SumDim(*a, *b, 1);
+
+    /* check results */
+    cpuTest = c->CheckData(answer->data, aUnitNum) && 
+              cMe->CheckData(answer->data, aUnitNum) && 
+              cUser.CheckData(answer->data, aUnitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+
+    /* Initialize variables (cMeGPU is summed in place, so it has to start from zero like cMe) */
+    aGPU->SetZeroAll();
+    cMeGPU->SetZeroAll();
+    _SetDataFixedFloat(bGPU, 1.0F);
+
+    /* call sum function */
+    _SumDim(aGPU, bGPU, cGPU, 1);
+    _SumDim(cMeGPU, bGPU, 1);
+    cUserGPU = SumDim(*aGPU, *bGPU, 1);
+
+    /* check results against the expected answer built for the CPU test */
+    gpuTest = cGPU->CheckData(answer->data, aUnitNum) && 
+              cMeGPU->CheckData(answer->data, aUnitNum) && 
+              cUserGPU.CheckData(answer->data, aUnitNum);
+
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
 /* other cases */
 /*
    TODO!!
@@ -279,6 +499,24 @@ bool TestSumDim()
    }
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");
+    
+    /* case 3 test */
+    caseFlag = TestSumDim3();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 3 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 3 passed!\n");
+        
+    ///* case 4 test */
+    //caseFlag = TestSumDim4();
+    //if (!caseFlag) {
+    //    returnFlag = false;
+    //    XPRINT(0, stdout, ">> case 4 failed!\n");
+    //}
+    //else
+    //    XPRINT(0, stdout, ">> case 4 passed!\n");

    /* other cases test */
    /*