Commit 03a9836e by xuchen

1. add some base functions; 2. a better implementation for t2t

parent 52c0e35a
......@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
......@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
......
......@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
......@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
......@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
......@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
_LossBackward(dedy, gold, y, lossName);
//_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold, NULL, padding);
}
}
\ No newline at end of file
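For reference, the padding-aware backward that replaces _LossBackward here is expected to compute the cross-entropy gradient dE/dy_k = -g_k / y_k and to zero it at padded positions. A minimal scalar sketch of that semantics (an illustrative helper, not the library routine):

/* per-cell cross-entropy gradient with padding, where
   E = -sum_k g_k * log(y_k) and pad is 1 (valid) or 0 (padded) */
float CrossEntropyGradCell(float y, float g, float pad)
{
    return pad * (-g / y);
}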
......@@ -36,13 +36,13 @@ class XLossGrad
public:
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName);
};
......
......@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
......
......@@ -55,7 +55,7 @@ void XNetClearAll()
XNet::XNet()
{
nodes.Clear();
isGradEfficient = true;
isGradEfficient = false;
}
/* de-constructor */
......@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
XList golds(1);
golds.Add(&gold);
Backward(roots, golds, loss);
XList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> padding - marks the target positions that are ignored and do not contribute to the loss computation
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
roots.Add(&root);
XList golds(1);
golds.Add(&gold);
XList paddings(1);
paddings.Add(&padding);
Backward(roots, golds, paddings, loss);
}
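The trainer later in this commit uses exactly this overload, for example:

/* back-propagate a cross-entropy loss while ignoring padded target positions */
net.Backward(output, gold, paddingDec, CROSSENTROPY);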
/*
......@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
XList golds(1);
golds.Add(NULL);
Backward(roots, golds, loss);
XList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
......@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> gold - a list of gold standards for the output
>> padding - marks the target positions that are ignored
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
{
Traverse(roots);
......@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;
......@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
Note that we do not need to obtain dE/dy here because it is of no use in the
following process of back-propagation */
if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
root->visitMark = NODE_FINISHED;
if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
else {
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, padding, loss);
}
}
/* we compute dE/dy (y is the output) if no predefined activation function is used */
else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, loss);
lossGrad.Compute(gold, root, root->grad, NULL, loss);
}
}
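This special case is valid because the loss can be fused with the output function: for y = softmax(s) trained with cross entropy E = -sum_k g_k * log(y_k), the combined gradient collapses to

    dE/ds_k = y_k - g_k

(and to exp(y_k) - g_k when y is a log-softmax output), so dE/dy never has to be materialized; _LogSoftmaxBackward and _SoftmaxBackward can write dE/dx directly.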
......@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
/*
backward propagation to obtain gradient
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
{
XList golds(roots.count);
for(int i = 0; i < roots.count; i++)
XList paddings(roots.count);
for(int i = 0; i < roots.count; i++) {
golds.Add(NULL);
paddings.Add(NULL);
}
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standards for the output
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
{
XList paddings(roots.count);
for(int i = 0; i < roots.count; i++)
paddings.Add(NULL);
Backward(roots, golds, loss);
Backward(roots, golds, paddings, loss);
}
/*
......
......@@ -62,17 +62,24 @@ struct XNet
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......
......@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
if(isEnd)
break;
Test(testFN, outputFN, model);
}
double elapsed = GetClockSec() - startT;
......@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* for y = softmax(s), we get dE/ds
where E is the error function (defined by the loss) */
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
/* for s = x * w, we get
dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
*/
#include <math.h>
#include "T2TDecoder.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
AttDecoder::AttDecoder()
{
attentionsEnde = NULL;
attEndeLayerNorms = NULL;
}
/* de-constructor */
AttDecoder::~AttDecoder()
{
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem);
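/* note: the encoder-decoder attention is created unmasked (false),
   since the decoder is presumably allowed to attend to every encoder position */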
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec);
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
XTensor ende;
XTensor ln;
XTensor fnn;
XTensor res;
XTensor nothing;
/******************/
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization */
x = attLayerNorms[i].Make(res);
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
res = Sum(ende, x);
/* layer normalization */
x = attEndeLayerNorms[i].Make(res);
/*******/
/* fnn */
fnn = fnns[i].Make(x, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
return x;
}
}
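A hedged usage sketch of the decoder, following how T2TModel wires it up in this commit (myIsMasked = true and myIgnored = 0, as in T2TModel::InitModel):

AttDecoder decoder;
decoder.InitModel(argc, argv, true, 0, devID, mem);
/* decode conditioned on the encoder output */
XTensor hidden = decoder.Make(inputDec, outputEnc, maskDec, isTraining);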
......@@ -22,19 +22,33 @@
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#include "T2TEncoder.h"
namespace transformer
{
class T2TDecoder
class AttDecoder : public AttEncoder
{
public:
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;
};
class AttDecoder : T2TDecoder
{
/* layer normalization for encoder-decoder attention */
T2TLN * attEndeLayerNorms;
public:
/* constructor */
AttDecoder();
/* de-constructor */
~AttDecoder();
/* initialize the model */
void InitModel(int argc, char ** argv);
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
};
}
......
......@@ -61,16 +61,17 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRand(-v, v);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
}
/*
make positional embeddings (of size eSize * length
eSize - embedding size
length - length of the sequenc
make positional embeddings (of size eSize * length)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
......@@ -114,15 +115,15 @@ make the network
*/
XTensor T2TEmbedder::Make(XTensor &input)
{
CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
//CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order - 1] = eSize;
dims[input.order] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
......@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
......@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* then we make word embeddings */
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
//wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
......
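The switch from MMul to Gather replaces a dense one-hot matrix product with a direct row lookup: for a word id t, the result row is w[t,:], which is exactly what onehot(t) * w would produce. A plain-array sketch of the equivalence (a hypothetical helper, for illustration only):

/* out = w[id, :] for a vSize x eSize embedding matrix w;
   identical to multiplying the one-hot vector of id with w */
void LookupRow(const float * w, int eSize, int id, float * out)
{
    for(int i = 0; i < eSize; i++)
        out[i] = w[id * eSize + i];
}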
......@@ -31,6 +31,10 @@ namespace transformer
/* constructor */
AttEncoder::AttEncoder()
{
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
}
/* de-constructor */
......
......@@ -59,10 +59,7 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
InitTensor1D(&w, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale / d);
w.SetDataRand(-finfout, finfout);
w.SetDataRand(1.0F, 1.0F);
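/* note: a uniform draw on [1.0, 1.0] fixes every element of w to 1,
   so the layer starts as an identity-style normalization */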
b.SetZeroAll();
}
......
......@@ -57,8 +57,8 @@ void T2TModel::InitModel(int argc, char ** argv)
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamInt(argc, argv, "memsize", &memSize, 1024);
LoadParamBool(argc, argv, "lm", &isLM, true);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamBool(argc, argv, "lm", &isLM, !isMT);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
......@@ -71,6 +71,9 @@ void T2TModel::InitModel(int argc, char ** argv)
encoder.InitModel(argc, argv, isLM, 0, devID, mem);
outputLayer.InitModel(argc, argv, devID, mem);
if(isMT)
decoder.InitModel(argc, argv, true, 0, devID, mem);
XList params(10);
GetParams(params);
......@@ -87,74 +90,161 @@ make the encoding network
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool isTraining)
XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
{
return encoder.Make(input, mask, isTraining);
}
/*
make the entire network (with the output softmax layer)
make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
>> output - output tensor (distribution)
>> mask - the mask for positions that are/not involved in computation
>> isTraining - indicates whether we are training the model
<< return - decoding result
*/
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
return decoder.Make(inputDec, outputEnc, mask, isTraining);
}
/*
make the network for language modeling (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
{
XTensor encoding;
if(isLM){
/* generate mask to see "previous" words only */
int len = input.GetDim(input.order - 2);
int * dims = new int[input.order + 1];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order] = len;
XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
/* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
/* generate mask to see "previous" words only */
//int len = input.GetDim(input.order - 2);
//int * dims = new int[input.order + 1];
//for(int i = 0; i < input.order; i++)
// dims[i + 1] = input.GetDim(i);
//dims[0] = nhead;
//dims[input.order] = len;
//XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
int len = input.GetDim(input.order - 1);
int * dims = new int[input.order + 2];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order + 1] = len;
XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
/* an upper triangular matrix whose upper-triangular cells are set to -1e9.
this matrix can be used to prevent attention to the words that follow the
current position in a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
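/* e.g., for len = 3 the (per-head) mask is
       0  -1e9  -1e9
       0     0  -1e9
       0     0     0
   so adding it to the attention scores suppresses, after softmax,
   every position to the right of the current word */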
/* mask of the padding */
_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
//XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
// padding.denseRatio, padding.devID, padding.mem);
//
///* mask of the padding */
//_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
//_Unsqueeze(padding2, padding3, 0, nhead);
//
//_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//
////_Sum(&mask, padding3, &mask);
encoding = MakeEncoder(input, mask, isTraining);
outputLayer.Make(encoding, output);
delete[] dims;
delete[] dimsPadding;
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
>> paddingEnc - padding of the sequences (on the encoder side)
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
/* generate mask to see "previous" words on the decoder side */
int len = inputDec.GetDim(inputDec.order - 2);
int * dims = new int[inputDec.order + 1];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order] = len;
InitTensor(&maskDec, inputDec.order + 1, dims, X_FLOAT, 1.0F, inputDec.devID, inputDec.mem);
_Sum(&mask, padding3, &mask);
/* an upper triangular matrix whose upper-triangular cells are set to -1e9.
this matrix can be used to prevent attention to the words that follow the
current position in a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
encoding = MakeEncoding(input, mask, isTraining);
outputLayer.Make(encoding, output);
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding2);
DelTensorBuf(padding3);
}
else{
ShowNTErrors("TODO!");
}
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
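/* padding3 is now 0 at valid positions and -1e9 at padded ones,
   so adding it to the attention scores masks out the padding */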
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
outputLayer.Make(decoding, output);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
......@@ -180,8 +270,33 @@ void T2TModel::GetParams(XList &list)
list.Add(&encoder.attLayerNorms[i].w);
list.Add(&encoder.attLayerNorms[i].b);
}
list.Add(&encoder.embedder.w);
if(isMT){
for(int i = 0; i < decoder.nlayer; i++){
list.Add(&decoder.fnns[i].w1);
list.Add(&decoder.fnns[i].b1);
list.Add(&decoder.fnns[i].w2);
list.Add(&decoder.fnns[i].b2);
list.Add(&decoder.attentionsEnde[i].wk);
list.Add(&decoder.attentionsEnde[i].wq);
list.Add(&decoder.attentionsEnde[i].wv);
list.Add(&decoder.attentionsEnde[i].wa);
list.Add(&decoder.attEndeLayerNorms[i].w);
list.Add(&decoder.attEndeLayerNorms[i].b);
list.Add(&decoder.attentions[i].wk);
list.Add(&decoder.attentions[i].wq);
list.Add(&decoder.attentions[i].wv);
list.Add(&decoder.attentions[i].wa);
list.Add(&decoder.fnnLayerNorms[i].w);
list.Add(&decoder.fnnLayerNorms[i].b);
list.Add(&decoder.attLayerNorms[i].w);
list.Add(&decoder.attLayerNorms[i].b);
}
list.Add(&decoder.embedder.w);
}
}
/*
......
......@@ -69,10 +69,16 @@ public:
void InitModel(int argc, char ** argv);
/* make the encoding network */
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
/* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the decoding network */
XTensor MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining);
/* get parameter matrics */
void GetParams(XList &list);
......
......@@ -66,6 +66,9 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
DTYPE v = 1.0F/(float)sqrt((float)hSize);
w.SetDataRandn(0, v);
}
/*
......@@ -90,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, w), -1);
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
}
}
......@@ -101,6 +101,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamInt(argc, argv, "d", &d, 512);
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
LoadParamBool(argc, argv, "adam", &useAdam, false);
......@@ -113,6 +114,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, false);
buf = new int[bufSize];
buf2 = new int[bufSize];
......@@ -122,6 +124,9 @@ void T2TTrainer::Init(int argc, char ** argv)
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
validStep = 0;
curEpoch = 0;
}
int tc = 0;
......@@ -133,9 +138,10 @@ train the model
>> modelFN - where we keep the model
>> model - model to train
*/
void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
bool T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
{
int epoch = 0;
curEpoch += 1;
int step = 0;
int wc = 0;
int wordCount = 0;
......@@ -147,7 +153,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
int nCheckpoint = 0;
int nSkipped = 0;
int gradStep = 0;
int validStep = 0;
//int validStep = 0;
char * trainFN = new char[(int)strlen(fn) + 10];
strcpy(trainFN, fn);
......@@ -157,18 +163,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
sprintf(trainFN, "%s.random", fn);
#endif
PrepareModel(model);
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
PrepareModel(model);
double startT = GetClockSec();
for(epoch = 1; epoch <= nepoch; epoch++){
//for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
if(isShuffled)
Shuffle(fn, trainFN);
if(isShuffled)
Shuffle(fn, trainFN);
#endif
FILE * file = fopen(trainFN, "rb");
......@@ -177,11 +183,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
wordCount = 0;
loss = 0;
/* batch of input sequences */
XTensor batch;
/* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc;
XTensor batchDec;
/* padding */
XTensor padding;
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
......@@ -189,26 +197,40 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
while (LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)) {
while (LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, wc, devID, mem, true))
{
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
//CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
/* make the network */
model->Make(batch, output, padding, true);
if(model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, true);
else if(model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, true);
else{
ShowNTErrors("Illegal model type!");
}
/* back-propagation for obtaining gradients */
if (labelSmoothingP > 0)
LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
/* make paddings for the output */
if (output.GetDim(0) > 1)
PadOutput(&output, &gold, &padding);
//if (output.GetDim(0) > 1)
// PadOutput(&output, &gold, &paddingDec);
//output.Dump(tmpFILE, "output: ");
//fflush(tmpFILE);
/* get probabilities */
float prob = GetProb(&output, &gold, NULL);
DTYPE lossLocal = -prob / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
......@@ -217,18 +239,11 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (doUpdate) {
/* rescale the output for normalized loss */
RescaleOutput(&output, &g, &padding);
//RescaleOutput(&output, &g, &paddingDec);
/* back-propagation */
net.Backward(output, g, CROSSENTROPY);
/*for(int i = 0; i < net.nodes.count; i++){
XTensor * node = (XTensor*)net.nodes.Get(i);
XLink::ShowNode(stderr, node);
}
exit(0);*/
net.Backward(output, g, paddingDec, CROSSENTROPY);
gradStep += 1;
loss += -prob;
wordCount += wc;
......@@ -255,10 +270,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
break;
}
if (step % 1 == 0) {
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
lr, elapsed, step, curEpoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
......@@ -274,20 +289,20 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
fclose(file);
if (isEnd)
break;
if(useEpochCheckpoint)
MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, nepoch);
XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
elapsed, step, nSkipped, epoch);
return false;
return true;
//if(useEpochCheckpoint)
// MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
//}
//double elapsed = GetClockSec() - startT;
//
//epoch = MIN(epoch, nepoch);
//
//XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
// lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
//XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
// elapsed, step, nSkipped, epoch);
delete[] trainFN;
}
......@@ -322,10 +337,12 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
wordCount = 0;
/* batch of input sequences */
XTensor batch;
XTensor batchEnc;
XTensor batchDec;
/* padding */
XTensor padding;
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
......@@ -335,18 +352,28 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
ClearBuf();
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, false, wc, devID, mem)){
while(LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold,
seqs, vSize, vSizeTgt,
1, 1, false, wc, devID, mem, false))
{
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
//CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
/* make the network */
model->Make(batch, output, padding, false);
if(model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, false);
else if(model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, false);
else{
ShowNTErrors("Illegal model type!");
}
int bSize = batch.GetDim(0);
int length = batch.GetDim(1);
int bSize = output.GetDim(0);
int length = output.GetDim(1);
/* prediction probabilities */
XTensor probs;
......@@ -391,7 +418,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed, wordCountTotal, exp(loss / wordCount));
}
......@@ -511,6 +538,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
/* sort the sequences by length */
if (isSorted) {
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
SampleNode * nodes = new SampleNode[seqCount];
int count = 0;
int offset = 0;
......@@ -526,19 +554,18 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
offset += node.size;
}
qsort(nodes, seqCount, sizeof(SampleNode), CompareSampleNode);
qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);
count = 0;
offset = 0;
for(int i = 0; i < seqCount; i++){
for(int i = 0; i < seqCount; i += step){
SampleNode &node = nodes[count];
//fprintf(stderr, "%d %d %d\n", node.size, node.id, node.value);
memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for(int j = 0; j < step; j++){
seqLen2[count + j] = seqLen[node.id + j];
seqOffset[count + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
seqLen2[i + j] = seqLen[node.id + j];
seqOffset[i + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
}
count += step;
count += 1;
offset += node.size;
}
......@@ -546,6 +573,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
buf = buf2;
buf2 = tmp;
tmp = seqLen;
seqLen = seqLen2;
seqLen2 = tmp;
......@@ -562,32 +590,79 @@ void T2TTrainer::ClearBuf()
nextSeq = -1;
}
/*
load a batch of sequences
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batch - the batch of the input sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> step - the step we go over when move to the next sequence
>> vs - vocabulary size
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TTrainer::LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int step, int vs, int sBatch, int wBatch,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
int devID, XMem * mem,
bool isTraining)
{
if(isLM){
return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold,
seqs, vsEnc, sBatch, wBatch,
isSorted, wCount, devID, mem, isTraining);
}
else{
return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold,
seqs, vsEnc, vsDec, sBatch, wBatch,
isSorted, wCount, devID, mem, isTraining);
}
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> vs - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TTrainer::LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, step);
LoadBuf(file, isSorted, 1);
int seq = MAX(nextSeq, 0);
int wc = 0;
......@@ -604,7 +679,8 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
if(max < wn)
max = wn;
if(sc >= sBatch && wc >= wBatch)
int tc = isSmallBatch ? max * sc : wc;
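/* effective batch size: tokens including padding (max * sc) if
   isSmallBatch is set, otherwise the real word count (wc) */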
if(sc >= sBatch && tc >= wBatch)
break;
}
......@@ -614,74 +690,205 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
if(sc <= 0)
return 0;
if(isLM){
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(batch->grad == NULL)
XNoder::MakeGrad(batch);
else
InitTensor(batch->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(padding->grad == NULL)
XNoder::MakeGrad(padding);
else
InitTensor2D(padding->grad, sc, max, X_FLOAT, devID, mem);
if(output->grad == NULL)
XNoder::MakeGrad(output);
else
InitTensor(output->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
batch->SetZeroAll();
padding->SetZeroAll();
output->SetZeroAll();
batch->grad->SetZeroAll();
padding->grad->SetZeroAll();
output->grad->SetZeroAll();
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
padding->Set2D(1.0F, s - seq, w);
if(w > 0)
output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if(w == len - 1){
if(isDoubledEnd)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor(batchEnc, 2, dims, X_INT, 1.0F, -1);
//InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
gold->SetZeroAll();
paddingDec->SetZeroAll();
if(isTraining) {
//XNoder::MakeGrad(batchEnc);
XNoder::MakeGrad(paddingEnc);
XNoder::MakeGrad(gold);
XNoder::MakeGrad(paddingDec);
//batchEnc->grad->SetZeroAll();
paddingEnc->grad->SetZeroAll();
gold->grad->SetZeroAll();
paddingDec->grad->SetZeroAll();
}
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
//batchEnc->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
paddingEnc->Set2D(1.0F, s - seq, w);
paddingDec->Set2D(1.0F, s - seq, w);
if (w > 0)
gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if (w == len - 1) {
if (isDoubledEnd)
gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
    gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
else
fprintf(tf, "\n");*/
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if(seqs != NULL){
for(int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
}
fflush(tf);
return sc;
}
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TTrainer::LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 2);
int seq = MAX(nextSeq, 0);
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");
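/* the buffer interleaves the bilingual data: even positions hold the
   source-side sequences, odd positions the paired target-side ones */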
while(seq + sc < nseqBuf){
/* source-side sequence */
wnEnc = seqLen[seq + sc];
wcEnc += wnEnc;
sc += 1;
if(maxEnc < wnEnc)
maxEnc = wnEnc;
/* target-side sequence */
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
wnDec = len;
wcDec += wnDec;
sc += 1;
if(maxDec < wnDec)
maxDec = wnDec;
int tc = isSmallBatch ? maxEnc * sc / 2 : wcEnc;
if(sc >= sBatch * 2 && tc >= wBatch)
break;
}
nextSeq = seq + sc;
if(sc <= 0)
return 0;
int sCount = sc/2;
int seqSize = 0;
int dimsEnc[3] = {sCount, maxEnc, vsEnc};
int dimsDec[3] = {sCount, maxDec, vsDec};
InitTensor(batchEnc, 3, dimsEnc, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
InitTensor(batchDec, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
batchDec->SetZeroAll();
paddingDec->SetZeroAll();
gold->SetZeroAll();
wCount = 0;
/* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq)/2;
for(int w = 0; w < len; w++){
batchEnc->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
paddingEnc->Set2D(1.0F, sent, w);
wCount++;
}
}
/* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= maxDec, "Something is wrong!");
int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){
paddingDec->Set2D(1.0F, sent, w);
batchDec->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
if(w > 0)
gold->Set3D(1.0F, sent, w - 1, buf[seqOffset[s] + w]);
if (w == len - 1) {
if(isDoubledEnd)
gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
else
gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
fflush(tf);
if(seqs != NULL){
for(int w = len; w < maxDec; w++)
seqs[seqSize++] = -1;
}
}
return sc;
......@@ -715,8 +922,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
XTensor probs;
InitTensor(&probs, output);
XTensor logOutput;
InitTensor(&logOutput, output);
_Log(output, &logOutput);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(output, gold, &probs);
_Multiply(&logOutput, gold, &probs);
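/* probs now holds gold * log(output); since the output layer produces
   softmax probabilities, summing probs yields the total log-probability,
   i.e., the negative cross-entropy */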
/* probability of each word */
XTensor wprobs;
......@@ -730,7 +941,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
_CopyValues(&wprobs, wordProbs);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
dims[0] = 1;
dims[1] = probs.unitNum;
probs.Reshape(2, dims);
......@@ -885,18 +1096,13 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
{
CheckNTErrors(output->order == 3, "Wrong dimension number!");
CheckNTErrors(gold->order == 3, "Wrong dimension number!");
int num = padding->GetDim(0);
XTensor * factor = NewTensorBuf(1, &num, padding->dataType, 1.0F, padding->devID, padding->mem);
_ReduceSum(padding, factor, padding->order - 1);
DTYPE count = _ReduceSumAll(padding);
_ExpMe(output);
_DivDim(output, factor, output, 0);
_ScaleAndShiftMe(output, 1/count);
_LogMe(output);
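/* in the log domain this is x -> x - log(count),
   since log(exp(x) / count) = x - log(count) */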
_DivDim(gold, factor, gold, 0);
DelTensorBuf(factor);
_ScaleAndShiftMe(gold, 1/count);
}
/*
......
......@@ -79,6 +79,9 @@ public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */
float lrate;
......@@ -100,6 +103,10 @@ public:
/* indicates whether we use adam */
bool useAdam;
/* counter used for checkpoints on the validation data (reset in Init) */
int validStep;

/* current epoch; incremented on each call to Train */
int curEpoch;
/* hyper parameters of adam */
float adamBeta1;
float adamBeta2;
......@@ -128,8 +135,13 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we double the </s> symble for the output of lms */
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
public:
/* constructor */
......@@ -142,7 +154,7 @@ public:
void Init(int argc, char ** argv);
/* train the model */
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
......@@ -158,11 +170,34 @@ public:
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int step, int vs, int sBatch, int wBatch,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
......
......@@ -25,6 +25,8 @@
#include "T2TUtility.h"
#include "T2TTrainer.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
namespace transformer
{
......@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
T2TTrainer trainer;
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
if(strcmp(trainFN, "")) {
double startT = GetClockSec();
T2TTrainer trainer;
trainer.Init(argc, args);
char * fn = new char[MAX_LINE_LENGTH];
char * fn1 = new char[MAX_LINE_LENGTH];
char * fn2 = new char[MAX_LINE_LENGTH];
modelFN = strcmp(modelFN, "") ? modelFN : (char *)"checkpoint.model";
int epoch;
bool isTrain;
for(epoch = 1; epoch <= trainer.nepoch; epoch++) {
sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
if(epoch == 1) {
T2TModel model;
model.InitModel(argc, args);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
else {
T2TModel model;
model.InitModel(argc, args);
model.Read(fn);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
if(trainer.useEpochCheckpoint && strcmp(testFN, "")) {
T2TTrainer tester;
tester.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
model.Read(fn1);
tester.Test(testFN, fn2, &model);
}
if(!isTrain)
break;
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, trainer.nepoch);
XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);
delete[] fn;
delete[] fn1;
delete[] fn2;
}
/* don't dump the final model */
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
// model.Dump(modelFN);
T2TModel model;
model.InitModel(argc, args);
/* load the model if necessary */
if(strcmp(modelFN, ""))
......
......@@ -446,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x");
CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] &&
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
blockSize[0] = bXSize;
blockSize[1] = bYSize;
......
......@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
cublasDestroy(cublasHandle);
if(myIsForComputation)
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle.");
CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
SetDevice(devIDBackup);
#endif
......@@ -1392,8 +1393,8 @@ void XMem::CreateBLASHandle()
"Cannot destroy the cublas handle.");
}
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
#endif
}
......
......@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)
/*
set the value of a cell
>> value - value to assign to the cell
>> value - the value we intend to set
>> index - index of the cell for each dimension
>>
>> size - size of the index
*/
bool XTensor::Set(DTYPE value, int index[], int size)
{
......@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)
/*
set the value of a cell in a 1d tensor
>> value - value to assign to the cell
>> value - the value we intend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1D(DTYPE value, int i)
{
......@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
return SetToDevice(devID, GetCell(dims, 3), value);
}
/*
set the integer value of a cell
>> value - the value we intend to set
>> index - index of the cell for each dimension
>> size - size of the index
<< return - succeeded or not
*/
bool XTensor::SetInt(int value, int index[], int size)
{
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
return SetToDeviceInt(devID, GetCell(index, size), value);
}
/*
set the integer value of a cell in a 1d tensor
>> value - the value we intend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1DInt(int value, int i)
{
CheckNTErrors((order == 1), "Cannot set a 1d cell for a tensor whose order is not 1!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[1] = {i};
return SetToDeviceInt(devID, GetCell(dims, 1), value);
}
/*
set the integer value of a cell in a 2d tensor
>> value - the value we intend to set
>> ni - row index
>> mi - column index
<< return - succeeded or not
*/
bool XTensor::Set2DInt(int value, int ni, int mi)
{
CheckNTErrors((order == 2), "Cannot set a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[2] = {ni, mi};
return SetToDeviceInt(devID, GetCell(dims, 2), value);
}
/*
set the integer value of a cell in a 3d tensor
>> value - the value we intend to set
>> d0 - index of dimension 0
>> d1 - index of dimension 1
>> d2 - index of dimension 2
<< return - succeeded or not
*/
bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
{
CheckNTErrors(order == 3, "Cannot set a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[3] = {d0, d1, d2};
return SetToDeviceInt(devID, GetCell(dims, 3), value);
}
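These integer setters are what the new batch loader relies on; LoadBatchLM, for instance, fills the id batch with

batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);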
/*
increase the value of a cell in a 2d tensor
>> value - the value we intend to set
......@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
}
if(myMem != NULL)
tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
else
......
......@@ -326,6 +326,18 @@ public:
/* set the value of a cell in a 3d tensor */
bool Set3D(DTYPE value, int d0, int d1, int d2);
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i);
/* set the integer value of a cell in a 2d tensor */
bool Set2DInt(int value, int ni, int mi);
/* set the integer value of a cell in a 3d tensor */
bool Set3DInt(int value, int d0, int d1, int d2);
/* increase the value of a cell in a 2d */
bool Add2D(DTYPE value, int ni, int mi);
......
......@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
return true;
}
/* assign an integer number to a variable that is kept on a specified device */
bool SetToDeviceInt(int devID, void * p, int value)
{
if(p == NULL)
return false;
if(devID < 0)
*(int*)p = value;
else{
XMemCopy(p, devID, &value, -1, sizeof(int));
}
return true;
}
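XTensor::Set2DInt (earlier in this commit), for example, ends with

return SetToDeviceInt(devID, GetCell(dims, 2), value);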
/* get the next number with power of 2 */
unsigned int GetNextPower2(unsigned int n)
{
......
......@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
extern DTYPE ToCPU(int devID, void * value);
extern int ToCPUInt(int devID, void * value);
extern bool SetToDevice(int devID, void * p, DTYPE value);
extern bool SetToDeviceInt(int devID, void * p, int value);
extern unsigned int GetNextPower2(unsigned int n);
extern void XSleep(int sleepTime);
extern double GetClock();
......
......@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize;
}
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0) * std;
_SetDataRand(tensor, -a, a);
DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
tensor->SetDataRand(-finfout, finfout);
//_SetDataRand(tensor, -finfout, finfout);
}
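The replacement above is the same Xavier/Glorot uniform initialization written directly: sqrt(3) * gain * sqrt(2/(fanIn + fanOut)) equals gain * sqrt(6/(fanIn + fanOut)). A standalone check (a sketch with made-up fan sizes):

#include <cassert>
#include <cmath>

int main()
{
    float gain = 1.0F;
    float fanIn = 512.0F, fanOut = 2048.0F;              /* hypothetical fan sizes */
    float viaStd = (float)sqrt(3.0) * gain * (float)sqrt(2.0 / (fanIn + fanOut));
    float direct = gain * (float)sqrt(6.0 / (fanIn + fanOut));
    assert(fabs(viaStd - direct) < 1e-6);                /* identical bounds */
    return 0;
}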
/*
......@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
if(tensor == NULL)
return;
/* GPU code */
/* CPU code */
if(tensor->devID < 0){
DTYPE variance = upper - lower;
......
......@@ -21,6 +21,8 @@
#include "Gather.h"
#include "CopyIndexed.h"
#include "../../XUtility.h"
#include "../shape/Reshape.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
return result;
}
/*
gather indexed sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor (2d)
>> index - the index tensor
<< return - the result of copying indexed sub-tensors
*/
XTensor Gather(const XTensor &s, const XTensor &index)
{
int indexSize = index.unitNum;
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int * srcIndex = new int[index.unitNum];
if(index.dataType == X_INT) {
XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * index.unitSize);
}
else if(index.dataType == X_FLOAT || index.dataType == X_DOUBLE) {
DTYPE * tmp = new DTYPE[indexSize];
XMemCopy(tmp, -1, index.data, index.devID, indexSize * index.unitSize);
for(int i = 0; i < indexSize; i++)
srcIndex[i] = (int)tmp[i];
delete[] tmp;
}
else {
    /* guard against leaving srcIndex uninitialized for unsupported types */
    ShowNTErrors("Unsupported index data type in Gather!");
}
XTensor tensor;
tensor = Gather(s, 0, srcIndex, indexSize);
delete[] srcIndex;
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = tensor.GetDim(-1);
XTensor t;
t = Reshape(tensor, index.order + 1, dims);
delete[] dims;
return t;
}
else {
return tensor;
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
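A hypothetical use of the new Gather overload together with the new Set2DInt setter (a sketch; the shapes are made up). A 2d index tensor selects rows of the table, and the result takes the index's shape plus the embedding dimension:

XTensor table;                        /* (vocab = 10, embDim = 4) */
InitTensor2D(&table, 10, 4, X_FLOAT);
table.SetDataRand(-0.1F, 0.1F);

XTensor index;                        /* (batch = 2, seqLen = 3) */
InitTensor2D(&index, 2, 3, X_INT);
for(int i = 0; i < 2; i++)
    for(int j = 0; j < 3; j++)
        index.Set2DInt((i * 3 + j) % 10, i, j);

XTensor emb = Gather(table, index);   /* result shape: (2, 3, 4) */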
......@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, const XTensor &index);
} // namespace nts(NiuTrans.Tensor)
#endif // __GATHER_H__
\ No newline at end of file
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <math.h>
#include "ReduceSum.h"
......
......@@ -105,15 +105,15 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = threadIdx.x * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
int idx = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int i = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int j = blockIdx.x*blockDim.x + threadIdx.x;
if(i >= stride * blockNum)
return;
if(threadIdx.y == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
if(threadIdx.x == 0)
bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
__syncthreads();
......@@ -121,7 +121,7 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
int iOffset = i % stride;
bool isValid = (i < stride * blockNum && j < strideNum);
DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.x] : 0;
DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.y] : 0;
if(power != (DTYPE)1.0){
if(power == (DTYPE)2.0)
......@@ -136,21 +136,20 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
value = exp(value);
/* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
iData[threadIdx.y * blockDim.x + threadIdx.x] = value;
__syncthreads();
/* do reduction in shared mem */
for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){
if (threadIdx.y < s)
for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
if (threadIdx.x < s)
iData[idx] += iData[idx + s];
__syncthreads();
}
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
if (threadIdx.x == 0 && blockIdx.x < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = iData[threadIdx.y * blockDim.x];
}
/*
......@@ -282,15 +281,15 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int tid = threadIdx.x;
unsigned int j = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
unsigned int i = blockIdx.y * blockDim.y + threadIdx.y;
if(i >= stride * blockNum)
return;
if (threadIdx.y == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
if (threadIdx.x == 0)
bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
__syncthreads();
......@@ -299,17 +298,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
int iOffset = i % stride;
bool isValid = j < strideNum;
bool isValid2 = j + blockDim.y < strideNum;
bool isValid2 = j + blockDim.x < strideNum;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * data = iData + threadIdx.y * blockDim.x;
DTYPE * inputData = input + k * blockSize;
DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.x]: 0;
DTYPE value2 = isValid2 ? inputData[(j + blockDim.y) * stride + iOffset] - bias[threadIdx.x]: 0;
DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.y]: 0;
DTYPE value2 = isValid2 ? inputData[(j + blockDim.x) * stride + iOffset] - bias[threadIdx.y]: 0;
if(power != (DTYPE)1.0){
if(power == (DTYPE)2.0){
value = value * value;
value2 = value2 *value2;
value2 = value2 * value2;
}
else if(power == (DTYPE)0.5){
value = sqrt(value);
......@@ -329,17 +328,25 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
}
value = value + value2;
__syncthreads();
value = shflDownReduceSum(value);
if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
if ((tid & 0x1f) == 0)
data[tid / 32] = value;
__syncthreads();
if (tid < 32){
if (tid < blockDim.y / 32)
if (tid < blockDim.x / 32)
value = data[tid];
else value = 0;
value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
else
value = 0;
value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.x < reducedStrideNum) {
output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = value;
}
}
}
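The x/y index swap in these kernels puts the reduction dimension on threadIdx.x, so the 32 lanes of a warp hold consecutive partial sums and shflDownReduceSum can combine them without touching shared memory. The commit only calls that helper; a plausible implementation looks like this (an assumption, not the repository's actual code; CUDA versions before 9.0 would use __shfl_down instead of __shfl_down_sync):

__device__ __forceinline__
DTYPE shflDownReduceSum(DTYPE value)
{
    /* tree reduction across the 32 lanes of a warp */
    for (int offset = 16; offset > 0; offset >>= 1)
        value += __shfl_down_sync(0xffffffff, value, offset);
    return value;
}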
......@@ -480,7 +487,7 @@ void KernelReduceSumFast(__half * input, __half * output,
if data storage is discontinuous, use this way to reduce
*/
__global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
int blockNum, DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......@@ -568,7 +575,8 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output,
if (tid < 32){
if (tid < blockDim.y / 32)
threadSum = data[tid];
else threadSum = 0;
else
threadSum = 0;
threadSum = shflDownReduceSum(threadSum);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
......@@ -640,29 +648,28 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
/*
in this situation we use block.x * grid.x to process one vector, which keeps reads contiguous
*/
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
void discontinuousStorageNoShareMemThreadAllocation(dim3* grid, dim3* block, int stride, int blockNum)
{
block.x = 512;
block.y = 1;
block->x = 512;
block->y = 1;
if ((stride * blockNum) % 512 == 0)
grid.x = (stride * blockNum) / 512;
grid->x = (stride * blockNum) / 512;
else
grid.x = (stride * blockNum) / 512 + 1;
grid.y = 1;
grid->x = (stride * blockNum) / 512 + 1;
grid->y = 1;
}
/*
adjust the thread count along x so that warp-level optimization can be used
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
void adjustThreadForUseWarpOptimization(dim3* blocks, dim3* threads)
{
if (threads.x > 1){
blocks.x *= threads.x;
threads.x = 1;
if (threads->y > 1){
blocks->y *= threads->y;
threads->y = 1;
}
if (threads.y < 32)
threads.y = 32;
if (threads->x < 32)
threads->x = 32;
}
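A worked example of the pointer-based allocation above (a sketch): starting from threads = (64, 4) and blocks = (10, 8), the y-threads are folded into blocks.y and threads.x already satisfies the warp-size floor:

dim3 blocks(10, 8), threads(64, 4);
adjustThreadForUseWarpOptimization(&blocks, &threads);
/* now threads = (64, 1) and blocks = (10, 32); with at least 32 threads
   along x, shflDownReduceSum always reduces over full warps */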
/*
......@@ -724,7 +731,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
......@@ -733,19 +740,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
if (blockNum % 4 != 0)
blockNum = (int)(blockNum / 4) + 1;
else
blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data,
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, blockNum, sp, power, isExp);
}
......@@ -769,50 +780,50 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
}
......
......@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
>> source - the input tensor
<< return - the total summation
*/
DTYPE _ReduceSumAll(XTensor * source)
DTYPE _ReduceSumAll(const XTensor * source)
{
int order = source->order;
DTYPE summation;
XTensor * big = NewTensor(source);
_CopyValues(source, big);
for(int i = 0; i < order; i++) {
if(i == order - 1)
big->Reshape(big->unitNum, 1);
for(int i = order - 1; i >= 0; i--) {
if(i == 0)
big->Reshape(1, big->unitNum);
int leadingDim = big->order - 1;
int * dimSize;
dimSize = getDimSize(big, 0);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
_ReduceSum(big, little, 0);
_ReduceSum(big, little, leadingDim);
delete big;
delete[] dimSize;
......@@ -81,7 +82,7 @@ sum all the items of the tensor
>> source - the input tensor
<< return - the total summation
*/
DTYPE ReduceSumAll(XTensor & source)
DTYPE ReduceSumAll(const XTensor & source)
{
return _ReduceSumAll(&source);
}
......
......@@ -28,10 +28,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */
DTYPE _ReduceSumAll(XTensor * source);
DTYPE _ReduceSumAll(const XTensor * source);
/* sum all the items of the tensor */
DTYPE ReduceSumAll(XTensor & source);
DTYPE ReduceSumAll(const XTensor & source);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
const XTensor * padding, int leadingDim)
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss), "The loss tensor and padding tensor must be same shape!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
/* l = log(output) */
_Log(output, logBuf);
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
}
/* negate result n = negate(mul) */
_NegateMe(mulBuf);
XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
_ReduceSum(mulBuf, loss, n);
/* loss = reduce_sum_n( -(weight *) gold * log(output) ) */
_Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
if(weight != NULL)
    _MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
DelTensorBuf(mulBuf);
DelTensorBuf(logBuf);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
}
/*
......@@ -109,19 +96,12 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
......@@ -133,6 +113,22 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i - 1)), "Unmatched tensors!");
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
......@@ -148,31 +144,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * lossData = (DTYPE*)loss->data;
DTYPE tmpLoss;
int lossPos;
int goldPos;
if(weight == NULL) {
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
for(int j = 0; j < stride; j++) {
tmpLoss = 0;
lossPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
*(lossData + i) = 0;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + lossPos) == 0)
*(lossData + lossPos) = 0;
else {
tmpLoss = 0;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
}
......@@ -181,30 +186,36 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
for(int j = 0; j < stride; j++) {
tmpLoss = 0;
lossPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
*(lossData + i) = 0;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + lossPos) == 0)
*(lossData + lossPos) = 0;
else {
tmpLoss = 0;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
}
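For reference, the index layout used by the rewritten loops: with leading (class) dimension n, stride is the product of the dimensions after n and blockSize = stride * leadingDimSize, so the element for (block i, inner offset j, class k) sits at

goldPos = i * blockSize + j + k * stride

while the per-position loss sits at lossPos = i * stride + j. The old loops accumulated a whole block into one loss value, which is only equivalent when n is the last dimension (stride == 1); the new loops produce one loss per non-class position.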
......@@ -212,26 +223,6 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
}
/*
get the dimSize after reduce operation
>> tensor - a tensor to be reduced
>> n - the reduce dimension
<< return - the pointer of dimSize
*/
int * reduceDimSize(const XTensor * tensor, int n)
{
int order = tensor->order;
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = tensor->dimSize[i];
else if(i > n)
dimSize[i - 1] = tensor->dimSize[i];
}
return dimSize;
}
/*
compute the cross entropy loss
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
......@@ -247,73 +238,45 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1, "The loss tensor and padding tensor must be same shape!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
/* l = log(output) */
_Log(output, logBuf);
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = output->dimSize[i];
else if(i > n)
dimSize[i - 1] = output->dimSize[i];
}
/* negate multiply result n = negate(mul) */
_NegateMe(mulBuf);
int * dimSize;
dimSize = reduceDimSize(output, n);
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
/* reduce sum all classes */
_ReduceSum(mulBuf, lossInter, n);
XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
DelTensorBuf(mulBuf);
DelTensorBuf(logBuf);
DTYPE loss;
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
/* compute the total loss */
if(padding != NULL) {
XTensor * temp = NewTensor(lossInter);
_Multiply(lossInter, padding, temp);
loss = _ReduceSumAll(temp);
delete temp;
}
else
loss = _ReduceSumAll(lossInter);
loss = _ReduceSumAll(lossBuf);
if(reduceWay == REDUCE_MEAN) {
int nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossInter->unitNum;
nonZeroNum = lossBuf->unitNum;
}
else {
XTensor * tmp = NewTensor(padding);
XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp;
DelTensorBuf(tmp);
}
loss = loss / (DTYPE)nonZeroNum;
......@@ -326,7 +289,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
}
delete[] dimSize;
delete lossInter;
DelTensorBuf(lossBuf);
return loss;
}
......@@ -349,11 +312,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
......@@ -370,6 +329,23 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
......@@ -383,63 +359,78 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
DTYPE loss = 0;
int paddingPos;
int goldPos;
int nonZeroNum = 0;
if(weight == NULL) {
if(padding == NULL) {
nonZeroNum = blockNum;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
nonZeroNum = blockNum * stride;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0)
continue;
else{
nonZeroNum += 1;
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
if(*(paddingData + paddingPos) == 0)
continue;
else {
nonZeroNum += 1;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
}
}
}
}
}
else {
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
nonZeroNum = blockNum;
nonZeroNum = blockNum * stride;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0)
continue;
else{
nonZeroNum += 1;
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
if(*(paddingData + paddingPos) == 0)
continue;
else {
nonZeroNum += 1;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
}
}
}
}
}
}
......@@ -471,17 +462,10 @@ with respect to gold standard, and y this the model output
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight, const XTensor * padding,
int leadingDim)
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight,
XTensor * padding, int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
......@@ -497,7 +481,26 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
......@@ -512,25 +515,35 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
int paddingPos;
int goldPos;
if(weight == NULL) {
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
*(dedyData + goldPos) = -(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * unitSize);
else
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
if(*(paddingData + paddingPos) == 0)
*(dedyData + goldPos) = 0;
else
*(dedyData + goldPos) = -(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
}
......@@ -538,39 +551,45 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
*(dedyData + goldPos) = -(*(weightData + k)) *
(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * unitSize);
else
for(int j = 0; j < blockSize; j++) {
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
if(*(paddingData + paddingPos) == 0)
*(dedyData + goldPos) = 0;
else
*(dedyData + goldPos) = -(*(weightData + k)) *
(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
}
}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
}
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
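For reference, the gradient implemented above follows directly from loss = sum_{i} (-gold_i * log(output_i)):

dedy_i = -gold_i / output_i   (with class weights: dedy_i = -weight_i * gold_i / output_i)

Padded positions are written as zero; the 1/N normalization that used to be applied here is commented out, so the backward pass now returns the unnormalized gradient.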
......@@ -26,80 +26,20 @@
#include "../XDevice.h"
#include "CrossEntropy.cuh"
#include "CrossEntropy.h"
#include "../core/reduce/ReduceSumAll.h"
#include "../core/arithmetic/Div.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
#include "../core/shape/Transpose.h"
#include "../core/shape/Unsqueeze.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
compute the cross entropy loss (cuda kernel)
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> lossData - the data pointer of loss tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> stride - the size of a data block
*/
__global__
void KernelCrossEntropy(DTYPE * outputData, DTYPE * goldData,
DTYPE * lossData, DTYPE * weightData,
DTYPE * paddingData, int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
DTYPE tmpLoss = 0;
if(weightData == NULL) {
if(paddingData == NULL) {
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
}
}
else {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
}
}
}
/*
/*
compute the cross entropy loss (cuda version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
......@@ -112,79 +52,27 @@ where gold and output are distributions
>> leadingDim - the leading dimension for the output
*/
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1,
"Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
//GDevs.GetCudaThread2D(output->devID, blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
_Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
int devIDBackup;
ProtectCudaDev(output->devID, devIDBackup);
if(weight != NULL)
_MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
DTYPE * lossData = (DTYPE*)loss->data;
if(weight == NULL) {
if(padding == NULL)
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
NULL, NULL,
blockNum, blockSize);
else
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
NULL, (DTYPE*)padding->data,
blockNum, blockSize);
}
else {
if(padding == NULL)
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
(DTYPE*)weight->data, NULL,
blockNum, blockSize);
else
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
(DTYPE*)weight->data, (DTYPE*)padding->data,
blockNum, blockSize);
}
BacktoCudaDev(output->devID, devIDBackup);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
}
/*
......@@ -230,87 +118,38 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
_CudaCrossEntropyFast(output, gold, lossInter, weight, padding, leadingDim);
_CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossInter);
loss = _ReduceSumAll(lossBuf);
if(reduceWay == REDUCE_MEAN) {
int nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossInter->unitNum;
nonZeroNum = lossBuf->unitNum;
}
else {
XTensor * tmp = NewTensor(padding);
XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp;
DelTensorBuf(tmp);
}
loss = loss / (DTYPE)nonZeroNum;
}
return loss;
}
/*
backward computation of cross entropy function (kernel version)
>> dedyData - the data pointer of dedy tensor
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> blockSize - the size of a data block
*/
__global__
void KernelCrossEntropyBackward(DTYPE * dedyData, DTYPE * outputData, DTYPE * goldData,
DTYPE * weightData, DTYPE * paddingData,
int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
if(weightData == NULL) {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else {
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
else
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
}
else {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else {
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
else
for(int j = 0; j < blockSize; j++) {
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
ShowNTErrors("TODO");
}
delete[] dimSize;
DelTensorBuf(lossBuf);
return loss;
}
/*
......@@ -330,85 +169,43 @@ with respect to gold standard, and y this the model output
*/
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight,
const XTensor * padding, int leadingDim)
XTensor * padding, int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(dedy, output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(output->devID, devIDBackup);
DTYPE * dedyData = (DTYPE*)dedy->data;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
if(weight == NULL) {
if(padding == NULL)
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
NULL, NULL,
blockNum, blockSize);
else
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
NULL, (DTYPE*)padding->data,
blockNum, blockSize);
}
else {
if(padding == NULL)
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
(DTYPE*)weight->data, NULL,
blockNum, blockSize);
else
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
(DTYPE*)weight->data, (DTYPE*)padding->data,
blockNum, blockSize);
}
_Div(gold, output, dedy);
_NegateMe(dedy);
if(weight != NULL)
_MultiplyDimMe(dedy, weight, n);
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedy->order;
int * dims = new int[order];
memcpy(dims, dedy->dimSize, dedy->order * sizeof(int));
dedy->Reshape(dedy->unitNum/dedy->GetDim(n), dedy->GetDim(n));
_MultiplyDimMe(dedy, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedy->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
BacktoCudaDev(output->devID, devIDBackup);
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
}
......
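The padding mask here (and again in the log-softmax and softmax backward paths below) is applied with the same reshape-and-broadcast trick. As a reference, a hypothetical helper that factors it out (a sketch, not part of the commit; it uses only calls that appear in the diff):

void MaskByPadding(XTensor * grad, XTensor * padding, int leadDim)
{
    int n = leadDim < 0 ? grad->order - 1 : leadDim;

    /* remember the original shapes */
    int paddingOrder = padding->order;
    int * paddingDims = new int[paddingOrder];
    memcpy(paddingDims, padding->dimSize, paddingOrder * sizeof(int));
    int order = grad->order;
    int * dims = new int[order];
    memcpy(dims, grad->dimSize, order * sizeof(int));

    /* flatten: padding -> (positions), grad -> (positions, classes) */
    padding->Reshape(padding->unitNum);
    grad->Reshape(grad->unitNum / grad->GetDim(n), grad->GetDim(n));

    /* scale each row of grad by its 0/1 padding value */
    _MultiplyDimMe(grad, padding, 0);

    /* restore the original shapes */
    padding->Reshape(paddingOrder, paddingDims);
    grad->Reshape(order, dims);

    delete[] paddingDims;
    delete[] dims;
}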
......@@ -40,7 +40,7 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
/* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight = NULL, const XTensor * padding = NULL,
int leadingDim = -1);
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -279,8 +279,8 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
......@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
return;
}
#endif
......
......@@ -22,6 +22,7 @@
#include "LogSoftmax.h"
#include "LogSoftmax.cuh"
#include "Loss.cuh"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
......@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
>> lossName - name of the loss function
*/
__global__
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size, LOSS_FUNCTION_NAME lossName)
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
int size, LOSS_FUNCTION_NAME lossName)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -371,10 +373,12 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
leadDim = leadDim < 0 ? y->order - 1 : leadDim;
CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
"Tensors used in log softmax are not on the same GPU.");
......@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
dimensionSize * stride, lossName);
}
}
if(padding != NULL) {
int n = leadDim;
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
}
else {
ShowNTErrors("TODO!");
......
......@@ -37,8 +37,8 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
/* de/dx (Cuda version) */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
#endif // USE_CUDA
......
......@@ -38,8 +38,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
for (int i = 0; i < blockNum; i++) {
for (int j = 0; j < stride; j++) {
for (int k = 0; k < tLen; k++) {
*(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = -(DTYPE)*(tp + i * stride * dimensionSize
+ j + stride * (tBeg + k)) / (DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
*(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) =
-(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) /
(DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
}
}
}
......
......@@ -174,8 +174,8 @@ See more details in LogSoftmaxBackward(...)
>> leadDim - leading dimension (along which we perform reduction)
*/
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
......@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
#ifdef USE_CUDA
if(y->devID >= 0){
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
return;
}
#endif
......
......@@ -24,6 +24,7 @@
#include "Loss.cuh"
#include "../core/reduce/ReduceSum.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/shape/Unsqueeze.h"
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
......@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
*/
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
int n = leadDim < 0 ? y->order - 1 : leadDim;
CheckNTErrors((x->devID >= 0), "Backward computation of softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID), "Matrices used in softmax are not on the same GPU.");
CheckNTErrors((y->order >= 1), "Empty tensor!");
......@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
_Sum(y, gold, dedx, -1.0F);
if(padding != NULL) {
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
}
else if(lossName == ONEHOTERROR){
ShowNTErrors("TODO!");
......
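For reference, _Sum(y, gold, dedx, -1.0F) above is the standard shortcut for softmax followed by cross entropy: with y = softmax(x) and E = sum_{i} (-gold_i * log(y_i)),

dE/dx_k = y_k * sum_{i} gold_i - gold_k = y_k - gold_k

when gold sums to one, so no division by y is needed. The same form is reused for SQUAREDERROR in this branch; the padding mask is then applied with the reshape-and-broadcast trick shown in the cross-entropy backward.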
......@@ -37,8 +37,8 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
/* de/dx (Cuda version) */
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
#endif // USE_CUDA
......
......@@ -35,8 +35,8 @@ XTensor Softmax(const XTensor &x, int leadDim);
/* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -169,8 +169,8 @@ bool TestDropout2()
_DropoutBackward(y, x, dedy, dedx, 1, dropProb);
/* check result */
y->Dump(stderr, "y");
dedx->Dump(stderr, "dedy");
//y->Dump(stderr, "y");
//dedx->Dump(stderr, "dedy");
#ifdef USE_CUDA
/* GPU test */
......@@ -193,8 +193,8 @@ bool TestDropout2()
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);
/* check result */
yGPU->Dump(stderr, "yGPU");
dedxGPU->Dump(stderr, "dedyGPU");
//yGPU->Dump(stderr, "yGPU");
//dedxGPU->Dump(stderr, "dedyGPU");
/* destroy variables */
delete x;
......
......@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
_LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
_LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
......@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
_LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
_LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
......@@ -66,7 +66,9 @@ bool TestPower1()
bUser = Power(*a, 2.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -88,7 +90,9 @@ bool TestPower1()
bUserGPU = Power(*aGPU, 2.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......@@ -153,7 +157,9 @@ bool TestPower2()
bUser = Power(*a, 1.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -175,7 +181,9 @@ bool TestPower2()
bUserGPU = Power(*aGPU, 1.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......@@ -214,7 +222,7 @@ bool TestPower3()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0F, 1.0F},
DTYPE aData[3][2] = { {1.0F, 1.0F},
{2.0F, 3.0F},
{4.0F, 5.0F} };
DTYPE answer[3][2] = { {1.0F, 1.0F},
......@@ -240,7 +248,9 @@ bool TestPower3()
bUser = Power(*a, 0.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -262,7 +272,9 @@ bool TestPower3()
bUserGPU = Power(*aGPU, 0.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......
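An aside on the TestPower3 data change above (0.0F replaced by 1.0F in aData): the test raises a to the power 0.0F, so the old data exercised 0^0. A conforming host library already pins that corner down, as the standalone check below shows, but edge-case agreement between host and device math libraries is not something a CPU-versus-GPU comparison test should lean on. Reading that as the motive is an inference from the diff, not something the commit states.
#include <cmath>
#include <cstdio>

int main()
{
    /* C99 (and IEEE 754 pow) define pow(x, +/-0) = 1 for every x, 0 included */
    std::printf("pow(0, 0) = %f\n", std::pow(0.0, 0.0)); /* prints 1.000000 */
    return 0;
}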
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/
#include "TReduceSum.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -155,6 +156,457 @@ bool TestReduceSum1()
#endif // USE_CUDA
}
/*
case 2: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B >= 128
(50, 1000000) -> (50), dim = 1
*/
bool TestReduceSum2()
{
/* a tensor of size (50, 1000000) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 50;
sDimSize[1] = 1000000;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (50) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 50;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B < 128
(1000000, 50) -> (1000000), dim = 1
*/
bool TestReduceSum3()
{
/* a tensor of size (1000000, 50) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 1000000;
sDimSize[1] = 50;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (1000000) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 1000000;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A < 10, B is free
(5, 1000000) -> (5), dim = 1
*/
bool TestReduceSum4()
{
/* a tensor of size (5, 1000000) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 5;
sDimSize[1] = 1000000;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (5) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 5;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C > 4096
(500, 1000, 500) -> (500, 500), dim = 1
*/
bool TestReduceSum5()
{
/* a tensor of size (500, 1000, 500) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 500;
sDimSize[1] = 1000;
sDimSize[2] = 500;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (500, 500) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 500;
tDimSize[1] = 500;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C <= 4096
(50, 10000, 50) -> (50, 50), dim = 1
*/
bool TestReduceSum6()
{
/* a tensor of size (50, 10000, 50) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 50;
sDimSize[1] = 10000;
sDimSize[2] = 50;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (50, 50) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 50;
tDimSize[1] = 50;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
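The case comments above factor the input shape into the kept dimensions A (leading) and C (trailing) and the reduced dimension B; that split only selects the kernel strategy, and the computed result is the same in every case. A minimal reference loop, assuming a row-major (A, B, C) layout and written independently of the library, pins down what _ReduceSum(s, t, 1) produces:
/* reference semantics only, not the library kernel; row-major layout assumed */
void ReduceSumRef(const float * s, float * t, int A, int B, int C)
{
    for (int a = 0; a < A; a++) {
        for (int c = 0; c < C; c++) {
            float sum = 0.0F;
            for (int b = 0; b < B; b++)
                sum += s[(a * B + b) * C + c]; /* s[a][b][c] */
            t[a * C + c] = sum; /* t[a][c] */
        }
    }
}
With every input element set to 1.0F, as in the tests, each output element equals B, which is why the expected answer is s->GetDim(1).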
/* other cases */
/*
TODO!!
......@@ -175,6 +627,51 @@ bool TestReduceSum()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceSum2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* case 3 test */
//caseFlag = TestReduceSum3();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 3 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestReduceSum4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
///* case 5 test */
//caseFlag = TestReduceSum5();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 5 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestReduceSum6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -146,7 +146,7 @@ bool TestSoftmax2()
_Softmax(x, y, 1);
/* call SoftmaxBackward function */
_SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
_SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -174,7 +174,7 @@ bool TestSoftmax2()
_Softmax(xGPU, yGPU, 1);
/* call SoftmaxBackward function */
_SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
_SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
......@@ -20,8 +20,9 @@
*/
#include "TSumDim.h"
#include "../core/arithmetic/SumDim.h"
#include "../XTensor.h"
#include "../core/arithmetic/SumDim.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -251,6 +252,225 @@ bool TestSumDim2()
#endif // USE_CUDA
}
/*
case 3: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
*/
bool TestSumDim3()
{
/* a tensor of size (20, 40, 4000) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 20;
aDimSize[1] = 40;
aDimSize[2] = 4000;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (40) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 40;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* initialize variables */
aGPU->SetZeroAll();
cMeGPU->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
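For concreteness, the same broadcast on a tiny (2, 3) tensor, reusing the helpers the tests already use (a sketch, not part of the commit):
/* c[i][j] = a[i][j] + b[j] * beta for dim = 1; beta is taken to default to 1 */
int dimsA[2] = {2, 3};
int dimsB[1] = {3};
XTensor * a = NewTensor(2, dimsA);
XTensor * b = NewTensor(1, dimsB);
XTensor * c = NewTensor(2, dimsA);
a->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SumDim(a, b, c, 1); /* every entry of c becomes 0 + 1 = 1 */
delete a;
delete b;
delete c;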
/*
case 4: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(1000000, 50) + (50) = (1000000, 50), dim = 1.
*/
bool TestSumDim4()
{
/* a tensor of size (1000000, 50) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 1000000;
aDimSize[1] = 50;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (50) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 50;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* initialize variables */
aGPU->SetZeroAll();
cMeGPU->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -279,6 +499,24 @@ bool TestSumDim()
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSumDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
///* case 4 test */
//caseFlag = TestSumDim4();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 4 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
......