1. add some base functions 2.better implementation for t2t

03a9836e · xuchen · 52c0e35a · 03a9836e · 03a9836e · 03a9836e
Commit 03a9836e authored Nov 13, 2018 by xuchen
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    else if(operID == FUNC_LOGSOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else if(operID == FUNC_RECTIFY)
        _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    else if(operID == FUNC_SOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else{
        ShowNTErrors("Wrong activation function type!");

--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                        XTensor * dedy, XTensor * dedx,
+                        XTensor * dedy, XTensor * dedx, XTensor * padding,
                        int funcID, void * params,
                        LOSS_FUNCTION_NAME lossName)
 {
@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
    }
    else if(funcID == FUNC_LOGSOFTMAX){
        int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
    }
    else if(funcID == FUNC_RECTIFY){
        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
    }else if(funcID == FUNC_SOFTMAX){
        int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
    }
    else{
        ShowNTErrors("wrong function found when call the backward process!");
@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, 
-                        XTensor * dedy, 
+                        XTensor * dedy, XTensor * padding,
                        LOSS_FUNCTION_NAME lossName)
 {
-    _LossBackward(dedy, gold, y, lossName);
+    //_LossBackward(dedy, gold, y, lossName);
+    if(lossName == CROSSENTROPY)
+        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
 }

 }
\ No newline at end of file
--- a/source/network/XBackwardLoss.h
+++ b/source/network/XBackwardLoss.h
@@ -36,13 +36,13 @@ class XLossGrad
 public:
    /* compute dE/dx for a given function y = f(x) */
    void Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                 XTensor * dedy, XTensor * dedx,
+                 XTensor * dedy, XTensor * dedx, XTensor * padding,
                 int funcID, void * params,
                 LOSS_FUNCTION_NAME lossName);

    /* compute dE/dy for variable y and error(loss) function E */
    void Compute(XTensor * gold, XTensor * y, 
-                 XTensor * dedy, 
+                 XTensor * dedy, XTensor * padding,
                 LOSS_FUNCTION_NAME lossName);
 };


--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
    DelTensorBuf(b);

    node->visitMark = NODE_FINISHED;
-
-    delete b;
 }

 /* 

--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -55,7 +55,7 @@ void XNetClearAll()
 XNet::XNet()
 {
    nodes.Clear();
-    isGradEfficient = true;
+    isGradEfficient = false;
 }

 /* de-constructor */
@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
    XList golds(1);
    golds.Add(&gold);

-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
+}
+
+/* 
+single-root backward propagation wrt. the loss/error function with a padding tensor:
+each argument is wrapped in a one-element list and handed to the list-based Backward
+>> root - root node (output) of the network
+>> gold - gold standard for the output
+>> padding - padding tensor for the root, used to mark targets that are ignored by the loss
+>> loss - name of loss function
+*/
+void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
+{
+    XList rootList(1);
+    XList goldList(1);
+    XList paddingList(1);
+
+    rootList.Add(&root);
+    goldList.Add(&gold);
+    paddingList.Add(&padding);
+
+    Backward(rootList, goldList, paddingList, loss);
 }

 /* 
@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
    XList golds(1);
    golds.Add(NULL);

-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
 }

 /* 
@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes 
 >> root - a list of root nodes (output) of the network
 >> gold - a list of gold standard for the output
+>> padding - specify a target value that is ignored
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
 {
    Traverse(roots);

@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
    for(int i = 0; i < roots.count; i++){
        XTensor * root = (XTensor*)roots.Get(i);
        XTensor * gold = (XTensor*)golds.Get(i);
+        XTensor * padding = (XTensor*)paddings.Get(i);
        XLink &income = root->income;
        int funcID = income.typeID;
        void * params = income.params;
@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
           Note that we do not need to obtain dE/dy here because it is no use in the 
           folloing process of back-propagation */
        if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
-            XTensor * x = income.tails[0];
-            XNoder::MakeGrad(x);
-            lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
-            root->visitMark = NODE_FINISHED;
+            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
+                XTensor * x = income.tails[0];
+                XNoder::MakeGrad(x);
+                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
+                root->visitMark = NODE_FINISHED;
+            }
+            else {
+                XNoder::MakeGrad(root);
+                lossGrad.Compute(gold, root, root->grad, padding, loss);
+            }
        }
        /* we compuate dE/dy (y is the output) if no predefined activation function is used */
        else{
            XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, loss);
+            lossGrad.Compute(gold, root, root->grad, NULL, loss);
        }
    }
    
@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
 /* 
 backward propagation to obtain gradient
 with a number of root nodes 
->> root - a list of root nodes (output) of the network
+>> roots - a list of root nodes (output) of the network
 >> loss - name of loss function
 */
 void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
 {
    XList golds(roots.count);
-    for(int i = 0; i < roots.count; i++)
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++) {
        golds.Add(NULL);
+        paddings.Add(NULL);
+    }
+
+    Backward(roots, golds, paddings, loss);
+}
+
+/* 
+backward propagation to obtain gradient
+with a number of root nodes 
+>> roots - a list of root nodes (output) of the network
+>> golds - a list of gold standard for the output
+>> loss - name of loss function
+*/
+void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+{
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++)
+        paddings.Add(NULL);

-    Backward(roots, golds, loss);
+    Backward(roots, golds, paddings, loss);
 }

 /* 

--- a/source/network/XNet.h
+++ b/source/network/XNet.h
@@ -62,17 +62,24 @@ struct XNet
    /* backward propagation to obtain gradient wrt. the loss/error function */
    void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);

+    /* backward propagation to obtain gradient wrt. the loss/error function */
+    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
+
    /* backward propagation to obtain gradient */
    void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward propagation to obtain gradient wrt. the loss/error function
       with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward propagation to obtain gradient
       with a number of root nodes */
    void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);

+    /* backward propagation to obtain gradient
+       with a number of root nodes */
+    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+
    /* backward computation for a given node */
    void BackwardNode(XTensor * node, bool isEfficent = false);


--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
        
        if(isEnd)
            break;
+
+        Test(testFN, outputFN, model);
    }

    double elapsed = GetClockSec() - startT;
@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA

    /* for y = softmax(s), we get dE/ds
        where E is the error function (define by loss) */
-    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
+    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);

    /* for s = x * w, we get 
       dE/w_{i,j} = dE/ds_j * ds/dw_{i,j} 

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
+ */
+
+#include <math.h>
+#include "T2TDecoder.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+
+/* constructor: both per-layer arrays stay NULL until InitModel allocates them */
+AttDecoder::AttDecoder()
+    : attentionsEnde(NULL), attEndeLayerNorms(NULL)
+{
+}
+
+/* de-constructor: release the per-layer arrays created with new[] in InitModel.
+   The constructor sets both pointers to NULL, so this is also safe when
+   InitModel was never called (delete[] on a null pointer does nothing). */
+AttDecoder::~AttDecoder()
+{
+    delete[] attentionsEnde;
+    delete[] attEndeLayerNorms;
+}
+
+/* 
+initialize the model: run the inherited AttEncoder initialization first, then
+allocate and initialize one encoder-decoder attention and one layer
+normalization for each of the nlayer layers
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+>> myIsMasked - indicates whether the masked attention is employed
+                (forwarded to AttEncoder::InitModel only)
+>> myIgnored - number of positions ignored in attention (from the start)
+>> myDevID - device id
+>> myMem - the memory pool
+*/
+void AttDecoder::InitModel(int argc, char ** argv, 
+                           bool myIsMasked, int myIgnored, 
+                           int myDevID, XMem * myMem)
+{
+    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+
+    /* drop the arrays of a previous initialization (if any) so that calling
+       InitModel more than once does not leak them. Both pointers are NULL on
+       the first call, which makes delete[] a no-op. They are reset to NULL so
+       the de-constructor never sees a stale pointer if an allocation below fails. */
+    delete[] attentionsEnde;
+    attentionsEnde = NULL;
+    delete[] attEndeLayerNorms;
+    attEndeLayerNorms = NULL;
+
+    attentionsEnde = new T2TAttention[nlayer];
+    attEndeLayerNorms = new T2TLN[nlayer];
+
+    /* initialize the stacked layers. The encoder-decoder attention always gets
+       false as its third argument instead of myIsMasked
+       (presumably the "is masked" flag - confirm against T2TAttention::InitModel) */
+    for(int i = 0; i < nlayer; i++){
+        attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem);
+        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+    }
+}
+
+/* 
+make the decoding network
+>> inputDec - the input tensor of the decoder
+>> outputEnc - the output tensor of the encoder
+>> mask - the mask that indicates whether each position is valid
+          (it is passed to the self-attention only)
+>> isTraining - indicates whether the model is used for training
+                (dropout is applied only if it is true and dropoutP > 0)
+<< return - the output tensor of the decoder
+*/
+XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
+{
+    XTensor x;
+
+    /* embed the decoder input */
+    x = embedder.Make(inputDec);
+
+    /* dropout */
+    if(isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    /* every layer runs three sub-layers in turn (self-attention, encoder-decoder
+       attention, fnn); each one is followed by optional dropout, a residual sum
+       with the sub-layer input, and a layer normalization */
+    for(int i = 0; i < nlayer; i++){
+        XTensor att;
+        XTensor ende;
+        /* NOTE(review): ln is declared but never used in this function */
+        XTensor ln;
+        XTensor fnn;
+        XTensor res;
+        /* default-constructed tensor handed to the encoder-decoder attention
+           in the argument position where the self-attention receives mask */
+        XTensor nothing;
+
+        /******************/
+        /* self attention */
+        att = attentions[i].Make(x, x, x, mask, isTraining);
+
+        /* dropout */
+        if(isTraining && dropoutP > 0)
+            att = Dropout(att, dropoutP);
+
+        /* residual connection */
+        res = Sum(att, x);
+
+        /* layer normalization */
+        x = attLayerNorms[i].Make(res);
+
+        /*****************************/
+        /* encoder-decoder attention */
+        /* NOTE(review): the arguments are (outputEnc, x, outputEnc), presumably
+           (key, query, value) - confirm against T2TAttention::Make */
+        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining);
+
+        /* dropout */
+        if(isTraining && dropoutP > 0)
+            ende = Dropout(ende, dropoutP);
+
+        /* residual connection */
+        res = Sum(ende, x);
+
+        /* layer normalization */
+        x = attEndeLayerNorms[i].Make(res);
+
+        /*******/
+        /* fnn */
+        fnn = fnns[i].Make(x, isTraining);
+
+        /* dropout */
+        if(isTraining && dropoutP > 0)
+            fnn = Dropout(fnn, dropoutP);
+
+        /* residual connection */
+        res = Sum(fnn, x);
+
+        /* layer normalization */
+        x = fnnLayerNorms[i].Make(res);
+    }
+
+    /* x now holds the output of the last layer */
+    return x;
+}
+
+
+}
--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
@@ -22,19 +22,33 @@
 #ifndef __T2TDECODER_H__
 #define __T2TDECODER_H__

+#include "T2TEncoder.h"
+
 namespace transformer
 {

-class T2TDecoder
+class AttDecoder : public AttEncoder
 {
+public:
+    /* encoder-decoder attention model of each layer */
+    T2TAttention * attentionsEnde;

-};
-
-class AttDecoder : T2TDecoder
-{
+    /* layer normalization for encoder-decoder attention */
+    T2TLN * attEndeLayerNorms;
 public:
+    /* constructor */
+    AttDecoder();
+
+    /* deconstructor */
+    ~AttDecoder();
+
    /* initialize the model */
-    void InitModel(int argc, char ** argv);
+    void InitModel(int argc, char ** argv, 
+                   bool myIsMasked, int myIgnored, 
+                   int myDevID = -1, XMem * myMem = NULL);
+
+    /* make the decoding network */
+    XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
 };

 }

--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
@@ -61,16 +61,17 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);

    DTYPE v = 1.0F/(float)sqrt((float)eSize);
-    w.SetDataRand(-v, v);
+    w.SetDataRandn(0, v);

    /* create the positional embedding matrix */
    MakePosEmbedding(eSize, d, maxLength);
 }

 /* 
-make positional embeddings (of size eSize * length
-eSize - embedding size
-length - length of the sequenc
+make positional embeddings (of size eSize * length)
+>> eSize - embedding size
+>> d - dimension size of the hidden layers
+>> length - length of the sequence
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
@@ -114,15 +115,15 @@ make the network
 */
 XTensor T2TEmbedder::Make(XTensor &input)
 {
-    CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
+    //CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-    CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
+    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

    int dims[MAX_TENSOR_DIM_NUM];
    memcpy(dims, input.dimSize, input.order * sizeof(int));
-    dims[input.order - 1] = eSize;
+    dims[input.order] = eSize;

    XTensor wordEmbedding;
    XTensor posEmbedding;
@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
    /* we make positional embeddings first */
    //if(!match){
    if(true){
-        InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
+        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
+
        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);

        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
    }

    /* then we make word embeddings */
-    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    //wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    wordEmbedding = Gather(w, input);
+    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

    /* we sum over the two embeddings */
    return wordEmbedding + posEmbedding;

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -31,6 +31,10 @@ namespace transformer
 /* constructor */
 AttEncoder::AttEncoder()
 {
+    attentions = NULL;
+    fnns = NULL;
+    attLayerNorms = NULL;
+    fnnLayerNorms = NULL;
 }

 /* de-constructor */

--- a/source/sample/transformer/T2TLayerNormal.cpp
+++ b/source/sample/transformer/T2TLayerNormal.cpp
@@ -59,10 +59,7 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    InitTensor1D(&w, d, X_FLOAT, devID, mem);
    InitTensor1D(&b, d, X_FLOAT, devID, mem);

-    float scale = 1.0F;
-    float finfout = (float)sqrt(6.0F * scale / d);
-
-    w.SetDataRand(-finfout, finfout);
+    w.SetDataRand(1.0F, 1.0F);
    b.SetZeroAll();
 }


--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
@@ -69,10 +69,16 @@ public:
    void InitModel(int argc, char ** argv);

    /* make the encoding network */
-    XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
+    XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);

-    /* make the entire network (with the output softmax layer) */
-    void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+    /* make the decoding network */
+    XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining);
+
+    /* make the network for language modeling (with the output softmax layer) */
+    void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+
+    /* make the network for machine translation (with the output softmax layer) */
+    void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining);

    /* get parameter matrics */
    void GetParams(XList &list);

--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
@@ -66,6 +66,9 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    float scale = 1.0F;
    float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
    w.SetDataRand(-finfout, finfout);
+
+    DTYPE v = 1.0F/(float)sqrt((float)hSize);
+    w.SetDataRandn(0, v);
 }

 /* 
@@ -90,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
 {
    XTensor &x = input;

-    output = LogSoftmax(MMul(x, w), -1);
+    //output = LogSoftmax(MMul(x, w), -1);
+    output = Softmax(MMul(x, w), -1);
 }

 }
--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
@@ -79,6 +79,9 @@ public:
    /* vocabulary size of the source side */
    int vSize;

+    /* vocabulary size of the target side */
+    int vSizeTgt;
+
    /* learning rate */
    float lrate;
    
@@ -100,6 +103,10 @@ public:
    /* indicates whether we use adam */
    bool useAdam;

+    int validStep;
+
+    int curEpoch;
+
    /* hyper parameters of adam*/
    float adamBeta1;
    float adamBeta2;
@@ -128,8 +135,13 @@ public:
    /* number of batches on which we do model update */
    int updateStep;
    
-    /* indicates whether we double the </s> symble for the output of lms */
+    /* indicates whether we double the </s> symbol for the output of lms */
    bool isDoubledEnd;
+    
+    /* indicates whether we use batchsize = max * sc
+       rather than batchsize = word-number, where max is the maximum
+       length and sc is the sentence number */
+    bool isSmallBatch;

 public:
    /* constructor */
@@ -142,7 +154,7 @@ public:
    void Init(int argc, char ** argv);

    /* train the model */
-    void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
+    bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);

    /* test the model */
    void Test(const char * fn, const char * ofn, T2TModel * model);
@@ -158,11 +170,34 @@ public:

    /* load a batch of sequences */
    int LoadBatch(FILE * file, bool isLM,
-                  XTensor * batch, XTensor * padding, XTensor * output, 
+                  XTensor * batchEnc, XTensor * paddingEnc, 
+                  XTensor * batchDec, XTensor * paddingDec,
+                  XTensor * gold,
                  int * seqs,
-                  int step, int vs, int sBatch, int wBatch, 
+                  int vsEnc, int vsDec, int sBatch, int wBatch, 
                  bool isSorted, int &wCount,
-                  int devID, XMem * mem);
+                  int devID, XMem * mem, 
+				  bool isTraining);
+
+    /* load a batch of sequences (for language modeling) */
+    int LoadBatchLM(FILE * file, 
+                    XTensor * batchEnc, XTensor * paddingEnc,
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold,
+                    int * seqs, int vs, int sBatch, int wBatch, 
+                    bool isSorted, int &wCount,
+                    int devID, XMem * mem, 
+					bool isTraining);
+
+    /* load a batch of sequences (for machine translation) */
+    int LoadBatchMT(FILE * file, 
+                    XTensor * batchEnc, XTensor * paddingEnc, 
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold,
+                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
+                    bool isSorted, int &wCount,
+                    int devID, XMem * mem, 
+					bool isTraining);

    /* shuffle the data file */
    void Shuffle(const char * srcFile, const char * tgtFile);

--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
@@ -25,6 +25,8 @@
 #include "T2TUtility.h"
 #include "T2TTrainer.h"
 #include "../../tensor/XDevice.h"
+#include "../../tensor/XUtility.h"
+#include "../../tensor/XGlobal.h"

 namespace transformer
 {
@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "test", testFN, "");
    LoadParamString(argc, args, "output", outputFN, "");

-    T2TTrainer trainer;
-    trainer.Init(argc, args);
-
-    T2TModel model;
-
-    model.InitModel(argc, args);
-
    /* learn model parameters */
-    if(strcmp(trainFN, ""))
-        trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
+    if(strcmp(trainFN, "")) {
+        double startT = GetClockSec();
+
+        T2TTrainer trainer;
+        trainer.Init(argc, args);
+
+        char * fn = new char[MAX_LINE_LENGTH];
+        char * fn1 = new char[MAX_LINE_LENGTH];
+        char * fn2 = new char[MAX_LINE_LENGTH];
+        modelFN = strcmp(modelFN, "") ? modelFN : (char *)"checkpoint.model";
+
+        int epoch;
+        bool isTrain;
+
+        for(epoch = 1; epoch <= trainer.nepoch; epoch++) {
+            sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
+            sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
+            sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
+
+            if(epoch == 1) {
+                T2TModel model;
+                model.InitModel(argc, args);
+
+                isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
+                model.Dump(fn1);
+            }
+            else {
+                T2TModel model;
+                model.InitModel(argc, args);
+                model.Read(fn);
+
+                isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
+                model.Dump(fn1);
+            }
+
+            if(trainer.useEpochCheckpoint && strcmp(testFN, "")) {
+                T2TTrainer tester;
+                tester.Init(argc, args);
+
+                T2TModel model;
+                model.InitModel(argc, args);
+                model.Read(fn1);
+
+                tester.Test(testFN, fn2, &model);
+            }
+
+            if(!isTrain)
+                break;
+        }
+
+        double elapsed = GetClockSec() - startT;
+        epoch = MIN(epoch, trainer.nepoch);
+    
+        XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);

+        delete[] fn;
+        delete[] fn1;
+        delete[] fn2;
+    }
+
+    /* don't dump the final model */
    /* save the final model */
-    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-        model.Dump(modelFN);
+    //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+    //    model.Dump(modelFN);
+    
+    T2TModel model;
+    model.InitModel(argc, args);

    /* load the model if neccessary */
    if(strcmp(modelFN, ""))

--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
@@ -446,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int 

    CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x");
    CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] && 
-                         gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
+                   gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");

    blockSize[0] = bXSize;
    blockSize[1] = bYSize;

--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
    if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
        cublasDestroy(cublasHandle);
    if(myIsForComputation)
-        CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle.");
+        CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
+				      "Cannot create the cublas handle.");

    SetDevice(devIDBackup);
 #endif
@@ -1392,8 +1393,8 @@ void XMem::CreateBLASHandle()
                      "Cannot destroy the cublas handle.");
    }

-    CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
-                 "Cannot create the cublas handle.");
+    CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
+                  "Cannot create the cublas handle.");
 #endif
 }


--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)

 /* 
 set the value of a cell 
->> value - value to assign to the cell
+>> value - value we tend to set
 >> index - index of the cell for each dimension
->> 
+>> size - size of the index
 */
 bool XTensor::Set(DTYPE value, int index[], int size)
 {
@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)

 /* 
 set the value of a cell in a 1d tensor 
->> value - value to assign to the cell
+>> value - value we tend to set
 >> i - item offset
+<< return - succeeded or not
 */
 bool XTensor::Set1D(DTYPE value, int i)
 {
@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
    return SetToDevice(devID, GetCell(dims, 3), value);
 }

+
+/* 
+write an integer into the cell addressed by a multi-dimensional index
+>> value - the integer to store
+>> index - position of the cell along each dimension
+>> size - number of entries in index
+<< return - succeeded or not
+*/
+bool XTensor::SetInt(int value, int index[], int size)
+{
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+    void * cell = GetCell(index, size);
+    return SetToDeviceInt(devID, cell, value);
+}
+
+/* 
+set the integer value of a cell in a 1d tensor (the tensor must be of type X_INT)
+>> value - the integer to store
+>> i - item offset, must lie in [0, dimSize[0])
+<< return - succeeded or not
+*/
+bool XTensor::Set1DInt(int value, int i)
+{
+    CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
+    CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int dims[1] = {i};
+
+    return SetToDeviceInt(devID, GetCell(dims, 1), value);
+}
+
+/* 
+write an integer into one cell of a 2d tensor of type X_INT
+>> value - the integer to store
+>> ni - row index
+>> mi - column index
+<< return - succeeded or not
+*/
+bool XTensor::Set2DInt(int value, int ni, int mi)
+{
+    CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int position[2] = {ni, mi};
+    void * cell = GetCell(position, 2);
+    return SetToDeviceInt(devID, cell, value);
+}
+
+/* 
+set the integer value of a cell in a 3d tensor (the tensor must be of type X_INT)
+>> value - the integer to store
+>> d0 - index of dimension 0
+>> d1 - index of dimension 1
+>> d2 - index of dimension 2
+<< return - succeeded or not
+*/
+bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
+{
+    CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 3!");
+    CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
+    CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
+    CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
+
+    int dims[3] = {d0, d1, d2};
+
+    return SetToDeviceInt(devID, GetCell(dims, 3), value);
+}
+
 /* 
 increase the value of a cell in a 2d tensor
 >> value - value we tend to set
@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,

    XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);

+    if (tensor->unitNum * tensor->unitSize == 176657664) {
+        tensor->Dump(stderr, "", 200);
+    }
    if(myMem != NULL)
        tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
    else

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -326,6 +326,18 @@ public:

    /* set the value of a cell in a 3d tensor */
    bool Set3D(DTYPE value, int d0, int d1, int d2);
+    
+    /* set the integer value of a cell */
+    bool SetInt(int value, int index[], int size = -1);
+
+    /* set the integer value of a cell in a 1d tensor */
+    bool Set1DInt(int value, int i);
+
+    /* set the integer value of a cell in a 2d tensor */
+    bool Set2DInt(int value, int ni, int mi);
+
+    /* set the integer value of a cell in a 3d tensor */
+    bool Set3DInt(int value, int d0, int d1, int d2);

    /* increase the value of a cell in a 2d */
    bool Add2D(DTYPE value, int ni, int mi);

--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
    return true;
 }

+/* 
+assign an integer number to a variable that is kept on a specified device
+>> devID - id of the device that keeps the variable (a negative id is handled as host memory)
+>> p - address of the variable
+>> value - the integer we want to assign
+<< return - false if p is NULL, true otherwise
+*/
+bool SetToDeviceInt(int devID, void * p, int value)
+{
+    /* nothing to write to */
+    if(p == NULL)
+        return false;
+
+    if(devID >= 0){
+        /* the variable is not on the host: copy the value from host memory (-1) to the device */
+        XMemCopy(p, devID, &value, -1, sizeof(int));
+        return true;
+    }
+
+    /* the variable is on the host: write it directly */
+    int * target = (int*)p;
+    *target = value;
+
+    return true;
+}
+
 /* get the next number with power of 2 */
 unsigned int GetNextPower2(unsigned int n)
 {

--- a/source/tensor/XUtility.h
+++ b/source/tensor/XUtility.h
@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
 extern DTYPE ToCPU(int devID, void * value);
 extern int ToCPUInt(int devID, void * value);
 extern bool SetToDevice(int devID, void * p, DTYPE value);
+extern bool SetToDeviceInt(int devID, void * p, int value);
 extern unsigned int GetNextPower2(unsigned int n);
 extern void XSleep(int sleepTime);
 extern double GetClock();

--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
        fanOut = numOutputFmaps * receptiveFieldSize;
    }

-    DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
-    DTYPE a = (DTYPE)sqrt(3.0) * std;
-    _SetDataRand(tensor, -a, a);
+    DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
+    tensor->SetDataRand(-finfout, finfout);
+    //_SetDataRand(tensor, -finfout, finfout);
 }

 /* 
@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
    if(tensor == NULL)
        return;
    
-    /* GPU code */
+    /* CPU code */
    if(tensor->devID < 0){
        DTYPE variance = upper - lower;
        

--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
@@ -21,6 +21,8 @@

 #include "Gather.h"
 #include "CopyIndexed.h"
+#include "../../XUtility.h"
+#include "../shape/Reshape.h"

 namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
    return result;
 }

+/*
+gather indexed sub-tensors (return a XTensor structure)
+make a new tensor to keep the result and return it
+
+>> s - the source tensor (2D); sub-tensors are picked along its dimension 0
+>> index - the index tensor in type X_INT, X_FLOAT or X_DOUBLE
+           (floating-point items are truncated to integers)
+<< return - the result of copying indexed sub-tensors. If the order of index 
+            is larger than 1, the result is reshaped to the dimensions of index 
+            followed by the last dimension of the gathered tensor
+*/
+XTensor Gather(const XTensor &s, const XTensor &index)
+{
+    CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
+    CheckNTErrors((index.dataType == X_INT || index.dataType == X_FLOAT || index.dataType == X_DOUBLE),
+                  "The index tensor must be in integer, float or double type!");
+
+    int indexSize = index.unitNum;
+
+    /* a copy of the indices on the host (device id -1) */
+    int * srcIndex = new int[indexSize];
+
+    if(index.dataType == X_INT) {
+        XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * sizeof(int));
+    }
+    else if(index.dataType == X_FLOAT) {
+        /* the staging buffer must match the item type of the index tensor 
+           (not DTYPE), otherwise the copy overflows the buffer or misreads the items.
+           NOTE(review): assumes X_FLOAT items are stored as C floats - confirm */
+        float * tmp = new float[indexSize];
+        XMemCopy(tmp, -1, index.data, index.devID, indexSize * sizeof(float));
+        for(int i = 0; i < indexSize; i++)
+            srcIndex[i] = (int)tmp[i];
+        delete[] tmp;
+    }
+    else {
+        /* X_DOUBLE (guaranteed by the check above).
+           NOTE(review): assumes X_DOUBLE items are stored as C doubles - confirm */
+        double * tmp = new double[indexSize];
+        XMemCopy(tmp, -1, index.data, index.devID, indexSize * sizeof(double));
+        for(int i = 0; i < indexSize; i++)
+            srcIndex[i] = (int)tmp[i];
+        delete[] tmp;
+    }
+
+    XTensor tensor;
+    tensor = Gather(s, 0, srcIndex, indexSize);
+    delete[] srcIndex;
+
+    if(index.order > 1) {
+        /* recover the shape of the index tensor: (dimensions of index, last dimension of the result) */
+        int * dims = new int[index.order + 1];
+        memcpy(dims, index.dimSize, index.order * sizeof(int));
+        dims[index.order] = tensor.GetDim(-1);
+
+        XTensor t;
+        t = Reshape(tensor, index.order + 1, dims);
+        delete[] dims;
+
+        return t;
+    }
+    else {
+        return tensor;
+    }   
+}
+
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/movement/Gather.h
+++ b/source/tensor/core/movement/Gather.h
@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
   make a new tensor to keep the result and return it */
 XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);

+/* gather selected sub-tensors (return a XTensor structure)
+   make a new tensor to keep the result and return it */
+XTensor Gather(const XTensor &s, const XTensor &index);
+
 } // namespace nts(NiuTrans.Tensor)

 #endif // __GATHER_H__
\ No newline at end of file
--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
@@ -16,8 +16,8 @@
 */

 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+ */

 #include <math.h>
 #include "ReduceSum.h"

--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
--- a/source/tensor/core/reduce/ReduceSumAll.cpp
+++ b/source/tensor/core/reduce/ReduceSumAll.cpp
@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
 >> source - the inpute tensor
 << return - the total summation
 */
-DTYPE _ReduceSumAll(XTensor * source)
+DTYPE _ReduceSumAll(const XTensor * source)
 {
    int order = source->order;
    DTYPE summation;

    XTensor * big = NewTensor(source);
    _CopyValues(source, big);
-    for(int i = 0; i < order; i++) {
-
-        if(i == order - 1)
-            big->Reshape(big->unitNum, 1);
+    for(int i = order - 1; i >= 0; i--) {
+        if(i == 0)
+            big->Reshape(1, big->unitNum);

+        int leadingDim = big->order - 1;
        int * dimSize;
-        dimSize = getDimSize(big, 0);
-        XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
+        dimSize = getDimSize(big, leadingDim);
+        XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, 
+                                     source->devID, source->mem);
           
-        _ReduceSum(big, little, 0);
+        _ReduceSum(big, little, leadingDim);

        delete big;
        delete dimSize;
@@ -81,7 +82,7 @@ sum all the items of the tensor
 >> source - the inpute tensor
 << return - the total summation   
 */
-DTYPE ReduceSumAll(XTensor & source)
+DTYPE ReduceSumAll(const XTensor & source)
 {
    return _ReduceSumAll(&source);
 }

--- a/source/tensor/core/reduce/ReduceSumAll.h
+++ b/source/tensor/core/reduce/ReduceSumAll.h
@@ -28,10 +28,10 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* sum all the items of the tensor */
-DTYPE _ReduceSumAll(XTensor * source);
+DTYPE _ReduceSumAll(const XTensor * source);

 /* sum all the items of the tensor */
-DTYPE ReduceSumAll(XTensor & source);
+DTYPE ReduceSumAll(const XTensor & source);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/function/CrossEntropy.cpp
+++ b/source/tensor/function/CrossEntropy.cpp
--- a/source/tensor/function/CrossEntropy.cu
+++ b/source/tensor/function/CrossEntropy.cu
--- a/source/tensor/function/CrossEntropy.cuh
+++ b/source/tensor/function/CrossEntropy.cuh
@@ -40,7 +40,7 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
 /* backward computation of cross entropy function */
 void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, 
                               const XTensor * gold, const XTensor * weight = NULL, 
-                               const XTensor * padding = NULL, int leadingDim = -1);
+                               XTensor * padding = NULL, int leadingDim = -1);


 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/function/CrossEntropy.h
+++ b/source/tensor/function/CrossEntropy.h
@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        const XTensor * padding = NULL, int leadingDim = -1);

 /* backward computation of cross entropy function */
-void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, 
-                           const XTensor * weight = NULL, const XTensor * padding = NULL, 
-                           int leadingDim = -1);
+void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, 
+                           const XTensor * gold, const XTensor * weight = NULL, 
+                           XTensor * padding = NULL, int leadingDim = -1);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/function/LogSoftmax.cpp
+++ b/source/tensor/function/LogSoftmax.cpp
@@ -279,8 +279,8 @@ better numerical stability.
 >> leadDim - leading dimension (along which we perform reduction)
 */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                         XTensor * dedy, XTensor * dedx,
-                         int leadDim,
+                         XTensor * dedy, XTensor * dedx, 
+                         XTensor * padding, int leadDim, 
                         LOSS_FUNCTION_NAME lossName)
 {
    CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
    if (gold->devID >= 0) {
-        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
 #endif

--- a/source/tensor/function/LogSoftmax.cu
+++ b/source/tensor/function/LogSoftmax.cu
@@ -22,6 +22,7 @@
 #include "LogSoftmax.h"
 #include "LogSoftmax.cuh"
 #include "Loss.cuh"
+#include "../core/arithmetic/MultiplyDim.h"
 #include "../core/reduce/ReduceSum.cuh"
 #include "../core/reduce/ReduceMax.cuh"
 #include "../XDevice.h"
@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
 >> lossName - name of the loss function
 */
 __global__
-void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size, LOSS_FUNCTION_NAME lossName)
+void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, 
+                                  int size, LOSS_FUNCTION_NAME lossName)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

@@ -371,10 +373,12 @@ better numerical stability.
 >> leadDim - leading dimension (along which we perform reduction)
 */
 void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                            XTensor * dedy, XTensor * dedx,
-                            int leadDim,
+                            XTensor * dedy, XTensor * dedx, 
+                            XTensor * padding, int leadDim, 
                            LOSS_FUNCTION_NAME lossName)
 {
+    leadDim = leadDim < 0 ? y->order - 1 : leadDim;
+
    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
                  "Tensors used in log softmax are not on the same GPU.");
@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                                                    dimensionSize * stride, lossName);
                }
            }
+            if(padding != NULL) {
+                int n = leadDim;
+
+                int paddingOrder = padding->order;
+                int * paddingDims = new int[paddingOrder];
+                memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+                padding->Reshape(padding->unitNum);
+
+                int order = dedx->order;
+                int * dims = new int[order];
+                memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+                dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
+                _MultiplyDimMe(dedx, padding, 0);
+
+                padding->Reshape(paddingOrder, paddingDims);
+                dedx->Reshape(order, dims);
+
+                delete[] paddingDims;
+                delete[] dims;
+            }
        }
        else {
            ShowNTErrors("TODO!");

--- a/source/tensor/function/LogSoftmax.cuh
+++ b/source/tensor/function/LogSoftmax.cuh
@@ -37,8 +37,8 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,

 /* de/dx (Cuda version) */
 void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                            XTensor * dedy, XTensor * dedx,
-                            int leadDim, 
+                            XTensor * dedy, XTensor * dedx, 
+                            XTensor * padding, int leadDim, 
                            LOSS_FUNCTION_NAME lossName);

 #endif // USE_CUDA

--- a/source/tensor/function/LogSoftmax.h
+++ b/source/tensor/function/LogSoftmax.h
@@ -38,8 +38,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);

 /* de/dx */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
-                         XTensor * dedy, XTensor * dedx,
-                         int leadDim,
+                         XTensor * dedy, XTensor * dedx, 
+                         XTensor * padding, int leadDim, 
                         LOSS_FUNCTION_NAME lossName);

 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/function/Loss.cpp
+++ b/source/tensor/function/Loss.cpp
@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                for (int i = 0; i < blockNum; i++) {
                    for (int j = 0; j < stride; j++) {
                        for (int k = 0; k < tLen; k++) {
-                            *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = -(DTYPE)*(tp + i * stride * dimensionSize
-                                + j + stride * (tBeg + k)) / (DTYPE)*(yp +  i * stride * dimensionSize + j + stride * (yBeg + k));
+                            *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = 
+                            -(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) / 
+                             (DTYPE)*(yp +  i * stride * dimensionSize + j + stride * (yBeg + k));
                        }
                    }
                }

--- a/source/tensor/function/Softmax.cpp
+++ b/source/tensor/function/Softmax.cpp
@@ -174,8 +174,8 @@ See more details in LogSoftmaxBackward(...)
 >> leadDim - leading dimension (along which we perform reduction)
 */
 void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
-                      XTensor * dedy, XTensor * dedx,
-                      int leadDim,
+                      XTensor * dedy, XTensor * dedx, 
+                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName)
 {
    CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,

 #ifdef USE_CUDA
    if(y->devID >= 0){
-        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
 #endif

--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
@@ -24,6 +24,7 @@
 #include "Loss.cuh"
 #include "../core/reduce/ReduceSum.h"
 #include "../core/arithmetic/Multiply.h"
+#include "../core/arithmetic/MultiplyDim.h"
 #include "../core/shape/Unsqueeze.h"
 #include "../core/arithmetic/Sum.h"
 #include "../XDevice.h"
@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
 */
 void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
                          XTensor * dedy, XTensor * dedx,
-                          int leadDim,
+                          XTensor * padding, int leadDim,
                          LOSS_FUNCTION_NAME lossName)
 {
+    int n = leadDim < 0 ? y->order - 1 : leadDim;
+
    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID), "Matrices used in log softmax are not on the same GPU.");
    CheckNTErrors((y->order >= 1), "Empty tensor!");
@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,

        if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
            _Sum(y, gold, dedx, -1.0F);
+            if(padding != NULL) {
+                int paddingOrder = padding->order;
+                int * paddingDims = new int[paddingOrder];
+                memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+                padding->Reshape(padding->unitNum);
+
+                int order = dedx->order;
+                int * dims = new int[order];
+                memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+                dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
+                _MultiplyDimMe(dedx, padding, 0);
+
+                padding->Reshape(paddingOrder, paddingDims);
+                dedx->Reshape(order, dims);
+
+                delete[] paddingDims;
+                delete[] dims;
+            }
        }
        else if(lossName == ONEHOTERROR){
            ShowNTErrors("TODO!");

--- a/source/tensor/function/Softmax.cuh
+++ b/source/tensor/function/Softmax.cuh
@@ -37,8 +37,8 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s

 /* de/dx (Cuda version) */
 void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                          XTensor * dedy, XTensor * dedx,
-                          int leadDim, 
+                          XTensor * dedy, XTensor * dedx, 
+                          XTensor * padding, int leadDim, 
                          LOSS_FUNCTION_NAME lossName);

 #endif // USE_CUDA

--- a/source/tensor/function/Softmax.h
+++ b/source/tensor/function/Softmax.h
@@ -35,8 +35,8 @@ XTensor Softmax(const XTensor &x, int leadDim);

 /* de/dx */
 void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
-                      XTensor * dedy, XTensor * dedx,
-                      int leadDim,
+                      XTensor * dedy, XTensor * dedx, 
+                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName);

 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/test/TDropout.cpp
+++ b/source/tensor/test/TDropout.cpp
@@ -169,8 +169,8 @@ bool TestDropout2()
    _DropoutBackward(y, x, dedy, dedx, 1, dropProb);

    /* check result */
-    y->Dump(stderr, "y");
-    dedx->Dump(stderr, "dedy");
+    //y->Dump(stderr, "y");
+    //dedx->Dump(stderr, "dedy");

 #ifdef USE_CUDA
    /* GPU test */
@@ -193,8 +193,8 @@ bool TestDropout2()
    _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);

    /* check result */
-    yGPU->Dump(stderr, "yGPU");
-    dedxGPU->Dump(stderr, "dedyGPU");
+    //yGPU->Dump(stderr, "yGPU");
+    //dedxGPU->Dump(stderr, "dedyGPU");

    /* destroy variables */
    delete x;

--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(x, y, 1);
    
    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
    
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) 
@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(xGPU, yGPU, 1);

    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
    
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(x, y, 1);
    
    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
    
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) 
@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(xGPU, yGPU, 1);

    /* call LogSoftmaxBackward function */
-    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
    
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) 

--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
@@ -66,7 +66,9 @@ bool TestPower1()
    bUser = Power(*a, 2.0F);

 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
    
 #ifdef USE_CUDA
 	/* GPU test */
@@ -88,7 +90,9 @@ bool TestPower1()
    bUserGPU = Power(*aGPU, 2.0F);

 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
    
 	/* destroy variables */
 	delete a;
@@ -153,7 +157,9 @@ bool TestPower2()
    bUser = Power(*a, 1.0F);

 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
    
 #ifdef USE_CUDA
 	/* GPU test */
@@ -175,7 +181,9 @@ bool TestPower2()
    bUserGPU = Power(*aGPU, 1.0F);

 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

 	/* destroy variables */
 	delete a;
@@ -214,7 +222,7 @@ bool TestPower3()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];

-	DTYPE aData[3][2] = { {0.0F, 1.0F},
+	DTYPE aData[3][2] = { {1.0F, 1.0F},
 	                      {2.0F, 3.0F},
 	                      {4.0F, 5.0F} };
 	DTYPE answer[3][2] = { {1.0F, 1.0F},
@@ -240,7 +248,9 @@ bool TestPower3()
    bUser = Power(*a, 0.0F);

 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
    
 #ifdef USE_CUDA
 	/* GPU test */
@@ -262,7 +272,9 @@ bool TestPower3()
    bUserGPU = Power(*aGPU, 0.0F);

 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

 	/* destroy variables */
 	delete a;

--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
@@ -146,7 +146,7 @@ bool TestSoftmax2()
    _Softmax(x, y, 1);
    
    /* call SoftmaxBackward function */
-    _SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
    
    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
@@ -174,7 +174,7 @@ bool TestSoftmax2()
    _Softmax(xGPU, yGPU, 1);

    /* call SoftmaxBackward function */
-    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
    
    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)

--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
@@ -20,8 +20,9 @@
 */

 #include "TSumDim.h"
-#include "../core/arithmetic/SumDim.h"
 #include "../XTensor.h"
+#include "../core/arithmetic/SumDim.h"
+#include "../core/getandset/SetData.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -251,6 +252,225 @@ bool TestSumDim2()
 #endif // USE_CUDA
 }

+/* 
+case 3: tensor summation c = a + b * \beta 
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting.
+In this case, 
+(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
+a is all zeros and b is all ones, so every item of the result must be 1.
+<< return - true if all the checks passed, false otherwise
+*/
+bool TestSumDim3()
+{
+    /* a tensor of size (20, 40, 4000) */
+    int aOrder = 3;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 20;
+    aDimSize[1] = 40;
+    aDimSize[2] = 4000;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    /* a tensor of size (40) */
+    int bOrder = 1;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 40;
+
+    int bUnitNum = 1;
+    for (int i = 0; i < bOrder; i++)
+        bUnitNum *= bDimSize[i];
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(aOrder, aDimSize);
+    XTensor * cMe = NewTensor(aOrder, aDimSize);
+    XTensor * answer = NewTensor(aOrder, aDimSize);
+    XTensor cUser;
+
+    /* initialize variables */
+    a->SetZeroAll();
+    cMe->SetZeroAll();
+    _SetDataFixedFloat(b, 1.0F);
+    _SetDataFixedFloat(answer, 1.0F);
+
+    /* call SumDim function */
+    _SumDim(a, b, c, 1);
+    _SumDim(cMe, b, 1);
+    cUser = SumDim(*a, *b, 1);
+    
+    /* check results */
+    cpuTest = c->CheckData(answer->data, aUnitNum) && 
+              cMe->CheckData(answer->data, aUnitNum) && 
+              cUser.CheckData(answer->data, aUnitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+
+    /* Initialize variables 
+       (cMeGPU is summed in place below, so it is the GPU tensor that must be zeroed here) */
+    aGPU->SetZeroAll();
+    cMeGPU->SetZeroAll();
+    _SetDataFixedFloat(bGPU, 1.0F);
+
+    /* call sum function */
+    _SumDim(aGPU, bGPU, cGPU, 1);
+    _SumDim(cMeGPU, bGPU, 1);
+    cUserGPU = SumDim(*aGPU, *bGPU, 1);
+
+    /* check results */
+    gpuTest = cGPU->CheckData(answer->data, aUnitNum) && 
+              cMeGPU->CheckData(answer->data, aUnitNum) && 
+              cUserGPU.CheckData(answer->data, aUnitNum);
+
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/* 
+case 4: tensor summation c = a + b * \beta 
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting.
+In this case, 
+(1000000, 50) + (50) = (1000000, 50), dim = 1.
+a is all zeros and b is all ones, so every item of the result must be 1.
+<< return - true if all the checks passed, false otherwise
+*/
+bool TestSumDim4()
+{
+    /* a tensor of size (1000000, 50) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 1000000;
+    aDimSize[1] = 50;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    /* a tensor of size (50) */
+    int bOrder = 1;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 50;
+
+    int bUnitNum = 1;
+    for (int i = 0; i < bOrder; i++)
+        bUnitNum *= bDimSize[i];
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(aOrder, aDimSize);
+    XTensor * cMe = NewTensor(aOrder, aDimSize);
+    XTensor * answer = NewTensor(aOrder, aDimSize);
+    XTensor cUser;
+
+    /* initialize variables */
+    a->SetZeroAll();
+    cMe->SetZeroAll();
+    _SetDataFixedFloat(b, 1.0F);
+    _SetDataFixedFloat(answer, 1.0F);
+
+    /* call SumDim function */
+    _SumDim(a, b, c, 1);
+    _SumDim(cMe, b, 1);
+    cUser = SumDim(*a, *b, 1);
+    
+    /* check results */
+    cpuTest = c->CheckData(answer->data, aUnitNum) && 
+              cMe->CheckData(answer->data, aUnitNum) && 
+              cUser.CheckData(answer->data, aUnitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+
+    /* Initialize variables 
+       (cMeGPU is summed in place below, so it is the GPU tensor that must be zeroed here) */
+    aGPU->SetZeroAll();
+    cMeGPU->SetZeroAll();
+    _SetDataFixedFloat(bGPU, 1.0F);
+
+    /* call sum function */
+    _SumDim(aGPU, bGPU, cGPU, 1);
+    _SumDim(cMeGPU, bGPU, 1);
+    cUserGPU = SumDim(*aGPU, *bGPU, 1);
+
+    /* check results */
+    gpuTest = cGPU->CheckData(answer->data, aUnitNum) && 
+              cMeGPU->CheckData(answer->data, aUnitNum) && 
+              cUserGPU.CheckData(answer->data, aUnitNum);
+
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete answer;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
 /* other cases */
 /*
    TODO!!
@@ -279,6 +499,24 @@ bool TestSumDim()
    }
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");
+    
+    /* case 3 test */
+    caseFlag = TestSumDim3();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 3 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 3 passed!\n");
+        
+    ///* case 4 test */
+    //caseFlag = TestSumDim4();
+    //if (!caseFlag) {
+    //    returnFlag = false;
+    //    XPRINT(0, stdout, ">> case 4 failed!\n");
+    //}
+    //else
+    //    XPRINT(0, stdout, ">> case 4 passed!\n");

    /* other cases test */
    /*