Emmay / NiuTrans.Tensor · Commits

Commit 44bf9fa6, authored Jan 21, 2019 by linye

    Merged in the latest code and fixed some bugs.

parent c9c53870
Showing 63 changed files, with 1812 additions and 1244 deletions.
source/network/Main.cpp  +77 -157
source/network/XBackwardLoss.cpp  +1 -3
source/network/XBackwardMath.cpp  +66 -31
source/network/XBackwardMath.h  +10 -0
source/network/XNet.cpp  +1 -2
source/sample/fnnlm/FNNLM.cpp  +14 -6
source/sample/transformer/T2TAttention.cpp  +144 -16
source/sample/transformer/T2TAttention.h  +2 -1
source/sample/transformer/T2TDecoder.cpp  +34 -5
source/sample/transformer/T2TDecoder.h  +48 -1
source/sample/transformer/T2TEmbedding.cpp  +7 -2
source/sample/transformer/T2TEmbedding.h  +1 -1
source/sample/transformer/T2TEncoder.cpp  +2 -1
source/sample/transformer/T2TModel.cpp  +15 -10
source/sample/transformer/T2TOutput.cpp  +1 -1
source/sample/transformer/T2TTrainer.cpp  +203 -46
source/sample/transformer/T2TTrainer.h  +33 -2
source/sample/transformer/Transformer.cpp  +1 -0
source/tensor/XDevice.cpp  +9 -0
source/tensor/XDevice.h  +6 -0
source/tensor/XMem.cpp  +17 -0
source/tensor/XMem.h  +4 -0
source/tensor/XName.cpp  +4 -0
source/tensor/XName.h  +4 -2
source/tensor/XTensor.cpp  +1 -5
source/tensor/core/arithmetic/Div.cpp  +0 -2
source/tensor/core/arithmetic/DivDim.cu  +179 -54
source/tensor/core/arithmetic/MatrixMul.cpp  +4 -213
source/tensor/core/arithmetic/MatrixMul.h  +0 -9
source/tensor/core/arithmetic/MatrixMul2D.cpp  +1 -13
source/tensor/core/arithmetic/MatrixMul2D.cu  +0 -12
source/tensor/core/arithmetic/MatrixMul2DParallel.cpp  +0 -38
source/tensor/core/arithmetic/MatrixMulBatched.cpp  +0 -139
source/tensor/core/arithmetic/MatrixMulBatched.h  +0 -8
source/tensor/core/arithmetic/Multiply.cpp  +0 -2
source/tensor/core/arithmetic/MultiplyDim.cpp  +146 -6
source/tensor/core/arithmetic/MultiplyDim.cu  +3 -3
source/tensor/core/arithmetic/MultiplyDim.h  +9 -2
source/tensor/core/arithmetic/SubDim.cu  +159 -42
source/tensor/core/arithmetic/Sum.cu  +25 -11
source/tensor/core/arithmetic/SumDim.cpp  +146 -1
source/tensor/core/arithmetic/SumDim.cu  +162 -51
source/tensor/core/arithmetic/SumDim.cuh  +2 -0
source/tensor/core/arithmetic/SumDim.h  +10 -0
source/tensor/core/arithmetic/XTensorBLAS.cu  +15 -79
source/tensor/core/arithmetic/XTensorBLAS.h  +0 -2
source/tensor/core/getandset/SetData.cpp  +33 -0
source/tensor/core/getandset/SetData.cu  +102 -12
source/tensor/core/getandset/SetData.cuh  +4 -0
source/tensor/core/getandset/SetData.h  +4 -0
source/tensor/core/math/Power.cpp  +0 -2
source/tensor/core/movement/Gather.cpp  +0 -0
source/tensor/core/movement/Spread.cpp  +0 -0
source/tensor/core/movement/Spread.cu  +0 -0
source/tensor/core/reduce/ReduceMax.cpp  +0 -1
source/tensor/core/reduce/ReduceMax.cu  +16 -26
source/tensor/core/reduce/ReduceSum.cu  +14 -31
source/tensor/function/Dropout.cpp  +72 -53
source/tensor/function/Dropout.h  +1 -4
source/tensor/function/LogSoftmax.cpp  +0 -122
source/tensor/function/LogSoftmax.cu  +0 -1
source/tensor/function/Softmax.cpp  +0 -1
source/tensor/function/Softmax.cu  +0 -12
source/network/Main.cpp

@@ -66,6 +66,7 @@ void PowerFP16Test();
void ClipFP16Test();
void GatherFP16Test();
void SetDataGPUFP16Test();
void SumIntTest();

using namespace nts;
using namespace fnnlm;
...
@@ -89,8 +90,6 @@ int main( int argc, const char ** argv )
    //return 0;
    //ConvertBackwardTest();
    //return 0;
    //DropoutFP16Test();
    //return 0;
    //UnsqueezeFP16Test();
    //return 0;
    //ReduceMaxFP16Test();
...
@@ -143,11 +142,17 @@ int main( int argc, const char ** argv )
    //InitCPUFP16Test();
    //return 0;

    SetDataGPUFP16Test();
    return 0;

    //MycublasGemmExTest();
    //return 0;

    MycublasGemmExTest();
    return 0;

    //SumIntTest();
    //return 0;

    //DropoutFP16Test();
    //return 0;

    //SetDataGPUFP16Test();
    //return 0;

    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
...
@@ -205,27 +210,69 @@ void MycublasGemmExTest() {
    c1.Dump(stderr, "c1:");
}

void SumIntTest()
{
    XTensor a;
    XTensor b;
    XTensor c;
    XTensor inta;
    XTensor intb;
    XTensor intc;

    InitTensor2D(&a, 2, 2, X_FLOAT, 0);
    InitTensor2D(&b, 2, 2, X_FLOAT, 0);

    a.SetDataRand(-5.0, 5.0);
    b.SetDataRand(-5.0, 5.0);

    a.Dump(stderr, "a:");
    b.Dump(stderr, "b:");

    inta = ConvertDataType(a, X_INT);
    intb = ConvertDataType(b, X_INT);

    inta.Dump(stderr, "inta:");
    intb.Dump(stderr, "intb:");

    intc = Sum(inta, intb);

    intc.Dump(stderr, "intc:");
}

void SetDataGPUFP16Test()
{
    srand(time(NULL));

    XTensor a1;
    InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
    _SetDataRand(&a1, -5.0F, 5.0F);
    a1.Dump(&a1, stderr, "a:\n");

    /*XTensor m;
    InitTensor2D(&m, 2, 2, X_FLOAT, 0);
    m.SetDataRand(0.0, 10.0);*/

    XTensor a;
    InitTensor2D(&a, 2, 2, X_FLOAT16, 0);

    XTensor * m = NewTensor2D(2, 2, X_FLOAT, 0);
    m->SetDataRand(0.0, 10.0);

    _SetDataRand(&a, -5.0F, 5.0F);

    //XTensor a1;
    //InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
    //_CopyValues(&m, &a1);
    //_SetDataRand(&a1, -1.0F, 1.0F);
    //a1.Dump(&a1, stderr, "a:\n");

    a.Dump(&a, stderr, "a:\n");

    /*XTensor a;
    InitTensor2D(&a, 2, 2, X_FLOAT16, 0);*/

    XTensor b;
    InitTensor2D(&b, 2, 2, X_FLOAT, 0);

    XTensor * a = NewTensor2D(2, 2, X_FLOAT16, 0);

    b = ConvertDataType(a, X_FLOAT);

    _ConvertDataType(m, a);
    a->Dump(a, stderr, "a:\n");
    _SetDataRand(a, 0.0F, 1.0F);
    a->Dump(a, stderr, "a:\n");

    //XTensor b;
    //InitTensor2D(&b, 2, 2, X_FLOAT, 0);
    //b = ConvertDataType(a, X_FLOAT);
    //b.Dump(stderr, "b:\n");

    b.Dump(stderr, "b:\n");
}

void ClipFP16Test()
{
...
@@ -447,7 +494,7 @@ void FloatToInt8Test() {
    InitTensor2D(&a, 2, 2, X_FLOAT, 0);
    InitTensor2D(&b, 2, 2, X_INT8, 0);

    a.SetDataRand(-5.0F, 5.0F);
    a.SetDataRand(5.0F, 5.0F);

    a.Dump(stderr, "a:");

    b = ConvertDataType(a, X_INT8);
...
@@ -741,8 +788,8 @@ void MultiplyDimFP16Test()
    halfA = ConvertDataType(a, X_FLOAT16);
    halfB = ConvertDataType(b, X_FLOAT16);

    c1 = MultiplyDim(a1, b1, 1, 0);
    halfC = MultiplyDim(halfA, halfB, 1, 0);
    c1 = MultiplyDim(a1, b1, 1);
    halfC = MultiplyDim(halfA, halfB, 1);

    c = ConvertDataType(halfC, X_FLOAT);
...
@@ -950,26 +997,26 @@ void SubFP16Test()
void DropoutFP16Test()
{
    srand(time(NULL));

    XTensor a;
    XTensor b;
    XTensor b1;
    XTensor halfA;
    XTensor halfB;

    InitTensor2D(&a, 10, 10, X_FLOAT, 0);
    InitTensor2D(&a, 10, 1, X_FLOAT, 0);

    a.SetDataRand(-5.0F, 5.0F);

    /*a.Dump(stderr, "a:");*/
    a.Dump(stderr, "a:");

    halfA = ConvertDataType(a, X_FLOAT16);

    halfB = Dropout(halfA, 0.5);
    b1 = Dropout(a, 0.3);
    b = ConvertDataType(halfB, X_FLOAT);
    halfB = Dropout(halfA, 0.2);
    b1 = Dropout(a, 0.2);

    b.Dump(stderr, "b:");
    //b1.Dump(stderr, "b1:");
    halfB.Dump(&halfB, stderr, "halfB:");
    b1.Dump(&b1, stderr, "b1:");
}

void ConvertBackwardTest()
...
@@ -1070,131 +1117,3 @@ void ConvertTest()
        a1.Dump(stderr, "halfa:");
    }
}
\ No newline at end of file

void MatrixMulFloat16AndFloatTest()
{
    XTensor a;
    XTensor b;
    XTensor c;

    InitTensor2D(&a, 5000, 5000, X_FLOAT, 0);
    InitTensor2D(&b, 5000, 5000, X_FLOAT, 0);
    InitTensor2D(&c, 5000, 5000, X_FLOAT, 0);

    a.SetDataRand(-10.0F, 10.0F);
    b.SetDataRand(-10.0F, 10.0F);

    int recurrentNum = 10000;

    double startT1 = GetClockSec();
    for (int i1 = 0; i1 < recurrentNum; i1++) {
        c = MatrixMul(&a, &b);
    }
    printf("ElapsedFloat32 = %.2f s\n", GetClockSec() - startT1);

    double startT2 = GetClockSec();
    for (int i2 = 0; i2 < recurrentNum; i2++) {
        c = MatrixMulFloat16(&a, &b);
    }
    printf("ElapsedFloat16 = %.2f s\n", GetClockSec() - startT2);
}

void MatrixMul2DFloat16Test()
{
    XTensor a;
    XTensor b;
    XTensor c;
    XTensor a00;
    XTensor b00;
    XTensor c00;
    XTensor c01;
    XTensor halfa;
    XTensor halfb;
    XTensor halfc;

    InitTensor3D(&a, 3, 2, 3, X_FLOAT, 0);
    InitTensor2D(&b, 3, 2, X_FLOAT, 0);
    InitTensor3D(&c, 3, 2, 2, X_FLOAT, 0);
    InitTensor3D(&a00, 3, 2, 3, X_FLOAT, 0);
    InitTensor2D(&b00, 3, 2, X_FLOAT, 0);
    InitTensor3D(&c00, 3, 2, 2, X_FLOAT, 0);
    InitTensor3D(&c01, 3, 2, 2, X_FLOAT, 0);
    InitTensor3D(&halfa, 3, 2, 3, X_FLOAT16, 0);
    InitTensor2D(&halfb, 3, 2, X_FLOAT16, 0);
    InitTensor3D(&halfc, 3, 2, 2, X_FLOAT16, 0);

    DTYPE aData[3][2][3] = { { {0.02121212144F, -1.0234556667F, 2.04354565678F},
                               {2.0234567332F, -1.0213469654F, -3.01568321F} },
                             { {-1.022347899421F, 2.012589653664F, 4.035346643F},
                               {3.01234544634F, 1.0324354635F, 2.0546578332F} },
                             { {-1.0235743446F, 3.0335753334F, 2.0653323234F},
                               {1.03235643232F, -1.023463345542F, 0.0335563322F} } };
    DTYPE bData[3][2] = { {-1.034466323232F, -2.0546676442F},
                          {-3.0224354656F, 4.034467866532F},
                          {5.02354657442F, -6.0324355767443F} };

    a.SetData(aData, 18);
    b.SetData(bData, 6);

    _MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);

    _ConvertDataType(&a, &halfa);
    _ConvertDataType(&b, &halfb);
    _MatrixMul(&halfa, X_NOTRANS, &halfb, X_NOTRANS, &halfc);
    _ConvertDataType(&halfc, &c01);

    _ConvertDataType(&halfa, &a00);
    _ConvertDataType(&halfb, &b00);
    _MatrixMul(&a00, X_NOTRANS, &b00, X_NOTRANS, &c00);

    c.Dump(stderr, "c:");
    c01.Dump(stderr, "c01:");
    c00.Dump(stderr, "c0:");

    XTensor a1;
    XTensor b1;
    XTensor c1;
    XTensor a10;
    XTensor b10;
    XTensor c10;
    XTensor c11;
    XTensor halfa1;
    XTensor halfb1;
    XTensor halfc1;

    InitTensor2D(&a1, 3, 3, X_FLOAT, 0);
    InitTensor2D(&b1, 3, 2, X_FLOAT, 0);
    InitTensor2D(&c1, 3, 2, X_FLOAT, 0);
    InitTensor2D(&a10, 3, 3, X_FLOAT, 0);
    InitTensor2D(&b10, 3, 2, X_FLOAT, 0);
    InitTensor2D(&c10, 3, 2, X_FLOAT, 0);
    InitTensor2D(&c11, 3, 2, X_FLOAT, 0);
    InitTensor2D(&halfa1, 3, 3, X_FLOAT16, 0);
    InitTensor2D(&halfb1, 3, 2, X_FLOAT16, 0);
    InitTensor2D(&halfc1, 3, 2, X_FLOAT16, 0);

    DTYPE a1Data[3][3] = { {0.02121212144F, -1.0234556667F, 2.043541565678F},
                           {-2.0234567332F, 1.0213469657774F, -3.0156837543321F},
                           {1.022347899421F, -2.012589653664F, 4.03534634643F} };
    DTYPE b1Data[3][2] = { {1.034466323232F, -2.0546676442F},
                           {3.0224354656F, -4.034467866532F},
                           {5.02354657442F, 6.0324355767443F} };

    a1.SetData(a1Data, 9);
    b1.SetData(b1Data, 6);

    _MatrixMul(&a1, X_NOTRANS, &b1, X_NOTRANS, &c1);

    _ConvertDataType(&a1, &halfa1);
    _ConvertDataType(&b1, &halfb1);
    _MatrixMul(&halfa1, X_NOTRANS, &halfb1, X_NOTRANS, &halfc1);
    _ConvertDataType(&halfc1, &c11);

    _ConvertDataType(&halfa1, &a10);
    _ConvertDataType(&halfb1, &b10);
    _MatrixMul(&a10, X_NOTRANS, &b10, X_NOTRANS, &c10);

    c1.Dump(stderr, "c1:");
    c11.Dump(stderr, "c11:");
    c10.Dump(stderr, "c10:");
}
source/network/XBackwardLoss.cpp

@@ -87,8 +87,6 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
                        XTensor * dedy, XTensor * padding,
                        LOSS_FUNCTION_NAME lossName)
{
    //return;
    if(gold == NULL){
        if(dedy->dataType == X_FLOAT)
            _SetDataFixedFloat(dedy, 1.0F);
...
@@ -97,7 +95,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
        else if(dedy->dataType == X_INT)
            _SetDataFixedInt(dedy, 1);
        else{
            //ShowNTErrors("TODO");
            ShowNTErrors("TODO");
        }
        return;
    }
...
source/network/XBackwardMath.cpp

@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradMultiply(node, isEfficient);
    else if(operID == MATH_MULTIPLYDIM)
        GradMultiplyDim(node, isEfficient);
    else if(operID == MATH_MULTIPLYBROADCAST)
        GradMultiplyBroadcast(node, isEfficient);
    else if(operID == MATH_NEGATE)
        GradNegate(node, isEfficient);
    else if(operID == MATH_NORMALIZE)
...
@@ -87,6 +89,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradSum(node, isEfficient);
    else if(operID == MATH_SUMDIM)
        GradSumDim(node, isEfficient);
    else if(operID == MATH_SUMBROADCAST)
        GradSumBroadcast(node, isEfficient);
    else if(operID == REDUCE_REDUCEMEAN)
        GradReduceMean(node, isEfficient);
    else if(operID == REDUCE_REDUCESUM)
...
@@ -736,10 +740,6 @@ dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
{
    /* XTensor node1;
    node1 = ConvertDataType(*node, X_FLOAT);
    node1.Dump(stderr, "node:");*/

    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
...
@@ -752,14 +752,6 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
    /* dE/da */
    _MultiplyDim(node->grad, b, a->grad, n, 1.0F);

    //XTensor a1;
    //a1 = ConvertDataType(*a, X_FLOAT);
    //a1.Dump(stderr, "a:");
    //XTensor b1;
    //b1 = ConvertDataType(*b, X_FLOAT);
    //b1.Dump(stderr, "b:");

    /* dE/db */
    int order = a->order;
    int dimSize[MAX_TENSOR_DIM_NUM];
...
@@ -777,19 +769,10 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
       size of b. Then we can reduce the matrix into a row vector. */
    bGradTMP->Reshape(2, reshapedSize);

    /*XTensor bGradTMP1;
    bGradTMP1 = ConvertDataType(*bGradTMP, X_FLOAT);
    bGradTMP1.Dump(stderr, "bGradTMP:");*/

    //if(b->outgo.tailNum > 1){
    XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
    _ReduceSum(bGradTMP, bGradTMP2, 0);

    /* XTensor bGradTMP21;
    bGradTMP21 = ConvertDataType(*bGradTMP2, X_FLOAT);
    bGradTMP21.Dump(stderr, "bGradTMP2:");*/

    _Sum(b->grad, bGradTMP2, b->grad);
    DelTensorBuf(bGradTMP2);
...
@@ -832,21 +815,43 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
        DelTensorBuf(interGrad);
    }

    //printf("\n");
    //XTensor a2;
    //a2 = ConvertDataType(*a, X_FLOAT);
    //a2.Dump(stderr, "a2:");
    //XTensor b2;
    //b2 = ConvertDataType(*b, X_FLOAT);
    //b2.Dump(stderr, "b2:");

    DelTensorBuf(bGradTMP);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for multiplication by broadcasting:
c = a * b
where some dimensions of b are of size 1
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0...n)
where a.reduce(0...n) is the reduction along the dimension
whose size is 1 in b. Note that there might be several reductions.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
    DTYPE beta = income.GetParam(0);

    XNoder::MakeGrad(a);

    _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);

    if(b->isVar || b->income.tailNum > 0){
        ShowNTErrors("TODO");
    }
}

/*
gradient for negate
for
c = -a
...
@@ -1020,7 +1025,6 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
    _Sum(a->grad, node->grad, a->grad, scale);

    node->visitMark = NODE_FINISHED;
}
...
@@ -1285,6 +1289,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
}

/*
gradient for sum by broadcasting:
c = a + b * \beta
where some dimensions of b are of size 1
dE/da = dE/dc
dE/db = dE/dc * a.reduce(0..n) * \beta
where a.reduce(0..n) is the reduction along the dimension
whose size is 1 in b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
    DTYPE beta = income.GetParam(0);

    XNoder::MakeGrad(a);

    _Sum(a->grad, node->grad, a->grad);

    if(b->isVar || b->income.tailNum > 0){
        ShowNTErrors("TODO");
    }
}

/*
gradient for reduceMean
for
c = reduceMean(a, dim)
...
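A note for readers of the two doc comments above (this restatement is mine, not part of the commit): writing S for the set of axes on which b has size 1, the broadcast rules being implemented are

\begin{aligned}
c = a \odot b:&\quad \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c} \odot b, \qquad \frac{\partial E}{\partial b} = \sum_{i \in S}\Big(\frac{\partial E}{\partial c} \odot a\Big)\\
c = a + \beta\, b:&\quad \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c}, \qquad \frac{\partial E}{\partial b} = \beta \sum_{i \in S}\frac{\partial E}{\partial c}
\end{aligned}

Both new functions currently accumulate only the dE/da term and raise ShowNTErrors("TODO") whenever b itself needs a gradient, which matches the dE/db reductions above being left unimplemented here.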
source/network/XBackwardMath.h

@@ -109,6 +109,11 @@ private:
    static
    void GradMultiplyDim(XTensor * node, bool isEfficient);

    /* gradient for multiply one dimension: c = a * b
       where some dimensions of b are of size 1 */
    static
    void GradMultiplyBroadcast(XTensor * node, bool isEfficient);

    /* gradient for negate */
    static
    void GradNegate(XTensor * node, bool isEfficient);
...
@@ -143,6 +148,11 @@ private:
    static
    void GradSumDim(XTensor * node, bool isEfficient);

    /* gradient for sum by broadcasting: c = a + b * \beta
       where some dimensions of b are of size 1 */
    static
    void GradSumBroadcast(XTensor * node, bool isEfficient);

    /* gradient for reduceMean */
    static
    void GradReduceMean(XTensor * node, bool isEfficient);
...
source/network/XNet.cpp

@@ -27,7 +27,6 @@
#include "XBackwardFunc.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"

namespace nts{
...
@@ -266,7 +265,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
        XMathGrad::MakeGrad(node, isEfficent);
    else if(XFuncGrad::IsFunc(node))
        XFuncGrad::MakeGrad(node, isEfficent);
    else if(XDataGrad::IsDataOP(node))
    else if(XDataGrad::IsDataOP(node))
        XDataGrad::MakeGrad(node, isEfficent);
    else if(XShapeGrad::IsShapeOP(node))
        XShapeGrad::MakeGrad(node, isEfficent);
...
source/sample/fnnlm/FNNLM.cpp

@@ -839,6 +839,9 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
        InitModelTensor2D(s, batchSize, model.vSize, model);
        InitModelTensor2D(y, batchSize, model.vSize, model);

        ///* s = h_last * w */
        //_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);

        XTensor h_last1;
        h_last1 = ScaleAndShift(h_last, 100, 0);
...
@@ -852,20 +855,25 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
        int8W = ConvertDataType(w1, X_INT8);

        XTensor s1;
        InitTensor2D(&s1, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);
        InitTensor2D(&s1, batchSize, model.vSize, X_INT, model.devID, model.mem);

        _MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);

        s = ScaleAndShift(s1, 0.0001, 0);

        XTensor b2D;
        InitTensor(&b2D, &s);
        InitTensor2D(&b2D, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);

        _Unsqueeze(&b, &b2D, 0, batchSize);
        _Sum(&s, &b2D, &s);

        b2D = ScaleAndShift(b2D, 10000, 0);

        XTensor b2D1;
        b2D1 = ConvertDataType(b2D, X_INT);

        _Sum(&s1, &b2D1, &s1);

        s = ConvertDataType(s1, X_FLOAT);
        s = ScaleAndShift(s, 0.0001, 0);

        /* y = softmax(s) */
        _LogSoftmax(&s, &y, 1);
    }
}
...
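The hunk above runs the output projection in integer arithmetic by scaling up before quantization and scaling back down afterwards: h_last (and, in the elided lines, presumably the weight) is multiplied by 100, the bias is multiplied by 10000, and the integer result is finally rescaled by 0.0001 = 1/(100*100). A minimal standalone sketch of that fixed-point arithmetic on scalars (plain C++, no NiuTrans API; all names here are illustrative):

#include <cstdint>
#include <cstdio>

// One cell of the int8 "matmul plus bias" path, mirrored on scalars.
// h and w carry a factor of 100 each after quantization, so their product
// carries 100 * 100 = 10000; the bias is pre-scaled by 10000 so it lives in
// the same scale, and dividing by 10000 (multiplying by 0.0001) recovers the
// float-scale result.
int main()
{
    float h = 0.37f, w = -0.82f, b = 0.05f;

    int8_t  h8 = (int8_t)(h * 100.0f);       // quantized hidden value
    int8_t  w8 = (int8_t)(w * 100.0f);       // quantized weight
    int32_t s  = (int32_t)h8 * (int32_t)w8;  // integer product
    s += (int32_t)(b * 10000.0f);            // bias in the same fixed-point scale

    float sFloat = s * 0.0001f;              // back to float scale
    printf("int8 path: %f   float path: %f\n", sFloat, h * w + b);
    return 0;
}

Running it prints the same value (-0.2534) on both paths, up to quantization error, which is the property the FNNLM forward pass relies on.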
source/sample/transformer/T2TAttention.cpp

@@ -53,6 +53,42 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
//void T2TAttention::InitModel(int argc, char ** argv,
//                             bool myIsMasked, int myIgnored,
//                             int myDevID, XMem * myMem)
//{
//    devID = myDevID;
//    mem = myMem;
//    isMasked = myIsMasked;
//    ignored = myIgnored;
//
//    float minmax = 0;
//
//    LoadParamInt(argc, argv, "nhead", &nhead, 8);
//    LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
//    LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
//    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
//    LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
//    LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
//
//    InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
//    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
//    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
//    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
//    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
//
//    float scale = 1.0F;
//    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
//    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
//    float finfouta = (float)sqrt(6.0F * scale / (d + d));
//    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
//
//    wk.SetDataRand(-finfoutk, finfoutk);
//    wq.SetDataRand(-finfoutk, finfoutk);
//    wv.SetDataRand(-finfoutv, finfoutv);
//    wa.SetDataRand(-finfouta, finfouta);
//    wbig.SetDataRand(-finfoutbig, finfoutbig);
//}

void T2TAttention::InitModel(int argc, char ** argv,
                             bool myIsMasked, int myIgnored,
...
@@ -76,20 +112,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
    InitTensor2D(&wq, d, dk, X_FLOAT16, devID, mem);
    InitTensor2D(&wv, d, dv, X_FLOAT16, devID, mem);
    InitTensor2D(&wa, d, d, X_FLOAT16, devID, mem);
    InitTensor2D(&wbig, d, 3 * d, X_FLOAT16, devID, mem);

    float scale = 1.0F;
    float finfoutk = (float)sqrt(6.0F * scale / (d + dk));
    float finfoutv = (float)sqrt(6.0F * scale / (d + dv));
    float finfouta = (float)sqrt(6.0F * scale / (d + d));
    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3 * d));

    wk.SetDataRand(-finfoutk, finfoutk);
    wq.SetDataRand(-finfoutk, finfoutk);
    wv.SetDataRand(-finfoutv, finfoutv);
    wa.SetDataRand(-finfouta, finfouta);
    //_SetDataRand(&wk, -finfoutk, finfoutk);
    //_SetDataRand(&wq, -finfoutk, finfoutk);
    //_SetDataRand(&wv, -finfoutv, finfoutv);
    //_SetDataRand(&wa, -finfouta, finfouta);
    wbig.SetDataRand(-finfoutbig, finfoutbig);
}

/*
...
@@ -103,40 +138,136 @@ make the network
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
//XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
//{
//    XTensor k2;
//    XTensor q2;
//    XTensor v2;
//
//    if (selfatt){
//
//        XTensor con;
//        XList split;
//
//        con = MMul(k, wbig);
//
//        int d1 = con.GetDim(0);
//        int d2 = con.GetDim(1);
//        int d3 = con.GetDim(2) / 3;
//
//        InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
//        InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
//        InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
//
//        split.Add(&q2);
//        split.Add(&k2);
//        split.Add(&v2);
//
//        Split(con, split, 2, 3);
//    }
//
//    else{
//        /* linear transofmration before self-attention */
//        k2 = MMul(k, wk);
//        q2 = MMul(q, wq);
//        v2 = MMul(v, wv);
//    }
//
//    XTensor kheads;
//    XTensor qheads;
//    XTensor vheads;
//
//    /* multi head */
//    kheads = Split(k2, k2.order - 1, nhead);
//    qheads = Split(q2, q2.order - 1, nhead);
//    vheads = Split(v2, v2.order - 1, nhead);
//
//    XTensor att;
//    XTensor dot;
//    XTensor scalar;
//
//    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
//    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
//
//    if(isMasked)
//        dot = dot + mask;
//
//    dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
//
//    scalar = Softmax(dot, -1);
//
//    if(isTraining && dropoutP > 0)
//        scalar = Dropout(scalar, dropoutP);
//
//    att = BMMul(scalar, vheads);
//
//    /* concatenate the heads */
//    return MMul(Merge(att, att.order - 1), wa);
//}

XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
{
    XTensor halfK;
    XTensor halfK2;
    XTensor halfQ2;
    XTensor halfV2;
    XTensor halfK;

    halfK = ConvertDataType(k, X_FLOAT16);

    if (selfatt) {
        XTensor halfCon;
        XList halfSplit;

        halfCon = MMul(halfK, wbig);

        int d1 = halfCon.GetDim(0);
        int d2 = halfCon.GetDim(1);
        int d3 = halfCon.GetDim(2) / 3;

        InitTensor3D(&halfK2, d1, d2, d3, X_FLOAT16, devID, mem);
        InitTensor3D(&halfQ2, d1, d2, d3, X_FLOAT16, devID, mem);
        InitTensor3D(&halfV2, d1, d2, d3, X_FLOAT16, devID, mem);

        halfSplit.Add(&halfQ2);
        halfSplit.Add(&halfK2);
        halfSplit.Add(&halfV2);

        Split(halfCon, halfSplit, 2, 3);
    }
    else {
        XTensor halfQ;
        XTensor halfV;

        halfQ = ConvertDataType(q, X_FLOAT16);
        halfV = ConvertDataType(v, X_FLOAT16);

        /* linear transofmration before self-attention */
        halfK2 = MMul(halfK, wk);
        halfQ2 = MMul(halfK, wq);
        halfV2 = MMul(halfK, wv);
        halfQ2 = MMul(halfQ, wq);
        halfV2 = MMul(halfV, wv);
    }

    XTensor halfKheads;
    XTensor halfQheads;
    XTensor halfVheads;

    /* multi head */
    halfKheads = Split(halfK2, halfK2.order - 1, nhead);
    halfQheads = Split(halfQ2, halfQ2.order - 1, nhead);
    halfVheads = Split(halfV2, halfV2.order - 1, nhead);

    XTensor halfMask;
    XTensor halfAtt;
    XTensor halfDot;
    XTensor halfScalar;
    XTensor halfAtt;

    halfMask = ConvertDataType(mask, X_FLOAT16);

    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
    halfDot = BMMul(halfQheads, X_NOTRANS, halfKheads, X_TRANS);

    //XTensor halfMask(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);

    if (isMasked) {
        XTensor halfMask;
        halfMask = ConvertDataType(mask, X_FLOAT16);
        halfDot = Sum(halfDot, halfMask);
    }
...
@@ -147,13 +278,10 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
    if(isTraining && dropoutP > 0)
        halfScalar = Dropout(halfScalar, dropoutP);

    /*att = BMMul(scalar, vheads);*/
    halfAtt = BMMul(halfScalar, halfVheads);

    /* concatenate the heads */
    return ConvertDataType(MMul(Merge(halfAtt, halfAtt.order - 1), wa), X_FLOAT);
}

}
source/sample/transformer/T2TAttention.h

@@ -60,6 +60,7 @@ public:
    /* transformation after dot-product attention */
    XTensor wa;

    XTensor wbig;

    /* size of transformed Q and K */
    int dk;
...
@@ -95,7 +96,7 @@ public:
                   int myDevID = -1, XMem * myMem = NULL);

    /* make the network */
    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
};

}
...
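With the new selfatt flag, Make() projects the input once with the packed d x 3d weight wbig and splits the result into three d-wide pieces instead of running three separate k/q/v projections. A standalone sketch of that pack-then-split idea on plain buffers (not NiuTrans code; the slice-to-Q/K/V order mirrors the halfSplit.Add order in T2TAttention.cpp but is otherwise my assumption):

#include <vector>
#include <cstddef>

// Split one packed projection of width 3*d into Q, K, V slices of width d,
// row by row. This is only an illustration of what the fused wbig projection
// buys: one matrix product instead of three, then a cheap split.
struct QKV { std::vector<float> q, k, v; };

QKV SplitPacked(const std::vector<float> & packed, std::size_t rows, std::size_t d)
{
    QKV out;
    out.q.reserve(rows * d); out.k.reserve(rows * d); out.v.reserve(rows * d);
    for (std::size_t r = 0; r < rows; ++r) {
        const float * row = packed.data() + r * 3 * d;
        out.q.insert(out.q.end(), row,         row + d);      // first third
        out.k.insert(out.k.end(), row + d,     row + 2 * d);  // second third
        out.v.insert(out.v.end(), row + 2 * d, row + 3 * d);  // last third
    }
    return out;
}

The decoder passes selfatt = true for self-attention and selfatt = false for the encoder-decoder attention (see T2TDecoder.cpp below), since only the self-attention case has k, q and v coming from the same input.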
source/sample/transformer/T2TDecoder.cpp

@@ -21,6 +21,8 @@
#include <math.h>
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../tensor/core/CHeader.h"

namespace transformer
...
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored,
                           int myDevID, XMem * myMem)
{
    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);

    devID = myDevID;
    mem = myMem;
    ignored = myIgnored;

    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");

    /* embedding model */
    embedder.InitModel(argc, argv, devID, mem, false);

    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
    attLayerNorms = new T2TLN[nlayer];
    fnnLayerNorms = new T2TLN[nlayer];
    attentionsEnde = new T2TAttention[nlayer];
    attEndeLayerNorms = new T2TLN[nlayer];

    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
        attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
    for (int i = 0; i < nlayer; i++) {
        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
        fnns[i].InitModel(argc, argv, myDevID, myMem);
        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
    }
}

/*
...
@@ -93,7 +122,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /******************/
        /* self attention */
        att = attentions[i].Make(x, x, x, mask, isTraining);
        att = attentions[i].Make(x, x, x, mask, isTraining, true);

        /* dropout */
        if(isTraining && dropoutP > 0)
...
@@ -107,7 +136,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /*****************************/
        /* encoder-decoder attention */
        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);

        /* dropout */
        if(isTraining && dropoutP > 0)
...
source/sample/transformer/T2TDecoder.h

@@ -27,9 +27,56 @@
namespace transformer
{

class AttDecoder : public AttEncoder
class AttDecoder
{
public:
    /* device id */
    int devID;

    /* memory pool */
    XMem * mem;

    /* layer number */
    int nlayer;

    /* hidden layer size of the FNN layer */
    int hSize;

    /* embedding size */
    int eSize;

    /* vocabulary size */
    int vSize;

    /* dropout probability */
    DTYPE dropoutP;

    /* some positions can be ignored in attention. this is useful in lm where the first position needs
     * special design for the attention model. */
    int ignored;

    /* embedding of word at each position */
    T2TEmbedder embedder;

    /* FNN model of each layer */
    T2TFNN * fnns;

    /* attention model of each layer */
    T2TAttention * attentions;

    /* layer normalization for fnn */
    T2TLN * fnnLayerNorms;

    /* layer normalization for attention */
    T2TLN * attLayerNorms;

    /* input tensor of the encoder */
    XTensor * input;

    /* output tensor of the encoder */
    XTensor * output;

    /* encoder-decoder attention model of each layer */
    T2TAttention * attentionsEnde;
...
source/sample/transformer/T2TEmbedding.cpp

@@ -48,12 +48,18 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
{
    devID = myDevID;
    mem = myMem;

    if(isEnc){
        LoadParamInt(argc, argv, "vsize", &vSize, -1);
    }
    else{
        LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    }
    //LoadParamInt(argc, argv, "vsize", &vSize, -1);
    LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
...
@@ -110,7 +116,6 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
    delete[] data;
}

/*
make the network
*/
...
source/sample/transformer/T2TEmbedding.h

@@ -71,7 +71,7 @@ public:
    ~T2TEmbedder();

    /* initialize the model */
    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);

    /* make positional embeddings */
    void MakePosEmbedding(int eSize, int d, int length);
...
source/sample/transformer/T2TEncoder.cpp

@@ -116,7 +116,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
        XTensor res;

        /* self attention */
        att = attentions[i].Make(x, x, x, mask, isTraining);
        att = attentions[i].Make(x, x, x, mask, isTraining, true);

        /* dropout */
        if(isTraining && dropoutP > 0)
...
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}

}
source/sample/transformer/T2TModel.cpp

@@ -236,10 +236,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
        XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);

        _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
        _Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
        _Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
        _ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
        _Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
        //_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
        //_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
        _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
        _Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);

        DelTensorBuf(maskEncDecTMPDec);
        DelTensorBuf(maskEncDecTMPEnc);
...
@@ -274,7 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
        _Sum(&maskEnc, padding3, &maskEnc);

        encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
        //encoding.Dump(stderr, "encoding",10);

        decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
        //decoding.Dump(stderr, "decoding", 10);

        outputLayer->Make(decoding, output);

        delete[] dims;
...
@@ -298,9 +301,10 @@ void T2TModel::GetParams(XList &list)
        list.Add(&encoder->fnns[i].b1);
        list.Add(&encoder->fnns[i].w2);
        list.Add(&encoder->fnns[i].b2);
        list.Add(&encoder->attentions[i].wk);
        list.Add(&encoder->attentions[i].wq);
        list.Add(&encoder->attentions[i].wv);
        //list.Add(&encoder->attentions[i].wk);
        //list.Add(&encoder->attentions[i].wq);
        //list.Add(&encoder->attentions[i].wv);
        list.Add(&encoder->attentions[i].wbig);
        list.Add(&encoder->attentions[i].wa);
        list.Add(&encoder->fnnLayerNorms[i].w);
        list.Add(&encoder->fnnLayerNorms[i].b);
...
@@ -322,9 +326,10 @@ void T2TModel::GetParams(XList &list)
        list.Add(&decoder->attentionsEnde[i].wa);
        list.Add(&decoder->attEndeLayerNorms[i].w);
        list.Add(&decoder->attEndeLayerNorms[i].b);
        list.Add(&decoder->attentions[i].wk);
        list.Add(&decoder->attentions[i].wq);
        list.Add(&decoder->attentions[i].wv);
        //list.Add(&decoder->attentions[i].wk);
        //list.Add(&decoder->attentions[i].wq);
        //list.Add(&decoder->attentions[i].wv);
        list.Add(&decoder->attentions[i].wbig);
        list.Add(&decoder->attentions[i].wa);
        list.Add(&decoder->fnnLayerNorms[i].w);
        list.Add(&decoder->fnnLayerNorms[i].b);
...
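On the mask arithmetic used above (my reading, not text from the commit): if p in {0, 1} is the padding indicator built from the unsqueezed encoder/decoder paddings, and _ScaleAndShiftMe(x, scale, shift) computes scale * x + shift, then

mask = 10^{9}\, p - 10^{9} = -10^{9}\,(1 - p)

so positions covered by real tokens (p = 1) contribute 0 to the attention logits, while padded positions (p = 0) contribute -10^9, which drives their softmax weight to effectively zero once the mask is added to the dot products inside the attention.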
source/sample/transformer/T2TOutput.cpp

@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    float minmax = 0;

    LoadParamInt(argc, argv, "vsize", &vSize, -1);
    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
...
source/sample/transformer/T2TTrainer.cpp

@@ -41,12 +41,15 @@ T2TTrainer::T2TTrainer()
    seqLen2 = NULL;
    nseqBuf = 0;
    nextSeq = -1;
    nextBatch = -1;

    argNum = 0;
    argArray = NULL;

    buf = NULL;
    buf2 = NULL;
    bufBatch = NULL;
    bufSize = 0;
    bufBatchSize = 0;
    seqOffset = NULL;
}
...
@@ -55,6 +58,7 @@ T2TTrainer::~T2TTrainer()
{
    delete[] buf;
    delete[] buf2;
    delete[] bufBatch;
    delete[] seqLen;
    delete[] seqLen2;
    delete[] seqOffset;
...
@@ -116,10 +120,12 @@ void T2TTrainer::Init(int argc, char ** argv)
    LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
    LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
    LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
    LoadParamBool(argc, argv, "smallfootprint", &isSmallFootprint, false);
    LoadParamBool(argc, argv, "debug", &isDebugged, false);
    LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false);

    buf = new int[bufSize];
    buf2 = new int[bufSize];
    bufBatch = new BatchNode[bufSize];
    seqLen = new int[bufSize];
    seqLen2 = new int[bufSize];
    seqOffset = new int[bufSize];
...
@@ -165,13 +171,16 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
    XMem * mem = model->mem;
    XNet net;

    if(isSmallFootprint)
        net.SetGradEfficientFlag();
    if(isDebugged)
        net.SetGradEfficientFlag(false);

    PrepareModel(model);

    double startT = GetClockSec();

    FILE * fileen = fopen("enc.txt", "w");
    FILE * filede = fopen("dec.txt", "w");

    for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
        if(isShuffled)
...
@@ -205,6 +214,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
            CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");

            //batchEnc.Dump(stderr, "enc",1);
            //batchDec.Dump(stderr, "dec",1);
            //paddingDec.Dump(stderr, "paddec");

            /* output probabilities */
            XTensor output;
...
@@ -222,17 +235,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
                LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);

            /* make paddings for the output */
            if(output.GetDim(0) > 1)
            if(output.GetDim(0) > 0)
                PadOutput(&output, &gold, &paddingDec);

            /* get probabilities */
            float prob = GetProb(&output, &gold, NULL);
            //printf("%f\n", prob);
            //float prob = 0;

            DTYPE lossLocal = -prob / wc;
            bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);

            XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
            //doUpdate = false;

            if (doUpdate) {
                /* recale the output for normalized loss */
...
@@ -292,6 +306,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
            MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
    }

    fclose(fileen);
    fclose(filede);

    double elapsed = GetClockSec() - startT;

    epoch = MIN(epoch, nepoch);
...
@@ -434,11 +451,13 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
    sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);

    model->Dump(fn);

    if(validFN != NULL){
    if (model->isLM) {
        if (validFN != NULL) {
            T2TTrainer trainer;
            trainer.Init(argNum, argArray);
            trainer.Test(validFN, fn2, model);
        }
    }

    delete[] fn;
    delete[] fn2;
...
@@ -473,7 +492,8 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
    int wordCount = 0;
    while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){
        int len = (int)strlen(line);

        if (line[0] == 'b')
            break;

        while(line[len - 1] == '\r' || line[len - 1] == '\n'){
            line[len - 1] = 0;
            len--;
...
@@ -544,9 +564,14 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
            node.offset = i;
            node.p = buf + offset;
            node.size = 0;
            for(int j = 0; j < step; j++)
            int max = 0;
            for (int j = 0; j < step; j++){
                node.size += seqLen[i + j];
            node.value = seqLen[i];
                max = MAX(max, seqLen[i + j]);
            }
            //node.value = seqLen[i+1];
            //node.value = MAX(seqLen[i+1],seqLen[i]);
            node.value = max;
            count++;
            offset += node.size;
        }
...
@@ -768,6 +793,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
    return sc;
}

int CompareBatchNode(const void * a, const void * b)
{
    return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}

/*
load a batch of sequences (for MT)
>> file - the handle to the data file
...
@@ -797,10 +828,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
                            int devID, XMem * mem,
                            bool isTraining)
{
    if(nextSeq < 0 || nextSeq >= nseqBuf)
    //if (nextSeq < 0 || nextSeq >= nseqBuf)
    //    LoadBuf(file, isSorted, 2);

    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
        LoadBuf(file, isSorted, 2);

    int seq = MAX(nextSeq, 0);
        int seq = 0;
        bufBatchSize = 0;
        nextBatch = 0;

        /* we segment the buffer into batches */
        while (seq < nseqBuf) {
            int wcEnc = 0;
            int wcDec = 0;
            int wnEnc = 0;
...
@@ -809,38 +849,99 @@ int T2TTrainer::LoadBatchMT(FILE * file,
            int maxDec = 0;
            int sc = 0;

            CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");

            while(seq + sc < nseqBuf){
            while (seq + sc < nseqBuf) {
                /* source-side sequence */
                wnEnc = seqLen[seq + sc];

                /* target-side sequence */
                wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;

                int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
                int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;

                if (sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
                    break;

                wcEnc += wnEnc;
                sc += 1;

                if(maxEnc < wnEnc)
                    maxEnc = wnEnc;

                /* target-side sequence */
                int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
                wnDec = len;
                wcDec += wnDec;
                sc += 1;

                if(maxDec < wnDec)
                    maxDec = wnDec;
            }

            BatchNode & batch = bufBatch[bufBatchSize];
            batch.beg = seq;
            batch.end = seq + sc;
            batch.maxEnc = maxEnc;
            batch.maxDec = maxDec;
            batch.key = rand();

            bufBatchSize++;
            seq = seq + sc;
        }

        if (isRandomBatch)
            qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
    }

    /*int seq = MAX(nextSeq, 0);
    int wcEnc = 0;
    int wcDec = 0;
    int wnEnc = 0;
    int wnDec = 0;
    int maxEnc = 0;
    int maxDec = 0;
    int sc = 0;

    CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");

    int tcEnc = isBigBatch ? wcEnc : maxEnc * sc / 2;
    int tcDec = isBigBatch ? wcDec : maxDec * sc / 2;
    if(sc >= sBatch * 2 && (tcEnc >= wBatch || tcDec >= wBatch))

    while(seq + sc < nseqBuf){
        wnEnc = seqLen[seq + sc];
        wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
        int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
        int tcDec = isBigBatch ? (wcDec + wnDec): MAX(maxDec, wnDec) * (sc + 2) / 2;
        if(sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
            break;
        wcEnc += wnEnc;
        sc += 1;
        if(maxEnc < wnEnc)
            maxEnc = wnEnc;
        wcDec += wnDec;
        sc += 1;
        if(maxDec < wnDec)
            maxDec = wnDec;
    }
    nextSeq = seq + sc;
    if(sc <= 0)
        return 0;*/

    if (bufBatchSize <= 0)
        return 0;

    BatchNode & batch = bufBatch[nextBatch++];
    int seq = batch.beg;
    int sc = batch.end - batch.beg;
    int maxEnc = batch.maxEnc;
    int maxDec = batch.maxDec;

    CheckNTErrors(sc % 2 == 0, "The input samples must be paired");

    int sCount = sc / 2;
    int seqSize = 0;

    int dimsDec[3] = {sCount, maxDec, vsDec};
...
@@ -859,13 +960,14 @@ int T2TTrainer::LoadBatchMT(FILE * file,
    int wCountEnc = 0;
    int wCountDec = 0;
    int wCountPad = 0;
    int wGold = 0;
    wCount = 0;

    int * batchEncValues = new int[batchEnc->unitNum];
    int * batchDecValues = new int[batchDec->unitNum];
    //MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
    //MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
    MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
    MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];

    memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
...
@@ -899,7 +1001,10 @@ int T2TTrainer::LoadBatchMT(FILE * file,
            int num = buf[seqOffset[s] + w];
            batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
            //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
            if (w < len - 1){
                paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
                wCount++;
            }
            if (w > 0)
                goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
...
@@ -909,7 +1014,7 @@ int T2TTrainer::LoadBatchMT(FILE * file,
            else
                goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
            }
            wCount++;
            //wCount++;
            wCountDec++;
            if(seqs != NULL)
                seqs[seqSize++] = buf[seqOffset[s] + w];
...
@@ -922,19 +1027,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
    }

    batchDec->SetData(batchDecValues, batchDec->unitNum);
    //paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);

    XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
    _ConvertDataType(batchDec, tmp2);
    _NotEqual(tmp2, paddingDec, 0);
    DelTensorBuf(tmp2);
    //XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
    //_ConvertDataType(batchDec, tmp2);
    //_NotEqual(tmp2, paddingDec, 0);
    //DelTensorBuf(tmp2);

    gold->SetDataBatched(goldOffsets, 1.0F, wGold);

    delete[] batchEncValues;
    delete[] batchDecValues;
    //delete[] paddingEncOffsets;
    //delete[] paddingDecOffsets;
    delete[] paddingDecOffsets;
    delete[] goldOffsets;

    return sc;
...
@@ -1015,20 +1120,20 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
    model->GetParams(ws);

    for(int i = 0; i < ws.count; i++){
    for (int i = 0; i < ws.count; i++) {
        XTensor * para = (XTensor*)ws.Get(i);
        XTensor * paraGrad = para->grad;

        if (para->dataType == X_FLOAT)
        {
        if (paraGrad == NULL)
            continue;

        CheckNTErrors(para != NULL, "NULL parameter tensor!");
        CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");

        if(useAdam){
        if (para->dataType == X_FLOAT) {
            if (useAdam) {
                adamBeta1T *= adamBeta1;
                adamBeta2T *= adamBeta2;
                DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
...
@@ -1070,19 +1175,13 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
            /* clear gradient */
            paraGrad->SetZeroAll();
        }
        else if (para->dataType == X_FLOAT16) {
        else if (para->dataType == X_FLOAT16) {
            if (useAdam) {
                XTensor paraGrad1(paraGrad->order, paraGrad->dimSize, X_FLOAT, paraGrad->denseRatio, paraGrad->devID, paraGrad->mem);
                _ConvertDataType(paraGrad, &paraGrad1);
            if (paraGrad == NULL)
                continue;

            CheckNTErrors(para != NULL, "NULL parameter tensor!");
            CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");

            if (useAdam) {
                adamBeta1T *= adamBeta1;
                adamBeta2T *= adamBeta2;
                DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
...
@@ -1136,9 +1235,67 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
        }
    }
}

//void T2TTrainer::Update(T2TModel * model, const float lr)
//{
//    XList ws(100);
//
//    model->GetParams(ws);
//
//    for(int i = 0; i < ws.count; i++){
//        XTensor * para = (XTensor*)ws.Get(i);
//        XTensor * paraGrad = para->grad;
//
//        if (paraGrad == NULL)
//            continue;
//
//        CheckNTErrors(para != NULL, "NULL parameter tensor!");
//        CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
//
//        if(useAdam){
//            adamBeta1T *= adamBeta1;
//            adamBeta2T *= adamBeta2;
//            DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
//            DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T);
//
//            if (para->dataType == X_FLOAT16) {
//                d = d * 5e4;
//            }
//
//            /* m = beta_1 * m + (1-beta_1) * grad */
//            XTensor * m = (XTensor*)moments.Get(i);
//            _ScaleAndShiftMe(m, adamBeta1, 0);
//            _Sum(m, paraGrad, m, (1.0F - adamBeta1));
//
//            /* v = beta_2 * v + (1-beta_2) * grad * grad*/
//            XTensor * v = (XTensor*)moments2nd.Get(i);
//            _Multiply(paraGrad, paraGrad, v, adamBeta2/(1.0F - adamBeta2));
//            _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
//
//            /* v2 = m / (sqrt(v) + delta) */
//            XTensor * v2 = NewTensorBuf(v, v->devID, v->mem);
//            _Power(v, v2, 0.5F);
//            _ScaleAndShiftMe(v2, 1.0F, d);
//            _Div(m, v2, v2);
//
//            /* the delta rule */
//
//            _Sum(para, v2, para, -e);
//
//            DelTensorBuf(v2);
//
//        }
//        else{
//            /* the delta rule */
//            _Sum(para, paraGrad, para, -lr);
//        }
//
//        /* clear gradient */
//        paraGrad->SetZeroAll();
//    }
//}

/*
prepare model for training
>> model - the model for training
...
source/sample/transformer/T2TTrainer.h

@@ -33,6 +33,25 @@ using namespace nts;
namespace transformer
{

/* node to keep batch information */
struct BatchNode
{
    /* begining position */
    int beg;

    /* end position */
    int end;

    /* maximum word number on the encoder side */
    int maxEnc;

    /* maximum word number on the decoder side */
    int maxDec;

    /* a key for sorting */
    int key;
};

/* trainer of the T2T model */
class T2TTrainer
{
...
@@ -49,9 +68,15 @@ public:
    /* another buffer */
    int * buf2;

    /* batch buf */
    BatchNode * bufBatch;

    /* buffer size */
    int bufSize;

    /* size of batch buffer */
    int bufBatchSize;

    /* length of each sequence */
    int * seqLen;
...
@@ -67,6 +92,9 @@ public:
    /* offset for next sequence in the buffer */
    int nextSeq;

    /* offset for next batch */
    int nextBatch;

    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;
...
@@ -142,8 +170,11 @@ public:
    /* counterpart of "isSmallBatch" */
    bool isBigBatch;

    /* indicates whether we use small memory footprint for backward process */
    bool isSmallFootprint;

    /* randomize batches */
    bool isRandomBatch;

    /* indicates whether we intend to debug the net */
    bool isDebugged;

public:
    /* constructor */
...
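A standalone sketch of how the new bufBatch/nextBatch machinery randomizes batch order: each segmented batch gets a random key, and a single qsort on that key shuffles the visit order. The comparator mirrors CompareBatchNode in T2TTrainer.cpp above; this little program is illustrative only and not part of the commit.

#include <cstdlib>
#include <cstdio>

/* same fields as the BatchNode added in this header */
struct BatchNode
{
    int beg;     /* begining position */
    int end;     /* end position */
    int maxEnc;  /* maximum word number on the encoder side */
    int maxDec;  /* maximum word number on the decoder side */
    int key;     /* a key for sorting */
};

int CompareBatchNode(const void * a, const void * b)
{
    /* descending by key, exactly as in the commit */
    return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}

int main()
{
    BatchNode buf[4];
    for (int i = 0; i < 4; i++) {
        buf[i].beg = i * 2;  buf[i].end = i * 2 + 2;
        buf[i].maxEnc = 10;  buf[i].maxDec = 12;
        buf[i].key = rand();                 /* random key => random visit order */
    }
    qsort(buf, 4, sizeof(BatchNode), CompareBatchNode);
    for (int i = 0; i < 4; i++)
        printf("batch [%d, %d) key=%d\n", buf[i].beg, buf[i].end, buf[i].key);
    return 0;
}

In the trainer itself the shuffle only happens when the new -randbatch option is set (see isRandomBatch in T2TTrainer.cpp/Init).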
source/sample/transformer/Transformer.cpp

@@ -60,6 +60,7 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "output", outputFN, "");

    srand((unsigned int)time(NULL));

    T2TTrainer trainer;
    trainer.Init(argc, args);
...
source/tensor/XDevice.cpp

@@ -24,6 +24,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "XDevice.h"
#include "XGlobal.h"
#include "XThread.h"
...
@@ -59,6 +60,7 @@ XDevice::~XDevice()
    cublasDestroy(cublasHandle);
    if(stream != NULL)
        delete stream;
    curandDestroyGenerator(gen);
#endif
}
...
@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID)
    Clear();

    devID = myDevID;
    seed = rand();

    /* CPU information */
    if(devID < 0){
...
@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID)
    cudaDeviceProp prop;

    cudaSetDevice(myDevID);

    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, seed);

    if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
        XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
        exit(1);
...
@@ -270,6 +277,8 @@ XDevManager::~XDevManager()
/* initialize it and get the CPU and GPU information */
void XDevManager::Init()
{
    srand((unsigned int)time(NULL));

    Clear();

    /* CPUs (we actually do not care about how many CPUs are using) */
...
source/tensor/XDevice.h

@@ -100,6 +100,9 @@ public:
    /* default stream for the device */
    XStream * stream;

    /* seed for random number generation */
    int seed;

#ifdef USE_CUDA
    /* mutex for handle (GPU cublas) */
    MUTEX_HANDLE cublasMutex;
...
@@ -109,6 +112,9 @@ public:
    /* specify if the handle is initialized */
    bool isHandleReady;

    /* generater of random numbers */
    curandGenerator_t gen;

#endif
...
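The seed/gen members added here pair with the curandCreateGenerator and curandSetPseudoRandomGeneratorSeed calls in XDevice.cpp above. A standalone sketch of that per-device cuRAND setup, stripped of the XDevice plumbing and error handling (curandGenerateUniform is a standard cuRAND host call, not something this commit adds):

#include <cstdio>
#include <cuda_runtime.h>
#include <curand.h>

int main()
{
    const int n = 8;
    float * d = NULL;
    cudaMalloc(&d, n * sizeof(float));               // device buffer to fill

    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);  // as in XDevice::Init
    curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);        // the commit uses seed = rand()
    curandGenerateUniform(gen, d, n);                        // uniform numbers in (0, 1]

    float h[n];
    cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
        printf("%f\n", h[i]);

    curandDestroyGenerator(gen);                             // as in ~XDevice()
    cudaFree(d);
    return 0;
}

Keeping one seeded generator per device is what lets the new GPU-side _SetDataRand paths (SetData.cu in the file list) draw random values without round-tripping through the host.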
source/tensor/XMem.cpp

@@ -1461,6 +1461,23 @@ void XMem::CreateBLASHandle()
#endif
}

/* show profile of the memory pool */
void XMem::ShowMemUsage(FILE * file)
{
    MTYPE used = 0;
    MTYPE total = 0;

    for(int i = 0; i < blockNum; i++){
        if(blocks[i].mem != NULL){
            used += blocks[i].used;
            total += blocks[i].size;
        }
    }

    fprintf(file, "mem:%.1fMB used:%.1fMB usage:%.3f\n",
            (DTYPE)used/MILLION, (DTYPE)total/MILLION, (DTYPE)used/total);
}

#ifdef USE_CUDA

/* get the handle of cublas */
...
source/tensor/XMem.h

@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#define __XMEM_H__

#include <stdio.h>
#include <stdlib.h>

#ifdef CUDA_BLAS
...
@@ -402,6 +403,9 @@ public:
    /* create a new cublas handle */
    void CreateBLASHandle();

    /* show profile of the memory pool */
    void ShowMemUsage(FILE * file);

#ifdef USE_CUDA
    /* get the handle of cublas */
    cublasHandle_t * GetCublasHandle();
...
source/tensor/XName.cpp

@@ -67,6 +67,8 @@ const char * GetOPName(int type)
        return "M_MULTIPLY";
    else if (type == MATH_MULTIPLYDIM)
        return "M_MULTIPLYDIM";
    else if (type == MATH_MULTIPLYBROADCAST)
        return "M_MULTIPLYBROADCAST";
    else if (type == MATH_NEGATE)
        return "M_NEGATE";
    else if (type == MATH_NORMALIZE)
...
@@ -85,6 +87,8 @@ const char * GetOPName(int type)
        return "M_SUM";
    else if (type == MATH_SUMDIM)
        return "M_SUMDIM";
    else if (type == MATH_SUMBROADCAST)
        return "M_SUMBROADCAST";
    else if (type == REDUCE_REDUCEMAX)
        return "R_REDUCEMAX";
    else if (type == REDUCE_REDUCEMEAN)
...
source/tensor/XName.h

@@ -52,7 +52,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_MULTIPLYBROADCAST MATH_MULTIPLYDIM + 1
#define MATH_NEGATE MATH_MULTIPLYBROADCAST + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
...
@@ -61,8 +62,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define MATH_SUMBROADCAST MATH_SUMDIM + 1
#define REDUCE MATH_SUMDIM + 1
#define REDUCE MATH_SUMBROADCAST + 1
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
...
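The operator IDs here are allocated by chaining "+ 1" macros, so inserting MATH_MULTIPLYBROADCAST (or MATH_SUMBROADCAST) forces the next macro in the chain to be rebased, exactly as the two hunks above do. A tiny standalone illustration of that pattern (names and the base value are made up, not library constants):

// Chained-ID pattern from XName.h, reduced to three ops (illustrative only).
#define OP_BASE              10000
#define OP_MULTIPLYDIM       OP_BASE + 1
// Newly inserted op: it takes the next slot...
#define OP_MULTIPLYBROADCAST OP_MULTIPLYDIM + 1
// ...so the op that used to follow OP_MULTIPLYDIM must now chain off the new one.
#define OP_NEGATE            OP_MULTIPLYBROADCAST + 1

static_assert(OP_MULTIPLYBROADCAST == OP_MULTIPLYDIM + 1, "IDs stay consecutive");
static_assert(OP_NEGATE == OP_MULTIPLYDIM + 2, "inserting an op shifts everything after it");

Forgetting the rebase would give two operators the same numeric ID, which is why the REDUCE base in the second hunk also has to be moved onto MATH_SUMBROADCAST.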
source/tensor/XTensor.cpp

@@ -60,6 +60,7 @@
#include "core/utilities/FlushToMem.cuh"
#include "core/utilities/SetAscendingOrder.cuh"
#endif

/* the nts (NiuTrans.Tensor) namespace */
...
@@ -690,9 +691,6 @@ set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
>> upper - upper value of the range
*/
void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
{
    // TODO: cuda code!!!!!!!
...
@@ -703,7 +701,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
    // srand((unsigned)time(0));
    DTYPE variance = upper - lower;
    void * d = NULL;

    if(dataType == X_FLOAT){
        d = new float[unitNum];
        for(int i = 0; i < unitNum; i++){
...
@@ -1700,7 +1697,6 @@ void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, cons
    {
        ShowNTErrors("TO DO!");
    }
}

/*
...
source/tensor/core/arithmetic/Div.cpp

@@ -46,8 +46,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
                  "Unmatched tensors in multiplication!");
    CheckNTErrors((a->order == b->order && a->order == c->order),
                  "Unmatched tensors!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                  "Unmatched tensors in addition!");

#ifdef USE_CUDA
    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
...
source/tensor/core/arithmetic/DivDim.cu
查看文件 @
44bf9fa6
...
...
@@ -22,6 +22,8 @@
#include "DivDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -38,43 +40,89 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
__global__
void KernelDivWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE alpha, bool alphaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;

    if (alphaFired)
        c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
    else
        c[offset] = a[offset] / bv[threadIdx.x];
}

__global__
void KernelDivWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half alpha, bool alphaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;

    if (alphaFired)
        c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
    else
        c[offset] = a[offset] / bv[threadIdx.x];
}
//template <class T, bool alphaFired>
//__global__
//void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum,DTYPE alpha)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if(col >= colNum || row >= rowNum)
// return;
//
// if(threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
// __syncthreads();
//
////#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//
// //int offset = colNum * row + col;
// T alpha1;
// if (sizeof(T) - sizeof(half) == 0) {
// alpha1 = __float2half(alpha);
// }
// else {
// alpha1 = (DTYPE)alpha;
// }
//
// int offset = colNum * row + col;
//
// if (alphaFired)
// c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha1;
// else
// c[offset] = a[offset] / bv[threadIdx.x];
//
////#endif
//
//}
/*
tensor division of a tensor and a column vector
c = a / b + alpha * c
...
...
@@ -88,11 +136,11 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrics
>> alpha - the scaling factor
*/
__global__
void KernelDivWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE alpha, bool alphaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -100,33 +148,95 @@ void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

//#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    int offset = block * blockSize + row * colNum + col;

    if (alphaFired)
        c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
    else
        c[offset] = a[offset] / bv[threadIdx.y];
//#endif
}

__global__
void KernelDivWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half alpha, bool alphaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;

    if (alphaFired)
        c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
    else
        c[offset] = a[offset] / bv[threadIdx.y];
}
//
//
//template <class T, bool alphaFired>
//__global__
//void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE alpha)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if(row >= rowNum || block >= blockNum)
// return;
//
// if(threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
// __syncthreads();
//
////#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//
// int offset = block * blockSize + row * colNum + col;
//
// T alpha1;
// if (sizeof(T) -sizeof(half) == 0) {
// alpha1 = __float2half(alpha);
// }
// else {
// alpha1 = alpha;
// }
//
// if (alphaFired)
// c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha1;
// else
// c[offset] = a[offset] / bv[threadIdx.y];
//
////#endif
//}
/*
tensor division
...
...
@@ -171,24 +281,24 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (alpha == (DTYPE)0.0F)
                KernelDivWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, alpha, false);
            else
                KernelDivWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, alpha, true);
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (alpha == (DTYPE)0.0F)
                KernelDivWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, alpha, false);
            else
                KernelDivWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, alpha, true);
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
@@ -197,25 +307,40 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
    else if (a->dataType == X_FLOAT16) {
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (alpha == (DTYPE)0.0F) {
                unsigned short temp = FloatToFloat16(alpha);
                half alpha1 = *((half *)&temp);
                KernelDivWithColHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockSize, stride, blockSize * stride, blockNum, alpha1, false);
            }
            else {
                unsigned short temp = FloatToFloat16(alpha);
                half alpha1 = *((half *)&temp);
                KernelDivWithColHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockSize, stride, blockSize * stride, blockNum, alpha1, true);
            }
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (alpha == (DTYPE)0.0F) {
                unsigned short temp = FloatToFloat16(alpha);
                half alpha1 = *((half *)&temp);
                KernelDivWithRowHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockNum, blockSize, alpha1, false);
            }
            else {
                unsigned short temp = FloatToFloat16(alpha);
                half alpha1 = *((half *)&temp);
                KernelDivWithRowHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockNum, blockSize, alpha1, true);
            }
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
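All of the FP16 branches above turn the FP32 scaling factor into a device half before the launch, by going through FloatToFloat16 and reinterpreting the bit pattern. A minimal sketch of that pattern in isolation; the FloatToFloat16 declaration is assumed to match the one in XDataType.h, which these .cu files now include:

    #include <cuda_fp16.h>

    /* Assumed declaration from the library (XDataType.h): returns the IEEE-754
       binary16 bit pattern of f as an unsigned short. */
    unsigned short FloatToFloat16(float f);

    /* Turn an FP32 scalar into a half value that can be passed to a kernel. */
    half ScalarToHalf(float alpha)
    {
        unsigned short temp = FloatToFloat16(alpha); /* 16-bit pattern of alpha */
        half alpha1 = *((half *)&temp);              /* reinterpret the bits as half */
        return alpha1;
    }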
source/tensor/core/arithmetic/MatrixMul.cpp
...
...
@@ -22,12 +22,10 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
#include "MatrixMulBatched.h"
#include "timer.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -276,7 +274,8 @@ matrix multiplication with no transposition c = a * b * alpha
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
...
...
@@ -317,212 +316,4 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * pa
    return c;
}
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
The result C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                         DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order + b.order - 2;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 2; i < a.order; i++)
        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
    for (int i = 2; i < b.order; i++)
        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    //XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //_ConvertDataType(&a, halfA);
    //_ConvertDataType(&b, halfB);
    //_MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
    //_ConvertDataType(halfC, &c);
    //DelTensorBuf(halfC);
    //DelTensorBuf(halfB);
    //DelTensorBuf(halfA);

    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    /*timer_c timerConvert1;
    timerConvert1.m_start_timer();*/
    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);
    /*timerConvert1.m_end_timer();
    printf("time convert1 %f ms\n", timerConvert1.m_get_time_diff_msec());
    timer_c timerMatrixMul;
    timerMatrixMul.m_start_timer();*/
    _MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
    /*timerMatrixMul.m_end_timer();
    printf("time matrixmul %f ms\n", timerMatrixMul.m_get_time_diff_msec());
    timer_c timerConvert2;
    timerConvert2.m_start_timer();*/
    _ConvertDataType(halfC, &c);
    /*timerConvert2.m_end_timer();
    printf("time convert2 %f ms\n\n", timerConvert2.m_get_time_diff_msec());*/

    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
/*
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

    int an = a.dimSizeRDI[1];
    int am = a.dimSizeRDI[0];
    int bn = b.dimSizeRDI[1];
    int bm = b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order + b.order - 2;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 2; i < a.order; i++)
        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
    for (int i = 2; i < b.order; i++)
        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);
    _MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
    _ConvertDataType(halfC, &c);

    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    //XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    //timer_c timerConvert1;
    //timerConvert1.m_start_timer();
    //_ConvertDataType(&a, halfA);
    //_ConvertDataType(&b, halfB);
    //timerConvert1.m_end_timer();
    //printf("time convert1 %f ms\n", timerConvert1.m_get_time_diff_msec());
    //timer_c timerMatrixMul;
    //timerMatrixMul.m_start_timer();
    //_MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
    //timerMatrixMul.m_end_timer();
    //printf("time matrixmul %f ms\n", timerMatrixMul.m_get_time_diff_msec());
    //timer_c timerConvert2;
    //timerConvert2.m_start_timer();
    //_ConvertDataType(halfC, &c);
    //timerConvert2.m_end_timer();
    //printf("time convert2 %f ms\n\n", timerConvert2.m_get_time_diff_msec());
    //DelTensorBuf(halfC);
    //DelTensorBuf(halfB);
    //DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
}
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
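The MatrixMulFloat16 overloads removed from this file follow a convert-multiply-convert pattern: the FP32 inputs are copied into temporary X_FLOAT16 buffers, multiplied there, and the product is converted back to FP32. A hedged usage sketch of the public calls declared in MatrixMul.h; the include path and the helper name are assumptions for illustration:

    #include "MatrixMul.h"

    using namespace nts;

    /* Sketch only: a and b are assumed to be dense X_FLOAT tensors whose
       trailing two dimensions are compatible for multiplication. */
    XTensor MulBothWays(const XTensor &a, const XTensor &b)
    {
        XTensor cHalf = MatrixMulFloat16(a, b); /* FP16 compute, FP32 in and out */
        XTensor cFull = MatrixMul(a, b);        /* plain FP32 path for comparison */
        /* a transposed form is also declared:
           MatrixMulFloat16(a, X_TRANS, b, X_NOTRANS, alpha, parallelRunner) */
        return cHalf;
    }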
source/tensor/core/arithmetic/MatrixMul.h
...
...
@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MMul MatrixMul
#define MMul16 MatrixMulFloat16
/*
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
...
...
@@ -64,13 +63,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
XTensor MatrixMul(const XTensor &a, const XTensor &b,
                  DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b,
                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
}
// namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
\ No newline at end of file
source/tensor/core/arithmetic/MatrixMul2D.cpp
...
...
@@ -82,19 +82,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
            else
                _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
/*if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}*/
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
}
/* a dense matrix multiply a sparse matrix */
    else if (!a->isSparse && b->isSparse) {
...
...
source/tensor/core/arithmetic/MatrixMul2D.cu
...
...
@@ -156,18 +156,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
//if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
// _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
// b->data, transposedB, a->dataType, c->data, c->dataType,
// a->dimSize[0], a->dimSize[1],
// b->dimSize[0], b->dimSize[1],
// c->dimSize[0], c->dimSize[1],
// alpha, beta);
//}
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1],
...
...
source/tensor/core/arithmetic/MatrixMul2DParallel.cpp
...
...
@@ -63,44 +63,6 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* trans(a) * b */
    else if (transposedA == X_TRANS && transposedB == X_NOTRANS) {
        int num = an;
/*if (a->dataType == X_FLOAT16) {
for (int i = 0; i < cn; i++) {
X_FLOAT16 *p3 = (X_FLOAT16*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
X_FLOAT16 r = 0;
X_FLOAT16 * p1 = (X_FLOAT16*)a->data + 0 * am + i;
X_FLOAT16 * p2 = (X_FLOAT16*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}
else {
for (int i = 0; i < cn; i++) {
DTYPE * p3 = (DTYPE*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
DTYPE r = 0;
DTYPE * p1 = (DTYPE*)a->data + 0 * am + i;
DTYPE * p2 = (DTYPE*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}*/
        for (int i = 0; i < cn; i++) {
            DTYPE * p3 = (DTYPE*)c->data + i * cm;
            for (int j = 0; j < cm; j++) {
...
...
source/tensor/core/arithmetic/MatrixMulBatched.cpp
...
...
@@ -22,7 +22,6 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMulBatched.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"
...
...
@@ -388,142 +387,4 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
    return c;
}
/*
matrix multiplication of the two tensors (do it on site)
c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                                DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 0; i < a.order - 2; i++)
        dimSize[sub++] = a.dimSize[i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    ///*call _MatrixMulBatched function */
    //_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);

    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);
    _MatrixMulBatched(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
    _ConvertDataType(halfC, &c);

    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
/*
matrix multiplication of the two tensors (do it on site)
c = a * b * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = ai * bi * alpha + cm * beta
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

    int an = a.dimSizeRDI[1];
    int am = a.dimSizeRDI[0];
    int bn = b.dimSizeRDI[1];
    int bm = b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 0; i < a.order - 2; i++)
        dimSize[sub++] = a.dimSize[i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMPFlag();

    ///*call _MatrixMulBatched function */
    //_MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);

    XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
    XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);

    _ConvertDataType(&a, halfA);
    _ConvertDataType(&b, halfB);
    _MatrixMulBatched(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
    _ConvertDataType(halfC, &c);

    DelTensorBuf(halfC);
    DelTensorBuf(halfB);
    DelTensorBuf(halfA);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
}
// namespace nts(NiuTrans.Tensor)
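For reference, the batched variants treat all leading dimensions as a batch index and multiply only the trailing two dimensions, slice by slice. A standalone CPU sketch of that contract (an illustrative reference, not the library's implementation):

    /* ci = ai * bi for each 2D slice i; a is batch x n x k, b is batch x k x m,
       c is batch x n x m, all stored row-major in flat arrays. */
    void BatchedMatMulRef(const float * a, const float * b, float * c,
                          int batch, int n, int k, int m)
    {
        for (int s = 0; s < batch; s++) {
            const float * as = a + s * n * k;
            const float * bs = b + s * k * m;
            float * cs = c + s * n * m;
            for (int i = 0; i < n; i++)
                for (int j = 0; j < m; j++) {
                    float r = 0.0F;
                    for (int p = 0; p < k; p++)
                        r += as[i * k + p] * bs[p * m + j];
                    cs[i * m + j] = r;
                }
        }
    }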
source/tensor/core/arithmetic/MatrixMulBatched.h
...
...
@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define BMMul MatrixMulBatched
#define BMMul16 MatrixMulBatchedFloat16
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
...
...
@@ -85,12 +84,6 @@ ci = ai * bi * alpha + cm * beta
XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                                DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b,
                                DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
}
// namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMULBATCHED_H__
\ No newline at end of file
source/tensor/core/arithmetic/Multiply.cpp
...
...
@@ -46,8 +46,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors in multiplication!"
);
CheckNTErrors
((
a
->
order
==
b
->
order
&&
a
->
order
==
c
->
order
),
"Unmatched tensors!"
);
CheckNTErrors
(
a
->
dataType
==
b
->
dataType
&&
a
->
dataType
==
c
->
dataType
,
"Unmatched tensors in addition!"
);
#ifdef USE_CUDA
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
...
...
source/tensor/core/arithmetic/MultiplyDim.cpp
...
...
@@ -22,9 +22,10 @@
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -136,28 +137,167 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
c = a * b
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
*/
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
{
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _Multiply function */
    _MultiplyDim(&a, &b, &c, n, alpha);
    _MultiplyDim(&a, &b, &c, n, 0);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
    XLink::AddParamToHeadInt(&c, n);
    XLink::AddParamToHead(&c, alpha);
    XLink::AddParamToHead(&c, 0);

    return c;
}
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
    CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
    CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
    CheckNTErrors(a->order > 0, "TODO!");

    int order = a->order;
    int count = 0;
    void * source = 0;
    void * target = 0;

    for (int i = 0; i < order; i++) {
        if (a->GetDim(i) == b->GetDim(i))
            continue;

        if (b->GetDim(i) == 1) {
            int fitSize = a->GetDim(i);
            int j = i + 1;

            /* we define a range over dimensions. It is to be unsqueezed */
            for (; j < order; j++) {
                if (a->GetDim(j) == b->GetDim(j))
                    break;
                fitSize *= a->GetDim(j);
            }

            int dimsS[MAX_TENSOR_DIM_NUM];
            int dimsT[MAX_TENSOR_DIM_NUM];

            for (int k = 0; k < i; k++) {
                dimsS[k] = a->GetDim(k);
                dimsT[k] = a->GetDim(k);
            }

            dimsT[i] = fitSize;

            bool isLast = true;
            for (int k = j; k < order; k++) {
                dimsS[i + k - j + 0] = b->GetDim(k);
                dimsT[i + k - j + 1] = b->GetDim(k);
                if (a->GetDim(k) != b->GetDim(k)) {
                    if (b->GetDim(k) == 1)
                        isLast = false;
                    else {
                        ShowNTErrors("Wrong dimension size!")
                    }
                }
            }

            dimsS[0] = -dimsS[0];
            dimsT[0] = -dimsT[0];

            XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
            XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);

            if (count == 0)
                source = b->data;
            else {
                source = target;
            }

            target = t->mem != NULL ?
                     t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize) :
                     XMemAlloc(t->devID, t->unitNum * t->unitSize);

            s->data = source;
            t->data = target;

            _Unsqueeze(s, t, i, fitSize);

            /* free the memory space of the one before the last allocation */
            if (count > 0) {
                int size = s->unitNum * s->unitSize;
                if (t->mem != NULL)
                    t->mem->ReleaseBuf(t->devID, size);
                else
                    XMemFree(t->devID, source);
            }

            /* we do multiplication here */
            if (isLast) {
                CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
                _Multiply(a, t, c, beta);

                if (t->mem != NULL)
                    t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
                else
                    XMemFree(t->devID, target);

                target = NULL;
            }

            s->data = NULL;
            t->data = NULL;
            DelTensor(s);
            DelTensor(t);

            i = j;
            count++;
        }
    }

    if (count == 0)
        _Multiply(a, b, c, beta);

    CheckNTErrors(target == NULL, "Something is wrong!");
}
/*
tensor broadcast multiplication
c = a * b
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
<< return - the resulting tensor c
*/
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
{
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _SumBroadcast function */
    _MultiplyBroadcast(&a, &b, &c, 0);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
    XLink::AddParamToHead(&c, 0);

    return c;
}
...
...
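The broadcasting helpers above (_MultiplyBroadcast here, _SumBroadcast in SumDim.cpp) work by finding each maximal run of dimensions where b has size 1, unsqueezing b over that run to a's size, and then calling the plain element-wise operation. A small standalone sketch of the shape logic only, with made-up example shapes:

    #include <cstdio>

    int main()
    {
        /* illustrative shapes: a is (2, 4, 8, 3), b is (2, 1, 1, 3) */
        int aDims[4] = {2, 4, 8, 3};
        int bDims[4] = {2, 1, 1, 3};
        for (int i = 0; i < 4; i++) {
            if (aDims[i] == bDims[i] || bDims[i] != 1)
                continue;
            int fitSize = aDims[i];
            int j = i + 1;
            while (j < 4 && aDims[j] != bDims[j]) /* extend the run of size-1 dims */
                fitSize *= aDims[j++];
            /* prints: unsqueeze b over dims [1, 3), fitSize = 32 */
            printf("unsqueeze b over dims [%d, %d), fitSize = %d\n", i, j, fitSize);
            i = j;
        }
        return 0;
    }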
source/tensor/core/arithmetic/MultiplyDim.cu
...
...
@@ -217,8 +217,6 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
}
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
...
...
@@ -243,7 +241,9 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
}
#endif
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
...
...
source/tensor/core/arithmetic/MultiplyDim.h
...
...
@@ -34,9 +34,16 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);

/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = 0.0);
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);

/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);

/* tensor broadcast multiplication c = a * b where some of dimensions of b can be of size 1.
   we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
}
// namespace nts(NiuTrans.Tensor)
...
...
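A hedged usage sketch of the two entry points declared in this header, assuming a, b1 and b2 are existing X_FLOAT tensors created elsewhere; only calls declared above are used, and the include path is an assumption:

    #include "MultiplyDim.h"

    using namespace nts;

    /* Sketch only: b1 is assumed to match the size of dimension 1 of a,
       and b2 to have a's order with some dimensions of size 1. */
    void MultiplyExamples(const XTensor &a, const XTensor &b1, const XTensor &b2)
    {
        XTensor c1 = MultiplyDim(a, b1, 1);    /* scale a along dimension 1 by b1 */
        XTensor c2 = MultiplyBroadcast(a, b2); /* broadcast b2 over its size-1 dims */
    }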
source/tensor/core/arithmetic/SubDim.cu
...
...
@@ -22,6 +22,8 @@
#include "SubDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -38,11 +40,10 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
__global__
void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE beta, bool betaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -54,21 +55,66 @@ __global__
__syncthreads();
    int offset = colNum * row + col;
    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.x] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.x];
}

__global__
void KernelSubWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half beta, bool betaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;
    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.x] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.x];
}
//template <class T, bool betaFired>
//__global__
//void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if (col >= colNum || row >= rowNum)
// return;
//
// if (threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 =__float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.x];
//}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
...
...
@@ -82,11 +128,11 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrics
>> beta - the scaling factor
*/
__global__
void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta, bool betaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -102,23 +148,79 @@ __global__
__syncthreads();
    int offset = block * blockSize + row * colNum + col;

    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.y];
}

__global__
void KernelSubWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;

    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
// void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if (row >= rowNum || block >= blockNum)
// return;
//
// if (threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.y];
//}
/*
tensor subtraction (cuda version)
...
...
@@ -163,24 +265,24 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F)
                KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, beta, false);
            else
                KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, beta, true);
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F)
                KernelSubWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, beta, false);
            else
                KernelSubWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, beta, true);
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
@@ -190,25 +292,40 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
    else if (a->dataType == X_FLOAT16) {
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F) {
                unsigned short temp = FloatToFloat16(beta);
                half beta1 = *((half *)&temp);
                KernelSubWithColHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockSize, stride, blockSize * stride, blockNum, beta1, false);
            }
            else {
                unsigned short temp = FloatToFloat16(beta);
                half beta1 = *((half *)&temp);
                KernelSubWithColHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockSize, stride, blockSize * stride, blockNum, beta1, true);
            }
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F) {
                unsigned short temp = FloatToFloat16(beta);
                half beta1 = *((half *)&temp);
                KernelSubWithRowHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockNum, blockSize, beta1, false);
            }
            else {
                unsigned short temp = FloatToFloat16(beta);
                half beta1 = *((half *)&temp);
                KernelSubWithRowHalf <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                      ((half*)a->data, (half*)b->data, (half*)c->data,
                                       blockNum, blockSize, beta1, true);
            }
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
source/tensor/core/arithmetic/Sum.cu
...
...
@@ -60,6 +60,16 @@ void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
#endif
}
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
/*
tensor summation c = a + b * \beta (cuda version)
...
...
@@ -101,7 +111,7 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
if ((c == a && handle != NULL) && *handle != 0) {
#ifdef DOUBELPRICSION
        cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)->data, 1, (DTYPE*)a->data, 1);
        cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#else
cublasSaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#endif
...
...
@@ -121,16 +131,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
{
cublasHandle_t * handle = NULL;
if ((a->mem != NULL) && (b->mem != NULL)) {
cublasHandle_t * handleA = a->mem->GetCublasHandle();
cublasHandle_t * handleB = b->mem->GetCublasHandle();
handle = *handleA != 0 ? handleA : handleB;
}
else {
handle = GDevs.GetCudaHandle(a->devID);
}
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
...
...
@@ -141,6 +141,20 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDInt << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta);
}
else {
// TODO!!
...
...
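With KernelADDInt in place, _CudaSum also covers X_INT tensors; note that beta is truncated to int inside the kernel. A standalone host-side sketch of the same element-wise contract (an illustrative reference, not the library code):

    /* Host-side reference of the contract implemented by KernelADDInt above:
       c[i] = a[i] + b[i] * (int)beta. Note that beta is truncated, not rounded. */
    void SumIntRef(const int * a, const int * b, int * c, int size, float beta)
    {
        int ib = (int)beta;
        for (int i = 0; i < size; i++)
            c[i] = a[i] + b[i] * ib;
    }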
source/tensor/core/arithmetic/SumDim.cpp
...
...
@@ -17,12 +17,16 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/
#include "Sum.h"
#include "SumDim.h"
#include "SumDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -152,7 +156,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _Sum function */
    /* call _SumDim function */
    _SumDim(&a, &b, &c, n, beta);
/* tensor connections */
...
...
@@ -163,4 +167,145 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
    return c;
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
    CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
    CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
    CheckNTErrors(a->order > 0, "TODO!");

    int order = a->order;
    int count = 0;
    void * source = 0;
    void * target = 0;

    for (int i = 0; i < order; i++) {
        if (a->GetDim(i) == b->GetDim(i))
            continue;

        if (b->GetDim(i) == 1) {
            int fitSize = a->GetDim(i);
            int j = i + 1;

            /* we define a range over dimensions. It is to be unsqueezed */
            for (; j < order; j++) {
                if (a->GetDim(j) == b->GetDim(j))
                    break;
                fitSize *= a->GetDim(j);
            }

            int dimsS[MAX_TENSOR_DIM_NUM];
            int dimsT[MAX_TENSOR_DIM_NUM];

            for (int k = 0; k < i; k++) {
                dimsS[k] = a->GetDim(k);
                dimsT[k] = a->GetDim(k);
            }

            dimsT[i] = fitSize;

            bool isLast = true;
            for (int k = j; k < order; k++) {
                dimsS[i + k - j + 0] = b->GetDim(k);
                dimsT[i + k - j + 1] = b->GetDim(k);
                if (a->GetDim(k) != b->GetDim(k)) {
                    if (b->GetDim(k) == 1)
                        isLast = false;
                    else {
                        ShowNTErrors("Wrong dimension size!")
                    }
                }
            }

            dimsS[0] = -dimsS[0];
            dimsT[0] = -dimsT[0];

            XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
            XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);

            if (count == 0)
                source = b->data;
            else {
                source = target;
            }

            target = t->mem != NULL ?
                     t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize) :
                     XMemAlloc(t->devID, t->unitNum * t->unitSize);

            s->data = source;
            t->data = target;

            _Unsqueeze(s, t, i, fitSize);

            /* free the memory space of the one before the last allocation */
            if (count > 0) {
                int size = s->unitNum * s->unitSize;
                if (t->mem != NULL)
                    t->mem->ReleaseBuf(t->devID, size);
                else
                    XMemFree(t->devID, source);
            }

            /* we do summation here */
            if (isLast) {
                CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
                _Sum(a, t, c, beta);

                if (t->mem != NULL)
                    t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
                else
                    XMemFree(t->devID, target);

                target = NULL;
            }

            s->data = NULL;
            t->data = NULL;
            DelTensor(s);
            DelTensor(t);

            i = j;
            count++;
        }
    }

    if (count == 0)
        _Sum(a, b, c, beta);

    CheckNTErrors(target == NULL, "Something is wrong!");
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
we return c here
>> a - a tensor
>> b - another tensor that would be broadcasted
>> beta - the scaling factor
<< return - the resulting tensor c
*/
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
{
    XTensor c(&a);
    c.SetTMPFlag();

    /* call _SumBroadcast function */
    _SumBroadcast(&a, &b, &c, beta);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
    XLink::AddParamToHead(&c, beta);

    return c;
}
}
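A hedged usage sketch of the broadcasting summation added above, assuming a and b are existing tensors of the same order where the broadcast dimensions of b have size 1; only the functions defined in this file are called, and the include path is an assumption:

    #include "SumDim.h"

    using namespace nts;

    /* Sketch only: b's broadcast dimensions are assumed to be of size 1,
       exactly as _SumBroadcast requires. */
    XTensor AddWithBroadcast(const XTensor &a, const XTensor &b)
    {
        /* c = a + b * beta, with b expanded over its size-1 dimensions */
        return SumBroadcast(a, b, (DTYPE)1.0);
    }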
source/tensor/core/arithmetic/SumDim.cu
...
...
@@ -22,6 +22,8 @@
#include "SumDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -38,38 +40,82 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
__global__
void KernelAddWithRow(DTYPE *a, DTYPE *b, DTYPE *c, int rowNum, int colNum, DTYPE beta, bool betaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;
    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.x] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.x];
}

__global__
void KernelAddWithRowHalf(half *a, half *b, half *c, int rowNum, int colNum, half beta, bool betaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col >= colNum || row >= rowNum)
        return;

    if (threadIdx.y == 0)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    int offset = colNum * row + col;
    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.x] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.x];
}
//
//template <class T, bool betaFired>
//__global__
//void KernelAddWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if(col >= colNum || row >= rowNum)
// return;
//
// if(threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if(betaFired)
// c[offset] = a[offset] + bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] + bv[threadIdx.x];
//}
/*
tensor summation of a tensor and a column vector
c = a + b * \beta
...
...
@@ -83,11 +129,9 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrics
>> beta - the scaling factor
*/
__global__
void KernelAddWithCol(DTYPE *a, DTYPE *b, DTYPE *c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta, bool betaFired)
{
    __shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -95,30 +139,85 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;
    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.y];
}

__global__
void KernelAddWithColHalf(half *a, half *b, half *c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired)
{
    __shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;
    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
//void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if(row >= rowNum || block >= blockNum)
// return;
//
// if(threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if(betaFired)
// c[offset] = a[offset] + bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] + bv[threadIdx.y];
//}
/*
tensor summation (cuda version)
...
...
@@ -162,25 +261,25 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
    if (a->dataType == DEFAULT_DTYPE) {
        if (stride > 1) {
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F)
                KernelAddWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, beta, false);
            else
                KernelAddWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockSize, stride, blockSize * stride, blockNum, beta, true);
        }
        else if (stride == 1) {
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
            if (beta == (DTYPE)1.0F)
                KernelAddWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, beta, false);
            else
                KernelAddWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                  ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                   blockNum, blockSize, beta, true);
        }
        else {
            ShowNTErrors("Something is wrong!");
...
...
@@ -190,25 +289,37 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
else if (a->dataType==X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelAddWithCol<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelAddWithCol<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
blockSize, stride, blockSize * stride, blockNum, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
blockSize, stride, blockSize * stride, blockNum, beta1,true);
}
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelAddWithRow<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta);
else
KernelAddWithRow<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
blockNum, blockSize, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta);
blockNum, blockSize, beta1, true);
}
}
else {
ShowNTErrors("Something is wrong!");
...
...
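As an aside on the kernels above: KernelAddWithCol / KernelAddWithColHalf add b[row] * beta to every column of each row inside each block, using the flat offset block * blockSize + row * colNum + col. A minimal CPU reference of that arithmetic, written only for illustration (plain C++, names and shapes are not part of the commit):

#include <cstdio>
#include <vector>

/* CPU reference for the column-broadcast add:
   c[k][i][j] = a[k][i][j] + b[i] * beta, offset = k * blockSize + i * colNum + j */
void AddWithColRef(const std::vector<float> &a, const std::vector<float> &b,
                   std::vector<float> &c, int rowNum, int colNum, int blockNum, float beta)
{
    int blockSize = rowNum * colNum;
    for (int k = 0; k < blockNum; k++)
        for (int i = 0; i < rowNum; i++)
            for (int j = 0; j < colNum; j++) {
                int offset = k * blockSize + i * colNum + j;
                c[offset] = a[offset] + b[i] * beta;
            }
}

int main()
{
    int rowNum = 2, colNum = 3, blockNum = 1;
    std::vector<float> a(6, 1.0F);
    std::vector<float> b = { 10.0F, 20.0F };
    std::vector<float> c(6);
    AddWithColRef(a, b, c, rowNum, colNum, blockNum, 0.5F);
    for (float v : c)
        printf("%g ", v);        /* prints: 6 6 6 11 11 11 */
    printf("\n");
    return 0;
}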
source/tensor/core/arithmetic/SumDim.cuh (View file @ 44bf9fa6)
...
...
@@ -17,6 +17,8 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/
#ifndef __SUMDIM_CUH__
...
...
source/tensor/core/arithmetic/SumDim.h (View file @ 44bf9fa6)
...
...
@@ -18,6 +18,9 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* It reached to 39 centigrade around 3:00 pm in Shenyang
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
* Four of my master students graduated. Good luck to them for their future work!
*/
#ifndef __SUMDIM_H__
...
...
@@ -39,6 +42,13 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1.
we return the resulting tensor here */
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMDIM_H__
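A small usage sketch for the interfaces declared above (illustrative only; it assumes InitTensor2D is available alongside the InitTensor1D used elsewhere in this commit, and that SetData copies a host array as Dropout.cpp below does):

#include "SumDim.h"

using namespace nts;

/* hedged sketch: add a length-3 bias b to every row of a 2 x 3 tensor a,
   i.e. c[i][j] = a[i][j] + b[j] * 0.5, by broadcasting along dimension 1 */
void SumDimExample()
{
    XTensor a, b;
    InitTensor2D(&a, 2, 3, X_FLOAT);    /* assumed helper, analogous to InitTensor1D */
    InitTensor1D(&b, 3, X_FLOAT);

    float aData[2][3] = { {1, 2, 3}, {4, 5, 6} };
    float bData[3] = { 10, 20, 30 };
    a.SetData(aData, 6);
    b.SetData(bData, 3);

    XTensor c = SumDim(a, b, 1, 0.5F);
    c.Dump(stderr, "c:");               /* expected rows: 6 12 18 and 9 15 21 */
}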
source/tensor/core/arithmetic/XTensorBLAS.cu (View file @ 44bf9fa6)
...
...
@@ -23,89 +23,12 @@
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "XTensorBLAS.h"
#include <stdint.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
#include <stdint.h>
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
//typedef char __int8;
half uint16_as_fp16(uint16_t a)
{
half res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
half f;
uint16_t i;
} cvt;
cvt.i = a;
res = cvt.f;
#endif /* __cplusplus */
return res;
}
uint32_t fp32_as_uint32(float a)
{
uint32_t res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
float f;
uint32_t i;
} cvt;
cvt.f = a;
res = cvt.i;
#endif /* __cplusplus */
return res;
}
/* host version of device function __float2half_rn() */
half float2half_rn(float a)
{
uint32_t ia = fp32_as_uint32(a);
uint16_t ir;
ir = (ia >> 16) & 0x8000;
if ((ia & 0x7f800000) == 0x7f800000) {
if ((ia & 0x7fffffff) == 0x7f800000) {
ir |= 0x7c00; /* infinity */
}
else {
ir = 0x7fff; /* canonical NaN */
}
}
else if ((ia & 0x7f800000) >= 0x33000000) {
int shift = (int)((ia >> 23) & 0xff) - 127;
if (shift > 15) {
ir |= 0x7c00; /* infinity */
}
else {
ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
if (shift < -14) { /* denormal */
ir |= ia >> (-1 - shift);
ia = ia << (32 - (-1 - shift));
}
else { /* normal */
ir |= ia >> (24 - 11);
ia = ia << (32 - (24 - 11));
ir = ir + ((14 + shift) << 10);
}
/* IEEE-754 round to nearest of even */
if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
ir++;
}
}
}
return uint16_as_fp16(ir);
}
/*
matrix multiplication via cuda version BLAS
*/
...
...
@@ -171,11 +94,24 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
//ShowNTErrors("TO DO!");
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the leading dimensions of the two tensors (lda, ldb) be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
}
else {
ShowNTErrors("Unsupported data type!");
...
...
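One practical note on the INT8 branch above: cublasGemmEx with CUDA_R_8I inputs only accepts leading dimensions that are multiples of 4, which is exactly what the mb/ma check guards. A hedged caller-side sketch of that constraint (hypothetical helpers, not part of the commit):

#include <cstdio>

/* returns true if an int8 GEMM with these leading dimensions can be dispatched
   to cublasGemmEx; otherwise the caller should pad or fall back to the float path */
static bool Int8GemmShapeOK(int lda, int ldb)
{
    return (lda % 4 == 0) && (ldb % 4 == 0);
}

/* round a leading dimension up to the next multiple of 4, e.g. for padded int8 buffers */
static int PadTo4(int ld)
{
    return (ld + 3) / 4 * 4;
}

int main()
{
    printf("%d %d\n", Int8GemmShapeOK(64, 128) ? 1 : 0, PadTo4(10));   /* prints: 1 12 */
    return 0;
}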
source/tensor/core/arithmetic/XTensorBLAS.h (View file @ 44bf9fa6)
...
...
@@ -26,8 +26,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)

half float2half_rn(float a);

/* matrix multiplication (BLAS) */
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                   XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
...
...
source/tensor/core/getandset/SetData.cpp (View file @ 44bf9fa6)
...
...
@@ -432,6 +432,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
    //CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");

    if (tensor->devID < 0) {
        _SetDataRand(tensor, lower, upper);

        DTYPE * data = (DTYPE*)tensor->data;
        for (int i = 0; i < tensor->unitNum; i++) {
            if (data[i] >= p)
                data[i] = value;
            else
                data[i] = 0;
        }
    }
    else {
#ifdef USE_CUDA
        _CudaSetDataRandP(tensor, lower, upper, p, value);
#else
        ShowNTErrors("Please recompile the code by specifying USE_CUDA");
#endif // USE_CUDA
    }
}
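In this commit _SetDataRandP is used to build an inverted-dropout mask in a single call (see Dropout.cpp below). A minimal sketch of that pattern, assuming only the function above:

/* hedged sketch: dropped items become 0, kept items carry the 1/(1-p) scale */
void MakeDropoutMask(XTensor * mask, DTYPE dropProb)
{
    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
    _SetDataRandP(mask, 0, 1.0F, dropProb, scaleFactor);
}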
/*
generate data items with a normal distribution with specified mean and standard deviation
>> tensor - the tensor that keeps the data
>> mean - mean or expectation of the distribution
...
...
source/tensor/core/getandset/SetData.cu (View file @ 44bf9fa6)
...
...
@@ -28,6 +28,7 @@
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../getandset/ConvertDataType.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -197,6 +198,7 @@ set data array with a uniform distribution in [low, high]
__global__
void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half lowerHalf = __float2half(lower);
half varianceHalf = __float2half(variance);
...
...
@@ -204,6 +206,47 @@ void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
if (i < size) {
d[i] = d[i] * varianceHalf + lowerHalf;
}
#endif
}
/*
set data items to a pre-defined value if its value >= p, set it to 0 otherwise
>> d - pointer to the data array
>> size - size of the array
>> p - the threshold
>> value - the value we intend to assign to the item
*/
__global__
void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= p)
d[i] = value;
else
d[i] = 0;
}
}
__global__
void KernelSetDataPCutHalf(half * d, int size, DTYPE p, DTYPE value)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half halfP = __float2half(p);
half halfValue = __float2half(value);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= halfP)
d[i] = halfValue;
else
d[i] = 0;
}
#endif
}
/*
...
...
@@ -473,30 +516,77 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t gen;
curandCreateGenerator (&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
if (tensor->dataType == X_FLOAT){
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
}
else{
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor1.data, tensor1.unitNum);
}
//curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
//curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
DTYPE variance = upper - lower;
if (variance != 1.0F || lower != 0) {
if (tensor->dataType == X_FLOAT) {
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_DOUBLE) {
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_FLOAT16) {
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID,tensor->mem);
curandGenerateUniform(gen, (float *)tensor1.data, tensor1.unitNum);
curandDestroyGenerator(gen);
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
else {
ShowNTErrors("TODO!");
}
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
_CudaSetDataRand(tensor, lower, upper);
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
if (tensor->dataType == X_FLOAT) {
KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
}
else if (tensor->dataType == X_FLOAT16) {
KernelSetDataPCutHalf << <blocks, threads >> >((__half*)tensor->data, tensor->unitNum, p, value);
}
else {
ShowNTErrors("TODO!")
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
...
...
source/tensor/core/getandset/SetData.cuh (View file @ 44bf9fa6)
...
...
@@ -49,6 +49,10 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
...
...
source/tensor/core/getandset/SetData.h (View file @ 44bf9fa6)
...
...
@@ -57,6 +57,10 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);

/* generate data items with a uniform distribution in [lower, upper] and set
   the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);

/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
...
...
source/tensor/core/math/Power.cpp (View file @ 44bf9fa6)
...
...
@@ -35,8 +35,6 @@ get the power(a, p)
*/
void _Power(const XTensor * a, XTensor * b, DTYPE p)
{
    CheckNTErrors(a->dataType == b->dataType, "Unmatched tensors in addition!");

#ifdef USE_CUDA
    /* run it on GPUs */
    if (a->devID >= 0) {
...
...
source/tensor/core/movement/Gather.cpp (View file @ 44bf9fa6)
source/tensor/core/movement/Spread.cpp (View file @ 44bf9fa6)
source/tensor/core/movement/Spread.cu (View file @ 44bf9fa6)
source/tensor/core/reduce/ReduceMax.cpp (View file @ 44bf9fa6)
...
...
@@ -23,7 +23,6 @@
#include "../../XName.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/reduce/ReduceMax.cu (View file @ 44bf9fa6)
...
...
@@ -24,7 +24,6 @@
#include "../../XUtility.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
...
...
@@ -544,22 +543,22 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (input->dataType == DEFAULT_DTYPE) {
if (stride == 1 && blockNum >= 10) {
if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128) {
KernelReduceMaxOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
}
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceMaxOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
}
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
...
...
@@ -581,7 +580,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -590,7 +589,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -599,7 +598,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -608,7 +607,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -617,18 +616,10 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
iter++;
} while (strideNum > 1);
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
else if (input->dataType == X_FLOAT16) {
do {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * iData = NULL;
...
...
@@ -652,7 +643,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -660,7 +651,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -668,7 +659,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -676,7 +667,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
...
...
@@ -684,8 +675,10 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
...
...
@@ -693,9 +686,6 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
} while (strideNum > 1);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(input->devID, devIDBackup);
...
...
source/tensor/core/reduce/ReduceSum.cu (View file @ 44bf9fa6)
...
...
@@ -735,8 +735,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (input->dataType == DEFAULT_DTYPE) {
if (stride == 1 && blockNum >= 10) {
if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
...
...
@@ -752,7 +751,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
else if (stride != 1 && stride * blockNum > 4096 && input->dataType == DEFAULT_DTYPE){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
...
...
@@ -763,6 +762,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
...
...
@@ -783,7 +783,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 128) {
...
...
@@ -793,7 +793,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
...
...
@@ -803,7 +803,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
...
...
@@ -813,7 +813,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else {
...
...
@@ -823,23 +823,11 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
sp = NULL;
power = (DTYPE)1.0;
isExp = false;
iter++;
} while (strideNum > 1);
}
}
else if (input->dataType == X_FLOAT16) {
do {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
...
...
@@ -866,7 +854,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 128) {
...
...
@@ -875,7 +863,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 256) {
...
...
@@ -884,7 +872,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 512) {
...
...
@@ -893,7 +881,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else {
...
...
@@ -902,9 +890,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
...
...
@@ -915,13 +904,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
iter++;
} while (strideNum > 1);
}
else {
ShowNTErrors("TODO!");
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL)
...
...
source/tensor/function/Dropout.cpp (View file @ 44bf9fa6)
...
...
@@ -27,6 +27,7 @@
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/CHeader.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -40,7 +41,7 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data.
the same inference procedure as that on the test data with no use of dropout.
>> x - input tensor
>> y - output tensor
...
...
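The 1/(1-p) factor mentioned in the comment above is the usual inverted-dropout argument; sketched in LaTeX with p the drop probability:

% each element survives with probability 1-p and is scaled by 1/(1-p)
\mathbb{E}[y_i] = (1-p)\cdot\frac{x_i}{1-p} + p\cdot 0 = x_i

So the expected activation already matches the plain inference pass, and no extra factor is needed on the test data.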
@@ -139,25 +140,35 @@ the same inference procedure as that with no use of dropout on the test data.
>> x - input tensor
>> dropProb - probability to set an element to zero
>> leadingDim - the dimension along which we generate the random numbers and perform broadcasting
>> leadingDim2 - another dimension along which we generate the random numbers and perform broadcasting
<< return - tensor after dropout
*/
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim2)
{
    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");

    int n = leadingDim < 0 ? x.order - 1 : leadingDim;
    XTensor mask;
    DTYPE * maskArray = NULL;
    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
    CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");

    if (leadingDim < 0 && leadingDim2 < 0){
        XTensor mask;
        InitTensor(&mask, &x);
        _SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
        return Multiply(x, mask);
    }
    else if (leadingDim2 < 0){
        int n = leadingDim;
        DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
        CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");

        /* generate a mask tensor with probability p */
        int unitNum = x.dimSize[n];
        DTYPE * maskArray = new DTYPE[unitNum];
        maskArray = new DTYPE[unitNum];

        //srand((unsigned int)time(NULL));
        for (int i = 0; i < unitNum; i++) {
        for (int i = 0; i < unitNum; i++)
            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
        }

        XTensor mask;
        InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
...
...
@@ -167,57 +178,66 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
        if (x.dataType == X_FLOAT) {
            return MultiplyDim(x, mask, n, 0);
            return MultiplyDim(x, mask, n);
        }
        else
        else if (x.dataType == X_FLOAT16) {
            XTensor mask1;
            mask1 = ConvertDataType(mask, X_FLOAT16);
            XTensor mask1(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
            //mask1 = ConvertDataType(mask, X_FLOAT16);
            _ConvertDataType(&mask, &mask1);
            return MultiplyDim(x, mask1, n, 0);
            return MultiplyDim(x, mask1, n);
        }
        }
        else {
            ShowNTErrors("TODO!");
        }
    }
    else {
        int n = leadingDim;
        int m = leadingDim2;
        CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
        CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");

        /* generate a mask tensor with probability p */
        int unitNum = x.dimSize[n] * x.dimSize[m];
        maskArray = new DTYPE[unitNum];
//XTensor DropoutFloat16(const XTensor &x, DTYPE dropProb, int leadingDim)
//{
// CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
//
// int n = leadingDim < 0 ? x.order - 1 : leadingDim;
//
// CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
//
// DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
//
// /* generate a mask tensor with probability p */
// int unitNum = x.dimSize[n];
// DTYPE * maskArray = new DTYPE[unitNum];
//
// //srand((unsigned int)time(NULL));
// for (int i = 0; i < unitNum; i++) {
// maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
// }
//
// XTensor mask;
// InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
// mask.SetData(maskArray, unitNum);
//
// delete[] maskArray;
//
// XTensor halfMask;
// halfMask = ConvertDataType(mask, X_FLOAT16);
// XTensor halfX;
// halfX = ConvertDataType(x, X_FLOAT16);
// XTensor result;
// XTensor halfResult;
//
// halfResult = MultiplyDim(halfX, halfMask, n, 0);
//
// result = ConvertDataType(halfResult, X_FLOAT);
// return result;
// /*return MultiplyDim(x, mask1, n, 0);*/
//}
//srand((unsigned int)time(NULL));
        for (int i = 0; i < unitNum; i++)
            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);

        int dims[MAX_TENSOR_DIM_NUM];
        for (int i = 0; i < x.order; i++)
            dims[i] = 1;
        dims[n] = x.GetDim(n);
        dims[m] = x.GetDim(m);

        InitTensor(&mask, x.order, dims, X_FLOAT, x.denseRatio, x.devID, x.mem);
        mask.SetData(maskArray, unitNum);

        delete[] maskArray;

        if (x.dataType == X_FLOAT) {
            return MultiplyBroadcast(x, mask);
        }
        else if (x.dataType == X_FLOAT16) {
            XTensor mask1(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
            //mask1 = ConvertDataType(mask, X_FLOAT16);
            _ConvertDataType(&mask, &mask1);
            return MultiplyBroadcast(x, mask1);
        }
        else {
            ShowNTErrors("TODO!");
        }
    }
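A short usage sketch for the extended interface reconstructed above (illustrative dimension indices; with both leading dimensions given, the mask varies only along those two dimensions and is broadcast over the rest):

/* hedged example: dropout on a [batch, length, hidden] tensor */
XTensor DropoutExamples(const XTensor &x)
{
    /* element-wise dropout: one random draw per element, via _SetDataRandP */
    XTensor y1 = Dropout(x, 0.1F);

    /* dropout broadcast along a single dimension (here dimension 2) */
    XTensor y2 = Dropout(x, 0.1F, 2);

    /* dropout broadcast along two dimensions (here dimensions 1 and 2) */
    XTensor y3 = Dropout(x, 0.1F, 1, 2);

    return y3;
}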
/*
dropout function without broadcast
...
...
@@ -235,7 +255,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
    int unitNum = x.unitNum;
    DTYPE * maskArray = new DTYPE[unitNum];

    srand((unsigned int)time(NULL));
    for (int i = 0; i < unitNum; i++)
        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
...
...
source/tensor/function/Dropout.h (View file @ 44bf9fa6)
...
...
@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
inline DTYPE RandomBernoulli(DTYPE dropProb, DTYPE value)
{
    return (DTYPE)rand() / (DTYPE)RAND_MAX >= dropProb ? (DTYPE)value : 0;
}
/* dropout function */
...
...
@@ -42,9 +41,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
                      unsigned int seed, DTYPE dropProb, int leadingDim = -1);

/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1, int leadingDim2 = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
...
...
source/tensor/function/LogSoftmax.cpp (View file @ 44bf9fa6)
...
...
@@ -27,9 +27,6 @@
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/movement/CopyValues.h"
#include "../../tensor/core/getandset/ConvertDataType.h"
using namespace nts;

namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -182,125 +179,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
delete[] dimSize;
// if (!x->isSparse && !y->isSparse &&
// x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
// {
// int * dimSize = new int[x->order - 1];
// for (int i = 0; i < x->order; i++) {
// if (i < leadDim)
// dimSize[i] = -x->dimSize[i];
// else if (i > leadDim)
// dimSize[i - 1] = -x->dimSize[i];
// }
//
// XMem * mem = x->mem;
// XTensor * max = NULL;
// XTensor * sum = NULL;
// XTensor * blockx = NULL;
// XTensor * blocky = NULL;
// XTensor * blockMax = NULL;
// XTensor * blockSum = NULL;
//
// int dimensionSize = y->dimSizeRDI[leadDimRDI];
// int stride = 1;
// int blockSize = 1;
// int blockNum = 1;
//
// for (int i = 0; i < leadDimRDI; i++)
// stride *= y->dimSizeRDI[i];
// blockSize = stride * dimensionSize;
// blockNum = y->unitNum / blockSize;
//
// max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
// sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
//
// _ReduceMax(x, max, leadDim);
// _ReduceSum(x, sum, leadDim, max, 1.0F, true);
//
// if (x->devID >= 0) {
// if (leadDimRDI == 0) {
// blockSize = y->unitNum;
// blockNum = 1;
// blockx = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// }
// else {
// blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// }
// }
//
// for (int k = 0; k < blockNum; k++) {
// int m = stride;
// int n = dimensionSize;
//
// DTYPE * ip = (DTYPE*)x->data + k * blockSize;
// DTYPE * op = (DTYPE*)y->data + k * blockSize;
// DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
// DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
//
// if (x->devID < 0) {
// for (int j = 0; j < m; j++) {
// DTYPE sumValue = sp[j];
// if (sumValue == 0) {
// for (int i = 0; i < n; i++)
// op[i * m + j] = 0;
// }
// else {
// for (int i = 0; i < n; i++) {
// DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
// if (IsNAN(r))
// r = LOGPROB_MIN;
// if (IsINF(r))
// r = LOGPROB_MIN;
//
// op[i * m + j] = MAX(r, LOGPROB_MIN);
// }
// }
// }
// }
// else {
// blockx->data = ip;
// blocky->data = op;
// blockMax->data = mp;
// blockSum->data = sp;
//#ifdef USE_CUDA
// if (leadDimRDI == 0)
// _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
// else
// _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
//#else
// ShowNTErrors("Please specify USE_CUDA and recompile the code!");
//#endif
// blockx->data = NULL;
// blocky->data = NULL;
// blockMax->data = NULL;
// blockSum->data = NULL;
// }
// }
//
// DelTensorBuf(max);
// DelTensorBuf(sum);
//
// if (x->devID >= 0) {
// delete blockx;
// delete blocky;
// delete blockMax;
// delete blockSum;
// }
//
// delete[] dimSize;
// }
// else
// ShowNTErrors("TODO!");
}
/*
...
...
source/tensor/function/LogSoftmax.cu (View file @ 44bf9fa6)
...
...
@@ -26,7 +26,6 @@
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
#include "device_launch_parameters.h"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/function/Softmax.cpp (View file @ 44bf9fa6)
...
...
@@ -26,7 +26,6 @@
#include "../XUtility.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
source/tensor/function/Softmax.cu (View file @ 44bf9fa6)
...
...
@@ -29,7 +29,6 @@
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
#include "../XUtility.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...
@@ -86,8 +85,6 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
/* synchronize to make sure the values of max and sum are loaded */
__syncthreads();
//printf("1: %d %d %d %d\n", i, strideSizeTotal, j, strideNum);
if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
...
...
@@ -145,8 +142,6 @@ void KernelSoftmaxComputeTensorHalf(__half * x, __half * max, __half * sum, __ha
/* synchronize to make sure the values of max and sum are loaded */
__syncthreads();
//printf("2: %d %d %d %d\n",i ,stride * blockNum ,j ,strideNum);
if(i < stride * blockNum && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
...
...
@@ -256,7 +251,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
if (leadDim != 0 || dimensionSize <= 10) {
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
...
...
@@ -269,8 +263,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
}
else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
//printf("%d\n\n",dimensionSize);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelSoftmaxComputeTensorHalf <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
stride, dimensionSize, blockNum);
...
...
@@ -279,10 +271,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
ShowNTErrors("TODO!");
}
/*XTensor y1;
y1 = ConvertDataType(*y, X_FLOAT);
y1.Dump(stderr, "y1:");*/
BacktoCudaDev(x->devID, devIDBackup);
}
...
...