Commit 2a2bd312 by 张裕浩

update files and start work

parent 30dd9d30
# NiuTrans.Tensor Environment Setup
## Notes
The latest CUDA release (9.2) does not yet support the latest version of VS2017. We therefore recommend CUDA 9.0 or 9.1 together with VS2015, or, if you use VS2017, installing the v140 toolset. Set the solution platform to x64.
## CUDA Settings
Once VS and CUDA are installed and the environment variables are set, the key CUDA options are listed below. All of them can be found under **Project -> Properties**.
>$(CUDA_PATH)\include
Add this to **VC++ Directories -> Include Directories**.
>$(CUDA_PATH)\lib\Win32
Add this to **VC++ Directories -> Library Directories**.
>cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
Add these to **Linker -> Input -> Additional Dependencies**.
After that, right-click the project, open **Project -> Project Dependencies**, and select CUDA 9.
Right-click each .cu file, open its properties, and set the Item Type to "CUDA C/C++" (it is easiest to search for all .cu files and set them in one pass).
## Other Settings
Set **C/C++ -> General -> SDL checks** to No.
Under **C/C++ -> Preprocessor -> Preprocessor Definitions**, add
>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CONSOLE;
Set **Linker -> System -> SubSystem** to Console.
Set **General -> Character Set** to Use Unicode Character Set.
Set any command-line arguments the executable needs under **Debugging -> Command Arguments**.
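To verify the setup, a small smoke test can help. The sketch below is a hypothetical standalone file (not part of the library); it only assumes the USE_CUDA macro defined above and the standard CUDA runtime API.

```cpp
// check_cuda.cpp - hypothetical smoke test for the configuration above
#include <stdio.h>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif

int main()
{
#ifdef USE_CUDA
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("found %d CUDA device(s)\n", count);
#else
    printf("built without USE_CUDA\n");
#endif
    return 0;
}
```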
......@@ -21,95 +21,275 @@
#include <stdio.h>
#include "XNet.h"
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../tensor/test/Test.h"
#include <cuda_runtime.h>
#include <time.h>
#include <windows.h>
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
//#include "../tensor/timer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
void SplitBackwardTest();
void MemTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
void test()
{
int * dimSize = new int[2];
dimSize[0] = 10000;
dimSize[1] = 1000;

/* b1 lives on GPU 0; b3 lives on the CPU (devID = -1) */
XTensor b1(2, dimSize, X_FLOAT, 1.0F, 0, NULL);
XTensor b3(2, dimSize, X_FLOAT, 1.0F, -1, NULL);

DWORD m_start_time;
DWORD m_end_time;
double time_diff = 0.0;

/* time _SetDataRand on the GPU */
m_start_time = GetTickCount();
_SetDataRand(&b1, -2.0F, 2.0F);
cudaDeviceSynchronize();
m_end_time = GetTickCount();
time_diff = m_end_time - m_start_time;
printf("time %f ms\n", time_diff);

/* time _SetDataRand on the CPU */
m_start_time = GetTickCount();
_SetDataRand(&b3, -2.0F, 2.0F);
m_end_time = GetTickCount();
time_diff = m_end_time - m_start_time;
printf("time %f ms\n", time_diff);

/* convert a float32 tensor to float16 on the GPU */
XTensor a;
InitTensor2D(&a, 100, 100, X_FLOAT, 0);
XTensor b;
InitTensor2D(&b, 100, 100, X_FLOAT16, 0);
_ConvertDataType(&a, &b);

delete[] dimSize;
}
int main( int argc, const char ** argv )
{
//_CrtSetBreakAlloc(896);
//test();
//BackwardTest();
//SplitBackwardTest();
//MemTest();

if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}

//_CrtDumpMemoryLeaks();
return 0;
}
void BackwardTest()
{
XNet net;

XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 2, 3);
InitTensor1D(&b, 2);

a.SetZeroAll();
b.SetZeroAll();

a.Set2D(1.0F, 0, 0);
a.Set2D(2.0F, 0, 1);
a.Set2D(3.0F, 0, 2);
a.Set2D(4.0F, 1, 0);
a.Set2D(5.0F, 1, 1);
a.Set2D(6.0F, 1, 2);

b.Set1D(2.0F, 0);
b.Set1D(1.0F, 1);

a.Dump(stderr, "a:");
b.Dump(stderr, "b:");

/* c_{i,j} = a_{i,j} / b_i (division broadcast along dimension 0) */
c = DivDim(a, b, 0);
c.Dump(stderr, "c:");
//XLink::ShowNetwork(stderr, &c);

net.Backward(c);
net.Dump(stderr);
}
void TransposeTest()
{
#ifdef USE_CUDA
XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
//XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
XTensor x;
XTensor y;
XTensor z;
int loops = 2000;
int B = 3 * 2 * 4;
int K = 8 * 1;
int N = 50;
int H = 512 * 4;
int nnn = GDevs.nGPU;
InitTensor3D(&x, B, N, H, X_FLOAT, 0);
InitTensor4D(&y, K, B, N, H/K, X_FLOAT, 0);
InitTensor3D(&z, B, N, H, X_FLOAT, 0);
cudaEvent_t ctime0;
cudaEvent_t ctime1;
cudaEvent_t ctime2;
cudaEvent_t ctime3;
cudaEvent_t ctime4;
cudaEvent_t ctime5;
float elapsedSplit = 0.0;
float elapsedMerge = 0.0;
float elapsedSum = 0.0;
cudaEventCreate(&ctime0);
cudaEventCreate(&ctime1);
cudaEventCreate(&ctime2);
cudaEventCreate(&ctime3);
cudaEventCreate(&ctime4);
cudaEventCreate(&ctime5);
cudaEventRecord(ctime0, 0);
double time0 = GetClock();
for(int i = 0; i < loops; i++)
_Split(&x, &y, 2, K);
double time1 = GetClock();
cudaEventRecord(ctime1, 0);
cudaEventSynchronize(ctime1);
cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);
cudaEventRecord(ctime2, 0);
double time2 = GetClock();
for(int i = 0; i < loops; i++)
_Merge(&y, &x, 3);
double time3 = GetClock();
cudaEventRecord(ctime3, 0);
cudaEventSynchronize(ctime3);
cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);
cudaEventRecord(ctime4, 0);
double time4 = GetClock();
for(int i = 0; i < loops; i++)
_Sum(&x, &z, &x);
double time5 = GetClock();
cudaEventRecord(ctime5, 0);
cudaEventSynchronize(ctime5);
cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);
fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);
#endif
}
void SumDimTest()
{
XTensor x;
XTensor y;
XTensor z;
int a = 5;
int b = 7;
int c = 3;
InitTensor3D(&x, a, b, c, X_FLOAT, -1);
InitTensor1D(&y, c, X_FLOAT, -1);
InitTensor3D(&z, a, b, c, X_FLOAT, -1);
x.SetZeroAll();
y.SetZeroAll();
z.SetZeroAll();
DTYPE * data = new DTYPE[x.unitNum];
for(int i = 0; i < x.unitNum; i++)
data[i] = (DTYPE)i;
x.SetData(data, x.unitNum);
for(int i = 0; i < y.unitNum; i++)
data[i] = -(DTYPE)i;
y.SetData(data, y.unitNum);
_SumDim(&x, &y, &z, 2);
z.Dump(stderr, "z:");
delete[] data;
}
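For reference, what _SumDim computes here, following the description used for SumDim elsewhere in this code (c = a + b * \beta, with b sized as one dimension of a and \beta defaulting to 1):

$$z_{i,j,k} = x_{i,j,k} + y_k \qquad (n = 2),$$

so the 3-element vector y is broadcast over the last dimension of x.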
void SplitBackwardTest()
{
XTensor t1;
InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
XTensor t2;
InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
XTensor tensor;

//_SetDataFixedFloat(&t1, 1.0F);
//_SetDataFixedFloat(&t2, 2.0F);
t1.SetDataRand();
t2.SetDataRand();
tensor = t1 + t2;

/* split the 2 x 4 tensor into two 2 x 2 pieces along dimension 1 */
XList smalls;
XTensor first;
XTensor second;
InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
smalls.Add(&first);
smalls.Add(&second);
Split(tensor, smalls, 1, 2);

XTensor sum;
sum = Sum(first, second);

XNet net;
net.Backward(sum);
net.Dump(stderr);

printf("Done!\n");
}
void MemTest()
{
XMem * mem;
mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
XTensor tensor;
InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
tensor.SetZeroAll();
tensor.Dump(stderr);
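/* note: deleting the pool below leaves tensor.mem dangling; the rest of this test deliberately probes that state */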
delete mem;
if (tensor.mem != NULL) {
printf("It isn't null!\n");
printf("%d\n", (int)tensor.mem->signature);
}
else {
printf("It's null\n");
}
tensor.Dump(stderr);
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "XNoder.h"
#include "XBackwardData.h"
#include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
namespace nts{
/* compute dE/dx of a node */
void XDataGrad::MakeGrad(XTensor * node, bool isEfficient)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == GETANDSET_CONVERTDATATYPE)
GradConvertDataType(node, isEfficient);
else if(operID == GETANDSET_INDEXTOONEHOT)
GradIndexToOnehot(node, isEfficient);
else if(operID == GETANDSET_ONEHOTTOINDEX)
GradOnehotToIndex(node, isEfficient);
else{
ShowNTErrors("TODO!");
}
}
/* indicates whether the node is for a data operation */
bool XDataGrad::IsDataOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & DATA_BASE) != 0;
}
/*
gradient computation for ConvertDataType
for
b = convertdatatype(a)
we have
dE/da = convertdatatype(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
_ConvertDataType(node->grad, input->grad);
}
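This works because type conversion is element-wise and value-preserving (up to rounding), so the Jacobian is the identity:

$$b_i = \mathrm{cast}(a_i) \;\Rightarrow\; \frac{\partial E}{\partial a_i} = \mathrm{cast}\!\left(\frac{\partial E}{\partial b_i}\right),$$

and the backward pass only needs to cast node->grad back to the input's data type, which is exactly what the call above does.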
/*
gradient computation for OnehotToIndex
for
b = onehottoindex(a)
no gradient is propagated back to a here; we only allocate
the gradient tensor of the input
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for OnehotToIndex!");

XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
}
/*
gradient computation for IndexToOnehot
for
b = indextoonehot(a)
no gradient is propagated back to a here; we only allocate
the gradient tensor of the input
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");

XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
}
} // namespace nts(NiuTrans.Tensor)
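For context, a sketch of how a caller might route nodes to this class; the names come from this file, while the surrounding XNet wiring is assumed rather than shown.

```cpp
/* hypothetical caller: route a node to the right gradient family */
void BackwardNodeSketch(XTensor * node, bool isEfficient)
{
    if (XDataGrad::IsDataOP(node))
        XDataGrad::MakeGrad(node, isEfficient);
    /* ... math, shape and function ops are handled by their own classes */
}
```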
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __XBACKWARDDATA_H__
#define __XBACKWARDDATA_H__
namespace nts{
/* this class computes the gradient for tensor data operation given a node */
class XDataGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficent);
/* indicates whether the node is for a shaping operation */
static
bool IsDataOP(XTensor * node);
private:
/* gradient computation for ConverDataType: b = converdatatype(a, datatype) */
static
void GradConvertDataType(XTensor * node, bool isEfficent);
/* gradient computation for IndexToOnehot: b = indextoonehot(a, num) */
static
void GradIndexToOnehot(XTensor * node, bool isEfficent);
/* gradient computation for OnehotToIndex: b = onehottoindex(a, num) */
static
void GradOnehotToIndex(XTensor * node, bool isEfficent);
};
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
......@@ -29,10 +29,8 @@
namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
......@@ -51,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
......@@ -60,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
......
......@@ -35,7 +35,7 @@ class XFuncGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for an activation function */
static
......
......@@ -21,8 +21,13 @@
#include "XBackwardLoss.h"
#include "../tensor/XName.h"
#include "../tensor/core/getandset/SetData.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
#include "../tensor/function/LogSoftmax.h"
#include "../tensor/function/Rectify.h"
#include "../tensor/function/Sigmoid.h"
#include "../tensor/function/Softmax.h"
namespace nts{
......@@ -38,7 +43,7 @@ compute dE/dx for a given function y = f(x)
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
......@@ -49,9 +54,21 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_IDENTITY){
_IdentityBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SIGMOID){
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
......@@ -67,10 +84,26 @@ compute dE/dy for variable y and error(loss) function E
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
_LossBackward(dedy, gold, y, lossName);
if(gold == NULL){
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else{
ShowNTErrors("TODO");
}
return;
}
//_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold, NULL, padding);
}
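Assuming the standard cross-entropy used by _CrossEntropyBackward, $E = -\sum_i g_i \log y_i$, the gradient filled into dedy is

$$\frac{\partial E}{\partial y_i} = -\frac{g_i}{y_i},$$

masked by the padding tensor where one is given; when gold is NULL the code instead fills dE/dy with ones, i.e. it treats the root itself as the quantity being differentiated.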
}
\ No newline at end of file
......@@ -36,13 +36,13 @@ class XLossGrad
public:
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName);
};
......
......@@ -33,116 +33,131 @@ class XMathGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);

/* indicates whether the node is for a math operation */
static
bool IsMathOP(XTensor * node);

private:
/* gradient for absolute */
static
void GradAbsolute(XTensor * node, bool isEfficient);

/* gradient for cos */
static
void GradCos(XTensor * node, bool isEfficient);

/* gradient for exp */
static
void GradExp(XTensor * node, bool isEfficient);

/* gradient for log: c = log(a) */
static
void GradLog(XTensor * node, bool isEfficient);

/* gradient for round */
static
void GradRound(XTensor * node, bool isEfficient);

/* gradient for sign */
static
void GradSign(XTensor * node, bool isEfficient);

/* gradient for sin */
static
void GradSin(XTensor * node, bool isEfficient);

/* gradient for tan */
static
void GradTan(XTensor * node, bool isEfficient);

/* gradient for clip */
static
void GradClip(XTensor * node, bool isEfficient);

/* gradient for Divide */
static
void GradDiv(XTensor * node, bool isEfficient);

/* gradient for DivideDim */
static
void GradDivDim(XTensor * node, bool isEfficient);

/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * node, bool isEfficient);

/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
XTensor * dedc, DTYPE alpha, bool isEfficient);

/* gradient for matrix multiply in batch mode.
for each batch: c_i = matmul(a_i, b_i) * \alpha */
static
void GradMatrixMulBatched(XTensor * node, bool isEfficient);

/* gradient for multiply (dot product): c = a * b * \alpha */
static
void GradMultiply(XTensor * node, bool isEfficient);

/* gradient for multiply with one dimension: c = a * b * \alpha
where the size of b is equal to that of one dimension of a */
static
void GradMultiplyDim(XTensor * node, bool isEfficient);

/* gradient for negate */
static
void GradNegate(XTensor * node, bool isEfficient);

/* gradient for normalize */
static
void GradNormalize(XTensor * node, bool isEfficient);

/* gradient for power */
static
void GradPower(XTensor * node, bool isEfficient);

/* gradient for ScaleAndShift */
static
void GradScaleAndShift(XTensor * node, bool isEfficient);

/* gradient for Minus */
static
void GradSub(XTensor * node, bool isEfficient);

/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node, bool isEfficient);

/* gradient for sum: c = a + b * \beta */
static
void GradSum(XTensor * node, bool isEfficient);

/* gradient for sum with one dimension: c = a + b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSumDim(XTensor * node, bool isEfficient);

/* gradient for reduceMean */
static
void GradReduceMean(XTensor * node, bool isEfficient);

/* gradient for reduceSum */
static
void GradReduceSum(XTensor * node, bool isEfficient);

/* gradient for reduceSumSquared */
static
void GradReduceSumSquared(XTensor * node, bool isEfficient);

/* gradient for reduceVariance */
static
void GradReduceVariance(XTensor * node, bool isEfficient);
};
}
......
......@@ -34,7 +34,7 @@ class XShapeGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a shaping operation */
static
......@@ -42,39 +42,52 @@ public:
/* post processing of a node */
static
void PostProcessing(XTensor * node, int typeId, bool isEfficient);
private:
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static
void GradCopyIndexed(XTensor * node, bool isEfficient);

/* gradient computation for gathering indexed sub-tensors: b = gather(a, index) */
static
void GradGather(XTensor * node, bool isEfficient);

/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficient);

/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node, bool isEfficient);

/* gradient computation for transposing a tensor : b = transpose(a) */
static
void GradTranspose(XTensor * node, bool isEfficient);

/* gradient computation for reshaping a tensor: c = reshape(a) */
static
void GradReshape(XTensor * node, bool isEfficient);

/* gradient computation for split: c = split(a) */
static
void GradSplit(XTensor * node, bool isEfficient);

/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a) */
static
void GradSplitList(XTensor * node, bool isEfficient);

/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of splitting have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs at a time. This is good for system speed-up. */
static
void GradSplitListPost(XTensor * node, bool isEfficient);

/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node, bool isEfficient);
};
}
......
......@@ -47,6 +47,9 @@ struct XNet
/* input nodes of the network */
XList inputs;
/* indicates whether the network just keeps the gradient for parameter tensors */
bool isGradEfficient;
/* constructor */
XNet();
......@@ -56,25 +59,32 @@ struct XNet
/* clear the network */
void Clear();
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficient = false);

/* backward computation (in post processing) for a given node */
void BackwardNodePost(XTensor * node, bool isEfficient = false);
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
......@@ -89,6 +99,18 @@ struct XNet
/* dump network information */
void Dump(FILE * file);
/* set the flag of gradient-efficient */
void SetGradEfficientFlag(bool flag = true);
/* generate the gradient-efficient flag for every node */
void MakeEfficientNet();
/* clear the gradient information if the node is of no use */
void ClearGrad(XTensor * node);
/* show network topology */
void ShowNetwork(FILE * file, XTensor * node);
};
/* we make a unique id for every tensor */
......
......@@ -74,6 +74,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/*
entry of the program
......@@ -99,7 +100,7 @@ arguments:
(how many words)
-shuffle: shuffle the training data
-devid D: the id of the device used
-1: CPU, >=0: GPUs
-mempool: use memory pools for memory management
-autodiff: use automatic differentiation for training
......@@ -230,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
}
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], "-mempool"))
if (!strcmp(argv[i], "-mempool"))
model.mem = new XMem(model.devID);
}
}
......@@ -476,7 +477,12 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
Clear(model, true);
/* forward + backward process */
/* this is implemented with the gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);

/* this is implemented with the multiply function */
//ForwardAutoDiff(inputs, output, model);
/* automatic differentiation */
autoDiffer.Backward(output, gold, CROSSENTROPY);
......@@ -508,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
if(isEnd)
break;
Test(testFN, outputFN, model);
}
double elapsed = GetClockSec() - startT;
......@@ -707,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise.
>> devID - device id
>> mem - memory pool
*/
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
tensor.SetZeroAll();
/* set non-zero cells */
for(int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]);
}
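A hypothetical call, for reference: building a 2 x 3 tensor with ones at positions (0, 2) and (1, 0).

```cpp
/* hypothetical usage sketch for InitZeroOneTensor2D */
int rows[] = {0, 1};
int cols[] = {2, 0};
XTensor t;
InitZeroOneTensor2D(t, 2, 3, rows, cols, 2, -1, NULL); /* devID = -1: CPU, no memory pool */
```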
/*
......@@ -851,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = log-softmax(s) */
_LogSoftmax(&s, &y, 1);
}
}
/*
......@@ -884,7 +882,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* for y = softmax(s), we get dE/ds
where E is the error function (defined by the loss) */
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
/* for s = x * w, we get
dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
......@@ -975,7 +973,55 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
}
/*
forward process (with tensor connections); this is implemented with the gather function
>> ngrams - the loaded ngrams
>> batch - the number of ngrams in the batch
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
int size = batch * (n-1);
int * index = new int[size];
for(int i = 0; i < batch; i++){
for (int j = 0; j < n-1; j++){
int a = i * (n - 1) + j;
index[a] = ngrams[i].words[j];
}
}
InitTensor1D(&words, size, X_INT, model.devID, model.mem);
words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
delete[] index;
int dimSize[2];
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */
for(int i = 0; i < depth; i++)
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
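As a concrete illustration with hypothetical sizes: for batch = 2 and n = 3, each ngram contributes n - 1 = 2 context words, so words holds 4 indices, Gather returns a 4 x eSize matrix embeddingBig, and the Reshape folds it into the 2 x (2 * eSize) matrix hidden, one row of concatenated context embeddings per ngram.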
/*
forward process (with tensor connections); this is implemented with the multiply function
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
......@@ -1011,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
}
/*
......@@ -1122,8 +1167,12 @@ void Test(const char * test, const char * result, FNNModel &model)
/* forward computation */
Forward(inputs, output, model, net);
}
else {
/* this is implemented with the gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);

/* this is implemented with the multiply function */
//ForwardAutoDiff(inputs, output, model);
}
/* prediction probabilities */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is a simple implementation of the feed-forward network-based language
* model (FNNLM). See more details about FNNLM in
* "A Neural Probabilistic Language Model" by Bengio et al.
* Journal of Machine Learning Research 3 (2003) 1137-1155
*
* $Created by: ZHANG Yuhao (yoohao.zhang@gmail.com) 2018-08-08
* It's my first time writing a neural network; I hope there will be fewer bugs.
*/
#ifndef __MYFNNLM_H__
#define __MYFNNLM_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace myfnnlm {
void fnnlm();
}
#endif
\ No newline at end of file
......@@ -35,6 +35,8 @@ T2TAttention::T2TAttention()
dk = -1;
dv = -1;
d = -1;
isMasked = false;
ignored = 0;
}
/* deconstructor */
......@@ -46,13 +48,19 @@ T2TAttention::~T2TAttention()
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
isMasked = myIsMasked;
ignored = myIgnored;
float minmax = 0;
......@@ -61,18 +69,22 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-finfoutv, finfoutv);
wa.SetDataRand(-finfouta, finfouta);
}
/*
......@@ -82,9 +94,11 @@ make the network
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the additive mask applied to the attention scores
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
......@@ -105,14 +119,26 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
vheads = Split(v2, v2.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* att = softmax(Q * K^T / sqrt(dk/nhead) + mask) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if(isMasked)
dot = dot + mask;
dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
scalar = Softmax(dot, -1);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads and apply the output transformation wa */
return MMul(Merge(att, att.order - 1), wa);
}
}
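In formula form, each head computed in Make above is

$$\mathrm{Att}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k / h}} + M\right) V,$$

where h is the number of heads (nhead) and M is the additive mask (added only when isMasked is set); dropout is applied to the attention weights during training, and the merged heads are projected by wa.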
......@@ -57,6 +57,9 @@ public:
/* transformation matrix for V */
XTensor wv;
/* transformation after dot-product attention */
XTensor wa;
/* size of transformed Q and K */
int dk;
......@@ -66,6 +69,19 @@ public:
/* size of input Q, K and V */
int d;
/* indicates whether the attention is masked */
bool isMasked;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* indicates whether the model is used for training */
bool isTraining;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
T2TAttention();
......@@ -74,10 +90,12 @@ public:
~T2TAttention();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
*/
#include <math.h>
#include "T2TDecoder.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
AttDecoder::AttDecoder()
{
attentionsEnde = NULL;
attEndeLayerNorms = NULL;
}
/* de-constructor */
AttDecoder::~AttDecoder()
{
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec);
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
XTensor ende;
XTensor ln;
XTensor fnn;
XTensor res;
/******************/
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization */
x = attLayerNorms[i].Make(res);
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
res = Sum(ende, x);
/* layer normalization */
x = attEndeLayerNorms[i].Make(res);
/*******/
/* fnn */
fnn = fnns[i].Make(x, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
return x;
}
}
......@@ -22,19 +22,33 @@
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#include "T2TEncoder.h"
namespace transformer
{
class AttDecoder : public AttEncoder
{
public:
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;

/* layer normalization for encoder-decoder attention */
T2TLN * attEndeLayerNorms;
public:
/* constructor */
AttDecoder();
/* deconstructor */
~AttDecoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
};
}
......
......@@ -48,7 +48,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......@@ -60,16 +60,18 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
}
/*
make positional embeddings (of size eSize * length)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
......@@ -79,6 +81,17 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
for(int pos = 0; pos < length; pos++){
float * dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)sin(pos/pow(10000.0F, 2.0F*i/(d - 2)));
}
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)cos(pos/pow(10000.0F, 2.0F*i/(d - 2)));
}
/*
for(int k = 0; k < eSize; k++){
if(k % 2 == 0){
int i = k/2;
......@@ -89,6 +102,7 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
}
}
*/
}
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
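The loop above implements the sinusoidal positional embeddings of Vaswani et al. (2017), with all sine channels placed before all cosine channels and, as the code is written, d - 2 rather than d in the exponent:

$$PE(pos, i) = \sin\!\left(\frac{pos}{10000^{2i/(d-2)}}\right), \qquad PE\!\left(pos, i + \tfrac{eSize}{2}\right) = \cos\!\left(\frac{pos}{10000^{2i/(d-2)}}\right), \qquad 0 \le i < \tfrac{eSize}{2}.$$

The commented-out block it replaces interleaved even and odd channels instead.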
......@@ -101,15 +115,18 @@ make the network
*/
XTensor T2TEmbedder::Make(XTensor &input)
{
//CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
bool match = (posEmbedding.order == input.order);
if(match){
......@@ -120,8 +137,10 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
......@@ -130,13 +149,12 @@ XTensor T2TEmbedder::Make(XTensor &input)
DelTensorBuf(posTMP);
}
/* then we make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
}
}
......@@ -63,9 +63,6 @@ public:
the embedding processing by re-loading. */
XTensor posEmbeddingBase;
/* positional embeddings */
XTensor posEmbedding;
public:
/* constructor */
T2TEmbedder();
......@@ -74,7 +71,7 @@ public:
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length);
......
......@@ -31,6 +31,10 @@ namespace transformer
/* constructor */
AttEncoder::AttEncoder()
{
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
}
/* de-constructor */
......@@ -46,18 +50,24 @@ AttEncoder::~AttEncoder()
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttEncoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
......@@ -72,7 +82,7 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
......@@ -82,14 +92,23 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used by the encoder
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input);
//x.Dump(tmpFILE, "embedding: ");
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
XTensor ln;
......@@ -97,24 +116,28 @@ XTensor AttEncoder::Make(XTensor &input)
XTensor res;
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization */
x = attLayerNorms[i].Make(res);
/* fnn */
fnn = fnns[i].Make(x, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
......@@ -122,4 +145,18 @@ XTensor AttEncoder::Make(XTensor &input)
return x;
}
/*
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
{
XTensor nothing;
return Make(input, mask, nothing, isTraining);
}
}
......@@ -40,7 +40,7 @@ class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining) = 0;
};
/*
......@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder
{
public:
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining);
};
......@@ -77,6 +77,13 @@ public:
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
......@@ -106,10 +113,15 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
};
......
......@@ -49,7 +49,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......@@ -58,8 +58,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
......@@ -83,12 +84,15 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor &input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MMul(input, w1) + b1);
if(isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
return MMul(t1, w2) + b2;
......
......@@ -59,6 +59,9 @@ public:
/* bias of transformation 2 */
XTensor b2;
/* dropout probability */
DTYPE dropoutP;
public:
......@@ -69,10 +72,10 @@ public:
~T2TFNN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
};
......
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
......@@ -26,12 +27,13 @@
namespace transformer
{
/* constructor */
T2TLN::T2TLN()
{
devID = -1;
mem = NULL;
d = 0;
}
/* de-constructor */
......@@ -46,28 +48,25 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll();
}
/*
make the network
for each layer representation x, we have
y = w * (x - \mu)/\sigma + b
>> input - the input tensor
>> return - layer normalization output
*/
......@@ -90,16 +89,16 @@ XTensor T2TLN::Make(XTensor &input)
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
xn = (x - meanFilled) / standardFilled;
/* result = x' * w + b */
return xn * w + b;
}
}
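In formula form, Make computes, for each position x,

$$y = w \odot \frac{x - \mu}{\sigma} + b,$$

where $\mu$ and $\sigma$ are the mean and standard deviation along the last dimension. Note that, as written, no small $\varepsilon$ is added to the variance before the square root, so an all-constant input would divide by zero.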
......@@ -45,6 +45,9 @@ public:
/* the bias term b */
XTensor b;
/* dimension size of the model */
int d;
public:
/* constructor */
......@@ -54,7 +57,7 @@ public:
~T2TLN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input);
......
......@@ -41,13 +41,13 @@ public:
XMem * mem;
/* the encoder */
AttEncoder * encoder;

/* the decoder */
AttDecoder * decoder;

/* output layer */
T2TOutput * outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
......@@ -55,6 +55,9 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* number of heads in the attention model */
int nhead;
public:
/* constructor */
T2TModel();
......@@ -63,15 +66,30 @@ public:
~T2TModel();
/* initialize the model */
void InitModel(int argc, char ** argv);
/* make the encoding network */
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);

/* make the decoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, XTensor &MaskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* get parameter matrics */
void GetParams(XList &list);
/* dump the parameters */
void Dump(const char * fn);
/* read the parameters */
void Read(const char * fn);
};
}
#endif
......@@ -49,7 +49,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......@@ -66,6 +66,9 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
DTYPE v = 1.0F/(float)sqrt((float)hSize);
w.SetDataRandn(0, v);
}
/*
......@@ -91,6 +94,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
XTensor &x = input;
output = LogSoftmax(MMul(x, w), -1);
//output = Softmax(MMul(x, w), -1);
}
}
......@@ -59,7 +59,7 @@ public:
~T2TOutput();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input);
......
......@@ -37,21 +37,27 @@ namespace transformer
class T2TTrainer
{
public:
/* parameter number */
int argNum;

/* parameter array */
char ** argArray;
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* buffer size */
int bufSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
......@@ -73,8 +79,14 @@ public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* sentence batch size */
int sBatchSize;
......@@ -88,6 +100,51 @@ public:
/* training step number */
int nstep;
/* indicates whether we use adam */
bool useAdam;
/* hyper parameters of adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
/* list of the moment of the parameter matrices */
XList moments;
/* list of the 2nd order moment of the parameter matrices */
XList moments2nd;
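For reference, these fields line up with the standard Adam update; assuming adamDelta plays the role of the usual smoothing epsilon and adamBeta1T/adamBeta2T cache the running powers $\beta_1^t$ and $\beta_2^t$ used for bias correction, the per-parameter update is

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2,$$
$$\theta_t = \theta_{t-1} - \eta \cdot \frac{m_t/(1-\beta_1^t)}{\sqrt{v_t/(1-\beta_2^t)} + \delta}.$$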
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
DTYPE labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we double the </s> symbol for the output of language models */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* indicates whether we use small memory footprint for backward process */
bool isSmallFootprint;
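To make the two batching criteria concrete (illustrative numbers): for three sentences of lengths 5, 7 and 9, the word-number criterion counts 5 + 7 + 9 = 21 words, while the max * sc criterion used when isSmallBatch is set counts 9 * 3 = 27, i.e., the padded size of the batch tensor.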
public:
/* constructor */
T2TTrainer();
......@@ -96,22 +153,74 @@ public:
~T2TTrainer();
/* initialize the trainer */
void Init(int argc, const char ** argv);
void Init(int argc, char ** argv);
/* train the model */
void Train(const char * fn, T2TModel * model);
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* load data to buffer */
int LoadBuf(FILE * file);
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount);
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel * model);
/* do padding on the output */
void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* rescale the output and gold tensors for normalized loss */
void RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* perform label smoothing */
void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
};
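On LabelSmooth above: a common convention, and presumably the one intended here, mixes the one-hot gold distribution with a uniform one, smoothed = (1 - p) * gold + p / V for vocabulary size V and p = labelSmoothingP, so the correct class keeps 1 - p + p/V of the probability mass and every other class receives p/V.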
......
......@@ -27,8 +27,10 @@ namespace transformer
{
FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -45,7 +47,7 @@ void LoadParamString(int argc, const char ** argv, const char * name, char * p,
strcpy(p, defaultP);
}
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP)
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -62,7 +64,7 @@ void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int
*p = defaultP;
}
void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP)
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -79,7 +81,7 @@ void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bo
*p = defaultP;
}
void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP)
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -96,11 +98,13 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
*p = defaultP;
}
void ShowParams(int argc, const char ** argv)
void ShowParams(int argc, char ** argv)
{
fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){
if(argv[i][0] == '-'){
if(argv[i][1] == 0)
continue;
if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
if(i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
......
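The test added above keeps negative numeric arguments from being printed as option names. A hedged re-statement of the condition (the helper name is illustrative):

```cpp
// an argument is shown as an option only if it starts with '-', is not a
// bare "-", and the next character is not a digit 1-9 (so "-5" or "-1e-8"
// is read as a numeric value rather than an option name)
bool LooksLikeOption(const char * arg)
{
    if (arg[0] != '-' || arg[1] == 0)
        return false;
    return arg[1] < '1' || arg[1] > '9';
}
```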
......@@ -30,13 +30,16 @@ namespace transformer
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP);
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
/* show arguments */
void ShowParams(int argc, const char ** argv);
void ShowParams(int argc, char ** argv);
extern int llnum;
extern FILE * tf;
}
......
......@@ -19,11 +19,15 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include <time.h>
#include "Transformer.h"
#include "T2TModel.h"
#include "T2TUtility.h"
#include "T2TTrainer.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
namespace transformer
{
......@@ -32,30 +36,67 @@ int TransformerMain(int argc, const char ** argv)
{
if(argc == 0)
return 1;
fprintf(stderr, "%e\n", log(1e-8F));
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
tmpFILE = fopen("tmp.txt", "wb");
ShowParams(argc, argv);
ShowParams(argc, args);
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
LoadParamString(argc, args, "train", trainFN, "");
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
LoadParamString(argc, argv, "train", trainFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer;
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
/* load the model if neccessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
model.InitModel(argc, argv);
T2TTrainer tester;
tester.Init(argc, args);
if(strcmp(trainFN, "")){
T2TTrainer trainer;
trainer.Init(argc, argv);
trainer.Train(trainFN, &model);
}
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, ""))
tester.Test(testFN, outputFN, &model);
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
for(int i = 0; i < argc; i++)
delete[] args[i];
delete[] args;
fclose(tmpFILE);
return 0;
}
}
\ No newline at end of file
}
......@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is the entrance of the low-level tensor library : NiuTrans.Tensor
......@@ -39,13 +39,24 @@ using namespace nts;
void SmallTest();
void TransposeTest();
void LittleTest();
void T2TTest();
void T2TTest2();
void PowerTest();
int main( int argc, const char ** argv )
{
//PowerTest();
//LittleTest();
//T2TTest();
//T2TTest2();
//return 0;
//_CrtSetBreakAlloc(123);
/* a tiny test */
SmallTest();
//SmallTest();
//_CrtDumpMemoryLeaks();
//return 0;
......@@ -63,6 +74,34 @@ int main( int argc, const char ** argv )
return 0;
}
void myRead(XTensor * tensor, const char * filename, const char * label)
{
FILE * file = fopen(filename, "rb");
if(file == NULL){
fprintf(stderr, "cannot open %s\n", filename);
return;
}
tensor->Read(file, label);
fclose(file);
}
void myDump(XTensor * tensor, const char * filename, const char * label)
{
FILE * file = fopen(filename, "wb");
if(file == NULL){
fprintf(stderr, "cannot open %s\n", filename);
return;
}
tensor->Dump(file, label);
fclose(file);
}
void PowerTest()
{
XTensor input;
XTensor output;
InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
myRead(&input, "1.txt", "");
_Power(&input, &output, 2);
output.Dump(stderr, "", 200);
}
void SmallTest()
{
XTensor a;
......@@ -87,7 +126,7 @@ void SmallTest()
d = a + b + c.Lin(0.5F);
XLink::CheckNetwork(&d);
XLink::ShowNetwork(stderr, &d);
//XLink::ShowNetwork(stderr, &d);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
......@@ -126,3 +165,128 @@ void TransposeTest()
delete[] data;
}
void LittleTest()
{
int a = 5000;
int b = 100000;
int c = a*b;
printf("%d\n", c);
exit(1);
}
void T2TTest()
{
XTensor * input;
XTensor * weight;
XTensor * output;
XTensor * gold;
XTensor * dedy;
XTensor * dedx;
XTensor * dedxTmp;
XTensor * dedw;
XTensor * padding;
DTYPE loss;
int * dimSize = new int[2];
dimSize[0] = 256;
dimSize[1] = 10001;
int * dimSize2 = new int[3];
dimSize2[0] = 2;
dimSize2[1] = 31;
dimSize2[2] = 256;
int * dimSize3 = new int[3];
dimSize3[0] = 2;
dimSize3[1] = 31;
dimSize3[2] = 10001;
int * dimSize4 = new int[2];
dimSize4[0] = 2;
dimSize4[1] = 31;
input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//weight = NewTensor(2, dimSize);
//dedw = NewTensor(2, dimSize);
//input = NewTensor(3, dimSize2);
//gold = NewTensor(3, dimSize3);
//output = NewTensor(3, dimSize3);
//dedy = NewTensor(3, dimSize3);
//dedx = NewTensor(3, dimSize3);
//dedxTmp = NewTensor(3, dimSize3);
//padding = NewTensor(2, dimSize4);
myRead(input, "x.txt", "x");
myRead(weight, "w.txt", "w");
myRead(gold, "gold.txt", "gold");
myRead(padding, "padding.txt", "padding");
XTensor inter;
inter = MMul(*input, *weight);
_Softmax(&inter, output, 2);
//_LogMe(output);
loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
printf("loss: %f\n", loss);
_CrossEntropyBackward(dedy, output, gold, NULL);
//_CrossEntropyBackward(dedy, output, gold, NULL, padding);
myDump(dedy, "dedy.txt", "dedy");
_SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
_Sub(output, gold, dedxTmp);
myDump(dedx, "dedx.txt", "dedx");
dedx->Dump(stderr, "dedx", 200);
dedxTmp->Dump(stderr, "dedxTmp", 200);
input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
_MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
myDump(dedw, "dedw.txt", "dedw");
}
void T2TTest2()
{
int dimSize[3];
dimSize[0] = 161;
dimSize[1] = 47;
dimSize[2] = 10001;
XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//myRead(probs, "probs.txt", " ");
_SetDataFixedFloat(probs, 1.0F);
probs->Reshape(1, probs->unitNum);
DTYPE sum = _ReduceSumAll(probs);
printf("%e\n", sum);
//XTensor tmp;
//tmp = IsNonZero(*probs);
//DTYPE nonZeroNum = ReduceSumAll(tmp);
//printf("%f\n", nonZeroNum);
//
//DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//printf("%e\n", gpu);
}
......@@ -50,8 +50,8 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#ifdef USE_CUDA
void CudaConvertDataType(int devID,
......
......@@ -41,6 +41,7 @@ XDevManager GDevs;
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
#ifdef USE_CUDA
......@@ -126,6 +127,7 @@ void XDevice::Init(int myDevID)
#endif
}
isInitialized = true;
}
/* clear it */
......@@ -152,11 +154,14 @@ void XDevice::Clear()
/* get cublas handle */
cublasHandle_t * XDevice::GetCublasHandle()
{
if (!isInitialized)
Init(devID);
if(!isHandleReady){
MUTEX_LOCK(cublasMutex);
int devIDBackup = 0;
ProtectCudaDev(devID, devIDBackup);
CheckNTErrors(cublasCreate(&cublasHandle) == cudaSuccess,
CheckNTErrors(cublasCreate(&cublasHandle) == CUBLAS_STATUS_SUCCESS,
"Cannot create the cublas handle.");
isHandleReady = true;
BacktoCudaDev(devID, devIDBackup);
......@@ -169,6 +174,9 @@ cublasHandle_t * XDevice::GetCublasHandle()
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
......@@ -279,33 +287,13 @@ void XDevManager::Init()
exit(1);
}
cudaDeviceProp prop[64];
for(int i = 0; i < GPUCount; i++){
GPUs[i].Init(i);
cudaGetDeviceProperties(&prop[i], i);
GPUs[i].devID = i;
//GPUs[i].Init(i);
}
#ifdef USA_CUDA_P2P
for(int i = 0; i < GPUCount; i++){
cudaSetDevice(i);
for(int j = 0; j < GPUCount; j++){
if(i == j)
continue;
int access;
cudaDeviceCanAccessPeer(&access, i, j);
bool hasUVA = (prop[i].unifiedAddressing && prop[j].unifiedAddressing);
fprintf(stderr, "device %d -> device %d access:%d UVA:%d\n", i, j, access, hasUVA ? 1 : 0);
if(access != 0){
CheckNTErrors((hasUVA == true), "at least one GPU does not support UVA.")
CheckNTErrors((cudaDeviceEnablePeerAccess(j, 0)==cudaSuccess), "cannot set cuda p2p mode!");
}
}
}
#endif
#endif
nGPU = GPUCount;
}
......@@ -351,6 +339,9 @@ into blocks
*/
int XDevManager::GetCudaThread(const int devID, const int n, int * gridSize, int * blockSize)
{
if (!GPUs[devID].isInitialized)
GPUs[devID].Init(devID);
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......@@ -402,6 +393,9 @@ into blocks
*/
int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int nLimit, int * gridSize, int * blockSize)
{
if (!GPUs[devID].isInitialized)
GPUs[devID].Init(devID);
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......@@ -452,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x");
CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] &&
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
blockSize[0] = bXSize;
blockSize[1] = bYSize;
......@@ -530,7 +524,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
for(int i = 0; i < terms->count; i++){
int devC, devID;
char dev[32];
char dev[32] = "";
char * curDevInfo = (char*)terms->GetItem(i);
if(sscanf(curDevInfo, "%d:%s", &devC, dev) < 2){
......
......@@ -67,6 +67,9 @@ public:
/* warp size of an (Nvidia) GPU */
int GPUWarpSize;
/* indicates whether the device class has been initialized */
bool isInitialized;
/*
max grid size (or number of blocks) of an (Nvidia) GPU
NOTE: the grid size is along three dimensions (x, y, z)
......
......@@ -43,7 +43,7 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#define _XINLINE_ inline
#define _XINLINE_
//#define DOUBELPRICSION
......@@ -55,6 +55,9 @@ namespace nts {
#define DTYPE_MIN (DTYPE)-3.40E+38
#endif
#define LOGPROB_MIN (DTYPE)-2E+1
#define GRAD_MAX (DTYPE)1E+5
#if WIN32
#define DELIMITER '\\'
#else
......@@ -62,10 +65,10 @@ namespace nts {
#endif
#ifndef MIN
#define MIN(a,b) ((a < b) ? a : b)
#define MIN(a,b) ((a) < (b) ? a : b)
#endif
#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#define MAX(a,b) ((a) > (b) ? a : b)
#endif
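The added parentheses matter as soon as a macro argument is an expression with lower precedence than '<'. A minimal illustration (hypothetical values):

```cpp
#include <cstdio>

#define MIN_OLD(a,b) ((a < b) ? a : b)     // before the fix
#define MIN_NEW(a,b) ((a) < (b) ? a : b)   // after the fix

int main()
{
    int a = 2, b = 5;
    // MIN_OLD(a & 3, b) expands to ((a & 3 < b) ? a & 3 : b); since '<'
    // binds tighter than '&', the test becomes a & (3 < b) = 2 & 1 = 0,
    // so the macro wrongly returns b.
    printf("%d\n", MIN_OLD(a & 3, b));   // prints 5
    printf("%d\n", MIN_NEW(a & 3, b));   // prints 2
    return 0;
}
```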
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
......@@ -147,6 +150,8 @@ extern bool useCUDA;
#define XPRINT4(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4);FFLUSH(FILEH);}}
#define XPRINT5(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5);FFLUSH(FILEH);}}
#define XPRINT6(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6);FFLUSH(FILEH);}}
#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
#define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}
#define B2I(V) V==0?false:true
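These macros gate output on the global verboseLevel declared alongside them; a minimal usage sketch (the variables are illustrative):

```cpp
// printed only when the requested level (0 here) <= verboseLevel
XPRINT4(0, stderr, "step %d: loss=%.3f ppl=%.2f time=%.1fs\n",
        step, loss, ppl, elapsed);
```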
......
......@@ -263,6 +263,18 @@ int XLink::GetParamInt(int i)
char * p = (char*)params + i * paramSize;
return *(int*)p;
}
/*
get a parameter in pointer
>> i - id of the parameter
<< return - the parameter in pointer
*/
void * XLink::GetParamPointer(int i)
{
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(int **)p;
}
/*
get a parameter in MATRIX_TRANS_TYPE
......@@ -401,8 +413,7 @@ add a boolean parameter
*/
void XLink::AddParamToHeadBool(XTensor * h, bool param)
{
if(h != NULL)
return;
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(bool));
}
......@@ -413,8 +424,7 @@ add a pointer parameter
*/
void XLink::AddParamToHeadPointer(XTensor * h, void * param)
{
if(h != NULL)
return;
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(param));
}
......@@ -583,15 +593,15 @@ void XLink::CheckNetwork(XTensor * root)
}
/*
show the network encoded in a root node (tensor)
show a node
>> file - file to dump information
>> root - pointer to the root node
>> root - pointer to the node
*/
void XLink::ShowNetwork(FILE * file, XTensor * root)
void XLink::ShowNode(FILE * file, XTensor * node)
{
fprintf(file, "node %d - ", root->id);
fprintf(file, "node %d - ", node->id);
XLink &income = root->income;
XLink &income = node->income;
if(income.head == NULL){
fprintf(file, "income[%d]: null ", income.tailNum);
}
......@@ -607,7 +617,7 @@ void XLink::ShowNetwork(FILE * file, XTensor * root)
}
fprintf(stderr, ", ");
XLink &outgo = root->outgo;
XLink &outgo = node->outgo;
if(outgo.head == NULL || outgo.tailNum == 0){
fprintf(file, "outgo[%d]: null ", outgo.tailNum);
}
......@@ -623,11 +633,6 @@ void XLink::ShowNetwork(FILE * file, XTensor * root)
}
fprintf(stderr, "\n");
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
ShowNetwork(file, child);
}
}
} // namespace nts(NiuTrans.Tensor)
......
......@@ -127,6 +127,9 @@ struct XLink
/* get a parameter in integer */
int GetParamInt(int i);
/* get a parameter in pointer */
void * GetParamPointer(int i);
/* get a parameter in MATRIX_TRANS_TYPE */
MATRIX_TRANS_TYPE GetParamTrans(int i);
......@@ -175,9 +178,9 @@ struct XLink
static
void CheckNetwork(XTensor * root);
/* show the network encoded in a root node (tensor) */
/* show a node */
static
void ShowNetwork(FILE * file, XTensor * root);
void ShowNode(FILE * file, XTensor * node);
};
} // namespace nts(NiuTrans.Tensor)
......
......@@ -53,28 +53,14 @@ typedef long long INT_64;
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
/* memory block */
struct XMemBlock
{
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
};
/*
mode of running a memory pool
- UNI_FREE: free all memory space at once when the pool is no longer in use
- FREE_ON_THE_FLY: allocate and free memory pieces on demand, in the ordinary "malloc"/"free" fashion
*/
enum MEMPOOL_MODE {UNI_FREE, FREE_ON_THE_FLY};
struct MPieceNode;
/* header of a memory piece (FREE_ON_THE_FLY) */
struct MHeader
......@@ -96,6 +82,9 @@ struct MHeader
/* id of the memory block */
int blockID;
/* pointer to the index node */
MPieceNode * indexNode;
};
/* index of memory piece */
......@@ -112,6 +101,31 @@ struct MPieceNode
/* pointer to the head of a memory piece */
void * p;
/* pointer to the head of the memory that is returned to the user */
void * pReal;
/* header of the memory piece */
MHeader head;
};
/* memory block */
struct XMemBlock
{
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
/* first head of the block */
MHeader * head;
};
/*
......@@ -138,6 +152,9 @@ public:
/* mode of running the memory pool */
MEMPOOL_MODE mode;
/* signature */
MTYPE signature;
/* indicates whether the memory allocation is static */
bool isStatic;
......@@ -194,13 +211,16 @@ public:
public:
/* index of the free memory pieces */
MPieceNode * freeMemIndex;
MPieceNode * memIndex;
/* for double buffering */
MPieceNode * memIndex2;
/* maximum number of index nodes */
INT_64 indexNodeNum;
INT_64 nodeNum;
/* count of the used nodes */
INT_64 indexNodeNumUsed;
INT_64 nodeNumUsed;
/* minimal size allocation for each index entry */
MTYPE * minSizeIndex;
......@@ -211,6 +231,9 @@ public:
/* index offset */
int indexOffset;
/* indicates whether we merge free memory pieces on the fly */
bool mergeFreeOTF;
public:
/* constructor */
......@@ -235,6 +258,9 @@ public:
/* free a piece of memory */
void Free(int myDevID, void * mem);
/* get signature */
MTYPE GetSignature();
/* use string as the name of the memory pool */
void SetName(const char * myName);
......@@ -282,10 +308,10 @@ public:
void * AllocBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
/* release a piece of memory */
void Release(void * p);
void Release(void * p, MTYPE size, MTYPE code);
/* release a piece of memory */
void Release(int myDevID, void * p);
void Release(int myDevID, void * p, MTYPE size);
/* release a piece of memory in the buffer */
void ReleaseBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
......@@ -302,14 +328,20 @@ public:
/* find the index entry for allocation query */
int FindIndexEntry(MTYPE mySize);
/* remove an index node */
/* remove an index node for available memory pieces */
void RemoveIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node */
void AddIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for available memory pieces */
void AddFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* remove an index node for memory pieces in use */
void RemoveAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for memory pieces in use */
void AddAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* release a piece of memory as "free" */
void ReleaseStandard(int myDevID, void * p);
void ReleaseStandard(int myDevID, void * p, MTYPE size);
/* rebuild the index to merge small memory fragments and free blocks that are no longer used */
void RebuildIndex();
......@@ -379,6 +411,9 @@ public:
extern XMem * GMem;
extern int testxmemid;
extern void * recordp;
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -29,10 +29,22 @@ const char * GetOPName(int type)
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if (type == MATH_CEIL)
return "M_CEIL";
else if (type == MATH_EXP)
return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_ISNONZERO)
return "M_ISNONZERO";
else if (type == MATH_ISZERO)
return "M_ISZERO";
else if (type == MATH_LOG)
return "M_LOG";
else if (type == MATH_SQRT)
return "M_SQRT";
else if (type == MATH_SQUARE)
return "M_SQUARE";
else if (type == MATH_SIN)
return "M_SIN";
else if (type == MATH_COS)
......@@ -45,12 +57,16 @@ const char * GetOPName(int type)
return "M_CLIP";
else if (type == MATH_DIV)
return "M_DIV";
else if (type == MATH_DIVDIM)
return "M_DIVDIM";
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if (type == MATH_MULTIPLYDIM)
return "M_MULTIPLYDIM";
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if (type == MATH_NORMALIZE)
......@@ -61,10 +77,12 @@ const char * GetOPName(int type)
return "M_SCALEANDSHIFT";
else if (type == MATH_SIGN)
return "M_SIGN";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_SUB)
return "M_SUB";
else if (type == MATH_SUBDIM)
return "M_SUBDIM";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_SUMDIM)
return "M_SUMDIM";
else if (type == REDUCE_REDUCEMAX)
......@@ -79,12 +97,20 @@ const char * GetOPName(int type)
return "R_REDUCEVARIANCE";
}
else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT)
if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE";
else if (type == GETANDSET_INDEXTOONEHOT)
return "G_INDEXTOONEHOT";
else if (type == GETANDSET_ONEHOTTOINDEX)
return "G_ONEHOTTOINDEX";
else if (type == GETANDSET_SELECT)
return "G_SELECT";
else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if (type == MOVEMENT_GATHER)
return "M_GATHER";
else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if (type == SHAPE_MERGE)
......@@ -93,10 +119,14 @@ const char * GetOPName(int type)
return "S_MERGE_LIST";
else if (type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if (type == SHAPE_RESHAPE)
return "S_RESHAPE";
else if (type == SHAPE_SPLIT)
return "S_SPLIT";
else if (type == SHAPE_SPLIT_LIST)
return "S_SPLIT_LIST";
else if (type == SHAPE_SQUEEZE)
return "S_SQUEEZE";
else if (type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if (type == SHAPE_UNSQUEEZE)
......@@ -107,7 +137,9 @@ const char * GetOPName(int type)
return "S_TOPK";
}
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
if (type == FUNC_DROPOUT)
return "F_DROPOUT";
else if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
......
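The returned strings mirror the operation ids declared in XName.h, e.g.:

```cpp
const char * opName = GetOPName(MATH_DIVDIM);   // yields "M_DIVDIM"
```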
......@@ -32,26 +32,35 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_BASE 0x00001000
#define MATH_ABSOLUTE MATH_BASE + 1
#define MATH_EXP MATH_ABSOLUTE + 1
#define MATH_LOG MATH_EXP + 1
#define MATH_SIN MATH_LOG + 1
#define MATH_CEIL MATH_ABSOLUTE + 1
#define MATH_EXP MATH_CEIL + 1
#define MATH_FLOOR MATH_EXP + 1
#define MATH_ISNONZERO MATH_FLOOR + 1
#define MATH_ISZERO MATH_ISNONZERO + 1
#define MATH_LOG MATH_ISZERO + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_SQUARE MATH_SQRT + 1
#define MATH_SIN MATH_SQUARE + 1
#define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1
#define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_MATRIXMUL MATH_DIV + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_NEGATE MATH_MULTIPLY + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1
#define MATH_SUM MATH_SIGN + 1
#define MATH_SUB MATH_SUM + 1
#define MATH_SUMDIM MATH_SUB + 1
#define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define REDUCE MATH_SUMDIM + 1
#define REDUCE_REDUCEMAX REDUCE + 1
......@@ -63,20 +72,26 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* data and shape related operations */
#define DATA_BASE MATH_BASE * 2
#define GETANDSET DATA_BASE + 1
#define GETANDSET_SELECT GETANDSET + 1
#define GETANDSET_CONVERTDATATYPE GETANDSET + 1
#define GETANDSET_INDEXTOONEHOT GETANDSET_CONVERTDATATYPE + 1
#define GETANDSET_ONEHOTTOINDEX GETANDSET_INDEXTOONEHOT + 1
#define GETANDSET_SELECT GETANDSET_ONEHOTTOINDEX + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_GATHER + 1
#define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1
#define SHAPE_PERMUTE SHAPE_MERGE_LIST + 1
#define SHAPE_SPLIT SHAPE_PERMUTE + 1
#define SHAPE_RESHAPE SHAPE_PERMUTE + 1
#define SHAPE_SPLIT SHAPE_RESHAPE + 1
#define SHAPE_SPLIT_LIST SHAPE_SPLIT + 1
#define SHAPE_TRANSPOSE SHAPE_SPLIT_LIST + 1
#define SHAPE_SQUEEZE SHAPE_SPLIT_LIST + 1
#define SHAPE_TRANSPOSE SHAPE_SQUEEZE + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
#define SORT SHAPE_UNSQUEEZE + 1
......@@ -85,7 +100,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* activation functions */
#define FUNCTION_BASE DATA_BASE * 2
#define FUNC_HARDTANH FUNCTION_BASE + 1
#define FUNC_DROPOUT FUNCTION_BASE + 1
#define FUNC_HARDTANH FUNC_DROPOUT + 1
#define FUNC_IDENTITY FUNC_HARDTANH + 1
#define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1
#define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1
......
......@@ -49,9 +49,10 @@ struct XLink;
#define USE_BATCHED_STRIDED_MAT_MUL
#define MIN_TENSOR_SPLIT_NUM 0
#define MIN_TENSOR_SPLIT_LIST_NUM 1024
#define MIN_TENSOR_MERGE_NUM 0
#define MIN_TENSOR_MERGE_LIST_NUM 1024
#define MIN_TENSOR_CAT_NUM 8
/* computation flags */
#define UNSAFE_BUT_FAST_MEM
#define FAST_MATRIX
......@@ -66,6 +67,9 @@ public:
/* memory pool */
XMem * mem;
/* signature of the memory pool */
MTYPE signature;
/* data array to keep the elements */
void * data;
......@@ -143,6 +147,9 @@ public:
/* indicates whether the tensor keeps the gradient when used as model parameters */
bool isGrad;
/* indicates whether the tensor is used as parameters (or variables) */
bool isVar;
/* mark for traversing the graph */
unsigned int visitMark;
......@@ -199,15 +206,27 @@ public:
/* overloading of the plus-sign */
XTensor operator+ (const XTensor &tensor);
/* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift);
/* overloading of the multiply-sign */
XTensor operator* (const XTensor &tensor);
/* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale);
/* overloading of the minus-sign */
XTensor operator- (const XTensor &tensor);
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift);
/* overloading of the division-sign */
XTensor operator/ (const XTensor &tensor);
/* overloading of the division-sign */
XTensor operator/ (const DTYPE scale);
/* linear transformation */
XTensor Lin(DTYPE scale, DTYPE shift = 0);
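With the new scalar overloads, linear expressions can be written directly on tensors, in the same style as the Lin() call used elsewhere in this commit; a minimal sketch:

```cpp
XTensor x, y, z;
InitTensor2D(&x, 2, 2);
x.SetDataRand(-1.0F, 1.0F);

y = x * 0.5F + 1.0F;    // scale then shift via the scalar overloads
z = (x - 1.0F) / 2.0F;  // shift then scale
```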
......@@ -218,13 +237,13 @@ public:
/* judge whether the three matrices are in the same type and size */
static
bool IsSameShaped(XTensor * a, XTensor * b, XTensor * c);
bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* set the size of each dimension */
void SetDim(int * myDimSize);
/* get the size of a given dimension */
int GetDim(const int dim);
int GetDim(const int dim) const;
/* reshape the tensor */
void Reshape(const int order, const int * myDimSize);
......@@ -244,18 +263,30 @@ public:
/* get unit size in terms of "dataType" */
int GetUnitSize(TENSOR_DATA_TYPE myDataType);
/* get offset (2D) */
MTYPE GetOffset2D(int row, int col);
/* get offset (3D) */
MTYPE GetOffset3D(int d0, int d1, int d2);
/* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL);
/* set the tensor with an data array */
void SetData(const void * d, int num, int beg = 0);
/* set the tensor items by a uniform distribution */
void SetDataRand(DTYPE lower, DTYPE upper);
/* set tensor items by a uniform distribution */
void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);
/* set the tensor items by a normal distribution */
/* set tensor items by a normal distribution */
void SetDataRandn(DTYPE mean, DTYPE standardDeviation);
/* set tensor items with an array of offsets */
void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
/* set tensor items with an array of values */
void SetDataBatchedWithValues(MTYPE * offsets, void * values, int num);
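Read together with GetOffset2D/GetOffset3D above, the batched setters appear intended for scattered writes at precomputed flat offsets; a hedged usage sketch:

```cpp
XTensor t;
InitTensor2D(&t, 4, 5);
t.SetZeroAll();

MTYPE offsets[2] = { t.GetOffset2D(0, 1), t.GetOffset2D(2, 3) };
t.SetDataBatched(offsets, 1.0F, 2);   // write 1.0 at the two addressed cells
```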
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);
......@@ -309,6 +340,18 @@ public:
/* set the value of a cell in a 3d tensor */
bool Set3D(DTYPE value, int d0, int d1, int d2);
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i);
/* set the integer value of a cell in a 2d tensor */
bool Set2DInt(int value, int ni, int mi);
/* set the integer value of a cell in a 3d tensor */
bool Set3DInt(int value, int d0, int d1, int d2);
/* increase the value of a cell in a 2d */
bool Add2D(DTYPE value, int ni, int mi);
......@@ -317,21 +360,19 @@ public:
int GetNonzeroSize();
/* set the tensor as "temporary" */
void SetTMP(bool myIsTmp = true);
void SetTMPFlag(bool myIsTmp = true);
/* set the tensor as "keep-gradient" */
void SetGrad(bool myIsGrad = true);
void SetGradFlag(bool myIsGrad = true);
/* set the tensor as "variable" */
void SetVarFlag(bool myIsVar = true);
/* resize a matrix with a specified matrix size */
bool Resize(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
const float myDenseRatio = 1.0F);
/* resize a matrix with a specified matrix size (with no data filled) */
bool ResizeWithNoData(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
const float myDenseRatio = 1.0F);
/* resize a matrix by another one */
bool Resize(const XTensor * myTensor);
......@@ -339,11 +380,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* read data from a file */
void Read(FILE * file, const char * label = NULL);
......@@ -435,7 +476,7 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
const int myDevID = -1, XMem * myMem = NULL);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(XTensor * a, bool isFilledData = true);
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
/* free the data space of a given tensor */
void DelTensor(XTensor * tensor);
......
......@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
return true;
}
/* assign an integer number to a variable that is kept on a specified device */
bool SetToDeviceInt(int devID, void * p, int value)
{
if(p == NULL)
return false;
if(devID < 0)
*(int*)p = value;
else{
XMemCopy(p, devID, &value, -1, sizeof(int));
}
return true;
}
/* get the next power of 2 */
unsigned int GetNextPower2(unsigned int n)
{
......
......@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
extern DTYPE ToCPU(int devID, void * value);
extern int ToCPUInt(int devID, void * value);
extern bool SetToDevice(int devID, void * p, DTYPE value);
extern bool SetToDeviceInt(int devID, void * p, int value);
extern unsigned int GetNextPower2(unsigned int n);
extern void XSleep(int sleepTime);
extern double GetClock();
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
/* this is a header to include all functions in the "core" workspace */
......@@ -27,15 +27,18 @@
#include "../XTensor.h"
#include "arithmetic/Div.h"
#include "arithmetic/DivDim.h"
#include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h"
#include "arithmetic/Sign.h"
#include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
......@@ -43,16 +46,17 @@
#include "arithmetic/XTensorBLAS.h"
#include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h"
#include "getandset/SetData.h"
#include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h"
#include "math/Unary.h"
#include "movement/CopyBlocks.h"
#include "movement/CopyBlocksInGrid.h"
#include "movement/CopyBlocksOnSite.h"
......@@ -60,11 +64,14 @@
#include "movement/CopyIndexed.h"
#include "movement/CopyInGrid.h"
#include "movement/CopyValues.h"
#include "movement/Gather.h"
#include "movement/Spread.h"
#include "reduce/ReduceMax.h"
#include "reduce/ReduceMean.h"
#include "reduce/ReduceStandardVariance.h"
#include "reduce/ReduceSum.h"
#include "reduce/ReduceSumAll.h"
#include "reduce/ReduceSumSquared.h"
#include "reduce/ReduceVariance.h"
......@@ -74,8 +81,10 @@
#include "shape/MakeSplitBlockIndex.h"
#include "shape/Merge.h"
#include "shape/MergeBlockLists.h"
#include "shape/Reshape.h"
#include "shape/Permute.h"
#include "shape/Split.h"
#include "shape/Squeeze.h"
#include "shape/Transpose.h"
#include "shape/Unsqueeze.h"
......@@ -85,4 +94,4 @@
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#endif // __CHEADER_H__
\ No newline at end of file
#endif // __CHEADER_H__
......@@ -23,6 +23,7 @@
#include "../../XName.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -137,8 +138,37 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Div(a, b, a, alpha, leadingDim);
}
/*
return the dimension index if the division can be performed as DivDim (see DivDim.h for details)
>> a - a tensor
>> b - another tensor for division
*/
int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
/*
element-wise division of two tensors (return a XTensor structure)
element-wise division of two tensors (return an XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)/b(i)
......@@ -146,23 +176,41 @@ where i is the index of the item
>> a - tensor a
>> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
*/
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim)
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
XTensor c(&a);
c.SetTMP();
/* call _Multiply function */
_Div(&a, &b, &c, 0, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHeadInt(&c, leadingDim);
c.SetTMPFlag();
int n = GetDivDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, alpha, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
else if(n >= 0 && n < a.order){
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
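A usage sketch of the new dispatch path (shapes are illustrative):

```cpp
XTensor a, b, c;
InitTensor2D(&a, 4, 8);
InitTensor1D(&b, 8);
a.SetDataRand(1.0F, 2.0F);
b.SetDataRand(1.0F, 2.0F);

// GetDivDimIndex(a, b) returns 1 here (b matches dimension 1 of a), so
// Div takes the _DivDim branch and broadcasts b over the rows of a.
c = Div(a, b);
```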
......
......@@ -31,7 +31,7 @@ element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (do it on site)
......@@ -39,15 +39,15 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the element
*/
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (return a XTensor structure)
element-wise division of two tensors (return an XTensor structure)
make a new tensor to keep the result and return it
c(i) = a(i)/b(i)
where i is the index of the element
*/
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim = 0);
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put result. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in addition!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaDivDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++){
if(alpha == 0.0F)
cp[k] = ap[k] / bv;
else
cp[k] = ap[k] / bv + alpha * cp[k];
}
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
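For orientation, the loop decomposition above works out as follows on a concrete shape (illustrative numbers):

```cpp
// a of shape (2, 3, 4) with n = 1, so b holds 3 elements:
//   stride    = 4   (product of the dimensions after n)
//   blockSize = 3   (a->dimSize[n])
//   blockNum  = 2   (product of the dimensions before n)
// The stride > 1 branch then walks 2 * 3 = 6 chunks of 4 contiguous
// elements, dividing each chunk by its matching element of b.
```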
/*
tensor division of two tensors (do it on site)
keep the result in the input tensor and return nothing
a = a/b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_DivDim(a, b, a, n, alpha);
}
/*
tensor division of two tensors (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor division
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "DivDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division of a tensor and a row vector
c = a / b + alpha * c
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if(col >= colNum || row >= rowNum)
return;
if(threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.x];
}
/*
tensor division of a tensor and a column vector
c = a / b + alpha * c
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if(row >= rowNum || block >= blockNum)
return;
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.y];
}
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a / b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in division!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE){
if(stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelDivWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
}
else if(stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
KernelDivWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_CUH__
#define __DIVDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting (cuda version)
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_H__
#define __DIVDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we keep the result in the input tensor a and return nothing
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we make a new tensor c to keep the result and return it
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_H__
......@@ -203,7 +203,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
......@@ -249,7 +249,7 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
......@@ -299,7 +299,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
......
......@@ -44,7 +44,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor c to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
......
......@@ -314,7 +314,7 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMulBatched function */
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
......@@ -370,7 +370,7 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMulBatched function */
_MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
......
......@@ -62,7 +62,7 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
......@@ -74,7 +74,7 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication of the two tensors (return a XTensor structure) c = a * b * alpha
matrix multiplication of the two tensors (return an XTensor structure) c = a * b * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
......
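A corresponding sketch for the batched variant. This is hypothetical and hedged: it assumes the last two dimensions are the matrix dimensions and the leading ones index the batch; check the full header for the exact convention in this version.

void MatrixMulBatchedExample()
{
    /* a: 8 x 2 x 3, b: 8 x 3 x 4 -> c: 8 x 2 x 4,
       one 2 x 3 by 3 x 4 matrix product per batch index */
    XTensor a, b;
    InitTensor3D(&a, 8, 2, 3);
    InitTensor3D(&b, 8, 3, 4);
    _SetDataRand(&a, -1.0F, 1.0F);
    _SetDataRand(&b, -1.0F, 1.0F);

    XTensor c = MatrixMulBatched(a, b);
}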
......@@ -23,6 +23,7 @@
#include "../../XName.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -65,8 +66,8 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
"Unmatched tensors!");
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
......@@ -138,8 +139,37 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Multiply(a, b, a, alpha, leadingDim);
}
/*
return the dimension index if the multiplication can be performed as MultiplyDim (see MultiplyDim.h for details)
>> a - a tensor
>> b - another tensor for multiplication
<< return - the matched dimension index, or -1 if the operation cannot be performed as MultiplyDim
*/
int GetMultiplyDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
/* the matched dimension of a, counted from the last dimension;
   (a.order - b.order + i) is only correct when i indexes b's
   dimensions in forward order, which the loop above does not */
hitDim = a.order - 1 - i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
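For instance (a hypothetical check, not part of the commit, assuming GetMultiplyDimIndex is visible at the call site): with a of shape 8 x 4 and a 1-D b of length 4, the loop matches b against the last dimension of a and returns 1.

void GetMultiplyDimIndexExample()
{
    XTensor a, b;
    InitTensor2D(&a, 8, 4);
    InitTensor1D(&b, 4);

    /* b matches dimension 1 of a, so Multiply can dispatch to _MultiplyDim */
    int n = GetMultiplyDimIndex(a, b);   /* n == 1 */
}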
/*
element-wise product of two tensors (return a XTensor structure)
element-wise product of two tensors (return an XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)*b(i)
......@@ -150,20 +180,38 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
*/
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
int n = GetMultiplyDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "Unmatched tensors in multiplication!");
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHeadInt(&c, leadingDim);
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
else if(n >= 0 && n < a.order){
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
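Taken together, the dispatch above lets one call site cover both the element-wise and the broadcast case. A minimal sketch with hypothetical values (assuming SetDataFixed as used elsewhere in this project):

void MultiplyExample()
{
    XTensor a, b1, b2;
    InitTensor2D(&a, 2, 3);
    InitTensor2D(&b1, 2, 3);   /* same shape as a: element-wise path */
    InitTensor1D(&b2, 3);      /* matches dimension 1 of a: MultiplyDim path */
    SetDataFixed(a, 2.0F);
    SetDataFixed(b1, 3.0F);
    SetDataFixed(b2, 4.0F);

    XTensor c1 = Multiply(a, b1);   /* c1(i, j) = a(i, j) * b1(i, j) = 6 */
    XTensor c2 = Multiply(a, b2);   /* c2(i, j) = a(i, j) * b2(j) = 8 */
}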
......
......@@ -77,7 +77,7 @@ where |a_lead| means the size of the leading dimension of a
*/
template<int nonZeroAlpha> __global__
void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha,
int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
{
__shared__ DTYPE* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......@@ -171,14 +171,12 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
if (alpha == 0) {
KernelMulElementWiseTensorDynamic<0> << <blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC,
blockNum);
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
else {
KernelMulElementWiseTensorDynamic<1> << <blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC,
blockNum);
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
}
}
......
......@@ -31,7 +31,7 @@ element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (do it on site)
......@@ -39,15 +39,15 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (return a XTensor structure)
element-wise product of two tensors (return an XTensor structure)
make a new tensor to keep the result and return it
c(i) = a(i)*b(i)
where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied by b via broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c (c must not be NULL; the checks below reject empty input)
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in multiplication!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in multiplication!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaMultiplyDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if(a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++)
if(alpha == 0.0F)
cp[k] = ap[k] * bv;
else
cp[k] = ap[k] * bv + alpha * cp[k];
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
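To make the stride arithmetic above concrete, here is a small standalone check with hypothetical shapes: for a of shape 2 x 3 x 4 and n = 1, element (i0, i1, k) of a is scaled by b(i1).

void MultiplyDimStrideExample()
{
    int dimSize[3] = {2, 3, 4};
    int order = 3;
    int n = 1;

    int stride = 1;              /* product of the dimensions after n  */
    int blockSize = dimSize[n];
    int blockNum = 1;            /* product of the dimensions before n */
    for (int i = order - 1; i >= 0; i--) {
        if (i > n)
            stride *= dimSize[i];
        else if (i < n)
            blockNum *= dimSize[i];
    }
    /* stride == 4, blockSize == 3, blockNum == 2: the i loop in _MultiplyDim
       walks a in runs of 4 contiguous elements, and bv = b(j % 3) selects
       the matching entry of b for each run */
}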
/*
tensor multiplication (do it on site)
keep the result in the input tensor a and return nothing
a = a * b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied by b via broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_MultiplyDim(a, b, a, n, alpha);
}
/*
tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied by b via broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
*/
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
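Finally, a minimal usage sketch of MultiplyDim itself (hypothetical example; alpha is passed explicitly here in case the header does not default it):

void MultiplyDimExample()
{
    XTensor a, b;
    InitTensor2D(&a, 2, 3);
    InitTensor1D(&b, 3);
    SetDataFixed(a, 2.0F);
    SetDataFixed(b, 3.0F);

    /* c(i, j) = a(i, j) * b(j) = 6, with no accumulation (alpha = 0) */
    XTensor c = MultiplyDim(a, b, 1, 0.0F);
}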